tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

ipred.S (11120B)


      1 /******************************************************************************
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2024, Bogdan Gligorijevic
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 *****************************************************************************/
     27 
     28 #include "src/riscv/asm.S"
     29 
     30 // void ipred_v_8bpc_rvv(pixel *dst, const ptrdiff_t stride,
     31 //                       const pixel *const topleft,
     32 //                       const int width, const int height, const int a,
     33 //                       const int max_width, const int max_height)
     34 function ipred_v_8bpc_rvv, export=1, ext="v,zba"
     35    csrw vxrm, zero
     36    addi a2, a2, 1
     37    vsetvli t0, a3, e8, m1, ta, ma
     38    bne t0, a3, 3f  // Go to slow path - whole row doesn't fit in register
     39 1:
     40    // Fast path - row fits in register
     41    add t1, a0, a1
     42    vle8.v v4, (a2)
     43 2:
     44    vse8.v v4, (a0)
     45    sh1add a0, a1, a0
     46    vse8.v v4, (t1)
     47    sh1add t1, a1, t1
     48    addi a4, a4, -2
     49    bnez a4, 2b
     50    ret
     51 
     52    // Row doesn't fit in register.
     53 3:
     54    vsetvli t0, a3, e8, m2, ta, ma  // Try using 2 registers at once (LMUL=2)
     55    beq t0, a3, 1b  // Back to fast path - now it fits
     56 
     57 4:
     58    // No need for another vsetvli, since both width and VLEN are powers of 2, so there is no tail.
     59    vle8.v v4, (a2)
     60    mv t2, a0
     61    mv t1, a4
     62 5:
     63    vse8.v v4, (t2)
     64    add t2, t2, a1
     65    addi t1, t1, -1
     66    bnez t1, 5b  // Loop over rows.
     67 
     68    sub a3, a3, t0
     69    add a2, a2, t0
     70    add a0, a0, t0
     71    bnez a3, 4b  // Loop over columns
     72 
     73    ret
     74 endfunc
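
        // A scalar reference for the vertical prediction above (our sketch, not
        // dav1d's C code verbatim; assuming pixel == uint8_t for 8 bpc): every
        // output row is a copy of the reconstructed row directly above the block,
        // i.e. topleft[1 .. width].
        //
        //   #include <stddef.h>
        //   #include <stdint.h>
        //   #include <string.h>
        //
        //   static void ipred_v_ref(uint8_t *dst, ptrdiff_t stride,
        //                           const uint8_t *topleft, int width, int height)
        //   {
        //       for (int y = 0; y < height; y++, dst += stride)
        //           memcpy(dst, topleft + 1, width);
        //   }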
     75 
     76 
     77 function dc_gen_8bpc_rvv, export=1, ext="v,zbb"
     78    .variant_cc dav1d_dc_gen_8bpc_rvv
     79    add t1, a1, a2
     80    srli t5, t1, 1
     81    mv t1, a1
     82    addi t2, a0, 1
     83    vsetvli zero, t1, e16, m4, ta, ma
     84    vmv.v.x v0, zero
     85 1:
     86    vsetvli t3, t1, e8, m2, tu, ma
     87    vle8.v v4, (t2)
     88    vwaddu.wv v0, v0, v4
     89 
     90    sub t1, t1, t3
     91    add t2, t2, t3
     92    bnez t1, 1b
     93 
     94    mv t1, a2
     95    mv t2, a0
     96    vsetvli zero, t1, e16, m4, ta, ma
     97    vmv.v.x v8, zero
     98 2:
     99    vsetvli t3, t1, e8, m2, tu, ma
    100    sub t2, t2, t3
    101    vle8.v v4, (t2)
    102    vwaddu.wv v8, v8, v4
    103    sub t1, t1, t3
    104 
    105    bnez t1, 2b
    106 
    107    vsetvli zero, zero, e32, m8, ta, ma
    108    vmv.s.x v16, t5
    109    vmv.s.x v12, zero
    110    vsetvli zero, a1, e16, m4, ta, ma
    111    vwredsum.vs v24, v0, v16
    112    vsetvli zero, a2, e16, m4, ta, ma
    113    vwredsum.vs v16, v8, v12
    114    vsetvli zero, zero, e32, m8, ta, ma
    115    vmv.x.s t5, v24
    116    vmv.x.s t1, v16
    117    add t5, t5, t1
    118 
    119    add t1, a1, a2
    120    ctz t1, t1
    121 
    122    srl a0, t5, t1
    123 
    124 
    125    beq a1, a2, 5f
    126    slli t1, a1, 1
    127    sltu t2, t1, a2
    128    slli t3, a2, 1
    129    sltu t1, t3, a1
    130    or t1, t1, t2
    131    bnez t1, 3f
    132 
    133    li t1, 0x5556
    134    j 4f
    135 3:
    136    li t1, 0x3334
    137 4:
    138    mul a0, a0, t1
    139    srli a0, a0, 16
    140 5:
    141    jr t0
    142 endfunc
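
        // dc_gen above is an internal helper (note the .variant_cc / jal t0 / jr t0
        // calling convention): it returns in a0 the DC value later broadcast by
        // cfl_pred, i.e. the rounded mean of the `width` pixels above and the
        // `height` pixels to the left of the block. A scalar sketch (our naming;
        // the 0x5556/0x3334 multiplies are 16-bit fixed-point divisions by 3 and 5,
        // covering the 2:1 and 4:1 block shapes where the shift alone cannot divide
        // by width + height):
        //
        //   static unsigned dc_gen_ref(const uint8_t *topleft, int width, int height)
        //   {
        //       unsigned dc = (width + height) >> 1;               // rounding bias
        //       for (int i = 0; i < width; i++)  dc += topleft[1 + i];    // top row
        //       for (int i = 0; i < height; i++) dc += topleft[-(1 + i)]; // left column
        //       dc >>= __builtin_ctz(width + height);  // shift out the power-of-two factor
        //       if (width != height) {
        //           dc *= (width > 2 * height || height > 2 * width) ? 0x3334 : 0x5556;
        //           dc >>= 16;
        //       }
        //       return dc;
        //   }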
    143 
    144 function dc_gen_top_8bpc_rvv, export=1, ext="v,zbb"
    145    .variant_cc dav1d_dc_gen_top_8bpc_rvv
    146    mv t1, a1
    147    srli t5, a1, 1
    148    addi a0, a0, 1
    149    vsetvli zero, t1, e16, m4, ta, ma
    150    vmv.v.x v0, zero
    151 1:
    152    vsetvli t3, t1, e8, m2, tu, ma
    153    vle8.v v4, (a0)
    154    vwaddu.wv v0, v0, v4
    155    sub t1, t1, t3
    156 
    157    add a0, a0, t3
    158    bnez t1, 1b
    159    j dc_gen_sum_up_8bpc_rvv
    160 endfunc
    161 
    162 function dc_gen_left_8bpc_rvv, export=1, ext="v,zbb"
    163    .variant_cc dav1d_dc_gen_left_8bpc_rvv
    164    mv t1, a1
    165    srli t5, a1, 1
    166    vsetvli t2, t1, e16, m4, ta, ma
    167    vmv.v.x v0, zero
    168 
    169 1:
    170    vsetvli t3, t1, e8, m2, tu, ma
    171    sub a0, a0, t3
    172    vle8.v v4, (a0)
    173    vwaddu.wv v0, v0, v4
    174    sub t1, t1, t3
    175    bnez t1, 1b
    176 
    177    j dc_gen_sum_up_8bpc_rvv
    178 endfunc
    179 
    180 function dc_gen_sum_up_8bpc_rvv, export=1, ext="v,zbb"
    181    .variant_cc dav1d_dc_gen_sum_up_8bpc_rvv
    182    vsetvli zero, a1, e32, m8, ta, ma
    183    vmv.s.x v4, t5
    184    vsetvli zero, zero, e16, m4, ta, ma
    185    vwredsum.vs v8, v0, v4
    186    vsetvli zero, zero, e32, m8, ta, ma
    187    vmv.x.s t5, v8
    188 
    189    ctz t1, a1
    190 
    191    srl a0, t5, t1
    192    jr t0
    193 endfunc
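
        // dc_gen_top and dc_gen_left above sum a single edge and share
        // dc_gen_sum_up for the final reduction and shift, so the result is just
        // the rounded mean of that edge. Sketch of the top-only case (the left-only
        // case is symmetric, reading topleft[-1] .. topleft[-height]):
        //
        //   static unsigned dc_gen_top_ref(const uint8_t *topleft, int width)
        //   {
        //       unsigned dc = width >> 1;                 // rounding bias
        //       for (int i = 0; i < width; i++)
        //           dc += topleft[1 + i];
        //       return dc >> __builtin_ctz(width);        // width is a power of two
        //   }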
    194 
    195 function cfl_pred_8bpc_rvv, export=1, ext="v,zba"
    196    csrw vxrm, zero
    197 1:
    198    li t2, 0
    199    mv t3, a2
    200 2:
    201    vsetvli t0, t3, e16, m2, ta, ma
    202    add t4, a0, t2
    203    vle16.v v0, (a5)
    204    sh1add a5, t0, a5
    205 
    206    vwmul.vx v4, v0, a6
    207    vsetvli zero, zero, e32, m4, ta, mu
    208    vneg.v v8, v4
    209    vmslt.vx v0, v4, x0
    210    vmax.vv v12, v8, v4
    211    vssra.vi v16, v12, 6
    212    vneg.v v16, v16, v0.t
    213    vadd.vx v20, v16, a4
    214    vmax.vx v0, v20, zero
    215    vsetvli zero, zero, e16, m2, ta, ma
    216    vnclipu.wi v4, v0, 0
    217    vsetvli zero, zero, e8, m1, ta, ma
    218    vnclipu.wi v0, v4, 0
    219    vse8.v v0, (t4)
    220    add t2, t0, t2
    221    sub t3, t3, t0
    222    bnez t3, 2b
    223    addi a3, a3, -1
    224    add a0, a0, a1
    225 
    226    bnez a3, 1b
    227    ret
    228 endfunc
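
        // cfl_pred above adds the alpha-scaled AC contribution (16-bit buffer in a5,
        // alpha in a6) to the DC value in a4. A scalar sketch of the per-pixel math
        // (our naming; vssra.vi ..., 6 with vxrm = 0 is a round-to-nearest shift,
        // and the two vnclipu steps clamp to the 8-bit pixel range):
        //
        //   static void cfl_pred_ref(uint8_t *dst, ptrdiff_t stride, int width,
        //                            int height, int dc, const int16_t *ac, int alpha)
        //   {
        //       for (int y = 0; y < height; y++, dst += stride, ac += width)
        //           for (int x = 0; x < width; x++) {
        //               const int diff   = alpha * ac[x];
        //               const int scaled = ((diff < 0 ? -diff : diff) + 32) >> 6;
        //               const int px     = dc + (diff < 0 ? -scaled : scaled);
        //               dst[x] = px < 0 ? 0 : px > 255 ? 255 : px;  // clamp to 0..255
        //           }
        //   }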
    229 
    230 function ipred_cfl_8bpc_rvv, export=1, ext=v
    231    mv t6, a0 # dst
    232    mv a0, a2 # topleft
    233    mv t4, a1 # stride
    234    mv a1, a3 # width
    235    mv a2, a4 # height
    236    jal t0, dc_gen_8bpc_rvv
    237    mv a2, a3 # width
    238    mv a3, a4 # height
    239    mv a4, a0 # dc from dc_gen
    240    mv a0, t6 # dst
    241    mv a1, t4 # stride
    242    j cfl_pred_8bpc_rvv
    243 endfunc
    244 
    245 function ipred_cfl_128_8bpc_rvv, export=1, ext="v,zba"
    246    # dc = 128, then just rearrange registers
    247    mv a2, a3
    248    mv a3, a4
    249    li a4, 128
    250 
    251    j cfl_pred_8bpc_rvv
    252 endfunc
    253 
    254 function ipred_cfl_top_8bpc_rvv, export=1, ext=v
    255    mv t6, a0 # dst
    256    mv a0, a2 # topleft
    257    mv t4, a1 # stride
    258    mv a1, a3 # width
    259    jal t0, dc_gen_top_8bpc_rvv
    260    mv a3, a4 # height
    261    mv a4, a0 # dc from dc_gen_top
    262    mv a0, t6 # dst
    263    mv a2, a1 # width
    264    mv a1, t4 # stride
    265    j cfl_pred_8bpc_rvv
    266 endfunc
    267 
    268 function ipred_cfl_left_8bpc_rvv, export=1, ext="v,zba"
    269    mv t6, a0 # dst
    270    mv a0, a2 # topleft
    271    mv t4, a1 # stride
    272    mv a1, a4 # height
    273    mv a2, a3 # width
    274    jal t0, dc_gen_left_8bpc_rvv
    275    mv a3, a4 # height
    276    mv a4, a0 # dc from dc_gen_left
    277    mv a1, t4 # stride
    278    mv a0, t6 # dst
    279    j cfl_pred_8bpc_rvv
    280 endfunc
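
        // The four ipred_cfl_* entry points above differ only in how the DC value
        // handed to cfl_pred is produced: dc_gen (top and left edges), dc_gen_top,
        // dc_gen_left, or the constant 128 (mid-grey for 8 bpc). In terms of the
        // reference sketches earlier in this file:
        //
        //   static void ipred_cfl_ref(uint8_t *dst, ptrdiff_t stride,
        //                             const uint8_t *topleft, int width, int height,
        //                             const int16_t *ac, int alpha)
        //   {
        //       const int dc = dc_gen_ref(topleft, width, height);
        //       cfl_pred_ref(dst, stride, width, height, dc, ac, alpha);
        //   }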
    281 
    282 function ipred_paeth_8bpc_rvv, export=1, ext="v,zba"
    283    csrw vxrm, zero
    284    li t0, 0
    285    mv t3, a2
    286    lbu t1, (a2)
    287    addi a6, a2, -1
    288    addi a2, a2, 1
    289 1:
    290    lbu t2, (a6)
    291    mv t3, a3
    292 2:
    293    sub t5, a3, t3
    294    add t5, a2, t5
    295    vsetvli t6, t3, e8, m1, ta, ma
    296    vle8.v v2, (t5)
    297    vwaddu.vx v4, v2, t2
    298    vsetvli zero, zero, e16, m2, ta, ma
    299    vwsub.vx v8, v4, t1
    300 
    301    vsetvli zero, zero, e32, m4, ta, mu
    302    vzext.vf4 v24, v2
    303    vsub.vx v12, v8, t1
    304    vmslt.vx v0, v12, zero
    305    vneg.v v12, v12, v0.t
    306    vsub.vx v16, v8, t2
    307    vmslt.vx v0, v16, zero
    308    vneg.v v16, v16, v0.t
    309    vsub.vv v20, v8, v24
    310    vmslt.vx v0, v20, zero
    311    vneg.v v20, v20, v0.t
    312 
    313    sub t5, a3, t3
    314    vmsleu.vv v4, v16, v20
    315    vmsleu.vv v5, v16, v12
    316    vmsgtu.vv v0, v20, v12
    317    vmand.mm v6, v4, v5
    318 
    319    vsetvli zero, zero, e8, m1, ta, ma
    320    vmerge.vxm v8, v2, t1, v0
    321    vmmv.m v0, v6
    322    add t5, a0, t5
    323    sub t3, t3, t6
    324    vmerge.vxm v4, v8, t2, v0
    325 
    326    vse8.v v4, (t5)
    327 
    328    bnez t3, 2b
    329 
    330    addi a4, a4, -1
    331    addi a6, a6, -1
    332    add a0, a0, a1
    333    bnez a4, 1b
    334    ret
    335 endfunc
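
        // Paeth prediction: for each pixel, pick whichever of left, top and top-left
        // lies closest to base = left + top - topleft, resolving ties in that order.
        // A scalar sketch of the selection built above with the mask registers
        // (our naming, assuming <stdlib.h> abs):
        //
        //   static void ipred_paeth_ref(uint8_t *dst, ptrdiff_t stride,
        //                               const uint8_t *tl, int width, int height)
        //   {
        //       const int topleft = tl[0];
        //       for (int y = 0; y < height; y++, dst += stride) {
        //           const int left = tl[-(y + 1)];
        //           for (int x = 0; x < width; x++) {
        //               const int top  = tl[1 + x];
        //               const int base = left + top - topleft;
        //               const int ld  = abs(left - base);     // == abs(top - topleft)
        //               const int td  = abs(top - base);      // == abs(left - topleft)
        //               const int tld = abs(topleft - base);
        //               dst[x] = ld <= td && ld <= tld ? left
        //                      : td <= tld             ? top : topleft;
        //           }
        //       }
        //   }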
    336 
    337 function ipred_smooth_8bpc_rvv, export=1, ext="v,zba"
    338    csrw vxrm, zero
    339    la t0, dav1d_sm_weights
    340    add t1, t0, a3
    341    add t2, a2, a3
    342    add t0, t0, a4
    343    lbu t2, (t2)
    344    sub t3, a2, a4
    345    addi a6, a2, -1
    346    addi a2, a2, 1
    347    lbu t3, (t3)
    348 1:
    349    mv t6, a3
    350 
    351    lbu a7, (a6)
    352    lbu t4, (t0)
    353 2:
    354    li a5, 256
    355    vsetvli t5, t6, e8, m1, ta, ma
    356    vle8.v v2, (t1)
    357    add t1, t1, t5
    358    vle8.v v4, (a2)
    359    add a2, a2, t5
    360    sub a5, a5, t4
    361 
    362    vwmulu.vx v8, v4, t4
    363    vsetvli zero, zero, e16, m2, ta, ma
    364    mul a5, a5, t3
    365 
    366    vadd.vx v4, v8, a5
    367    vsetvli zero, zero, e8, m1, ta, ma
    368    vwmulu.vx v8, v2, a7
    369 
    370    vneg.v v12, v2
    371    vwmaccu.vx v8, t2, v12
    372    vsetvli zero, zero, e16, m2, ta, ma
    373    vwaddu.vv v12, v4, v8
    374 
    375    sub a5, a3, t6
    376    sub t6, t6, t5
    377    add a5, a5, a0
    378    vnclipu.wi v2, v12, 9
    379    vsetvli zero, zero, e8, m1, ta, ma
    380    vnclipu.wi v0, v2, 0
    381    vse8.v v0, (a5)
    382 
    383    bnez t6, 2b
    384 
    385    sub t1, t1, a3
    386    add a0, a0, a1
    387    sub a2, a2, a3
    388    addi a4, a4, -1
    389    addi t0, t0, 1
    390    addi a6, a6, -1
    391    bnez a4, 1b
    392 
    393    ret
    394 endfunc
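
        // Smooth prediction blends each top pixel towards the bottom-left pixel and
        // each left pixel towards the top-right pixel using the dav1d_sm_weights
        // table. A scalar sketch of the blend above (our naming; the rounding
        // vnclipu.wi by 9 with vxrm = 0 computes (pred + 256) >> 9):
        //
        //   static void ipred_smooth_ref(uint8_t *dst, ptrdiff_t stride,
        //                                const uint8_t *tl, int width, int height,
        //                                const uint8_t *sm_weights)
        //   {
        //       const uint8_t *const w_hor = &sm_weights[width];
        //       const uint8_t *const w_ver = &sm_weights[height];
        //       const int right = tl[width], bottom = tl[-height];
        //       for (int y = 0; y < height; y++, dst += stride)
        //           for (int x = 0; x < width; x++) {
        //               const int pred = w_ver[y] * tl[1 + x] + (256 - w_ver[y]) * bottom
        //                              + w_hor[x] * tl[-(1 + y)] + (256 - w_hor[x]) * right;
        //               dst[x] = (pred + 256) >> 9;
        //           }
        //   }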
    395 
    396 function ipred_smooth_v_8bpc_rvv, export=1, ext="v,zba"
    397    csrw vxrm, zero
    398    la t0, dav1d_sm_weights
    399    add t2, a2, a3
    400    add t0, t0, a4
    401    sub t3, a2, a4
    402    addi a2, a2, 1
    403    lbu t3, (t3)
    404 1:
    405    mv t6, a3
    406 
    407    lbu t4, (t0)
    408 2:
    409    li a5, 256
    410    vsetvli t5, t6, e8, m1, ta, ma
    411    vle8.v v4, (a2)
    412    add a2, a2, t5
    413    sub a5, a5, t4
    414 
    415    vwmulu.vx v8, v4, t4
    416    vsetvli zero, zero, e16, m2, ta, ma
    417    mul a5, a5, t3
    418    vwaddu.vx v4, v8, a5
    419 
    420    sub a5, a3, t6
    421    sub t6, t6, t5
    422    add a5, a5, a0
    423    vsetvli zero, zero, e16, m2, ta, ma
    424    vnclipu.wi v2, v4, 8
    425    vsetvli zero, zero, e8, m1, ta, ma
    426    vnclipu.wi v0, v2, 0
    427    vse8.v v0, (a5)
    428 
    429    bnez t6, 2b
    430 
    431    add a0, a0, a1
    432    sub a2, a2, a3
    433    addi a4, a4, -1
    434    addi t0, t0, 1
    435    bnez a4, 1b
    436 
    437    ret
    438 endfunc
    439 
    440 function ipred_smooth_h_8bpc_rvv, export=1, ext="v,zba"
    441    csrw vxrm, zero
    442    la t0, dav1d_sm_weights
    443    add t1, t0, a3
    444    add t2, a2, a3
    445    lbu t2, (t2)
    446    addi a6, a2, -1
    447 1:
    448    mv t6, a3
    449 
    450    lbu a7, (a6)
    451 2:
    452    vsetvli t5, t6, e8, m1, ta, ma
    453    vle8.v v2, (t1)
    454    add t1, t1, t5
    455 
    456    vwmulu.vx v8, v2, a7
    457 
    458    vneg.v v12, v2
    459    vwmaccu.vx v8, t2, v12
    460 
    461    sub a5, a3, t6
    462    sub t6, t6, t5
    463    add a5, a5, a0
    464    vsetvli zero, zero, e8, m1, ta, ma
    465    vnclipu.wi v0, v8, 8
    466    vse8.v v0, (a5)
    467 
    468    bnez t6, 2b
    469 
    470    sub t1, t1, a3
    471    add a0, a0, a1
    472    addi a4, a4, -1
    473    addi a6, a6, -1
    474    bnez a4, 1b
    475 
    476    ret
    477 endfunc
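
        // The smooth_v and smooth_h variants above keep only one half of that blend:
        // smooth_v interpolates each top pixel towards the bottom-left pixel with the
        // vertical weights, smooth_h each left pixel towards the top-right pixel with
        // the horizontal weights, and both round with (pred + 128) >> 8 (the
        // vnclipu.wi by 8). Sketch of the vertical case; the horizontal one is symmetric:
        //
        //   static void ipred_smooth_v_ref(uint8_t *dst, ptrdiff_t stride,
        //                                  const uint8_t *tl, int width, int height,
        //                                  const uint8_t *sm_weights)
        //   {
        //       const uint8_t *const w_ver = &sm_weights[height];
        //       const int bottom = tl[-height];
        //       for (int y = 0; y < height; y++, dst += stride)
        //           for (int x = 0; x < width; x++) {
        //               const int pred = w_ver[y] * tl[1 + x] + (256 - w_ver[y]) * bottom;
        //               dst[x] = (pred + 128) >> 8;
        //           }
        //   }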
    478 
    479 function pal_pred_8bpc_rvv, export=1, ext="v,zba"
    480    csrw vxrm, zero
    481    vsetivli t5, 8, e8, m1, ta, ma
    482    vle8.v v30, (a2)
    483    li t0, 2
    484    srli t1, a4, 1
    485 1:
    486    mv t4, a4
    487 2:
    488    vsetvli t5, t1, e8, m1, ta, ma
    489    vle8.v v0, (a3)
    490    add a3, a3, t5
    491    vsrl.vi v2, v0, 4
    492    sub t6, a4, t4
    493    vand.vi v1, v0, 7
    494    add t6, a0, t6
    495    vrgather.vv v3, v30, v1
    496    addi t2, t6, 1
    497    vrgather.vv v4, v30, v2
    498    slli t5, t5, 1
    499    vsse8.v v3, (t6), t0
    500    sub t4, t4, t5
    501    vsse8.v v4, (t2), t0
    502 
    503    bnez t4, 2b
    504    addi a5, a5, -1
    505    add a0, a0, a1
    506    bnez a5, 1b
    507    ret
    508 endfunc
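
        // pal_pred expands a palette-coded block: each byte of the index buffer packs
        // two palette indices (low nibble first, masked with 7 since the palette has
        // at most 8 entries), and the strided stores above interleave them back into
        // one output row. A scalar sketch (our naming, assuming an even width):
        //
        //   static void pal_pred_ref(uint8_t *dst, ptrdiff_t stride,
        //                            const uint8_t pal[8], const uint8_t *idx,
        //                            int width, int height)
        //   {
        //       for (int y = 0; y < height; y++, dst += stride)
        //           for (int x = 0; x < width; x += 2) {
        //               const uint8_t packed = *idx++;
        //               dst[x + 0] = pal[packed & 7];   // low nibble -> even pixel
        //               dst[x + 1] = pal[packed >> 4];  // high nibble -> odd pixel
        //           }
        //   }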