tor-browser: The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

ipred16.S (10065B)


/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

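# dc_gen: compute the DC value for a block from both its top row and left
# column. Non-standard calling convention (hence .variant_cc): called with
# "jal t0, ..." and returns through t0; per the ipred_cfl wrapper below,
# a0 = topleft pointer, a1 = width, a2 = height, and the DC value comes
# back in a0. The sum is rounded by (w+h)/2 (prepared in t5) and shifted
# right by ctz(w+h); rectangular blocks then need a further division by 3
# (2:1 aspect ratio) or by 5 (4:1), done as a fixed-point multiply:
# 0xAAAB >> 17 ~= 1/3 and 0x6667 >> 17 ~= 1/5.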
function dc_gen_16bpc_rvv, export=1, ext="v,zba,zbb"
    .variant_cc dav1d_dc_gen_16bpc_rvv
    add t1, a1, a2
    srli t5, t1, 1
    mv t1, a1
    addi t2, a0, 2
    vsetvli zero, t1, e32, m8, ta, ma
    vmv.v.x v0, zero
1:
    vsetvli t3, t1, e16, m4, tu, ma
    vle16.v v8, (t2)
    vwaddu.wv v0, v0, v8
    sub t1, t1, t3

    sh1add t2, t3, t2
    bnez t1, 1b

    mv t1, a2
    mv t2, a0
    vsetvli zero, t1, e32, m8, ta, ma
    vmv.v.x v16, zero
2:
    vsetvli t3, t1, e16, m4, tu, ma
    sub t1, t1, t3
    sll t3, t3, 1
    sub t2, t2, t3
    vle16.v v8, (t2)
    vwaddu.wv v16, v16, v8

    bnez t1, 2b

    vsetvli zero, a1, e32, m8, ta, ma
    vmv.s.x v24, t5
    vmv.s.x v25, zero
    vredsum.vs v8, v0, v24
    vsetvli zero, a2, e32, m8, ta, ma
    vredsum.vs v0, v16, v25
    vmv.x.s t5, v8
    vmv.x.s t1, v0
    add t5, t5, t1

    add t1, a1, a2
    ctz t1, t1

    srl a0, t5, t1

    beq a1, a2, 5f
    slli t1, a1, 1
    sltu t2, t1, a2
    slli t3, a2, 1
    sltu t1, t3, a1
    or t1, t1, t2
    bnez t1, 3f

    li t1, 0xAAAB
    j 4f
3:
    li t1, 0x6667
4:
    mul a0, a0, t1
    li t1, 17
    srl a0, a0, t1
5:
    jr t0
endfunc

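# dc_gen_top: DC from the top row only (a0 = topleft pointer, a1 = width).
# Accumulates the row into v0, with t5 = width/2 as the rounding term, and
# tail-calls dc_gen_sum_up for the reduction.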
function dc_gen_top_16bpc_rvv, export=1, ext="v,zba,zbb"
    .variant_cc dav1d_dc_gen_top_16bpc_rvv
    mv t1, a1
    srli t5, a1, 1
    addi a0, a0, 2
    vsetvli zero, t1, e32, m2, ta, ma
    vmv.v.x v0, zero
1:
    vsetvli t3, t1, e16, m1, tu, ma
    vle16.v v4, (a0)
    vwaddu.wv v0, v0, v4

    sh1add a0, t3, a0
    sub t1, t1, t3
    bnez t1, 1b

    j dc_gen_sum_up_16bpc_rvv
endfunc

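# dc_gen_left: DC from the left column only (a0 = topleft pointer,
# a1 = height). The left neighbours sit below a0 in memory, so the pointer
# is walked downwards before each load.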
function dc_gen_left_16bpc_rvv, export=1, ext="v,zba,zbb"
    .variant_cc dav1d_dc_gen_left_16bpc_rvv
    mv t1, a1
    srli t5, a1, 1
    vsetvli zero, t1, e32, m2, ta, ma
    vmv.v.x v0, zero
1:
    vsetvli t3, t1, e16, m1, tu, ma
    sub t1, t1, t3
    slli t3, t3, 1
    sub a0, a0, t3
    vle16.v v4, (a0)
    vwaddu.wv v0, v0, v4

    bnez t1, 1b

    j dc_gen_sum_up_16bpc_rvv
endfunc

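# dc_gen_sum_up: shared tail of dc_gen_top/dc_gen_left. Reduces the
# partial sums in v0, adds the rounding term prepared in t5 and shifts by
# ctz(a1) to form the average. Returns through t0.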
function dc_gen_sum_up_16bpc_rvv, export=1, ext="v,zba,zbb"
    .variant_cc dav1d_dc_gen_sum_up_16bpc_rvv

    vsetvli zero, a1, e32, m2, ta, ma
    vmv.s.x v4, t5
    vredsum.vs v8, v0, v4
    vmv.x.s t5, v8

    ctz t1, a1

    srl a0, t5, t1
    jr t0
endfunc

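# cfl_pred: chroma-from-luma prediction. Register roles, as set up by the
# ipred_cfl_* entry points below: a0 = dst, a1 = stride, a2 = width,
# a3 = height, a4 = dc, a5 = 16-bit ac buffer, a6 = alpha,
# a7 = bitdepth_max. Each output pixel is
#   clip(dc + apply_sign((abs(alpha*ac) + 32) >> 6, alpha*ac), 0, bitdepth_max)
# with the rounded shift done by vssra (vxrm = round-to-nearest-up) on the
# absolute value and the sign restored by a masked negate.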
function cfl_pred_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
1:
    li t2, 0
    mv t3, a2
2:
    vsetvli t0, t3, e16, m2, ta, ma
    sh1add t4, t2, a0
    vle16.v v0, (a5)
    sh1add a5, t0, a5

    vwmul.vx v4, v0, a6
    vsetvli zero, zero, e32, m4, ta, mu
    vneg.v v8, v4
    vmslt.vx v0, v4, x0
    vmax.vv v12, v8, v4
    vssra.vi v16, v12, 6
    vneg.v v16, v16, v0.t
    vadd.vx v20, v16, a4
    vmax.vx v0, v20, zero
    vmin.vx v0, v0, a7
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v4, v0, 0
    vse16.v v4, (t4)
    add t2, t0, t2
    sub t3, t3, t0
    bnez t3, 2b
    addi a3, a3, -1
    add a0, a0, a1

    bnez a3, 1b
    ret
endfunc

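# ipred_cfl: full DC (top row + left column) followed by CFL. Shuffles the
# public ipred arguments into the dc_gen convention, then tail-calls
# cfl_pred with the computed dc in a4.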
function ipred_cfl_16bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a3 # width
    mv a2, a4 # height
    jal t0, dc_gen_16bpc_rvv
    mv a2, a3 # width
    mv a3, a4 # height
    mv a4, a0 # dc
    mv a0, t6 # dst
    mv a1, t4 # stride
    j cfl_pred_16bpc_rvv
endfunc

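# ipred_cfl_128: CFL around a constant mid-grey DC; no neighbour pixels
# are read.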
function ipred_cfl_128_16bpc_rvv, export=1, ext="v,zba"
    # dc = (bitdepth_max + 1) >> 1, then just rearrange registers
    mv a2, a3
    mv a3, a4
    addi a4, a7, 1
    srli a4, a4, 1

    j cfl_pred_16bpc_rvv
endfunc

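# ipred_cfl_top: CFL with the DC derived from the top row only.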
function ipred_cfl_top_16bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a3 # width
    jal t0, dc_gen_top_16bpc_rvv
    mv a3, a4 # height
    mv a4, a0 # dc
    mv a0, t6 # dst
    mv a2, a1 # width
    mv a1, t4 # stride
    j cfl_pred_16bpc_rvv
endfunc

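# ipred_cfl_left: CFL with the DC derived from the left column only.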
function ipred_cfl_left_16bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a4 # height
    mv a2, a3 # width
    jal t0, dc_gen_left_16bpc_rvv
    mv a3, a4 # height
    mv a4, a0 # dc
    mv a1, t4 # stride
    mv a0, t6 # dst
    j cfl_pred_16bpc_rvv
endfunc

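# ipred_paeth: Paeth prediction. For each pixel, base = left + top -
# topleft, and whichever of left/top/topleft is closest to base wins
# (ties prefer left, then top). t1 = topleft, t2 = the current row's left
# neighbour, v2 = a chunk of the top row; v12/v16/v20 hold
# |base - topleft|, |base - left| and |base - top|, and two vmerge passes
# pick the winner.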
function ipred_paeth_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    li t0, 0
    mv t3, a2
    lhu t1, (a2)
    addi a6, a2, -2
    addi a2, a2, 2
1:
    lhu t2, (a6)
    mv t3, a3
2:
    sub t5, a3, t3
    sh1add t5, t5, a2
    vsetvli t6, t3, e16, m2, ta, ma
    vle16.v v2, (t5)
    vwaddu.vx v4, v2, t2

    vsetvli zero, zero, e32, m4, ta, mu
    vsub.vx v8, v4, t1
    vzext.vf2 v24, v2
    vsub.vx v12, v8, t1
    vmslt.vx v0, v12, zero
    vneg.v v12, v12, v0.t
    vsub.vx v16, v8, t2
    vmslt.vx v0, v16, zero
    vneg.v v16, v16, v0.t
    vsub.vv v20, v8, v24
    vmslt.vx v0, v20, zero
    vneg.v v20, v20, v0.t

    sub t5, a3, t3
    vmsleu.vv v4, v16, v20
    vmsleu.vv v5, v16, v12
    vmsgtu.vv v0, v20, v12
    vmand.mm v6, v4, v5

    vsetvli zero, zero, e16, m2, ta, ma
    vmerge.vxm v8, v2, t1, v0
    vmmv.m v0, v6
    sh1add t5, t5, a0
    sub t3, t3, t6
    vmerge.vxm v4, v8, t2, v0

    vse16.v v4, (t5)

    bnez t3, 2b

    addi a4, a4, -1
    addi a6, a6, -2
    add a0, a0, a1
    bnez a4, 1b
    ret
endfunc

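# ipred_smooth: two-direction smooth prediction. top[x] is blended against
# bottom = left[h-1] with the vertical weight w_v[y], and left[y] against
# right = top[w-1] with the horizontal weight w_h[x] (both weights from
# dav1d_sm_weights, each paired with its complement to 256):
#   pred = (w_v[y]*top[x] + (256 - w_v[y])*bottom
#         + w_h[x]*left[y] + (256 - w_h[x])*right + 256) >> 9
# where the rounded narrowing shift is done by vnclipu with vxrm set to
# round-to-nearest-up.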
function ipred_smooth_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
    add t1, t0, a3
    sh1add t2, a3, a2
    slli t3, a4, 1
    add t0, t0, a4
    lhu t2, (t2)
    sub t3, a2, t3
    addi a6, a2, -2
    addi a2, a2, 2
    lhu t3, (t3)
1:
    mv t6, a3

    lhu a7, (a6)
    lbu t4, (t0)
2:
    li a5, 256
    vsetvli t5, t6, e16, m2, ta, ma
    vle8.v v2, (t1)
    add t1, t1, t5
    vle16.v v4, (a2)
    sh1add a2, t5, a2
    sub a5, a5, t4

    vwmul.vx v8, v4, t4
    mul a5, a5, t3

    vsetvli zero, zero, e32, m4, ta, ma
    vadd.vx v4, v8, a5

    li a5, 256
    vzext.vf4 v12, v2
    vmul.vx v8, v12, a7

    vrsub.vx v12, v12, a5
    vmacc.vx v8, t2, v12
    vadd.vv v12, v4, v8

    sub a5, a3, t6
    sub t6, t6, t5
    sh1add a5, a5, a0
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v2, v12, 9
    vse16.v v2, (a5)

    bnez t6, 2b

    sub t1, t1, a3
    slli t6, a3, 1
    add a0, a0, a1
    sub a2, a2, t6
    addi a4, a4, -1
    addi t0, t0, 1
    addi a6, a6, -2
    bnez a4, 1b

    ret
endfunc

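# ipred_smooth_v: vertical-only smooth prediction:
#   pred = (w_v[y]*top[x] + (256 - w_v[y])*bottom + 128) >> 8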
function ipred_smooth_v_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
    slli t3, a4, 1
    add t0, t0, a4
    sub t3, a2, t3
    addi a2, a2, 2
    lhu t3, (t3)
1:
    mv t6, a3

    lbu t4, (t0)
2:
    li a5, 256
    vsetvli t5, t6, e16, m2, ta, ma
    vle16.v v4, (a2)
    sh1add a2, t5, a2
    sub a5, a5, t4

    vwmul.vx v8, v4, t4
    mul a5, a5, t3

    vsetvli zero, zero, e32, m4, ta, ma
    vadd.vx v4, v8, a5

    sub a5, a3, t6
    sub t6, t6, t5
    sh1add a5, a5, a0
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v2, v4, 8
    vse16.v v2, (a5)

    bnez t6, 2b

    slli t6, a3, 1
    add a0, a0, a1
    sub a2, a2, t6
    addi a4, a4, -1
    addi t0, t0, 1
    bnez a4, 1b

    ret
endfunc

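# ipred_smooth_h: horizontal-only smooth prediction:
#   pred = (w_h[x]*left[y] + (256 - w_h[x])*right + 128) >> 8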
function ipred_smooth_h_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
    add t1, t0, a3
    sh1add t2, a3, a2
    lhu t2, (t2)
    addi a6, a2, -2
1:
    mv t6, a3

    lhu a7, (a6)
2:
    vsetvli t5, t6, e16, m2, ta, ma
    vle8.v v2, (t1)
    add t1, t1, t5

    li a5, 256
    vsetvli zero, zero, e32, m4, ta, ma
    vzext.vf4 v12, v2
    vmul.vx v8, v12, a7

    vrsub.vx v12, v12, a5
    vmacc.vx v8, t2, v12

    sub a5, a3, t6
    sub t6, t6, t5
    sh1add a5, a5, a0
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v2, v8, 8
    vse16.v v2, (a5)

    bnez t6, 2b

    sub t1, t1, a3
    add a0, a0, a1
    addi a4, a4, -1
    addi a6, a6, -2
    bnez a4, 1b

    ret
endfunc

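# pal_pred: palette prediction. a0 = dst, a1 = stride, a2 = palette
# (8 x u16, kept in v30), a3 = packed palette indices (two 4-bit indices
# per byte), a4 = width, a5 = height. Each index byte is split into its
# low and high nibble, widened to 16 bits (vwmul by 1), used as vrgather
# indices into the palette, and the two results are interleaved back into
# the row with strided stores (4-byte stride = every other 16-bit pixel).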
function pal_pred_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    vsetivli t5, 8, e16, m1, ta, ma
    vle16.v v30, (a2)
    li t0, 4
    srli t1, a4, 1
    li t2, 1
1:
    mv t4, a4
2:
    vsetvli t5, t1, e8, mf2, ta, ma
    vle8.v v0, (a3)
    add a3, a3, t5
    vand.vi v1, v0, 7
    sub t6, a4, t4
    vsrl.vi v2, v0, 4
    vwmul.vx v4, v1, t2
    vwmul.vx v6, v2, t2
    vsetvli zero, zero, e16, m1, ta, ma
    sh1add t6, t6, a0
    vrgather.vv v8, v30, v4
    addi t3, t6, 2
    vrgather.vv v10, v30, v6
    slli t5, t5, 1
    vsse16.v v8, (t6), t0
    vsse16.v v10, (t3), t0

    sub t4, t4, t5
    bnez t4, 2b
    add a0, a0, a1
    addi a5, a5, -1
    bnez a5, 1b
    ret
endfunc