tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef.S (13264B)


      1 /******************************************************************************
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2024, Bogdan Gligorijevic
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 *****************************************************************************/
     27 
     28 #include "src/riscv/asm.S"
     29 
# constrain_vectors \vec1, \vec2, \vec_sub, \strength, \shift, \vec_tmp1, \vec_tmp2
# Apply CDEF's constrain() to two signed difference vectors in place:
#   out = sign(diff) * min(|diff|, max(0, strength - (|diff| >> shift)))
# In/out: \vec_tmp1, \vec_tmp2 = difference vectors (current SEW/LMUL).
# Scratch: \vec1, \vec2. \strength and \shift are scalar (x-) registers.
# Clobbers the mask registers v0 and v1; caller must not hold live data there.
# NOTE(review): \vec_sub is never referenced in this body — it appears to be
# kept only for call-site symmetry; confirm before removing the parameter.
      30 .macro constrain_vectors vec1, vec2, vec_sub, strength, shift, vec_tmp1, vec_tmp2
      31    vmslt.vx v0, \vec_tmp1, zero # v0 = sign mask of diff1
      32    vneg.v \vec_tmp1, \vec_tmp1, v0.t # diff1 = |diff1|
      33    vmmv.m v1, v0 # stash diff1's sign mask in v1
      34 
      35    vmslt.vx v0, \vec_tmp2, zero # v0 = sign mask of diff2
      36    vneg.v \vec_tmp2, \vec_tmp2, v0.t # diff2 = |diff2|
      37 
# threshold = max(0, strength - (|diff| >> shift))
      38    vsra.vx \vec1, \vec_tmp1, \shift
      39    vsra.vx \vec2, \vec_tmp2, \shift
      40 
      41    vrsub.vx \vec1, \vec1, \strength
      42    vrsub.vx \vec2, \vec2, \strength
      43 
      44    vmax.vx \vec1, \vec1, zero
      45    vmax.vx \vec2, \vec2, zero
      46 
# clamp each magnitude to its threshold
      47    vmin.vv \vec_tmp1, \vec1, \vec_tmp1
      48    vmin.vv \vec_tmp2, \vec2, \vec_tmp2
      49 
# re-apply the original signs (v0 still holds diff2's mask)
      50    vneg.v \vec_tmp2, \vec_tmp2, v0.t
      51 
      52    vmmv.m v0, v1 # restore diff1's sign mask
      53    vneg.v \vec_tmp1, \vec_tmp1, v0.t
      54 .endm
     55 
# padding_fn \w, \h: copy the \w x \h source block into a 16-bit temporary
# buffer with a 2-pixel border on each side. Border pixels come from the
# left/top/bottom buffers when the corresponding edge flag is present,
# otherwise they are filled with INT16_MIN as the "invalid pixel" sentinel.
# In:  a0 = tmp (pointing at pixel (0,0) inside the padded buffer),
#      a1 = tmp_stride (in elements), a2 = src, a3 = src_stride,
#      a4 = left, a5 = top, a6 = bottom,
#      a7 = edges bitmask: 1 = LEFT, 2 = RIGHT, 4 = TOP, 8 = BOTTOM available
#      (NOTE(review): bit meanings inferred from the andi masks below and
#      dav1d's CDEF_HAVE_* flags — confirm against src/cdef.h).
# On exit: t0 = x_start, t1 = x_end, t2 = y_start(consumed), t3 = y_end;
#          a2 is rewound to the first src row; a0 is restored to pixel (0,0).
# Clobbers t0-t6, a4-a6, v0, v2 (and v1/v3 at m2).
      56 .macro padding_fn w, h
      57    li t5, -32768 # INT16_MIN
      58 
      59    andi t4, a7, 4
      60    li t2, -2 # y_start
      61 
      62 .if \w == 4
      63    vsetivli zero, \w + 4, e16, m1, ta, ma
      64 .else
      65    vsetivli zero, \w + 4, e16, m2, ta, ma
      66 .endif
      67    vmv.v.x v0, t5 # v0 = a full row of INT16_MIN sentinels
      68    bnez t4, L(top_done_\w\()x\h) # have top: rows copied later instead
      69 
# no top edge: fill rows y = -2 and -1 (w+4 elements each) with the sentinel
      70    slli t5, a1, 1
      71    addi t5, t5, 2
      72    slli t5, t5, 1
      73    sub t5, a0, t5 # t5 = &tmp[-2 * tmp_stride - 2]
      74 
      75    sh1add t4, a1, t5 # t4 = row below (y = -1)
      76    vse16.v v0, (t5)
      77    vse16.v v0, (t4)
      78    li t2, 0 # y_start = 0: nothing to copy from `top`
      79 
      80 L(top_done_\w\()x\h):
      81    andi t4, a7, 8
      82    li t3, 2 + \h # y_end
      83    bnez t4, L(bottom_done_\w\()x\h) # have bottom: rows copied later
      84 
# no bottom edge: fill rows y = h and h+1 with the sentinel
      85    li t5, \h
      86    mul t5, a1, t5
      87    addi t5, t5, -2
      88    sh1add t5, t5, a0 # t5 = &tmp[h * tmp_stride - 2]
      89 
      90    sh1add t4, a1, t5
      91    vse16.v v0, (t5)
      92    vse16.v v0, (t4)
      93    addi t3, t3, -2 # y_end = h: nothing to copy from `bottom`
      94 
      95 L(bottom_done_\w\()x\h):
      96    andi t4, a7, 1
      97    li t0, -2 # x_start
      98 
# vl = 2 for the narrow left/right column stores
      99 .if \w == 4
     100    vsetivli zero, 2, e16, m1, ta, ma
     101 .else
     102    vsetivli zero, 2, e16, m2, ta, ma
     103 .endif
     104 
     105    bnez t4, L(left_done_\w\()x\h) # have left: columns copied later
     106 
# no left edge: fill the 2-wide column x = -2..-1 for rows y_start..y_end-1
     107    mul t5, a1, t2
     108    addi t5, t5, -2
     109    sh1add t5, t5, a0 # t5 = &tmp[y_start * tmp_stride - 2]
     110 
     111    sub t0, t3, t2 # loop count = y_end - y_start
     112 
     113 3:
     114    vse16.v v0, (t5)
     115    sh1add t5, a1, t5
     116    addi t0, t0, -1
     117    bnez t0, 3b # t0 falls to 0 => x_start = 0
     118 
     119 L(left_done_\w\()x\h):
     120 
     121    andi t4, a7, 2
     122    li t1, 2 + \w # x_end
     123    bnez t4, L(right_done_\w\()x\h) # have right: pixels come with src rows
     124 
# no right edge: fill the 2-wide column x = w..w+1 for rows y_start..y_end-1
     125    mul t5, t2, a1
     126    addi t5, t5, \w
     127    sh1add t5, t5, a0 # t5 = &tmp[y_start * tmp_stride + w]
     128 
     129    sub t1, t3, t2 # loop count = y_end - y_start
     130 
     131 4:
     132    vse16.v v0, (t5)
     133    sh1add t5, a1, t5
     134    addi t1, t1, -1
     135    bnez t1, 4b
     136 
     137    li t1, \w # x_end = w
     138 
     139 L(right_done_\w\()x\h):
     140 
# copy the two rows above the block from `top` (only when y_start = -2)
     141    beqz t2, L(top_skip_\w\()x\h)
     142 
     143    mul t5, a1, t2
     144    add t5, t0, t5
     145    sh1add a0, t5, a0 # tmp += y_start * tmp_stride + x_start
     146    add a5, a5, t0 # top += x_start
     147 
     148    sub t5, t1, t0 # x_end - x_start
     149    slli t6, t0, 1 # byte offset of x_start, undone after the loop
     150 .if \w == 4
     151    vsetvli zero, t5, e16, m1, ta, ma
     152 .else
     153    vsetvli zero, t5, e16, m2, ta, ma
     154 .endif
     155 
     156 5:
     157    vle8.v v0, (a5)
     158    addi t2, t2, 1 # y_start counts up to 0
     159    vzext.vf2 v2, v0 # widen u8 -> u16
     160    add a5, a3, a5
     161    vse16.v v2, (a0)
     162    sh1add a0, a1, a0
     163    bnez t2, 5b
     164 
     165    sub a0, a0, t6 # tmp -= x_start
     166 
L(top_skip_\w\()x\h):
     167 L(top_skip_\w\()x\h):
     168 
     169    li a5, \h
     170    beqz t0, L(left_skip_\w\()x\h)
     171 
# copy the 2-wide left border, one row at a time, from `left` (2 bytes/row)
     172    sh1add a0, t0, a0 # tmp += x_start
     173 
     174 7:
     175 .if \w == 4
     176    vsetivli zero, 2, e16, m1, ta, ma
     177 .else
     178    vsetivli zero, 2, e16, m2, ta, ma
     179 .endif
     180 
     181    vle8.v v0, (a4)
     182    addi a5, a5, -1
     183    vzext.vf2 v2, v0
     184    addi a4, a4, 2
     185    vse16.v v2, (a0)
     186    sh1add a0, a1, a0
     187    bnez a5, 7b
     188 
     189    li a5, \h
     190    mul t5, a1, a5
     191    add t5, t5, t0
     192    slli t5, t5, 1
     193    sub a0, a0, t5 # tmp -= h * tmp_stride + x_start
     194 
     195 L(left_skip_\w\()x\h):
     196 
# copy the h source rows; x_end pixels per row covers the right border too
# when the right edge is present
     197 8:
     198 .if \w == 4
     199    vsetvli zero, t1, e16, m1, ta, ma
     200 .else
     201    vsetvli zero, t1, e16, m2, ta, ma
     202 .endif
     203 
     204    vle8.v v0, (a2)
     205    vzext.vf2 v2, v0
     206    vse16.v v2, (a0)
     207    add a2, a3, a2
     208    sh1add a0, a1, a0
     209    addi a5, a5, -1
     210    bnez a5, 8b
     211 
     212 
# copy the two rows below the block from `bottom` (only when y_end = h + 2)
     213    li a5, \h
     214    sh1add a0, t0, a0 # tmp += x_start
     215    add a6, a6, t0 # bottom += x_start
     216    beq a5, t3, L(bottom_skip_\w\()x\h)
     217 
     218    sub t5, t1, t0 # x_end - x_start
     219 .if \w == 4
     220    vsetvli zero, t5, e16, m1, ta, ma
     221 .else
     222    vsetvli zero, t5, e16, m2, ta, ma
     223 .endif
     224 
     225 9:
     226    vle8.v v0, (a6)
     227    add a6, a3, a6
     228    vzext.vf2 v2, v0
     229    addi a5, a5, 1 # counts h .. y_end-1
     230    vse16.v v2, (a0)
     231    sh1add a0, a1, a0
     232    bne a5, t3, 9b
     233 
     234 L(bottom_skip_\w\()x\h):
     235    li t6, \h
     236    mul t6, a3, t6
     237    sub a2, a2, t6 # src -= h * src_stride
     238    mul t5, a1, t3
     239    add t5, t5, t0
     240    slli t5, t5, 1
     241    sub a0, a0, t5 # tmp -= y_end * tmp_stride + x_start
     242 .endm
    243 
    244 
# cdef_fn \w, \h: define cdef_filter_block_\w()x\h()_8bpc_rvv.
# dav1d CDEF filter entry point; C-side signature (see dav1d src/cdef.h):
#   void fn(pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2],
#           const pixel *top, const pixel *bottom, int pri_strength,
#           int sec_strength, int dir, int damping, enum CdefEdgeFlags edges)
# a0..a7 carry the first eight arguments; damping and edges are loaded from
# the caller's stack below. Requires V, Zba (sh1add) and Zbb (clz).
# Stack frame: 32 bytes of argument spills + 144*2 bytes for a 12x12 int16
# padded pixel buffer (tmp_stride 12 = max block width 8 + 4 border columns).
     245 .macro cdef_fn w, h
     246 function cdef_filter_block_\w\()x\h\()_8bpc_rvv, export=1, ext="v,zba,zbb"
     247    csrw vxrm, zero # fixed-point rounding = round-to-nearest-up (rnu)
     248 
     249    addi sp, sp, -32 - 144*2
     250    sd a5, 24(sp) # pri_strength
     251    sd a6, 16(sp) # sec_strength
     252    sd a7, 8(sp) # dir
     253 
# shuffle arguments into padding_fn's expected registers
     254 
     255    ld a7, 8 + 32 + 144*2(sp) # edges
     256    mv a6, a4 # bottom
     257    mv a5, a3 # top
     258    mv a4, a2 # left
     259    mv a3, a1 # dst_stride
     260    mv a2, a0 # dst
     261    li a1, 12 # tmp_stride
     262    addi a0, sp, 32 + 2*(2*12+2) # tmp, skipping 2 border rows + 2 cols
     263    padding_fn \w, \h
     264 
     265    ld a4, 32 + 2*144(sp) # damping
     266    ld a5, 24(sp) # pri_strength
     267    ld a6, 16(sp) # sec_strength
     268    ld a7, 8(sp) # dir
     269 
# dispatch: primary-only / secondary-only / primary+secondary
     270    beqz a5, cdef_filter_sec_only_\w\()x\h
     271 
     272    bnez a6, cdef_filter_pri_sec_\w\()x\h
     273 
# ---- primary strength only ----
# t4 = primary tap 0 = 4 - (pri_strength & 1); tap 1 derived below
     274    andi t0, a5, 1
     275    li t1, 4
     276    sub t4, t1, t0
     277 
# t1 = damping - ulog2(pri_strength), clamped to >= 0 further down
     278    li t1, 63
     279    clz t2, a5
     280    sub t1, t1, t2
     281    sub t1, a4, t1
     282 
     283    li t0, \h # row counter
     284 
# dav1d_cdef_directions[dir + 2][k]: signed tap offsets in tmp_stride units
     285    la t2, dav1d_cdef_directions
     286    addi t3, a7, 2
     287    sh1add t2, t3, t2
     288 
     289    blt zero, t1, 1f
     290    mv t1, zero
# per-row loop: load the row, accumulate the two mirrored primary taps
1:
     291 1:
     292    vsetivli zero, \w, e16, m1, ta, mu
     293 
     294    lb t3, 0(t2)
     295 
     296    vle8.v v0, (a2)
     297    vzext.vf2 v2, v0 # v2 = current row, u16
     298 
     299    sh1add t6, t3, a0 # tmp + offset
     300    slli t3, t3, 1
     301    sub t3, a0, t3 # tmp - offset (mirrored tap)
     302 
     303    vle16.v v4, (t6)
     304    vle16.v v6, (t3)
     305 
     306    vwsub.vv v8, v4, v2 # widened diffs vs. center pixel
     307    vwsub.vv v16, v6, v2
     308 
     309    vsetvli zero, zero, e32, m2, ta, mu
     310 
     311    constrain_vectors v4, v6, v12, a5, t1, v8, v16
     312 
     313    vmul.vx v28, v16, t4 # sum = tap0 * (c0 + c1)
     314    vmacc.vx v28, t4, v8
     315 
     316    lb t3, 1(t2)
     317 
# t5 = primary tap 1 = (tap0 & 3) | 2
     318    andi t5, t4, 3
     319    ori t5, t5, 2
     320 
     321    sh1add t6, t3, a0
     322    slli t3, t3, 1
     323    sub t3, a0, t3
     324 
     325    vsetvli zero, zero, e16, m1, ta, mu
     326 
     327    vle16.v v4, (t6)
     328    vle16.v v6, (t3)
     329 
     330    vwsub.vv v8, v4, v2
     331    vwsub.vv v16, v6, v2
     332 
     333    vsetvli zero, zero, e32, m2, ta, mu
     334 
     335    constrain_vectors v4, v6, v12, a5, t1, v8, v16
     336 
     337    vmacc.vx v28, t5, v16
     338    vmacc.vx v28, t5, v8
     339 
# NOTE(review): with vxrm=rnu, vnclip adds a +8 rounding bias before >> 4;
# the -1 on negative sums appears to reproduce C's (8 + sum - (sum < 0)) >> 4
# — confirm against the reference cdef_filter_block_c.
     340    vmslt.vx v0, v28, zero
     341    vadd.vi v28, v28, -1, v0.t
     342 
     343    vsetvli zero, zero, e16, m1, ta, ma
     344 
     345    vnclip.wi v24, v28, 4 # (sum + bias) >> 4, narrowed to e16
     346 
     347    vadd.vv v28, v2, v24 # center pixel + filtered delta
     348 
     349    vsetvli zero, zero, e8, mf2, ta, ma
     350 
     351    vnclipu.wi v24, v28, 0 # clamp to u8
     352 
     353    vse8.v v24, (a2)
     354 
     355    addi t0, t0, -1
     356    add a2, a2, a3
     357    sh1add a0, a1, a0 # next tmp row
     358 
     359    bnez t0, 1b
     360 
     361    addi sp, sp, 32 + 144*2
     362    ret
     363 
# ---- secondary strength only ----
# two secondary directions (dir + 4 and dir + 0), fixed taps 2 and 1
     364 cdef_filter_sec_only_\w\()x\h:
     365    li t1, 63
     366    clz t2, a6
     367    sub t1, t1, t2
     368    sub t1, a4, t1 # t1 = damping - ulog2(sec_strength)
     369 
     370    li t0, \h
     371 
     372    la t2, dav1d_cdef_directions
     373    addi t3, a7, 4
     374    sh1add t3, t3, t2 # directions[dir + 4]
     375    sh1add t2, a7, t2 # directions[dir + 0]
     376 
     377 2:
     378    vsetivli zero, \w, e16, m1, ta, mu
     379 
     380    lb t4, 0(t3)
     381    lb t5, 0(t2)
     382 
     383    vle8.v v0, (a2)
     384    vzext.vf2 v2, v0
     385 
     386    sh1add t6, t4, a0
     387    slli t4, t4, 1
     388    sub t4, a0, t4
     389 
     390    vle16.v v4, (t6)
     391    vle16.v v6, (t4)
     392 
     393    sh1add t4, t5, a0
     394    slli t5, t5, 1
     395    sub t5, a0, t5
     396 
     397    vle16.v v8, (t4)
     398    vle16.v v10, (t5)
     399 
     400    vwsub.vv v12, v4, v2
     401    vwsub.vv v14, v6, v2
     402    vwsub.vv v16, v8, v2
     403    vwsub.vv v18, v10, v2
     404 
     405    vsetvli zero, zero, e32, m2, ta, mu
     406 
     407    li t4, 2 # secondary tap 0
     408    constrain_vectors v4, v6, v12, a6, t1, v12, v14
     409    constrain_vectors v8, v10, v14, a6, t1, v16, v18
     410 
     411    vmul.vx v28, v18, t4
     412    vmacc.vx v28, t4, v16
     413    vmacc.vx v28, t4, v14
     414    vmacc.vx v28, t4, v12
     415 
# second tap pair of each secondary direction, tap weight 1 (plain adds)
     416 
     417    lb t4, 1(t3)
     418    lb t5, 1(t2)
     419 
     420    sh1add t6, t4, a0
     421    slli t4, t4, 1
     422    sub t4, a0, t4
     423 
     424    vsetvli zero, zero, e16, m1, ta, mu
     425 
     426    vle16.v v4, (t6)
     427    vle16.v v6, (t4)
     428 
     429    sh1add t4, t5, a0
     430    slli t5, t5, 1
     431    sub t5, a0, t5
     432 
     433    vle16.v v8, (t4)
     434    vle16.v v10, (t5)
     435 
     436    vwsub.vv v12, v4, v2
     437    vwsub.vv v14, v6, v2
     438    vwsub.vv v16, v8, v2
     439    vwsub.vv v18, v10, v2
     440 
     441    vsetvli zero, zero, e32, m2, ta, mu
     442 
     443    constrain_vectors v4, v6, v12, a6, t1, v12, v14
     444    constrain_vectors v8, v10, v14, a6, t1, v16, v18
     445 
     446    vadd.vv v4, v28, v12
     447    vadd.vv v28, v4, v14
     448    vadd.vv v4, v28, v16
     449    vadd.vv v28, v4, v18
     450 
# round (see rounding note in the primary-only path) and write the row back
     451    vmslt.vx v0, v28, zero
     452    vadd.vi v28, v28, -1, v0.t
     453 
     454    vsetvli zero, zero, e16, m1, ta, ma
     455 
     456    vnclip.wi v24, v28, 4
     457 
     458    vadd.vv v28, v2, v24
     459 
     460    vsetvli zero, zero, e8, mf2, ta, ma
     461 
     462    vnclipu.wi v24, v28, 0
     463 
     464    vse8.v v24, (a2)
     465 
     466    addi t0, t0, -1
     467    add a2, a2, a3
     468    sh1add a0, a1, a0
     469 
     470    bnez t0, 2b
     471 
     472    addi sp, sp, 32 + 144*2
     473    ret
# ---- primary + secondary ----
# same taps as above, but additionally tracks the running min (v20) and
# max (v24) of the center pixel and all sampled taps, then clamps the
# filtered result into [min, max].
     474 cdef_filter_pri_sec_\w\()x\h:
     475 
     476    li t1, 63
     477    clz t2, a5
     478    clz t3, a6
     479    sub t2, t1, t2
     480    sub t3, t1, t3
     481    sub t1, a4, t2 # t1 = primary shift
     482    sub t2, a4, t3 # t2 = secondary shift
     483 
     484    li t0, \h
     485 
     486    la t3, dav1d_cdef_directions
     487 
     488    blt zero, t1, 3f
     489    mv t1, zero
     490 3:
     491    vsetivli zero, \w, e16, m1, ta, ma
     492 
     493    li t4, 4
     494    andi t6, a5, 1
     495    addi t5, a7, 2
     496    sub t4, t4, t6 # primary tap 0 = 4 - (pri_strength & 1)
     497 
     498    sh1add t5, t5, t3 # directions[dir + 2]
     499 
     500    vle8.v v0, (a2)
     501 
     502    lb t6, 0(t5)
     503 
     504    vzext.vf2 v2, v0
     505 
     506    sh1add a4, t6, a0
     507    slli t6, t6, 1
     508    sub t6, a0, t6
     509 
     510    vle16.v v4, (a4)
     511    vle16.v v6, (t6)
     512 
# seed the clamp range from the center pixel and first tap pair
     513    vminu.vv v20, v4, v2
     514    vmax.vv v24, v4, v2
     515    vminu.vv v20, v6, v20
     516    vmax.vv v24, v6, v24
     517 
     518    vwsub.vv v8, v4, v2
     519    vwsub.vv v16, v6, v2
     520 
     521    vsetvli zero, zero, e32, m2, ta, mu
     522 
     523    constrain_vectors v4, v6, v12, a5, t1, v8, v16
     524 
     525    vmul.vx v28, v16, t4
     526    vmacc.vx v28, t4, v8
     527 
     528    lb t6, 1(t5)
     529 
# primary tap 1 = (tap0 & 3) | 2
     530    andi t4, t4, 3
     531    ori t4, t4, 2
     532 
     533 
     534    sh1add a4, t6, a0
     535    slli t6, t6, 1
     536    sub t6, a0, t6
     537 
     538    vsetvli zero, zero, e16, m1, ta, ma
     539 
     540    vle16.v v4, (a4)
     541    vle16.v v6, (t6)
     542 
     543    vminu.vv v20, v4, v20
     544    vmax.vv v24, v4, v24
     545    vminu.vv v20, v6, v20
     546    vmax.vv v24, v6, v24
     547 
     548    vwsub.vv v8, v4, v2
     549    vwsub.vv v16, v6, v2
     550 
     551    vsetvli zero, zero, e32, m2, ta, mu
     552 
     553    constrain_vectors v4, v6, v12, a5, t1, v8, v16
     554 
# first secondary direction: directions[dir + 4], taps 2 then 1
     555    addi t5, a7, 4
     556    vmacc.vx v28, t4, v16
     557    vmacc.vx v28, t4, v8
     558 
     559    sh1add t5, t5, t3
     560 
     561    lb t6, 0(t5)
     562 
     563    sh1add a4, t6, a0
     564    slli t6, t6, 1
     565    sub t6, a0, t6
     566 
     567    vsetvli zero, zero, e16, m1, ta, ma
     568 
     569    vle16.v v4, (a4)
     570    vle16.v v6, (t6)
     571 
     572    vminu.vv v20, v4, v20
     573    vmax.vv v24, v4, v24
     574    vminu.vv v20, v6, v20
     575    vmax.vv v24, v6, v24
     576 
     577    vwsub.vv v8, v4, v2
     578    vwsub.vv v16, v6, v2
     579 
     580    vsetvli zero, zero, e32, m2, ta, mu
     581 
     582    li t6, 2
     583    constrain_vectors v4, v6, v12, a6, t2, v8, v16
     584 
     585    vmacc.vx v28, t6, v16
     586    vmacc.vx v28, t6, v8
     587 
     588    lb t6, 1(t5)
     589 
     590    sh1add a4, t6, a0
     591    slli t6, t6, 1
     592    sub t6, a0, t6
     593 
     594    vsetvli zero, zero, e16, m1, ta, ma
     595 
     596    vle16.v v4, (a4)
     597    vle16.v v6, (t6)
     598 
     599    vminu.vv v20, v4, v20
     600    vmax.vv v24, v4, v24
     601    vminu.vv v20, v6, v20
     602    vmax.vv v24, v6, v24
     603 
     604    vwsub.vv v8, v4, v2
     605    vwsub.vv v16, v6, v2
     606 
     607    vsetvli zero, zero, e32, m2, ta, mu
     608 
     609    constrain_vectors v4, v6, v12, a6, t2, v8, v16
     610 
# second secondary direction: directions[dir + 0]
     611    sh1add t5, a7, t3
     612 
     613    vadd.vv v4, v28, v8 # tap weight 1
     614    vadd.vv v28, v4, v16
     615 
     616    vsetvli zero, zero, e16, m1, ta, ma
     617 
     618    lb t6, 0(t5)
     619 
     620    sh1add a4, t6, a0
     621    slli t6, t6, 1
     622    sub t6, a0, t6
     623 
     624    vle16.v v4, (a4)
     625    vle16.v v6, (t6)
     626 
     627    vminu.vv v20, v4, v20
     628    vmax.vv v24, v4, v24
     629    vminu.vv v20, v6, v20
     630    vmax.vv v24, v6, v24
     631 
     632    vwsub.vv v8, v4, v2
     633    vwsub.vv v16, v6, v2
     634 
     635    vsetvli zero, zero, e32, m2, ta, mu
     636 
     637    li t6, 2
     638    constrain_vectors v4, v6, v12, a6, t2, v8, v16
     639 
     640    vmacc.vx v28, t6, v16
     641    vmacc.vx v28, t6, v8
     642 
     643    lb t6, 1(t5)
     644 
     645    sh1add a4, t6, a0
     646    slli t6, t6, 1
     647    sub t6, a0, t6
     648 
     649    vsetvli zero, zero, e16, m1, ta, ma
     650 
     651    vle16.v v4, (a4)
     652    vle16.v v6, (t6)
     653 
     654    vminu.vv v20, v4, v20
     655    vmax.vv v24, v4, v24
     656    vminu.vv v20, v6, v20
     657    vmax.vv v24, v6, v24
     658 
     659    vwsub.vv v8, v4, v2
     660    vwsub.vv v16, v6, v2
     661 
     662    vsetvli zero, zero, e32, m2, ta, mu
     663 
     664    constrain_vectors v4, v6, v12, a6, t2, v8, v16
     665 
     666    vadd.vv v4, v28, v8
     667    vadd.vv v28, v4, v16
     668 
# round (see rounding note in the primary-only path)
     669    vmslt.vx v0, v28, zero
     670    vadd.vi v28, v28, -1, v0.t
     671 
     672    vsetvli zero, zero, e16, m1, ta, mu
     673 
     674    vnclip.wi v16, v28, 4
     675 
     676    vadd.vv v28, v2, v16
     677 
# clamp the filtered pixel into [v20, v24]: max(min, x) then min(that, max)
     678    vmslt.vv v0, v20, v28
     679    vmerge.vvm v4, v20, v28, v0
     680 
     681    vmslt.vv v0, v4, v24
     682    vmerge.vvm v28, v24, v4, v0
     683 
     684    vsetvli zero, zero, e8, mf2, ta, ma
     685 
     686    vnclipu.wi v24, v28, 0
     687 
     688    vse8.v v24, (a2)
     689 
     690    addi t0, t0, -1
     691    add a2, a2, a3
     692    sh1add a0, a1, a0
     693 
     694    bnez t0, 3b
     695 
     696    addi sp, sp, 32 + 144*2
     697    ret
     698 endfunc
     699 .endm
    700 
# Instantiate the exported 8bpc filters for the three CDEF block sizes.
     701 cdef_fn 4, 4
     702 cdef_fn 4, 8
     703 cdef_fn 8, 8