tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pal.asm (17823B)


      1 ; Copyright © 2023, VideoLAN and dav1d authors
      2 ; Copyright © 2023, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 SECTION_RODATA 64
     30 
        ; pb_0to63: identity byte indices 0..63. The functions below clamp it with
        ; pminub against a broadcast column index to build the shuffle masks used
        ; for horizontal padding. Only the first 16 bytes are emitted unless
        ; ARCH_X86_64 is set (the SSSE3 code only ever loads 16 bytes of it).
     31 const pb_0to63,  db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
     32 %if ARCH_X86_64
     33                 db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
     34                 db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
     35                 db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
     36 %endif
        ; pshufb mask for the .w8 paths: a 16-byte vector holds two 8-byte rows and
        ; in each row bytes 4..7 are replaced by byte 3.
     37 pal_idx_w8_padh: db  0,  1,  2,  3,  3,  3,  3,  3,  8,  9, 10, 11, 11, 11, 11, 11
     38 
        ; pmaddubsw multipliers: every source byte pair (a, b) becomes a*1 + b*16.
     39 pb_1_16: times 4 db  1, 16
     40 %if ARCH_X86_64
        ; Byte value 32, broadcast-added to the upper half of the clamp vector in
        ; the AVX-512 .w32 path.
     41 pb_32:   times 4 db 32
     42 %endif
     43 
     44 %macro JMP_TABLE 2-*
        ; JMP_TABLE name, width0, width1, ...
        ; Emits one 32-bit entry per width: the address of label .w<width> inside
        ; the function private_prefix_<name>, expressed relative to the table
        ; address minus 2*4 bytes. <name>_table is defined as that same biased
        ; address, so table index 2 selects the first entry. The functions below
        ; index the table with tzcnt(bw), and tzcnt(4) == 2.
     45    %xdefine %1_table (%%table - 2*4)
     46    %xdefine %%base mangle(private_prefix %+ _%1)
     47    %%table:
     48    %rep %0 - 1
     49        dd %%base %+ .w%2 - (%%table - 2*4)
     50        %rotate 1
     51    %endrep
     52 %endmacro
     53 
        ; One table per instruction-set version; the AVX2 and AVX-512 functions
        ; are only assembled when ARCH_X86_64 is set.
     54 JMP_TABLE pal_idx_finish_ssse3,     4, 8, 16, 32, 64
     55 %if ARCH_X86_64
     56 JMP_TABLE pal_idx_finish_avx2,      4, 8, 16, 32, 64
     57 JMP_TABLE pal_idx_finish_avx512icl, 4, 8, 16, 32, 64
     58 %endif
     59 
     60 SECTION .text
     61 
     62 INIT_XMM ssse3
        ;-----------------------------------------------------------------------
        ; pal_idx_finish, SSSE3 version. Arguments (names from the cglobal line):
        ; dst, src, bw, bh, w, h.
        ;
        ; Reads h rows of bw bytes from src and merges every byte pair (a, b)
        ; into one byte a + 16*b (pmaddubsw with pb_1_16, then packuswb), so an
        ; output row is bw/2 bytes. Presumably each source byte is a 4-bit
        ; palette index - TODO confirm against the caller.
        ;   * Horizontal padding: when w < bw, source columns w..bw-1 are replaced
        ;     by column w-1 before packing.
        ;   * Vertical padding: the last output row is repeated for the remaining
        ;     bh - h rows.
        ; src is read with mova, dst is written with movu/movq.
        ;
        ; Register roles: m3 = pb_1_16 multipliers, bhd = bh - h (rows still to
        ; pad), m4/m5 = padding masks built by .setup_padh (w16/w32/w64 only).
        ; The cglobal counts follow the x86inc convention (2 arguments preloaded,
        ; 7 GPRs, 6 vector registers); bw/bh/w/h are fetched below via bwm/bhm/
        ; wm/hm.
        ;-----------------------------------------------------------------------
     63 cglobal pal_idx_finish, 2, 7, 6, dst, src, bw, bh, w, h
     64 %define base r6-pal_idx_finish_ssse3_table
     65    LEA                  r6, pal_idx_finish_ssse3_table
     66    tzcnt               bwd, bwm  ; table index: 2 for bw=4 ... 6 for bw=64
     67    movifnidn           bhd, bhm
     68    movifnidn            wd, wm
     69    movifnidn            hd, hm
     70    movsxd              bwq, [r6+bwq*4]  ; signed 32-bit offset of .w<bw>
     71    movddup              m3, [base+pb_1_16]  ; m3 = (1, 16) in every byte pair
     72    add                 bwq, r6  ; offset -> absolute address
     73    sub                 bhd, hd  ; bhd = bh - h = rows to pad vertically
     74    jmp                 bwq
        ; bw == 4: 16 source bytes = 4 rows per iteration, 8 output bytes.
        ; There is no horizontal padding path for this width.
     75 .w4:
     76    mova                 m0, [srcq]
     77    add                srcq, 16
     78    pmaddubsw            m0, m3
     79    packuswb             m0, m0
     80    movq             [dstq], m0
     81    add                dstq, 8
     82    sub                  hd, 4
     83    jg .w4
     84    test                bhd, bhd
     85    jz .w4_end
     86    pshuflw              m0, m0, q3333  ; repeat the last 2-byte output row 4x
     87 .w4_padv:
     88    movq             [dstq], m0
     89    add                dstq, 8
     90    sub                 bhd, 4
     91    jg .w4_padv
     92 .w4_end:
     93    RET
        ; bw == 8: 32 source bytes = 4 rows per iteration, 16 output bytes.
        ; The fixed mask in m2 copies byte 3 of each row into bytes 4..7, i.e. the
        ; only padded width it handles is w == 4.
     94 .w8_padh:
     95    pshufb               m0, m2
     96    pshufb               m1, m2
     97    jmp .w8_main
     98 .w8:
     99    mova                 m2, [base+pal_idx_w8_padh]
    100 .w8_loop:
    101    mova                 m0, [srcq+16*0]
    102    mova                 m1, [srcq+16*1]
    103    cmp                  wd, 8
    104    jl .w8_padh
    105 .w8_main:
    106    pmaddubsw            m0, m3
    107    pmaddubsw            m1, m3
    108    add                srcq, 16*2
    109    packuswb             m0, m1
    110    movu             [dstq], m0
    111    add                dstq, 16
    112    sub                  hd, 4
    113    jg .w8_loop
    114    test                bhd, bhd
    115    jz .w8_end
    116    pshufd               m0, m0, q3333  ; repeat the last 4-byte output row 4x
    117 .w8_padv:
    118    movu             [dstq], m0
    119    add                dstq, 16
    120    sub                 bhd, 4
    121    jg .w8_padv
    122 .w8_end:
    123    RET
        ; bw == 16: 32 source bytes = 2 rows per iteration, 16 output bytes.
        ; m4 (from .setup_padh) clamps the byte indices of each row to (w-1).
    124 .w16_padh:
    125    pshufb               m0, m4
    126    pshufb               m1, m4
    127    jmp .w16_main
    128 .w16:
    129    cmp                  wd, 16
    130    je .w16_loop  ; w == bw: no masks needed
    131    call .setup_padh
    132 .w16_loop:
    133    mova                 m0, [srcq+16*0]
    134    mova                 m1, [srcq+16*1]
    135    cmp                  wd, 16
    136    jl .w16_padh
    137 .w16_main:
    138    pmaddubsw            m0, m3
    139    pmaddubsw            m1, m3
    140    add                srcq, 16*2
    141    packuswb             m0, m1
    142    movu             [dstq], m0
    143    add                dstq, 16
    144    sub                  hd, 2
    145    jg .w16_loop
    146    test                bhd, bhd
    147    jz .w16_end
    148    punpckhqdq           m0, m0  ; both halves = last 8-byte output row
    149 .w16_padv:
    150    movu        [dstq+16*0], m0  ; 32 bytes = 4 padded rows per iteration
    151    movu        [dstq+16*1], m0
    152    add                dstq, 16*2
    153    sub                 bhd, 4
    154    jg .w16_padv
    155 .w16_end:
    156    RET
        ; bw == 32: 32 source bytes = 1 row per iteration, 16 output bytes.
        ; w <= 16: the second 16 bytes are all padding, so m1 becomes a broadcast
        ;          of byte (w-1)&15 of m0 (mask m5) and m0 is clamped with m4.
        ; w  > 16: only m1 needs clamping.
    157 .w32_padh:
    158    cmp                  wd, 16
    159    jg .w32_padh2
    160    pshufb               m1, m0, m5
    161    pshufb               m0, m4
    162    jmp .w32_main
    163 .w32_padh2:
    164    pshufb               m1, m4
    165    jmp .w32_main
    166 .w32:
    167    cmp                  wd, 32
    168    je .w32_loop
    169    call .setup_padh
    170 .w32_loop:
    171    mova                 m0, [srcq+16*0]
    172    mova                 m1, [srcq+16*1]
    173    cmp                  wd, 32
    174    jl .w32_padh
    175 .w32_main:
    176    pmaddubsw            m0, m3
    177    pmaddubsw            m1, m3
    178    add                srcq, 16*2
    179    packuswb             m0, m1
    180    movu             [dstq], m0
    181    add                dstq, 16
    182    dec                  hd
    183    jg .w32_loop
    184    test                bhd, bhd
    185    jz .w32_end
        ; m0 already holds exactly one 16-byte output row; store it 4x.
    186 .w32_padv:
    187    movu        [dstq+16*0], m0
    188    movu        [dstq+16*1], m0
    189    movu        [dstq+16*2], m0
    190    movu        [dstq+16*3], m0
    191    add                dstq, 16*4
    192    sub                 bhd, 4
    193    jg .w32_padv
    194 .w32_end:
    195    RET
        ; bw == 64: 64 source bytes = 1 row per iteration, 32 output bytes
        ; (m0 = output bytes 0..15, m1 = output bytes 16..31 at .w64_main).
        ; .w64_padh / .w64_padh2 handle w <= 32: source bytes 32..63 are never
        ; loaded and the second output half is a broadcast of the packed last
        ; valid column. .w64_padh3 / .w64_padh4 handle 32 < w < 64.
    196 .w64_padh:
    197    cmp                  wd, 16
    198    jg .w64_padh2
        ; w <= 16: everything past the first 16 bytes is padding.
    199    pshufb               m1, m0, m5
    200    pshufb               m0, m4
    201    pmaddubsw            m0, m3
    202    pmaddubsw            m1, m3
    203    packuswb             m0, m1
    204    packuswb             m1, m1
    205    jmp .w64_main
    206 .w64_padh2:
        ; 16 < w <= 32: clamp m1, then derive the broadcast of its last valid byte.
    207    pshufb               m1, m4
    208    pmaddubsw            m0, m3
    209    pmaddubsw            m2, m1, m3
    210    pshufb               m1, m5
    211    pmaddubsw            m1, m3
    212    packuswb             m0, m2
    213    packuswb             m1, m1
    214    jmp .w64_main
    215 .w64_padh3:
    216    cmp                  wd, 48
    217    jg .w64_padh4
        ; 32 < w <= 48: bytes 48..63 are all padding.
    218    pshufb               m2, m1, m5
    219    pshufb               m1, m4
    220    jmp .w64_main2
    221 .w64_padh4:
        ; 48 < w < 64: only the last 16 bytes need clamping.
    222    pshufb               m2, m4
    223    jmp .w64_main2
    224 .w64:
    225    cmp                  wd, 64
    226    je .w64_loop
    227    call .setup_padh
    228 .w64_loop:
    229    mova                 m0, [srcq+16*0]
    230    mova                 m1, [srcq+16*1]
    231    cmp                  wd, 32
    232    jle .w64_padh
    233    pmaddubsw            m0, m3
    234    pmaddubsw            m1, m3
    235    packuswb             m0, m1
    236    mova                 m1, [srcq+16*2]
    237    mova                 m2, [srcq+16*3]
    238    cmp                  wd, 64
    239    jl .w64_padh3
    240 .w64_main2:
    241    pmaddubsw            m1, m3
    242    pmaddubsw            m2, m3
    243    packuswb             m1, m2
    244 .w64_main:
    245    add                srcq, 16*4
    246    movu        [dstq+16*0], m0
    247    movu        [dstq+16*1], m1
    248    add                dstq, 16*2
    249    dec                  hd
    250    jg .w64_loop
    251    test                bhd, bhd
    252    jz .w64_end
        ; m0:m1 hold the last 32-byte output row; store it twice per iteration.
    253 .w64_padv:
    254    movu        [dstq+16*0], m0
    255    movu        [dstq+16*1], m1
    256    movu        [dstq+16*2], m0
    257    movu        [dstq+16*3], m1
    258    add                dstq, 16*4
    259    sub                 bhd, 2
    260    jg .w64_padv
    261 .w64_end:
    262    RET
        ; Local helper (reached via call, returns with plain ret).
        ; Out:   m5 = (w-1)&15 broadcast to all bytes
        ;        m4 = min(0..15, m5), i.e. identity indices clamped to (w-1)&15
        ; Clobb: r6 (so base-relative addressing is invalid afterwards) and m0
        ;        (callers reload m0 at the top of their loop).
    263 .setup_padh:
    264    mova                 m4, [base+pb_0to63]
    265    lea                 r6d, [wq-1]
    266    and                 r6d, 15
    267    movd                 m5, r6d
    268    pxor                 m0, m0
    269    pshufb               m5, m0  ; all-zero mask: broadcast byte 0
    270    pminub               m4, m5
    271    ret
    272 
    273 %if ARCH_X86_64
    274 
    275 INIT_YMM avx2
        ;-----------------------------------------------------------------------
        ; pal_idx_finish, AVX2 version (assembled only when ARCH_X86_64 is set).
        ; Arguments (names from the cglobal line): dst, src, bw, bh, w, h.
        ;
        ; Reads h rows of bw bytes from src and merges every byte pair (a, b)
        ; into a + 16*b (pmaddubsw with pb_1_16, then packuswb); an output row is
        ; bw/2 bytes. Source columns w..bw-1 are replaced by column w-1, and the
        ; last output row is repeated for the remaining bh - h rows.
        ;
        ; Register roles: m2 = pb_1_16 multipliers, bhd = bh - h, and wd holds
        ; w-1 (decremented once below), so every width test compares against
        ; bw-1 and wd can be broadcast directly as the clamp index.
        ;-----------------------------------------------------------------------
    276 cglobal pal_idx_finish, 4, 7, 5, dst, src, bw, bh, w, h
    277 %define base r6-pal_idx_finish_avx2_table
    278    lea                  r6, [pal_idx_finish_avx2_table]
    279    tzcnt               bwd, bwd  ; table index: 2 for bw=4 ... 6 for bw=64
    280    movifnidn            wd, wm
    281    movifnidn            hd, hm
    282    movsxd              bwq, [r6+bwq*4]  ; signed 32-bit offset of .w<bw>
    283    vpbroadcastd         m2, [base+pb_1_16]
    284    dec                  wd  ; wd = w-1 from here on
    285    add                 bwq, r6
    286    sub                 bhd, hd  ; bhd = bh - h = rows to pad vertically
    287    jmp                 bwq
        ; bw == 4: 16 source bytes = 4 rows per iteration; no horizontal padding.
    288 .w4:
    289    mova                xm0, [srcq]
    290    add                srcq, 16
    291    pmaddubsw           xm0, xm2
    292    packuswb            xm0, xm0
    293    movq             [dstq], xm0
    294    add                dstq, 8
    295    sub                  hd, 4
    296    jg .w4
    297    test                bhd, bhd
    298    jz .w4_end
    299    pshuflw             xm0, xm0, q3333  ; repeat the last 2-byte output row 4x
    300 .w4_padv:
    301    movq             [dstq], xm0
    302    add                dstq, 8
    303    sub                 bhd, 4
    304    jg .w4_padv
    305 .w4_end:
    306    RET
        ; bw == 8: 32 source bytes = 4 rows per iteration. The fixed mask in xm3
        ; copies byte 3 of each row into bytes 4..7 (padded width w == 4 only).
    307 .w8_padh:
    308    pshufb              xm0, xm3
    309    pshufb              xm1, xm3
    310    jmp .w8_main
    311 .w8:
    312    mova                xm3, [base+pal_idx_w8_padh]
    313 .w8_loop:
    314    mova                xm0, [srcq+16*0]
    315    mova                xm1, [srcq+16*1]
    316    cmp                  wd, 7
    317    jl .w8_padh
    318 .w8_main:
    319    pmaddubsw           xm0, xm2
    320    pmaddubsw           xm1, xm2
    321    add                srcq, 16*2
    322    packuswb            xm0, xm1
    323    movu             [dstq], xm0
    324    add                dstq, 16
    325    sub                  hd, 4
    326    jg .w8_loop
    327    test                bhd, bhd
    328    jz .w8_end
    329    pshufd              xm0, xm0, q3333  ; repeat the last 4-byte output row 4x
    330 .w8_padv:
    331    movu             [dstq], xm0
    332    add                dstq, 16
    333    sub                 bhd, 4
    334    jg .w8_padv
    335 .w8_end:
    336    RET
        ; bw == 16: 64 source bytes = 4 rows per iteration (one row per 128-bit
        ; lane). m3 = min(w-1, 0..15) in both lanes clamps each row with pshufb.
    337 .w16_padh:
    338    pshufb               m0, m3
    339    pshufb               m1, m3
    340    jmp .w16_main
    341 .w16:
    342    cmp                  wd, 15
    343    je .w16_loop  ; w == bw: no mask needed
    344    vbroadcasti128       m0, [base+pb_0to63]
    345    movd                xm3, wd
    346    vpbroadcastb         m3, xm3
    347    pminub               m3, m0
    348 .w16_loop:
    349    mova                 m0, [srcq+32*0]
    350    mova                 m1, [srcq+32*1]
    351    cmp                  wd, 15
    352    jl .w16_padh
    353 .w16_main:
    354    pmaddubsw            m0, m2
    355    pmaddubsw            m1, m2
    356    add                srcq, 32*2
    357    packuswb             m0, m1  ; per-lane pack: qwords = row0,row2,row1,row3
    358    vpermq               m1, m0, q3120  ; back to row order 0,1,2,3
    359    movu             [dstq], m1
    360    add                dstq, 32
    361    sub                  hd, 4
    362    jg .w16_loop
    363    test                bhd, bhd
    364    jz .w16_end
    365    vpermq               m0, m0, q3333  ; qword 3 of the packed m0 = last row
    366 .w16_padv:
    367    movu             [dstq], m0
    368    add                dstq, 32
    369    sub                 bhd, 4
    370    jg .w16_padv
    371 .w16_end:
    372    RET
        ; bw == 32: 64 source bytes = 2 rows per iteration (one row per register).
        ; m3 = min(w-1, 0..31); pshufb works per 128-bit lane on the low 4 index
        ; bits. When w-1 <= 15 the upper lane is all padding, so the lower lane
        ; is duplicated into it first and the clamped index then selects the
        ; last valid byte there.
    373 .w32_padh:
    374    cmp                  wd, 15
    375    jg .w32_padh2
    376    vinserti128          m0, xm0, 1
    377    vinserti128          m1, xm1, 1
    378 .w32_padh2:
    379    pshufb               m0, m3
    380    pshufb               m1, m3
    381    jmp .w32_main
    382 .w32:
    383    cmp                  wd, 31
    384    je .w32_loop
    385    movd                xm3, wd
    386    vpbroadcastb         m3, xm3
    387    pminub               m3, [base+pb_0to63]
    388 .w32_loop:
    389    mova                 m0, [srcq+32*0]
    390    mova                 m1, [srcq+32*1]
    391    cmp                  wd, 31
    392    jl .w32_padh
    393 .w32_main:
    394    pmaddubsw            m0, m2
    395    pmaddubsw            m1, m2
    396    add                srcq, 32*2
    397    packuswb             m0, m1  ; qwords = row0.lo, row1.lo, row0.hi, row1.hi
    398    vpermq               m1, m0, q3120  ; -> row0, row1
    399    movu             [dstq], m1
    400    add                dstq, 32
    401    sub                  hd, 2
    402    jg .w32_loop
    403    test                bhd, bhd
    404    jz .w32_end
    405    vpermq               m0, m0, q3131  ; last row (qwords 1,3) in both lanes
    406 .w32_padv:
    407    movu        [dstq+32*0], m0  ; 64 bytes = 4 padded rows per iteration
    408    movu        [dstq+32*1], m0
    409    add                dstq, 32*2
    410    sub                 bhd, 4
    411    jg .w32_padv
    412 .w32_end:
    413    RET
        ; bw == 64: 64 source bytes = 1 row per iteration (m0 = bytes 0..31,
        ; m1 = bytes 32..63). m4 = broadcast of (w-1)&31, m3 = min(m4, 0..31).
        ; Four cases by w-1: <= 15, 16..31, 32..47, 48..62. In each, the lane that
        ; contains column w-1 is clamped with m3 and every lane lying entirely in
        ; the padding is filled with that column.
    414 .w64_padh:
    415    cmp                  wd, 15
    416    jg .w64_padh2
        ; w-1 <= 15: only the lowest lane of m0 is valid.
    417    vinserti128          m1, m0, xm0, 1
    418    pshufb               m0, m1, m3
    419    pshufb               m1, m4
    420    jmp .w64_main
    421 .w64_padh2:
    422    cmp                  wd, 31
    423    jg .w64_padh3
        ; 16 <= w-1 <= 31: m0 valid up to its upper lane, m1 is all padding.
    424    vperm2i128           m1, m0, m0, 0x11
    425    pshufb               m0, m3
    426    pshufb               m1, m4
    427    jmp .w64_main
    428 .w64_padh3:
    429    cmp                  wd, 47
    430    jg .w64_padh4
        ; 32 <= w-1 <= 47: upper lane of m1 is all padding; duplicate the lower.
    431    vinserti128          m1, xm1, 1
    432 .w64_padh4:
    433    pshufb               m1, m3
    434    jmp .w64_main
    435 .w64:
    436    cmp                  wd, 63
    437    je .w64_loop
    438    mov                 r6d, wd  ; r6 is overwritten: base is unusable below
    439    and                 r6d, 31
    440    movd                xm4, r6d
    441    vpbroadcastb         m4, xm4
    442    pminub               m3, m4, [pb_0to63]  ; addressed without base (r6 clobbered)
    443 .w64_loop:
    444    mova                 m0, [srcq+32*0]
    445    mova                 m1, [srcq+32*1]
    446    cmp                  wd, 63
    447    jl .w64_padh
    448 .w64_main:
    449    pmaddubsw            m0, m2
    450    pmaddubsw            m1, m2
    451    add                srcq, 32*2
    452    packuswb             m0, m1
    453    vpermq               m0, m0, q3120  ; undo the per-lane interleave of packuswb
    454    movu             [dstq], m0
    455    add                dstq, 32
    456    dec                  hd
    457    jg .w64_loop
    458    test                bhd, bhd
    459    jz .w64_end
        ; m0 already holds exactly one 32-byte output row; store it 4x.
    460 .w64_padv:
    461    movu        [dstq+32*0], m0
    462    movu        [dstq+32*1], m0
    463    movu        [dstq+32*2], m0
    464    movu        [dstq+32*3], m0
    465    add                dstq, 32*4
    466    sub                 bhd, 4
    467    jg .w64_padv
    468 .w64_end:
    469    RET
    470 
    471 INIT_ZMM avx512icl
        ;-----------------------------------------------------------------------
        ; pal_idx_finish, AVX-512 (icl) version (assembled only when ARCH_X86_64
        ; is set). Arguments (names from the cglobal line): dst, src, bw, bh, w, h.
        ;
        ; Reads h rows of bw bytes from src and merges every byte pair (a, b)
        ; into a + 16*b (pmaddubsw with pb_1_16); an output row is bw/2 bytes.
        ; Source columns w..bw-1 are replaced by column w-1, and the last output
        ; row is repeated for the remaining bh - h rows.
        ;
        ; Register roles: m4 = pb_1_16 multipliers, bhd = bh - h, wd = w-1
        ; (decremented once below). Every width path here handles 4 rows per
        ; loop iteration.
        ; NOTE(review): the .w4/.w8 paths name xmm0-xmm2 literally instead of
        ; xm0-xm2; presumably this picks low registers under x86inc's AVX-512
        ; register mapping - confirm against x86inc.asm.
        ;-----------------------------------------------------------------------
    472 cglobal pal_idx_finish, 4, 7, 7, dst, src, bw, bh, w, h
    473 %define base r6-pal_idx_finish_avx512icl_table
    474    lea                  r6, [pal_idx_finish_avx512icl_table]
    475    tzcnt               bwd, bwd  ; table index: 2 for bw=4 ... 6 for bw=64
    476    movifnidn            wd, wm
    477    movifnidn            hd, hm
    478    movsxd              bwq, [r6+bwq*4]  ; signed 32-bit offset of .w<bw>
    479    vpbroadcastd         m4, [base+pb_1_16]
    480    dec                  wd  ; wd = w-1 from here on
    481    add                 bwq, r6
    482    sub                 bhd, hd  ; bhd = bh - h = rows to pad vertically
    483    jmp                 bwq
        ; bw == 4: 16 source bytes = 4 rows per iteration; no horizontal padding.
    484 .w4:
    485    mova               xmm0, [srcq]
    486    add                srcq, 16
    487    pmaddubsw          xmm0, xm4
    488    packuswb           xmm0, xmm0
    489    movq             [dstq], xmm0
    490    add                dstq, 8
    491    sub                  hd, 4
    492    jg .w4
    493    test                bhd, bhd
    494    jz .w4_end
    495    pshuflw            xmm0, xmm0, q3333  ; repeat the last 2-byte output row 4x
    496 .w4_padv:
    497    movq             [dstq], xmm0
    498    add                dstq, 8
    499    sub                 bhd, 4
    500    jg .w4_padv
    501 .w4_end:
    502    RET
        ; bw == 8: 32 source bytes = 4 rows per iteration. The fixed mask in xmm2
        ; copies byte 3 of each row into bytes 4..7 (padded width w == 4 only).
    503 .w8_padh:
    504    pshufb             xmm0, xmm2
    505    pshufb             xmm1, xmm2
    506    jmp .w8_main
    507 .w8:
    508    mova               xmm2, [base+pal_idx_w8_padh]
    509 .w8_loop:
    510    mova               xmm0, [srcq+16*0]
    511    mova               xmm1, [srcq+16*1]
    512    cmp                  wd, 7
    513    jl .w8_padh
    514 .w8_main:
    515    pmaddubsw          xmm0, xm4
    516    pmaddubsw          xmm1, xm4
    517    add                srcq, 16*2
    518    packuswb           xmm0, xmm1
    519    movu             [dstq], xmm0
    520    add                dstq, 16
    521    sub                  hd, 4
    522    jg .w8_loop
    523    test                bhd, bhd
    524    jz .w8_end
    525    pshufd             xmm0, xmm0, q3333  ; repeat the last 4-byte output row 4x
    526 .w8_padv:
    527    movu             [dstq], xmm0
    528    add                dstq, 16
    529    sub                 bhd, 4
    530    jg .w8_padv
    531 .w8_end:
    532    RET
        ; bw == 16: one 64-byte load = 4 rows (one row per 128-bit lane).
        ; m2 = min(0..15, w-1) in every lane; pshufb clamps each row in place.
    533 .w16_padh:
    534    pshufb               m0, m2
    535    jmp .w16_main
    536 .w16:
    537    cmp                  wd, 15
    538    je .w16_loop  ; w == bw: no mask needed
    539    vbroadcasti32x4      m2, [base+pb_0to63]
    540    vpbroadcastb         m0, wd
    541    pminub               m2, m0
    542 .w16_loop:
    543    mova                 m0, [srcq]
    544    cmp                  wd, 15
    545    jl .w16_padh
    546 .w16_main:
    547    pmaddubsw            m0, m4
    548    add                srcq, 64
    549    vpmovwb             ym0, m0  ; truncate the 32 words to 32 bytes
    550    movu             [dstq], ym0
    551    add                dstq, 32
    552    sub                  hd, 4
    553    jg .w16_loop
    554    test                bhd, bhd
    555    jz .w16_end
    556    vpermq              ym0, ym0, q3333  ; repeat the last 8-byte output row 4x
    557 .w16_padv:
    558    movu             [dstq], ym0
    559    add                dstq, 32
    560    sub                 bhd, 4
    561    jg .w16_padv
    562 .w16_end:
    563    RET
        ; bw == 32: two 64-byte loads = 4 rows (two rows per register).
        ; m3 = 2*i for i = 0..63: even byte indices used by vpermt2b to gather the
        ;      low byte of every word from the m0:m1 pair.
        ; m2 = vpermb clamp mask: bytes 0..31 clamp to w-1, bytes 32..63 clamp to
        ;      (w-1)+32, so each of the two rows in a register pads from its own
        ;      last valid column.
    564 .w32_padh:
    565    vpermb               m0, m2, m0
    566    vpermb               m1, m2, m1
    567    jmp .w32_main
    568 .w32:
    569    mova                 m2, [base+pb_0to63]
    570    paddb                m3, m2, m2
    571    cmp                  wd, 31
    572    je .w32_loop
    573    vpbroadcastb         m0, wd
    574    mov                 r6d, 0xff00  ; r6 is overwritten: base is unusable below
    575    kmovw                k1, r6d  ; k1 selects dwords 8..15 (upper 32 bytes)
    576    vpaddd           m0{k1}, [pb_32] {1to16}  ; +32 per byte in the upper half
    577    pminub               m2, m0
    578 .w32_loop:
    579    mova                 m0, [srcq+64*0]
    580    mova                 m1, [srcq+64*1]
    581    cmp                  wd, 31
    582    jl .w32_padh
    583 .w32_main:
    584    pmaddubsw            m0, m4
    585    pmaddubsw            m1, m4
    586    add                srcq, 64*2
    587    vpermt2b             m0, m3, m1  ; pack: low byte of each word of m0:m1
    588    movu             [dstq], m0
    589    add                dstq, 64
    590    sub                  hd, 4
    591    jg .w32_loop
    592    test                bhd, bhd
    593    jz .w32_end
    594    vshufi32x4           m0, m0, q3333  ; repeat the last 16-byte output row 4x
    595 .w32_padv:
    596    movu             [dstq], m0
    597    add                dstq, 64
    598    sub                 bhd, 4
    599    jg .w32_padv
    600 .w32_end:
    601    RET
        ; bw == 64: four 64-byte loads = 4 rows (one row per register).
        ; m6 = 2*i (vpermt2b pack indices), m5 = min(0..63, w-1) (vpermb clamp).
    602 .w64_padh:
    603    REPX  {vpermb x, m5, x}, m0, m1, m2, m3
    604    jmp .w64_main
    605 .w64:
    606    mova                 m5, [base+pb_0to63]
    607    paddb                m6, m5, m5
    608    cmp                  wd, 63
    609    je .w64_loop
    610    vpbroadcastb         m0, wd
    611    pminub               m5, m0
    612 .w64_loop:
    613    mova                 m0, [srcq+64*0]
    614    mova                 m1, [srcq+64*1]
    615    mova                 m2, [srcq+64*2]
    616    mova                 m3, [srcq+64*3]
    617    cmp                  wd, 63
    618    jl .w64_padh
    619 .w64_main:
    620    REPX  {pmaddubsw x, m4}, m0, m1, m2, m3
    621    add                srcq, 64*4
    622    vpermt2b             m0, m6, m1  ; output rows 0,1
    623    vpermt2b             m2, m6, m3  ; output rows 2,3
    624    movu        [dstq+64*0], m0
    625    movu        [dstq+64*1], m2
    626    add                dstq, 64*2
    627    sub                  hd, 4
    628    jg .w64_loop
    629    test                bhd, bhd
    630    jz .w64_end
    631    vshufi32x4           m2, m2, q3232  ; both halves = last 32-byte output row
    632 .w64_padv:
    633    movu        [dstq+64*0], m2  ; 128 bytes = 4 padded rows per iteration
    634    movu        [dstq+64*1], m2
    635    add                dstq, 64*2
    636    sub                 bhd, 4
    637    jg .w64_padv
    638 .w64_end:
    639    RET
    640 
    641 %endif ; ARCH_X86_64