tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef16_avx512.asm (21670B)


      1 ; Copyright © 2022, VideoLAN and dav1d authors
      2 ; Copyright © 2022, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 SECTION_RODATA 64
     32 
        ; cdef_perm: 64 bytes of permute indices. The 4xN functions load it as-is
        ; as a vpermt2d (dword) index and, after `psrlw 8`, use it as the base word
        ; index for vpermi2w/vpermt2w; the 8x8 function uses it after `psrld 16`
        ; as a vpermt2d index.
     33 cdef_perm:     db  2, 18, 16, 18, 24, 19,  0, 19, 25, 20,  1, 20, 26, 21,  2, 21
     34               db  3, 26,  3, 26, 28, 27,  4, 27, 29, 28, -1, 28, 30, 29, -1, 29
     35               db  0, 34, 17, 34, 16, 35,  8, 35, 17, 36,  9, 36, 18, 37, 10, 37
     36               db  1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45
        ; end_perm4: vpermb index that selects bytes 1-2 of every dword (used by
        ; the .end_no_clip paths of the 4xN functions).
     37 end_perm4:     db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
     38               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
        ; edge_mask4: 16-bit k-masks loaded with kmovw and applied with vmovdqa32
        ; (one bit per dword). The first entry corresponds to edge value 4, which
        ; is why the code addresses the table as edge_mask4-8+edge*2. The trailing
        ; comments give the edge bits each entry stands for.
     39 edge_mask4:    dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111
     40               dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011
     41               dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111
        ; pri_taps4: two tap pairs, selected by a byte offset of 0 or 4 and
        ; broadcast as one dword for vpdpwssd.
     42 pri_taps4:     dw 64, 32, 48, 48                 ; left-shifted by 4
        ; cdef_dirs4: pairs of word offsets, broadcast as one dword from
        ; (dir+0)*4, (dir+2)*4 or (dir+4)*4. The third row repeats the first.
     43 cdef_dirs4:    dw  8, 16,  8, 15, -7,-14,  1, -6
     44               dw  1,  2,  1, 10,  9, 18,  8, 17
     45               dw  8, 16,  8, 15, -7,-14,  1, -6
        ; deint_shuf: pshufb pattern that gathers the even words of a lane into its
        ; low 8 bytes and the odd words into its high 8 bytes (4x8 clipping path).
     46 deint_shuf:    db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
        ; cdef_dirs8: pairs of signed byte offsets, read with movsx from
        ; (dir+0|2|4)*2 + {0,1}. The third row repeats the first.
     47 cdef_dirs8:    db 32, 64, 32, 62,-30,-60,  2,-28
     48               db  2,  4,  2, 36, 34, 68, 32, 66
     49               db 32, 64, 32, 62,-30,-60,  2,-28
        ; pri_taps8: primary taps for the 8x8 function, read as two dword
        ; broadcasts at byte offsets priq*2+0 and priq*2+4 (priq is 0 or 4).
     50 pri_taps8:     dw  4,  4,  2,  2,  3,  3,  3,  3
        ; sec_taps4: secondary tap pair for the 4xN functions (same scale as
        ; pri_taps4, i.e. 2<<4 and 1<<4).
     51 sec_taps4:     dw 32, 16
        ; pw_m16384: value written over unavailable edge pixels by the
        ; .mask_edges paths.
     52 pw_m16384:     times 2 dw -16384
        ; pw_2048: pmulhrsw multiplier used by the 8x8 function for its final
        ; rounding step.
     53 pw_2048:       times 2 dw 2048
        ; pd_268435568: initial value of the 32-bit accumulators in the 4xN
        ; functions.
     54 pd_268435568:  dd 268435568                      ; (1 << 28) + (7 << 4)
        ; edge_mask8: k-masks for the 8x8 function, indexed by (edge & 3) for the
        ; values 0, 1 and 2 (the value 3 skips masking).
     55 edge_mask8:    dw 0x2121, 0x2020, 0x0101
     56 
     57 SECTION .text
     58 
     59 %macro CONSTRAIN 7 ; dst, p, px, zero, thresh, shift, tmp
           ; Computes, per 16-bit word:
           ;   d   = p - px
           ;   dst = sign(d) * min(|d|, max(thresh - (|d| >> shift), 0))
           ; `zero` must hold all-zero words (it is the minuend of the negation).
           ; Clobbers k1 and the tmp register; p and px are left unmodified.
     60    psubw           %1, %2, %3          ; dst = p - px
     61    pabsw           %1, %1              ; dst = |p - px|
     62    vpcmpgtw        k1, %3, %2          ; k1 = (px > p), i.e. the difference is negative
     63    vpsrlvw         %7, %1, %6          ; tmp = |d| >> shift (per-word variable shift)
     64    psubusw         %7, %5, %7          ; tmp = thresh - tmp, unsigned-saturated at 0
     65    pminsw          %1, %7              ; dst = min(|d|, tmp)
     66    vpsubw      %1{k1}, %4, %1          ; restore the sign: dst = 0 - dst where px > p
     67 %endmacro
     68 
     69 ; t0 t1 t2 t3 t4 t5 t6 t7   L4 L5 20 21 22 23 24 25
     70 ; T0 T1 T2 T3 T4 T5 T6 T7   L6 L7 30 31 32 33 34 35
     71 ; L0 L1 00 01 02 03 04 05   b0 b1 b2 b3 b4 b5 b6 b7
     72 ; L2 L3 10 11 12 13 14 15   B0 B1 B2 B3 B4 B5 B6 B7
        ;
        ; Layout of the two source registers of the 4x4 filter, one 128-bit lane
        ; per diagram row: the left group is m1 (top rows t/T in lanes 0-1), the
        ; right group is m2 (bottom rows b/B in lanes 2-3). L* are pixels from the
        ; left buffer; two-digit entries are row/column positions read from dst.
     73 
        ; cdef_filter_4x4_16bpc(dst, stride, left, top, bot,
        ;                       pri, sec, dir, damping, edge)
        ; Filters one 4x4 block of 16-bit pixels in place at dst.
        ; Register roles inside this routine:
        ;   m0      32-bit accumulators, one per pixel
        ;   m1, m2  source pixels (see diagram)
        ;   m3      px, each pixel word duplicated to fill a dword
        ;   m5      word permute indices (high bytes of cdef_perm)
        ;   m6, m7  running min / max for the clipping path
        ;   m12     zero, m13 strength, m14 shift, m15 tap pair
     74 INIT_ZMM avx512icl
     75 cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \
     76                                         pri, sec, dir, damping, edge
     77 %define base r6-cdef_dirs4
     78    lea             r6, [cdef_dirs4]            ; r6 = anchor for all base+ table accesses
     79    movu           xm3, [dstq+strideq*0]        ; m3 lane 0 = dst row 0
     80    vinserti32x4   ym3, [dstq+strideq*1], 1     ; m3 lane 1 = dst row 1
     81    mova           xm2, [leftq]                 ; 16 bytes from the left buffer
     82    lea             r2, [dstq+strideq*2]        ; r2 = address of dst row 2 (kept for the stores)
     83    vinserti32x4    m3, [r2+strideq*0], 2       ; m3 lane 2 = dst row 2
     84    mova            m5, [base+cdef_perm]
     85    vinserti32x4    m3, [r2+strideq*1], 3       ; m3 lane 3 = dst row 3
     86    vpermt2d        m2, m5, m3                  ; m2 = left/dst dwords arranged per cdef_perm
     87    vinserti32x4    m1, m2, [topq+strideq*0-4], 0 ; m1 = m2 with lane 0 = top row 0, from 2 px left
     88    vinserti32x4    m1, [topq+strideq*1-4], 1   ; m1 lane 1 = top row 1
     89    mov            r3d, edgem                   ; r3d = edge flags
     90    movifnidn     prid, prim
     91    punpcklwd       m3, m3     ; px
     92    psrlw           m5, 8                       ; keep the high byte of every index word
     93    vpbroadcastd    m0, [base+pd_268435568]     ; accumulators start at the rounding bias
     94    pxor           m12, m12                     ; m12 = 0
     95    cmp            r3d, 0x0f                    ; are all four edge flags set?
     96    jne .mask_edges                             ; no: mark unavailable pixels first
     97    vinserti32x4    m2, [botq+strideq*0-4], 2   ; m2 lane 2 = bottom row 0
     98    vinserti32x4    m2, [botq+strideq*1-4], 3   ; m2 lane 3 = bottom row 1
     99 .main:
    100    test          prid, prid
    101    jz .sec_only                                ; pri == 0: only the secondary filter runs
    102    lzcnt          r4d, prid
    103    rorx           r3d, prid, 2                 ; r3d = pri rotated right by 2
    104    vpbroadcastw   m13, prim                    ; m13 = primary strength
           ; r10m is the argument slot following edge; it is not named in the
           ; cglobal list above.
    105    cmp     dword r10m, 0xfff  ; if (bpc == 12)
    106    cmove         prid, r3d    ;     pri >>= 2
    107    mov            r3d, dampingm
    108    and           prid, 4                       ; prid = 0 or 4: byte offset of the tap pair
    109    sub            r3d, 31                      ; r3d = damping - 31
    110    vpbroadcastd   m15, [base+pri_taps4+priq]   ; m15 = primary tap pair
    111    xor           prid, prid
    112    add            r4d, r3d                     ; lzcnt(pri) + damping - 31; sets SF for cmovns
    113    cmovns        prid, r4d    ; pri_shift
    114    mov            r4d, dirm
    115    vpbroadcastw   m14, prid                    ; m14 = pri_shift (0 when the sum was negative)
    116    mov            r5d, secm
    117    vpbroadcastd    m9, [base+cdef_dirs4+(r4+2)*4] ; primary direction offsets (k0, k1)
    118    call .constrain
    119    test           r5d, r5d
    120    jz .end_no_clip                             ; sec == 0: no clipping needed
    121    lzcnt          r5d, r5d
    122    vpbroadcastw   m13, secm                    ; m13 = secondary strength
    123    add            r3d, r5d                     ; r3d = damping - 31 + lzcnt(sec), used as sec shift
           ; Track min/max over px and every tap pixel. The min is unsigned and
           ; the max is signed, presumably so the -16384 edge marker wins neither.
    124    pminuw          m6, m3, m8
    125    pmaxsw          m7, m3, m8
    126    pminuw          m6, m9
    127    pmaxsw          m7, m9
    128    call .constrain_sec
    129    pminuw          m6, m8
    130    pmaxsw          m7, m8
    131    pminuw          m6, m9
    132    pmaxsw          m7, m9
    133    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4] ; second set of secondary offsets
    134    call .constrain
    135    pminuw          m6, m8
    136    pmaxsw          m7, m8
    137    pminuw          m6, m9
    138    pmaxsw          m7, m9
           ; Each dword of m6/m7 holds the k0 and k1 results for one pixel;
           ; shifting by one word and combining folds them into the low word.
           ; The accumulator is finalized the same way as in .end_no_clip and
           ; then clamped to [min, max].
    139    psrldq          m8, m6, 2
    140    vpshldd         m3, m0, 8                   ; m3 = (m3 << 8) | (m0 >> 24)
    141    psrldq          m9, m7, 2
    142    paddd           m0, m3
    143    pminuw          m6, m8                      ; low word of each dword = min over k0/k1
    144    psrldq          m0, 1                       ; move bytes 1-2 of each dword to its low word
    145    pmaxsw          m7, m9                      ; low word of each dword = max over k0/k1
    146    pmaxsw          m0, m6                      ; clamp from below
    147    pminsw          m0, m7                      ; clamp from above
    148    vpmovdw        ym0, m0                      ; keep the low word of every dword
    149    jmp .end
    150 .sec_only:
    151    tzcnt          r5d, secm
    152    mov            r3d, dampingm
    153    vpbroadcastw   m13, secm                    ; m13 = secondary strength
    154    mov            r4d, dirm
    155    sub            r3d, r5d    ; sec_shift
    156    call .constrain_sec
    157    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4] ; second set of secondary offsets
    158    call .constrain
    159 .end_no_clip:
    160    mova           ym1, [base+end_perm4]
    161    vpshldd         m3, m0, 8  ; (px << 8) + ((sum > -8) << 4)
    162    paddd           m0, m3     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    163    vpermb          m0, m1, m0                  ; pick bytes 1-2 of every dword = output pixels
    164 .end:
           ; ym0 holds four rows of four pixels, 8 bytes per row.
    165    movq   [dstq+strideq*0], xm0
    166    movhps [dstq+strideq*1], xm0
    167    vextracti32x4  xm0, ym0, 1
    168    movq   [r2+strideq*0], xm0
    169    movhps [r2+strideq*1], xm0
    170    RET
    171 .mask_edges:
           ; Reached when edge != 0x0f. Overwrites the unavailable border pixels
           ; of m1/m2 with -16384, then rejoins .main. On entry r3d = edge flags.
    172    vpbroadcastd    m6, [base+pw_m16384]        ; m6 = marker value in every word
    173    test           r3b, 0x08                    ; edge bit 3 guards the bottom-row loads
    174    jz .mask_edges_no_bottom  ; avoid buffer overread
    175    vinserti32x4    m2, [botq+strideq*0-4], 2   ; m2 lane 2 = bottom row 0
    176    vinserti32x4    m2, [botq+strideq*1-4], 3   ; m2 lane 3 = bottom row 1
    177    kmovw           k1, [base+edge_mask4-8+r3*2] ; mask for m1: table entry (edge - 4)
    178    jmp .mask_edges_main
    179 .mask_edges_no_bottom:
           ; +8 instead of -8 is 16 bytes (8 entries) further on, i.e. the entry
           ; that edge would select if bit 3 were set.
    180    kmovw           k1, [base+edge_mask4+8+r3*2]
    181 .mask_edges_main:
    182    or             r3d, 0x04                    ; look up the m2 mask with bit 2 forced on
    183    vmovdqa32   m1{k1}, m6     ; edge pixels = -16384
    184    kmovw           k1, [base+edge_mask4-8+r3*2] ; mask for m2
    185    vmovdqa32   m2{k1}, m6
    186    jmp .main
    187 .constrain_sec:
           ; Entry for the first secondary pass: loads the (dir+4) offsets, the
           ; shift from r3d and the secondary taps, then falls into .constrain.
    188    vpbroadcastd    m9, [base+cdef_dirs4+(r4+4)*4]
    189    vpbroadcastw   m14, r3d                     ; m14 = secondary shift
    190    vpbroadcastd   m15, [base+sec_taps4]        ; m15 = secondary tap pair
    191 .constrain:
           ; In:  m9 = offset pair (k0, k1) in every dword, m5 = base word indices,
           ;      m1:m2 = source pixels, m3 = px, m13 = strength, m14 = shift,
           ;      m15 = taps, m12 = zero.
           ; Out: m8 = pixels at +offset, m9 = pixels at -offset (callers use them
           ;      for min/max), m0 += taps * constrained differences.
           ; Clobbers m10, m11, k1.
    192    paddw           m8, m5, m9                  ; indices of the pixels at +offset
    193    vpermi2w        m8, m1, m2 ; k0p0 k1p0
    194    psubw           m9, m5, m9                  ; indices of the pixels at -offset
    195    vpermi2w        m9, m1, m2 ; k0p1 k1p1
    196    CONSTRAIN      m10, m8, m3, m12, m13, m14, m11
    197    vpdpwssd        m0, m10, m15                ; m0 += k0 * tap0 + k1 * tap1 per dword
    198    CONSTRAIN      m10, m9, m3, m12, m13, m14, m11
    199    vpdpwssd        m0, m10, m15
    200    ret
    201 
    202 ; t0 t1 t2 t3 t4 t5 t6 t7   L4 L5 20 21 22 23 24 25   Lc Ld 60 61 62 63 64 65
    203 ; T0 T1 T2 T3 T4 T5 T6 T7   L6 L7 30 31 32 33 34 35   Le Lf 70 71 72 73 74 75
    204 ; L0 L1 00 01 02 03 04 05   L8 L9 40 41 42 43 44 45   b0 b1 b2 b3 b4 b5 b6 b7
    205 ; L2 L3 10 11 12 13 14 15   La Lb 50 51 52 53 54 55   B0 B1 B2 B3 B4 B5 B6 B7
        ;
        ; Layout of the three source registers of the 4x8 filter, one 128-bit lane
        ; per diagram row: left group = m0 (top rows t/T in lanes 0-1), middle
        ; group = m1, right group = m2 (bottom rows b/B in lanes 2-3).
        ;
        ; cdef_filter_4x8_16bpc(dst, stride, left, top, bot,
        ;                       pri, sec, dir, damping, edge)
        ; Filters one 4-wide, 8-tall block of 16-bit pixels in place at dst.
        ; Rows 0-3 ("top") and rows 4-7 ("bottom") are processed in parallel:
        ;   m16 / m17  32-bit accumulators for top / bottom
        ;   m18 / m19  px for top / bottom, each pixel word duplicated
        ;   m3,m4 / m20,m21  running min,max for top / bottom
        ; This function reuses the `base` definition (r6-cdef_dirs4) made by the
        ; preceding function.
    206 
    207 cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \
    208                                         pri, sec, dir, damping, edge
    209    lea             r6, [cdef_dirs4]            ; r6 = anchor for all base+ table accesses
    210    movu          xm18, [dstq+strideq*0]        ; m18 lanes 0-3 = dst rows 0-3
    211    vinserti128   ym18, [dstq+strideq*1], 1
    212    mova           xm1, [leftq+16*0]            ; first 16 bytes of the left buffer
    213    mova           xm2, [leftq+16*1]            ; second 16 bytes of the left buffer
    214    lea             r2, [strideq*3]             ; r2 = 3 * stride
    215    vinserti32x4   m18, [dstq+strideq*2], 2
    216    mova            m5, [base+cdef_perm]
    217    vinserti32x4   m18, [dstq+r2       ], 3
    218    vpermt2d        m1, m5, m18                 ; m1 = left/rows 0-3 arranged per cdef_perm
    219    vinserti32x4    m0, m1, [topq+strideq*0-4], 0 ; m0 = m1 with lane 0 = top row 0, from 2 px left
    220    vinserti32x4    m0, [topq+strideq*1-4], 1   ; m0 lane 1 = top row 1
    221    lea             r3, [dstq+strideq*4]        ; r3 = address of dst row 4
    222    movu          xm19, [r3+strideq*0]          ; m19 lanes 0-3 = dst rows 4-7
    223    vinserti128   ym19, [r3+strideq*1], 1
    224    vinserti32x4   m19, [r3+strideq*2], 2
    225    vinserti32x4   m19, [r3+r2       ], 3
    226    mov            r3d, edgem                   ; r3d = edge flags
    227    movifnidn     prid, prim
    228    vpermt2d        m2, m5, m19                 ; m2 = left/rows 4-7 arranged per cdef_perm
    229    vpbroadcastd   m16, [base+pd_268435568]     ; top accumulators start at the rounding bias
    230    pxor           m12, m12                     ; m12 = 0
    231    punpcklwd      m18, m18    ; px (top)
    232    psrlw           m5, 8                       ; keep the high byte of every index word
    233    punpcklwd      m19, m19    ; px (bottom)
    234    mova           m17, m16                     ; bottom accumulators, same initial value
    235    vshufi32x4      m1, m2, q3210               ; m1 = lanes 0-1 of m1, lanes 2-3 of m2
    236    cmp            r3d, 0x0f                    ; are all four edge flags set?
    237    jne .mask_edges                             ; no: mark unavailable pixels first
    238    vinserti32x4    m2, [botq+strideq*0-4], 2   ; m2 lane 2 = bottom row 0
    239    vinserti32x4    m2, [botq+strideq*1-4], 3   ; m2 lane 3 = bottom row 1
    240 .main:
    241    test          prid, prid
    242    jz .sec_only                                ; pri == 0: only the secondary filter runs
    243    lzcnt          r4d, prid
    244    rorx           r3d, prid, 2                 ; r3d = pri rotated right by 2
    245    vpbroadcastw   m13, prim                    ; m13 = primary strength
           ; r10m is the argument slot following edge; it is not named in the
           ; cglobal list above.
    246    cmp     dword r10m, 0xfff  ; if (bpc == 12)
    247    cmove         prid, r3d    ;     pri >>= 2
    248    mov            r3d, dampingm
    249    and           prid, 4                       ; prid = 0 or 4: byte offset of the tap pair
    250    sub            r3d, 31                      ; r3d = damping - 31
    251    vpbroadcastd   m15, [base+pri_taps4+priq]   ; m15 = primary tap pair
    252    xor           prid, prid
    253    add            r4d, r3d                     ; lzcnt(pri) + damping - 31; sets SF for cmovns
    254    cmovns        prid, r4d    ; pri_shift
    255    mov            r4d, dirm
    256    vpbroadcastw   m14, prid                    ; m14 = pri_shift (0 when the sum was negative)
    257    mov            r5d, secm
    258    vpbroadcastd    m9, [base+cdef_dirs4+(r4+2)*4] ; primary direction offsets (k0, k1)
    259    call .constrain
    260    test           r5d, r5d
    261    jz .end_no_clip                             ; sec == 0: no clipping needed
    262    lzcnt          r5d, r5d
    263    vpbroadcastw   m13, secm                    ; m13 = secondary strength
    264    add            r3d, r5d                     ; r3d = damping - 31 + lzcnt(sec), used as sec shift
           ; Track min/max over px and every tap pixel. .constrain returns the
           ; tap pixels in m6/m8 (top) and m7/m9 (bottom).
    265    pminuw          m3, m18, m6
    266    pmaxsw          m4, m18, m6
    267    pminuw         m20, m19, m7
    268    pmaxsw         m21, m19, m7
    269    pminuw          m3, m8
    270    pmaxsw          m4, m8
    271    pminuw         m20, m9
    272    pmaxsw         m21, m9
    273    call .constrain_sec
    274    pminuw          m3, m6
    275    pmaxsw          m4, m6
    276    pminuw         m20, m7
    277    pmaxsw         m21, m7
    278    pminuw          m3, m8
    279    pmaxsw          m4, m8
    280    pminuw         m20, m9
    281    pmaxsw         m21, m9
    282    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4] ; second set of secondary offsets
    283    call .constrain
    284    pminuw          m3, m6
    285    pmaxsw          m4, m6
    286    mov             r3, 0xcccccccccccccccc      ; byte mask: bytes 2-3 of every dword
    287    pminuw         m20, m7
    288    pmaxsw         m21, m7
    289    kmovq           k1, r3
    290    pminuw          m3, m8
    291    pmaxsw          m4, m8
    292    pminuw         m20, m9
    293    pmaxsw         m21, m9
           ; Fold the k0/k1 words of every dword and pack the top result into the
           ; low word and the bottom result into the high word of each dword.
           ; The same packing is applied to min (m3), max (m4) and the finalized
           ; accumulators (m16).
    294    vbroadcasti32x4 m0, [base+deint_shuf]       ; last base+ access; r6 is reused below
    295    vpshldd         m6, m20, m3, 16             ; m6 = (m20 << 16) | (m3 >> 16)
    296    vmovdqu8    m3{k1}, m20                     ; m3 = low word of m3, high word of m20
    297    vpshldd        m18, m16, 8                  ; m18 = (m18 << 8) | (m16 >> 24)
    298    vpshldd         m7, m21, m4, 16
    299    vmovdqu8    m4{k1}, m21
    300    vpshldd        m19, m17, 8
    301    pminuw          m3, m6                      ; per dword: top min | bottom min
    302    paddd          m16, m18
    303    pmaxsw          m4, m7                      ; per dword: top max | bottom max
    304    paddd          m17, m19
    305    psrldq         m16, 1                       ; top: bytes 1-2 of each dword -> low word
    306    palignr    m16{k1}, m17, m17, 15            ; bottom: bytes 1-2 of each dword -> high word
    307    lea             r6, [dstq+strideq*4]        ; r6 = address of dst row 4
    308    pmaxsw         m16, m3                      ; clamp from below
    309    pminsw         m16, m4                      ; clamp from above
    310    pshufb         m16, m0                      ; per lane: top row in low 8 bytes, bottom in high
           ; Lane n of m16 now holds dst row n (low half) and row n+4 (high half).
    311    movq   [dstq+strideq*0], xm16
    312    movhps [r6  +strideq*0], xm16
    313    vextracti128  xm17, ym16, 1
    314    movq   [dstq+strideq*1], xm17
    315    movhps [r6  +strideq*1], xm17
    316    vextracti32x4  xm17, m16, 2
    317    movq   [dstq+strideq*2], xm17
    318    movhps [r6  +strideq*2], xm17
    319    vextracti32x4  xm16, m16, 3
    320    movq   [dstq+r2       ], xm16
    321    movhps [r6  +r2       ], xm16
    322    RET
    323 .sec_only:
           ; Reached from .main when pri == 0: run only the two secondary passes.
    324    mov            r4d, dirm
    325    tzcnt          r5d, secm
    326    mov            r3d, dampingm
    327    vpbroadcastw   m13, secm                    ; m13 = secondary strength
    328    sub            r3d, r5d    ; sec_shift
    329    call .constrain_sec
    330    vpbroadcastd    m9, [base+cdef_dirs4+(r4+0)*4] ; second set of secondary offsets
    331    call .constrain
    332 .end_no_clip:
           ; Also reached from .main when sec == 0. Finalizes both accumulators
           ; without clamping and stores the eight rows.
    333    mova          ym20, [base+end_perm4]
    334    vpshldd        m18, m16, 8 ; (px << 8) + ((sum > -8) << 4)
    335    vpshldd        m19, m17, 8
    336    paddd          m16, m18    ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    337    paddd          m17, m19
    338    vpermb         m16, m20, m16                ; pick bytes 1-2 of every dword = rows 0-3
    339    vpermb         m17, m20, m17                ; same for rows 4-7
    340    movq   [dstq+strideq*0], xm16
    341    movhps [dstq+strideq*1], xm16
    342    vextracti128  xm16, ym16, 1
    343    movq   [dstq+strideq*2], xm16
    344    movhps [dstq+r2       ], xm16
    345    lea           dstq, [dstq+strideq*4]        ; advance dst to row 4
    346    movq   [dstq+strideq*0], xm17
    347    movhps [dstq+strideq*1], xm17
    348    vextracti128  xm17, ym17, 1
    349    movq   [dstq+strideq*2], xm17
    350    movhps [dstq+r2       ], xm17
    351    RET
    352 .mask_edges:
           ; Reached when edge != 0x0f. Overwrites the unavailable border pixels
           ; of m0/m1/m2 with -16384, then rejoins .main. On entry r3d = edge flags.
    353    vpbroadcastd    m6, [base+pw_m16384]        ; m6 = marker value in every word
    354    test           r3b, 0x08                    ; edge bit 3 guards the bottom-row loads
    355    jz .mask_edges_no_bottom   ; avoid buffer overread
    356    vinserti32x4    m2, [botq+strideq*0-4], 2   ; m2 lane 2 = bottom row 0
    357    vinserti32x4    m2, [botq+strideq*1-4], 3   ; m2 lane 3 = bottom row 1
    358    kmovw           k1, [base+edge_mask4-8+r3*2] ; mask for m0: table entry (edge - 4)
    359    jmp .mask_edges_main
    360 .mask_edges_no_bottom:
           ; +8 instead of -8 is 16 bytes (8 entries) further on, i.e. the entry
           ; that edge would select if bit 3 were set.
    361    kmovw           k1, [base+edge_mask4+8+r3*2]
    362 .mask_edges_main:
    363    mov            r4d, r3d
    364    or             r3d, 0x0c                    ; m1 mask: looked up with bits 2 and 3 forced on
    365    vmovdqa32   m0{k1}, m6     ; edge pixels = -16384
    366    kmovw           k1, [base+edge_mask4-8+r3*2]
    367    or             r4d, 0x04                    ; m2 mask: looked up with bit 2 forced on
    368    vmovdqa32   m1{k1}, m6
    369    kmovw           k1, [base+edge_mask4-8+r4*2]
    370    vmovdqa32   m2{k1}, m6
    371    jmp .main
    372 .constrain_sec:
           ; Entry for the first secondary pass: loads the (dir+4) offsets, the
           ; shift from r3d and the secondary taps, then falls into .constrain.
    373    vpbroadcastd    m9, [base+cdef_dirs4+(r4+4)*4]
    374    vpbroadcastw   m14, r3d                     ; m14 = secondary shift
    375    vpbroadcastd   m15, [base+sec_taps4]        ; m15 = secondary tap pair
    376 .constrain:
           ; In:  m9 = offset pair (k0, k1) in every dword, m5 = base word indices,
           ;      m0:m1 = sources for the top half, m1:m2 = sources for the bottom
           ;      half, m18/m19 = px, m13 = strength, m14 = shift, m15 = taps,
           ;      m12 = zero.
           ; Out: m6/m7 = top/bottom pixels at +offset, m8/m9 = top/bottom pixels
           ;      at -offset, m16/m17 += taps * constrained differences.
           ; Clobbers m10, m11, k1.
    377    paddw           m7, m5, m9                  ; indices of the pixels at +offset
    378    mova            m6, m0
    379    vpermt2w        m6, m7, m1 ; k0p0 k1p0 (top)
    380    psubw           m9, m5, m9                  ; indices of the pixels at -offset
    381    mova            m8, m0
    382    vpermi2w        m7, m1, m2 ; k0p0 k1p0 (bottom)
    383    CONSTRAIN      m10, m6, m18, m12, m13, m14, m11
    384    vpermt2w        m8, m9, m1 ; k0p1 k1p1 (top)
    385    vpdpwssd       m16, m10, m15                ; m16 += k0 * tap0 + k1 * tap1 per dword
    386    CONSTRAIN      m10, m7, m19, m12, m13, m14, m11
    387    vpermi2w        m9, m1, m2 ; k0p1 k1p1 (bottom)
    388    vpdpwssd       m17, m10, m15
    389    CONSTRAIN      m10, m8, m18, m12, m13, m14, m11
    390    vpdpwssd       m16, m10, m15
    391    CONSTRAIN      m10, m9, m19, m12, m13, m14, m11
    392    vpdpwssd       m17, m10, m15
    393    ret
    394 
    395 cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \
    396                                               pri, sec, dir, damping, edge
        ; cdef_filter_8x8_16bpc(dst, stride, left, top, bot,
        ;                       pri, sec, dir, damping, edge)
        ; Filters one 8x8 block of 16-bit pixels in place at dst.
        ; The padded source block is spilled to a 64*6-byte stack buffer (two
        ; 32-byte rows per 64-byte slot) and tap pixels are fetched from it with
        ; unaligned loads at a byte offset taken from cdef_dirs8.
        ; Register roles inside this routine:
        ;   m0 / m1      px for rows 0-3 / rows 4-7
        ;   m16 / m17    16-bit weighted sums for rows 0-3 / rows 4-7
        ;   m18,m19 / m20,m21  running min,max for rows 0-3 / rows 4-7
        ;                (before .main they hold source rows, see below)
        ;   m12 zero, m13 strength, m14 shift
    397 %define base r6-cdef_dirs8
    398    lea             r6, [cdef_dirs8]            ; r6 = anchor for all base+ table accesses
    399    movu          ym17, [dstq+strideq*0]        ; m17 = dst rows 0 (low half) and 1 (high half)
    400    vinserti32x8   m17, [dstq+strideq*1], 1
    401    movq           xm4, [leftq+8*0]             ; m4-m7 = 8 bytes each from the left buffer
    402    movq           xm5, [leftq+8*1]
    403    psrld           m2, [base+cdef_perm], 16    ; dword permute indices from bytes 2-3 of cdef_perm
    404    movq           xm6, [leftq+8*2]
    405    movq           xm7, [leftq+8*3]
    406    lea             r2, [strideq*3]             ; r2 = 3 * stride
    407    movu          ym16, [topq+strideq*0-4]      ; m16 = the two top rows, from 2 px left
    408    vinserti32x8   m16, [topq+strideq*1-4], 1
    409    lea             r3, [dstq+strideq*4]        ; r3 = address of dst row 4
    410    movu          ym18, [dstq+strideq*2]        ; m18 = dst rows 2, 3
    411    vinserti32x8   m18, [dstq+r2       ], 1
    412    movu          ym19, [r3+strideq*0]          ; m19 = dst rows 4, 5
    413    vinserti32x8   m19, [r3+strideq*1], 1
    414    movu          ym20, [r3+strideq*2]          ; m20 = dst rows 6, 7
    415    vinserti32x8   m20, [r3+r2       ], 1
    416    vshufi32x4      m0, m17, m18, q2020 ; px (top)
    417    mov            r3d, edgem                   ; r3d = edge flags
    418    vshufi32x4      m1, m19, m20, q2020 ; px (bottom)
    419    movifnidn     prid, prim
           ; Insert the left pixels in front of each row so every 32-byte row has
           ; the same layout as the top/bottom rows loaded from ptr-4.
    420    vpermt2d       m17, m2, m4
    421    vpermt2d       m18, m2, m5
    422    pxor           m12, m12                     ; m12 = 0
    423    vpermt2d       m19, m2, m6
    424    vpermt2d       m20, m2, m7
    425    cmp            r3d, 0x0f                    ; are all four edge flags set?
    426    jne .mask_edges                             ; no: mark unavailable pixels first
    427    movu          ym21, [botq+strideq*0-4]      ; m21 = the two bottom rows, from 2 px left
    428    vinserti32x8   m21, [botq+strideq*1-4], 1
    429 .main:
    430    mova    [rsp+64*0], m16    ; top
    431    mova    [rsp+64*1], m17    ; 0 1
    432    mova    [rsp+64*2], m18    ; 2 3
    433    mova    [rsp+64*3], m19    ; 4 5
    434    mova    [rsp+64*4], m20    ; 6 7
    435    mova    [rsp+64*5], m21    ; bottom
    436    test          prid, prid
    437    jz .sec_only                                ; pri == 0: only the secondary filter runs
    438    lzcnt          r4d, prid
    439    rorx           r3d, prid, 2                 ; r3d = pri rotated right by 2
    440    vpbroadcastw   m13, prim                    ; m13 = primary strength
           ; r10m is the argument slot following edge; it is not named in the
           ; cglobal list above.
    441    cmp     dword r10m, 0xfff  ; if (bpc == 12)
    442    cmove         prid, r3d    ;     pri >>= 2
    443    mov            r3d, dampingm
    444    and           prid, 4                       ; prid = 0 or 4: selects the tap set below
    445    sub            r3d, 31                      ; r3d = damping - 31
    446    add            r4d, r3d    ; pri_shift
    447    vpbroadcastw   m14, r4d
    448    mov            r4d, dirm
    449    vpbroadcastd    m2, [base+pri_taps8+priq*2+0] ; m2 = tap for the k0 pixels
    450    vpbroadcastd    m3, [base+pri_taps8+priq*2+4] ; m3 = tap for the k1 pixels
    451    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1
    452    pmaxsw         m14, m12                     ; clamp pri_shift to >= 0
    453    call .constrain
    454    mov            r5d, secm
    455    pmullw         m16, m8, m2                  ; sum = k0 term * tap0 (rows 0-3)
    456    pmullw         m17, m9, m2                  ; same for rows 4-7
    457    test           r5d, r5d
    458    jnz .pri_sec
    459    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
    460    call .constrain
    461    pmullw          m8, m3                      ; k1 term * tap1; added to the sum at .end_no_clip
    462    pmullw          m9, m3
    463    jmp .end_no_clip
    464 .pri_sec:
    465    lzcnt          r5d, r5d
    466    add            r3d, r5d    ; sec_shift
    467    movsx           r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
           ; Start min/max from px and the k0 tap pixels left in m4/m5 by the
           ; previous call; .min_max_constrain2 folds in m6/m7.
    468    pminuw         m18, m0, m4
    469    pmaxsw         m19, m0, m4
    470    pminuw         m20, m1, m5
    471    pmaxsw         m21, m1, m5
    472    call .min_max_constrain2
    473    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2
    474    pmullw          m8, m3                      ; primary k1 term * tap1
    475    pmullw          m9, m3
    476    vpbroadcastw   m13, secm                    ; switch to the secondary strength
    477    vpbroadcastw   m14, r3d                     ; and the secondary shift
    478    paddw          m16, m8
    479    paddw          m17, m9
    480    call .min_max_constrain
    481    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3
    482    mova            m2, m8                      ; m2/m3 now collect the secondary k0 terms
    483    mova            m3, m9
    484    call .min_max_constrain
    485    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2
    486    paddw           m2, m8
    487    paddw           m3, m9
    488    call .min_max_constrain
    489    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3
    490    paddw           m2, m2                      ; secondary k0 terms are weighted by 2
    491    paddw           m3, m3
    492    paddw          m16, m8                      ; secondary k1 terms are weighted by 1
    493    paddw          m17, m9
    494    call .min_max_constrain
    495    vpbroadcastd   m10, [base+pw_2048]
    496    paddw          m16, m2
    497    paddw          m17, m3
    498    paddw          m16, m8
    499    paddw          m17, m9
           ; Rounding: sum -= (sum < 0); then pmulhrsw by 2048 computes
           ; (sum + 8) >> 4.
    500    psraw           m8, m16, 15
    501    psraw           m9, m17, 15
    502    paddw          m16, m8
    503    paddw          m17, m9
    504    pmulhrsw       m16, m10
    505    pmulhrsw       m17, m10
           ; Fold the tap pixels of the last call into min/max.
    506    pminuw         m18, m4
    507    pmaxsw         m19, m4
    508    pminuw         m20, m5
    509    pmaxsw         m21, m5
    510    pminuw         m18, m6
    511    pmaxsw         m19, m6
    512    pminuw         m20, m7
    513    pmaxsw         m21, m7
    514    paddw          m16, m0                      ; add px
    515    paddw          m17, m1
    516    pmaxsw         m16, m18                     ; clamp from below
    517    pmaxsw         m17, m20
    518    pminsw         m16, m19                     ; clamp from above
    519    pminsw         m17, m21
    520    jmp .end
    521 .sec_only:
           ; Reached from .main when pri == 0. Runs the four secondary passes and
           ; accumulates them in m16/m17 with weight 2 for the two k0 offsets and
           ; weight 1 for the two k1 offsets.
    522    tzcnt          r5d, secm
    523    mov            r4d, dirm
    524    mov            r3d, dampingm
    525    vpbroadcastw   m13, secm                    ; m13 = secondary strength
    526    sub            r3d, r5d                     ; r3d = damping - tzcnt(sec), used as the shift
    527    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+0]
    528    vpbroadcastw   m14, r3d
    529    call .constrain
    530    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+0]
    531    mova           m16, m8
    532    mova           m17, m9
    533    call .constrain
    534    movsx           r5, byte [base+cdef_dirs8+(r4+0)*2+1]
    535    paddw          m16, m8
    536    paddw          m17, m9
    537    call .constrain
    538    movsx           r5, byte [base+cdef_dirs8+(r4+4)*2+1]
    539    paddw          m16, m16                     ; double the two k0 terms
    540    paddw          m17, m17
    541    paddw          m16, m8
    542    paddw          m17, m9
    543    call .constrain
    544 .end_no_clip:
           ; On entry m8/m9 hold the last term still to be added to m16/m17.
           ; Rounding: sum -= (sum < 0); then pmulhrsw by 2048 computes
           ; (sum + 8) >> 4. No clamping is applied on this path.
    545    vpbroadcastd   m10, [base+pw_2048]
    546    paddw          m16, m8
    547    paddw          m17, m9
    548    psraw           m8, m16, 15
    549    psraw           m9, m17, 15
    550    paddw          m16, m8
    551    paddw          m17, m9
    552    pmulhrsw       m16, m10
    553    pmulhrsw       m17, m10
    554    paddw          m16, m0                      ; add px
    555    paddw          m17, m1
    556 .end:
           ; m16 = rows 0-3 and m17 = rows 4-7, one 16-byte row per lane.
    557    mova          [dstq+strideq*0], xm16
    558    vextracti128  [dstq+strideq*1], ym16, 1
    559    vextracti32x4 [dstq+strideq*2], m16, 2
    560    vextracti32x4 [dstq+r2       ], m16, 3
    561    lea           dstq, [dstq+strideq*4]        ; advance dst to row 4
    562    mova          [dstq+strideq*0], xm17
    563    vextracti128  [dstq+strideq*1], ym17, 1
    564    vextracti32x4 [dstq+strideq*2], m17, 2
    565    vextracti32x4 [dstq+r2       ], m17, 3
    566    RET
    567 .mask_edges:
           ; Reached when edge != 0x0f. Replaces unavailable rows/columns of the
           ; source registers m16-m21 with -16384 before they are spilled at .main.
           ; On entry r3d = edge flags.
    568    vpbroadcastd    m2, [base+pw_m16384]        ; m2 = marker value in every word
    569    test           r3b, 0x08                    ; edge bit 3 guards the bottom-row loads
    570    jz .mask_edges_no_bottom  ; avoid buffer overread
    571    movu          ym21, [botq+strideq*0-4]      ; m21 = the two bottom rows, from 2 px left
    572    vinserti32x8   m21, [botq+strideq*1-4], 1
    573    jmp .mask_edges_top
    574 .mask_edges_no_bottom:
    575    mova           m21, m2                      ; both bottom rows = marker
    576 .mask_edges_top:
    577    test           r3b, 0x04
    578    jnz .mask_edges_main
    579    mova           m16, m2                      ; edge bit 2 clear: both top rows = marker
    580 .mask_edges_main:
    581    and            r3d, 0x03                    ; keep the two low edge bits
    582    cmp            r3d, 0x03
    583    je .main                                    ; both set: no column masking needed
    584    kmovw           k1, [base+edge_mask8+r3*2]  ; dword mask of the columns to overwrite
    585    vmovdqa32  m16{k1}, m2     ; edge pixels = -16384
    586    vmovdqa32  m17{k1}, m2
    587    vmovdqa32  m18{k1}, m2
    588    vmovdqa32  m19{k1}, m2
    589    vmovdqa32  m20{k1}, m2
    590    vmovdqa32  m21{k1}, m2
    591    jmp .main
    592 ALIGN function_align
    593 .min_max_constrain:
           ; Folds the tap pixels left in m4-m7 by the previous call into the
           ; running min (m18, m20) and max (m19, m21), then falls into .constrain.
    594    pminuw         m18, m4
    595    pmaxsw         m19, m4
    596    pminuw         m20, m5
    597    pmaxsw         m21, m5
    598 .min_max_constrain2:
           ; Entry used when the caller has already folded in m4/m5 itself.
    599    pminuw         m18, m6
    600    pmaxsw         m19, m6
    601    pminuw         m20, m7
    602    pmaxsw         m21, m7
    603 .constrain:
           ; In:  r5 = signed byte offset into the stack buffer (from cdef_dirs8),
           ;      m0/m1 = px, m13 = strength, m14 = shift, m12 = zero.
           ; Out: m4/m5 = pixels at +offset (rows 0-3 / rows 4-7), m6/m7 = pixels
           ;      at -offset, m8/m9 = sum of both constrained differences.
           ;      r5 is negated on return.
           ; Clobbers m10, m11, m15, k1.
           ; tmp is the buffer address as seen from inside a call: presumably
           ; +gprsize skips the pushed return address, +64 skips the top-row slot
           ; and +4 skips the two left pixels of a row.
    604    %define        tmp  rsp+gprsize+68
    605    movu            m4, [tmp+r5+64*0]
    606    vshufi32x4      m4, [tmp+r5+64*1], q2020 ; k0p0 (top)
    607    movu            m5, [tmp+r5+64*2]
    608    vshufi32x4      m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom)
    609    neg             r5                          ; mirror the offset for the opposite tap
    610    movu            m6, [tmp+r5+64*0]
    611    vshufi32x4      m6, [tmp+r5+64*1], q2020 ; k0p1 (top)
    612    movu            m7, [tmp+r5+64*2]
    613    vshufi32x4      m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom)
    614    CONSTRAIN       m8, m4, m0, m12, m13, m14, m15
    615    CONSTRAIN       m9, m5, m1, m12, m13, m14, m15
    616    CONSTRAIN      m10, m6, m0, m12, m13, m14, m15
    617    CONSTRAIN      m11, m7, m1, m12, m13, m14, m15
    618    paddw           m8, m10                     ; rows 0-3: +offset term plus -offset term
    619    paddw           m9, m11                     ; rows 4-7
    620    ret
    621 
    622 %endif ; ARCH_X86_64