tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef16_avx2.asm (29671B)


      1 ; Copyright © 2021, VideoLAN and dav1d authors
      2 ; Copyright © 2021, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 SECTION_RODATA
     32 
     33 %macro DIR_TABLE 1 ; stride
; Pairs of byte offsets (off_k0, off_k1) into the padded pixel buffer for
; each of the 8 CDEF directions, scaled by the row stride %1 (16 for the
; 4-wide buffer, 32 for the 8-wide one).  The first four rows are repeated
; at the end so that dir+4 lookups (used for the secondary taps) can be
; indexed directly without a modulo-8 wrap.
     34    db  1 * %1 + 0,  2 * %1 + 0
     35    db  1 * %1 + 0,  2 * %1 - 2
     36    db -1 * %1 + 2, -2 * %1 + 4
     37    db  0 * %1 + 2, -1 * %1 + 4
     38    db  0 * %1 + 2,  0 * %1 + 4
     39    db  0 * %1 + 2,  1 * %1 + 4
     40    db  1 * %1 + 2,  2 * %1 + 4
     41    db  1 * %1 + 0,  2 * %1 + 2
     42    db  1 * %1 + 0,  2 * %1 + 0
     43    db  1 * %1 + 0,  2 * %1 - 2
     44    db -1 * %1 + 2, -2 * %1 + 4
     45    db  0 * %1 + 2, -1 * %1 + 4
%endmacro
     47 
     48 dir_table4: DIR_TABLE 16 ; offsets for the 16-byte-per-row px buffer (w=4)
     49 dir_table8: DIR_TABLE 32 ; offsets for the 32-byte-per-row px buffer (w=8)
     50 pri_taps:   dw  4, 4, 3, 3, 2, 2, 3, 3 ; [pri&4] -> k0 taps (4,4)/(3,3); +8 -> k1 taps (2,2)/(3,3)
     51 
; NOTE(review): per-bitdepth scaling constants, presumably consumed by the
; cdef_dir direction search later in this file (not visible in this chunk).
     52 dir_shift:  times 2 dw 0x4000
           times 2 dw 0x1000
     54 
     55 pw_2048:    times 2 dw 2048   ; pmulhrsw multiplier: (x*2048+0x4000)>>15 == rounded x>>4
     56 pw_m16384:  times 2 dw -16384 ; 0xc000 sentinel stored where edge pixels are unavailable
     57 
; NOTE(review): entry point shared with the 8 bpc build — presumably reused
; by a 16 bpc cdef_dir wrapper later in the file (not visible in this chunk).
     58 cextern cdef_dir_8bpc_avx2.main
     59 
     60 SECTION .text
     61 
     62 %macro CDEF_FILTER 2 ; w, h
; CDEF filter generator: %1 = block width, %2 = block height.
; Expects a bounds-padded copy of the block at [px] (built by the cglobal
; wrappers below, with 0xc000 sentinels for unavailable pixels), the
; direction in r7m and the primary/secondary strengths in r5m/r6m.
; Dispatches to one of three inner loops depending on which strengths are
; nonzero: .pri (primary only), .sec (secondary only), .pri_sec (both).
     63    DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp
     64    movifnidn     prid, r5m
     65    movifnidn     secd, r6m
     66    mov           dird, r7m
     67    vpbroadcastd    m8, [base+pw_2048]
     68    lea           dirq, [base+dir_table%1+dirq*2]
     69    test          prid, prid
     70    jz .sec_only
     71 %if WIN64
     72    vpbroadcastw    m6, prim
     73    movaps  [rsp+16*0], xmm9
     74    movaps  [rsp+16*1], xmm10
     75 %else
     76    movd           xm6, prid
     77    vpbroadcastw    m6, xm6
     78 %endif
     79    lzcnt      pridmpd, prid
     80    rorx          tmpd, prid, 2
     81    cmp     dword r10m, 0xfff ; if (bpc == 12)
     82    cmove         prid, tmpd  ;     pri >>= 2
     83    mov           tmpd, r8m   ; damping
     84    and           prid, 4
     85    sub           tmpd, 31
     86    vpbroadcastd    m9, [base+pri_taps+priq+8*0]
     87    vpbroadcastd   m10, [base+pri_taps+priq+8*1]
     88    test          secd, secd
     89    jz .pri_only
; WIN64: xmm11-13 are callee-saved; stash them in the caller's stack-
; parameter home slots.  r4m/r6m/r8m are 16 bytes apart, so each 16-byte
; movaps stays within the spill area.
     90 %if WIN64
     91    movaps         r8m, xmm13
     92    vpbroadcastw   m13, secm
     93    movaps         r4m, xmm11
     94    movaps         r6m, xmm12
     95 %else
     96    movd           xm0, secd
     97    vpbroadcastw   m13, xm0
     98 %endif
; pri_shift = max(0, damping - ulog2(pri)); sec_shift = damping - ulog2(sec).
; lzcnt gives 31 - log2, and 31 was already subtracted from damping above.
     99    lzcnt         secd, secd
    100    xor           prid, prid
    101    add        pridmpd, tmpd
    102    cmovs      pridmpd, prid
    103    add           secd, tmpd
    104    lea           tmpq, [px]
    105    mov    [pri_shift], pridmpq
    106    mov    [sec_shift], secq
    107 %rep %1*%2/16
    108    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
    109 %endrep
    110 %if WIN64
    111    movaps       xmm11, r4m
    112    movaps       xmm12, r6m
    113    movaps       xmm13, r8m
    114 %endif
    115    jmp .pri_end
    116 .pri_only:
    117    add        pridmpd, tmpd
    118    cmovs      pridmpd, secd
    119    lea           tmpq, [px]
    120    mov    [pri_shift], pridmpq
    121 %rep %1*%2/16
    122    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
    123 %endrep
    124 .pri_end:
    125 %if WIN64
    126    movaps        xmm9, [rsp+16*0]
    127    movaps       xmm10, [rsp+16*1]
    128 %endif
    129 .end:
    130    RET
    131 .sec_only:
    132    mov           tmpd, r8m ; damping
    133 %if WIN64
    134    vpbroadcastw    m6, secm
    135 %else
    136    movd           xm6, secd
    137    vpbroadcastw    m6, xm6
    138 %endif
; NOTE(review): tzcnt == ulog2 only when sec is a power of two — appears to
; hold for CDEF secondary strengths; confirm against the caller.
    139    tzcnt         secd, secd
    140    sub           tmpd, secd
    141    mov    [sec_shift], tmpq
    142    lea           tmpq, [px]
    143 %rep %1*%2/16
    144    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
    145 %endrep
    146    jmp .end
; The inner loops below are only emitted for square sizes (%1 == %2); the
; 4x8 function reuses the 4x4 helpers via the mangled _%1x%1_ call targets
; above.  Each call filters 16 pixels: four rows for w=4, two rows for w=8.
    147 %if %1 == %2
    148 ALIGN function_align
; Primary-only loop: constrain() both taps along off_k0/off_k1 (positive and
; negative offsets), weight by pri_taps, round, and store.
    149 .pri:
    150    movsx         offq, byte [dirq+4]    ; off_k0
    151 %if %1 == 4
    152    mova            m1, [tmpq+32*0]
    153    punpcklqdq      m1, [tmpq+32*1]      ; 0 2 1 3
    154    movu            m2, [tmpq+offq+32*0]
    155    punpcklqdq      m2, [tmpq+offq+32*1] ; k0p0
    156    neg           offq
    157    movu            m3, [tmpq+offq+32*0]
    158    punpcklqdq      m3, [tmpq+offq+32*1] ; k0p1
    159 %else
    160    mova           xm1, [tmpq+32*0]
    161    vinserti128     m1, [tmpq+32*1], 1
    162    movu           xm2, [tmpq+offq+32*0]
    163    vinserti128     m2, [tmpq+offq+32*1], 1
    164    neg           offq
    165    movu           xm3, [tmpq+offq+32*0]
    166    vinserti128     m3, [tmpq+offq+32*1], 1
    167 %endif
    168    movsx         offq, byte [dirq+5]    ; off_k1
    169    psubw           m2, m1               ; diff_k0p0
    170    psubw           m3, m1               ; diff_k0p1
    171    pabsw           m4, m2               ; adiff_k0p0
    172    psrlw           m5, m4, [pri_shift+gprsize]
    173    psubusw         m0, m6, m5
    174    pabsw           m5, m3               ; adiff_k0p1
    175    pminsw          m0, m4
    176    psrlw           m4, m5, [pri_shift+gprsize]
    177    psignw          m0, m2               ; constrain(diff_k0p0)
    178    psubusw         m2, m6, m4
    179    pminsw          m2, m5
    180 %if %1 == 4
    181    movu            m4, [tmpq+offq+32*0]
    182    punpcklqdq      m4, [tmpq+offq+32*1] ; k1p0
    183    neg           offq
    184    movu            m5, [tmpq+offq+32*0]
    185    punpcklqdq      m5, [tmpq+offq+32*1] ; k1p1
    186 %else
    187    movu           xm4, [tmpq+offq+32*0]
    188    vinserti128     m4, [tmpq+offq+32*1], 1
    189    neg           offq
    190    movu           xm5, [tmpq+offq+32*0]
    191    vinserti128     m5, [tmpq+offq+32*1], 1
    192 %endif
    193    psubw           m4, m1               ; diff_k1p0
    194    psubw           m5, m1               ; diff_k1p1
    195    psignw          m2, m3               ; constrain(diff_k0p1)
    196    pabsw           m3, m4               ; adiff_k1p0
    197    paddw           m0, m2               ; constrain(diff_k0)
    198    psrlw           m2, m3, [pri_shift+gprsize]
    199    psubusw         m7, m6, m2
    200    pabsw           m2, m5               ; adiff_k1p1
    201    pminsw          m7, m3
    202    psrlw           m3, m2, [pri_shift+gprsize]
    203    psignw          m7, m4               ; constrain(diff_k1p0)
    204    psubusw         m4, m6, m3
    205    pminsw          m4, m2
    206    psignw          m4, m5               ; constrain(diff_k1p1)
    207    paddw           m7, m4               ; constrain(diff_k1)
    208    pmullw          m0, m9               ; pri_tap_k0
    209    pmullw          m7, m10              ; pri_tap_k1
    210    paddw           m0, m7               ; sum
; dst = px + ((8 + sum - (sum < 0)) >> 4): psraw/paddw subtracts 1 from
; negative sums, then pmulhrsw by 2048 computes the rounded >> 4.
    211    psraw           m2, m0, 15
    212    paddw           m0, m2
    213    pmulhrsw        m0, m8
    214    add           tmpq, 32*2
    215    paddw           m0, m1
    216 %if %1 == 4
    217    vextracti128   xm1, m0, 1
    218    movq   [dstq+strideq*0], xm0
    219    movq   [dstq+strideq*1], xm1
    220    movhps [dstq+strideq*2], xm0
    221    movhps [dstq+r9       ], xm1
    222    lea           dstq, [dstq+strideq*4]
    223 %else
    224    mova         [dstq+strideq*0], xm0
    225    vextracti128 [dstq+strideq*1], m0, 1
    226    lea           dstq, [dstq+strideq*2]
    227 %endif
    228    ret
    229 ALIGN function_align
; Secondary-only loop: four taps (two offsets, both signs), k0 taps weighted
; 2 (the paddw m0, m0 below) and k1 taps weighted 1.
    230 .sec:
    231    movsx         offq, byte [dirq+8]    ; off1_k0
    232 %if %1 == 4
    233    mova            m1, [tmpq+32*0]
    234    punpcklqdq      m1, [tmpq+32*1]
    235    movu            m2, [tmpq+offq+32*0]
    236    punpcklqdq      m2, [tmpq+offq+32*1] ; k0s0
    237    neg           offq
    238    movu            m3, [tmpq+offq+32*0]
    239    punpcklqdq      m3, [tmpq+offq+32*1] ; k0s1
    240 %else
    241    mova           xm1, [tmpq+32*0]
    242    vinserti128     m1, [tmpq+32*1], 1
    243    movu           xm2, [tmpq+offq+32*0]
    244    vinserti128     m2, [tmpq+offq+32*1], 1
    245    neg           offq
    246    movu           xm3, [tmpq+offq+32*0]
    247    vinserti128     m3, [tmpq+offq+32*1], 1
    248 %endif
    249    movsx         offq, byte [dirq+0]    ; off2_k0
    250    psubw           m2, m1               ; diff_k0s0
    251    psubw           m3, m1               ; diff_k0s1
    252    pabsw           m4, m2               ; adiff_k0s0
    253    psrlw           m5, m4, [sec_shift+gprsize]
    254    psubusw         m0, m6, m5
    255    pabsw           m5, m3               ; adiff_k0s1
    256    pminsw          m0, m4
    257    psrlw           m4, m5, [sec_shift+gprsize]
    258    psignw          m0, m2               ; constrain(diff_k0s0)
    259    psubusw         m2, m6, m4
    260    pminsw          m2, m5
    261 %if %1 == 4
    262    movu            m4, [tmpq+offq+32*0]
    263    punpcklqdq      m4, [tmpq+offq+32*1] ; k0s2
    264    neg           offq
    265    movu            m5, [tmpq+offq+32*0]
    266    punpcklqdq      m5, [tmpq+offq+32*1] ; k0s3
    267 %else
    268    movu           xm4, [tmpq+offq+32*0]
    269    vinserti128     m4, [tmpq+offq+32*1], 1
    270    neg           offq
    271    movu           xm5, [tmpq+offq+32*0]
    272    vinserti128     m5, [tmpq+offq+32*1], 1
    273 %endif
    274    movsx         offq, byte [dirq+9]    ; off1_k1
    275    psubw           m4, m1               ; diff_k0s2
    276    psubw           m5, m1               ; diff_k0s3
    277    psignw          m2, m3               ; constrain(diff_k0s1)
    278    pabsw           m3, m4               ; adiff_k0s2
    279    paddw           m0, m2
    280    psrlw           m2, m3, [sec_shift+gprsize]
    281    psubusw         m7, m6, m2
    282    pabsw           m2, m5               ; adiff_k0s3
    283    pminsw          m7, m3
    284    psrlw           m3, m2, [sec_shift+gprsize]
    285    psignw          m7, m4               ; constrain(diff_k0s2)
    286    psubusw         m4, m6, m3
    287    pminsw          m4, m2
    288 %if %1 == 4
    289    movu            m2, [tmpq+offq+32*0]
    290    punpcklqdq      m2, [tmpq+offq+32*1] ; k1s0
    291    neg           offq
    292    movu            m3, [tmpq+offq+32*0]
    293    punpcklqdq      m3, [tmpq+offq+32*1] ; k1s1
    294 %else
    295    movu           xm2, [tmpq+offq+32*0]
    296    vinserti128     m2, [tmpq+offq+32*1], 1
    297    neg           offq
    298    movu           xm3, [tmpq+offq+32*0]
    299    vinserti128     m3, [tmpq+offq+32*1], 1
    300 %endif
    301    movsx         offq, byte [dirq+1]    ; off2_k1
    302    paddw           m0, m7
    303    psignw          m4, m5               ; constrain(diff_k0s3)
    304    paddw           m0, m4               ; constrain(diff_k0)
    305    psubw           m2, m1               ; diff_k1s0
    306    psubw           m3, m1               ; diff_k1s1
    307    paddw           m0, m0               ; sec_tap_k0
    308    pabsw           m4, m2               ; adiff_k1s0
    309    psrlw           m5, m4, [sec_shift+gprsize]
    310    psubusw         m7, m6, m5
    311    pabsw           m5, m3               ; adiff_k1s1
    312    pminsw          m7, m4
    313    psrlw           m4, m5, [sec_shift+gprsize]
    314    psignw          m7, m2               ; constrain(diff_k1s0)
    315    psubusw         m2, m6, m4
    316    pminsw          m2, m5
    317 %if %1 == 4
    318    movu            m4, [tmpq+offq+32*0]
    319    punpcklqdq      m4, [tmpq+offq+32*1] ; k1s2
    320    neg           offq
    321    movu            m5, [tmpq+offq+32*0]
    322    punpcklqdq      m5, [tmpq+offq+32*1] ; k1s3
    323 %else
    324    movu           xm4, [tmpq+offq+32*0]
    325    vinserti128     m4, [tmpq+offq+32*1], 1
    326    neg           offq
    327    movu           xm5, [tmpq+offq+32*0]
    328    vinserti128     m5, [tmpq+offq+32*1], 1
    329 %endif
    330    paddw           m0, m7
    331    psubw           m4, m1               ; diff_k1s2
    332    psubw           m5, m1               ; diff_k1s3
    333    psignw          m2, m3               ; constrain(diff_k1s1)
    334    pabsw           m3, m4               ; adiff_k1s2
    335    paddw           m0, m2
    336    psrlw           m2, m3, [sec_shift+gprsize]
    337    psubusw         m7, m6, m2
    338    pabsw           m2, m5               ; adiff_k1s3
    339    pminsw          m7, m3
    340    psrlw           m3, m2, [sec_shift+gprsize]
    341    psignw          m7, m4               ; constrain(diff_k1s2)
    342    psubusw         m4, m6, m3
    343    pminsw          m4, m2
    344    paddw           m0, m7
    345    psignw          m4, m5               ; constrain(diff_k1s3)
    346    paddw           m0, m4               ; sum
    347    psraw           m2, m0, 15
    348    paddw           m0, m2
    349    pmulhrsw        m0, m8
    350    add           tmpq, 32*2
    351    paddw           m0, m1
    352 %if %1 == 4
    353    vextracti128   xm1, m0, 1
    354    movq   [dstq+strideq*0], xm0
    355    movq   [dstq+strideq*1], xm1
    356    movhps [dstq+strideq*2], xm0
    357    movhps [dstq+r9       ], xm1
    358    lea           dstq, [dstq+strideq*4]
    359 %else
    360    mova         [dstq+strideq*0], xm0
    361    vextracti128 [dstq+strideq*1], m0, 1
    362    lea           dstq, [dstq+strideq*2]
    363 %endif
    364    ret
    365 ALIGN function_align
; Combined primary+secondary loop.  Additionally tracks the signed max (m11)
; and unsigned min (m12) of every sampled tap so the filtered result can be
; clamped to the local pixel range; the 0xc000 padding sentinel reads as huge
; for pminuw and very negative for pmaxsw, so missing pixels never win.
    366 .pri_sec:
    367    movsx         offq, byte [dirq+8]    ; off2_k0
    368 %if %1 == 4
    369    mova            m1, [tmpq+32*0]
    370    punpcklqdq      m1, [tmpq+32*1]
    371    movu            m2, [tmpq+offq+32*0]
    372    punpcklqdq      m2, [tmpq+offq+32*1] ; k0s0
    373    neg           offq
    374    movu            m3, [tmpq+offq+32*0]
    375    punpcklqdq      m3, [tmpq+offq+32*1] ; k0s1
    376 %else
; NOTE(review): w=8 reloads the source rows from dstq (not yet overwritten at
; this point in the iteration) instead of the px copy — appears equivalent;
; confirm.
    377    mova           xm1, [dstq+strideq*0]
    378    vinserti128     m1, [dstq+strideq*1], 1
    379    movu           xm2, [tmpq+offq+32*0]
    380    vinserti128     m2, [tmpq+offq+32*1], 1
    381    neg           offq
    382    movu           xm3, [tmpq+offq+32*0]
    383    vinserti128     m3, [tmpq+offq+32*1], 1
    384 %endif
    385    movsx         offq, byte [dirq+0]    ; off3_k0
    386    pmaxsw         m11, m2, m3
    387    pminuw         m12, m2, m3
    388    psubw           m2, m1               ; diff_k0s0
    389    psubw           m3, m1               ; diff_k0s1
    390    pabsw           m4, m2               ; adiff_k0s0
    391    psrlw           m5, m4, [sec_shift+gprsize]
    392    psubusw         m0, m13, m5
    393    pabsw           m5, m3               ; adiff_k0s1
    394    pminsw          m0, m4
    395    psrlw           m4, m5, [sec_shift+gprsize]
    396    psignw          m0, m2               ; constrain(diff_k0s0)
    397    psubusw         m2, m13, m4
    398    pminsw          m2, m5
    399 %if %1 == 4
    400    movu            m4, [tmpq+offq+32*0]
    401    punpcklqdq      m4, [tmpq+offq+32*1] ; k0s2
    402    neg           offq
    403    movu            m5, [tmpq+offq+32*0]
    404    punpcklqdq      m5, [tmpq+offq+32*1] ; k0s3
    405 %else
    406    movu           xm4, [tmpq+offq+32*0]
    407    vinserti128     m4, [tmpq+offq+32*1], 1
    408    neg           offq
    409    movu           xm5, [tmpq+offq+32*0]
    410    vinserti128     m5, [tmpq+offq+32*1], 1
    411 %endif
    412    movsx         offq, byte [dirq+9]    ; off2_k1
    413    psignw          m2, m3               ; constrain(diff_k0s1)
    414    pmaxsw         m11, m4
    415    pminuw         m12, m4
    416    pmaxsw         m11, m5
    417    pminuw         m12, m5
    418    psubw           m4, m1               ; diff_k0s2
    419    psubw           m5, m1               ; diff_k0s3
    420    paddw           m0, m2
    421    pabsw           m3, m4               ; adiff_k0s2
    422    psrlw           m2, m3, [sec_shift+gprsize]
    423    psubusw         m7, m13, m2
    424    pabsw           m2, m5               ; adiff_k0s3
    425    pminsw          m7, m3
    426    psrlw           m3, m2, [sec_shift+gprsize]
    427    psignw          m7, m4               ; constrain(diff_k0s2)
    428    psubusw         m4, m13, m3
    429    pminsw          m4, m2
    430 %if %1 == 4
    431    movu            m2, [tmpq+offq+32*0]
    432    punpcklqdq      m2, [tmpq+offq+32*1] ; k1s0
    433    neg           offq
    434    movu            m3, [tmpq+offq+32*0]
    435    punpcklqdq      m3, [tmpq+offq+32*1] ; k1s1
    436 %else
    437    movu           xm2, [tmpq+offq+32*0]
    438    vinserti128     m2, [tmpq+offq+32*1], 1
    439    neg           offq
    440    movu           xm3, [tmpq+offq+32*0]
    441    vinserti128     m3, [tmpq+offq+32*1], 1
    442 %endif
    443    movsx         offq, byte [dirq+1]    ; off3_k1
    444    paddw           m0, m7
    445    psignw          m4, m5               ; constrain(diff_k0s3)
    446    pmaxsw         m11, m2
    447    pminuw         m12, m2
    448    pmaxsw         m11, m3
    449    pminuw         m12, m3
    450    paddw           m0, m4               ; constrain(diff_k0)
    451    psubw           m2, m1               ; diff_k1s0
    452    psubw           m3, m1               ; diff_k1s1
    453    paddw           m0, m0               ; sec_tap_k0
    454    pabsw           m4, m2               ; adiff_k1s0
    455    psrlw           m5, m4, [sec_shift+gprsize]
    456    psubusw         m7, m13, m5
    457    pabsw           m5, m3               ; adiff_k1s1
    458    pminsw          m7, m4
    459    psrlw           m4, m5, [sec_shift+gprsize]
    460    psignw          m7, m2               ; constrain(diff_k1s0)
    461    psubusw         m2, m13, m4
    462    pminsw          m2, m5
    463 %if %1 == 4
    464    movu            m4, [tmpq+offq+32*0]
    465    punpcklqdq      m4, [tmpq+offq+32*1] ; k1s2
    466    neg           offq
    467    movu            m5, [tmpq+offq+32*0]
    468    punpcklqdq      m5, [tmpq+offq+32*1] ; k1s3
    469 %else
    470    movu           xm4, [tmpq+offq+32*0]
    471    vinserti128     m4, [tmpq+offq+32*1], 1
    472    neg           offq
    473    movu           xm5, [tmpq+offq+32*0]
    474    vinserti128     m5, [tmpq+offq+32*1], 1
    475 %endif
    476    movsx         offq, byte [dirq+4]    ; off1_k0
    477    paddw           m0, m7
    478    psignw          m2, m3               ; constrain(diff_k1s1)
    479    pmaxsw         m11, m4
    480    pminuw         m12, m4
    481    pmaxsw         m11, m5
    482    pminuw         m12, m5
    483    psubw           m4, m1               ; diff_k1s2
    484    psubw           m5, m1               ; diff_k1s3
    485    pabsw           m3, m4               ; adiff_k1s2
    486    paddw           m0, m2
    487    psrlw           m2, m3, [sec_shift+gprsize]
    488    psubusw         m7, m13, m2
    489    pabsw           m2, m5               ; adiff_k1s3
    490    pminsw          m7, m3
    491    psrlw           m3, m2, [sec_shift+gprsize]
    492    psignw          m7, m4               ; constrain(diff_k1s2)
    493    psubusw         m4, m13, m3
    494    pminsw          m4, m2
    495    paddw           m0, m7
    496 %if %1 == 4
    497    movu            m2, [tmpq+offq+32*0]
    498    punpcklqdq      m2, [tmpq+offq+32*1] ; k0p0
    499    neg           offq
    500    movu            m3, [tmpq+offq+32*0]
    501    punpcklqdq      m3, [tmpq+offq+32*1] ; k0p1
    502 %else
    503    movu           xm2, [tmpq+offq+32*0]
    504    vinserti128     m2, [tmpq+offq+32*1], 1
    505    neg           offq
    506    movu           xm3, [tmpq+offq+32*0]
    507    vinserti128     m3, [tmpq+offq+32*1], 1
    508 %endif
    509    movsx         offq, byte [dirq+5]    ; off1_k1
    510    psignw          m4, m5               ; constrain(diff_k1s3)
    511    pmaxsw         m11, m2
    512    pminuw         m12, m2
    513    pmaxsw         m11, m3
    514    pminuw         m12, m3
    515    psubw           m2, m1               ; diff_k0p0
    516    psubw           m3, m1               ; diff_k0p1
    517    paddw           m0, m4
    518    pabsw           m4, m2               ; adiff_k0p0
    519    psrlw           m5, m4, [pri_shift+gprsize]
    520    psubusw         m7, m6, m5
    521    pabsw           m5, m3               ; adiff_k0p1
    522    pminsw          m7, m4
    523    psrlw           m4, m5, [pri_shift+gprsize]
    524    psignw          m7, m2               ; constrain(diff_k0p0)
    525    psubusw         m2, m6, m4
    526    pminsw          m2, m5
    527 %if %1 == 4
    528    movu            m4, [tmpq+offq+32*0]
    529    punpcklqdq      m4, [tmpq+offq+32*1] ; k1p0
    530    neg           offq
    531    movu            m5, [tmpq+offq+32*0]
    532    punpcklqdq      m5, [tmpq+offq+32*1] ; k1p1
    533 %else
    534    movu           xm4, [tmpq+offq+32*0]
    535    vinserti128     m4, [tmpq+offq+32*1], 1
    536    neg           offq
    537    movu           xm5, [tmpq+offq+32*0]
    538    vinserti128     m5, [tmpq+offq+32*1], 1
    539 %endif
    540    psignw          m2, m3               ; constrain(diff_k0p1)
    541    paddw           m7, m2               ; constrain(diff_k0)
    542    pmaxsw         m11, m4
    543    pminuw         m12, m4
    544    pmaxsw         m11, m5
    545    pminuw         m12, m5
    546    psubw           m4, m1               ; diff_k1p0
    547    psubw           m5, m1               ; diff_k1p1
    548    pabsw           m3, m4               ; adiff_k1p0
    549    pmullw          m7, m9               ; pri_tap_k0
    550    paddw           m0, m7
    551    psrlw           m2, m3, [pri_shift+gprsize]
    552    psubusw         m7, m6, m2
    553    pabsw           m2, m5               ; adiff_k1p1
    554    pminsw          m7, m3
    555    psrlw           m3, m2, [pri_shift+gprsize]
    556    psignw          m7, m4               ; constrain(diff_k1p0)
    557    psubusw         m4, m6, m3
    558    pminsw          m4, m2
    559    psignw          m4, m5               ; constrain(diff_k1p1)
    560    paddw           m7, m4               ; constrain(diff_k1)
    561    pmullw          m7, m10              ; pri_tap_k1
    562    paddw           m0, m7               ; sum
    563    psraw           m2, m0, 15
    564    paddw           m0, m2
    565    pmulhrsw        m0, m8
    566    add           tmpq, 32*2
    567    pmaxsw         m11, m1
    568    pminuw         m12, m1
    569    paddw           m0, m1
    570    pminsw          m0, m11
    571    pmaxsw          m0, m12
    572 %if %1 == 4
    573    vextracti128   xm1, m0, 1
    574    movq   [dstq+strideq*0], xm0
    575    movq   [dstq+strideq*1], xm1
    576    movhps [dstq+strideq*2], xm0
    577    movhps [dstq+r9       ], xm1
    578    lea           dstq, [dstq+strideq*4]
    579 %else
    580    mova         [dstq+strideq*0], xm0
    581    vextracti128 [dstq+strideq*1], m0, 1
    582    lea           dstq, [dstq+strideq*2]
    583 %endif
    584    ret
    585 %endif
    586 %endmacro
    587 
    588 INIT_YMM avx2
    589 cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \
    590                                                pri, sec, edge
; Builds a bounds-padded copy of the 4x4 block at px (16-byte rows: 2 rows of
; context above and below, 2 pixels of left context at px-4 and right context
; at px+8).  Edges flagged absent in the edge mask (r9m) are filled with the
; pw_m16384 sentinel (0xc000), which constrain() reduces to 0, then the
; shared CDEF_FILTER macro does the filtering.
    591 %if WIN64
    592    %define         px  rsp+16*6
    593    %define       offq  r8
    594    %define  pri_shift  rsp+16*2
    595    %define  sec_shift  rsp+16*3
    596 %else
    597    %define         px  rsp+16*4
    598    %define       offq  r4
    599    %define  pri_shift  rsp+16*0
    600    %define  sec_shift  rsp+16*1
    601 %endif
    602    %define       base  r8-dir_table4
    603    mov          edged, r9m
    604    lea             r8, [dir_table4]
    605    movu           xm0, [dstq+strideq*0]
    606    movu           xm1, [dstq+strideq*1]
    607    lea             r9, [strideq*3]
    608    movu           xm2, [dstq+strideq*2]
    609    movu           xm3, [dstq+r9       ]
    610    vpbroadcastd    m7, [base+pw_m16384]
    611    mova   [px+16*0+0], xm0
    612    mova   [px+16*1+0], xm1
    613    mova   [px+16*2+0], xm2
    614    mova   [px+16*3+0], xm3
    615    test         edgeb, 4 ; HAVE_TOP
    616    jz .no_top
    617    movu           xm0, [topq+strideq*0]
    618    movu           xm1, [topq+strideq*1]
    619    mova   [px-16*2+0], xm0
    620    mova   [px-16*1+0], xm1
    621    test         edgeb, 1 ; HAVE_LEFT
    622    jz .top_no_left
    623    movd           xm0, [topq+strideq*0-4]
    624    movd           xm1, [topq+strideq*1-4]
    625    movd   [px-16*2-4], xm0
    626    movd   [px-16*1-4], xm1
    627    jmp .top_done
; one 32-byte store covers both missing top rows; falls through so the
; top-left corner pixels are sentinel-filled as well
    628 .no_top:
    629    mova   [px-16*2+0], m7
    630 .top_no_left:
    631    movd   [px-16*2-4], xm7
    632    movd   [px-16*1-4], xm7
    633 .top_done:
    634    test         edgeb, 8 ; HAVE_BOTTOM
    635    jz .no_bottom
    636    movu           xm0, [botq+strideq*0]
    637    movu           xm1, [botq+strideq*1]
    638    mova   [px+16*4+0], xm0
    639    mova   [px+16*5+0], xm1
    640    test         edgeb, 1 ; HAVE_LEFT
    641    jz .bottom_no_left
    642    movd           xm0, [botq+strideq*0-4]
    643    movd           xm1, [botq+strideq*1-4]
    644    movd   [px+16*4-4], xm0
    645    movd   [px+16*5-4], xm1
    646    jmp .bottom_done
    647 .no_bottom:
    648    mova   [px+16*4+0], m7
    649 .bottom_no_left:
    650    movd   [px+16*4-4], xm7
    651    movd   [px+16*5-4], xm7
    652 .bottom_done:
    653    test         edgeb, 1 ; HAVE_LEFT
    654    jz .no_left
    655    movd           xm0, [leftq+4*0]
    656    movd           xm1, [leftq+4*1]
    657    movd           xm2, [leftq+4*2]
    658    movd           xm3, [leftq+4*3]
    659    movd   [px+16*0-4], xm0
    660    movd   [px+16*1-4], xm1
    661    movd   [px+16*2-4], xm2
    662    movd   [px+16*3-4], xm3
    663    jmp .left_done
    664 .no_left:
    665    REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3
    666 .left_done:
    667    test         edgeb, 2 ; HAVE_RIGHT
    668    jnz .padding_done
    669    REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5
    670 .padding_done:
    671    CDEF_FILTER      4, 4
    672 
    673 cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \
    674                                                pri, sec, edge
; 4x8 variant: same padding scheme as the 4x4 function above, with 8 source
; rows and the bottom context rows at px+16*8/px+16*9.  Reuses the
; px/base/offq/pri_shift/sec_shift %defines, which are still in effect from
; cdef_filter_4x4_16bpc.
    675    mov          edged, r9m
    676    movu           xm0, [dstq+strideq*0]
    677    movu           xm1, [dstq+strideq*1]
    678    lea             r9, [strideq*3]
    679    movu           xm2, [dstq+strideq*2]
    680    movu           xm3, [dstq+r9       ]
    681    lea             r6, [dstq+strideq*4]
    682    movu           xm4, [r6  +strideq*0]
    683    movu           xm5, [r6  +strideq*1]
    684    movu           xm6, [r6  +strideq*2]
    685    movu           xm7, [r6  +r9       ]
    686    lea             r8, [dir_table4]
    687    mova   [px+16*0+0], xm0
    688    mova   [px+16*1+0], xm1
    689    mova   [px+16*2+0], xm2
    690    mova   [px+16*3+0], xm3
    691    mova   [px+16*4+0], xm4
    692    mova   [px+16*5+0], xm5
    693    mova   [px+16*6+0], xm6
    694    mova   [px+16*7+0], xm7
    695    vpbroadcastd    m7, [base+pw_m16384]
    696    test         edgeb, 4 ; HAVE_TOP
    697    jz .no_top
    698    movu           xm0, [topq+strideq*0]
    699    movu           xm1, [topq+strideq*1]
    700    mova   [px-16*2+0], xm0
    701    mova   [px-16*1+0], xm1
    702    test         edgeb, 1 ; HAVE_LEFT
    703    jz .top_no_left
    704    movd           xm0, [topq+strideq*0-4]
    705    movd           xm1, [topq+strideq*1-4]
    706    movd   [px-16*2-4], xm0
    707    movd   [px-16*1-4], xm1
    708    jmp .top_done
; one 32-byte store covers both missing top rows; falls through so the
; top-left corner pixels are sentinel-filled as well
    709 .no_top:
    710    mova   [px-16*2+0], m7
    711 .top_no_left:
    712    movd   [px-16*2-4], xm7
    713    movd   [px-16*1-4], xm7
    714 .top_done:
    715    test         edgeb, 8 ; HAVE_BOTTOM
    716    jz .no_bottom
    717    movu           xm0, [botq+strideq*0]
    718    movu           xm1, [botq+strideq*1]
    719    mova   [px+16*8+0], xm0
    720    mova   [px+16*9+0], xm1
    721    test         edgeb, 1 ; HAVE_LEFT
    722    jz .bottom_no_left
    723    movd           xm0, [botq+strideq*0-4]
    724    movd           xm1, [botq+strideq*1-4]
    725    movd   [px+16*8-4], xm0
    726    movd   [px+16*9-4], xm1
    727    jmp .bottom_done
    728 .no_bottom:
    729    mova   [px+16*8+0], m7
    730 .bottom_no_left:
    731    movd   [px+16*8-4], xm7
    732    movd   [px+16*9-4], xm7
    733 .bottom_done:
    734    test         edgeb, 1 ; HAVE_LEFT
    735    jz .no_left
    736    movd           xm0, [leftq+4*0]
    737    movd           xm1, [leftq+4*1]
    738    movd           xm2, [leftq+4*2]
    739    movd           xm3, [leftq+4*3]
    740    movd   [px+16*0-4], xm0
    741    movd   [px+16*1-4], xm1
    742    movd   [px+16*2-4], xm2
    743    movd   [px+16*3-4], xm3
    744    movd           xm0, [leftq+4*4]
    745    movd           xm1, [leftq+4*5]
    746    movd           xm2, [leftq+4*6]
    747    movd           xm3, [leftq+4*7]
    748    movd   [px+16*4-4], xm0
    749    movd   [px+16*5-4], xm1
    750    movd   [px+16*6-4], xm2
    751    movd   [px+16*7-4], xm3
    752    jmp .left_done
    753 .no_left:
    754    REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
    755 .left_done:
    756    test         edgeb, 2 ; HAVE_RIGHT
    757    jnz .padding_done
    758    REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
    759 .padding_done:
    760    CDEF_FILTER      4, 8
    761 
    762 cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \
    763                                               pri, sec, edge
;------------------------------------------------------------------------------
; 16bpc 8x8 CDEF filter, AVX2.
; Copies the 8x8 destination block plus whichever neighbours are available
; (left columns, two top rows, two bottom rows) into a padded stack buffer
; `px` with a 32-byte row stride (rows -2..9; the visible pixels occupy the
; first 16 bytes of each row). Every position whose neighbour is unavailable
; is filled with the pw_m16384 sentinel instead, then the CDEF_FILTER 8, 8
; macro runs on the padded buffer.
; edge flag bits (read from r9m): 1 = HAVE_LEFT, 2 = HAVE_RIGHT,
;                                 4 = HAVE_TOP,  8 = HAVE_BOTTOM.
; NOTE(review): px sits one row higher on WIN64, presumably to leave room for
; cglobal's callee-saved xmm handling — confirm against x86inc.asm.
;------------------------------------------------------------------------------
    764 %if WIN64
    765    %define         px  rsp+32*4
    766 %else
    767    %define         px  rsp+32*3
    768 %endif
    769    %define       base  r8-dir_table8
    770    mov          edged, r9m              ; load edge availability flags
; Load the 8 source rows (two per stride step) and copy them into px rows 0-7.
    771    movu            m0, [dstq+strideq*0]
    772    movu            m1, [dstq+strideq*1]
    773    lea             r6, [dstq+strideq*2]
    774    movu            m2, [r6  +strideq*0]
    775    movu            m3, [r6  +strideq*1]
    776    lea             r6, [r6  +strideq*2]
    777    movu            m4, [r6  +strideq*0]
    778    movu            m5, [r6  +strideq*1]
    779    lea             r6, [r6  +strideq*2]
    780    movu            m6, [r6  +strideq*0]
    781    movu            m7, [r6  +strideq*1]
    782    lea             r8, [dir_table8]     ; r8 anchors base-relative addressing
    783    mova   [px+32*0+0], m0
    784    mova   [px+32*1+0], m1
    785    mova   [px+32*2+0], m2
    786    mova   [px+32*3+0], m3
    787    mova   [px+32*4+0], m4
    788    mova   [px+32*5+0], m5
    789    mova   [px+32*6+0], m6
    790    mova   [px+32*7+0], m7
    791    vpbroadcastd    m7, [base+pw_m16384] ; m7 = sentinel for missing pixels
; Top padding: px rows -2/-1 get the two rows above the block, or the sentinel.
    792    test         edgeb, 4 ; HAVE_TOP
    793    jz .no_top
    794    movu            m0, [topq+strideq*0]
    795    movu            m1, [topq+strideq*1]
    796    mova   [px-32*2+0], m0
    797    mova   [px-32*1+0], m1
    798    test         edgeb, 1 ; HAVE_LEFT
    799    jz .top_no_left
    800    movd           xm0, [topq+strideq*0-4] ; 4 bytes = the two pixels left of each top row
    801    movd           xm1, [topq+strideq*1-4]
    802    movd   [px-32*2-4], xm0
    803    movd   [px-32*1-4], xm1
    804    jmp .top_done
    805 .no_top:
    806    mova   [px-32*2+0], m7
    807    mova   [px-32*1+0], m7
    808 .top_no_left:                             ; also the fall-through of .no_top
    809    movd   [px-32*2-4], xm7
    810    movd   [px-32*1-4], xm7
    811 .top_done:
; Bottom padding: px rows 8/9 get the two rows below the block, or the sentinel.
    812    test         edgeb, 8 ; HAVE_BOTTOM
    813    jz .no_bottom
    814    movu            m0, [botq+strideq*0]
    815    movu            m1, [botq+strideq*1]
    816    mova   [px+32*8+0], m0
    817    mova   [px+32*9+0], m1
    818    test         edgeb, 1 ; HAVE_LEFT
    819    jz .bottom_no_left
    820    movd           xm0, [botq+strideq*0-4]
    821    movd           xm1, [botq+strideq*1-4]
    822    movd   [px+32*8-4], xm0
    823    movd   [px+32*9-4], xm1
    824    jmp .bottom_done
    825 .no_bottom:
    826    mova   [px+32*8+0], m7
    827    mova   [px+32*9+0], m7
    828 .bottom_no_left:                          ; also the fall-through of .no_bottom
    829    movd   [px+32*8-4], xm7
    830    movd   [px+32*9-4], xm7
    831 .bottom_done:
; Left padding: 4 bytes (two 16bpc pixels) per row from leftq, or the sentinel.
    832    test         edgeb, 1 ; HAVE_LEFT
    833    jz .no_left
    834    movd           xm0, [leftq+4*0]
    835    movd           xm1, [leftq+4*1]
    836    movd           xm2, [leftq+4*2]
    837    movd           xm3, [leftq+4*3]
    838    movd   [px+32*0-4], xm0
    839    movd   [px+32*1-4], xm1
    840    movd   [px+32*2-4], xm2
    841    movd   [px+32*3-4], xm3
    842    movd           xm0, [leftq+4*4]
    843    movd           xm1, [leftq+4*5]
    844    movd           xm2, [leftq+4*6]
    845    movd           xm3, [leftq+4*7]
    846    movd   [px+32*4-4], xm0
    847    movd   [px+32*5-4], xm1
    848    movd   [px+32*6-4], xm2
    849    movd   [px+32*7-4], xm3
    850    jmp .left_done
    851 .no_left:
    852    REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
    853 .left_done:
; Right padding: sentinel just past the 8 pixels (+16 bytes) of every row -2..9.
    854    test         edgeb, 2 ; HAVE_RIGHT
    855    jnz .padding_done
    856    REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
    857 .padding_done:
    858    CDEF_FILTER      8, 8                 ; filter the padded buffer (macro emits the tail)
    859 
    860 cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax
;------------------------------------------------------------------------------
; 16bpc CDEF direction search, AVX2.
; Loads the 8x8 input block, scales each sample by a bitdepth-dependent
; factor (dir_shift table, indexed by bdmax >> 11) via an unsigned
; multiply-high, then tail-jumps into the shared 8bpc direction-search
; main loop — presumably the scaling brings the samples into the value
; range that loop expects (confirm against the 8bpc implementation).
; Register contract at the jump: rows are packed in mirrored pairs,
; m0 = {row0,row7}, m1 = {row1,row6}, m2 = {row2,row5}, m3 = {row3,row4}.
;------------------------------------------------------------------------------
    861    lea             r6, [dir_shift]       ; r6 = &dir_shift table
    862    shr         bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
    863    vpbroadcastd    m4, [r6+bdmaxq*4]     ; m4 = per-bitdepth scale factor
    864    lea             r6, [strideq*3]
; Rows 0-3 into the low 128-bit lanes ...
    865    mova           xm0, [srcq+strideq*0]
    866    mova           xm1, [srcq+strideq*1]
    867    mova           xm2, [srcq+strideq*2]
    868    mova           xm3, [srcq+r6       ]
    869    lea           srcq, [srcq+strideq*4]
; ... rows 7-4 into the high lanes, in reverse order (row0 pairs with row7 etc.).
    870    vinserti128     m0, [srcq+r6       ], 1
    871    vinserti128     m1, [srcq+strideq*2], 1
    872    vinserti128     m2, [srcq+strideq*1], 1
    873    vinserti128     m3, [srcq+strideq*0], 1
    874    REPX {pmulhuw x, m4}, m0, m1, m2, m3   ; scale: keep high word of unsigned product
    875    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main ; tail-call shared 8bpc loop
    876 
    877 %endif ; ARCH_X86_64