tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef16_sse.asm (33371B)


      1 ; Copyright © 2021, VideoLAN and dav1d authors
      2 ; Copyright © 2021, Two Orioles, LLC
      3 ; Copyright (c) 2017-2021, The rav1e contributors
      4 ; Copyright (c) 2021, Nathan Egge
      5 ; All rights reserved.
      6 ;
      7 ; Redistribution and use in source and binary forms, with or without
      8 ; modification, are permitted provided that the following conditions are met:
      9 ;
     10 ; 1. Redistributions of source code must retain the above copyright notice, this
     11 ;    list of conditions and the following disclaimer.
     12 ;
     13 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     14 ;    this list of conditions and the following disclaimer in the documentation
     15 ;    and/or other materials provided with the distribution.
     16 ;
     17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     18 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     19 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     20 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     21 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     22 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     23 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     24 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     26 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 
     28 %include "config.asm"
     29 %include "ext/x86/x86inc.asm"
     30 
     31 SECTION_RODATA
     32 
     33 %macro DUP8 1-*
; Emit each argument as a word (dw) repeated 8 times, i.e. one full
; 128-bit register's worth of identical 16-bit lanes per argument.
     34    %rep %0
     35        times 8 dw %1
     36        %rotate 1
     37    %endrep
     38 %endmacro
     39 
; Two pairs of primary taps ({4,2} and {3,3}), each tap duplicated across
; all 8 word lanes by DUP8; the pair is selected in CDEF_FILTER below via
; bit 4 of the scaled primary strength (offset 0 or 32 bytes).
     40 pri_taps:  DUP8 4, 2, 3, 3
; Signed byte offsets (row*32 + column-in-bytes) into the padded pixel
; buffer `px`, whose rows are 32 bytes apart.  Indexed as
; dir_table + dir*2; the filter code reads bytes at +4/+5 (primary k0/k1)
; and at +8/+0 and +9/+1 (secondary), hence the 12 entries so lookups
; past the 8 base directions stay in range.  The mirrored sample of each
; pair is obtained by negating the offset.
     41 dir_table: db  1 * 32 + 0,  2 * 32 + 0
     42           db  1 * 32 + 0,  2 * 32 - 2
     43           db -1 * 32 + 2, -2 * 32 + 4
     44           db  0 * 32 + 2, -1 * 32 + 4
     45           db  0 * 32 + 2,  0 * 32 + 4
     46           db  0 * 32 + 2,  1 * 32 + 4
     47           db  1 * 32 + 2,  2 * 32 + 4
     48           db  1 * 32 + 0,  2 * 32 + 2
     49           db  1 * 32 + 0,  2 * 32 + 0
     50           db  1 * 32 + 0,  2 * 32 - 2
     51           db -1 * 32 + 2, -2 * 32 + 4
     52           db  0 * 32 + 2, -1 * 32 + 4
     53 
; dir_shift and pw_128 are not referenced in this part of the file;
; presumably used by the direction-search code further down — confirm.
     54 dir_shift: times 4 dw 0x4000
     55           times 4 dw 0x1000
     56 
     57 pw_128:    times 4 dw 128
     58 pw_2048:   times 8 dw 2048
; pw_m16384 (0xC000) is the fill value for unavailable border pixels and
; also doubles as a pshufb byte-splat mask (see CDEF_FILTER below).
     59 pw_m16384: times 8 dw -16384
     60 
; 8 bpc direction-search entry points shared with the 16 bpc build.
     61 cextern cdef_dir_8bpc_ssse3.main
     62 cextern cdef_dir_8bpc_sse4.main
     63 cextern shufw_6543210x
     64 
     65 SECTION .text
     66 
; Map the x86inc t0/t1 temporaries to GPR numbers that differ per ABI
; (x86-32, Win64, SysV x86-64 respectively).
     67 %if ARCH_X86_32
     68 DECLARE_REG_TMP 5, 3
     69 %elif WIN64
     70 DECLARE_REG_TMP 8, 4
     71 %else
     72 DECLARE_REG_TMP 8, 6
     73 %endif
     74 
     75 %macro CDEF_FILTER 2 ; w, h
; Shared CDEF filter kernel for a %1 x %2 block at 16 bpc.  Instantiated
; at the tail of each cglobal wrapper below, which has already copied the
; block plus its borders into the padded buffer `px` (rows 32 bytes
; apart, unavailable pixels filled with pw_m16384) and loaded
; m7 = pw_m16384.  Stack args: r5m = pri strength, r6m = sec strength,
; r7m = dir, r8m = damping, r10m = bitdepth max (0x3ff for 10 bpc).
; Note: pw_m16384's word 0xC000 has bytes {0x00, 0xC0}, so as a pshufb
; mask it broadcasts a register's low byte to every even byte and zeroes
; the odd bytes — a one-instruction word-splat of the strength value.
; Dispatches to primary-only, secondary-only or combined paths depending
; on which strengths are nonzero.
     76 %if ARCH_X86_64
     77    DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir
     78    mova            m8, [base+pw_2048]
     79 %else
     80    DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
     81    %define         m8  [base+pw_2048]
     82    %define         m9  [rsp+16*1+gprsize]
     83    %define        m10  [rsp+16*2+gprsize]
     84 %endif
     85    movifnidn     prid, r5m
     86    movifnidn     secd, r6m
     87    test          prid, prid
     88    jz .sec_only
     89    movd            m6, r5m
     90 %if ARCH_X86_32
     91    mov       [rsp+24], pridmpd
     92 %endif
     93    bsr        pridmpd, prid
     94    lea           tmpd, [priq*4]
     95    cmp     dword r10m, 0x3ff ; if (bpc == 10)
     96    cmove         prid, tmpd  ;     pri <<= 2
     97    mov           tmpd, r8m   ; damping
     98    mov           dird, r7m
     99    and           prid, 16    ; select pri_taps pair (bit 4 of strength)
    100    pshufb          m6, m7    ; splat
    101    lea           dirq, [base+dir_table+dirq*2]
    102    lea           priq, [base+pri_taps+priq*2]
    103    test          secd, secd
    104    jz .pri_only
    105    mova         [rsp], m6
    106    movd            m6, secd
    107    tzcnt         secd, secd
    108    sub        pridmpd, tmpd
    109    sub           tmpd, secd
    110    pshufb          m6, m7
    111    xor           secd, secd
    112    neg        pridmpd
    113    cmovs      pridmpd, secd
    114 %if ARCH_X86_32
    115    mov  [pri_shift+4], secd
    116    mov  [sec_shift+4], secd
    117 %endif
    118    mov  [pri_shift+0], pridmpq
    119    mov  [sec_shift+0], tmpq
    120    lea           tmpq, [px]
    121 %if WIN64
    122    movaps         r4m, m9
    123    movaps         r6m, m10
    124 %elif ARCH_X86_32
    125    mov        pridmpd, [rsp+24]
    126 %endif
    127 %rep %1*%2/8
    128    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
    129 %endrep
    130 %if WIN64
    131    movaps          m9, r4m
    132    movaps         m10, r6m
    133 %endif
    134    jmp .end
; Primary-only path: single shift value, loop over the .pri body.
    135 .pri_only:
    136    sub           tmpd, pridmpd
    137    cmovs         tmpd, secd
    138 %if ARCH_X86_32
    139    mov        pridmpd, [rsp+24]
    140    mov  [pri_shift+4], secd
    141 %endif
    142    mov  [pri_shift+0], tmpq
    143    lea           tmpq, [px]
    144 %rep %1*%2/8
    145    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
    146 %endrep
    147 .end:
    148    RET
; Secondary-only path: loop over the .sec body.
    149 .sec_only:
    150    mov           tmpd, r8m ; damping
    151    movd            m6, r6m
    152    tzcnt         secd, secd
    153    mov           dird, r7m
    154    pshufb          m6, m7
    155    sub           tmpd, secd
    156    lea           dirq, [base+dir_table+dirq*2]
    157 %if ARCH_X86_32
    158    mov  [sec_shift+4], prid
    159 %endif
    160    mov  [sec_shift+0], tmpq
    161    lea           tmpq, [px]
    162 %rep %1*%2/8
    163    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
    164 %endrep
    165    jmp .end
    166 %if %1 == %2
    167 %if ARCH_X86_64
    168  DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir
    169 %else
    170  DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
    171 %endif
; The per-8-pixel bodies below are only emitted for the square (%1 == %2)
; instantiation; all block sizes call into them via the mangled
; _cdef_filter_%1x%1_ names used in the %rep loops above.  Each call
; processes 8 pixels (two rows for w==4, one row otherwise) and advances
; dstq/tmpq accordingly.
    172 ALIGN function_align
    173 .pri:
; Primary taps only: k0/k1 offsets from dir_table+4/+5, each sampled at
; +off and -off (neg offq) around the current pixel.
    174    movsx         offq, byte [dirq+4]    ; off_k0
    175 %if %1 == 4
    176    movq            m1, [dstq+strideq*0]
    177    movhps          m1, [dstq+strideq*1]
    178    movq            m2, [tmpq+offq+32*0] ; k0p0
    179    movhps          m2, [tmpq+offq+32*1]
    180    neg           offq
    181    movq            m3, [tmpq+offq+32*0] ; k0p1
    182    movhps          m3, [tmpq+offq+32*1]
    183 %else
    184    mova            m1, [dstq]
    185    movu            m2, [tmpq+offq]
    186    neg           offq
    187    movu            m3, [tmpq+offq]
    188 %endif
    189    movsx         offq, byte [dirq+5]    ; off_k1
    190    psubw           m2, m1               ; diff_k0p0
    191    psubw           m3, m1               ; diff_k0p1
    192    pabsw           m4, m2               ; adiff_k0p0
    193    psrlw           m5, m4, [pri_shift+gprsize]
    194    psubusw         m0, m6, m5
    195    pabsw           m5, m3               ; adiff_k0p1
    196    pminsw          m0, m4
    197    psrlw           m4, m5, [pri_shift+gprsize]
    198    psignw          m0, m2               ; constrain(diff_k0p0)
    199    psubusw         m2, m6, m4
    200    pminsw          m2, m5
    201 %if %1 == 4
    202    movq            m4, [tmpq+offq+32*0] ; k1p0
    203    movhps          m4, [tmpq+offq+32*1]
    204    neg           offq
    205    movq            m5, [tmpq+offq+32*0] ; k1p1
    206    movhps          m5, [tmpq+offq+32*1]
    207 %else
    208    movu            m4, [tmpq+offq]
    209    neg           offq
    210    movu            m5, [tmpq+offq]
    211 %endif
    212    psubw           m4, m1               ; diff_k1p0
    213    psubw           m5, m1               ; diff_k1p1
    214    psignw          m2, m3               ; constrain(diff_k0p1)
    215    pabsw           m3, m4               ; adiff_k1p0
    216    paddw           m0, m2               ; constrain(diff_k0)
    217    psrlw           m2, m3, [pri_shift+gprsize]
    218    psubusw         m7, m6, m2
    219    pabsw           m2, m5               ; adiff_k1p1
    220    pminsw          m7, m3
    221    psrlw           m3, m2, [pri_shift+gprsize]
    222    psignw          m7, m4               ; constrain(diff_k1p0)
    223    psubusw         m4, m6, m3
    224    pminsw          m4, m2
    225    psignw          m4, m5               ; constrain(diff_k1p1)
    226    paddw           m7, m4               ; constrain(diff_k1)
    227    pmullw          m0, [priq+16*0]      ; pri_tap_k0
    228    pmullw          m7, [priq+16*1]      ; pri_tap_k1
    229    paddw           m0, m7               ; sum
    230    psraw           m2, m0, 15           ; m2 = (sum < 0) ? -1 : 0
    231    paddw           m0, m2               ; bias so pmulhrsw rounds toward zero
    232    pmulhrsw        m0, m8
    233    paddw           m0, m1
    234 %if %1 == 4
    235    add           tmpq, 32*2
    236    movq   [dstq+strideq*0], m0
    237    movhps [dstq+strideq*1], m0
    238    lea           dstq, [dstq+strideq*2]
    239 %else
    240    add           tmpq, 32
    241    mova        [dstq], m0
    242    add           dstq, strideq
    243 %endif
    244    ret
; Secondary taps only: four offset pairs from dir_table at +8/+0 (k0)
; and +9/+1 (k1).  The k0 contribution is doubled (tap 2) and k1 added
; once (tap 1), see "sec_tap_k0" below.
    245 ALIGN function_align
    246 .sec:
    247    movsx         offq, byte [dirq+8]    ; off1_k0
    248 %if %1 == 4
    249    movq            m1, [dstq+strideq*0]
    250    movhps          m1, [dstq+strideq*1]
    251    movq            m2, [tmpq+offq+32*0] ; k0s0
    252    movhps          m2, [tmpq+offq+32*1]
    253    neg           offq
    254    movq            m3, [tmpq+offq+32*0] ; k0s1
    255    movhps          m3, [tmpq+offq+32*1]
    256 %else
    257    mova            m1, [dstq]
    258    movu            m2, [tmpq+offq]
    259    neg           offq
    260    movu            m3, [tmpq+offq]
    261 %endif
    262    movsx         offq, byte [dirq+0]    ; off2_k0
    263    psubw           m2, m1               ; diff_k0s0
    264    psubw           m3, m1               ; diff_k0s1
    265    pabsw           m4, m2               ; adiff_k0s0
    266    psrlw           m5, m4, [sec_shift+gprsize]
    267    psubusw         m0, m6, m5
    268    pabsw           m5, m3               ; adiff_k0s1
    269    pminsw          m0, m4
    270    psrlw           m4, m5, [sec_shift+gprsize]
    271    psignw          m0, m2               ; constrain(diff_k0s0)
    272    psubusw         m2, m6, m4
    273    pminsw          m2, m5
    274 %if %1 == 4
    275    movq            m4, [tmpq+offq+32*0] ; k0s2
    276    movhps          m4, [tmpq+offq+32*1]
    277    neg           offq
    278    movq            m5, [tmpq+offq+32*0] ; k0s3
    279    movhps          m5, [tmpq+offq+32*1]
    280 %else
    281    movu            m4, [tmpq+offq]
    282    neg           offq
    283    movu            m5, [tmpq+offq]
    284 %endif
    285    movsx         offq, byte [dirq+9]    ; off1_k1
    286    psubw           m4, m1               ; diff_k0s2
    287    psubw           m5, m1               ; diff_k0s3
    288    psignw          m2, m3               ; constrain(diff_k0s1)
    289    pabsw           m3, m4               ; adiff_k0s2
    290    paddw           m0, m2
    291    psrlw           m2, m3, [sec_shift+gprsize]
    292    psubusw         m7, m6, m2
    293    pabsw           m2, m5               ; adiff_k0s3
    294    pminsw          m7, m3
    295    psrlw           m3, m2, [sec_shift+gprsize]
    296    psignw          m7, m4               ; constrain(diff_k0s2)
    297    psubusw         m4, m6, m3
    298    pminsw          m4, m2
    299 %if %1 == 4
    300    movq            m2, [tmpq+offq+32*0] ; k1s0
    301    movhps          m2, [tmpq+offq+32*1]
    302    neg           offq
    303    movq            m3, [tmpq+offq+32*0] ; k1s1
    304    movhps          m3, [tmpq+offq+32*1]
    305 %else
    306    movu            m2, [tmpq+offq]
    307    neg           offq
    308    movu            m3, [tmpq+offq]
    309 %endif
    310    movsx         offq, byte [dirq+1]    ; off2_k1
    311    paddw           m0, m7
    312    psignw          m4, m5               ; constrain(diff_k0s3)
    313    paddw           m0, m4               ; constrain(diff_k0)
    314    psubw           m2, m1               ; diff_k1s0
    315    psubw           m3, m1               ; diff_k1s1
    316    paddw           m0, m0               ; sec_tap_k0
    317    pabsw           m4, m2               ; adiff_k1s0
    318    psrlw           m5, m4, [sec_shift+gprsize]
    319    psubusw         m7, m6, m5
    320    pabsw           m5, m3               ; adiff_k1s1
    321    pminsw          m7, m4
    322    psrlw           m4, m5, [sec_shift+gprsize]
    323    psignw          m7, m2               ; constrain(diff_k1s0)
    324    psubusw         m2, m6, m4
    325    pminsw          m2, m5
    326 %if %1 == 4
    327    movq            m4, [tmpq+offq+32*0] ; k1s2
    328    movhps          m4, [tmpq+offq+32*1]
    329    neg           offq
    330    movq            m5, [tmpq+offq+32*0] ; k1s3
    331    movhps          m5, [tmpq+offq+32*1]
    332 %else
    333    movu            m4, [tmpq+offq]
    334    neg           offq
    335    movu            m5, [tmpq+offq]
    336 %endif
    337    paddw           m0, m7
    338    psubw           m4, m1               ; diff_k1s2
    339    psubw           m5, m1               ; diff_k1s3
    340    psignw          m2, m3               ; constrain(diff_k1s1)
    341    pabsw           m3, m4               ; adiff_k1s2
    342    paddw           m0, m2
    343    psrlw           m2, m3, [sec_shift+gprsize]
    344    psubusw         m7, m6, m2
    345    pabsw           m2, m5               ; adiff_k1s3
    346    pminsw          m7, m3
    347    psrlw           m3, m2, [sec_shift+gprsize]
    348    psignw          m7, m4               ; constrain(diff_k1s2)
    349    psubusw         m4, m6, m3
    350    pminsw          m4, m2
    351    paddw           m0, m7
    352    psignw          m4, m5               ; constrain(diff_k1s3)
    353    paddw           m0, m4               ; sum
    354    psraw           m2, m0, 15
    355    paddw           m0, m2
    356    pmulhrsw        m0, m8
    357    paddw           m0, m1
    358 %if %1 == 4
    359    add           tmpq, 32*2
    360    movq   [dstq+strideq*0], m0
    361    movhps [dstq+strideq*1], m0
    362    lea           dstq, [dstq+strideq*2]
    363 %else
    364    add           tmpq, 32
    365    mova        [dstq], m0
    366    add           dstq, strideq
    367 %endif
    368    ret
; Combined primary + secondary pass.  In addition to summing both tap
; sets, it tracks the running max (m9) and min-of-abs (m10) of all
; sampled pixels so the filtered result can be clamped to the sampled
; range at the end; the primary strength splat is reloaded from [rsp]
; where the dispatch code spilled it.
    369 ALIGN function_align
    370 .pri_sec:
    371    movsx         offq, byte [dirq+8]    ; off2_k0
    372 %if %1 == 4
    373    movq            m1, [dstq+strideq*0]
    374    movhps          m1, [dstq+strideq*1]
    375    movq            m2, [tmpq+offq+32*0] ; k0s0
    376    movhps          m2, [tmpq+offq+32*1]
    377    neg           offq
    378    movq            m3, [tmpq+offq+32*0] ; k0s1
    379    movhps          m3, [tmpq+offq+32*1]
    380 %else
    381    mova            m1, [dstq]
    382    movu            m2, [tmpq+offq]
    383    neg           offq
    384    movu            m3, [tmpq+offq]
    385 %endif
    386    movsx         offq, byte [dirq+0]    ; off3_k0
    387    pabsw           m4, m2
    388 %if ARCH_X86_64
    389    pabsw          m10, m3
    390    pmaxsw          m9, m2, m3
    391    pminsw         m10, m4
    392 %else
    393    pabsw           m7, m3
    394    pmaxsw          m5, m2, m3
    395    pminsw          m4, m7
    396    mova            m9, m5
    397    mova           m10, m4
    398 %endif
    399    psubw           m2, m1               ; diff_k0s0
    400    psubw           m3, m1               ; diff_k0s1
    401    pabsw           m4, m2               ; adiff_k0s0
    402    psrlw           m5, m4, [sec_shift+gprsize]
    403    psubusw         m0, m6, m5
    404    pabsw           m5, m3               ; adiff_k0s1
    405    pminsw          m0, m4
    406    psrlw           m4, m5, [sec_shift+gprsize]
    407    psignw          m0, m2               ; constrain(diff_k0s0)
    408    psubusw         m2, m6, m4
    409    pminsw          m2, m5
    410 %if %1 == 4
    411    movq            m4, [tmpq+offq+32*0] ; k0s2
    412    movhps          m4, [tmpq+offq+32*1]
    413    neg           offq
    414    movq            m5, [tmpq+offq+32*0] ; k0s3
    415    movhps          m5, [tmpq+offq+32*1]
    416 %else
    417    movu            m4, [tmpq+offq]
    418    neg           offq
    419    movu            m5, [tmpq+offq]
    420 %endif
    421    movsx         offq, byte [dirq+9]    ; off2_k1
    422    pabsw           m7, m4
    423    psignw          m2, m3
    424    pabsw           m3, m5               ; constrain(diff_k0s1)
    425 %if ARCH_X86_64
    426    pmaxsw          m9, m4
    427    pminsw         m10, m7
    428    pmaxsw          m9, m5
    429    pminsw         m10, m3
    430 %else
    431    pminsw          m7, m10
    432    pminsw          m7, m3
    433    pmaxsw          m3, m9, m4
    434    pmaxsw          m3, m5
    435    mova           m10, m7
    436    mova            m9, m3
    437 %endif
    438    psubw           m4, m1               ; diff_k0s2
    439    psubw           m5, m1               ; diff_k0s3
    440    paddw           m0, m2
    441    pabsw           m3, m4               ; adiff_k0s2
    442    psrlw           m2, m3, [sec_shift+gprsize]
    443    psubusw         m7, m6, m2
    444    pabsw           m2, m5               ; adiff_k0s3
    445    pminsw          m7, m3
    446    psrlw           m3, m2, [sec_shift+gprsize]
    447    psignw          m7, m4               ; constrain(diff_k0s2)
    448    psubusw         m4, m6, m3
    449    pminsw          m4, m2
    450 %if %1 == 4
    451    movq            m2, [tmpq+offq+32*0] ; k1s0
    452    movhps          m2, [tmpq+offq+32*1]
    453    neg           offq
    454    movq            m3, [tmpq+offq+32*0] ; k1s1
    455    movhps          m3, [tmpq+offq+32*1]
    456 %else
    457    movu            m2, [tmpq+offq]
    458    neg           offq
    459    movu            m3, [tmpq+offq]
    460 %endif
    461    movsx         offq, byte [dirq+1]    ; off3_k1
    462    paddw           m0, m7
    463    pabsw           m7, m2
    464    psignw          m4, m5               ; constrain(diff_k0s3)
    465    pabsw           m5, m3
    466 %if ARCH_X86_64
    467    pmaxsw          m9, m2
    468    pminsw         m10, m7
    469    pmaxsw          m9, m3
    470    pminsw         m10, m5
    471 %else
    472    pminsw          m7, m10
    473    pminsw          m7, m5
    474    pmaxsw          m5, m9, m2
    475    pmaxsw          m5, m3
    476    mova           m10, m7
    477    mova            m9, m5
    478 %endif
    479    paddw           m0, m4               ; constrain(diff_k0)
    480    psubw           m2, m1               ; diff_k1s0
    481    psubw           m3, m1               ; diff_k1s1
    482    paddw           m0, m0               ; sec_tap_k0
    483    pabsw           m4, m2               ; adiff_k1s0
    484    psrlw           m5, m4, [sec_shift+gprsize]
    485    psubusw         m7, m6, m5
    486    pabsw           m5, m3               ; adiff_k1s1
    487    pminsw          m7, m4
    488    psrlw           m4, m5, [sec_shift+gprsize]
    489    psignw          m7, m2               ; constrain(diff_k1s0)
    490    psubusw         m2, m6, m4
    491    pminsw          m2, m5
    492 %if %1 == 4
    493    movq            m4, [tmpq+offq+32*0] ; k1s2
    494    movhps          m4, [tmpq+offq+32*1]
    495    neg           offq
    496    movq            m5, [tmpq+offq+32*0] ; k1s3
    497    movhps          m5, [tmpq+offq+32*1]
    498 %else
    499    movu            m4, [tmpq+offq]
    500    neg           offq
    501    movu            m5, [tmpq+offq]
    502 %endif
    503    movsx         offq, byte [dirq+4]    ; off1_k0
    504    paddw           m0, m7
    505    pabsw           m7, m4
    506    psignw          m2, m3               ; constrain(diff_k1s1)
    507    pabsw           m3, m5
    508 %if ARCH_X86_64
    509    pmaxsw          m9, m4
    510    pminsw         m10, m7
    511    pmaxsw          m9, m5
    512    pminsw         m10, m3
    513 %else
    514    pminsw          m7, m10
    515    pminsw          m7, m3
    516    pmaxsw          m3, m9, m4
    517    pmaxsw          m3, m5
    518    mova           m10, m7
    519    mova            m9, m3
    520 %endif
    521    psubw           m4, m1               ; diff_k1s2
    522    psubw           m5, m1               ; diff_k1s3
    523    pabsw           m3, m4               ; adiff_k1s2
    524    paddw           m0, m2
    525    psrlw           m2, m3, [sec_shift+gprsize]
    526    psubusw         m7, m6, m2
    527    pabsw           m2, m5               ; adiff_k1s3
    528    pminsw          m7, m3
    529    psrlw           m3, m2, [sec_shift+gprsize]
    530    psignw          m7, m4               ; constrain(diff_k1s2)
    531    psubusw         m4, m6, m3
    532    pminsw          m4, m2
    533    paddw           m0, m7
    534 %if %1 == 4
    535    movq            m2, [tmpq+offq+32*0] ; k0p0
    536    movhps          m2, [tmpq+offq+32*1]
    537    neg           offq
    538    movq            m3, [tmpq+offq+32*0] ; k0p1
    539    movhps          m3, [tmpq+offq+32*1]
    540 %else
    541    movu            m2, [tmpq+offq]
    542    neg           offq
    543    movu            m3, [tmpq+offq]
    544 %endif
    545    movsx         offq, byte [dirq+5]    ; off1_k1
    546    pabsw           m7, m2
    547    psignw          m4, m5               ; constrain(diff_k1s3)
    548    pabsw           m5, m3
    549 %if ARCH_X86_64
    550    pmaxsw          m9, m2
    551    pminsw         m10, m7
    552    pmaxsw          m9, m3
    553    pminsw         m10, m5
    554 %else
    555    pminsw          m7, m10
    556    pminsw          m7, m5
    557    pmaxsw          m5, m9, m2
    558    pmaxsw          m5, m3
    559    mova           m10, m7
    560    mova            m9, m5
    561 %endif
    562    psubw           m2, m1               ; diff_k0p0
    563    psubw           m3, m1               ; diff_k0p1
    564    paddw           m0, m4
    565    pabsw           m4, m2               ; adiff_k0p0
    566    psrlw           m5, m4, [pri_shift+gprsize]
    567    psubusw         m7, [rsp+gprsize], m5
    568    pabsw           m5, m3               ; adiff_k0p1
    569    pminsw          m7, m4
    570    psrlw           m4, m5, [pri_shift+gprsize]
    571    psignw          m7, m2               ; constrain(diff_k0p0)
    572    psubusw         m2, [rsp+gprsize], m4
    573    pminsw          m2, m5
    574 %if %1 == 4
    575    movq            m4, [tmpq+offq+32*0] ; k1p0
    576    movhps          m4, [tmpq+offq+32*1]
    577    neg           offq
    578    movq            m5, [tmpq+offq+32*0] ; k1p1
    579    movhps          m5, [tmpq+offq+32*1]
    580 %else
    581    movu            m4, [tmpq+offq]
    582    neg           offq
    583    movu            m5, [tmpq+offq]
    584 %endif
    585    psignw          m2, m3               ; constrain(diff_k0p1)
    586    pabsw           m3, m4
    587    paddw           m7, m2               ; constrain(diff_k0)
    588    pabsw           m2, m5
    589 %if ARCH_X86_64
    590    pmaxsw          m9, m4
    591    pminsw         m10, m3
    592    pmaxsw          m9, m5
    593    pminsw         m10, m2
    594 %else
    595    pminsw          m3, m10
    596    pminsw          m3, m2
    597    pmaxsw          m2, m9, m4
    598    pmaxsw          m2, m5
    599    mova           m10, m3
    600    mova            m9, m2
    601 %endif
    602    psubw           m4, m1               ; diff_k1p0
    603    psubw           m5, m1               ; diff_k1p1
    604    pabsw           m3, m4               ; adiff_k1p0
    605    pmullw          m7, [priq+16*0]      ; pri_tap_k0
    606    paddw           m0, m7
    607    psrlw           m2, m3, [pri_shift+gprsize]
    608    psubusw         m7, [rsp+16*0+gprsize], m2
    609    pabsw           m2, m5               ; adiff_k1p1
    610    pminsw          m7, m3
    611    psrlw           m3, m2, [pri_shift+gprsize]
    612    psignw          m7, m4               ; constrain(diff_k1p0)
    613    psubusw         m4, [rsp+16*0+gprsize], m3
    614    pminsw          m4, m2
    615    psignw          m4, m5               ; constrain(diff_k1p1)
    616    paddw           m7, m4               ; constrain(diff_k1)
    617    pmullw          m7, [priq+16*1]      ; pri_tap_k1
    618    paddw           m0, m7               ; sum
    619    psraw           m2, m0, 15
    620    paddw           m0, m2
    621    pmulhrsw        m0, m8
    622    paddw           m0, m1
; Clamp the filtered pixels to the [min, max] range of everything
; sampled (including the source pixel itself).
    623 %if ARCH_X86_64
    624    pmaxsw          m9, m1
    625    pminsw          m0, m9
    626 %else
    627    pmaxsw          m2, m9, m1
    628    pminsw          m0, m2
    629 %endif
    630    pminsw          m1, m10
    631    pmaxsw          m0, m1
    632 %if %1 == 4
    633    add           tmpq, 32*2
    634    movq   [dstq+strideq*0], m0
    635    movhps [dstq+strideq*1], m0
    636    lea           dstq, [dstq+strideq*2]
    637 %else
    638    add           tmpq, 32
    639    mova        [dstq], m0
    640    add           dstq, strideq
    641 %endif
    642    ret
    643 %endif
    644 %endmacro
    645 
    646 INIT_XMM ssse3
; cdef_filter_4x4_16bpc(dst, stride, left, top, bot, pri, sec, dir,
; damping, edges, ...) — arguments past the named ones are read from the
; stack (r5m..r10m) inside CDEF_FILTER.  Builds a padded copy of the 4x4
; block in `px` (rows 32 bytes apart: 2 border rows above and below,
; plus left/right border columns), filling unavailable edges with
; pw_m16384, then runs the shared CDEF_FILTER kernel.
    647 %if ARCH_X86_64
    648 cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \
    649                                               pri, sec, edge
    650    %define         px  rsp+32*4
    651 %else
    652 cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
    653    %define       botq  topq
    654    %define         px  rsp+32*5
    655 %endif
    656    %define       base  t0-dir_table
    657    %define  pri_shift  px-16*6
    658    %define  sec_shift  px-16*5
    659    mov          edged, r9m
    660    LEA             t0, dir_table
; Copy the 4 source rows into the padded buffer.
    661    movu            m0, [dstq+strideq*0]
    662    movu            m1, [dstq+strideq*1]
    663    lea             t1, [dstq+strideq*2]
    664    movu            m2, [t1  +strideq*0]
    665    movu            m3, [t1  +strideq*1]
    666    movddup         m7, [base+pw_m16384]
    667    mova   [px+32*0+0], m0
    668    mova   [px+32*1+0], m1
    669    mova   [px+32*2+0], m2
    670    mova   [px+32*3+0], m3
    671    test         edgeb, 4 ; HAVE_TOP
    672    jz .no_top
    673    movifnidn     topq, topmp
    674    movu            m0, [topq+strideq*0]
    675    movu            m1, [topq+strideq*1]
    676    mova   [px-32*2+0], m0
    677    mova   [px-32*1+0], m1
    678    test         edgeb, 1 ; HAVE_LEFT
    679    jz .top_no_left
    680    movd            m0, [topq+strideq*0-4]
    681    movd            m1, [topq+strideq*1-4]
    682    movd   [px-32*2-4], m0
    683    movd   [px-32*1-4], m1
    684    jmp .top_done
    685 .no_top:
    686    mova   [px-32*2+0], m7
    687    mova   [px-32*1+0], m7
    688 .top_no_left:
    689    movd   [px-32*2-4], m7
    690    movd   [px-32*1-4], m7
    691 .top_done:
    692    test         edgeb, 8 ; HAVE_BOTTOM
    693    jz .no_bottom
    694    movifnidn     botq, r4mp
    695    movu            m0, [botq+strideq*0]
    696    movu            m1, [botq+strideq*1]
    697    mova   [px+32*4+0], m0
    698    mova   [px+32*5+0], m1
    699    test         edgeb, 1 ; HAVE_LEFT
    700    jz .bottom_no_left
    701    movd            m0, [botq+strideq*0-4]
    702    movd            m1, [botq+strideq*1-4]
    703    movd   [px+32*4-4], m0
    704    movd   [px+32*5-4], m1
    705    jmp .bottom_done
    706 .no_bottom:
    707    mova   [px+32*4+0], m7
    708    mova   [px+32*5+0], m7
    709 .bottom_no_left:
    710    movd   [px+32*4-4], m7
    711    movd   [px+32*5-4], m7
    712 .bottom_done:
    713    test         edgeb, 1 ; HAVE_LEFT
    714    jz .no_left
    715    movifnidn    leftq, r2mp
    716    movd            m0, [leftq+4*0]
    717    movd            m1, [leftq+4*1]
    718    movd            m2, [leftq+4*2]
    719    movd            m3, [leftq+4*3]
    720    movd   [px+32*0-4], m0
    721    movd   [px+32*1-4], m1
    722    movd   [px+32*2-4], m2
    723    movd   [px+32*3-4], m3
    724    jmp .left_done
    725 .no_left:
    726    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
    727 .left_done:
    728    test         edgeb, 2 ; HAVE_RIGHT
    729    jnz .padding_done
    730    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
    731 .padding_done:
    732    CDEF_FILTER      4, 4
    733 
; cdef_filter_4x8_16bpc: same scheme as the 4x4 variant above, but copies
; 8 source rows into `px` (bottom border at rows +8/+9).  Reuses the
; px/base/pri_shift/sec_shift %defines set up for 4x4 earlier in the
; file (NASM %defines persist across cglobal boundaries).
    734 %if ARCH_X86_64
    735 cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
    736                                               pri, sec, edge
    737 %else
    738 cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
    739 %endif
    740    mov          edged, r9m
    741    LEA             t0, dir_table
    742    movu            m0, [dstq+strideq*0]
    743    movu            m1, [dstq+strideq*1]
    744    lea             t1, [dstq+strideq*2]
    745    movu            m2, [t1  +strideq*0]
    746    movu            m3, [t1  +strideq*1]
    747    lea             t1, [t1  +strideq*2]
    748    movu            m4, [t1  +strideq*0]
    749    movu            m5, [t1  +strideq*1]
    750    lea             t1, [t1  +strideq*2]
    751    movu            m6, [t1  +strideq*0]
    752    movu            m7, [t1  +strideq*1]
    753    mova   [px+32*0+0], m0
    754    mova   [px+32*1+0], m1
    755    mova   [px+32*2+0], m2
    756    mova   [px+32*3+0], m3
    757    mova   [px+32*4+0], m4
    758    mova   [px+32*5+0], m5
    759    mova   [px+32*6+0], m6
    760    mova   [px+32*7+0], m7
    761    movddup         m7, [base+pw_m16384]
    762    test         edgeb, 4 ; HAVE_TOP
    763    jz .no_top
    764    movifnidn     topq, topmp
    765    movu            m0, [topq+strideq*0]
    766    movu            m1, [topq+strideq*1]
    767    mova   [px-32*2+0], m0
    768    mova   [px-32*1+0], m1
    769    test         edgeb, 1 ; HAVE_LEFT
    770    jz .top_no_left
    771    movd            m0, [topq+strideq*0-4]
    772    movd            m1, [topq+strideq*1-4]
    773    movd   [px-32*2-4], m0
    774    movd   [px-32*1-4], m1
    775    jmp .top_done
    776 .no_top:
    777    mova   [px-32*2+0], m7
    778    mova   [px-32*1+0], m7
    779 .top_no_left:
    780    movd   [px-32*2-4], m7
    781    movd   [px-32*1-4], m7
    782 .top_done:
    783    test         edgeb, 8 ; HAVE_BOTTOM
    784    jz .no_bottom
    785    movifnidn     botq, r4mp
    786    movu            m0, [botq+strideq*0]
    787    movu            m1, [botq+strideq*1]
    788    mova   [px+32*8+0], m0
    789    mova   [px+32*9+0], m1
    790    test         edgeb, 1 ; HAVE_LEFT
    791    jz .bottom_no_left
    792    movd            m0, [botq+strideq*0-4]
    793    movd            m1, [botq+strideq*1-4]
    794    movd   [px+32*8-4], m0
    795    movd   [px+32*9-4], m1
    796    jmp .bottom_done
    797 .no_bottom:
    798    mova   [px+32*8+0], m7
    799    mova   [px+32*9+0], m7
    800 .bottom_no_left:
    801    movd   [px+32*8-4], m7
    802    movd   [px+32*9-4], m7
    803 .bottom_done:
    804    test         edgeb, 1 ; HAVE_LEFT
    805    jz .no_left
    806    movifnidn    leftq, r2mp
    807    movd            m0, [leftq+4*0]
    808    movd            m1, [leftq+4*1]
    809    movd            m2, [leftq+4*2]
    810    movd            m3, [leftq+4*3]
    811    movd   [px+32*0-4], m0
    812    movd   [px+32*1-4], m1
    813    movd   [px+32*2-4], m2
    814    movd   [px+32*3-4], m3
    815    movd            m0, [leftq+4*4]
    816    movd            m1, [leftq+4*5]
    817    movd            m2, [leftq+4*6]
    818    movd            m3, [leftq+4*7]
    819    movd   [px+32*4-4], m0
    820    movd   [px+32*5-4], m1
    821    movd   [px+32*6-4], m2
    822    movd   [px+32*7-4], m3
    823    jmp .left_done
    824 .no_left:
    825    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    826 .left_done:
    827    test         edgeb, 2 ; HAVE_RIGHT
    828    jnz .padding_done
    829    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
    830 .padding_done:
    831    CDEF_FILTER      4, 8
    832 
    833 %if ARCH_X86_64
    834 cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
    835                                               pri, sec, edge
    836 %else
    837 cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
    838 %endif
    839    mov          edged, r9m
    840    LEA             t0, dir_table
    841    mova            m0, [dstq+strideq*0+ 0]
    842    movd            m1, [dstq+strideq*0+16]
    843    mova            m2, [dstq+strideq*1+ 0]
    844    movd            m3, [dstq+strideq*1+16]
    845    lea             t1, [dstq+strideq*2]
    846    mova            m4, [t1  +strideq*0+ 0]
    847    movd            m5, [t1  +strideq*0+16]
    848    mova            m6, [t1  +strideq*1+ 0]
    849    movd            m7, [t1  +strideq*1+16]
    850    lea             t1, [t1  +strideq*2]
    851    mova  [px+32*0+ 0], m0
    852    movd  [px+32*0+16], m1
    853    mova  [px+32*1+ 0], m2
    854    movd  [px+32*1+16], m3
    855    mova  [px+32*2+ 0], m4
    856    movd  [px+32*2+16], m5
    857    mova  [px+32*3+ 0], m6
    858    movd  [px+32*3+16], m7
    859    mova            m0, [t1  +strideq*0+ 0]
    860    movd            m1, [t1  +strideq*0+16]
    861    mova            m2, [t1  +strideq*1+ 0]
    862    movd            m3, [t1  +strideq*1+16]
    863    lea             t1, [t1  +strideq*2]
    864    mova            m4, [t1  +strideq*0+ 0]
    865    movd            m5, [t1  +strideq*0+16]
    866    mova            m6, [t1  +strideq*1+ 0]
    867    movd            m7, [t1  +strideq*1+16]
    868    mova  [px+32*4+ 0], m0
    869    movd  [px+32*4+16], m1
    870    mova  [px+32*5+ 0], m2
    871    movd  [px+32*5+16], m3
    872    mova  [px+32*6+ 0], m4
    873    movd  [px+32*6+16], m5
    874    mova  [px+32*7+ 0], m6
    875    movd  [px+32*7+16], m7
    876    movddup         m7, [base+pw_m16384]
    877    test         edgeb, 4 ; HAVE_TOP
    878    jz .no_top
    879    movifnidn     topq, topmp
    880    mova            m0, [topq+strideq*0+ 0]
    881    mova            m1, [topq+strideq*0+16]
    882    mova            m2, [topq+strideq*1+ 0]
    883    mova            m3, [topq+strideq*1+16]
    884    mova  [px-32*2+ 0], m0
    885    movd  [px-32*2+16], m1
    886    mova  [px-32*1+ 0], m2
    887    movd  [px-32*1+16], m3
    888    test         edgeb, 1 ; HAVE_LEFT
    889    jz .top_no_left
    890    movd            m0, [topq+strideq*0-4]
    891    movd            m1, [topq+strideq*1-4]
    892    movd   [px-32*2-4], m0
    893    movd   [px-32*1-4], m1
    894    jmp .top_done
    895 .no_top:
    896    mova  [px-32*2+ 0], m7
    897    movd  [px-32*2+16], m7
    898    mova  [px-32*1+ 0], m7
    899    movd  [px-32*1+16], m7
    900 .top_no_left:
    901    movd  [px-32*2- 4], m7
    902    movd  [px-32*1- 4], m7
    903 .top_done:
    904    test         edgeb, 8 ; HAVE_BOTTOM
    905    jz .no_bottom
    906    movifnidn     botq, r4mp
    907    mova            m0, [botq+strideq*0+ 0]
    908    movd            m1, [botq+strideq*0+16]
    909    mova            m2, [botq+strideq*1+ 0]
    910    movd            m3, [botq+strideq*1+16]
    911    mova  [px+32*8+ 0], m0
    912    movd  [px+32*8+16], m1
    913    mova  [px+32*9+ 0], m2
    914    movd  [px+32*9+16], m3
    915    test         edgeb, 1 ; HAVE_LEFT
    916    jz .bottom_no_left
    917    movd            m0, [botq+strideq*0-4]
    918    movd            m1, [botq+strideq*1-4]
    919    movd  [px+32*8- 4], m0
    920    movd  [px+32*9- 4], m1
    921    jmp .bottom_done
    922 .no_bottom:
    923    mova  [px+32*8+ 0], m7
    924    movd  [px+32*8+16], m7
    925    mova  [px+32*9+ 0], m7
    926    movd  [px+32*9+16], m7
    927 .bottom_no_left:
    928    movd  [px+32*8- 4], m7
    929    movd  [px+32*9- 4], m7
    930 .bottom_done:
    931    test         edgeb, 1 ; HAVE_LEFT
    932    jz .no_left
    933    movifnidn    leftq, r2mp
    934    movd            m0, [leftq+4*0]
    935    movd            m1, [leftq+4*1]
    936    movd            m2, [leftq+4*2]
    937    movd            m3, [leftq+4*3]
    938    movd  [px+32*0- 4], m0
    939    movd  [px+32*1- 4], m1
    940    movd  [px+32*2- 4], m2
    941    movd  [px+32*3- 4], m3
    942    movd            m0, [leftq+4*4]
    943    movd            m1, [leftq+4*5]
    944    movd            m2, [leftq+4*6]
    945    movd            m3, [leftq+4*7]
    946    movd  [px+32*4- 4], m0
    947    movd  [px+32*5- 4], m1
    948    movd  [px+32*6- 4], m2
    949    movd  [px+32*7- 4], m3
    950    jmp .left_done
    951 .no_left:
    952    REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    953 .left_done:
    954    test         edgeb, 2 ; HAVE_RIGHT
    955    jnz .padding_done
    956    REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
    957 .padding_done:
    958    CDEF_FILTER      8, 8
    959 
; cdef_dir_16bpc: direction search for a 16bpc 8x8 block.
; The 16bpc entry points reuse the 8bpc implementation: pixels are scaled
; down toward 8-bit range with pmulhuw against a bitdepth-dependent factor
; from dir_shift (indexed by bdmax>>11: 0 for 10bpc, 1 for 12bpc), per-row
; sums are precomputed with psadbw, and control then tail-jumps into the
; 8bpc function's .main label, which does the shared direction computation.
%macro CDEF_DIR 0
%if ARCH_X86_64
cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
    lea             r6, [dir_shift]
    shr         bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
    movddup         m7, [r6+bdmaxq*8]   ; downscale factor for pmulhuw
    lea             r6, [strideq*3]
    ; load and downscale all 8 source rows (row 7 scaled in place in m7)
    mova            m0, [srcq+strideq*0]
    mova            m1, [srcq+strideq*1]
    mova            m2, [srcq+strideq*2]
    mova            m3, [srcq+r6       ]
    lea           srcq, [srcq+strideq*4]
    mova            m4, [srcq+strideq*0]
    mova            m5, [srcq+strideq*1]
    mova            m6, [srcq+strideq*2]
    REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhuw         m7, [srcq+r6       ]
    pxor            m8, m8
    ; pack row pairs to bytes, then psadbw against zero sums each 8-byte
    ; half, giving one sum per source row (two per register)
    packuswb        m9, m0, m1
    packuswb       m10, m2, m3
    packuswb       m11, m4, m5
    packuswb       m12, m6, m7
    REPX {psadbw x, m8}, m9, m10, m11, m12
    packssdw        m9, m10
    packssdw       m11, m12
    packssdw        m9, m11             ; m9 = the 8 row sums as words
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
%else
; x86-32 path: only 8 XMM registers, so the rows biased into signed range
; are spilled to the stack for the 8bpc .main code to pick up.
cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax
    mov         bdmaxd, bdmaxm
    LEA             r2, dir_shift
    shr         bdmaxd, 11              ; 0 for 10bpc, 1 for 12bpc
    movddup         m7, [r2+bdmaxq*8]   ; downscale factor for pmulhuw
    lea             r3, [strideq*3]
    pmulhuw         m3, m7, [srcq+strideq*0]
    pmulhuw         m4, m7, [srcq+strideq*1]
    pmulhuw         m5, m7, [srcq+strideq*2]
    pmulhuw         m6, m7, [srcq+r3       ]
    movddup         m1, [r2-dir_shift+pw_128]
    lea           srcq, [srcq+strideq*4]
    pxor            m0, m0
    packuswb        m2, m3, m4
    psubw           m3, m1              ; bias rows by -128 into signed range
    psubw           m4, m1
    mova    [esp+0x00], m3
    mova    [esp+0x10], m4
    packuswb        m3, m5, m6
    psadbw          m2, m0              ; row sums, same trick as 64-bit path
    psadbw          m3, m0
    psubw           m5, m1
    psubw           m6, m1
    packssdw        m2, m3
    ; NOTE(review): the 0x20/0x50 spill offsets must match what the 8bpc
    ; .main expects on the stack — confirm against the 8bpc implementation
    mova    [esp+0x20], m5
    mova    [esp+0x50], m6
    ; second half: rows 4-7 (row 7 scaled in place in m7)
    pmulhuw         m4, m7, [srcq+strideq*0]
    pmulhuw         m5, m7, [srcq+strideq*1]
    pmulhuw         m6, m7, [srcq+strideq*2]
    pmulhuw         m7,     [srcq+r3       ]
    packuswb        m3, m4, m5
    packuswb        m1, m6, m7
    psadbw          m3, m0
    psadbw          m1, m0
    packssdw        m3, m1
    movddup         m1, [r2-dir_shift+pw_128] ; reload bias (m1 was clobbered)
    LEA             r2, shufw_6543210x
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
%endif
%endmacro
   1028 
; Instantiate the direction-search entry point for both SIMD levels.
INIT_XMM ssse3
CDEF_DIR

INIT_XMM sse4
CDEF_DIR