tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef_sse.asm (40722B)


      1 ; Copyright © 2018, VideoLAN and dav1d authors
      2 ; Copyright © 2018, Two Orioles, LLC
      3 ; Copyright © 2019, VideoLabs
      4 ; All rights reserved.
      5 ;
      6 ; Redistribution and use in source and binary forms, with or without
      7 ; modification, are permitted provided that the following conditions are met:
      8 ;
      9 ; 1. Redistributions of source code must retain the above copyright notice, this
     10 ;    list of conditions and the following disclaimer.
     11 ;
     12 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     13 ;    this list of conditions and the following disclaimer in the documentation
     14 ;    and/or other materials provided with the distribution.
     15 ;
     16 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 
     27 %include "config.asm"
     28 %include "ext/x86/x86inc.asm"
     29 
     30 SECTION_RODATA 16
     31 
        ; DUP8 arg[, arg, ...]: emit each byte argument repeated 8 times.
        ; Used below to build rows of broadcast constants in tap_table.
     32 %macro DUP8 1-*
     33    %rep %0
     34        times 8 db %1              ; one 8-byte run of the current argument
     35        %rotate 1
     36    %endrep
     37 %endmacro
     38 
        ; Reciprocal tables for the cdef_dir cost division:
        ; div_table[i] = 840/(i+1), flattening out at 105 (= 840/8).
        ; The sse4 variant stores dwords (used with pmulld); the ssse3
        ; variant stores each value in BOTH 16-bit halves of a dword, which
        ; the MULLD emulation below relies on.
     39 div_table_sse4:  dd 840, 420, 280, 210, 168, 140, 120, 105
     40                 dd 420, 210, 140, 105, 105, 105, 105, 105
     41 div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
     42                 dw 168, 168, 140, 140, 120, 120, 105, 105
     43                 dw 420, 420, 210, 210, 140, 140, 105, 105
     44                 dw 105, 105, 105, 105, 105, 105, 105, 105
        ; word shuffle: reverse words 6..0, keep word 7 in place
     45 const shufw_6543210x, \
     46            db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
        ; byte shuffle: interleave bytes 0-7 with bytes 8-15
     47 shufb_lohi: db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
     48 pw_8:      times 8 dw 8
     49 pw_128:    times 8 dw 128
     50 pw_256:    times 8 dw 256
     51 pw_2048:   times 8 dw 2048
     52 pw_0x7FFF: times 8 dw 0x7FFF
     53 pw_0x8000: times 8 dw 0x8000
     54 tap_table: ; masks for 8-bit shift emulation
     55           DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
     56           ; weights
     57           DUP8 4, 2, 3, 3, 2, 1
     58           ; taps indices
        ; Each pair below encodes two neighbour offsets into the px buffer,
        ; as row*16 + column (the buffer holds 16 words per 32-byte row).
     59           db -1 * 16 + 1, -2 * 16 + 2
     60           db  0 * 16 + 1, -1 * 16 + 2
     61           db  0 * 16 + 1,  0 * 16 + 2
     62           db  0 * 16 + 1,  1 * 16 + 2
     63           db  1 * 16 + 1,  2 * 16 + 2
     64           db  1 * 16 + 0,  2 * 16 + 1
     65           db  1 * 16 + 0,  2 * 16 + 0
     66           db  1 * 16 + 0,  2 * 16 - 1
     67           ; the last 6 are repeats of the first 6 so we don't need to & 7
     68           db -1 * 16 + 1, -2 * 16 + 2
     69           db  0 * 16 + 1, -1 * 16 + 2
     70           db  0 * 16 + 1,  0 * 16 + 2
     71           db  0 * 16 + 1,  1 * 16 + 2
     72           db  1 * 16 + 1,  2 * 16 + 2
     73           db  1 * 16 + 0,  2 * 16 + 1
     74 
     75 SECTION .text
     76 
        ; movif32 dst, src: mov that is only emitted on x86-32 builds
        ; (expands to nothing on x86-64, where enough registers exist).
     77 %macro movif32 2
     78 %if ARCH_X86_32
     79    mov     %1, %2
     80 %endif
     81 %endmacro
     82 
        ; PMOVZXBW dst, src[, half]: zero-extend packed bytes to words.
        ; Uses the SSE4.1 instruction when available; otherwise loads
        ; 8 bytes (movq) or, with %3 == 1, only 4 bytes (movd) and widens
        ; with punpcklbw.  The fallback requires m7 == 0 (callers clear it).
     83 %macro PMOVZXBW 2-3 0 ; %3 = half
     84 %if cpuflag(sse4) && %3 == 0
     85    pmovzxbw        %1, %2
     86 %else
     87  %if %3 == 1
     88    movd            %1, %2
     89  %else
     90    movq            %1, %2
     91  %endif
     92    punpcklbw       %1, m7         ; zero-extend; m7 must hold zero
     93 %endif
     94 %endmacro
     95 
        ; PSHUFB_0 reg, zero_mask: broadcast byte 0 of %1 to all 16 lanes.
        ; With SSSE3 this is pshufb by %2 (callers pass an all-zero register);
        ; the pre-SSSE3 fallback broadcasts byte 0 directly and ignores %2.
     96 %macro PSHUFB_0 2
     97 %if cpuflag(ssse3)
     98    pshufb          %1, %2
     99 %else
    100    punpcklbw       %1, %1         ; byte 0 -> low word
    101    pshuflw         %1, %1, q0000  ; low word -> low 4 words
    102    punpcklqdq      %1, %1         ; low qword -> full register
    103 %endif
    104 %endmacro
    105 
        ; MOVDDUP dst, src: load a qword and duplicate it into both halves.
        ; SSE3 movddup when available, movq + punpcklqdq otherwise.
    106 %macro MOVDDUP 2
    107 %if cpuflag(ssse3)
    108    movddup         %1, %2
    109 %else
    110    movq            %1, %2
    111    punpcklqdq      %1, %1
    112 %endif
    113 %endmacro
    114 
        ; ACCUMULATE_TAP tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
        ; One mirrored tap pair: reads the neighbour rows p0 (at +off) and
        ; p1 (at -off) from the padded buffer at stkq, and adds
        ; constrain(p - px) * tap into the running sum in m0.  m4 must hold
        ; the centre pixels (px); kq is the tap index, dirq points at the
        ; direction's offset pair in tap_table.  With %7 (minmax) set, p0/p1
        ; are also folded into the clamp range: m7 = max, m8 = min.
        ; Clobbers m3, m5, m6, m9 (m9/m8 are aliased to other regs on x86-32).
    115 %macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
    116    ; load p0/p1
    117    movsx         offq, byte [dirq+kq+%1+14*8]  ; off1
    118 %if %6 == 4
    119    movq            m5, [stkq+offq*2+32*0]      ; p0
    120    movhps          m5, [stkq+offq*2+32*1]
    121 %else
    122    movu            m5, [stkq+offq*2+32*0]      ; p0
    123 %endif
    124    neg           offq                          ; -off1
    125 %if %6 == 4
    126    movq            m6, [stkq+offq*2+32*0]      ; p1
    127    movhps          m6, [stkq+offq*2+32*1]
    128 %else
    129    movu            m6, [stkq+offq*2+32*0]      ; p1
    130 %endif
    131 %if %7
    132  %if cpuflag(sse4)
    133    ; out of bounds values are set to a value that is both a large unsigned
    134    ; value and a negative signed value.
    135    ; use signed max and unsigned min to remove them
    136    pmaxsw          m7, m5
    137    pminuw          m8, m5
    138    pmaxsw          m7, m6
    139    pminuw          m8, m6
    140  %else
        ; no pminuw pre-SSE4: compare against the 0x7FFF sentinel (m14)
        ; and mask matching lanes out of the max instead
    141    pcmpeqw         m3, m14, m5
    142    pminsw          m8, m5     ; min after p0
    143    pandn           m3, m5
    144    pmaxsw          m7, m3     ; max after p0
    145    pcmpeqw         m3, m14, m6
    146    pminsw          m8, m6     ; min after p1
    147    pandn           m3, m6
    148    pmaxsw          m7, m3     ; max after p1
    149  %endif
    150 %endif
    151 
    152    ; accumulate sum[m13] over p0/p1
    153    psubw           m5, m4     ; diff_p0(p0 - px)
    154    psubw           m6, m4     ; diff_p1(p1 - px)
    155    packsswb        m5, m6     ; convert pixel diff to 8-bit
    156 %if cpuflag(ssse3)
    157    pshufb          m5, m13    ; group diffs p0 and p1 into pairs
    158    pabsb           m6, m5
    159    psignb          m3, %5, m5
    160 %else
        ; pre-SSSE3: interleave manually, then abs/sign via compare+xor
    161    movlhps         m6, m5
    162    punpckhbw       m6, m5
    163    pxor            m5, m5
    164    pcmpgtb         m5, m6
    165    paddb           m6, m5
    166    pxor            m6, m5     ; m6 = |diff|
    167    paddb           m3, %5, m5
    168    pxor            m3, m5     ; m3 = taps with diff's sign applied
    169 %endif
    170    pand            m9, %3, m6 ; emulate 8-bit shift
    171    psrlw           m9, %2
    172    psubusb         m5, %4, m9
    173    pminub          m5, m6     ; constrain(diff_p)
    174 %if cpuflag(ssse3)
    175    pmaddubsw       m5, m3     ; constrain(diff_p) * taps
    176 %else
        ; pmaddubsw emulation: multiply odd/even byte lanes separately
    177    psrlw           m9, m5, 8
    178    psraw           m6, m3, 8
    179    psllw           m5, 8
    180    psllw           m3, 8
    181    pmullw          m9, m6
    182    pmulhw          m5, m3
    183    paddw           m5, m9
    184 %endif
    185    paddw           m0, m5     ; sum += constrained taps
    186 %endmacro
    187 
        ; LOAD_BODY dst, src, block_width: widen four rows of 8-bit pixels
        ; from %2 (strideq apart) to 16-bit and store them at %1 with a
        ; 32-byte row pitch.  Requires m7 == 0 for the punpck zero-extends.
    188 %macro LOAD_BODY 3 ; dst, src, block_width
    189 %if %3 == 4
    190    PMOVZXBW        m0, [%2+strideq*0]
    191    PMOVZXBW        m1, [%2+strideq*1]
    192    PMOVZXBW        m2, [%2+strideq*2]
    193    PMOVZXBW        m3, [%2+stride3q]
    194    mova     [%1+32*0], m0
    195    mova     [%1+32*1], m1
    196    mova     [%1+32*2], m2
    197    mova     [%1+32*3], m3
    198 %else
        ; 8-wide (plus right padding): 16 source bytes -> 32 dest bytes/row
    199    movu            m0, [%2+strideq*0]
    200    movu            m1, [%2+strideq*1]
    201    movu            m2, [%2+strideq*2]
    202    movu            m3, [%2+stride3q]
    203    punpcklbw       m4, m0, m7
    204    punpckhbw       m0, m7
    205    mova  [%1+32*0+ 0], m4
    206    mova  [%1+32*0+16], m0
    207    punpcklbw       m4, m1, m7
    208    punpckhbw       m1, m7
    209    mova  [%1+32*1+ 0], m4
    210    mova  [%1+32*1+16], m1
    211    punpcklbw       m4, m2, m7
    212    punpckhbw       m2, m7
    213    mova  [%1+32*2+ 0], m4
    214    mova  [%1+32*2+16], m2
    215    punpcklbw       m4, m3, m7
    216    punpckhbw       m3, m7
    217    mova  [%1+32*3+ 0], m4
    218    mova  [%1+32*3+16], m3
    219 %endif
    220 %endmacro
    221 
        ; CDEF_FILTER_END w, minmax: finish one 8-pixel group.
        ; Applies the CDEF rounding (8 + sum - (sum < 0)) >> 4 to the tap
        ; sum in m0, adds it to the centre pixels in m4, optionally clamps
        ; to the [min (m8), max (m7)] range gathered by ACCUMULATE_TAP,
        ; packs to bytes and stores; then advances stkq and dstq.
    222 %macro CDEF_FILTER_END 2 ; w, minmax
    223    pxor            m6, m6
    224    pcmpgtw         m6, m0     ; m6 = (sum < 0) ? -1 : 0
    225    paddw           m0, m6     ; sum -= (sum < 0)
    226 %if cpuflag(ssse3)
    227    pmulhrsw        m0, m15    ; *2048 with rounding == (sum + 8) >> 4
    228 %else
    229    paddw           m0, m15    ; m15 = pw_8
    230    psraw           m0, 4
    231 %endif
    232    paddw           m4, m0
    233 %if %2
    234    pminsw          m4, m7
    235    pmaxsw          m4, m8
    236 %endif
    237    packuswb        m4, m4
    238 %if %1 == 4
        ; 4-wide: the two halves of m4 are two consecutive rows
    239    movd [dstq+strideq*0], m4
    240    psrlq           m4, 32
    241    movd [dstq+strideq*1], m4
    242    add           stkq, 32*2
    243    lea           dstq, [dstq+strideq*2]
    244 %else
    245    movq        [dstq], m4
    246    add           stkq, 32
    247    add           dstq, strideq
    248 %endif
    249 %endmacro
    250 
    251 %macro CDEF_FILTER 2 ; w, h
    252 %if ARCH_X86_64
    253 cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \
    254                                dst, stride, left, top, bot, pri, dst4, edge, \
    255                                stride3
    256  %define px rsp+3*16+2*32
    257  %define base 0
    258 %else
    259 cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
    260                                dst, stride, left, edge, stride3
    261    %define       topq  r2
    262    %define       botq  r2
    263    %define      dst4q  r2
    264    LEA             r5, tap_table
    265  %define px esp+7*16+2*32
    266  %define base r5-tap_table
    267 %endif
    268    mov          edged, r9m
    269 %if cpuflag(sse4)
    270   %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
    271 %else
    272   %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
    273 %endif
    274    mova            m6, OUT_OF_BOUNDS_MEM
    275    pxor            m7, m7
    276 
    277    ; prepare pixel buffers - body/right
    278 %if %2 == 8
    279    lea          dst4q, [dstq+strideq*4]
    280 %endif
    281    lea       stride3q, [strideq*3]
    282    test         edgeb, 2 ; have_right
    283    jz .no_right
    284    LOAD_BODY       px, dstq, %1
    285 %if %2 == 8
    286    LOAD_BODY  px+4*32, dst4q, %1
    287 %endif
    288    jmp .body_done
    289 .no_right:
    290    PMOVZXBW        m0, [dstq+strideq*0], %1 == 4
    291    PMOVZXBW        m1, [dstq+strideq*1], %1 == 4
    292    PMOVZXBW        m2, [dstq+strideq*2], %1 == 4
    293    PMOVZXBW        m3, [dstq+stride3q ], %1 == 4
    294    mova     [px+32*0], m0
    295    mova     [px+32*1], m1
    296    mova     [px+32*2], m2
    297    mova     [px+32*3], m3
    298    movd [px+32*0+%1*2], m6
    299    movd [px+32*1+%1*2], m6
    300    movd [px+32*2+%1*2], m6
    301    movd [px+32*3+%1*2], m6
    302 %if %2 == 8
    303    PMOVZXBW        m0, [dst4q+strideq*0], %1 == 4
    304    PMOVZXBW        m1, [dst4q+strideq*1], %1 == 4
    305    PMOVZXBW        m2, [dst4q+strideq*2], %1 == 4
    306    PMOVZXBW        m3, [dst4q+stride3q ], %1 == 4
    307    mova     [px+32*4], m0
    308    mova     [px+32*5], m1
    309    mova     [px+32*6], m2
    310    mova     [px+32*7], m3
    311    movd [px+32*4+%1*2], m6
    312    movd [px+32*5+%1*2], m6
    313    movd [px+32*6+%1*2], m6
    314    movd [px+32*7+%1*2], m6
    315 %endif
    316 .body_done:
    317 
    318    ; top
    319    movifnidn     topq, r3mp
    320    test         edgeb, 4 ; have_top
    321    jz .no_top
    322    test         edgeb, 1 ; have_left
    323    jz .top_no_left
    324    test         edgeb, 2 ; have_right
    325    jz .top_no_right
    326 %if %1 == 4
    327    PMOVZXBW        m0, [topq+strideq*0-2]
    328    PMOVZXBW        m1, [topq+strideq*1-2]
    329 %else
    330    movu            m0, [topq+strideq*0-4]
    331    movu            m1, [topq+strideq*1-4]
    332    punpckhbw       m2, m0, m7
    333    punpcklbw       m0, m7
    334    punpckhbw       m3, m1, m7
    335    punpcklbw       m1, m7
    336    movu  [px-32*2+8], m2
    337    movu  [px-32*1+8], m3
    338 %endif
    339    movu  [px-32*2-%1], m0
    340    movu  [px-32*1-%1], m1
    341    jmp .top_done
    342 .top_no_right:
    343 %if %1 == 4
    344    PMOVZXBW        m0, [topq+strideq*0-%1]
    345    PMOVZXBW        m1, [topq+strideq*1-%1]
    346    movu   [px-32*2-8], m0
    347    movu   [px-32*1-8], m1
    348 %else
    349    movu            m0, [topq+strideq*0-%1]
    350    movu            m1, [topq+strideq*1-%2]
    351    punpckhbw       m2, m0, m7
    352    punpcklbw       m0, m7
    353    punpckhbw       m3, m1, m7
    354    punpcklbw       m1, m7
    355    mova  [px-32*2-16], m0
    356    mova  [px-32*2+ 0], m2
    357    mova  [px-32*1-16], m1
    358    mova  [px-32*1+ 0], m3
    359 %endif
    360    movd [px-32*2+%1*2], m6
    361    movd [px-32*1+%1*2], m6
    362    jmp .top_done
    363 .top_no_left:
    364    test         edgeb, 2 ; have_right
    365    jz .top_no_left_right
    366 %if %1 == 4
    367    PMOVZXBW        m0, [topq+strideq*0]
    368    PMOVZXBW        m1, [topq+strideq*1]
    369 %else
    370    movu            m0, [topq+strideq*0]
    371    movu            m1, [topq+strideq*1]
    372    punpckhbw       m2, m0, m7
    373    punpcklbw       m0, m7
    374    punpckhbw       m3, m1, m7
    375    punpcklbw       m1, m7
    376    movd  [px-32*2+16], m2
    377    movd  [px-32*1+16], m3
    378 %endif
    379    movd  [px-32*2- 4], m6
    380    movd  [px-32*1- 4], m6
    381    mova  [px-32*2+ 0], m0
    382    mova  [px-32*1+ 0], m1
    383    jmp .top_done
    384 .top_no_left_right:
    385    PMOVZXBW        m0, [topq+strideq*0], %1 == 4
    386    PMOVZXBW        m1, [topq+strideq*1], %1 == 4
    387    movd   [px-32*2-4], m6
    388    movd   [px-32*1-4], m6
    389    mova   [px-32*2+0], m0
    390    mova   [px-32*1+0], m1
    391    movd [px-32*2+%1*2], m6
    392    movd [px-32*1+%1*2], m6
    393    jmp .top_done
    394 .no_top:
    395    movu  [px-32*2- 4], m6
    396    movu  [px-32*1- 4], m6
    397 %if %1 == 8
    398    movq  [px-32*2+12], m6
    399    movq  [px-32*1+12], m6
    400 %endif
    401 .top_done:
    402 
    403    ; left
    404    test         edgeb, 1 ; have_left
    405    jz .no_left
    406    movifnidn    leftq, leftmp
    407 %if %2 == 4
    408    movq            m0, [leftq]
    409 %else
    410    movu            m0, [leftq]
    411 %endif
    412 %if %2 == 4
    413    punpcklbw       m0, m7
    414 %else
    415    punpckhbw       m1, m0, m7
    416    punpcklbw       m0, m7
    417    movhlps         m3, m1
    418    movd   [px+32*4-4], m1
    419    movd   [px+32*6-4], m3
    420    psrlq           m1, 32
    421    psrlq           m3, 32
    422    movd   [px+32*5-4], m1
    423    movd   [px+32*7-4], m3
    424 %endif
    425    movhlps         m2, m0
    426    movd   [px+32*0-4], m0
    427    movd   [px+32*2-4], m2
    428    psrlq           m0, 32
    429    psrlq           m2, 32
    430    movd   [px+32*1-4], m0
    431    movd   [px+32*3-4], m2
    432    jmp .left_done
    433 .no_left:
    434    movd   [px+32*0-4], m6
    435    movd   [px+32*1-4], m6
    436    movd   [px+32*2-4], m6
    437    movd   [px+32*3-4], m6
    438 %if %2 == 8
    439    movd   [px+32*4-4], m6
    440    movd   [px+32*5-4], m6
    441    movd   [px+32*6-4], m6
    442    movd   [px+32*7-4], m6
    443 %endif
    444 .left_done:
    445 
    446    ; bottom
    447    movifnidn     botq, r4mp
    448    test         edgeb, 8 ; have_bottom
    449    jz .no_bottom
    450    test         edgeb, 1 ; have_left
    451    jz .bottom_no_left
    452    test         edgeb, 2 ; have_right
    453    jz .bottom_no_right
    454 %if %1 == 4
    455    PMOVZXBW        m0, [botq+strideq*0-(%1/2)]
    456    PMOVZXBW        m1, [botq+strideq*1-(%1/2)]
    457 %else
    458    movu            m0, [botq+strideq*0-4]
    459    movu            m1, [botq+strideq*1-4]
    460    punpckhbw       m2, m0, m7
    461    punpcklbw       m0, m7
    462    punpckhbw       m3, m1, m7
    463    punpcklbw       m1, m7
    464    movu [px+32*(%2+0)+8], m2
    465    movu [px+32*(%2+1)+8], m3
    466 %endif
    467    movu [px+32*(%2+0)-%1], m0
    468    movu [px+32*(%2+1)-%1], m1
    469    jmp .bottom_done
    470 .bottom_no_right:
    471 %if %1 == 4
    472    PMOVZXBW        m0, [botq+strideq*0-4]
    473    PMOVZXBW        m1, [botq+strideq*1-4]
    474    movu [px+32*(%2+0)-8], m0
    475    movu [px+32*(%2+1)-8], m1
    476 %else
    477    movu            m0, [botq+strideq*0-8]
    478    movu            m1, [botq+strideq*1-8]
    479    punpckhbw       m2, m0, m7
    480    punpcklbw       m0, m7
    481    punpckhbw       m3, m1, m7
    482    punpcklbw       m1, m7
    483    mova [px+32*(%2+0)-16], m0
    484    mova [px+32*(%2+0)+ 0], m2
    485    mova [px+32*(%2+1)-16], m1
    486    mova [px+32*(%2+1)+ 0], m3
    487    movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
    488 %endif
    489    movd [px+32*(%2+0)+%1*2], m6
    490    movd [px+32*(%2+1)+%1*2], m6
    491    jmp .bottom_done
    492 .bottom_no_left:
    493    test         edgeb, 2 ; have_right
    494    jz .bottom_no_left_right
    495 %if %1 == 4
    496    PMOVZXBW        m0, [botq+strideq*0]
    497    PMOVZXBW        m1, [botq+strideq*1]
    498 %else
    499    movu            m0, [botq+strideq*0]
    500    movu            m1, [botq+strideq*1]
    501    punpckhbw       m2, m0, m7
    502    punpcklbw       m0, m7
    503    punpckhbw       m3, m1, m7
    504    punpcklbw       m1, m7
    505    mova [px+32*(%2+0)+16], m2
    506    mova [px+32*(%2+1)+16], m3
    507 %endif
    508    mova [px+32*(%2+0)+ 0], m0
    509    mova [px+32*(%2+1)+ 0], m1
    510    movd [px+32*(%2+0)- 4], m6
    511    movd [px+32*(%2+1)- 4], m6
    512    jmp .bottom_done
    513 .bottom_no_left_right:
    514    PMOVZXBW        m0, [botq+strideq*0], %1 == 4
    515    PMOVZXBW        m1, [botq+strideq*1], %1 == 4
    516    mova [px+32*(%2+0)+ 0], m0
    517    mova [px+32*(%2+1)+ 0], m1
    518    movd [px+32*(%2+0)+%1*2], m6
    519    movd [px+32*(%2+1)+%1*2], m6
    520    movd [px+32*(%2+0)- 4], m6
    521    movd [px+32*(%2+1)- 4], m6
    522    jmp .bottom_done
    523 .no_bottom:
    524    movu [px+32*(%2+0)- 4], m6
    525    movu [px+32*(%2+1)- 4], m6
    526 %if %1 == 8
    527    movq [px+32*(%2+0)+12], m6
    528    movq [px+32*(%2+1)+12], m6
    529 %endif
    530 .bottom_done:
    531 
    532    ; actual filter
    533 %if ARCH_X86_64
    534    DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec
    535    mova           m13, [shufb_lohi]
    536 %if cpuflag(ssse3)
    537    mova           m15, [pw_2048]
    538 %else
    539    mova           m15, [pw_8]
    540 %endif
    541    mova           m14, m6
    542 %else
    543    DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
    544    %xdefine        m8  m1
    545    %xdefine        m9  m2
    546    %xdefine       m10  m0
    547    %xdefine       m13  [base+shufb_lohi]
    548    %xdefine       m14  OUT_OF_BOUNDS_MEM
    549 %if cpuflag(ssse3)
    550    %xdefine       m15  [base+pw_2048]
    551 %else
    552    %xdefine       m15  [base+pw_8]
    553 %endif
    554 %endif
    555    movifnidn     prid, r5m
    556    movifnidn     secd, r6m
    557    mov       dampingd, r8m
    558    movif32 [esp+0x3C], r1d
    559    test          prid, prid
    560    jz .sec_only
    561    movd            m1, r5m
    562    bsr        pridmpd, prid
    563    test          secd, secd
    564    jz .pri_only
    565    movd           m10, r6m
    566    tzcnt         secd, secd
    567    and           prid, 1
    568    sub        pridmpd, dampingd
    569    sub           secd, dampingd
    570    xor       dampingd, dampingd
    571    add           prid, prid
    572    neg        pridmpd
    573    cmovs      pridmpd, dampingd
    574    neg           secd
    575    PSHUFB_0        m1, m7
    576    PSHUFB_0       m10, m7
    577 %if ARCH_X86_64
    578    DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec
    579    lea           tapq, [tap_table]
    580    MOVDDUP        m11, [tapq+pridmpq*8] ; pri_shift_mask
    581    MOVDDUP        m12, [tapq+secq*8]    ; sec_shift_mask
    582    mov     [rsp+0x00], pridmpq          ; pri_shift
    583    mov     [rsp+0x10], secq             ; sec_shift
    584    DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off
    585 %else
    586    MOVDDUP         m2, [tapq+pridmpq*8]
    587    MOVDDUP         m3, [tapq+secq*8]
    588    mov     [esp+0x04], dampingd         ; zero upper 32 bits of psrlw
    589    mov     [esp+0x34], dampingd         ; source operand in ACCUMULATE_TAP
    590    mov     [esp+0x00], pridmpd
    591    mov     [esp+0x30], secd
    592    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
    593  %define         offq  dstq
    594  %define           kd  strided
    595  %define           kq  strideq
    596    mova    [esp+0x10], m2
    597    mova    [esp+0x40], m3
    598    mova    [esp+0x20], m1
    599    mova    [esp+0x50], m10
    600 %endif
    601    mov           dird, r7m
    602    lea           stkq, [px]
    603    lea           priq, [tapq+8*8+priq*8] ; pri_taps
    604    mov             hd, %1*%2/8
    605    lea           dirq, [tapq+dirq*2]
    606 .v_loop:
    607    movif32 [esp+0x38], dstd
    608    mov             kd, 1
    609 %if %1 == 4
    610    movq            m4, [stkq+32*0]
    611    movhps          m4, [stkq+32*1]
    612 %else
    613    mova            m4, [stkq+32*0]       ; px
    614 %endif
    615    pxor            m0, m0                ; sum
    616    mova            m7, m4                ; max
    617    mova            m8, m4                ; min
    618 .k_loop:
    619    MOVDDUP         m2, [priq+kq*8]
    620 %if ARCH_X86_64
    621    ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
    622    MOVDDUP         m2, [tapq+12*8+kq*8]
    623    ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
    624    ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
    625 %else
    626    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
    627    MOVDDUP         m2, [tapq+12*8+kq*8]
    628    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
    629    MOVDDUP         m2, [tapq+12*8+kq*8]
    630    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
    631 %endif
    632    dec             kd
    633    jge .k_loop
    634    movif32       dstq, [esp+0x38]
    635    movif32    strideq, [esp+0x3C]
    636    CDEF_FILTER_END %1, 1
    637    dec             hd
    638    jg .v_loop
    639    RET
    640 
    641 .pri_only:
    642 %if ARCH_X86_64
    643    DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap
    644    lea           tapq, [tap_table]
    645 %else
    646    DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
    647 %endif
    648    and           prid, 1
    649    xor          zerod, zerod
    650    sub       dampingd, pridmpd
    651    cmovs     dampingd, zerod
    652    add           prid, prid
    653    PSHUFB_0        m1, m7
    654    MOVDDUP         m7, [tapq+dampingq*8]
    655    mov     [rsp+0x00], dampingq
    656 %if ARCH_X86_64
    657    DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off
    658 %else
    659    mov     [rsp+0x04], zerod
    660    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
    661 %endif
    662    mov           dird, r7m
    663    lea           stkq, [px]
    664    lea           priq, [tapq+8*8+priq*8]
    665    mov             hd, %1*%2/8
    666    lea           dirq, [tapq+dirq*2]
    667 .pri_v_loop:
    668    movif32 [esp+0x38], dstd
    669    mov             kd, 1
    670 %if %1 == 4
    671    movq            m4, [stkq+32*0]
    672    movhps          m4, [stkq+32*1]
    673 %else
    674    mova            m4, [stkq+32*0]
    675 %endif
    676    pxor            m0, m0
    677 .pri_k_loop:
    678    MOVDDUP         m2, [priq+kq*8]
    679    ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
    680    dec             kd
    681    jge .pri_k_loop
    682    movif32       dstq, [esp+0x38]
    683    movif32    strideq, [esp+0x3C]
    684    CDEF_FILTER_END %1, 0
    685    dec             hd
    686    jg .pri_v_loop
    687    RET
    688 
    689 .sec_only:
    690 %if ARCH_X86_64
    691    DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec
    692 %else
    693    DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
    694 %endif
    695    movd            m1, r6m
    696    tzcnt         secd, secd
    697    mov           dird, r7m
    698    xor          zerod, zerod
    699    sub       dampingd, secd
    700    cmovs     dampingd, zerod
    701    PSHUFB_0        m1, m7
    702 %if ARCH_X86_64
    703    lea           tapq, [tap_table]
    704 %else
    705    mov     [rsp+0x04], zerod
    706 %endif
    707    mov     [rsp+0x00], dampingq
    708    MOVDDUP         m7, [tapq+dampingq*8]
    709    lea           dirq, [tapq+dirq*2]
    710 %if ARCH_X86_64
    711    DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k
    712 %else
    713    DEFINE_ARGS dst, stride, off, stk, dir, tap, h
    714 %endif
    715    lea           stkq, [px]
    716    mov             hd, %1*%2/8
    717 .sec_v_loop:
    718    mov             kd, 1
    719 %if %1 == 4
    720    movq            m4, [stkq+32*0]
    721    movhps          m4, [stkq+32*1]
    722 %else
    723    mova            m4, [stkq+32*0]
    724 %endif
    725    pxor            m0, m0
    726 .sec_k_loop:
    727    MOVDDUP         m2, [tapq+12*8+kq*8]
    728    ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
    729 %if ARCH_X86_32
    730    MOVDDUP         m2, [tapq+12*8+kq*8]
    731 %endif
    732    ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
    733    dec             kd
    734    jge .sec_k_loop
    735    movif32    strideq, [esp+0x3C]
    736    CDEF_FILTER_END %1, 0
    737    dec             hd
    738    jg .sec_v_loop
    739    RET
    740 %endmacro
    741 
        ; MULLD dst, src: packed 32-bit multiply (low dword of product).
        ; Without SSE4.1 pmulld, it is synthesized from 16-bit multiplies;
        ; this is only correct when each 32-bit lane of %2 carries the same
        ; multiplier in both 16-bit halves — which is exactly how
        ; div_table_ssse3 is laid out above.  Clobbers m15 (m1 on x86-32).
    742 %macro MULLD 2
    743 %if cpuflag(sse4)
    744    pmulld          %1, %2
    745 %else
    746  %if ARCH_X86_32
    747   %define m15 m1
    748  %endif
    749    pmulhuw        m15, %1, %2    ; high 16 bits of the 16-bit products
    750    pmullw          %1, %2        ; low 16 bits of the 16-bit products
    751    pslld          m15, 16
    752    paddd           %1, m15       ; recombine into low dword of a*b
    753 %endif
    754 %endmacro
    755 
    756 %macro CDEF_DIR 0
    757 %if ARCH_X86_64
    758 cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var
    759    lea             r6, [strideq*3]
    760    movq            m1, [srcq+strideq*0]
    761    movhps          m1, [srcq+strideq*1]
    762    movq            m3, [srcq+strideq*2]
    763    movhps          m3, [srcq+r6       ]
    764    lea           srcq, [srcq+strideq*4]
    765    movq            m5, [srcq+strideq*0]
    766    movhps          m5, [srcq+strideq*1]
    767    movq            m7, [srcq+strideq*2]
    768    movhps          m7, [srcq+r6       ]
    769 
    770    pxor            m8, m8
    771    psadbw          m9, m1, m8
    772    psadbw          m2, m3, m8
    773    psadbw          m4, m5, m8
    774    psadbw          m6, m7, m8
    775    packssdw        m9, m2
    776    packssdw        m4, m6
    777    packssdw        m9, m4
    778 
    779    punpcklbw       m0, m1, m8
    780    punpckhbw       m1, m8
    781    punpcklbw       m2, m3, m8
    782    punpckhbw       m3, m8
    783    punpcklbw       m4, m5, m8
    784    punpckhbw       m5, m8
    785    punpcklbw       m6, m7, m8
    786    punpckhbw       m7, m8
    787 cglobal_label .main
    788    mova            m8, [pw_128]
    789    psubw           m0, m8
    790    psubw           m1, m8
    791    psubw           m2, m8
    792    psubw           m3, m8
    793    psubw           m4, m8
    794    psubw           m5, m8
    795    psubw           m6, m8
    796    psubw           m7, m8
    797    psllw           m8, 3
    798    psubw           m9, m8                  ; partial_sum_hv[0]
    799 
    800    paddw           m8, m0, m1
    801    paddw          m10, m2, m3
    802    paddw           m8, m4
    803    paddw          m10, m5
    804    paddw           m8, m6
    805    paddw          m10, m7
    806    paddw           m8, m10                 ; partial_sum_hv[1]
    807 
    808    pmaddwd         m8, m8
    809    pmaddwd         m9, m9
    810    phaddd          m9, m8
    811    SWAP            m8, m9
    812    MULLD           m8, [div_table%+SUFFIX+48]
    813 
    814    pslldq          m9, m1, 2
    815    psrldq         m10, m1, 14
    816    pslldq         m11, m2, 4
    817    psrldq         m12, m2, 12
    818    pslldq         m13, m3, 6
    819    psrldq         m14, m3, 10
    820    paddw           m9, m0
    821    paddw          m10, m12
    822    paddw          m11, m13
    823    paddw          m10, m14                 ; partial_sum_diag[0] top/right half
    824    paddw           m9, m11                 ; partial_sum_diag[0] top/left half
    825    pslldq         m11, m4, 8
    826    psrldq         m12, m4, 8
    827    pslldq         m13, m5, 10
    828    psrldq         m14, m5, 6
    829    paddw           m9, m11
    830    paddw          m10, m12
    831    paddw           m9, m13
    832    paddw          m10, m14
    833    pslldq         m11, m6, 12
    834    psrldq         m12, m6, 4
    835    pslldq         m13, m7, 14
    836    psrldq         m14, m7, 2
    837    paddw           m9, m11
    838    paddw          m10, m12
    839    paddw           m9, m13                 ; partial_sum_diag[0][0-7]
    840    paddw          m10, m14                 ; partial_sum_diag[0][8-14,zero]
    841    pshufb         m10, [shufw_6543210x]
    842    punpckhwd      m11, m9, m10
    843    punpcklwd       m9, m10
    844    pmaddwd        m11, m11
    845    pmaddwd         m9, m9
    846    MULLD          m11, [div_table%+SUFFIX+16]
    847    MULLD           m9, [div_table%+SUFFIX+0]
    848    paddd           m9, m11                 ; cost[0a-d]
    849 
    ; --- partial_sum_diag[1]: the opposite diagonal ---
    ; Same scheme as diag[0] but the per-row shift amounts run in the
    ; reverse order (row 0 shifted the most). m10 = words 0-7,
    ; m11 = words 8-14 (+zero).
    pslldq         m10, m0, 14
    psrldq         m11, m0, 2
    pslldq         m12, m1, 12
    psrldq         m13, m1, 4
    pslldq         m14, m2, 10
    psrldq         m15, m2, 6
    paddw          m10, m12
    paddw          m11, m13
    paddw          m10, m14
    paddw          m11, m15
    pslldq         m12, m3, 8
    psrldq         m13, m3, 8
    pslldq         m14, m4, 6
    psrldq         m15, m4, 10
    paddw          m10, m12
    paddw          m11, m13
    paddw          m10, m14
    paddw          m11, m15
    pslldq         m12, m5, 4
    psrldq         m13, m5, 12
    pslldq         m14, m6, 2
    psrldq         m15, m6, 14
    paddw          m10, m12
    paddw          m11, m13
    paddw          m10, m14
    paddw          m11, m15                 ; partial_sum_diag[1][8-14,zero]
    paddw          m10, m7                  ; partial_sum_diag[1][0-7]
    ; mirror upper half, square, weight — same reduction as cost[0]
    pshufb         m11, [shufw_6543210x]
    punpckhwd      m12, m10, m11
    punpcklwd      m10, m11
    pmaddwd        m12, m12
    pmaddwd        m10, m10
    MULLD          m12, [div_table%+SUFFIX+16]
    MULLD          m10, [div_table%+SUFFIX+0]
    paddd          m10, m12                 ; cost[4a-d]
    phaddd          m9, m10                 ; cost[0a/b,4a/b]
    886 
    ; --- half-resolution sums for the "alt" (in-between) directions ---
    ; vertical pair sums in m10-13, horizontal pair sums (phaddw) in m0-3
    paddw          m10, m0, m1
    paddw          m11, m2, m3
    paddw          m12, m4, m5
    paddw          m13, m6, m7
    phaddw          m0, m4
    phaddw          m1, m5
    phaddw          m2, m6
    phaddw          m3, m7

    ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
    ; skew the vertical half-sums by 1 word per row-pair
    pslldq          m4, m11, 2
    psrldq          m5, m11, 14
    pslldq          m6, m12, 4
    psrldq          m7, m12, 12
    pslldq         m14, m13, 6
    psrldq         m15, m13, 10
    paddw           m4, m10
    paddw           m5, m7
    paddw           m4, m6
    paddw           m5, m15                 ; partial_sum_alt[3] right
    paddw           m4, m14                 ; partial_sum_alt[3] left
    ; reorder right-half words, interleave with left half, square, and
    ; apply the alt-direction div_table weights
    pshuflw         m6, m5, q3012
    punpckhwd       m5, m4
    punpcklwd       m4, m6
    pmaddwd         m5, m5
    pmaddwd         m4, m4
    MULLD           m5, [div_table%+SUFFIX+48]
    MULLD           m4, [div_table%+SUFFIX+32]
    paddd           m4, m5                  ; cost[7a-d]
    916 
    ; --- cost[5]: vertical half-sums skewed in the opposite direction ---
    pslldq          m5, m10, 6
    psrldq          m6, m10, 10
    pslldq          m7, m11, 4
    psrldq         m10, m11, 12
    pslldq         m11, m12, 2
    psrldq         m12, 14
    paddw           m5, m7
    paddw           m6, m10
    paddw           m5, m11
    paddw           m6, m12
    paddw           m5, m13
    ; same reorder/square/weight reduction as cost[7]
    pshuflw         m7, m6, q3012
    punpckhwd       m6, m5
    punpcklwd       m5, m7
    pmaddwd         m6, m6
    pmaddwd         m5, m5
    MULLD           m6, [div_table%+SUFFIX+48]
    MULLD           m5, [div_table%+SUFFIX+32]
    paddd           m5, m6                  ; cost[5a-d]
    936 
    ; --- cost[1]: skewed sums of the horizontal half-sums m0-3 ---
    ; NOTE(review): the "partial_sum_alt[3]" labels below appear copy-pasted
    ; from the cost[7] block; the alt index here presumably differs —
    ; verify against the C reference (cdef_find_dir).
    pslldq          m6, m1, 2
    psrldq          m7, m1, 14
    pslldq         m10, m2, 4
    psrldq         m11, m2, 12
    pslldq         m12, m3, 6
    psrldq         m13, m3, 10
    paddw           m6, m0
    paddw           m7, m11
    paddw           m6, m10
    paddw           m7, m13                 ; partial_sum_alt[3] right
    paddw           m6, m12                 ; partial_sum_alt[3] left
    pshuflw        m10, m7, q3012
    punpckhwd       m7, m6
    punpcklwd       m6, m10
    pmaddwd         m7, m7
    pmaddwd         m6, m6
    MULLD           m7, [div_table%+SUFFIX+48]
    MULLD           m6, [div_table%+SUFFIX+32]
    paddd           m6, m7                  ; cost[1a-d]
    956 
    ; --- cost[3]: mirror the horizontal half-sums (swap qword halves)
    ; and skew them the other way ---
    pshufd          m0, m0, q1032
    pshufd          m1, m1, q1032
    pshufd          m2, m2, q1032
    pshufd          m3, m3, q1032

    pslldq         m10, m0, 6
    psrldq         m11, m0, 10
    pslldq         m12, m1, 4
    psrldq         m13, m1, 12
    pslldq         m14, m2, 2
    psrldq          m2, 14
    paddw          m10, m12
    paddw          m11, m13
    paddw          m10, m14
    paddw          m11, m2
    paddw          m10, m3
    pshuflw        m12, m11, q3012
    punpckhwd      m11, m10
    punpcklwd      m10, m12
    pmaddwd        m11, m11
    pmaddwd        m10, m10
    MULLD          m11, [div_table%+SUFFIX+48]
    MULLD          m10, [div_table%+SUFFIX+32]
    paddd          m10, m11                 ; cost[3a-d]

    ; combine all eight costs: m8 = cost[2a/b,6a/b] computed above this
    ; chunk, m9 = cost[0a/b,4a/b] from earlier
    phaddd          m9, m8                  ; cost[0,4,2,6]
    phaddd          m6, m10
    phaddd          m5, m4
    phaddd          m6, m5                  ; cost[1,3,5,7]
    pshufd          m4, m9, q3120
    987 
    ; now find the best cost: horizontal max over all 8 dwords.
    ; SSE4.1 has pmaxsd; the pre-SSE4 path emulates signed max with
    ; pcmpgtd + pand/pandn/por select.
%if cpuflag(sse4)
    pmaxsd          m9, m6
    pshufd          m0, m9, q1032
    pmaxsd          m0, m9
    pshufd          m1, m0, q2301
    pmaxsd          m0, m1                  ; best cost
%else
    pcmpgtd         m0, m9, m6
    pand            m9, m0
    pandn           m0, m6
    por             m9, m0                  ; m9 = max(cost[0,4,2,6], cost[1,3,5,7])
    pshufd          m1, m9, q1032
    pcmpgtd         m0, m9, m1
    pand            m9, m0
    pandn           m0, m1
    por             m9, m0
    pshufd          m1, m9, q2301
    pcmpgtd         m0, m9, m1
    pand            m9, m0
    pandn           m0, m1
    por             m0, m9                  ; m0 = best cost in all lanes
%endif
   1011 
    ; get direction and variance:
    ; spill (best - cost[d]) for all 8 directions to a 32-byte scratch area,
    ; locate the best lane via compare-mask + tzcnt, then read the
    ; complementary difference and return it as the variance (>> 10).
    punpckhdq       m1, m4, m6
    punpckldq       m4, m6
    psubd           m2, m0, m1
    psubd           m3, m0, m4
%if WIN64
    WIN64_RESTORE_XMM
    %define tmp rsp+stack_offset+8
%else
    ; SysV leaf function: scratch lives in the red zone below rsp
    %define tmp rsp-40
%endif
    mova    [tmp+0x00], m2                  ; emulate ymm in stack
    mova    [tmp+0x10], m3
    pcmpeqd         m1, m0                  ; compute best cost mask
    pcmpeqd         m4, m0
    packssdw        m4, m1
    pmovmskb       eax, m4                  ; get byte-idx from mask
    tzcnt          eax, eax
    mov            r1d, [tmp+rax*2]         ; get idx^4 complement from emulated ymm
    shr            eax, 1                   ; get direction by converting byte-idx to word-idx
    shr            r1d, 10
    mov         [varq], r1d
   1034 %else
; x86-32 variant: only 8 XMM registers, so intermediate rows are spilled
; into the 96-byte stack frame at esp+0x00/0x10/0x20/0x50.
cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
%define base r2-shufw_6543210x
    LEA             r2, shufw_6543210x
    pxor            m0, m0
    lea       stride3q, [strideq*3]
    ; load rows 0-3 (8 bytes each), two rows per register
    movq            m5, [srcq+strideq*0]
    movhps          m5, [srcq+strideq*1]
    movq            m7, [srcq+strideq*2]
    movhps          m7, [srcq+stride3q]
    mova            m1, [base+pw_128]
    ; psadbw vs zero gives per-8-byte row sums, kept for partial_sum_hv
    psadbw          m2, m5, m0
    psadbw          m3, m7, m0
    packssdw        m2, m3
    ; widen bytes to words and center around zero (subtract 128)
    punpcklbw       m4, m5, m0
    punpckhbw       m5, m0
    punpcklbw       m6, m7, m0
    punpckhbw       m7, m0
    psubw           m4, m1
    psubw           m5, m1
    psubw           m6, m1
    psubw           m7, m1

    ; spill rows 0-3 (centered) to the stack frame
    mova    [esp+0x00], m4
    mova    [esp+0x10], m5
    mova    [esp+0x20], m6
    mova    [esp+0x50], m7

    ; load rows 4-7 the same way
    lea           srcq, [srcq+strideq*4]
    movq            m5, [srcq+strideq*0]
    movhps          m5, [srcq+strideq*1]
    movq            m7, [srcq+strideq*2]
    movhps          m7, [srcq+stride3q]
    psadbw          m3, m5, m0
    psadbw          m0, m7
    packssdw        m3, m0
    pxor            m0, m0
    punpcklbw       m4, m5, m0
    punpckhbw       m5, m0
    punpcklbw       m6, m7, m0
    punpckhbw       m7, m0
; secondary entry point (presumably reused by another bit-depth wrapper —
; confirm against callers)
cglobal_label .main
    psubw           m4, m1
    psubw           m5, m1
    psubw           m6, m1
    psubw           m7, m1
    packssdw        m2, m3
    ; m1 = pw_128; << 3 gives 1024 = 8*128, the bias contained in each
    ; psadbw row sum of 8 uncentered pixels
    psllw           m1, 3
    psubw           m2, m1                  ; partial_sum_hv[0]
    pmaddwd         m2, m2

    ; column sums over all 8 centered rows (x86inc emulates the
    ; 3-operand paddw form)
    mova            m3, [esp+0x50]
    mova            m0, [esp+0x00]
    paddw           m0, [esp+0x10]
    paddw           m1, m3, [esp+0x20]
    paddw           m0, m4
    paddw           m1, m5
    paddw           m0, m6
    paddw           m1, m7
    paddw           m0, m1                  ; partial_sum_hv[1]
    pmaddwd         m0, m0

    phaddd          m2, m0
    MULLD           m2, [base+div_table%+SUFFIX+48]
    mova    [esp+0x30], m2                  ; cost[2a/b,6a/b]
   1099 
    ; --- partial_sum_diag[0] (x86-32): rows 0-3 reloaded from the stack,
    ; rows 4-7 still live in m4-7; accumulate into m0 (words 0-7) /
    ; m1 (words 8-14) with a 1-word extra shift per row ---
    mova            m1, [esp+0x10]
    pslldq          m0, m1, 2
    psrldq          m1, 14
    paddw           m0, [esp+0x00]
    pslldq          m2, m3, 6
    psrldq          m3, 10
    paddw           m0, m2
    paddw           m1, m3
    mova            m3, [esp+0x20]
    pslldq          m2, m3, 4
    psrldq          m3, 12
    paddw           m0, m2                  ; partial_sum_diag[0] top/left half
    paddw           m1, m3                  ; partial_sum_diag[0] top/right half
    pslldq          m2, m4, 8
    psrldq          m3, m4, 8
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m5, 10
    psrldq          m3, m5, 6
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m6, 12
    psrldq          m3, m6, 4
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m7, 14
    psrldq          m3, m7, 2
    paddw           m0, m2                  ; partial_sum_diag[0][0-7]
    paddw           m1, m3                  ; partial_sum_diag[0][8-14,zero]
    mova            m3, [esp+0x50]          ; reload row 3 for the next pass
    ; mirror upper half so symmetric taps share weights, square, weight
    pshufb          m1, [base+shufw_6543210x]
    punpckhwd       m2, m0, m1
    punpcklwd       m0, m1
    pmaddwd         m2, m2
    pmaddwd         m0, m0
    MULLD           m2, [base+div_table%+SUFFIX+16]
    MULLD           m0, [base+div_table%+SUFFIX+ 0]
    paddd           m0, m2                  ; cost[0a-d]
    mova    [esp+0x40], m0
   1139 
    ; --- partial_sum_diag[1]: opposite diagonal, shift order reversed ---
    mova            m1, [esp+0x00]
    pslldq          m0, m1, 14
    psrldq          m1, 2
    paddw           m0, m7
    pslldq          m2, m3, 8
    psrldq          m3, 8
    paddw           m0, m2
    paddw           m1, m3
    mova            m3, [esp+0x20]
    pslldq          m2, m3, 10
    psrldq          m3, 6
    paddw           m0, m2
    paddw           m1, m3
    mova            m3, [esp+0x10]
    pslldq          m2, m3, 12
    psrldq          m3, 4
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m4, 6
    psrldq          m3, m4, 10
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m5, 4
    psrldq          m3, m5, 12
    paddw           m0, m2
    paddw           m1, m3
    pslldq          m2, m6, 2
    psrldq          m3, m6, 14
    paddw           m0, m2                  ; partial_sum_diag[1][0-7]
    paddw           m1, m3                  ; partial_sum_diag[1][8-14,zero]
    mova            m3, [esp+0x50]
    pshufb          m1, [base+shufw_6543210x]
    punpckhwd       m2, m0, m1
    punpcklwd       m0, m1
    pmaddwd         m2, m2
    pmaddwd         m0, m0
    MULLD           m2, [base+div_table%+SUFFIX+16]
    MULLD           m0, [base+div_table%+SUFFIX+ 0]
    paddd           m0, m2                  ; cost[4a-d]
    ; merge the even-direction costs (3-operand phaddd via x86inc emulation)
    phaddd          m1, [esp+0x40], m0      ; cost[0a/b,4a/b]
    phaddd          m1, [esp+0x30]          ; cost[0,4,2,6]
    mova    [esp+0x30], m1
   1182 
    ; --- half-resolution sums for the alt directions (x86-32) ---
    ; m0-3: horizontal pair sums (phaddw); m4-7: vertical pair sums;
    ; horizontal half-sums are spilled back to esp+0x00/0x10/0x20
    phaddw          m0, [esp+0x00], m4
    phaddw          m1, [esp+0x10], m5
    paddw           m4, m5
    mova            m2, [esp+0x20]
    paddw           m5, m2, m3
    phaddw          m2, m6
    paddw           m6, m7
    phaddw          m3, m7
    mova            m7, [esp+0x00]
    paddw           m7, [esp+0x10]
    mova    [esp+0x00], m0
    mova    [esp+0x10], m1
    mova    [esp+0x20], m2

    ; skew the vertical half-sums and reduce to cost[7]
    pslldq          m1, m4, 4
    pslldq          m2, m6, 6
    pslldq          m0, m5, 2
    paddw           m1, m2
    paddw           m0, m7
    psrldq          m2, m5, 14
    paddw           m0, m1                  ; partial_sum_alt[3] left
    psrldq          m1, m4, 12
    paddw           m1, m2
    psrldq          m2, m6, 10
    paddw           m1, m2                  ; partial_sum_alt[3] right
    pshuflw         m1, m1, q3012
    punpckhwd       m2, m0, m1
    punpcklwd       m0, m1
    pmaddwd         m2, m2
    pmaddwd         m0, m0
    MULLD           m2, [base+div_table%+SUFFIX+48]
    MULLD           m0, [base+div_table%+SUFFIX+32]
    paddd           m0, m2                  ; cost[7a-d]
    mova    [esp+0x40], m0
   1217 
    ; --- cost[5]: vertical half-sums skewed in the opposite direction ---
    pslldq          m0, m7, 6
    psrldq          m7, 10
    pslldq          m1, m5, 4
    psrldq          m5, 12
    pslldq          m2, m4, 2
    psrldq          m4, 14
    paddw           m0, m6
    paddw           m7, m5
    paddw           m0, m1
    paddw           m7, m4
    paddw           m0, m2
    pshuflw         m2, m7, q3012
    punpckhwd       m7, m0
    punpcklwd       m0, m2
    pmaddwd         m7, m7
    pmaddwd         m0, m0
    MULLD           m7, [base+div_table%+SUFFIX+48]
    MULLD           m0, [base+div_table%+SUFFIX+32]
    paddd           m0, m7                  ; cost[5a-d]
    mova    [esp+0x50], m0
   1238 
    ; --- cost[1]: skewed sums of the horizontal half-sums ---
    ; NOTE(review): "partial_sum_alt[3]" labels look copy-pasted from the
    ; cost[7] block; the alt index here presumably differs — verify vs C ref.
    mova            m7, [esp+0x10]
    mova            m2, [esp+0x20]
    pslldq          m0, m7, 2
    psrldq          m7, 14
    pslldq          m4, m2, 4
    psrldq          m2, 12
    pslldq          m5, m3, 6
    psrldq          m6, m3, 10
    paddw           m0, [esp+0x00]
    paddw           m7, m2
    paddw           m4, m5
    paddw           m7, m6                  ; partial_sum_alt[3] right
    paddw           m0, m4                  ; partial_sum_alt[3] left
    pshuflw         m2, m7, q3012
    punpckhwd       m7, m0
    punpcklwd       m0, m2
    pmaddwd         m7, m7
    pmaddwd         m0, m0
    MULLD           m7, [base+div_table%+SUFFIX+48]
    MULLD           m0, [base+div_table%+SUFFIX+32]
    paddd           m0, m7                  ; cost[1a-d]
    SWAP            m0, m4                  ; keep cost[1] in m4 for later
   1261 
    ; --- cost[3]: mirror the horizontal half-sums (swap qword halves)
    ; and skew the other way ---
    pshufd          m0, [esp+0x00], q1032
    pshufd          m1, [esp+0x10], q1032
    pshufd          m2, [esp+0x20], q1032
    pshufd          m3, m3, q1032
    mova    [esp+0x00], m4                  ; spill cost[1]

    pslldq          m4, m0, 6
    psrldq          m0, 10
    pslldq          m5, m1, 4
    psrldq          m1, 12
    pslldq          m6, m2, 2
    psrldq          m2, 14
    paddw           m4, m3
    paddw           m0, m1
    paddw           m5, m6
    paddw           m0, m2
    paddw           m4, m5
    pshuflw         m2, m0, q3012
    punpckhwd       m0, m4
    punpcklwd       m4, m2
    pmaddwd         m0, m0
    pmaddwd         m4, m4
    MULLD           m0, [base+div_table%+SUFFIX+48]
    MULLD           m4, [base+div_table%+SUFFIX+32]
    paddd           m4, m0                   ; cost[3a-d]

    ; combine: m1 = cost[1]+cost[3] halves, m2 = cost[5]+cost[7] halves
    mova            m1, [esp+0x00]
    mova            m2, [esp+0x50]
    mova            m0, [esp+0x30]          ; cost[0,4,2,6]
    phaddd          m1, m4
    phaddd          m2, [esp+0x40]          ; cost[1,3,5,7]
    phaddd          m1, m2
    pshufd          m2, m0, q3120
   1295 
    ; now find the best cost: horizontal max over all 8 dwords.
    ; SSE4.1 uses pmaxsd; otherwise emulate signed max with
    ; pcmpgtd + pand/pandn/por select.
%if cpuflag(sse4)
    pmaxsd          m0, m1
    pshufd          m3, m0, q1032
    pmaxsd          m3, m0
    pshufd          m0, m3, q2301
    pmaxsd          m0, m3                  ; best cost in all lanes
%else
    pcmpgtd         m3, m0, m1
    pand            m0, m3
    pandn           m3, m1
    por             m0, m3                  ; m0 = max(cost[0,4,2,6], cost[1,3,5,7])
    pshufd          m4, m0, q1032
    pcmpgtd         m3, m0, m4
    pand            m0, m3
    pandn           m3, m4
    por             m0, m3
    pshufd          m4, m0, q2301
    pcmpgtd         m3, m0, m4
    pand            m0, m3
    pandn           m3, m4
    por             m0, m3                  ; m0 = best cost in all lanes
%endif
   1319 
    ; get direction and variance:
    ; spill (best - cost[d]) for all 8 directions into the stack frame,
    ; find the best lane via compare-mask + tzcnt, then read the
    ; complementary difference and store it as the variance (>> 10).
    mov           vard, varm                ; reload var pointer (stack arg on x86-32)
    punpckhdq       m3, m2, m1
    punpckldq       m2, m1
    psubd           m1, m0, m3
    psubd           m4, m0, m2
    mova    [esp+0x00], m1                  ; emulate ymm in stack
    mova    [esp+0x10], m4
    pcmpeqd         m3, m0                  ; compute best cost mask
    pcmpeqd         m2, m0
    packssdw        m2, m3
    pmovmskb       eax, m2                  ; get byte-idx from mask
    tzcnt          eax, eax
    mov            r1d, [esp+eax*2]         ; get idx^4 complement from emulated ymm
    shr            eax, 1                   ; get direction by converting byte-idx to word-idx
    shr            r1d, 10
    mov         [vard], r1d
%endif

    ; direction is returned in eax (both the 64-bit and 32-bit paths)
    RET
   1340 %endmacro
   1341 
; Instantiate the filter/direction kernels once per instruction set.
; x86inc's INIT_XMM sets the function-name SUFFIX and cpuflag() state
; used inside the macros above.
INIT_XMM sse4
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

INIT_XMM ssse3
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
CDEF_DIR

INIT_XMM sse2
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
; no CDEF_DIR visible here for plain SSE2 — presumably because CDEF_DIR
; uses pshufb (SSSE3+); confirm against the rest of the file
CDEF_FILTER 4, 4