tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef_avx2.asm (58545B)


      1 ; Copyright © 2018, VideoLAN and dav1d authors
      2 ; Copyright © 2018, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 %macro JMP_TABLE 2-* ; fn_name, label1 [, label2, ...]
     32 %xdefine %1_jmptable %%table ; expose the table address as <fn_name>_jmptable
     33 %xdefine %%base mangle(private_prefix %+ _%1_avx2) ; mangled symbol of the owning function
     34 %%table: ; one dd per label: offset of %%base.<label> relative to the table itself
     35 %rep %0 - 1
     36    dd %%base %+ .%2 - %%table ; 32-bit table-relative offset (position-independent)
     37  %rotate 1
     38 %endrep
     39 %endmacro
     40 
     41 %macro CDEF_FILTER_JMP_TABLE 1 ; %1 = block-size suffix (4x4 / 4x8 / 8x8)
     42 JMP_TABLE cdef_filter_%1_8bpc, \
     43    d6k0, d6k1, d7k0, d7k1, \
     44    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
     45    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
     46    d0k0, d0k1, d1k0, d1k1 ; leading/trailing entries duplicate neighbors so dir-2/dir+2 indexing never needs & 7
     47 %endmacro
     48 
     49 SECTION_RODATA 32
     50 
     51 pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6 ; dword permutation (no user in this chunk - presumably cdef_dir)
     52 blend_4x4:     dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00 ; 0x80 = msb set: vpblendvb selects the left-edge source
     53               dd 0x80, 0x00, 0x00
     54 blend_4x8_0:   dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
     55 blend_4x8_1:   dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
     56               dd 0x00, 0x00
     57 blend_4x8_2:   dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
     58               dd 0x0000
     59 blend_4x8_3:   dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
     60               dd 0x0000, 0x0000
     61 blend_8x8_0:   dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
     62 blend_8x8_1:   dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
     63 div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105 ; no user in this chunk - presumably cdef_dir
     64 shufw_6543210x:db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15 ; reverse words 6..0, keep word 7
     65 shufb_lohi:    db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 ; interleave low/high 8 bytes
     66 pw_128:        times 2 dw 128
     67 pw_2048:       times 2 dw 2048 ; pmulhrsw by 2048 == (x + 8) >> 4
     68 tap_table:     ; masks for 8 bit shifts
     69               db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
     70               ; weights
     71               db  4,  2,  3,  3,  2,  1 ; pri taps (pri&1 == 0 / == 1), then sec taps
     72               db -1 * 16 + 1, -2 * 16 + 2 ; per-dir k0/k1 offsets, encoded y*16+x (table+14)
     73               db  0 * 16 + 1, -1 * 16 + 2
     74               db  0 * 16 + 1,  0 * 16 + 2
     75               db  0 * 16 + 1,  1 * 16 + 2
     76               db  1 * 16 + 1,  2 * 16 + 2
     77               db  1 * 16 + 0,  2 * 16 + 1
     78               db  1 * 16 + 0,  2 * 16 + 0
     79               db  1 * 16 + 0,  2 * 16 - 1
     80               ; the last 6 are repeats of the first 6 so we don't need to & 7
     81               db -1 * 16 + 1, -2 * 16 + 2
     82               db  0 * 16 + 1, -1 * 16 + 2
     83               db  0 * 16 + 1,  0 * 16 + 2
     84               db  0 * 16 + 1,  1 * 16 + 2
     85               db  1 * 16 + 1,  2 * 16 + 2
     86               db  1 * 16 + 0,  2 * 16 + 1
     87 
     88 CDEF_FILTER_JMP_TABLE 4x4 ; emit the three per-size direction jump tables
     89 CDEF_FILTER_JMP_TABLE 4x8
     90 CDEF_FILTER_JMP_TABLE 8x8
     91 
     92 SECTION .text
     93 
     94 %macro PREP_REGS 2 ; w, h
     95    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
     96    mov           dird, r7m                     ; dir = 8th argument
     97    lea         tableq, [cdef_filter_%1x%2_8bpc_jmptable]
     98    lea           dirq, [tableq+dirq*2*4]       ; 2 dword entries (k0/k1) per dir; +%1*8 bytes later selects dir-2/+0/+2
     99 %if %1 == 4
    100 %if %2 == 4
    101  DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
    102              table, dir, dirjmp, stride3, k
    103 %else
    104  DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
    105              table, dir, dirjmp, dst4, stride3, k
    106    lea          dst4q, [dstq+strideq*4]        ; second 4-row half of the 4x8 block
    107 %endif
    108 %else
    109  DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
    110              table, dir, dirjmp, top2, stride3, k
    111    mov             hq, -8                      ; row counter, stepped +4 per v_loop iteration
    112    lea          top1q, [top1q+strideq*0]       ; top1 = top row 0 (no-op lea, kept for uniform shape)
    113    lea          top2q, [top1q+strideq*1]       ; top2 = top row 1
    114 %endif
    115 %if %1 == 4
    116    lea       stride3q, [strideq*3]
    117 %endif
    118 %endmacro
    119 
    120 %macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    121    mov             kd, 1                       ; tap index, counted down to 0 in .k_loop
    122    pxor           m15, m15                     ; sum
    123 %if %2 == 8
    124    pxor           m12, m12                     ; second sum accumulator (high byte halves)
    125 %if %1 == 4
    126    movd           xm4, [dstq +strideq*0]
    127    movd           xm6, [dstq +strideq*1]
    128    movd           xm5, [dstq +strideq*2]
    129    movd           xm7, [dstq +stride3q ]
    130    vinserti128     m4, [dst4q+strideq*0], 1
    131    vinserti128     m6, [dst4q+strideq*1], 1
    132    vinserti128     m5, [dst4q+strideq*2], 1
    133    vinserti128     m7, [dst4q+stride3q ], 1
    134    punpckldq       m4, m6
    135    punpckldq       m5, m7
    136 %else
    137    movq           xm4, [dstq+strideq*0]
    138    movq           xm5, [dstq+strideq*1]
    139    vinserti128     m4, [dstq+strideq*2], 1
    140    vinserti128     m5, [dstq+stride3q ], 1
    141 %endif
    142    punpcklqdq      m4, m5                      ; m4 = px (all rows of the block)
    143 %else
    144    movd           xm4, [dstq+strideq*0]
    145    movd           xm5, [dstq+strideq*1]
    146    vinserti128     m4, [dstq+strideq*2], 1
    147    vinserti128     m5, [dstq+stride3q ], 1
    148    punpckldq       m4, m5                      ; m4 = px (4x4 block)
    149 %endif
    150 %if %3 == 1
    151    mova            m7, m4                      ; running max (updated with pmaxub)
    152    mova            m8, m4                      ; running min (updated with pminub)
    153 %endif
    154 %endmacro
    155 
    156 %macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
    157                                 ; mul_tap, w, h, clip
    158    ; load p0/p1
    159    movsxd     dirjmpq, [dirq+kq*4+%1*2*4]      ; table-relative offset for this dir/k (%1 = 0/2/4 -> dir-2/+0/+2)
    160    add        dirjmpq, tableq
    161    call       dirjmpq                          ; .dXkY loader: p0 -> m5, p1 -> m6
    162 
    163 %if %8 == 1
    164    pmaxub          m7, m5                      ; fold p0/p1 into the running max/min for clipping
    165    pminub          m8, m5
    166    pmaxub          m7, m6
    167    pminub          m8, m6
    168 %endif
    169 
    170    ; accumulate sum[m15] over p0/p1
    171 %if %7 == 4
    172    punpcklbw       m5, m6                      ; interleave p0/p1 bytes
    173    punpcklbw       m6, m4, m4                  ; px duplicated to match the interleave
    174    psubusb         m9, m5, m6
    175    psubusb         m5, m6, m5
    176    por             m9, m5     ; abs_diff_p01(p01 - px)
    177    pcmpeqb         m5, m9     ; 0xff where px >= p01 (diff non-positive)
    178    por             m5, %5     ; remaining lanes forced positive (tap > 0)
    179    psignb          m6, %5, m5 ; m6 = +-tap carrying the sign of (p01 - px)
    180    psrlw           m5, m9, %2 ; emulate 8-bit shift
    181    pand            m5, %3     ; drop bits shifted in from the neighboring byte
    182    psubusb         m5, %4, m5 ; strength - (absdiff >> shift), saturating at 0
    183    pminub          m5, m9     ; constrain(diff) = min(above, absdiff)
    184    pmaddubsw       m5, m6     ; pairwise constrain * +-tap
    185    paddw          m15, m5
    186 %else
    187    psubusb         m9, m5, m4
    188    psubusb         m5, m4, m5
    189    psubusb        m11, m6, m4
    190    psubusb         m6, m4, m6
    191    por             m9, m5      ; abs_diff_p0(p0 - px)
    192    por            m11, m6      ; abs_diff_p1(p1 - px)
    193    pcmpeqb         m5, m9      ; 0xff where px >= p0
    194    pcmpeqb         m6, m11     ; 0xff where px >= p1
    195    punpckhbw      m10, m9, m11 ; abs diffs split into high/low interleaved halves
    196    punpcklbw       m9, m11
    197    por             m5, %5      ; sign masks or'd with the (positive) tap
    198    por            m11, m6, %5
    199    punpckhbw       m6, m5, m11
    200    punpcklbw       m5, m11
    201    psignb         m11, %5, m6  ; +-tap for the high half
    202    psrlw           m6, m10, %2 ; emulate 8-bit shift
    203    pand            m6, %3
    204    psubusb         m6, %4, m6  ; strength - (absdiff >> shift)
    205    pminub          m6, m10     ; constrain(diff), high half
    206    pmaddubsw       m6, m11
    207    paddw          m12, m6      ; high-half sum
    208    psignb         m11, %5, m5  ; +-tap for the low half
    209    psrlw           m5, m9, %2  ; emulate 8-bit shift
    210    pand            m5, %3
    211    psubusb         m5, %4, m5
    212    pminub          m5, m9      ; constrain(diff), low half
    213    pmaddubsw       m5, m11
    214    paddw          m15, m5      ; low-half sum
    215 %endif
    216 %endmacro
    217 
    218 %macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
    219 %if %2 == 4
    220 %if %5 == 1
    221    punpcklbw       m4, %3                      ; widen px to 16 bit for the clipped path
    222 %endif
    223    pcmpgtw         %3, m15                     ; -1 where sum < 0
    224    paddw          m15, %3                      ; bias negative sums so the shift rounds toward zero
    225    pmulhrsw       m15, %4                      ; (sum + 8) >> 4 via pw_2048
    226 %if %5 == 0
    227    packsswb       m15, m15
    228    paddb           m4, m15                     ; px += adjustment (byte domain)
    229 %else
    230    paddw           m4, m15
    231    packuswb        m4, m4 ; clip px in [0x0,0xff]
    232    pminub          m4, m7                      ; clamp to neighbor max
    233    pmaxub          m4, m8                      ; clamp to neighbor min
    234 %endif
    235    vextracti128   xm5, m4, 1
    236    movd   [dstq+strideq*0], xm4
    237    movd   [dstq+strideq*2], xm5
    238    pextrd [dstq+strideq*1], xm4, 1
    239    pextrd [dstq+stride3q ], xm5, 1
    240 %else
    241    pcmpgtw         m6, %3, m12                 ; -1 where high-half sum < 0
    242    pcmpgtw         m5, %3, m15                 ; -1 where low-half sum < 0
    243    paddw          m12, m6                      ; round-toward-zero bias
    244    paddw          m15, m5
    245 %if %5 == 1
    246    punpckhbw       m5, m4, %3                  ; widen px halves to 16 bit
    247    punpcklbw       m4, %3
    248 %endif
    249    pmulhrsw       m12, %4                      ; (sum + 8) >> 4
    250    pmulhrsw       m15, %4
    251 %if %5 == 0
    252    packsswb       m15, m12
    253    paddb           m4, m15                     ; px += adjustment (byte domain)
    254 %else
    255    paddw           m5, m12
    256    paddw           m4, m15
    257    packuswb        m4, m5 ; clip px in [0x0,0xff]
    258    pminub          m4, m7                      ; clamp to neighbor max
    259    pmaxub          m4, m8                      ; clamp to neighbor min
    260 %endif
    261    vextracti128   xm5, m4, 1
    262 %if %1 == 4
    263    movd   [dstq +strideq*0], xm4
    264    movd   [dst4q+strideq*0], xm5
    265    pextrd [dstq +strideq*1], xm4, 1
    266    pextrd [dst4q+strideq*1], xm5, 1
    267    pextrd [dstq +strideq*2], xm4, 2
    268    pextrd [dst4q+strideq*2], xm5, 2
    269    pextrd [dstq +stride3q ], xm4, 3
    270    pextrd [dst4q+stride3q ], xm5, 3
    271 %else
    272    movq   [dstq+strideq*0], xm4
    273    movq   [dstq+strideq*2], xm5
    274    movhps [dstq+strideq*1], xm4
    275    movhps [dstq+stride3q ], xm5
    276 %endif
    277 %endif
    278 %endmacro
    279 
    280 %macro BORDER_PREP_REGS 2 ; w, h
    281    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    282    mov           dird, r7m                     ; dir = 8th argument
    283    lea           dirq, [tableq+dirq*2+14]      ; dir offset bytes start 14 bytes into tap_table (8 masks + 6 weights)
    284 %if %1*%2*2/mmsize > 1
    285 %if %1 == 4
    286    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
    287 %else
    288    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
    289 %endif
    290    mov             hd, %1*%2*2/mmsize          ; number of register-sized chunks to process
    291 %else
    292    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
    293 %endif
    294    lea           stkq, [px]                    ; px: on-stack padded pixel buffer (defined elsewhere in this file)
    295    pxor           m11, m11                     ; zero, reused by BORDER_ADJUST_PIXEL
    296 %endmacro
    297 
    298 %macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    299    mov             kd, 1                       ; tap index, counted down to 0
    300 %if %1 == 4
    301    movq           xm4, [stkq+32*0]             ; gather 4 rows of px (32-byte row pitch)
    302    movhps         xm4, [stkq+32*1]
    303    movq           xm5, [stkq+32*2]
    304    movhps         xm5, [stkq+32*3]
    305    vinserti128     m4, xm5, 1
    306 %else
    307    mova           xm4, [stkq+32*0]             ; px
    308    vinserti128     m4, [stkq+32*1], 1
    309 %endif
    310    pxor           m15, m15                     ; sum
    311 %if %3 == 1
    312    mova            m7, m4                      ; max (updated with pmaxsw)
    313    mova            m8, m4                      ; min (updated with pminuw)
    314 %endif
    315 %endmacro
    316 
    317 %macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
    318                                 ; mul_tap, w, clip
    319    ; load p0/p1
    320    movsx         offq, byte [dirq+kq+%1]       ; off1, encoded y*16+x (32-byte rows, so *2 below is bytes)
    321 %if %6 == 4
    322    movq           xm5, [stkq+offq*2+32*0]      ; p0
    323    movq           xm6, [stkq+offq*2+32*2]
    324    movhps         xm5, [stkq+offq*2+32*1]
    325    movhps         xm6, [stkq+offq*2+32*3]
    326    vinserti128     m5, xm6, 1
    327 %else
    328    movu           xm5, [stkq+offq*2+32*0]      ; p0
    329    vinserti128     m5, [stkq+offq*2+32*1], 1
    330 %endif
    331    neg           offq                          ; -off1
    332 %if %6 == 4
    333    movq           xm6, [stkq+offq*2+32*0]      ; p1
    334    movq           xm9, [stkq+offq*2+32*2]
    335    movhps         xm6, [stkq+offq*2+32*1]
    336    movhps         xm9, [stkq+offq*2+32*3]
    337    vinserti128     m6, xm9, 1
    338 %else
    339    movu           xm6, [stkq+offq*2+32*0]      ; p1
    340    vinserti128     m6, [stkq+offq*2+32*1], 1
    341 %endif
    342 %if %7 == 1
    343    ; out of bounds values are set to a value that is both a large unsigned
    344    ; value and a negative signed value.
    345    ; use signed max and unsigned min to remove them
    346    pmaxsw          m7, m5                      ; max after p0
    347    pminuw          m8, m5                      ; min after p0
    348    pmaxsw          m7, m6                      ; max after p1
    349    pminuw          m8, m6                      ; min after p1
    350 %endif
    351 
    352    ; accumulate sum[m15] over p0/p1
    353    ; calculate difference before converting
    354    psubw           m5, m4                      ; diff_p0(p0 - px)
    355    psubw           m6, m4                      ; diff_p1(p1 - px)
    356 
    357    ; convert to 8-bits with signed saturation
    358    ; saturating to large diffs has no impact on the results
    359    packsswb        m5, m6
    360 
    361    ; group into pairs so we can accumulate using maddubsw
    362    pshufb          m5, m12                     ; m12: pair shuffle set up by caller (presumably shufb_lohi - confirm)
    363    pabsb           m9, m5                      ; |diff|
    364    psignb         m10, %5, m5                 ; +-tap carrying the sign of diff
    365    psrlw           m5, m9, %2                  ; emulate 8-bit shift
    366    pand            m5, %3
    367    psubusb         m5, %4, m5                  ; strength - (|diff| >> shift), saturating at 0
    368 
    369    ; use unsigned min since abs diff can equal 0x80
    370    pminub          m5, m9                      ; constrain(diff)
    371    pmaddubsw       m5, m10
    372    paddw          m15, m5
    373 %endmacro
    374 
    375 %macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
    376    pcmpgtw         m9, m11, m15                ; -1 where sum < 0 (m11 = zero)
    377    paddw          m15, m9                      ; bias so the shift rounds toward zero
    378    pmulhrsw       m15, %2                      ; (sum + 8) >> 4 via pw_2048
    379    paddw           m4, m15                     ; px += adjustment
    380 %if %3 == 1
    381    pminsw          m4, m7                      ; clamp to neighbor max
    382    pmaxsw          m4, m8                      ; clamp to neighbor min
    383 %endif
    384    packuswb        m4, m4
    385    vextracti128   xm5, m4, 1
    386 %if %1 == 4
    387    movd   [dstq+strideq*0], xm4
    388    pextrd [dstq+strideq*1], xm4, 1
    389    movd   [dstq+strideq*2], xm5
    390    pextrd [dstq+stride3q ], xm5, 1
    391 %else
    392    movq [dstq+strideq*0], xm4
    393    movq [dstq+strideq*1], xm5
    394 %endif
    395 %endmacro
    396 
    397 %macro CDEF_FILTER 2 ; w, h
    398 INIT_YMM avx2
    399 cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
    400                                          pri, sec, dir, damping, edge
    401    mov          edged, edgem
    402    cmp          edged, 0xf
    403    jne .border_block
    404 
    405    PUSH           r11
    406    PUSH           r12
    407 %if %2 == 4
    408 %assign regs_used 13
    409    ALLOC_STACK   0x60, 16
    410    pmovzxbw       xm0, [leftq+1]
    411    vpermq          m0, m0, q0110
    412    psrldq          m1, m0, 4
    413    vpalignr        m2, m0, m0, 12
    414    movu    [rsp+0x10], m0
    415    movu    [rsp+0x28], m1
    416    movu    [rsp+0x40], m2
    417 %elif %1 == 4
    418 %assign regs_used 14
    419    PUSH           r13
    420    ALLOC_STACK 8*2+%1*%2*1, 16
    421    pmovzxwd        m0, [leftq]
    422    mova    [rsp+0x10], m0
    423 %else
    424 %assign regs_used 15
    425    PUSH           r13
    426    PUSH           r14
    427    ALLOC_STACK 8*4+%1*%2*2+32, 16
    428    lea            r11, [strideq*3]
    429    movu           xm4, [dstq+strideq*2]
    430    pmovzxwq        m0, [leftq+0]
    431    pmovzxwq        m1, [leftq+8]
    432    vinserti128     m4, [dstq+r11], 1
    433    pmovzxbd        m2, [leftq+1]
    434    pmovzxbd        m3, [leftq+9]
    435    mov       [rsp+16], botq
    436    mova    [rsp+0x20], m0
    437    mova    [rsp+0x40], m1
    438    mova    [rsp+0x60], m2
    439    mova    [rsp+0x80], m3
    440    mova    [rsp+0xa0], m4
    441    lea           botq, [dstq+strideq*4]
    442 %endif
    443 
    444 DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
    445    mov       dampingd, r8m
    446    xor          zerod, zerod
    447    movifnidn     prid, prim
    448    sub       dampingd, 31
    449    movifnidn  secdmpd, secdmpm
    450    test          prid, prid
    451    jz .sec_only
    452    movd           xm0, prid
    453    lzcnt      pridmpd, prid
    454    add        pridmpd, dampingd
    455    cmovs      pridmpd, zerod
    456    mov        [rsp+0], pridmpq                 ; pri_shift
    457    test       secdmpd, secdmpd
    458    jz .pri_only
    459    movd           xm1, secdmpd
    460    lzcnt      secdmpd, secdmpd
    461    add        secdmpd, dampingd
    462    mov        [rsp+8], secdmpq                 ; sec_shift
    463 
    464 DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
    465    lea         tableq, [tap_table]
    466    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
    467    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
    468 
    469    ; pri/sec_taps[k] [4 total]
    470 DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir
    471    vpbroadcastb    m0, xm0                     ; pri_strength
    472    vpbroadcastb    m1, xm1                     ; sec_strength
    473    and           prid, 1
    474    lea           priq, [tableq+priq*2+8]       ; pri_taps
    475    lea           secq, [tableq+12]             ; sec_taps
    476 
    477    PREP_REGS       %1, %2
    478 %if %1*%2 > mmsize
    479 .v_loop:
    480 %endif
    481    LOAD_BLOCK      %1, %2, 1
    482 .k_loop:
    483    vpbroadcastb    m2, [priq+kq]                          ; pri_taps
    484    vpbroadcastb    m3, [secq+kq]                          ; sec_taps
    485    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
    486    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
    487    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
    488    dec             kq
    489    jge .k_loop
    490 
    491    vpbroadcastd   m10, [pw_2048]
    492    pxor            m9, m9
    493    ADJUST_PIXEL    %1, %2, m9, m10, 1
    494 %if %1*%2 > mmsize
    495    lea           dstq, [dstq+strideq*4]
    496    lea          top1q, [rsp+0xa0]
    497    lea          top2q, [rsp+0xb0]
    498    mov           botq, [rsp+16]
    499    add             hq, 4
    500    jl .v_loop
    501 %endif
    502    RET
    503 
    504 .pri_only:
    505 DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp
    506    lea         tableq, [tap_table]
    507    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
    508    ; pri/sec_taps[k] [4 total]
    509 DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir
    510    vpbroadcastb    m0, xm0                     ; pri_strength
    511    and           prid, 1
    512    lea           priq, [tableq+priq*2+8]       ; pri_taps
    513    PREP_REGS       %1, %2
    514    vpbroadcastd    m3, [pw_2048]
    515    pxor            m1, m1
    516 %if %1*%2 > mmsize
    517 .pri_v_loop:
    518 %endif
    519    LOAD_BLOCK      %1, %2
    520 .pri_k_loop:
    521    vpbroadcastb    m2, [priq+kq]                       ; pri_taps
    522    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
    523    dec             kq
    524    jge .pri_k_loop
    525    ADJUST_PIXEL    %1, %2, m1, m3
    526 %if %1*%2 > mmsize
    527    lea           dstq, [dstq+strideq*4]
    528    lea          top1q, [rsp+0xa0]
    529    lea          top2q, [rsp+0xb0]
    530    mov           botq, [rsp+16]
    531    add             hq, 4
    532    jl .pri_v_loop
    533 %endif
    534    RET
    535 
    536 .sec_only:
    537 DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping
    538    movd           xm1, secdmpd
    539    lzcnt      secdmpd, secdmpd
    540    add        secdmpd, dampingd
    541    mov        [rsp+8], secdmpq                 ; sec_shift
    542 DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table
    543    lea         tableq, [tap_table]
    544    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
    545    ; pri/sec_taps[k] [4 total]
    546 DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir
    547    vpbroadcastb    m1, xm1                     ; sec_strength
    548    lea           secq, [tableq+12]             ; sec_taps
    549    PREP_REGS       %1, %2
    550    vpbroadcastd    m2, [pw_2048]
    551    pxor            m0, m0
    552 %if %1*%2 > mmsize
    553 .sec_v_loop:
    554 %endif
    555    LOAD_BLOCK      %1, %2
    556 .sec_k_loop:
    557    vpbroadcastb    m3, [secq+kq]                       ; sec_taps
    558    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
    559    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
    560    dec             kq
    561    jge .sec_k_loop
    562    ADJUST_PIXEL    %1, %2, m0, m2
    563 %if %1*%2 > mmsize
    564    lea           dstq, [dstq+strideq*4]
    565    lea          top1q, [rsp+0xa0]
    566    lea          top2q, [rsp+0xb0]
    567    mov           botq, [rsp+16]
    568    add             hq, 4
    569    jl .sec_v_loop
    570 %endif
    571    RET
    572 
    573 .d0k0:
    574 %if %1 == 4
    575 %if %2 == 4
    576    vpbroadcastq    m6, [dstq+strideq*1-1]
    577    vpbroadcastq   m10, [dstq+strideq*2-1]
    578    movd           xm5, [topq+strideq*1+1]
    579    movd           xm9, [dstq+strideq*0+1]
    580    psrldq         m11, m6, 2
    581    psrldq         m12, m10, 2
    582    vinserti128     m6, [dstq+stride3q -1], 1
    583    vinserti128    m10, [botq          -1], 1
    584    vpblendd        m5, m11, 0x10
    585    vpblendd        m9, m12, 0x10
    586    movu           m11, [blend_4x4+16]
    587    punpckldq       m6, m10
    588    punpckldq       m5, m9
    589    vpblendvb       m6, [rsp+gprsize+0x28], m11
    590 %else
    591    movd           xm5, [topq +strideq*1+1]
    592    movq           xm6, [dstq +strideq*1-1]
    593    movq          xm10, [dstq +stride3q -1]
    594    movq          xm11, [dst4q+strideq*1-1]
    595    pinsrd         xm5, [dstq +strideq*0+1], 1
    596    movhps         xm6, [dstq +strideq*2-1]
    597    movhps        xm10, [dst4q+strideq*0-1]
    598    movhps        xm11, [dst4q+strideq*2-1]
    599    psrldq         xm9, xm6, 2
    600    shufps         xm5, xm9, q2010   ; -1 +0 +1 +2
    601    shufps         xm6, xm10, q2020  ; +1 +2 +3 +4
    602    psrldq         xm9, xm11, 2
    603    psrldq        xm10, 2
    604    shufps        xm10, xm9, q2020   ; +3 +4 +5 +6
    605    movd           xm9, [dst4q+stride3q -1]
    606    pinsrd         xm9, [botq           -1], 1
    607    shufps        xm11, xm9, q1020   ; +5 +6 +7 +8
    608    pmovzxbw        m9, [leftq+3]
    609    vinserti128     m6, xm11, 1
    610    movu           m11, [blend_4x8_0+4]
    611    vinserti128     m5, xm10, 1
    612    vpblendvb       m6, m9, m11
    613 %endif
    614 %else
    615    lea            r13, [blend_8x8_0+16]
    616    movq           xm5, [top2q         +1]
    617    vbroadcasti128 m10, [dstq+strideq*1-1]
    618    vbroadcasti128 m11, [dstq+strideq*2-1]
    619    movhps         xm5, [dstq+strideq*0+1]
    620    vinserti128     m6, m10, [dstq+stride3q-1], 1
    621    vinserti128     m9, m11, [botq         -1], 1
    622    psrldq         m10, 2
    623    psrldq         m11, 2
    624    punpcklqdq      m6, m9
    625    movu            m9, [r13+hq*2*1+16*1]
    626    punpcklqdq     m10, m11
    627    vpblendd        m5, m10, 0xF0
    628    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9
    629 %endif
    630    ret
    631 .d1k0:
    632 .d2k0:
    633 .d3k0:
    634 %if %1 == 4
    635 %if %2 == 4
    636    movq           xm6, [dstq+strideq*0-1]
    637    movq           xm9, [dstq+strideq*1-1]
    638    vinserti128     m6, [dstq+strideq*2-1], 1
    639    vinserti128     m9, [dstq+stride3q -1], 1
    640    movu           m11, [rsp+gprsize+0x10]
    641    pcmpeqd        m12, m12
    642    psrldq          m5, m6, 2
    643    psrldq         m10, m9, 2
    644    psrld          m12, 24
    645    punpckldq       m6, m9
    646    punpckldq       m5, m10
    647    vpblendvb       m6, m11, m12
    648 %else
    649    movq           xm6, [dstq +strideq*0-1]
    650    movq           xm9, [dstq +strideq*2-1]
    651    movhps         xm6, [dstq +strideq*1-1]
    652    movhps         xm9, [dstq +stride3q -1]
    653    movq          xm10, [dst4q+strideq*0-1]
    654    movhps        xm10, [dst4q+strideq*1-1]
    655    psrldq         xm5, xm6, 2
    656    psrldq        xm11, xm9, 2
    657    shufps         xm5, xm11, q2020
    658    movq          xm11, [dst4q+strideq*2-1]
    659    movhps        xm11, [dst4q+stride3q -1]
    660    shufps         xm6, xm9, q2020
    661    shufps         xm9, xm10, xm11, q2020
    662    vinserti128     m6, xm9, 1
    663    pmovzxbw        m9, [leftq+1]
    664    psrldq        xm10, 2
    665    psrldq        xm11, 2
    666    shufps        xm10, xm11, q2020
    667    vpbroadcastd   m11, [blend_4x8_0+4]
    668    vinserti128     m5, xm10, 1
    669    vpblendvb       m6, m9, m11
    670 %endif
    671 %else
    672    movu           xm5, [dstq+strideq*0-1]
    673    movu           xm9, [dstq+strideq*1-1]
    674    vinserti128     m5, [dstq+strideq*2-1], 1
    675    vinserti128     m9, [dstq+stride3q -1], 1
    676    movu           m10, [blend_8x8_0+16]
    677    punpcklqdq      m6, m5, m9
    678    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64], m10
    679    psrldq          m5, 2
    680    psrldq          m9, 2
    681    punpcklqdq      m5, m9
    682 %endif
    683    ret
    684 .d4k0:
    685 %if %1 == 4
    686 %if %2 == 4
    687    vpbroadcastq   m10, [dstq+strideq*1-1]
    688    vpbroadcastq   m11, [dstq+strideq*2-1]
    689    movd           xm6, [topq+strideq*1-1]
    690    movd           xm9, [dstq+strideq*0-1]
    691    psrldq          m5, m10, 2
    692    psrldq         m12, m11, 2
    693    vpblendd        m6, m10, 0x10
    694    vpblendd        m9, m11, 0x10
    695    movu           m10, [blend_4x4]
    696    vinserti128     m5, [dstq+stride3q +1], 1
    697    vinserti128    m12, [botq          +1], 1
    698    punpckldq       m6, m9
    699    punpckldq       m5, m12
    700    vpblendvb       m6, [rsp+gprsize+0x40], m10
    701 %else
    702    movd           xm6, [topq +strideq*1-1]
    703    movq           xm9, [dstq +strideq*1-1]
    704    movq          xm10, [dstq +stride3q -1]
    705    movq          xm11, [dst4q+strideq*1-1]
    706    pinsrd         xm6, [dstq +strideq*0-1], 1
    707    movhps         xm9, [dstq +strideq*2-1]
    708    movhps        xm10, [dst4q+strideq*0-1]
    709    movhps        xm11, [dst4q+strideq*2-1]
    710    psrldq         xm5, xm9, 2
    711    shufps         xm6, xm9, q2010
    712    psrldq         xm9, xm10, 2
    713    shufps         xm5, xm9, q2020
    714    shufps        xm10, xm11, q2020
    715    movd           xm9, [dst4q+stride3q +1]
    716    vinserti128     m6, xm10, 1
    717    pinsrd         xm9, [botq           +1], 1
    718    psrldq        xm11, 2
    719    pmovzxbw       m10, [leftq-1]
    720    shufps        xm11, xm9, q1020
    721    movu            m9, [blend_4x8_0]
    722    vinserti128     m5, xm11, 1
    723    vpblendvb       m6, m10, m9
    724 %endif
    725 %else
    726    lea            r13, [blend_8x8_0+8]
    727    movq           xm6, [top2q         -1]
    728    vbroadcasti128  m5, [dstq+strideq*1-1]
    729    vbroadcasti128  m9, [dstq+strideq*2-1]
    730    movhps         xm6, [dstq+strideq*0-1]
    731    movu           m11, [r13+hq*2*1+16*1]
    732    punpcklqdq     m10, m5, m9
    733    vinserti128     m5, [dstq+stride3q -1], 1
    734    vinserti128     m9, [botq          -1], 1
    735    vpblendd        m6, m10, 0xF0
    736    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11
    737    psrldq          m5, 2
    738    psrldq          m9, 2
    739    punpcklqdq      m5, m9
    740 %endif
    741    ret
    742 .d5k0:
    743 .d6k0:
    744 .d7k0:
    745 %if %1 == 4
    746 %if %2 == 4
    747    movd           xm6, [topq+strideq*1  ]
    748    vpbroadcastd    m5, [dstq+strideq*1  ]
    749    vpbroadcastd    m9, [dstq+strideq*2  ]
    750    vpblendd       xm6, [dstq+strideq*0-4], 0x2
    751    vpblendd        m5, m9, 0x22
    752    vpblendd        m6, m5, 0x30
    753    vinserti128     m5, [dstq+stride3q   ], 1
    754    vpblendd        m5, [botq         -20], 0x20
    755 %else
    756    movd           xm6, [topq +strideq*1]
    757    movd           xm5, [dstq +strideq*1]
    758    movd           xm9, [dstq +stride3q ]
    759    movd          xm10, [dst4q+strideq*1]
    760    movd          xm11, [dst4q+stride3q ]
    761    pinsrd         xm6, [dstq +strideq*0], 1
    762    pinsrd         xm5, [dstq +strideq*2], 1
    763    pinsrd         xm9, [dst4q+strideq*0], 1
    764    pinsrd        xm10, [dst4q+strideq*2], 1
    765    pinsrd        xm11, [botq           ], 1
    766    punpcklqdq     xm6, xm5
    767    punpcklqdq     xm5, xm9
    768    punpcklqdq     xm9, xm10
    769    punpcklqdq    xm10, xm11
    770    vinserti128     m6, xm9, 1
    771    vinserti128     m5, xm10, 1
    772 %endif
    773 %else
    774    movq           xm6, [top2q         ]
    775    movq           xm5, [dstq+strideq*1]
    776    movq           xm9, [dstq+stride3q ]
    777    movhps         xm6, [dstq+strideq*0]
    778    movhps         xm5, [dstq+strideq*2]
    779    movhps         xm9, [botq          ]
    780    vinserti128     m6, xm5, 1
    781    vinserti128     m5, xm9, 1
    782 %endif
    783    ret
    784 .d0k1:
           ; One entry of the direction dispatch (tap offset k=1).  Packs two
           ; rows of neighbor pixels into m5/m6 at a +/-2 byte horizontal
           ; offset, pulling out-of-block pixels from the top (topq/top1q/
           ; top2q), bottom (botq) and left (leftq) edge buffers.
           ; NOTE(review): the direction<->offset mapping is presumed from the
           ; .dNk1 naming -- confirm against the jump table built earlier.
    785 %if %1 == 4
    786 %if %2 == 4
           ; 4x4 block: one dword per row, rows paired via punpckldq.
    787    movd           xm6, [dstq+strideq*2-2]
    788    movd           xm9, [dstq+stride3q -2]
    789    movd           xm5, [topq+strideq*0+2]
    790    movd          xm10, [topq+strideq*1+2]
    791    pinsrw         xm6, [leftq+4], 0
    792    pinsrw         xm9, [leftq+6], 0
    793    vinserti128     m5, [dstq+strideq*0+2], 1
    794    vinserti128    m10, [dstq+strideq*1+2], 1
    795    vinserti128     m6, [botq+strideq*0-2], 1
    796    vinserti128     m9, [botq+strideq*1-2], 1
    797    punpckldq       m5, m10
    798    punpckldq       m6, m9
    799 %else
           ; 4x8 block: rows 4-7 addressed via dst4q; left-edge bytes are
           ; merged in from the stack spill with a blend_4x8_3 mask.
    800    movq           xm6, [dstq +strideq*2-2]
    801    movd          xm10, [dst4q+strideq*2-2]
    802    movd           xm5, [topq +strideq*0+2]
    803    movq           xm9, [dst4q+strideq*0-2]
    804    movhps         xm6, [dstq +stride3q -2]
    805    pinsrw        xm10, [dst4q+stride3q   ], 3
    806    pinsrd         xm5, [topq +strideq*1+2], 1
    807    movhps         xm9, [dst4q+strideq*1-2]
    808    pinsrd        xm10, [botq +strideq*0-2], 2
    809    pinsrd         xm5, [dstq +strideq*0+2], 2
    810    pinsrd        xm10, [botq +strideq*1-2], 3
    811    pinsrd         xm5, [dstq +strideq*1+2], 3
    812    shufps        xm11, xm6, xm9, q3131
    813    shufps         xm6, xm9, q2020
    814    movu            m9, [blend_4x8_3+8]
    815    vinserti128     m6, xm10, 1
    816    vinserti128     m5, xm11, 1
    817    vpblendvb       m6, [rsp+gprsize+0x10+8], m9
    818 %endif
    819 %else
           ; 8x8 block: per-row (hq-indexed) blend masks select between the
           ; stack-spilled edge copy and in-block pixels.
    820    lea            r13, [blend_8x8_1+16]
    821    movq           xm6, [dstq+strideq*2-2]
    822    movq           xm9, [dstq+stride3q -2]
    823    movq           xm5, [top1q         +2]
    824    movq          xm10, [top2q         +2]
    825    movu           m11, [r13+hq*2*2+16*2]
    826    vinserti128     m6, [botq+strideq*0-2], 1
    827    vinserti128     m9, [botq+strideq*1-2], 1
    828    vinserti128     m5, [dstq+strideq*0+2], 1
    829    vinserti128    m10, [dstq+strideq*1+2], 1
    830    punpcklqdq      m6, m9
    831    punpcklqdq      m5, m10
    832    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11
    833 %endif
    834    ret
    835 .d1k1:
           ; Neighbor gather for the next tap direction (k=1): pack two tap
           ; rows into m5/m6, edge pixels from topq/botq/leftq or the stack
           ; spill of the left edge.
    836 %if %1 == 4
    837 %if %2 == 4
           ; 4x4: m12 is built as a per-word mask (all-ones >> 16 per dword)
           ; so vpblendvb substitutes the left-edge words into lane 0.
    838    vpbroadcastq    m6, [dstq+strideq*1-2]
    839    vpbroadcastq    m9, [dstq+strideq*2-2]
    840    movd           xm5, [topq+strideq*1+2]
    841    movd          xm10, [dstq+strideq*0+2]
    842    psrldq         m11, m6, 4
    843    psrldq         m12, m9, 4
    844    vpblendd        m5, m11, 0x10
    845    movq          xm11, [leftq+2]
    846    vinserti128     m6, [dstq+stride3q-2], 1
    847    punpckldq     xm11, xm11
    848    vpblendd       m10, m12, 0x10
    849    pcmpeqd        m12, m12
    850    pmovzxwd       m11, xm11
    851    psrld          m12, 16
    852    punpckldq       m6, m9
    853    vpbroadcastd    m9, [botq-2]
    854    vpblendvb       m6, m11, m12
    855    punpckldq       m5, m10
    856    vpblendd        m6, m9, 0x20
    857 %else
           ; 4x8: rows 4-7 via dst4q, left edge blended from the stack spill.
    858    movd           xm5, [topq +strideq*1+2]
    859    movq           xm6, [dstq +strideq*1-2]
    860    movq           xm9, [dstq +stride3q -2]
    861    movq          xm10, [dst4q+strideq*1-2]
    862    movd          xm11, [dst4q+stride3q -2]
    863    pinsrd         xm5, [dstq +strideq*0+2], 1
    864    movhps         xm6, [dstq +strideq*2-2]
    865    movhps         xm9, [dst4q+strideq*0-2]
    866    movhps        xm10, [dst4q+strideq*2-2]
    867    pinsrd        xm11, [botq           -2], 1
    868    shufps         xm5, xm6, q3110
    869    shufps         xm6, xm9, q2020
    870    shufps         xm9, xm10, q3131
    871    shufps        xm10, xm11, q1020
    872    movu           m11, [blend_4x8_2+4]
    873    vinserti128     m6, xm10, 1
    874    vinserti128     m5, xm9, 1
    875    vpblendvb       m6, [rsp+gprsize+0x10+4], m11
    876 %endif
    877 %else
           ; 8x8: hq-indexed blend mask row from blend_8x8_1.
    878    lea            r13, [blend_8x8_1+16]
    879    movq           xm5, [top2q         +2]
    880    vbroadcasti128  m6, [dstq+strideq*1-2]
    881    vbroadcasti128  m9, [dstq+strideq*2-2]
    882    movhps         xm5, [dstq+strideq*0+2]
    883    shufps         m10, m6, m9, q2121
    884    vinserti128     m6, [dstq+stride3q -2], 1
    885    vinserti128     m9, [botq          -2], 1
    886    movu           m11, [r13+hq*2*1+16*1]
    887    vpblendd        m5, m10, 0xF0
    888    punpcklqdq      m6, m9
    889    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11
    890 %endif
    891    ret
    892 .d2k1:
           ; Horizontal tap direction (k=1): both tap rows come from the same
           ; dst rows at -2/+2 byte offsets, so no top/bottom edge buffers are
           ; needed -- only the left edge (leftq / stack spill) is blended in.
    893 %if %1 == 4
    894 %if %2 == 4
    895    movq          xm11, [leftq]
    896    movq           xm6, [dstq+strideq*0-2]
    897    movq           xm9, [dstq+strideq*1-2]
    898    vinserti128     m6, [dstq+strideq*2-2], 1
    899    vinserti128     m9, [dstq+stride3q -2], 1
    900    punpckldq     xm11, xm11
    901    psrldq          m5, m6, 4
    902    psrldq         m10, m9, 4
    903    pmovzxwd       m11, xm11
    904    punpckldq       m6, m9
    905    punpckldq       m5, m10
           ; replace the first word of every row pair with left-edge pixels
    906    pblendw         m6, m11, 0x05
    907 %else
    908    movq           xm5, [dstq +strideq*0-2]
    909    movq           xm9, [dstq +strideq*2-2]
    910    movq          xm10, [dst4q+strideq*0-2]
    911    movq          xm11, [dst4q+strideq*2-2]
    912    movhps         xm5, [dstq +strideq*1-2]
    913    movhps         xm9, [dstq +stride3q -2]
    914    movhps        xm10, [dst4q+strideq*1-2]
    915    movhps        xm11, [dst4q+stride3q -2]
    916    shufps         xm6, xm5, xm9, q2020
    917    shufps         xm5, xm9, q3131
    918    shufps         xm9, xm10, xm11, q2020
    919    shufps        xm10, xm11, q3131
    920    pmovzxwd       m11, [leftq]
    921    vinserti128     m6, xm9, 1
    922    vinserti128     m5, xm10, 1
    923    pblendw         m6, m11, 0x55
    924 %endif
    925 %else
           ; 8x8: left-edge words come from the hq-indexed stack spill.
    926    mova           m11, [rsp+gprsize+0x20+hq*8+64]
    927    movu           xm5, [dstq+strideq*0-2]
    928    movu           xm9, [dstq+strideq*1-2]
    929    vinserti128     m5, [dstq+strideq*2-2], 1
    930    vinserti128     m9, [dstq+stride3q -2], 1
    931    shufps          m6, m5, m9, q1010
    932    shufps          m5, m9, q2121
    933    pblendw         m6, m11, 0x11
    934 %endif
    935    ret
    936 .d3k1:
           ; Diagonal tap direction (k=1): one tap row at -2 (upper-left),
           ; the mirrored row at +2 (lower-right); edge pixels from topq,
           ; botq and the left-edge buffer / its stack spill.
    937 %if %1 == 4
    938 %if %2 == 4
    939    vpbroadcastq   m11, [dstq+strideq*1-2]
    940    vpbroadcastq   m12, [dstq+strideq*2-2]
    941    movd           xm6, [topq+strideq*1-2]
    942    movd           xm9, [dstq+strideq*0-2]
           ; lane-0 word of each row is replaced by a left-edge pixel
    943    pblendw        m11, [leftq-16+2], 0x01
    944    pblendw        m12, [leftq-16+4], 0x01
    945    pinsrw         xm9, [leftq- 0+0], 0
    946    psrldq          m5, m11, 4
    947    psrldq         m10, m12, 4
    948    vinserti128     m5, [dstq+stride3q +2], 1
    949    vinserti128    m10, [botq          +2], 1
    950    vpblendd        m6, m11, 0x10
    951    vpblendd        m9, m12, 0x10
    952    punpckldq       m6, m9
    953    punpckldq       m5, m10
    954 %else
    955    movd           xm6, [topq +strideq*1-2]
    956    movq           xm5, [dstq +strideq*1-2]
    957    movq           xm9, [dstq +stride3q -2]
    958    movq          xm10, [dst4q+strideq*1-2]
    959    movd          xm11, [dst4q+stride3q +2]
    960    pinsrw         xm6, [dstq +strideq*0  ], 3
    961    movhps         xm5, [dstq +strideq*2-2]
    962    movhps         xm9, [dst4q+strideq*0-2]
    963    movhps        xm10, [dst4q+strideq*2-2]
    964    pinsrd        xm11, [botq           +2], 1
    965    shufps         xm6, xm5, q2010
    966    shufps         xm5, xm9, q3131
    967    shufps         xm9, xm10, q2020
    968    shufps        xm10, xm11, q1031
    969    movu           m11, [blend_4x8_2]
    970    vinserti128     m6, xm9, 1
    971    vinserti128     m5, xm10, 1
    972    vpblendvb       m6, [rsp+gprsize+0x10-4], m11
    973 %endif
    974 %else
           ; 8x8: hq-indexed blend mask row from blend_8x8_1.
    975    lea            r13, [blend_8x8_1+8]
    976    movq           xm6, [top2q         -2]
    977    vbroadcasti128  m5, [dstq+strideq*1-2]
    978    vbroadcasti128 m10, [dstq+strideq*2-2]
    979    movhps         xm6, [dstq+strideq*0-2]
    980    punpcklqdq      m9, m5, m10
    981    vinserti128     m5, [dstq+stride3q -2], 1
    982    vinserti128    m10, [botq          -2], 1
    983    movu           m11, [r13+hq*2*1+16*1]
    984    vpblendd        m6, m9, 0xF0
    985    shufps          m5, m10, q2121
    986    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11
    987 %endif
    988    ret
    989 .d4k1:
           ; Diagonal tap direction (k=1), offsets -2 (above/left) and +2
           ; (below/right); m6/m9 low lanes are filled from topq via vpblendd,
           ; left-edge words via pblendw from leftq.
    990 %if %1 == 4
    991 %if %2 == 4
    992    vinserti128     m6, [dstq+strideq*0-2], 1
    993    vinserti128     m9, [dstq+strideq*1-2], 1
    994    movd           xm5, [dstq+strideq*2+2]
    995    movd          xm10, [dstq+stride3q +2]
    996    pblendw         m6, [leftq-16+0], 0x01
    997    pblendw         m9, [leftq-16+2], 0x01
    998    vinserti128     m5, [botq+strideq*0+2], 1
    999    vinserti128    m10, [botq+strideq*1+2], 1
   1000    vpblendd        m6, [topq+strideq*0-2], 0x01
   1001    vpblendd        m9, [topq+strideq*1-2], 0x01
   1002    punpckldq       m5, m10
   1003    punpckldq       m6, m9
   1004 %else
   1005    movd           xm6, [topq +strideq*0-2]
   1006    movq           xm5, [dstq +strideq*2-2]
   1007    movq           xm9, [dst4q+strideq*0-2]
   1008    movd          xm10, [dst4q+strideq*2+2]
   1009    pinsrd         xm6, [topq +strideq*1-2], 1
   1010    movhps         xm5, [dstq +stride3q -2]
   1011    movhps         xm9, [dst4q+strideq*1-2]
   1012    pinsrd        xm10, [dst4q+stride3q +2], 1
   1013    pinsrd         xm6, [dstq +strideq*0-2], 2
   1014    pinsrd        xm10, [botq +strideq*0+2], 2
   1015    pinsrd         xm6, [dstq +strideq*1-2], 3
   1016    pinsrd        xm10, [botq +strideq*1+2], 3
   1017    shufps        xm11, xm5, xm9, q2020
   1018    shufps         xm5, xm9, q3131
   1019    movu            m9, [blend_4x8_3]
   1020    vinserti128     m6, xm11, 1
   1021    vinserti128     m5, xm10, 1
           ; merge left-edge bytes from the stack spill
   1022    vpblendvb       m6, [rsp+gprsize+0x10-8], m9
   1023 %endif
   1024 %else
           ; 8x8: top rows from top1q/top2q, bottom rows from botq.
   1025    lea            r13, [blend_8x8_1]
   1026    movu           m11, [r13+hq*2*2+16*2]
   1027    movq           xm6, [top1q         -2]
   1028    movq           xm9, [top2q         -2]
   1029    movq           xm5, [dstq+strideq*2+2]
   1030    movq          xm10, [dstq+stride3q +2]
   1031    vinserti128     m6, [dstq+strideq*0-2], 1
   1032    vinserti128     m9, [dstq+strideq*1-2], 1
   1033    vinserti128     m5, [botq+strideq*0+2], 1
   1034    vinserti128    m10, [botq+strideq*1+2], 1
   1035    punpcklqdq      m6, m9
   1036    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11
   1037    punpcklqdq      m5, m10
   1038 %endif
   1039    ret
   1040 .d5k1:
           ; Near-diagonal tap direction (k=1), horizontal offset of only
           ; +/-1 byte; left-edge bytes arrive via pmovzxbw from leftq and a
           ; computed byte mask (m12) or a blend_4x8_1 table mask.
   1041 %if %1 == 4
   1042 %if %2 == 4
   1043    movd           xm6, [topq+strideq*0-1]
   1044    movd           xm9, [topq+strideq*1-1]
   1045    movd           xm5, [dstq+strideq*2+1]
   1046    movd          xm10, [dstq+stride3q +1]
           ; m12 = per-dword 0x000000FF byte mask, cleared in the low half
   1047    pcmpeqd        m12, m12
   1048    pmovzxbw       m11, [leftq-8+1]
   1049    psrld          m12, 24
   1050    vinserti128     m6, [dstq+strideq*0-1], 1
   1051    vinserti128     m9, [dstq+strideq*1-1], 1
   1052    vinserti128     m5, [botq+strideq*0+1], 1
   1053    vinserti128    m10, [botq+strideq*1+1], 1
   1054    punpckldq       m6, m9
   1055    pxor            m9, m9
   1056    vpblendd       m12, m9, 0x0F
   1057    punpckldq       m5, m10
   1058    vpblendvb       m6, m11, m12
   1059 %else
   1060    movd           xm6, [topq +strideq*0-1]
   1061    movq           xm5, [dstq +strideq*2-1]
   1062    movq           xm9, [dst4q+strideq*0-1]
   1063    movd          xm10, [dst4q+strideq*2+1]
   1064    pinsrd         xm6, [topq +strideq*1-1], 1
   1065    movhps         xm5, [dstq +stride3q -1]
   1066    movhps         xm9, [dst4q+strideq*1-1]
   1067    pinsrd        xm10, [dst4q+stride3q +1], 1
   1068    pinsrd         xm6, [dstq +strideq*0-1], 2
   1069    pinsrd        xm10, [botq +strideq*0+1], 2
   1070    pinsrd         xm6, [dstq +strideq*1-1], 3
   1071    pinsrd        xm10, [botq +strideq*1+1], 3
   1072    shufps        xm11, xm5, xm9, q2020
   1073    vinserti128     m6, xm11, 1
   1074    pmovzxbw       m11, [leftq-3]
   1075    psrldq         xm5, 2
   1076    psrldq         xm9, 2
   1077    shufps         xm5, xm9, q2020
   1078    movu            m9, [blend_4x8_1]
   1079    vinserti128     m5, xm10, 1
   1080    vpblendvb       m6, m11, m9
   1081 %endif
   1082 %else
           ; 8x8: note the stack spill here is at offset 0x60 (byte copy),
           ; unlike the 0x20 spill used by the +/-2-offset directions.
   1083    lea            r13, [blend_8x8_0]
   1084    movu           m11, [r13+hq*2*2+16*2]
   1085    movq           xm6, [top1q         -1]
   1086    movq           xm9, [top2q         -1]
   1087    movq           xm5, [dstq+strideq*2+1]
   1088    movq          xm10, [dstq+stride3q +1]
   1089    vinserti128     m6, [dstq+strideq*0-1], 1
   1090    vinserti128     m9, [dstq+strideq*1-1], 1
   1091    vinserti128     m5, [botq+strideq*0+1], 1
   1092    vinserti128    m10, [botq+strideq*1+1], 1
   1093    punpcklqdq      m6, m9
   1094    punpcklqdq      m5, m10
   1095    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11
   1096 %endif
   1097    ret
   1098 .d6k1:
           ; Vertical tap direction (k=1): purely row offsets, no horizontal
           ; shift, so neither leftq nor any blend mask is needed -- only the
           ; top (topq/top1q/top2q) and bottom (botq) edge rows.
   1099 %if %1 == 4
   1100 %if %2 == 4
   1101    movd           xm6, [topq+strideq*0]
   1102    movd           xm9, [topq+strideq*1]
   1103    movd           xm5, [dstq+strideq*2]
   1104    movd          xm10, [dstq+stride3q ]
   1105    vinserti128     m6, [dstq+strideq*0], 1
   1106    vinserti128     m9, [dstq+strideq*1], 1
   1107    vinserti128     m5, [botq+strideq*0], 1
   1108    vinserti128    m10, [botq+strideq*1], 1
   1109    punpckldq       m6, m9
   1110    punpckldq       m5, m10
   1111 %else
   1112    movd           xm5, [dstq +strideq*2]
   1113    movd           xm6, [topq +strideq*0]
   1114    movd           xm9, [dst4q+strideq*2]
   1115    pinsrd         xm5, [dstq +stride3q ], 1
   1116    pinsrd         xm6, [topq +strideq*1], 1
   1117    pinsrd         xm9, [dst4q+stride3q ], 1
   1118    pinsrd         xm5, [dst4q+strideq*0], 2
   1119    pinsrd         xm6, [dstq +strideq*0], 2
   1120    pinsrd         xm9, [botq +strideq*0], 2
   1121    pinsrd         xm5, [dst4q+strideq*1], 3
   1122    pinsrd         xm6, [dstq +strideq*1], 3
   1123    pinsrd         xm9, [botq +strideq*1], 3
   1124    vinserti128     m6, xm5, 1
   1125    vinserti128     m5, xm9, 1
   1126 %endif
   1127 %else
   1128    movq           xm5, [dstq+strideq*2]
   1129    movq           xm9, [botq+strideq*0]
   1130    movq           xm6, [top1q         ]
   1131    movq          xm10, [dstq+strideq*0]
   1132    movhps         xm5, [dstq+stride3q ]
   1133    movhps         xm9, [botq+strideq*1]
   1134    movhps         xm6, [top2q         ]
   1135    movhps        xm10, [dstq+strideq*1]
   1136    vinserti128     m5, xm9, 1
   1137    vinserti128     m6, xm10, 1
   1138 %endif
   1139    ret
   1140 .d7k1:
           ; Near-diagonal tap direction (k=1), mirror of .d5k1 (+1 above,
           ; -1 below); left-edge bytes come from leftq (pinsrb / pmovzxbw)
           ; or the 0x60 stack spill in the 8x8 case.
   1141 %if %1 == 4
   1142 %if %2 == 4
   1143    movd           xm5, [dstq+strideq*2-1]
   1144    movd           xm9, [dstq+stride3q -1]
   1145    movd           xm6, [topq+strideq*0+1]
   1146    movd          xm10, [topq+strideq*1+1]
   1147    pinsrb         xm5, [leftq+ 5], 0
   1148    pinsrb         xm9, [leftq+ 7], 0
   1149    vinserti128     m6, [dstq+strideq*0+1], 1
   1150    vinserti128    m10, [dstq+strideq*1+1], 1
   1151    vinserti128     m5, [botq+strideq*0-1], 1
   1152    vinserti128     m9, [botq+strideq*1-1], 1
   1153    punpckldq       m6, m10
   1154    punpckldq       m5, m9
   1155 %else
   1156    movd           xm6, [topq +strideq*0+1]
   1157    movq           xm9, [dstq +strideq*2-1]
   1158    movq          xm10, [dst4q+strideq*0-1]
   1159    movd          xm11, [dst4q+strideq*2-1]
   1160    pinsrd         xm6, [topq +strideq*1+1], 1
   1161    movhps         xm9, [dstq +stride3q -1]
   1162    movhps        xm10, [dst4q+strideq*1-1]
   1163    pinsrd        xm11, [dst4q+stride3q -1], 1
   1164    pinsrd         xm6, [dstq +strideq*0+1], 2
   1165    pinsrd        xm11, [botq +strideq*0-1], 2
   1166    pinsrd         xm6, [dstq +strideq*1+1], 3
   1167    pinsrd        xm11, [botq +strideq*1-1], 3
   1168    shufps         xm5, xm9, xm10, q2020
   1169    vinserti128     m5, xm11, 1
   1170    pmovzxbw       m11, [leftq+5]
   1171    psrldq         xm9, 2
   1172    psrldq        xm10, 2
   1173    shufps         xm9, xm10, q2020
   1174    movu           m10, [blend_4x8_1+8]
   1175    vinserti128     m6, xm9, 1
   1176    vpblendvb       m5, m11, m10
   1177 %endif
   1178 %else
   1179    lea            r13, [blend_8x8_0+16]
   1180    movq           xm5, [dstq+strideq*2-1]
   1181    movq           xm9, [botq+strideq*0-1]
   1182    movq           xm6, [top1q         +1]
   1183    movq          xm10, [dstq+strideq*0+1]
   1184    movhps         xm5, [dstq+stride3q -1]
   1185    movhps         xm9, [botq+strideq*1-1]
   1186    movhps         xm6, [top2q         +1]
   1187    movhps        xm10, [dstq+strideq*1+1]
   1188    movu           m11, [r13+hq*2*2+16*2]
   1189    vinserti128     m5, xm9, 1
   1190    vinserti128     m6, xm10, 1
   1191    vpblendvb       m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11
   1192 %endif
   1193    ret
   1194 
   1195 .border_block:
           ; Slow path used when the block touches a frame border.  The block
           ; plus a small apron is widened to 16 bits/sample into the stack
           ; buffer `px` (32 bytes per row); apron samples that do not exist
           ; are written as 0x8000 (m14).  NOTE(review): 0x8000 presumably
           ; marks "invalid pixel" for the filter -- confirm against
           ; ACCUMULATE_TAP_WORD.
   1196 DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
   1197    RESET_STACK_STATE
   1198    %assign stack_offset stack_offset - (regs_used - 11) * gprsize
   1199    %assign regs_used 11
   1200    ALLOC_STACK 2*16+(%2+4)*32, 16
   1201 %define px rsp+2*16+2*32
   1202 
   1203    pcmpeqw        m14, m14
   1204    psllw          m14, 15                  ; 0x8000
   1205 
   1206    ; prepare pixel buffers - body/right
   1207 %if %1 == 4
   1208    INIT_XMM avx2
   1209 %endif
   1210 %if %2 == 8
   1211    lea          dst4q, [dstq+strideq*4]
   1212 %endif
   1213    lea       stride3q, [strideq*3]
   1214    test         edgeb, 2                   ; have_right
   1215    jz .no_right
           ; have_right: each row load naturally covers the right apron too.
   1216    pmovzxbw        m1, [dstq+strideq*0]
   1217    pmovzxbw        m2, [dstq+strideq*1]
   1218    pmovzxbw        m3, [dstq+strideq*2]
   1219    pmovzxbw        m4, [dstq+stride3q]
   1220    mova     [px+0*32], m1
   1221    mova     [px+1*32], m2
   1222    mova     [px+2*32], m3
   1223    mova     [px+3*32], m4
   1224 %if %2 == 8
   1225    pmovzxbw        m1, [dst4q+strideq*0]
   1226    pmovzxbw        m2, [dst4q+strideq*1]
   1227    pmovzxbw        m3, [dst4q+strideq*2]
   1228    pmovzxbw        m4, [dst4q+stride3q]
   1229    mova     [px+4*32], m1
   1230    mova     [px+5*32], m2
   1231    mova     [px+6*32], m3
   1232    mova     [px+7*32], m4
   1233 %endif
   1234    jmp .body_done
   1235 .no_right:
           ; no right edge: load exactly %1 pixels per row, then mark the
           ; first word right of the block ([px+row*32+%1*2]) as invalid.
   1236 %if %1 == 4
   1237    movd           xm1, [dstq+strideq*0]
   1238    movd           xm2, [dstq+strideq*1]
   1239    movd           xm3, [dstq+strideq*2]
   1240    movd           xm4, [dstq+stride3q]
   1241    pmovzxbw       xm1, xm1
   1242    pmovzxbw       xm2, xm2
   1243    pmovzxbw       xm3, xm3
   1244    pmovzxbw       xm4, xm4
   1245    movq     [px+0*32], xm1
   1246    movq     [px+1*32], xm2
   1247    movq     [px+2*32], xm3
   1248    movq     [px+3*32], xm4
   1249 %else
   1250    pmovzxbw       xm1, [dstq+strideq*0]
   1251    pmovzxbw       xm2, [dstq+strideq*1]
   1252    pmovzxbw       xm3, [dstq+strideq*2]
   1253    pmovzxbw       xm4, [dstq+stride3q]
   1254    mova     [px+0*32], xm1
   1255    mova     [px+1*32], xm2
   1256    mova     [px+2*32], xm3
   1257    mova     [px+3*32], xm4
   1258 %endif
   1259    movd [px+0*32+%1*2], xm14
   1260    movd [px+1*32+%1*2], xm14
   1261    movd [px+2*32+%1*2], xm14
   1262    movd [px+3*32+%1*2], xm14
   1263 %if %2 == 8
   1264 %if %1 == 4
   1265    movd           xm1, [dst4q+strideq*0]
   1266    movd           xm2, [dst4q+strideq*1]
   1267    movd           xm3, [dst4q+strideq*2]
   1268    movd           xm4, [dst4q+stride3q]
   1269    pmovzxbw       xm1, xm1
   1270    pmovzxbw       xm2, xm2
   1271    pmovzxbw       xm3, xm3
   1272    pmovzxbw       xm4, xm4
   1273    movq     [px+4*32], xm1
   1274    movq     [px+5*32], xm2
   1275    movq     [px+6*32], xm3
   1276    movq     [px+7*32], xm4
   1277 %else
   1278    pmovzxbw       xm1, [dst4q+strideq*0]
   1279    pmovzxbw       xm2, [dst4q+strideq*1]
   1280    pmovzxbw       xm3, [dst4q+strideq*2]
   1281    pmovzxbw       xm4, [dst4q+stride3q]
   1282    mova     [px+4*32], xm1
   1283    mova     [px+5*32], xm2
   1284    mova     [px+6*32], xm3
   1285    mova     [px+7*32], xm4
   1286 %endif
   1287    movd [px+4*32+%1*2], xm14
   1288    movd [px+5*32+%1*2], xm14
   1289    movd [px+6*32+%1*2], xm14
   1290    movd [px+7*32+%1*2], xm14
   1291 %endif
   1292 .body_done:
   1293 
   1294    ; top
           ; Fill the two apron rows above the block (px-2*32 / px-1*32).
           ; Depending on have_top/have_left/have_right, pixels come from
           ; topq or are replaced with the 0x8000 marker in m14.
   1295    test         edgeb, 4                    ; have_top
   1296    jz .no_top
   1297    test         edgeb, 1                    ; have_left
   1298    jz .top_no_left
   1299    test         edgeb, 2                    ; have_right
   1300    jz .top_no_right
           ; have everything: one widening load covers left apron + row + right
   1301    pmovzxbw        m1, [topq+strideq*0-(%1/2)]
   1302    pmovzxbw        m2, [topq+strideq*1-(%1/2)]
   1303    movu  [px-2*32-%1], m1
   1304    movu  [px-1*32-%1], m2
   1305    jmp .top_done
   1306 .top_no_right:
   1307    pmovzxbw        m1, [topq+strideq*0-%1]
   1308    pmovzxbw        m2, [topq+strideq*1-%1]
   1309    movu [px-2*32-%1*2], m1
   1310    movu [px-1*32-%1*2], m2
           ; mark the word right of the row as invalid
   1311    movd [px-2*32+%1*2], xm14
   1312    movd [px-1*32+%1*2], xm14
   1313    jmp .top_done
   1314 .top_no_left:
   1315    test         edgeb, 2                   ; have_right
   1316    jz .top_no_left_right
   1317    pmovzxbw        m1, [topq+strideq*0]
   1318    pmovzxbw        m2, [topq+strideq*1]
   1319    mova   [px-2*32+0], m1
   1320    mova   [px-1*32+0], m2
           ; mark the two left-apron words as invalid
   1321    movd   [px-2*32-4], xm14
   1322    movd   [px-1*32-4], xm14
   1323    jmp .top_done
   1324 .top_no_left_right:
   1325 %if %1 == 4
   1326    movd           xm1, [topq+strideq*0]
   1327    pinsrd         xm1, [topq+strideq*1], 1
   1328    pmovzxbw       xm1, xm1
   1329    movq   [px-2*32+0], xm1
   1330    movhps [px-1*32+0], xm1
   1331 %else
   1332    pmovzxbw       xm1, [topq+strideq*0]
   1333    pmovzxbw       xm2, [topq+strideq*1]
   1334    mova   [px-2*32+0], xm1
   1335    mova   [px-1*32+0], xm2
   1336 %endif
   1337    movd   [px-2*32-4], xm14
   1338    movd   [px-1*32-4], xm14
   1339    movd [px-2*32+%1*2], xm14
   1340    movd [px-1*32+%1*2], xm14
   1341    jmp .top_done
   1342 .no_top:
           ; no top edge at all: both apron rows are fully invalid
   1343    movu   [px-2*32-%1], m14
   1344    movu   [px-1*32-%1], m14
   1345 .top_done:
   1346 
   1347    ; left
           ; Fill the left apron: two 16-bit pixels per row at [px+row*32-4],
           ; taken from leftq (one pre-packed dword per row) or set to the
           ; 0x8000 marker when there is no left neighbor.
   1348    test         edgeb, 1                   ; have_left
   1349    jz .no_left
   1350    pmovzxbw       xm1, [leftq+ 0]
   1351 %if %2 == 8
   1352    pmovzxbw       xm2, [leftq+ 8]
   1353 %endif
   1354    movd   [px+0*32-4], xm1
   1355    pextrd [px+1*32-4], xm1, 1
   1356    pextrd [px+2*32-4], xm1, 2
   1357    pextrd [px+3*32-4], xm1, 3
   1358 %if %2 == 8
   1359    movd   [px+4*32-4], xm2
   1360    pextrd [px+5*32-4], xm2, 1
   1361    pextrd [px+6*32-4], xm2, 2
   1362    pextrd [px+7*32-4], xm2, 3
   1363 %endif
   1364    jmp .left_done
   1365 .no_left:
   1366    movd   [px+0*32-4], xm14
   1367    movd   [px+1*32-4], xm14
   1368    movd   [px+2*32-4], xm14
   1369    movd   [px+3*32-4], xm14
   1370 %if %2 == 8
   1371    movd   [px+4*32-4], xm14
   1372    movd   [px+5*32-4], xm14
   1373    movd   [px+6*32-4], xm14
   1374    movd   [px+7*32-4], xm14
   1375 %endif
   1376 .left_done:
   1377 
   1378    ; bottom
           ; Fill the two apron rows below the block (px+(%2+0..1)*32),
           ; mirroring the top-edge logic but sourcing from botq.
   1379 DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge
   1380    test         edgeb, 8                   ; have_bottom
   1381    jz .no_bottom
   1382    test         edgeb, 1                   ; have_left
   1383    jz .bottom_no_left
   1384    test         edgeb, 2                   ; have_right
   1385    jz .bottom_no_right
   1386    pmovzxbw        m1, [botq+strideq*0-(%1/2)]
   1387    pmovzxbw        m2, [botq+strideq*1-(%1/2)]
   1388    movu   [px+(%2+0)*32-%1], m1
   1389    movu   [px+(%2+1)*32-%1], m2
   1390    jmp .bottom_done
   1391 .bottom_no_right:
   1392    pmovzxbw        m1, [botq+strideq*0-%1]
   1393    pmovzxbw        m2, [botq+strideq*1-%1]
   1394    movu  [px+(%2+0)*32-%1*2], m1
   1395    movu  [px+(%2+1)*32-%1*2], m2
   1396 %if %1 == 8
           ; re-mark the last body row's right word, which the wide movu
           ; stores above just clobbered
   1397    movd  [px+(%2-1)*32+%1*2], xm14                ; overwritten by previous movu
   1398 %endif
   1399    movd  [px+(%2+0)*32+%1*2], xm14
   1400    movd  [px+(%2+1)*32+%1*2], xm14
   1401    jmp .bottom_done
   1402 .bottom_no_left:
   1403    test          edgeb, 2                  ; have_right
   1404    jz .bottom_no_left_right
   1405    pmovzxbw        m1, [botq+strideq*0]
   1406    pmovzxbw        m2, [botq+strideq*1]
   1407    mova   [px+(%2+0)*32+0], m1
   1408    mova   [px+(%2+1)*32+0], m2
   1409    movd   [px+(%2+0)*32-4], xm14
   1410    movd   [px+(%2+1)*32-4], xm14
   1411    jmp .bottom_done
   1412 .bottom_no_left_right:
   1413 %if %1 == 4
   1414    movd           xm1, [botq+strideq*0]
   1415    pinsrd         xm1, [botq+strideq*1], 1
   1416    pmovzxbw       xm1, xm1
   1417    movq   [px+(%2+0)*32+0], xm1
   1418    movhps [px+(%2+1)*32+0], xm1
   1419 %else
   1420    pmovzxbw       xm1, [botq+strideq*0]
   1421    pmovzxbw       xm2, [botq+strideq*1]
   1422    mova   [px+(%2+0)*32+0], xm1
   1423    mova   [px+(%2+1)*32+0], xm2
   1424 %endif
   1425    movd   [px+(%2+0)*32-4], xm14
   1426    movd   [px+(%2+1)*32-4], xm14
   1427    movd  [px+(%2+0)*32+%1*2], xm14
   1428    movd  [px+(%2+1)*32+%1*2], xm14
   1429    jmp .bottom_done
   1430 .no_bottom:
   1431    movu   [px+(%2+0)*32-%1], m14
   1432    movu   [px+(%2+1)*32-%1], m14
   1433 .bottom_done:
   1434 
   1435    ; actual filter
           ; Border-path filtering: compute pri/sec shift amounts and tap
           ; pointers, then run the k-loop accumulating primary (offset 0)
           ; and secondary (offsets 2 and 6) taps over the px buffer.
   1436 INIT_YMM avx2
   1437 DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero
   1438 %undef edged
   1439    ; register to shuffle values into after packing
   1440    vbroadcasti128 m12, [shufb_lohi]
   1441 
   1442    mov       dampingd, r8m
   1443    xor          zerod, zerod
   1444    movifnidn     prid, prim
           ; bias damping by -31 so that damping-31+lzcnt(v) == damping-ulog2(v)
           ; (lzcnt(v) == 31 - floor(log2(v)) for nonzero 32-bit v)
   1445    sub       dampingd, 31
   1446    movifnidn  secdmpd, secdmpm
   1447    test          prid, prid
   1448    jz .border_sec_only
   1449    movd           xm0, prid
   1450    lzcnt      pridmpd, prid
   1451    add        pridmpd, dampingd
           ; clamp a negative shift to zero
   1452    cmovs      pridmpd, zerod
   1453    mov        [rsp+0], pridmpq                 ; pri_shift
   1454    test       secdmpd, secdmpd
   1455    jz .border_pri_only
   1456    movd           xm1, secdmpd
   1457    lzcnt      secdmpd, secdmpd
   1458    add        secdmpd, dampingd
   1459    mov        [rsp+8], secdmpq                 ; sec_shift
   1460 
   1461 DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3
   1462    lea         tableq, [tap_table]
   1463    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
   1464    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
   1465 
   1466    ; pri/sec_taps[k] [4 total]
   1467 DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3
   1468    vpbroadcastb    m0, xm0                     ; pri_strength
   1469    vpbroadcastb    m1, xm1                     ; sec_strength
           ; pri-strength parity selects one of two 2-entry tap rows at
           ; tap_table+8
   1470    and           prid, 1
   1471    lea           priq, [tableq+priq*2+8]       ; pri_taps
   1472    lea           secq, [tableq+12]             ; sec_taps
   1473 
   1474    BORDER_PREP_REGS %1, %2
   1475 %if %1*%2*2/mmsize > 1
   1476 .border_v_loop:
   1477 %endif
   1478    BORDER_LOAD_BLOCK %1, %2, 1
   1479 .border_k_loop:
   1480    vpbroadcastb    m2, [priq+kq]               ; pri_taps
   1481    vpbroadcastb    m3, [secq+kq]               ; sec_taps
   1482    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
   1483    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
   1484    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
   1485    dec             kq
   1486    jge .border_k_loop
   1487 
   1488    vpbroadcastd   m10, [pw_2048]
   1489    BORDER_ADJUST_PIXEL %1, m10, 1
   1490 %if %1*%2*2/mmsize > 1
           ; advance to the next group of rows when the block spans more than
           ; one vector register's worth of pixels
   1491 %define vloop_lines (mmsize/(%1*2))
   1492    lea           dstq, [dstq+strideq*vloop_lines]
   1493    add           stkq, 32*vloop_lines
   1494    dec             hd
   1495    jg .border_v_loop
   1496 %endif
   1497    RET
   1498 
   1499 .border_pri_only:
           ; Border path when sec_strength == 0 (reached via the earlier
           ; `jz .border_pri_only`): only the primary taps (offset 0) run.
   1500 DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3
   1501    lea         tableq, [tap_table]
   1502    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
   1503 DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3
   1504    vpbroadcastb    m0, xm0                     ; pri_strength
           ; pri-strength parity selects the tap row, as in the main path
   1505    and           prid, 1
   1506    lea           priq, [tableq+priq*2+8]       ; pri_taps
   1507    BORDER_PREP_REGS %1, %2
   1508    vpbroadcastd    m1, [pw_2048]
   1509 %if %1*%2*2/mmsize > 1
   1510 .border_pri_v_loop:
   1511 %endif
   1512    BORDER_LOAD_BLOCK %1, %2
   1513 .border_pri_k_loop:
   1514    vpbroadcastb    m2, [priq+kq]               ; pri_taps
   1515    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
   1516    dec             kq
   1517    jge .border_pri_k_loop
   1518    BORDER_ADJUST_PIXEL %1, m1
   1519 %if %1*%2*2/mmsize > 1
   1520 %define vloop_lines (mmsize/(%1*2))
   1521    lea           dstq, [dstq+strideq*vloop_lines]
   1522    add           stkq, 32*vloop_lines
   1523    dec             hd
   1524    jg .border_pri_v_loop
   1525 %endif
   1526    RET
   1527 
   1528 .border_sec_only:
           ; Border path when pri_strength == 0 (reached via the earlier
           ; `jz .border_sec_only`): only the secondary taps (offsets 2 and 6)
           ; run; sec_shift is computed here since the main path was skipped.
   1529 DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3
   1530    movd           xm1, secdmpd
   1531    lzcnt      secdmpd, secdmpd
   1532    add        secdmpd, dampingd
   1533    mov        [rsp+8], secdmpq                 ; sec_shift
   1534 DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3
   1535    lea         tableq, [tap_table]
   1536    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
   1537 DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3
   1538    vpbroadcastb    m1, xm1                     ; sec_strength
   1539    lea           secq, [tableq+12]             ; sec_taps
   1540    BORDER_PREP_REGS %1, %2
   1541    vpbroadcastd    m0, [pw_2048]
   1542 %if %1*%2*2/mmsize > 1
   1543 .border_sec_v_loop:
   1544 %endif
   1545    BORDER_LOAD_BLOCK %1, %2
   1546 .border_sec_k_loop:
   1547    vpbroadcastb    m3, [secq+kq]               ; sec_taps
   1548    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
   1549    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
   1550    dec             kq
   1551    jge .border_sec_k_loop
   1552    BORDER_ADJUST_PIXEL %1, m0
   1553 %if %1*%2*2/mmsize > 1
   1554 %define vloop_lines (mmsize/(%1*2))
   1555    lea           dstq, [dstq+strideq*vloop_lines]
   1556    add           stkq, 32*vloop_lines
   1557    dec             hd
   1558    jg .border_sec_v_loop
   1559 %endif
   1560    RET
   1561 %endmacro
   1562 
   1563 CDEF_FILTER 8, 8 ; instantiate the filter for an 8x8 block
   1564 CDEF_FILTER 4, 8 ; ... for a 4x8 block
   1565 CDEF_FILTER 4, 4 ; ... for a 4x4 block
   1566 
        ;---------------------------------------------------------------------
        ; int cdef_dir_8bpc(const uint8_t *src, ptrdiff_t stride,
        ;                   unsigned *var)
        ; Finds the dominant edge direction of an 8x8 block of 8-bit pixels:
        ; partial pixel sums are accumulated along 8 candidate directions,
        ; squared (pmaddwd) and weighted (pmulld with div_table) into eight
        ; costs, and the index of the largest cost is returned.
        ; Out:  eax    = direction index (0..7)
        ;       [varq] = (best_cost - cost[idx^4]) >> 10
        ;---------------------------------------------------------------------
   1567 INIT_YMM avx2
   1568 cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
   1569    lea       stride3q, [strideq*3]
        ; Load the 8x8 byte block two rows per ymm register, one row per
        ; 128-bit lane: m0 = rows 0|7, m1 = rows 1|6, m2 = rows 2|5,
        ; m3 = rows 3|4 (low|high lane), via movq + vpbroadcastq + vpblendd.
   1570    movq           xm0, [srcq+strideq*0]
   1571    movq           xm1, [srcq+strideq*1]
   1572    movq           xm2, [srcq+strideq*2]
   1573    movq           xm3, [srcq+stride3q ]
   1574    lea           srcq, [srcq+strideq*4]
   1575    vpbroadcastq    m4, [srcq+stride3q ]
   1576    vpbroadcastq    m5, [srcq+strideq*2]
   1577    vpblendd        m0, m4, 0xf0
   1578    vpblendd        m1, m5, 0xf0
   1579    vpbroadcastq    m4, [srcq+strideq*1]
   1580    vpbroadcastq    m5, [srcq+strideq*0]
   1581    vpblendd        m2, m4, 0xf0
   1582    vpblendd        m3, m5, 0xf0
        ; Zero-extend the packed bytes to 16-bit words (m4 = 0).
   1583    pxor            m4, m4
   1584    punpcklbw       m0, m4
   1585    punpcklbw       m1, m4
   1586    punpcklbw       m2, m4
   1587    punpcklbw       m3, m4
        ; Secondary entry point: a caller jumping here must already have
        ; m0-m3 loaded with the rows as 16-bit words in the lane layout
        ; above (presumably used by a higher-bitdepth wrapper -- confirm
        ; against callers). PROLOGUE re-declares the register layout so 15
        ; vector registers are available from here on.
   1588 cglobal_label .main
   1589    vpbroadcastd    m4, [pw_128]
   1590    PROLOGUE 3, 4, 15
        ; Center pixel values around zero: x -= 128.
   1591    psubw           m0, m4
   1592    psubw           m1, m4
   1593    psubw           m2, m4
   1594    psubw           m3, m4
   1595 
   1596    ; shuffle registers to generate partial_sum_diag[0-1] together
        ; m4-m7 get the lane-swapped copies of m3-m0 (rows 7|0 .. 4|3).
   1597    vperm2i128      m7, m0, m0, 0x01
   1598    vperm2i128      m6, m1, m1, 0x01
   1599    vperm2i128      m5, m2, m2, 0x01
   1600    vperm2i128      m4, m3, m3, 0x01
   1601 
   1602    ; start with partial_sum_hv[0-1]
        ; paddw sums whole rows (vertical direction), phaddw sums
        ; horizontally within rows (horizontal direction).
   1603    paddw           m8, m0, m1
   1604    paddw           m9, m2, m3
   1605    phaddw         m10, m0, m1
   1606    phaddw         m11, m2, m3
   1607    paddw           m8, m9
   1608    phaddw         m10, m11
   1609    vextracti128   xm9, m8, 1
   1610    vextracti128  xm11, m10, 1
   1611    paddw          xm8, xm9                 ; partial_sum_hv[1]
   1612    phaddw        xm10, xm11                ; partial_sum_hv[0]
   1613    vinserti128     m8, xm10, 1
   1614    vpbroadcastd    m9, [div_table+44]
   1615    pmaddwd         m8, m8
   1616    pmulld          m8, m9                  ; cost6[2a-d] | cost2[a-d]
   1617 
   1618    ; create aggregates [lower half]:
   1619    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
   1620    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
   1621    ; m10=             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
   1622    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
   1623    ; and [upper half]:
   1624    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
   1625    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
   1626    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
   1627    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
   1628    ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
        ; Each row is shifted by 2 bytes (one word) more than the previous
        ; one so that paddw accumulates along the two diagonals.
   1629 
   1630    pslldq          m9, m1, 2
   1631    psrldq         m10, m1, 14
   1632    pslldq         m11, m2, 4
   1633    psrldq         m12, m2, 12
   1634    pslldq         m13, m3, 6
   1635    psrldq         m14, m3, 10
   1636    paddw           m9, m11
   1637    paddw          m10, m12
   1638    paddw           m9, m13
   1639    paddw          m10, m14
   1640    pslldq         m11, m4, 8
   1641    psrldq         m12, m4, 8
   1642    pslldq         m13, m5, 10
   1643    psrldq         m14, m5, 6
   1644    paddw           m9, m11
   1645    paddw          m10, m12
   1646    paddw           m9, m13
   1647    paddw          m10, m14
   1648    pslldq         m11, m6, 12
   1649    psrldq         m12, m6, 4
   1650    pslldq         m13, m7, 14
   1651    psrldq         m14, m7, 2
   1652    paddw           m9, m11
   1653    paddw          m10, m12
   1654    paddw           m9, m13
   1655    paddw          m10, m14                 ; partial_sum_diag[0/1][8-14,zero]
   1656    vbroadcasti128 m14, [shufw_6543210x]
   1657    vbroadcasti128 m13, [div_table+16]
   1658    vbroadcasti128 m12, [div_table+0]
   1659    paddw           m9, m0                  ; partial_sum_diag[0/1][0-7]
        ; Square the partial sums and weight by the per-position divisor
        ; tables to form the two diagonal-direction costs.
   1660    pshufb         m10, m14
   1661    punpckhwd      m11, m9, m10
   1662    punpcklwd       m9, m10
   1663    pmaddwd        m11, m11
   1664    pmaddwd         m9, m9
   1665    pmulld         m11, m13
   1666    pmulld          m9, m12
   1667    paddd           m9, m11                 ; cost0[a-d] | cost4[a-d]
   1668 
   1669    ; merge horizontally and vertically for partial_sum_alt[0-3]
   1670    paddw          m10, m0, m1
   1671    paddw          m11, m2, m3
   1672    paddw          m12, m4, m5
   1673    paddw          m13, m6, m7
   1674    phaddw          m0, m4
   1675    phaddw          m1, m5
   1676    phaddw          m2, m6
   1677    phaddw          m3, m7
   1678 
   1679    ; create aggregates [lower half]:
   1680    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
   1681    ; m11=              m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
   1682    ; and [upper half]:
   1683    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
   1684    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
   1685    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
   1686 
   1687    pslldq          m4, m11, 2
   1688    psrldq         m11, 14
   1689    pslldq          m5, m12, 4
   1690    psrldq         m12, 12
   1691    pslldq          m6, m13, 6
   1692    psrldq         m13, 10
   1693    paddw           m4, m10
   1694    paddw          m11, m12
   1695    vpbroadcastd   m12, [div_table+44]
   1696    paddw           m5, m6
   1697    paddw          m11, m13                 ; partial_sum_alt[3/2] right
   1698    vbroadcasti128 m13, [div_table+32]
   1699    paddw           m4, m5                  ; partial_sum_alt[3/2] left
   1700    pshuflw         m5, m11, q3012
   1701    punpckhwd       m6, m11, m4
   1702    punpcklwd       m4, m5
   1703    pmaddwd         m6, m6
   1704    pmaddwd         m4, m4
   1705    pmulld          m6, m12
   1706    pmulld          m4, m13
   1707    paddd           m4, m6                  ; cost7[a-d] | cost5[a-d]
   1708 
   1709    ; create aggregates [lower half]:
   1710    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
   1711    ; m1 =             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
   1712    ; and [upper half]:
   1713    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
   1714    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
   1715    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
   1716 
   1717    pslldq          m5, m1, 2
   1718    psrldq          m1, 14
   1719    pslldq          m6, m2, 4
   1720    psrldq          m2, 12
   1721    pslldq          m7, m3, 6
   1722    psrldq          m3, 10
   1723    paddw           m5, m0
   1724    paddw           m1, m2
   1725    paddw           m6, m7
   1726    paddw           m1, m3                  ; partial_sum_alt[0/1] right
   1727    paddw           m5, m6                  ; partial_sum_alt[0/1] left
   1728    pshuflw         m0, m1, q3012
   1729    punpckhwd       m1, m5
   1730    punpcklwd       m5, m0
   1731    pmaddwd         m1, m1
   1732    pmaddwd         m5, m5
   1733    pmulld          m1, m12
   1734    pmulld          m5, m13
   1735    paddd           m5, m1                  ; cost1[a-d] | cost3[a-d]
   1736 
        ; Horizontally reduce the four per-direction cost vectors to one
        ; dword per direction, then reorder with vpermd; pd_47130256
        ; supplies the permute indices.
        ; NOTE(review): the two-operand vpermd below is the x86inc form
        ; expanding to vpermd dst, dst, src (dst holds the indices).
   1737    mova           xm0, [pd_47130256+ 16]
   1738    mova            m1, [pd_47130256]
   1739    phaddd          m9, m8
   1740    phaddd          m5, m4
   1741    phaddd          m9, m5
   1742    vpermd          m0, m9                  ; cost[0-3]
   1743    vpermd          m1, m9                  ; cost[4-7] | cost[0-3]
   1744 
   1745    ; now find the best cost
   1746    pmaxsd         xm2, xm0, xm1
   1747    pshufd         xm3, xm2, q1032
   1748    pmaxsd         xm2, xm3
   1749    pshufd         xm3, xm2, q2301
   1750    pmaxsd         xm2, xm3 ; best cost
   1751 
   1752    ; find the idx using minpos
   1753    ; make everything other than the best cost negative via subtraction
   1754    ; find the min of unsigned 16-bit ints to sort out the negative values
        ; Only the best cost yields 0 after subtraction; all others become
        ; negative, i.e. large as unsigned words, so phminposuw locates the
        ; best cost's position in its high word.
   1755    psubd          xm4, xm1, xm2
   1756    psubd          xm3, xm0, xm2
   1757    packssdw       xm3, xm4
   1758    phminposuw     xm3, xm3
   1759 
   1760    ; convert idx to 32-bits
        ; Direction index is returned in eax.
   1761    psrld          xm3, 16
   1762    movd           eax, xm3
   1763 
   1764    ; get idx^4 complement
        ; The "variance" output is the gap between the best cost and the
        ; cost of the orthogonal direction, scaled down by 10 bits:
        ; [varq] = (best_cost - cost[idx^4]) >> 10.
   1765    vpermd          m3, m1
   1766    psubd          xm2, xm3
   1767    psrld          xm2, 10
   1768    movd        [varq], xm2
   1769    RET
   1770 
   1771 %endif ; ARCH_X86_64