tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

looprestoration_sse.asm (101312B)


      1 ; Copyright © 2018, VideoLAN and dav1d authors
      2 ; Copyright © 2018, Two Orioles, LLC
      3 ; Copyright © 2018, VideoLabs
      4 ; All rights reserved.
      5 ;
      6 ; Redistribution and use in source and binary forms, with or without
      7 ; modification, are permitted provided that the following conditions are met:
      8 ;
      9 ; 1. Redistributions of source code must retain the above copyright notice, this
     10 ;    list of conditions and the following disclaimer.
     11 ;
     12 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     13 ;    this list of conditions and the following disclaimer in the documentation
     14 ;    and/or other materials provided with the distribution.
     15 ;
     16 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 
     27 %include "config.asm"
     28 %include "ext/x86/x86inc.asm"
     29 
     30 SECTION_RODATA 16
     31 
     32 wiener_init:   db  6,  7,  6,  7,  6,  7,  6,  7,  0,  0,  0,  0,  2,  4,  2,  4
     33 wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
     34 wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
     35 wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
     36 wiener_shufD:  db  4, -1,  5, -1,  6, -1,  7, -1,  8, -1,  9, -1, 10, -1, 11, -1
     37 wiener_l_shuf: db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
     38 sgr_lshuf3:    db  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
     39 sgr_lshuf5:    db  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12
     40 pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
     41 
     42 pb_right_ext_mask: times 24 db 0xff
     43                   times 8 db 0
     44 pb_1:          times 16 db 1
     45 pw_256:        times 8 dw 256
     46 pw_2056:       times 8 dw 2056
     47 pw_m16380:     times 8 dw -16380
     48 pw_164_24:     times 4 dw 164, 24
     49 pw_455_24:     times 4 dw 455, 24
     50 pd_4096:       times 4 dd 4096
     51 pd_34816:      times 4 dd 34816
     52 pd_0xffff:     times 4 dd 0xffff
     53 pf_256:        times 4 dd 256.0
     54 
     55 SECTION .text
     56 
     57 %macro movif64 2 ; dst, src
     58 %if ARCH_X86_64
     59    mov             %1, %2
     60 %endif
     61 %endmacro
     62 
     63 %macro movif32 2 ; dst, src
     64 %if ARCH_X86_32
     65    mov             %1, %2
     66 %endif
     67 %endmacro
     68 
     69 %if ARCH_X86_32
     70 %define PIC_base_offset $$
     71 
     72 %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
     73  %assign pic_reg_stk_off 4
     74  %xdefine PIC_reg %1
     75  %if %2 == 1
     76    mov        [esp], %1
     77  %endif
     78    LEA      PIC_reg, PIC_base_offset
     79  %if %3 == 1
     80    XCHG_PIC_REG
     81  %endif
     82 %endmacro
     83 
     84 %macro XCHG_PIC_REG 0
     85    mov [esp+pic_reg_stk_off], PIC_reg
     86    %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
     87    mov PIC_reg, [esp+pic_reg_stk_off]
     88 %endmacro
     89 
     90 %define PIC_sym(sym)   (PIC_reg+(sym)-PIC_base_offset)
     91 
     92 %else
     93 %macro XCHG_PIC_REG 0
     94 %endmacro
     95 
     96 %define PIC_sym(sym)   (sym)
     97 %endif
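        ; On x86-32 there is no RIP-relative addressing, so SETUP_PIC loads
        ; PIC_reg with the section base ($$) and PIC_sym(sym) rebases symbol
        ; references against it; XCHG_PIC_REG swaps the register with a value
        ; parked in one of two stack slots, letting the same register alternate
        ; between the PIC base and another live value across calls. On x86-64,
        ; PIC_sym(sym) is just the symbol itself.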
     98 
     99 %macro WIENER 0
    100 %if ARCH_X86_64
    101 DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
    102 cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
    103                                                    w, h, edge, flt, x
    104    %define tmpstrideq strideq
    105    %define base 0
    106    mov           fltq, r6mp
    107    mov             wd, wm
    108    movifnidn       hd, hm
    109    mov          edged, r7m
    110    movq           m14, [fltq]
    111    add           lpfq, wq
    112    movq            m7, [fltq+16]
    113    add           dstq, wq
    114    lea             t1, [rsp+wq*2+16]
    115    mova           m15, [pw_2056]
    116    neg             wq
    117 %if cpuflag(ssse3)
    118    pshufb         m14, [wiener_init]
    119    mova            m8, [wiener_shufA]
    120    pshufd         m12, m14, q2222  ; x0 x0
    121    mova            m9, [wiener_shufB]
    122    pshufd         m13, m14, q3333  ; x1 x2
    123    mova           m10, [wiener_shufC]
    124    punpcklqdq     m14, m14         ; x3
    125    mova           m11, [wiener_shufD]
    126 %else
    127    mova           m10, [pw_m16380]
    128    punpcklwd      m14, m14
    129    pshufd         m11, m14, q0000 ; x0
    130    pshufd         m12, m14, q1111 ; x1
    131    pshufd         m13, m14, q2222 ; x2
    132    pshufd         m14, m14, q3333 ; x3
    133 %endif
    134 %else
    135 DECLARE_REG_TMP 4, 0, _, 5
    136 %if cpuflag(ssse3)
    137    %define m10         [base+wiener_shufC]
    138    %define m11         [base+wiener_shufD]
    139    %define stk_off     96
    140 %else
    141    %define m10         [base+pw_m16380]
    142    %define m11         [stk+96]
    143    %define stk_off     112
    144 %endif
    145 cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride
    146    %define base        r6-pb_right_ext_mask-21
    147    %define stk         esp
    148    %define dstq        leftq
    149    %define edgeb       byte edged
    150    %define edged       [stk+ 8]
    151    %define dstmp       [stk+12]
    152    %define hd    dword [stk+16]
    153    %define wq          [stk+20]
    154    %define strideq     [stk+24]
    155    %define leftmp      [stk+28]
    156    %define t2          [stk+32]
    157    %define t4          [stk+36]
    158    %define t5          [stk+40]
    159    %define t6          [stk+44]
    160    %define m8          [base+wiener_shufA]
    161    %define m9          [base+wiener_shufB]
    162    %define m12         [stk+48]
    163    %define m13         [stk+64]
    164    %define m14         [stk+80]
    165    %define m15         [base+pw_2056]
    166    mov             r1, r6m ; flt
    167    mov             r0, r0m ; dst
    168    mov             r4, r4m ; w
    169    mov           lpfq, lpfm
    170    mov             r2, r7m ; edge
    171    mov             r5, r5m ; h
    172    movq            m3, [r1+ 0]
    173    movq            m7, [r1+16]
    174    add             r0, r4
    175    mov             r1, r1m ; stride
    176    add           lpfq, r4
    177    mov          edged, r2
    178    mov             r2, r2m ; left
    179    mov          dstmp, r0
    180    lea             t1, [rsp+r4*2+stk_off]
    181    mov             hd, r5
    182    neg             r4
    183    LEA             r6, pb_right_ext_mask+21
    184    mov             wq, r4
    185    mov        strideq, r1
    186    mov         leftmp, r2
    187    mov             r4, r1
    188 %if cpuflag(ssse3)
    189    pshufb          m3, [base+wiener_init]
    190    pshufd          m1, m3, q2222
    191    pshufd          m2, m3, q3333
    192    punpcklqdq      m3, m3
    193 %else
    194    punpcklwd       m3, m3
    195    pshufd          m0, m3, q0000
    196    pshufd          m1, m3, q1111
    197    pshufd          m2, m3, q2222
    198    pshufd          m3, m3, q3333
    199    mova           m11, m0
    200 %endif
    201    mova           m12, m1
    202    mova           m13, m2
    203    mova           m14, m3
    204 %endif
    205    psllw           m7, 5
    206    pshufd          m6, m7, q0000 ; y0 y1
    207    pshufd          m7, m7, q1111 ; y2 y3
    208    test         edgeb, 4 ; LR_HAVE_TOP
    209    jz .no_top
    210    call .h_top
    211    add           lpfq, strideq
    212    mov             t6, t1
    213    mov             t5, t1
    214    add             t1, 384*2
    215    call .h_top
    216    lea             t3, [lpfq+tmpstrideq*4]
    217    mov           lpfq, dstmp
    218    add             t3, tmpstrideq
    219    mov          [rsp], t3 ; below
    220    mov             t4, t1
    221    add             t1, 384*2
    222    call .h
    223    mov             t3, t1
    224    mov             t2, t1
    225    dec             hd
    226    jz .v1
    227    add           lpfq, strideq
    228    add             t1, 384*2
    229    call .h
    230    mov             t2, t1
    231    dec             hd
    232    jz .v2
    233    add           lpfq, strideq
    234    add             t1, 384*2
    235    call .h
    236    dec             hd
    237    jz .v3
    238 .main:
    239    lea             t0, [t1+384*2]
    240 .main_loop:
    241    call .hv
    242    dec             hd
    243    jnz .main_loop
    244    test         edgeb, 8 ; LR_HAVE_BOTTOM
    245    jz .v3
    246    mov           lpfq, [rsp]
    247    call .hv_bottom
    248    add           lpfq, strideq
    249    call .hv_bottom
    250 .v1:
    251    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    252    RET
    253 .no_top:
    254    lea             t3, [lpfq+tmpstrideq*4]
    255    mov           lpfq, dstmp
    256    lea             t3, [t3+tmpstrideq*2]
    257    mov          [rsp], t3
    258    call .h
    259    mov             t6, t1
    260    mov             t5, t1
    261    mov             t4, t1
    262    mov             t3, t1
    263    mov             t2, t1
    264    dec             hd
    265    jz .v1
    266    add           lpfq, strideq
    267    add             t1, 384*2
    268    call .h
    269    mov             t2, t1
    270    dec             hd
    271    jz .v2
    272    add           lpfq, strideq
    273    add             t1, 384*2
    274    call .h
    275    dec             hd
    276    jz .v3
    277    lea             t0, [t1+384*2]
    278    call .hv
    279    dec             hd
    280    jz .v3
    281    add             t0, 384*8
    282    call .hv
    283    dec             hd
    284    jnz .main
    285 .v3:
    286    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    287 .v2:
    288    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    289    jmp .v1
    290 .extend_right:
    291    movd            m2, [lpfq-1]
    292 %if ARCH_X86_64
    293    push            r0
    294    lea             r0, [pb_right_ext_mask+21]
    295    movu            m0, [r0+xq+0]
    296    movu            m1, [r0+xq+8]
    297    pop             r0
    298 %else
    299    movu            m0, [r6+xq+0]
    300    movu            m1, [r6+xq+8]
    301 %endif
    302 %if cpuflag(ssse3)
    303    pxor            m3, m3
    304    pshufb          m2, m3
    305 %else
    306    punpcklbw       m2, m2
    307    pshuflw         m2, m2, q0000
    308    punpcklqdq      m2, m2
    309 %endif
    310    pand            m4, m0
    311    pand            m5, m1
    312    pandn           m0, m2
    313    pandn           m1, m2
    314    por             m4, m0
    315    por             m5, m1
    316    ret
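        ; In effect, .extend_right above broadcasts the last valid pixel
        ; ([lpfq-1]) and blends it into m4/m5 using the 0xff/0x00 runs of
        ; pb_right_ext_mask, so lanes past the right edge take the replicated
        ; edge pixel while in-bounds lanes keep their loaded values.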
    317 .h:
    318    %define stk esp+4 ; offset due to call
    319    mov             xq, wq
    320    test         edgeb, 1 ; LR_HAVE_LEFT
    321    jz .h_extend_left
    322    movifnidn    leftq, leftmp
    323    mova            m4, [lpfq+xq]
    324    movd            m5, [leftq]
    325    add          leftq, 4
    326    pslldq          m4, 4
    327    por             m4, m5
    328    movifnidn   leftmp, leftq
    329    jmp .h_main
    330 .h_extend_left:
    331 %if cpuflag(ssse3)
    332    mova            m4, [lpfq+xq]
    333    pshufb          m4, [base+wiener_l_shuf]
    334 %else
    335    mova            m5, [lpfq+xq]
    336    pshufd          m4, m5, q2103
    337    punpcklbw       m5, m5
    338    punpcklwd       m5, m5
    339    movss           m4, m5
    340 %endif
    341    jmp .h_main
    342 .h_top:
    343    mov             xq, wq
    344    test         edgeb, 1 ; LR_HAVE_LEFT
    345    jz .h_extend_left
    346 .h_loop:
    347    movu            m4, [lpfq+xq-4]
    348 .h_main:
    349    movu            m5, [lpfq+xq+4]
    350    test         edgeb, 2 ; LR_HAVE_RIGHT
    351    jnz .h_have_right
    352    cmp             xd, -18
    353    jl .h_have_right
    354    call .extend_right
    355 .h_have_right:
    356 %macro %%h7 0
    357 %if cpuflag(ssse3)
    358    pshufb          m0, m4, m8
    359    pmaddubsw       m0, m12
    360    pshufb          m1, m5, m8
    361    pmaddubsw       m1, m12
    362    pshufb          m2, m4, m9
    363    pmaddubsw       m2, m13
    364    pshufb          m3, m5, m9
    365    pmaddubsw       m3, m13
    366    paddw           m0, m2
    367    pshufb          m2, m4, m10
    368    pmaddubsw       m2, m13
    369    paddw           m1, m3
    370    pshufb          m3, m5, m10
    371    pmaddubsw       m3, m13
    372    pshufb          m4, m11
    373    paddw           m0, m2
    374    pmullw          m2, m14, m4
    375    pshufb          m5, m11
    376    paddw           m1, m3
    377    pmullw          m3, m14, m5
    378    psllw           m4, 7
    379    psllw           m5, 7
    380    paddw           m0, m2
    381    mova            m2, [base+pw_m16380]
    382    paddw           m1, m3
    383    paddw           m4, m2
    384    paddw           m5, m2
    385    paddsw          m0, m4
    386    paddsw          m1, m5
    387 %else
    388    psrldq          m0, m4, 1
    389    pslldq          m1, m4, 1
    390    pxor            m3, m3
    391    punpcklbw       m0, m3
    392    punpckhbw       m1, m3
    393    paddw           m0, m1
    394    pmullw          m0, m11
    395    psrldq          m1, m4, 2
    396    pslldq          m2, m4, 2
    397    punpcklbw       m1, m3
    398    punpckhbw       m2, m3
    399    paddw           m1, m2
    400    pmullw          m1, m12
    401    paddw           m0, m1
    402    pshufd          m2, m4, q0321
    403    punpcklbw       m2, m3
    404    pmullw          m1, m14, m2
    405    paddw           m0, m1
    406    psrldq          m1, m4, 3
    407    pslldq          m4, 3
    408    punpcklbw       m1, m3
    409    punpckhbw       m4, m3
    410    paddw           m1, m4
    411    pmullw          m1, m13
    412    paddw           m0, m1
    413    psllw           m2, 7
    414    paddw           m2, m10
    415    paddsw          m0, m2
    416    psrldq          m1, m5, 1
    417    pslldq          m2, m5, 1
    418    punpcklbw       m1, m3
    419    punpckhbw       m2, m3
    420    paddw           m1, m2
    421    pmullw          m1, m11
    422    psrldq          m2, m5, 2
    423    pslldq          m4, m5, 2
    424    punpcklbw       m2, m3
    425    punpckhbw       m4, m3
    426    paddw           m2, m4
    427    pmullw          m2, m12
    428    paddw           m1, m2
    429    pshufd          m4, m5, q0321
    430    punpcklbw       m4, m3
    431    pmullw          m2, m14, m4
    432    paddw           m1, m2
    433    psrldq          m2, m5, 3
    434    pslldq          m5, 3
    435    punpcklbw       m2, m3
    436    punpckhbw       m5, m3
    437    paddw           m2, m5
    438    pmullw          m2, m13
    439    paddw           m1, m2
    440    psllw           m4, 7
    441    paddw           m4, m10
    442    paddsw          m1, m4
    443 %endif
    444 %endmacro
    445    %%h7
    446    psraw           m0, 3
    447    psraw           m1, 3
    448    paddw           m0, m15
    449    paddw           m1, m15
    450    mova  [t1+xq*2+ 0], m0
    451    mova  [t1+xq*2+16], m1
    452    add             xq, 16
    453    jl .h_loop
    454    ret
    455 ALIGN function_align
    456 .hv:
    457    add           lpfq, strideq
    458    mov             xq, wq
    459    test         edgeb, 1 ; LR_HAVE_LEFT
    460    jz .hv_extend_left
    461    movifnidn    leftq, leftmp
    462    mova            m4, [lpfq+xq]
    463    movd            m5, [leftq]
    464    add          leftq, 4
    465    pslldq          m4, 4
    466    por             m4, m5
    467    movifnidn   leftmp, leftq
    468    jmp .hv_main
    469 .hv_extend_left:
    470 %if cpuflag(ssse3)
    471    mova            m4, [lpfq+xq]
    472    pshufb          m4, [base+wiener_l_shuf]
    473 %else
    474    mova            m5, [lpfq+xq]
    475    pshufd          m4, m5, q2103
    476    punpcklbw       m5, m5
    477    punpcklwd       m5, m5
    478    movss           m4, m5
    479 %endif
    480    jmp .hv_main
    481 .hv_bottom:
    482    mov             xq, wq
    483    test         edgeb, 1 ; LR_HAVE_LEFT
    484    jz .hv_extend_left
    485 .hv_loop:
    486    movu            m4, [lpfq+xq-4]
    487 .hv_main:
    488    movu            m5, [lpfq+xq+4]
    489    test         edgeb, 2 ; LR_HAVE_RIGHT
    490    jnz .hv_have_right
    491    cmp             xd, -18
    492    jl .hv_have_right
    493    call .extend_right
    494 .hv_have_right:
    495    %%h7
    496 %if ARCH_X86_64
    497    mova            m2, [t4+xq*2]
    498    paddw           m2, [t2+xq*2]
    499 %else
    500    mov             r2, t4
    501    mova            m2, [r2+xq*2]
    502    mov             r2, t2
    503    paddw           m2, [r2+xq*2]
    504    mov             r2, t5
    505 %endif
    506    mova            m3, [t3+xq*2]
    507 %if ARCH_X86_64
    508    mova            m5, [t5+xq*2]
    509 %else
    510    mova            m5, [r2+xq*2]
    511    mov             r2, t6
    512 %endif
    513    paddw           m5, [t1+xq*2]
    514    psraw           m0, 3
    515    psraw           m1, 3
    516    paddw           m0, m15
    517    paddw           m1, m15
    518 %if ARCH_X86_64
    519    paddw           m4, m0, [t6+xq*2]
    520 %else
    521    paddw           m4, m0, [r2+xq*2]
    522    mov             r2, t4
    523 %endif
    524    mova     [t0+xq*2], m0
    525    punpcklwd       m0, m2, m3
    526    pmaddwd         m0, m7
    527    punpckhwd       m2, m3
    528    pmaddwd         m2, m7
    529    punpcklwd       m3, m4, m5
    530    pmaddwd         m3, m6
    531    punpckhwd       m4, m5
    532    pmaddwd         m4, m6
    533    paddd           m0, m3
    534    mova            m3, [t3+xq*2+16]
    535    paddd           m4, m2
    536 %if ARCH_X86_64
    537    mova            m2, [t4+xq*2+16]
    538    paddw           m2, [t2+xq*2+16]
    539    mova            m5, [t5+xq*2+16]
    540 %else
    541    mova            m2, [r2+xq*2+16]
    542    mov             r2, t2
    543    paddw           m2, [r2+xq*2+16]
    544    mov             r2, t5
    545    mova            m5, [r2+xq*2+16]
    546    mov             r2, t6
    547 %endif
    548    paddw           m5, [t1+xq*2+16]
    549    packuswb        m0, m4
    550 %if ARCH_X86_64
    551    paddw           m4, m1, [t6+xq*2+16]
    552 %else
    553    paddw           m4, m1, [r2+xq*2+16]
    554    mov           dstq, dstmp
    555 %endif
    556    mova  [t0+xq*2+16], m1
    557    punpcklwd       m1, m2, m3
    558    pmaddwd         m1, m7
    559    punpckhwd       m2, m3
    560    pmaddwd         m2, m7
    561    punpcklwd       m3, m4, m5
    562    pmaddwd         m3, m6
    563    punpckhwd       m4, m5
    564    pmaddwd         m4, m6
    565    paddd           m1, m3
    566    paddd           m2, m4
    567    packuswb        m1, m2
    568    psrlw           m0, 8
    569    psrlw           m1, 8
    570    packuswb        m0, m1
    571    mova     [dstq+xq], m0
    572    add             xq, 16
    573    jl .hv_loop
    574    add           dstq, strideq
    575 %if ARCH_X86_64
    576    mov             t6, t5
    577    mov             t5, t4
    578    mov             t4, t3
    579    mov             t3, t2
    580    mov             t2, t1
    581    mov             t1, t0
    582    mov             t0, t6
    583 %else
    584    mov          dstmp, dstq
    585    mov             r1, t5
    586    mov             r2, t4
    587    mov             t6, r1
    588    mov             t5, r2
    589    mov             t4, t3
    590    mov             t3, t2
    591    mov             t2, t1
    592    mov             t1, t0
    593    mov             t0, r1
    594 %endif
    595    ret
    596 %if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
    597 .v:
    598    mov             xq, wq
    599 .v_loop:
    600 %if ARCH_X86_64
    601    mova            m1, [t4+xq*2]
    602    paddw           m1, [t2+xq*2]
    603 %else
    604    mov             r2, t4
    605    mova            m1, [r2+xq*2]
    606    mov             r2, t2
    607    paddw           m1, [r2+xq*2]
    608    mov             r2, t6
    609 %endif
    610    mova            m2, [t3+xq*2]
    611    mova            m4, [t1+xq*2]
    612 %if ARCH_X86_64
    613    paddw           m3, m4, [t6+xq*2]
    614    paddw           m4, [t5+xq*2]
    615 %else
    616    paddw           m3, m4, [r2+xq*2]
    617    mov             r2, t5
    618    paddw           m4, [r2+xq*2]
    619    mov             r2, t4
    620 %endif
    621    punpcklwd       m0, m1, m2
    622    pmaddwd         m0, m7
    623    punpckhwd       m1, m2
    624    pmaddwd         m1, m7
    625    punpcklwd       m2, m3, m4
    626    pmaddwd         m2, m6
    627    punpckhwd       m3, m4
    628    pmaddwd         m3, m6
    629    paddd           m0, m2
    630    paddd           m1, m3
    631 %if ARCH_X86_64
    632    mova            m2, [t4+xq*2+16]
    633    paddw           m2, [t2+xq*2+16]
    634 %else
    635    mova            m2, [r2+xq*2+16]
    636    mov             r2, t2
    637    paddw           m2, [r2+xq*2+16]
    638    mov             r2, t6
    639 %endif
    640    mova            m3, [t3+xq*2+16]
    641    mova            m5, [t1+xq*2+16]
    642 %if ARCH_X86_64
    643    paddw           m4, m5, [t6+xq*2+16]
    644    paddw           m5, [t5+xq*2+16]
    645 %else
    646    paddw           m4, m5, [r2+xq*2+16]
    647    mov             r2, t5
    648    paddw           m5, [r2+xq*2+16]
    649    movifnidn     dstq, dstmp
    650 %endif
    651    packuswb        m0, m1
    652    punpcklwd       m1, m2, m3
    653    pmaddwd         m1, m7
    654    punpckhwd       m2, m3
    655    pmaddwd         m2, m7
    656    punpcklwd       m3, m4, m5
    657    pmaddwd         m3, m6
    658    punpckhwd       m4, m5
    659    pmaddwd         m4, m6
    660    paddd           m1, m3
    661    paddd           m2, m4
    662    packuswb        m1, m2
    663    psrlw           m0, 8
    664    psrlw           m1, 8
    665    packuswb        m0, m1
    666    mova     [dstq+xq], m0
    667    add             xq, 16
    668    jl .v_loop
    669    add           dstq, strideq
    670 %if ARCH_X86_64
    671    mov             t6, t5
    672    mov             t5, t4
    673 %else
    674    mov          dstmp, dstq
    675    mov             r1, t5
    676    mov             r2, t4
    677    mov             t6, r1
    678    mov             t5, r2
    679 %endif
    680    mov             t4, t3
    681    mov             t3, t2
    682    mov             t2, t1
    683    ret
    684 %endif
    685 
    686 %if ARCH_X86_64
    687 cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
    688                                                  w, h, edge, flt, x
    689    mov           fltq, r6mp
    690    mov             wd, wm
    691    movifnidn       hd, hm
    692    mov          edged, r7m
    693    movq           m14, [fltq]
    694    add           lpfq, wq
    695    movq            m7, [fltq+16]
    696    add           dstq, wq
    697    mova            m8, [pw_m16380]
    698    lea             t1, [rsp+wq*2+16]
    699    mova           m15, [pw_2056]
    700    neg             wq
    701 %if cpuflag(ssse3)
    702    pshufb         m14, [wiener_init]
    703    mova            m9, [wiener_shufB]
    704    pshufd         m13, m14, q3333  ; x1 x2
    705    mova           m10, [wiener_shufC]
    706    punpcklqdq     m14, m14         ; x3
    707    mova           m11, [wiener_shufD]
    708    mova           m12, [wiener_l_shuf]
    709 %else
    710    punpcklwd      m14, m14
    711    pshufd         m11, m14, q1111 ; x1
    712    pshufd         m13, m14, q2222 ; x2
    713    pshufd         m14, m14, q3333 ; x3
    714 %endif
    715 %else
    716 %if cpuflag(ssse3)
    717    %define stk_off     80
    718 %else
    719    %define m11         [stk+80]
    720    %define stk_off     96
    721 %endif
    722 cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride
    723    %define stk         esp
    724    %define leftmp      [stk+28]
    725    %define m8          [base+pw_m16380]
    726    %define m12         [base+wiener_l_shuf]
    727    %define m14         [stk+48]
    728    mov             r1, r6m ; flt
    729    mov             r0, r0m ; dst
    730    mov             r4, r4m ; w
    731    mov           lpfq, lpfm
    732    mov             r2, r7m ; edge
    733    mov             r5, r5m ; h
    734    movq            m2, [r1+ 0]
    735    movq            m7, [r1+16]
    736    add             r0, r4
    737    mov             r1, r1m ; stride
    738    add           lpfq, r4
    739    mov          edged, r2
    740    mov             r2, r2m ; left
    741    mov          dstmp, r0
    742    lea             t1, [rsp+r4*2+stk_off]
    743    mov             hd, r5
    744    neg             r4
    745    LEA             r6, pb_right_ext_mask+21
    746    mov             wq, r4
    747    mov        strideq, r1
    748    mov         leftmp, r2
    749    mov             r4, r1
    750 %if cpuflag(ssse3)
    751    pshufb          m2, [base+wiener_init]
    752    pshufd          m1, m2, q3333
    753    punpcklqdq      m2, m2
    754 %else
    755    punpcklwd       m2, m2
    756    pshufd          m0, m2, q1111
    757    pshufd          m1, m2, q2222
    758    pshufd          m2, m2, q3333
    759    mova           m11, m0
    760 %endif
    761    mova           m13, m1
    762    mova           m14, m2
    763 %endif
    764    psllw           m7, 5
    765    pshufd          m6, m7, q0000 ; __ y1
    766    pshufd          m7, m7, q1111 ; y2 y3
    767    test         edgeb, 4 ; LR_HAVE_TOP
    768    jz .no_top
    769    call .h_top
    770    add           lpfq, strideq
    771    mov             t4, t1
    772    add             t1, 384*2
    773    call .h_top
    774    lea             xq, [lpfq+tmpstrideq*4]
    775    mov           lpfq, dstmp
    776    mov             t3, t1
    777    add             t1, 384*2
    778    add             xq, tmpstrideq
    779    mov          [rsp], xq ; below
    780    call .h
    781    mov             t2, t1
    782    dec             hd
    783    jz .v1
    784    add           lpfq, strideq
    785    add             t1, 384*2
    786    call .h
    787    dec             hd
    788    jz .v2
    789 .main:
    790    mov             t0, t4
    791 .main_loop:
    792    call .hv
    793    dec             hd
    794    jnz .main_loop
    795    test         edgeb, 8 ; LR_HAVE_BOTTOM
    796    jz .v2
    797    mov           lpfq, [rsp]
    798    call .hv_bottom
    799    add           lpfq, strideq
    800    call .hv_bottom
    801 .end:
    802    RET
    803 .no_top:
    804    lea             t3, [lpfq+tmpstrideq*4]
    805    mov           lpfq, dstmp
    806    lea             t3, [t3+tmpstrideq*2]
    807    mov          [rsp], t3
    808    call .h
    809    mov             t4, t1
    810    mov             t3, t1
    811    mov             t2, t1
    812    dec             hd
    813    jz .v1
    814    add           lpfq, strideq
    815    add             t1, 384*2
    816    call .h
    817    dec             hd
    818    jz .v2
    819    lea             t0, [t1+384*2]
    820    call .hv
    821    dec             hd
    822    jz .v2
    823    add             t0, 384*6
    824    call .hv
    825    dec             hd
    826    jnz .main
    827 .v2:
    828    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
    829    add           dstq, strideq
    830    mov             t4, t3
    831    mov             t3, t2
    832    mov             t2, t1
    833    movifnidn    dstmp, dstq
    834 .v1:
    835    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
    836    jmp .end
    837 .h:
    838    %define stk esp+4
    839    mov             xq, wq
    840    test         edgeb, 1 ; LR_HAVE_LEFT
    841    jz .h_extend_left
    842    movifnidn    leftq, leftmp
    843    mova            m4, [lpfq+xq]
    844    movd            m5, [leftq]
    845    add          leftq, 4
    846    pslldq          m4, 4
    847    por             m4, m5
    848    movifnidn   leftmp, leftq
    849    jmp .h_main
    850 .h_extend_left:
    851 %if cpuflag(ssse3)
    852    mova            m4, [lpfq+xq]
    853    pshufb          m4, m12
    854 %else
    855    mova            m5, [lpfq+xq]
    856    pshufd          m4, m5, q2103
    857    punpcklbw       m5, m5
    858    punpcklwd       m5, m5
    859    movss           m4, m5
    860 %endif
    861    jmp .h_main
    862 .h_top:
    863    mov             xq, wq
    864    test         edgeb, 1 ; LR_HAVE_LEFT
    865    jz .h_extend_left
    866 .h_loop:
    867    movu            m4, [lpfq+xq-4]
    868 .h_main:
    869    movu            m5, [lpfq+xq+4]
    870    test         edgeb, 2 ; LR_HAVE_RIGHT
    871    jnz .h_have_right
    872    cmp             xd, -17
    873    jl .h_have_right
    874    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
    875 .h_have_right:
    876 %macro %%h5 0
    877 %if cpuflag(ssse3)
    878    pshufb          m0, m4, m9
    879    pmaddubsw       m0, m13
    880    pshufb          m1, m5, m9
    881    pmaddubsw       m1, m13
    882    pshufb          m2, m4, m10
    883    pmaddubsw       m2, m13
    884    pshufb          m3, m5, m10
    885    pmaddubsw       m3, m13
    886    pshufb          m4, m11
    887    paddw           m0, m2
    888    pmullw          m2, m14, m4
    889    pshufb          m5, m11
    890    paddw           m1, m3
    891    pmullw          m3, m14, m5
    892    psllw           m4, 7
    893    psllw           m5, 7
    894    paddw           m4, m8
    895    paddw           m5, m8
    896    paddw           m0, m2
    897    paddw           m1, m3
    898    paddsw          m0, m4
    899    paddsw          m1, m5
    900 %else
    901    psrldq          m0, m4, 2
    902    pslldq          m1, m4, 2
    903    pxor            m3, m3
    904    punpcklbw       m0, m3
    905    punpckhbw       m1, m3
    906    paddw           m0, m1
    907    pmullw          m0, m11
    908    pshufd          m2, m4, q0321
    909    punpcklbw       m2, m3
    910    pmullw          m1, m14, m2
    911    paddw           m0, m1
    912    psrldq          m1, m4, 3
    913    pslldq          m4, 3
    914    punpcklbw       m1, m3
    915    punpckhbw       m4, m3
    916    paddw           m1, m4
    917    pmullw          m1, m13
    918    paddw           m0, m1
    919    psllw           m2, 7
    920    paddw           m2, m8
    921    paddsw          m0, m2
    922    psrldq          m1, m5, 2
    923    pslldq          m4, m5, 2
    924    punpcklbw       m1, m3
    925    punpckhbw       m4, m3
    926    paddw           m1, m4
    927    pmullw          m1, m11
    928    pshufd          m4, m5, q0321
    929    punpcklbw       m4, m3
    930    pmullw          m2, m14, m4
    931    paddw           m1, m2
    932    psrldq          m2, m5, 3
    933    pslldq          m5, 3
    934    punpcklbw       m2, m3
    935    punpckhbw       m5, m3
    936    paddw           m2, m5
    937    pmullw          m2, m13
    938    paddw           m1, m2
    939    psllw           m4, 7
    940    paddw           m4, m8
    941    paddsw          m1, m4
    942 %endif
    943 %endmacro
    944    %%h5
    945    psraw           m0, 3
    946    psraw           m1, 3
    947    paddw           m0, m15
    948    paddw           m1, m15
    949    mova  [t1+xq*2+ 0], m0
    950    mova  [t1+xq*2+16], m1
    951    add             xq, 16
    952    jl .h_loop
    953    ret
    954 ALIGN function_align
    955 .hv:
    956    add           lpfq, strideq
    957    mov             xq, wq
    958    test         edgeb, 1 ; LR_HAVE_LEFT
    959    jz .hv_extend_left
    960    movifnidn    leftq, leftmp
    961    mova            m4, [lpfq+xq]
    962    movd            m5, [leftq]
    963    add          leftq, 4
    964    pslldq          m4, 4
    965    por             m4, m5
    966    movifnidn   leftmp, leftq
    967    jmp .hv_main
    968 .hv_extend_left:
    969 %if cpuflag(ssse3)
    970    mova            m4, [lpfq+xq]
    971    pshufb          m4, m12
    972 %else
    973    mova            m5, [lpfq+xq]
    974    pshufd          m4, m5, q2103
    975    punpcklbw       m5, m5
    976    punpcklwd       m5, m5
    977    movss           m4, m5
    978 %endif
    979    jmp .hv_main
    980 .hv_bottom:
    981    mov             xq, wq
    982    test         edgeb, 1 ; LR_HAVE_LEFT
    983    jz .hv_extend_left
    984 .hv_loop:
    985    movu            m4, [lpfq+xq-4]
    986 .hv_main:
    987    movu            m5, [lpfq+xq+4]
    988    test         edgeb, 2 ; LR_HAVE_RIGHT
    989    jnz .hv_have_right
    990    cmp             xd, -17
    991    jl .hv_have_right
    992    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
    993 .hv_have_right:
    994    %%h5
    995    mova            m2, [t3+xq*2]
    996    paddw           m2, [t1+xq*2]
    997    psraw           m0, 3
    998    psraw           m1, 3
    999    paddw           m0, m15
   1000    paddw           m1, m15
   1001 %if ARCH_X86_64
   1002    mova            m3, [t2+xq*2]
   1003    paddw           m4, m0, [t4+xq*2]
   1004 %else
   1005    mov             r2, t2
   1006    mova            m3, [r2+xq*2]
   1007    mov             r2, t4
   1008    paddw           m4, m0, [r2+xq*2]
   1009 %endif
   1010    mova     [t0+xq*2], m0
   1011    punpcklwd       m0, m2, m3
   1012    pmaddwd         m0, m7
   1013    punpckhwd       m2, m3
   1014    pmaddwd         m2, m7
   1015    punpcklwd       m3, m4, m4
   1016    pmaddwd         m3, m6
   1017    punpckhwd       m4, m4
   1018    pmaddwd         m4, m6
   1019    paddd           m0, m3
   1020    paddd           m4, m2
   1021    mova            m2, [t3+xq*2+16]
   1022    paddw           m2, [t1+xq*2+16]
   1023    packuswb        m0, m4
   1024 %if ARCH_X86_64
   1025    mova            m3, [t2+xq*2+16]
   1026    paddw           m4, m1, [t4+xq*2+16]
   1027 %else
   1028    paddw           m4, m1, [r2+xq*2+16]
   1029    mov             r2, t2
   1030    mova            m3, [r2+xq*2+16]
   1031    mov           dstq, dstmp
   1032 %endif
   1033    mova  [t0+xq*2+16], m1
   1034    punpcklwd       m1, m2, m3
   1035    pmaddwd         m1, m7
   1036    punpckhwd       m2, m3
   1037    pmaddwd         m2, m7
   1038    punpcklwd       m3, m4, m4
   1039    pmaddwd         m3, m6
   1040    punpckhwd       m4, m4
   1041    pmaddwd         m4, m6
   1042    paddd           m1, m3
   1043    paddd           m2, m4
   1044    packuswb        m1, m2
   1045    psrlw           m0, 8
   1046    psrlw           m1, 8
   1047    packuswb        m0, m1
   1048    mova     [dstq+xq], m0
   1049    add             xq, 16
   1050    jl .hv_loop
   1051    add           dstq, strideq
   1052    mov             t4, t3
   1053    mov             t3, t2
   1054    mov             t2, t1
   1055    mov             t1, t0
   1056    mov             t0, t4
   1057    movifnidn    dstmp, dstq
   1058    ret
   1059 %if cpuflag(ssse3)
   1060 .v:
   1061    mov             xq, wq
   1062 .v_loop:
   1063    mova            m3, [t1+xq*2]
   1064    paddw           m1, m3, [t3+xq*2]
   1065 %if ARCH_X86_64
   1066    mova            m2, [t2+xq*2]
   1067    paddw           m3, [t4+xq*2]
   1068 %else
   1069    mov             r2, t2
   1070    mova            m2, [r2+xq*2]
   1071    mov             r2, t4
   1072    paddw           m3, [r2+xq*2]
   1073 %endif
   1074    punpcklwd       m0, m1, m2
   1075    pmaddwd         m0, m7
   1076    punpckhwd       m1, m2
   1077    pmaddwd         m1, m7
   1078    punpcklwd       m2, m3
   1079    pmaddwd         m2, m6
   1080    punpckhwd       m3, m3
   1081    pmaddwd         m3, m6
   1082    paddd           m0, m2
   1083    paddd           m1, m3
   1084    mova            m4, [t1+xq*2+16]
   1085    paddw           m2, m4, [t3+xq*2+16]
   1086 %if ARCH_X86_64
   1087    mova            m3, [t2+xq*2+16]
   1088    paddw           m4, [t4+xq*2+16]
   1089 %else
   1090    paddw           m4, [r2+xq*2+16]
   1091    mov             r2, t2
   1092    mova            m3, [r2+xq*2+16]
   1093    mov           dstq, dstmp
   1094 %endif
   1095    packuswb        m0, m1
   1096    punpcklwd       m1, m2, m3
   1097    pmaddwd         m1, m7
   1098    punpckhwd       m2, m3
   1099    pmaddwd         m2, m7
   1100    punpcklwd       m3, m4
   1101    pmaddwd         m3, m6
   1102    punpckhwd       m4, m4
   1103    pmaddwd         m4, m6
   1104    paddd           m1, m3
   1105    paddd           m2, m4
   1106    packuswb        m1, m2
   1107    psrlw           m0, 8
   1108    psrlw           m1, 8
   1109    packuswb        m0, m1
   1110    mova     [dstq+xq], m0
   1111    add             xq, 16
   1112    jl .v_loop
   1113    ret
   1114 %endif
   1115 %endmacro
   1116 
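        ; The WIENER body is expanded twice below, once per INIT_XMM target, so
        ; wiener_filter7_8bpc and wiener_filter5_8bpc each get an sse2 and an
        ; ssse3 version; cpuflag(ssse3) selects the pshufb/pmaddubsw horizontal
        ; filter over the unpack/pmullw fallback.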
   1117 INIT_XMM sse2
   1118 WIENER
   1119 
   1120 INIT_XMM ssse3
   1121 WIENER
   1122 
   1123 ;;;;;;;;;;;;;;;;;;;;;;;;;;
   1124 ;;      self-guided     ;;
   1125 ;;;;;;;;;;;;;;;;;;;;;;;;;;
   1126 
   1127 %macro MUL_32X16X2 6 ; dst[1-2], src[1-2], tmp[1-2]
   1128    pmulhuw         %5, %1, %3
   1129    pmulhuw         %6, %2, %4
   1130    pmullw          %1, %3
   1131    pmullw          %2, %4
   1132    pslld           %5, 16
   1133    pslld           %6, 16
   1134    paddd           %1, %5
   1135    paddd           %2, %6
   1136 %endmacro
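        ; In effect, MUL_32X16X2 above is an unsigned 32x16-bit multiply kept to
        ; 32 bits per lane: dst[1-2] hold packed dwords, src[1-2] hold a 16-bit
        ; factor replicated in both halves of every dword, and the pmullw /
        ; pmulhuw partial products are recombined by the shifted add.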
   1137 
   1138 %macro SGR_CALC_X 9 ; dst, tmp, b[1-2], an[1-2], s, b_mul, pf_256
   1139    pmaddwd         %1, %3, %3             ; b * b
   1140    pmaddwd         %2, %4, %4
   1141    psubd           %5, %1                 ; p
   1142    psubd           %6, %2
   1143    MUL_32X16X2     %5, %6, %7, %7, %1, %2 ; p * s
   1144    pmaddwd         %3, %8                 ; b * b_mul
   1145    pmaddwd         %4, %8
   1146    paddw           %5, %8
   1147    paddw           %6, %8
   1148    psrld           %5, 20                 ; z + 1
   1149    psrld           %6, 20
   1150    cvtdq2ps        %5, %5
   1151    cvtdq2ps        %6, %6
   1152    rcpps           %1, %5                 ; 1 / (z + 1)
   1153    rcpps           %2, %6
   1154    cmpltps         %5, %9
   1155    cmpltps         %6, %9
   1156    mulps           %1, %9                 ; 256 / (z + 1)
   1157    mulps           %2, %9
   1158    packssdw        %5, %6
   1159    cvtps2dq        %1, %1
   1160    cvtps2dq        %2, %2
   1161    psrlw           %5, 8                  ; z < 255 ? 255 : 0
   1162    packssdw        %1, %2
   1163    pminsw          %1, %5                 ; x
   1164 %endmacro
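        ; SGR_CALC_X above derives the per-pixel self-guided weight: p is the
        ; variance-style term an - b*b, p*s is shifted down to z+1, and x is
        ; roughly 256/(z+1), evaluated with a float reciprocal (rcpps) and the
        ; z < 255 clamp instead of a lookup table; b[1-2] are also scaled by
        ; b_mul (pw_164_24 in the 5x5 filter) on the way out.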
   1165 
   1166 %if ARCH_X86_32
   1167 DECLARE_REG_TMP 0, 1, 2, 3, 5
   1168 %if STACK_ALIGNMENT < 16
   1169  %assign extra_stack 5*16
   1170 %else
   1171  %assign extra_stack 3*16
   1172 %endif
   1173 cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
   1174                             dst, stride, left, lpf, w
   1175 %if STACK_ALIGNMENT < 16
   1176  %define dstm         dword [esp+calloff+16*0+4*6]
   1177  %define stridemp     dword [esp+calloff+16*0+4*7]
   1178  %define leftm        dword [esp+calloff+16*3+4*0]
   1179  %define lpfm         dword [esp+calloff+16*3+4*1]
   1180  %define w0m          dword [esp+calloff+16*3+4*2]
   1181  %define hd           dword [esp+calloff+16*3+4*3]
   1182  %define edgeb         byte [esp+calloff+16*3+4*4]
   1183  %define edged        dword [esp+calloff+16*3+4*4]
   1184  %define leftmp leftm
   1185 %else
   1186  %define w0m wm
   1187  %define hd dword r5m
   1188  %define edgeb  byte r7m
   1189  %define edged dword r7m
   1190 %endif
   1191 %define hvsrcm dword [esp+calloff+4*0]
   1192 %define w1m    dword [esp+calloff+4*1]
   1193 %define t0m    dword [esp+calloff+4*2]
   1194 %define t2m    dword [esp+calloff+4*3]
   1195 %define t3m    dword [esp+calloff+4*4]
   1196 %define t4m    dword [esp+calloff+4*5]
   1197 %define  m8 [base+pb_1]
   1198 %define  m9 [esp+calloff+16*2]
   1199 %define m10 [base+pw_164_24]
   1200 %define m11 [base+sgr_lshuf5]
   1201 %define m12 [base+pd_34816]
   1202 %define m13 [base+pb_0to15]
   1203 %define m14 [base+pf_256]
   1204 %define r10 r4
   1205 %define base r6-pw_2056
   1206 %assign calloff 0
   1207 %if STACK_ALIGNMENT < 16
   1208    mov        strideq, [rstk+stack_offset+ 8]
   1209    mov          leftq, [rstk+stack_offset+12]
   1210    mov           lpfq, [rstk+stack_offset+16]
   1211    mov             wd, [rstk+stack_offset+20]
   1212    mov           dstm, dstq
   1213    mov       stridemp, strideq
   1214    mov          leftm, leftq
   1215    mov             r1, [rstk+stack_offset+24]
   1216    mov             r2, [rstk+stack_offset+32]
   1217    mov           lpfm, lpfq
   1218    mov             hd, r1
   1219    mov          edged, r2
   1220 %endif
   1221 %else
   1222 DECLARE_REG_TMP 8, 7, 9, 11, 12
   1223 cglobal sgr_filter_5x5_8bpc, 4, 13, 15, -400*24-16, dst, stride, left, lpf, \
   1224                                                    w, h, edge, params
   1225 %endif
   1226 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16
   1227    mov             wd, wm
   1228 %endif
   1229 %if ARCH_X86_64
   1230    mov        paramsq, r6mp
   1231    movifnidn       hd, hm
   1232    mov          edged, r7m
   1233    movu            m9, [paramsq]
   1234    add           lpfq, wq
   1235    mova            m8, [pb_1]
   1236    lea             t1, [rsp+wq*2+20]
   1237    mova           m10, [pw_164_24]
   1238    add           dstq, wq
   1239    lea             t3, [rsp+wq*4+400*12+16]
   1240    mova           m12, [pd_34816]  ; (1 << 11) + (1 << 15)
   1241    lea             t4, [rsp+wq*2+400*20+16]
   1242    pshufhw         m7, m9, q0000
   1243    pshufb          m9, [pw_256]  ; s0
   1244    punpckhqdq      m7, m7        ; w0
   1245    neg             wq
   1246    mova           m13, [pb_0to15]
   1247    pxor            m6, m6
   1248    mova           m11, [sgr_lshuf5]
   1249    psllw           m7, 4
   1250    movaps         m14, [pf_256]
   1251 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
   1252 %define lpfm [rsp]
   1253 %else
   1254    mov             r1, [rstk+stack_offset+28] ; params
   1255    LEA             r6, pw_2056
   1256    movu            m1, [r1]
   1257    add           lpfm, wq
   1258    lea             t1, [rsp+extra_stack+wq*2+20]
   1259    add           dstq, wq
   1260    lea             t3, [rsp+extra_stack+wq*4+400*12+16]
   1261    mov           dstm, dstq
   1262    lea             t4, [rsp+extra_stack+wq*2+400*20+16]
   1263    mov            t3m, t3
   1264    pshufhw         m7, m1, q0000
   1265    mov            t4m, t4
   1266    pshufb          m1, [base+pw_256] ; s0
   1267    punpckhqdq      m7, m7            ; w0
   1268    psllw           m7, 4
   1269    neg             wq
   1270    mova            m9, m1
   1271    pxor            m6, m6
   1272    mov            w1m, wd
   1273    sub             wd, 2
   1274    mov           lpfq, lpfm
   1275    mov            w0m, wd
   1276 %define strideq r5
   1277 %endif
   1278    test         edgeb, 4 ; LR_HAVE_TOP
   1279    jz .no_top
   1280    call .h_top
   1281    add           lpfq, stridemp
   1282    mov             t2, t1
   1283    call .top_fixup
   1284    add             t1, 400*6
   1285    call .h_top
   1286    movif32    strideq, stridemp
   1287    lea            r10, [lpfq+strideq*4]
   1288    mov           lpfq, dstq
   1289    add            r10, strideq
   1290    mov           lpfm, r10 ; below
   1291    movif32        t0m, t2
   1292    mov             t0, t2
   1293    dec             hd
   1294    jz .height1
   1295    or           edged, 16
   1296    call .h
   1297 .main:
   1298    add           lpfq, stridemp
   1299    movif32         t4, t4m
   1300    call .hv
   1301    call .prep_n
   1302    sub             hd, 2
   1303    jl .extend_bottom
   1304 .main_loop:
   1305    movif32       lpfq, hvsrcm
   1306    add           lpfq, stridemp
   1307 %if ARCH_X86_64
   1308    test            hb, hb
   1309 %else
   1310    mov             r4, hd
   1311    test            r4, r4
   1312 %endif
   1313    jz .odd_height
   1314    call .h
   1315    add           lpfq, stridemp
   1316    call .hv
   1317    movif32       dstq, dstm
   1318    call .n0
   1319    call .n1
   1320    sub             hd, 2
   1321    movif32         t0, t0m
   1322    jge .main_loop
   1323    test         edgeb, 8 ; LR_HAVE_BOTTOM
   1324    jz .extend_bottom
   1325    mov           lpfq, lpfm
   1326    call .h_top
   1327    add           lpfq, stridemp
   1328    call .hv_bottom
   1329 .end:
   1330    movif32       dstq, dstm
   1331    call .n0
   1332    call .n1
   1333 .end2:
   1334    RET
   1335 .height1:
   1336    movif32         t4, t4m
   1337    call .hv
   1338    call .prep_n
   1339    jmp .odd_height_end
   1340 .odd_height:
   1341    call .hv
   1342    movif32       dstq, dstm
   1343    call .n0
   1344    call .n1
   1345 .odd_height_end:
   1346    call .v
   1347    movif32       dstq, dstm
   1348    call .n0
   1349    jmp .end2
   1350 .extend_bottom:
   1351    call .v
   1352    jmp .end
   1353 .no_top:
   1354    movif32    strideq, stridemp
   1355    lea            r10, [lpfq+strideq*4]
   1356    mov           lpfq, dstq
   1357    lea            r10, [r10+strideq*2]
   1358    mov           lpfm, r10
   1359    call .h
   1360    lea             t2, [t1+400*6]
   1361    call .top_fixup
   1362    dec             hd
   1363    jz .no_top_height1
   1364    or           edged, 16
   1365    mov             t0, t1
   1366    mov             t1, t2
   1367    movif32        t0m, t0
   1368    jmp .main
   1369 .no_top_height1:
   1370    movif32         t3, t3m
   1371    movif32         t4, t4m
   1372    call .v
   1373    call .prep_n
   1374    jmp .odd_height_end
   1375 .extend_right:
   1376 %assign stack_offset stack_offset+8
   1377 %assign calloff 8
   1378    movd            m1, wd
   1379    movd            m3, [lpfq-1]
   1380    pshufb          m1, m6
   1381    pshufb          m3, m6
   1382    psubb           m2, m8, m1
   1383    pcmpgtb         m2, m13
   1384    pand            m5, m2
   1385    pandn           m2, m3
   1386    por             m5, m2
   1387    ret
   1388 %assign stack_offset stack_offset-4
   1389 %assign calloff 4
   1390 .h: ; horizontal boxsum
   1391 %if ARCH_X86_64
   1392    lea             wq, [r4-2]
   1393 %else
   1394 %define leftq r4
   1395 %endif
   1396    test         edgeb, 1 ; LR_HAVE_LEFT
   1397    jz .h_extend_left
   1398    movif32      leftq, leftm
   1399    movddup         m4, [leftq-4]
   1400    movif32         wq, w0m
   1401    mova            m5, [lpfq+wq+2]
   1402    add         leftmp, 4
   1403    palignr         m5, m4, 13
   1404    jmp .h_main
   1405 .h_extend_left:
   1406    movif32         wq, w0m
   1407    mova            m5, [lpfq+wq+2]
   1408    pshufb          m5, m11
   1409    jmp .h_main
   1410 .h_top:
   1411 %if ARCH_X86_64
   1412    lea             wq, [r4-2]
   1413 %endif
   1414    test         edgeb, 1 ; LR_HAVE_LEFT
   1415    jz .h_extend_left
   1416    movif32         wq, w0m
   1417 .h_loop:
   1418    movu            m5, [lpfq+wq-1]
   1419 .h_main:
   1420    test         edgeb, 2 ; LR_HAVE_RIGHT
   1421    jnz .h_have_right
   1422    cmp             wd, -10
   1423    jl .h_have_right
   1424    call .extend_right
   1425 .h_have_right:
   1426    punpcklbw       m4, m5, m6
   1427    punpckhbw       m5, m6
   1428    palignr         m2, m5, m4, 2
   1429    paddw           m0, m4, m2
   1430    palignr         m3, m5, m4, 6
   1431    paddw           m0, m3
   1432    punpcklwd       m1, m2, m3
   1433    pmaddwd         m1, m1
   1434    punpckhwd       m2, m3
   1435    pmaddwd         m2, m2
   1436    palignr         m5, m4, 8
   1437    paddw           m0, m5
   1438    punpcklwd       m3, m4, m5
   1439    pmaddwd         m3, m3
   1440    paddd           m1, m3
   1441    punpckhwd       m3, m4, m5
   1442    pmaddwd         m3, m3
   1443    shufps          m4, m5, q2121
   1444    paddw           m0, m4             ; sum
   1445    punpcklwd       m5, m4, m6
   1446    pmaddwd         m5, m5
   1447    punpckhwd       m4, m6
   1448    pmaddwd         m4, m4
   1449    paddd           m2, m3
   1450    test         edgeb, 16             ; y > 0
   1451    jz .h_loop_end
   1452    paddw           m0, [t1+wq*2+400*0]
   1453    paddd           m1, [t1+wq*2+400*2]
   1454    paddd           m2, [t1+wq*2+400*4]
   1455 .h_loop_end:
   1456    paddd           m1, m5             ; sumsq
   1457    paddd           m2, m4
   1458    mova [t1+wq*2+400*0], m0
   1459    mova [t1+wq*2+400*2], m1
   1460    mova [t1+wq*2+400*4], m2
   1461    add             wq, 8
   1462    jl .h_loop
   1463    ret
   1464 .top_fixup:
   1465 %if ARCH_X86_64
   1466    lea             wq, [r4-2]
   1467 %else
   1468    mov             wd, w0m
   1469 %endif
    1470 .top_fixup_loop: ; the sums of the first row need to be doubled
   1471    mova            m0, [t1+wq*2+400*0]
   1472    mova            m1, [t1+wq*2+400*2]
   1473    mova            m2, [t1+wq*2+400*4]
   1474    paddw           m0, m0
   1475    paddd           m1, m1
   1476    paddd           m2, m2
   1477    mova [t2+wq*2+400*0], m0
   1478    mova [t2+wq*2+400*2], m1
   1479    mova [t2+wq*2+400*4], m2
   1480    add             wq, 8
   1481    jl .top_fixup_loop
   1482    ret
   1483 ALIGN function_align
   1484 .hv: ; horizontal boxsum + vertical boxsum + ab
   1485 %if ARCH_X86_64
   1486    lea             wq, [r4-2]
   1487 %else
   1488    mov         hvsrcm, lpfq
   1489 %endif
   1490    test         edgeb, 1 ; LR_HAVE_LEFT
   1491    jz .hv_extend_left
   1492    movif32      leftq, leftm
   1493    movddup         m4, [leftq-4]
   1494    movif32         wq, w0m
   1495    mova            m5, [lpfq+wq+2]
   1496    add         leftmp, 4
   1497    palignr         m5, m4, 13
   1498    jmp .hv_main
   1499 .hv_extend_left:
   1500    movif32         wq, w0m
   1501    mova            m5, [lpfq+wq+2]
   1502    pshufb          m5, m11
   1503    jmp .hv_main
   1504 .hv_bottom:
   1505 %if ARCH_X86_64
   1506    lea             wq, [r4-2]
   1507 %else
   1508    mov         hvsrcm, lpfq
   1509 %endif
   1510    test         edgeb, 1 ; LR_HAVE_LEFT
   1511    jz .hv_extend_left
   1512    movif32         wq, w0m
   1513 %if ARCH_X86_32
   1514    jmp .hv_loop_start
   1515 %endif
   1516 .hv_loop:
   1517    movif32       lpfq, hvsrcm
   1518 .hv_loop_start:
   1519    movu            m5, [lpfq+wq-1]
   1520 .hv_main:
   1521    test         edgeb, 2 ; LR_HAVE_RIGHT
   1522    jnz .hv_have_right
   1523    cmp             wd, -10
   1524    jl .hv_have_right
   1525    call .extend_right
   1526 .hv_have_right:
   1527    movif32         t3, hd
   1528    punpcklbw       m4, m5, m6
   1529    punpckhbw       m5, m6
   1530    palignr         m3, m5, m4, 2
   1531    paddw           m0, m4, m3
   1532    palignr         m1, m5, m4, 6
   1533    paddw           m0, m1
   1534    punpcklwd       m2, m3, m1
   1535    pmaddwd         m2, m2
   1536    punpckhwd       m3, m1
   1537    pmaddwd         m3, m3
   1538    palignr         m5, m4, 8
   1539    paddw           m0, m5
   1540    punpcklwd       m1, m4, m5
   1541    pmaddwd         m1, m1
   1542    paddd           m2, m1
   1543    punpckhwd       m1, m4, m5
   1544    pmaddwd         m1, m1
   1545    shufps          m4, m5, q2121
   1546    paddw           m0, m4            ; h sum
   1547    punpcklwd       m5, m4, m6
   1548    pmaddwd         m5, m5
   1549    punpckhwd       m4, m6
   1550    pmaddwd         m4, m4
   1551    paddd           m3, m1
   1552    paddd           m2, m5            ; h sumsq
   1553    paddd           m3, m4
   1554    paddw           m1, m0, [t1+wq*2+400*0]
   1555    paddd           m4, m2, [t1+wq*2+400*2]
   1556    paddd           m5, m3, [t1+wq*2+400*4]
   1557 %if ARCH_X86_64
   1558    test            hd, hd
   1559 %else
   1560    test            t3, t3
   1561 %endif
   1562    jz .hv_last_row
   1563 .hv_main2:
   1564    paddw           m1, [t2+wq*2+400*0] ; hv sum
   1565    paddd           m4, [t2+wq*2+400*2] ; hv sumsq
   1566    paddd           m5, [t2+wq*2+400*4]
   1567    mova [t0+wq*2+400*0], m0
   1568    pslld           m0, m4, 4
   1569    mova [t0+wq*2+400*2], m2
   1570    mova [t0+wq*2+400*4], m3
   1571    pslld           m2, m4, 3
   1572    paddd           m4, m0
   1573    pslld           m0, m5, 4
   1574    paddd           m4, m2             ; a * 25
   1575    pslld           m2, m5, 3
   1576    paddd           m5, m0
   1577    paddd           m5, m2
   1578    punpcklwd       m0, m1, m6         ; b
   1579    punpckhwd       m1, m6
   1580    SGR_CALC_X      m3, m2, m0, m1, m4, m5, m9, m10, m14
   1581    movif32         t3, t3m
   1582    punpcklwd       m2, m3, m3
   1583    mova   [t4+wq*2+4], m3
   1584    punpckhwd       m3, m3
   1585    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   1586    paddd           m0, m12            ; x * b * 164 + (1 << 11) + (1 << 15)
   1587    paddd           m1, m12
   1588    psrld           m0, 12             ; b
   1589    psrld           m1, 12
   1590    mova  [t3+wq*4+ 8], m0
   1591    mova  [t3+wq*4+24], m1
   1592    add             wq, 8
   1593    jl .hv_loop
   1594    mov             t2, t1
   1595    mov             t1, t0
   1596    mov             t0, t2
   1597    movif32        t0m, t0
   1598    ret
   1599 .hv_last_row: ; esoteric edge case for odd heights
   1600    mova [t1+wq*2+400*0], m1
   1601    paddw             m1, m0
   1602    mova [t1+wq*2+400*2], m4
   1603    paddd             m4, m2
   1604    mova [t1+wq*2+400*4], m5
   1605    paddd             m5, m3
   1606    jmp .hv_main2
   1607 .v: ; vertical boxsum + ab
   1608 %if ARCH_X86_64
   1609    lea             wq, [r4-2]
   1610 %else
   1611    mov             wd, w0m
   1612 %endif
   1613 .v_loop:
   1614    mova            m0, [t1+wq*2+400*0]
   1615    mova            m2, [t1+wq*2+400*2]
   1616    mova            m3, [t1+wq*2+400*4]
   1617    paddw           m1, m0, [t2+wq*2+400*0]
   1618    paddd           m4, m2, [t2+wq*2+400*2]
   1619    paddd           m5, m3, [t2+wq*2+400*4]
   1620    paddw           m0, m0
   1621    paddd           m2, m2
   1622    paddd           m3, m3
   1623    paddw           m1, m0             ; hv sum
   1624    paddd           m4, m2             ; hv sumsq
   1625    pslld           m0, m4, 4
   1626    paddd           m5, m3
   1627    pslld           m2, m4, 3
   1628    paddd           m4, m0
   1629    pslld           m0, m5, 4
   1630    paddd           m4, m2             ; a * 25
   1631    pslld           m2, m5, 3
   1632    paddd           m5, m0
   1633    paddd           m5, m2
   1634    punpcklwd       m0, m1, m6
   1635    punpckhwd       m1, m6
   1636    SGR_CALC_X      m3, m2, m0, m1, m4, m5, m9, m10, m14
   1637    punpcklwd       m2, m3, m3
   1638    mova   [t4+wq*2+4], m3
   1639    punpckhwd       m3, m3
   1640    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   1641    paddd           m0, m12            ; x * b * 164 + (1 << 11) + (1 << 15)
   1642    paddd           m1, m12
   1643    psrld           m0, 12             ; b
   1644    psrld           m1, 12
   1645    mova  [t3+wq*4+ 8], m0
   1646    mova  [t3+wq*4+24], m1
   1647    add             wq, 8
   1648    jl .v_loop
   1649    ret
   1650 .prep_n: ; initial neighbor setup
   1651    movif64         wq, r4
   1652    movif32         wd, w1m
   1653 .prep_n_loop:
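        ; each a/b value is combined with its horizontal neighbours using weights 5,6,5 (the '565' labels below)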
   1654    movu            m0, [t4+wq*2+ 2]
   1655    movu            m3, [t4+wq*2+ 4]
   1656    movu            m1, [t3+wq*4+ 4]
   1657    movu            m4, [t3+wq*4+ 8]
   1658    movu            m2, [t3+wq*4+20]
   1659    movu            m5, [t3+wq*4+24]
   1660    paddw           m3, m0
   1661    paddd           m4, m1
   1662    paddd           m5, m2
   1663    paddw           m3, [t4+wq*2+ 0]
   1664    paddd           m4, [t3+wq*4+ 0]
   1665    paddd           m5, [t3+wq*4+16]
   1666    paddw           m0, m3
   1667    psllw           m3, 2
   1668    paddd           m1, m4
   1669    pslld           m4, 2
   1670    paddd           m2, m5
   1671    pslld           m5, 2
   1672    paddw           m0, m3             ; a 565
   1673    paddd           m1, m4             ; b 565
   1674    paddd           m2, m5
   1675    mova [t4+wq*2+400*2+ 0], m0
   1676    mova [t3+wq*4+400*4+ 0], m1
   1677    mova [t3+wq*4+400*4+16], m2
   1678    add             wq, 8
   1679    jl .prep_n_loop
   1680    ret
   1681 ALIGN function_align
   1682 .n0: ; neighbor + output (even rows)
   1683    movif64         wq, r4
   1684    movif32         wd, w1m
   1685 .n0_loop:
   1686    movu            m0, [t4+wq*2+ 2]
   1687    movu            m3, [t4+wq*2+ 4]
   1688    movu            m1, [t3+wq*4+ 4]
   1689    movu            m4, [t3+wq*4+ 8]
   1690    movu            m2, [t3+wq*4+20]
   1691    movu            m5, [t3+wq*4+24]
   1692    paddw           m3, m0
   1693    paddd           m4, m1
   1694    paddd           m5, m2
   1695    paddw           m3, [t4+wq*2+ 0]
   1696    paddd           m4, [t3+wq*4+ 0]
   1697    paddd           m5, [t3+wq*4+16]
   1698    paddw           m0, m3
   1699    psllw           m3, 2
   1700    paddd           m1, m4
   1701    pslld           m4, 2
   1702    paddd           m2, m5
   1703    pslld           m5, 2
   1704    paddw           m0, m3             ; a 565
   1705    paddd           m1, m4             ; b 565
   1706    paddd           m2, m5
   1707    paddw           m3, m0, [t4+wq*2+400*2+ 0]
   1708    paddd           m4, m1, [t3+wq*4+400*4+ 0]
   1709    paddd           m5, m2, [t3+wq*4+400*4+16]
   1710    mova [t4+wq*2+400*2+ 0], m0
   1711    mova [t3+wq*4+400*4+ 0], m1
   1712    mova [t3+wq*4+400*4+16], m2
   1713    movq            m0, [dstq+wq]
   1714    punpcklbw       m0, m6
   1715    punpcklwd       m1, m0, m6          ; src
   1716    punpcklwd       m2, m3, m6          ; a
   1717    pmaddwd         m2, m1              ; a * src
   1718    punpckhwd       m1, m0, m6
   1719    punpckhwd       m3, m6
   1720    pmaddwd         m3, m1
   1721    psubd           m4, m2              ; b - a * src + (1 << 8)
   1722    psubd           m5, m3
   1723    psrad           m4, 9
   1724    psrad           m5, 9
   1725    packssdw        m4, m5
   1726    pmulhrsw        m4, m7
   1727    paddw           m0, m4
   1728    packuswb        m0, m0
   1729    movq     [dstq+wq], m0
   1730    add             wq, 8
   1731    jl .n0_loop
   1732    add           dstq, stridemp
   1733    ret
   1734 ALIGN function_align
   1735 .n1: ; neighbor + output (odd rows)
   1736    movif64         wq, r4
   1737    movif32         wd, w1m
   1738 .n1_loop:
   1739    movq            m0, [dstq+wq]
   1740    mova            m3, [t4+wq*2+400*2+ 0]
   1741    mova            m4, [t3+wq*4+400*4+ 0]
   1742    mova            m5, [t3+wq*4+400*4+16]
   1743    punpcklbw       m0, m6
   1744    punpcklwd       m1, m0, m6          ; src
   1745    punpcklwd       m2, m3, m6          ; a
   1746    pmaddwd         m2, m1              ; a * src
   1747    punpckhwd       m1, m0, m6
   1748    punpckhwd       m3, m6
   1749    pmaddwd         m3, m1
   1750    psubd           m4, m2              ; b - a * src + (1 << 7)
   1751    psubd           m5, m3
   1752    psrad           m4, 8
   1753    psrad           m5, 8
   1754    packssdw        m4, m5
   1755    pmulhrsw        m4, m7
   1756    paddw           m0, m4
   1757    packuswb        m0, m0
   1758    movq     [dstq+wq], m0
   1759    add             wq, 8
   1760    jl .n1_loop
   1761    add           dstq, stridemp
   1762    movif32       dstm, dstq
   1763    ret
   1764 
   1765 %if ARCH_X86_32
   1766 %if STACK_ALIGNMENT < 16
   1767  %assign extra_stack 4*16
   1768 %else
   1769  %assign extra_stack 2*16
   1770 %endif
   1771 cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
   1772                             dst, stride, left, lpf, w
   1773 %if STACK_ALIGNMENT < 16
   1774  %define dstm         dword [esp+calloff+16*2+4*0]
   1775  %define stridemp     dword [esp+calloff+16*2+4*1]
   1776  %define leftm        dword [esp+calloff+16*2+4*2]
   1777  %define lpfm         dword [esp+calloff+16*2+4*3]
   1778  %define w0m          dword [esp+calloff+16*2+4*4]
   1779  %define hd           dword [esp+calloff+16*2+4*5]
   1780  %define edgeb         byte [esp+calloff+16*2+4*6]
   1781  %define edged        dword [esp+calloff+16*2+4*6]
   1782  %define leftmp leftm
   1783 %else
   1784  %define w0m wm
   1785  %define hd dword r5m
   1786  %define edgeb  byte r7m
   1787  %define edged dword r7m
   1788 %endif
   1789 %define hvsrcm dword [esp+calloff+4*0]
   1790 %define w1m    dword [esp+calloff+4*1]
   1791 %define t3m    dword [esp+calloff+4*2]
   1792 %define t4m    dword [esp+calloff+4*3]
   1793 %define  m8 [base+pb_0to15]
   1794 %define  m9 [esp+calloff+16*1]
   1795 %define m10 [base+pw_455_24]
   1796 %define m11 [base+pd_34816]
   1797 %define m12 m6
   1798 %define m13 [base+sgr_lshuf3]
   1799 %define m14 [base+pf_256]
   1800 %define base r6-pw_2056
   1801 %assign calloff 0
   1802 %if STACK_ALIGNMENT < 16
   1803    mov        strideq, [rstk+stack_offset+ 8]
   1804    mov          leftq, [rstk+stack_offset+12]
   1805    mov           lpfq, [rstk+stack_offset+16]
   1806    mov             wd, [rstk+stack_offset+20]
   1807    mov           dstm, dstq
   1808    mov       stridemp, strideq
   1809    mov          leftm, leftq
   1810    mov             r1, [rstk+stack_offset+24]
   1811    mov             r2, [rstk+stack_offset+32]
   1812    mov           lpfm, lpfq
   1813    mov             hd, r1
   1814    mov          edged, r2
   1815 %endif
   1816 %else
   1817 cglobal sgr_filter_3x3_8bpc, 4, 13, 15, -400*42-8, dst, stride, left, lpf, \
   1818                                                   w, h, edge, params
   1819 %endif
   1820 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16
   1821    mov             wd, wm
   1822 %endif
   1823 %if ARCH_X86_64
   1824    mov        paramsq, r6mp
   1825    mov             hd, hm
   1826    mov          edged, r7m
   1827    movq            m9, [paramsq+4]
   1828    add           lpfq, wq
   1829    lea             t1, [rsp+wq*2+12]
   1830    mova            m8, [pb_0to15]
   1831    add           dstq, wq
   1832    lea             t3, [rsp+wq*4+400*12+8]
   1833    mova           m10, [pw_455_24]
   1834    lea             t4, [rsp+wq*2+400*32+8]
   1835    mova           m11, [pd_34816]
   1836    pshuflw         m7, m9, q3333
   1837    pshufb          m9, [pw_256]  ; s1
   1838    punpcklqdq      m7, m7        ; w1
   1839    neg             wq
   1840    pxor            m6, m6
   1841    mova           m13, [sgr_lshuf3]
   1842    psllw           m7, 4
   1843    movaps         m14, [pf_256]
   1844 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
   1845 %define lpfm [rsp]
   1846 %else
   1847    mov             r1, [rstk+stack_offset+28] ; params
   1848    LEA             r6, pw_2056
   1849    movq            m1, [r1+4]
   1850    add           lpfm, wq
   1851    lea             t1, [rsp+extra_stack+wq*2+20]
   1852    add           dstq, wq
   1853    lea             t3, [rsp+extra_stack+wq*4+400*12+16]
   1854    mov           dstm, dstq
   1855    lea             t4, [rsp+extra_stack+wq*2+400*32+16]
   1856    mov            t3m, t3
   1857    pshuflw         m7, m1, q3333
   1858    mov            t4m, t4
   1859    pshufb          m1, [base+pw_256] ; s1
   1860    punpcklqdq      m7, m7            ; w1
   1861    psllw           m7, 4
   1862    neg             wq
   1863    mova            m9, m1
   1864    pxor            m6, m6
   1865    mov            w1m, wd
   1866    sub             wd, 2
   1867    mov           lpfq, lpfm
   1868    mov            w0m, wd
   1869 %define strideq r5
   1870 %endif
   1871    test         edgeb, 4 ; LR_HAVE_TOP
   1872    jz .no_top
   1873    call .h_top
   1874    add           lpfq, stridemp
   1875    mov             t2, t1
   1876    add             t1, 400*6
   1877    call .h_top
   1878    movif32    strideq, stridemp
   1879    lea            r10, [lpfq+strideq*4]
   1880    mov           lpfq, dstq
   1881    add            r10, strideq
   1882    mov           lpfm, r10 ; below
   1883    movif32         t4, t4m
   1884    call .hv0
   1885 .main:
   1886    dec             hd
   1887    jz .height1
   1888    movif32       lpfq, hvsrcm
   1889    add           lpfq, stridemp
   1890    call .hv1
   1891    call .prep_n
   1892    sub             hd, 2
   1893    jl .extend_bottom
   1894 .main_loop:
   1895    movif32       lpfq, hvsrcm
   1896    add           lpfq, stridemp
   1897    call .hv0
   1898 %if ARCH_X86_64
   1899    test            hb, hb
   1900 %else
   1901    mov             r4, hd
   1902    test            r4, r4
   1903 %endif
   1904    jz .odd_height
   1905    movif32       lpfq, hvsrcm
   1906    add           lpfq, stridemp
   1907    call .hv1
   1908    call .n0
   1909    call .n1
   1910    sub             hd, 2
   1911    jge .main_loop
   1912    test         edgeb, 8 ; LR_HAVE_BOTTOM
   1913    jz .extend_bottom
   1914    mov           lpfq, lpfm
   1915    call .hv0_bottom
   1916    movif32       lpfq, hvsrcm
   1917    add           lpfq, stridemp
   1918    call .hv1_bottom
   1919 .end:
   1920    call .n0
   1921    call .n1
   1922 .end2:
   1923    RET
   1924 .height1:
   1925    call .v1
   1926    call .prep_n
   1927    jmp .odd_height_end
   1928 .odd_height:
   1929    call .v1
   1930    call .n0
   1931    call .n1
   1932 .odd_height_end:
   1933    call .v0
   1934    call .v1
   1935    call .n0
   1936    jmp .end2
   1937 .extend_bottom:
   1938    call .v0
   1939    call .v1
   1940    jmp .end
   1941 .no_top:
   1942    movif32    strideq, stridemp
   1943    lea            r10, [lpfq+strideq*4]
   1944    mov           lpfq, dstq
   1945    lea            r10, [r10+strideq*2]
   1946    mov           lpfm, r10
   1947    call .h
   1948 %if ARCH_X86_64
   1949    lea             wq, [r4-2]
   1950 %else
   1951    mov             wq, w0m
   1952    mov         hvsrcm, lpfq
   1953 %endif
   1954    lea             t2, [t1+400*6]
   1955 .top_fixup_loop:
   1956    mova            m0, [t1+wq*2+400*0]
   1957    mova            m1, [t1+wq*2+400*2]
   1958    mova            m2, [t1+wq*2+400*4]
   1959    mova [t2+wq*2+400*0], m0
   1960    mova [t2+wq*2+400*2], m1
   1961    mova [t2+wq*2+400*4], m2
   1962    add             wq, 8
   1963    jl .top_fixup_loop
   1964    movif32         t3, t3m
   1965    movif32         t4, t4m
   1966    call .v0
   1967    jmp .main
   1968 .extend_right:
   1969 %assign stack_offset stack_offset+8
   1970 %assign calloff 8
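        ; right-edge padding: build a byte mask of the lanes past the end of the row and fill them with the last valid pixel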
   1971    movd            m0, [lpfq-1]
   1972    movd            m1, wd
   1973    mova            m3, m8
   1974    pshufb          m0, m6
   1975    pshufb          m1, m6
   1976    mova            m2, m6
   1977    psubb           m2, m1
   1978    pcmpgtb         m2, m3
   1979    pand            m5, m2
   1980    pandn           m2, m0
   1981    por             m5, m2
   1982    ret
   1983 %assign stack_offset stack_offset-4
   1984 %assign calloff 4
   1985 .h: ; horizontal boxsum
   1986 %if ARCH_X86_64
   1987    lea             wq, [r4-2]
   1988 %else
   1989 %define leftq r4
   1990 %endif
   1991    test         edgeb, 1 ; LR_HAVE_LEFT
   1992    jz .h_extend_left
   1993    movif32      leftq, leftm
   1994    movddup         m4, [leftq-4]
   1995    movif32         wq, w0m
   1996    mova            m5, [lpfq+wq+2]
   1997    add         leftmp, 4
   1998    palignr         m5, m4, 14
   1999    jmp .h_main
   2000 .h_extend_left:
   2001    movif32         wq, w0m
   2002    mova            m5, [lpfq+wq+2]
   2003    pshufb          m5, m13
   2004    jmp .h_main
   2005 .h_top:
   2006 %if ARCH_X86_64
   2007    lea             wq, [r4-2]
   2008 %endif
   2009    test         edgeb, 1 ; LR_HAVE_LEFT
   2010    jz .h_extend_left
   2011    movif32         wq, w0m
   2012 .h_loop:
   2013    movu            m5, [lpfq+wq]
   2014 .h_main:
   2015    test         edgeb, 2 ; LR_HAVE_RIGHT
   2016    jnz .h_have_right
   2017    cmp             wd, -9
   2018    jl .h_have_right
   2019    call .extend_right
   2020 .h_have_right:
   2021    punpcklbw       m4, m5, m6
   2022    punpckhbw       m5, m6
   2023    palignr         m0, m5, m4, 2
   2024    paddw           m1, m4, m0
   2025    punpcklwd       m2, m4, m0
   2026    pmaddwd         m2, m2
   2027    punpckhwd       m3, m4, m0
   2028    pmaddwd         m3, m3
   2029    palignr         m5, m4, 4
   2030    paddw           m1, m5             ; sum
   2031    punpcklwd       m4, m5, m6
   2032    pmaddwd         m4, m4
   2033    punpckhwd       m5, m6
   2034    pmaddwd         m5, m5
   2035    paddd           m2, m4             ; sumsq
   2036    paddd           m3, m5
   2037    mova [t1+wq*2+400*0], m1
   2038    mova [t1+wq*2+400*2], m2
   2039    mova [t1+wq*2+400*4], m3
   2040    add             wq, 8
   2041    jl .h_loop
   2042    ret
   2043 ALIGN function_align
   2044 .hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
   2045 %if ARCH_X86_64
   2046    lea             wq, [r4-2]
   2047 %else
   2048    mov         hvsrcm, lpfq
   2049 %endif
   2050    test         edgeb, 1 ; LR_HAVE_LEFT
   2051    jz .hv0_extend_left
   2052    movif32      leftq, leftm
   2053    movddup         m4, [leftq-4]
   2054    movif32         wq, w0m
   2055    mova            m5, [lpfq+wq+2]
   2056    add         leftmp, 4
   2057    palignr         m5, m4, 14
   2058    jmp .hv0_main
   2059 .hv0_extend_left:
   2060    movif32         wq, w0m
   2061    mova            m5, [lpfq+wq+2]
   2062    pshufb          m5, m13
   2063    jmp .hv0_main
   2064 .hv0_bottom:
   2065 %if ARCH_X86_64
   2066    lea             wq, [r4-2]
   2067 %else
   2068    mov         hvsrcm, lpfq
   2069 %endif
   2070    test         edgeb, 1 ; LR_HAVE_LEFT
   2071    jz .hv0_extend_left
   2072    movif32         wq, w0m
   2073 %if ARCH_X86_32
   2074    jmp .hv0_loop_start
   2075 %endif
   2076 .hv0_loop:
   2077    movif32       lpfq, hvsrcm
   2078 .hv0_loop_start:
   2079    movu            m5, [lpfq+wq]
   2080 .hv0_main:
   2081    test         edgeb, 2 ; LR_HAVE_RIGHT
   2082    jnz .hv0_have_right
   2083    cmp             wd, -9
   2084    jl .hv0_have_right
   2085    call .extend_right
   2086 .hv0_have_right:
   2087    punpcklbw       m4, m5, m6
   2088    punpckhbw       m5, m6
   2089    palignr         m0, m5, m4, 2
   2090    paddw           m1, m4, m0
   2091    punpcklwd       m2, m4, m0
   2092    pmaddwd         m2, m2
   2093    punpckhwd       m3, m4, m0
   2094    pmaddwd         m3, m3
   2095    palignr         m5, m4, 4
   2096    paddw           m1, m5             ; sum
   2097    punpcklwd       m4, m5, m6
   2098    pmaddwd         m4, m4
   2099    punpckhwd       m5, m6
   2100    pmaddwd         m5, m5
   2101    paddd           m2, m4             ; sumsq
   2102    paddd           m3, m5
   2103    paddw           m0, m1, [t1+wq*2+400*0]
   2104    paddd           m4, m2, [t1+wq*2+400*2]
   2105    paddd           m5, m3, [t1+wq*2+400*4]
   2106    mova [t1+wq*2+400*0], m1
   2107    mova [t1+wq*2+400*2], m2
   2108    mova [t1+wq*2+400*4], m3
   2109    paddw           m1, m0, [t2+wq*2+400*0]
   2110    paddd           m2, m4, [t2+wq*2+400*2]
   2111    paddd           m3, m5, [t2+wq*2+400*4]
   2112    mova [t2+wq*2+400*0], m0
   2113    mova [t2+wq*2+400*2], m4
   2114    mova [t2+wq*2+400*4], m5
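        ; scale the combined sumsq by 9 (the 3x3 box area): 9*x = 8*x + x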
   2115    pslld           m4, m2, 3
   2116    pslld           m5, m3, 3
   2117    paddd           m4, m2             ; a * 9
   2118    paddd           m5, m3
   2119    punpcklwd       m0, m1, m6         ; b
   2120    punpckhwd       m1, m6
   2121    SGR_CALC_X      m3, m2, m0, m1, m4, m5, m9, m10, m14
   2122    movif32         t3, t3m
   2123    punpcklwd       m2, m3, m3
   2124    mova   [t4+wq*2+4], m3
   2125    punpckhwd       m3, m3
   2126    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   2127    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
   2128    paddd           m1, m11
   2129    psrld           m0, 12
   2130    psrld           m1, 12
   2131    mova  [t3+wq*4+ 8], m0
   2132    mova  [t3+wq*4+24], m1
   2133    add             wq, 8
   2134    jl .hv0_loop
   2135    ret
   2136 ALIGN function_align
   2137 .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
   2138 %if ARCH_X86_64
   2139    lea             wq, [r4-2]
   2140 %else
   2141    mov         hvsrcm, lpfq
   2142 %endif
   2143    test         edgeb, 1 ; LR_HAVE_LEFT
   2144    jz .hv1_extend_left
   2145    movif32      leftq, leftm
   2146    movddup         m4, [leftq-4]
   2147    movif32         wq, w0m
   2148    mova            m5, [lpfq+wq+2]
   2149    add         leftmp, 4
   2150    palignr         m5, m4, 14
   2151    jmp .hv1_main
   2152 .hv1_extend_left:
   2153    movif32         wq, w0m
   2154    mova            m5, [lpfq+wq+2]
   2155    pshufb          m5, m13
   2156    jmp .hv1_main
   2157 .hv1_bottom:
   2158 %if ARCH_X86_64
   2159    lea             wq, [r4-2]
   2160 %else
   2161    mov         hvsrcm, lpfq
   2162 %endif
   2163    test         edgeb, 1 ; LR_HAVE_LEFT
   2164    jz .hv1_extend_left
   2165    movif32         wq, w0m
   2166 %if ARCH_X86_32
   2167    jmp .hv1_loop_start
   2168 %endif
   2169 .hv1_loop:
   2170    movif32       lpfq, hvsrcm
   2171 .hv1_loop_start:
   2172    movu            m5, [lpfq+wq]
   2173 .hv1_main:
   2174    test         edgeb, 2 ; LR_HAVE_RIGHT
   2175    jnz .hv1_have_right
   2176    cmp             wd, -9
   2177    jl .hv1_have_right
   2178    call .extend_right
   2179 .hv1_have_right:
   2180    punpcklbw       m4, m5, m6
   2181    punpckhbw       m5, m6
   2182    palignr         m1, m5, m4, 2
   2183    paddw           m0, m4, m1
   2184    punpcklwd       m2, m4, m1
   2185    pmaddwd         m2, m2
   2186    punpckhwd       m3, m4, m1
   2187    pmaddwd         m3, m3
   2188    palignr         m5, m4, 4
   2189    paddw           m0, m5             ; h sum
   2190    punpcklwd       m1, m5, m6
   2191    pmaddwd         m1, m1
   2192    punpckhwd       m5, m6
   2193    pmaddwd         m5, m5
   2194    paddd           m2, m1             ; h sumsq
   2195    paddd           m3, m5
   2196    paddw           m1, m0, [t2+wq*2+400*0]
   2197    paddd           m4, m2, [t2+wq*2+400*2]
   2198    paddd           m5, m3, [t2+wq*2+400*4]
   2199    mova [t2+wq*2+400*0], m0
   2200    mova [t2+wq*2+400*2], m2
   2201    mova [t2+wq*2+400*4], m3
   2202    pslld           m2, m4, 3
   2203    pslld           m3, m5, 3
   2204    paddd           m4, m2             ; a * 9
   2205    paddd           m5, m3
   2206    punpcklwd       m0, m1, m6         ; b
   2207    punpckhwd       m1, m6
   2208    SGR_CALC_X      m3, m2, m0, m1, m4, m5, m9, m10, m14
   2209    movif32         t3, t3m
   2210    punpcklwd       m2, m3, m3
   2211    mova [t4+wq*2+400*2+ 4], m3
   2212    punpckhwd       m3, m3
   2213    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   2214    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
   2215    paddd           m1, m11
   2216    psrld           m0, 12
   2217    psrld           m1, 12
   2218    mova [t3+wq*4+400*4+ 8], m0
   2219    mova [t3+wq*4+400*4+24], m1
   2220    add             wq, 8
   2221    jl .hv1_loop
   2222    mov            r10, t2
   2223    mov             t2, t1
   2224    mov             t1, r10
   2225    ret
   2226 .v0: ; vertical boxsums + ab (even rows)
   2227 %if ARCH_X86_64
   2228    lea             wq, [r4-2]
   2229 %else
   2230    mov             wd, w0m
   2231 %endif
   2232 .v0_loop:
   2233    mova            m0, [t1+wq*2+400*0]
   2234    mova            m4, [t1+wq*2+400*2]
   2235    mova            m5, [t1+wq*2+400*4]
   2236    paddw           m0, m0
   2237    paddd           m4, m4
   2238    paddd           m5, m5
   2239    paddw           m1, m0, [t2+wq*2+400*0]
   2240    paddd           m2, m4, [t2+wq*2+400*2]
   2241    paddd           m3, m5, [t2+wq*2+400*4]
   2242    mova [t2+wq*2+400*0], m0
   2243    mova [t2+wq*2+400*2], m4
   2244    mova [t2+wq*2+400*4], m5
   2245    pslld           m4, m2, 3
   2246    pslld           m5, m3, 3
   2247    paddd           m4, m2             ; a * 9
   2248    paddd           m5, m3
   2249    punpcklwd       m0, m1, m6         ; b
   2250    punpckhwd       m1, m6
   2251    SGR_CALC_X      m3, m2, m0, m1, m4, m5, m9, m10, m14
   2252    punpcklwd       m2, m3, m3
   2253    mova   [t4+wq*2+4], m3
   2254    punpckhwd       m3, m3
   2255    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   2256    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
   2257    paddd           m1, m11
   2258    psrld           m0, 12
   2259    psrld           m1, 12
   2260    mova  [t3+wq*4+ 8], m0
   2261    mova  [t3+wq*4+24], m1
   2262    add             wq, 8
   2263    jl .v0_loop
   2264    ret
   2265 .v1: ; vertical boxsums + ab (odd rows)
   2266 %if ARCH_X86_64
   2267    lea             wq, [r4-2]
   2268 %else
   2269    mov             wd, w0m
   2270 %endif
   2271 .v1_loop:
   2272    mova            m0, [t1+wq*2+400*0]
   2273    mova            m4, [t1+wq*2+400*2]
   2274    mova            m5, [t1+wq*2+400*4]
   2275    paddw           m1, m0, [t2+wq*2+400*0]
   2276    paddd           m2, m4, [t2+wq*2+400*2]
   2277    paddd           m3, m5, [t2+wq*2+400*4]
   2278    mova [t2+wq*2+400*0], m0
   2279    mova [t2+wq*2+400*2], m4
   2280    mova [t2+wq*2+400*4], m5
   2281    pslld           m4, m2, 3
   2282    pslld           m5, m3, 3
   2283    paddd           m4, m2             ; a * 9
   2284    paddd           m5, m3
   2285    punpcklwd       m0, m1, m6         ; b
   2286    punpckhwd       m1, m6
   2287    SGR_CALC_X      m3, m2, m0, m1, m4, m5, m9, m10, m14
   2288    punpcklwd       m2, m3, m3
   2289    mova [t4+wq*2+400*2+ 4], m3
   2290    punpckhwd       m3, m3
   2291    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   2292    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
   2293    paddd           m1, m11
   2294    psrld           m0, 12
   2295    psrld           m1, 12
   2296    mova [t3+wq*4+400*4+ 8], m0
   2297    mova [t3+wq*4+400*4+24], m1
   2298    add             wq, 8
   2299    jl .v1_loop
   2300    mov            r10, t2
   2301    mov             t2, t1
   2302    mov             t1, r10
   2303    ret
   2304 .prep_n: ; initial neighbor setup
   2305    movif64         wq, r4
   2306    movif32         wd, w1m
   2307 .prep_n_loop:
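        ; '444' = 4 * (sum of three adjacent columns); '343' = 444 minus the two outer columns, i.e. a 3,4,3 weighting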
   2308    movu            m0, [t4+wq*2+400*0+ 4]
   2309    movu            m1, [t3+wq*4+400*0+ 8]
   2310    movu            m2, [t3+wq*4+400*0+24]
   2311    movu            m3, [t4+wq*2+400*0+ 2]
   2312    movu            m4, [t3+wq*4+400*0+ 4]
   2313    movu            m5, [t3+wq*4+400*0+20]
   2314    paddw           m0, [t4+wq*2+400*0+ 0]
   2315    paddd           m1, [t3+wq*4+400*0+ 0]
   2316    paddd           m2, [t3+wq*4+400*0+16]
   2317    paddw           m3, m0
   2318    paddd           m4, m1
   2319    paddd           m5, m2
   2320    psllw           m3, 2                ; a[-1] 444
   2321    pslld           m4, 2                ; b[-1] 444
   2322    pslld           m5, 2
   2323    psubw           m3, m0               ; a[-1] 343
   2324    psubd           m4, m1               ; b[-1] 343
   2325    psubd           m5, m2
   2326    mova [t4+wq*2+400*4], m3
   2327    mova [t3+wq*4+400*8+ 0], m4
   2328    mova [t3+wq*4+400*8+16], m5
   2329    movu            m0, [t4+wq*2+400*2+ 4]
   2330    movu            m1, [t3+wq*4+400*4+ 8]
   2331    movu            m2, [t3+wq*4+400*4+24]
   2332    movu            m3, [t4+wq*2+400*2+ 2]
   2333    movu            m4, [t3+wq*4+400*4+ 4]
   2334    movu            m5, [t3+wq*4+400*4+20]
   2335    paddw           m0, [t4+wq*2+400*2+ 0]
   2336    paddd           m1, [t3+wq*4+400*4+ 0]
   2337    paddd           m2, [t3+wq*4+400*4+16]
   2338    paddw           m3, m0
   2339    paddd           m4, m1
   2340    paddd           m5, m2
   2341    psllw           m3, 2                 ; a[ 0] 444
   2342    pslld           m4, 2                 ; b[ 0] 444
   2343    pslld           m5, 2
   2344    mova [t4+wq*2+400* 6], m3
   2345    mova [t3+wq*4+400*12+ 0], m4
   2346    mova [t3+wq*4+400*12+16], m5
   2347    psubw           m3, m0                ; a[ 0] 343
   2348    psubd           m4, m1                ; b[ 0] 343
   2349    psubd           m5, m2
   2350    mova [t4+wq*2+400* 8], m3
   2351    mova [t3+wq*4+400*16+ 0], m4
   2352    mova [t3+wq*4+400*16+16], m5
   2353    add             wq, 8
   2354    jl .prep_n_loop
   2355    ret
   2356 ALIGN function_align
   2357 .n0: ; neighbor + output (even rows)
   2358    movif64         wq, r4
   2359    movif32         wd, w1m
   2360 .n0_loop:
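        ; a = a[-1] 343 + a[ 0] 444 + a[ 1] 343 (likewise for b); the fresh 343/444 terms are stored for the following rows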
   2361    movu            m3, [t4+wq*2+400*0+4]
   2362    movu            m1, [t4+wq*2+400*0+2]
   2363    paddw           m3, [t4+wq*2+400*0+0]
   2364    paddw           m1, m3
   2365    psllw           m1, 2                ; a[ 1] 444
   2366    psubw           m2, m1, m3           ; a[ 1] 343
   2367    paddw           m3, m2, [t4+wq*2+400*4]
   2368    paddw           m3, [t4+wq*2+400*6]
   2369    mova [t4+wq*2+400*4], m2
   2370    mova [t4+wq*2+400*6], m1
   2371    movu            m4, [t3+wq*4+400*0+8]
   2372    movu            m1, [t3+wq*4+400*0+4]
   2373    paddd           m4, [t3+wq*4+400*0+0]
   2374    paddd           m1, m4
   2375    pslld           m1, 2                ; b[ 1] 444
   2376    psubd           m2, m1, m4           ; b[ 1] 343
   2377    paddd           m4, m2, [t3+wq*4+400* 8+ 0]
   2378    paddd           m4, [t3+wq*4+400*12+ 0]
   2379    mova [t3+wq*4+400* 8+ 0], m2
   2380    mova [t3+wq*4+400*12+ 0], m1
   2381    movu            m5, [t3+wq*4+400*0+24]
   2382    movu            m1, [t3+wq*4+400*0+20]
   2383    paddd           m5, [t3+wq*4+400*0+16]
   2384    paddd           m1, m5
   2385    pslld           m1, 2
   2386    psubd           m2, m1, m5
   2387    paddd           m5, m2, [t3+wq*4+400* 8+16]
   2388    paddd           m5, [t3+wq*4+400*12+16]
   2389    mova [t3+wq*4+400* 8+16], m2
   2390    mova [t3+wq*4+400*12+16], m1
   2391    movq            m0, [dstq+wq]
   2392    punpcklbw       m0, m6
   2393    punpcklwd       m1, m0, m6
   2394    punpcklwd       m2, m3, m6
   2395    pmaddwd         m2, m1               ; a * src
   2396    punpckhwd       m1, m0, m6
   2397    punpckhwd       m3, m6
   2398    pmaddwd         m3, m1
   2399    psubd           m4, m2               ; b - a * src + (1 << 8)
   2400    psubd           m5, m3
   2401    psrad           m4, 9
   2402    psrad           m5, 9
   2403    packssdw        m4, m5
   2404    pmulhrsw        m4, m7
   2405    paddw           m0, m4
   2406    packuswb        m0, m0
   2407    movq     [dstq+wq], m0
   2408    add             wq, 8
   2409    jl .n0_loop
   2410    add           dstq, stridemp
   2411    ret
   2412 ALIGN function_align
   2413 .n1: ; neighbor + output (odd rows)
   2414    movif64         wq, r4
   2415    movif32         wd, w1m
   2416 .n1_loop:
   2417    movu            m3, [t4+wq*2+400*2+4]
   2418    movu            m1, [t4+wq*2+400*2+2]
   2419    paddw           m3, [t4+wq*2+400*2+0]
   2420    paddw           m1, m3
   2421    psllw           m1, 2                ; a[ 1] 444
   2422    psubw           m2, m1, m3           ; a[ 1] 343
   2423    paddw           m3, m2, [t4+wq*2+400*6]
   2424    paddw           m3, [t4+wq*2+400*8]
   2425    mova [t4+wq*2+400*6], m1
   2426    mova [t4+wq*2+400*8], m2
   2427    movu            m4, [t3+wq*4+400*4+8]
   2428    movu            m1, [t3+wq*4+400*4+4]
   2429    paddd           m4, [t3+wq*4+400*4+0]
   2430    paddd           m1, m4
   2431    pslld           m1, 2                ; b[ 1] 444
   2432    psubd           m2, m1, m4           ; b[ 1] 343
   2433    paddd           m4, m2, [t3+wq*4+400*12+ 0]
   2434    paddd           m4, [t3+wq*4+400*16+ 0]
   2435    mova [t3+wq*4+400*12+ 0], m1
   2436    mova [t3+wq*4+400*16+ 0], m2
   2437    movu            m5, [t3+wq*4+400*4+24]
   2438    movu            m1, [t3+wq*4+400*4+20]
   2439    paddd           m5, [t3+wq*4+400*4+16]
   2440    paddd           m1, m5
   2441    pslld           m1, 2
   2442    psubd           m2, m1, m5
   2443    paddd           m5, m2, [t3+wq*4+400*12+16]
   2444    paddd           m5, [t3+wq*4+400*16+16]
   2445    mova [t3+wq*4+400*12+16], m1
   2446    mova [t3+wq*4+400*16+16], m2
   2447    movq            m0, [dstq+wq]
   2448    punpcklbw       m0, m6
   2449    punpcklwd       m1, m0, m6
   2450    punpcklwd       m2, m3, m6
   2451    pmaddwd         m2, m1               ; a * src
   2452    punpckhwd       m1, m0, m6
   2453    punpckhwd       m3, m6
   2454    pmaddwd         m3, m1
   2455    psubd           m4, m2               ; b - a * src + (1 << 8)
   2456    psubd           m5, m3
   2457    psrad           m4, 9
   2458    psrad           m5, 9
   2459    packssdw        m4, m5
   2460    pmulhrsw        m4, m7
   2461    paddw           m0, m4
   2462    packuswb        m0, m0
   2463    movq     [dstq+wq], m0
   2464    add             wq, 8
   2465    jl .n1_loop
   2466    add           dstq, stridemp
   2467    movif32       dstm, dstq
   2468    ret
   2469 
   2470 %if ARCH_X86_32
   2471 %if STACK_ALIGNMENT < 16
   2472  %assign extra_stack 10*16
   2473 %else
   2474  %assign extra_stack 8*16
   2475 %endif
   2476 cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
   2477                             dst, stride, left, lpf, w
   2478 %if STACK_ALIGNMENT < 16
   2479  %define dstm         dword [esp+calloff+16*8+4*0]
   2480  %define stridemp     dword [esp+calloff+16*8+4*1]
   2481  %define leftm        dword [esp+calloff+16*8+4*2]
   2482  %define lpfm         dword [esp+calloff+16*8+4*3]
   2483  %define w0m          dword [esp+calloff+16*8+4*4]
   2484  %define hd           dword [esp+calloff+16*8+4*5]
   2485  %define edgeb         byte [esp+calloff+16*8+4*6]
   2486  %define edged        dword [esp+calloff+16*8+4*6]
   2487  %define leftmp leftm
   2488 %else
   2489  %define w0m wm
   2490  %define hd dword r5m
   2491  %define edgeb  byte r7m
   2492  %define edged dword r7m
   2493 %endif
   2494 %define hvsrcm dword [esp+calloff+4*0]
   2495 %define w1m    dword [esp+calloff+4*1]
   2496 %define t3m    dword [esp+calloff+4*2]
   2497 %define t4m    dword [esp+calloff+4*3]
   2498 %xdefine m8 m6
   2499 %define  m9 [base+pd_0xffff]
   2500 %define m10 [base+pd_34816]
   2501 %define m11 [base+pw_455_24]
   2502 %define m12 [base+pw_164_24]
   2503 %define m13 [esp+calloff+16*4]
   2504 %define m14 [esp+calloff+16*5]
   2505 %define m15 [esp+calloff+16*6]
   2506 %define  m6 [esp+calloff+16*7]
   2507 %define base r6-pw_2056
   2508 %assign calloff 0
   2509 %if STACK_ALIGNMENT < 16
   2510    mov        strideq, [rstk+stack_offset+ 8]
   2511    mov          leftq, [rstk+stack_offset+12]
   2512    mov           lpfq, [rstk+stack_offset+16]
   2513    mov             wd, [rstk+stack_offset+20]
   2514    mov           dstm, dstq
   2515    mov       stridemp, strideq
   2516    mov          leftm, leftq
   2517    mov             r1, [rstk+stack_offset+24]
   2518    mov             r2, [rstk+stack_offset+32]
   2519    mov           lpfm, lpfq
   2520    mov             hd, r1
   2521    mov          edged, r2
   2522 %endif
   2523 %else
   2524 cglobal sgr_filter_mix_8bpc, 4, 13, 16, -400*66-40, dst, stride, left, lpf, \
   2525                                                    w, h, edge, params
   2526 %endif
   2527 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16
   2528    mov             wd, wm
   2529 %endif
   2530 %if ARCH_X86_64
   2531    mov        paramsq, r6mp
   2532    movifnidn       hd, hm
   2533    mov          edged, r7m
   2534    mova           m15, [paramsq]
   2535    add           lpfq, wq
   2536    mova            m9, [pd_0xffff]
   2537    lea             t1, [rsp+wq*2+44]
   2538    mova           m10, [pd_34816]
   2539    add           dstq, wq
   2540    lea             t3, [rsp+wq*4+400*24+40]
   2541    mova           m11, [pw_455_24]
   2542    lea             t4, [rsp+wq*2+400*52+40]
   2543    mova           m12, [pw_164_24]
   2544    neg             wq
   2545    pshuflw        m13, m15, q0000
   2546    pshuflw        m14, m15, q2222
   2547    pshufhw        m15, m15, q1010
   2548    punpcklqdq     m13, m13 ; s0
   2549    punpcklqdq     m14, m14 ; s1
   2550    punpckhqdq     m15, m15 ; w0 w1
   2551    pxor            m6, m6
   2552    psllw          m15, 2
   2553 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
   2554 %define lpfm [rsp]
   2555 %else
   2556    mov             r1, [rstk+stack_offset+28] ; params
   2557    LEA             r6, pw_2056
   2558    mova            m2, [r1]
   2559    add           lpfm, wq
   2560    lea             t1, [rsp+extra_stack+wq*2+52]
   2561    add           dstq, wq
   2562    lea             t3, [rsp+extra_stack+wq*4+400*24+48]
   2563    mov           dstm, dstq
   2564    lea             t4, [rsp+extra_stack+wq*2+400*52+48]
   2565    mov            t3m, t3
   2566    mov            t4m, t4
   2567    neg             wq
   2568    pshuflw         m0, m2, q0000
   2569    pshuflw         m1, m2, q2222
   2570    pshufhw         m2, m2, q1010
   2571    punpcklqdq      m0, m0 ; s0
   2572    punpcklqdq      m1, m1 ; s1
   2573    punpckhqdq      m2, m2 ; w0 w1
   2574    mov            w1m, wd
   2575    pxor            m3, m3
   2576    psllw           m2, 2
   2577    mova           m13, m0
   2578    mova           m14, m1
   2579    sub             wd, 2
   2580    mova           m15, m2
   2581    mova            m6, m3
   2582    mov           lpfq, lpfm
   2583    mov            w0m, wd
   2584 %define strideq r5
   2585 %endif
   2586    test         edgeb, 4 ; LR_HAVE_TOP
   2587    jz .no_top
   2588    call .h_top
   2589    add           lpfq, stridemp
   2590    mov             t2, t1
   2591 %if ARCH_X86_64
   2592    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup
   2593 %else
   2594    mov             wq, w0m
   2595    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop
   2596 %endif
   2597    add             t1, 400*12
   2598    call .h_top
   2599    movif32    strideq, stridemp
   2600    lea            r10, [lpfq+strideq*4]
   2601    mov           lpfq, dstq
   2602    add            r10, strideq
   2603    mov           lpfm, r10 ; below
   2604    movif32         t4, t4m
   2605    call .hv0
   2606 .main:
   2607    dec             hd
   2608    jz .height1
   2609    movif32       lpfq, hvsrcm
   2610    add           lpfq, stridemp
   2611    call .hv1
   2612    call .prep_n
   2613    sub             hd, 2
   2614    jl .extend_bottom
   2615 .main_loop:
   2616    movif32       lpfq, hvsrcm
   2617    add           lpfq, stridemp
   2618    call .hv0
   2619 %if ARCH_X86_64
   2620    test            hd, hd
   2621 %else
   2622    mov             r4, hd
   2623    test            r4, r4
   2624 %endif
   2625    jz .odd_height
   2626    movif32       lpfq, hvsrcm
   2627    add           lpfq, stridemp
   2628    call .hv1
   2629    call .n0
   2630    call .n1
   2631    sub             hd, 2
   2632    jge .main_loop
   2633    test         edgeb, 8 ; LR_HAVE_BOTTOM
   2634    jz .extend_bottom
   2635    mov           lpfq, lpfm
   2636    call .hv0_bottom
   2637    movif32       lpfq, hvsrcm
   2638    add           lpfq, stridemp
   2639    call .hv1_bottom
   2640 .end:
   2641    call .n0
   2642    call .n1
   2643 .end2:
   2644    RET
   2645 .height1:
   2646    call .v1
   2647    call .prep_n
   2648    jmp .odd_height_end
   2649 .odd_height:
   2650    call .v1
   2651    call .n0
   2652    call .n1
   2653 .odd_height_end:
   2654    call .v0
   2655    call .v1
   2656    call .n0
   2657    jmp .end2
   2658 .extend_bottom:
   2659    call .v0
   2660    call .v1
   2661    jmp .end
   2662 .no_top:
   2663    movif32    strideq, stridemp
   2664    lea            r10, [lpfq+strideq*4]
   2665    mov           lpfq, dstq
   2666    lea            r10, [r10+strideq*2]
   2667    mov           lpfm, r10
   2668    call .h
   2669 %if ARCH_X86_64
   2670    lea             wq, [r4-2]
   2671 %else
   2672    mov             wq, w0m
   2673    mov         hvsrcm, lpfq
   2674 %endif
   2675    lea             t2, [t1+400*12]
   2676 .top_fixup_loop:
   2677    mova            m0, [t1+wq*2+400* 0]
   2678    mova            m1, [t1+wq*2+400* 2]
   2679    mova            m2, [t1+wq*2+400* 4]
   2680    paddw           m0, m0
   2681    mova            m3, [t1+wq*2+400* 6]
   2682    paddd           m1, m1
   2683    mova            m4, [t1+wq*2+400* 8]
   2684    paddd           m2, m2
   2685    mova            m5, [t1+wq*2+400*10]
   2686    mova [t2+wq*2+400* 0], m0
   2687    mova [t2+wq*2+400* 2], m1
   2688    mova [t2+wq*2+400* 4], m2
   2689    mova [t2+wq*2+400* 6], m3
   2690    mova [t2+wq*2+400* 8], m4
   2691    mova [t2+wq*2+400*10], m5
   2692    add             wq, 8
   2693    jl .top_fixup_loop
   2694    movif32         t3, t3m
   2695    movif32         t4, t4m
   2696    call .v0
   2697    jmp .main
   2698 .extend_right:
   2699 %assign stack_offset stack_offset+8
   2700 %assign calloff 8
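        ; right-edge padding as above: mask off the lanes past the end of the row and splat the last valid pixel into them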
   2701 %if ARCH_X86_64
   2702    SWAP            m8, m6
   2703 %endif
   2704    movd            m1, wd
   2705    movd            m3, [lpfq-1]
   2706    pshufb          m1, m8
   2707    pshufb          m3, m8
   2708    psubb           m2, [base+pb_1], m1
   2709    pcmpgtb         m2, [base+pb_0to15]
   2710    pand            m5, m2
   2711    pandn           m2, m3
   2712    por             m5, m2
   2713 %if ARCH_X86_64
   2714    SWAP            m6, m8
   2715 %endif
   2716    ret
   2717 %assign stack_offset stack_offset-4
   2718 %assign calloff 4
   2719 .h: ; horizontal boxsum
   2720 %if ARCH_X86_64
   2721    lea             wq, [r4-2]
   2722 %else
   2723 %define leftq r4
   2724 %endif
   2725    test         edgeb, 1 ; LR_HAVE_LEFT
   2726    jz .h_extend_left
   2727    movif32      leftq, leftm
   2728    movddup         m4, [leftq-4]
   2729    movif32         wq, w0m
   2730    mova            m5, [lpfq+wq+2]
   2731    add         leftmp, 4
   2732    palignr         m5, m4, 13
   2733    jmp .h_main
   2734 .h_extend_left:
   2735    movif32         wq, w0m
   2736    mova            m5, [lpfq+wq+2]
   2737    pshufb          m5, [base+sgr_lshuf5]
   2738    jmp .h_main
   2739 .h_top:
   2740 %if ARCH_X86_64
   2741    lea             wq, [r4-2]
   2742 %endif
   2743    test         edgeb, 1 ; LR_HAVE_LEFT
   2744    jz .h_extend_left
   2745    movif32         wq, w0m
   2746 .h_loop:
   2747    movu            m5, [lpfq+wq-1]
   2748 .h_main:
   2749    test         edgeb, 2 ; LR_HAVE_RIGHT
   2750 %if ARCH_X86_32
   2751    pxor            m8, m8
   2752 %else
   2753    SWAP            m8, m6
   2754 %endif
   2755    jnz .h_have_right
   2756    cmp             wd, -10
   2757    jl .h_have_right
   2758    call .extend_right
   2759 .h_have_right:
   2760    punpcklbw       m4, m5, m8
   2761    punpckhbw       m5, m8
   2762    palignr         m3, m5, m4, 2
   2763    palignr         m0, m5, m4, 4
   2764    paddw           m1, m3, m0
   2765    punpcklwd       m2, m3, m0
   2766    pmaddwd         m2, m2
   2767    punpckhwd       m3, m0
   2768    pmaddwd         m3, m3
   2769    palignr         m0, m5, m4, 6
   2770    paddw           m1, m0             ; sum3
   2771    punpcklwd       m7, m0, m8
   2772    pmaddwd         m7, m7
   2773    punpckhwd       m0, m8
   2774    pmaddwd         m0, m0
   2775 %if ARCH_X86_64
   2776    SWAP            m6, m8
   2777 %endif
   2778    paddd           m2, m7             ; sumsq3
   2779    palignr         m5, m4, 8
   2780    punpcklwd       m7, m5, m4
   2781    paddw           m8, m4, m5
   2782    pmaddwd         m7, m7
   2783    punpckhwd       m5, m4
   2784    pmaddwd         m5, m5
   2785    paddd           m3, m0
   2786    mova [t1+wq*2+400* 6], m1
   2787    mova [t1+wq*2+400* 8], m2
   2788    mova [t1+wq*2+400*10], m3
   2789    paddw           m8, m1             ; sum5
   2790    paddd           m7, m2             ; sumsq5
   2791    paddd           m5, m3
   2792    mova [t1+wq*2+400* 0], m8
   2793    mova [t1+wq*2+400* 2], m7
   2794    mova [t1+wq*2+400* 4], m5
   2795    add             wq, 8
   2796    jl .h_loop
   2797    ret
   2798 ALIGN function_align
   2799 .hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
   2800 %if ARCH_X86_64
   2801    lea             wq, [r4-2]
   2802 %else
   2803    mov         hvsrcm, lpfq
   2804 %endif
   2805    test         edgeb, 1 ; LR_HAVE_LEFT
   2806    jz .hv0_extend_left
   2807    movif32      leftq, leftm
   2808    movddup         m4, [leftq-4]
   2809    movif32         wq, w0m
   2810    mova            m5, [lpfq+wq+2]
   2811    add         leftmp, 4
   2812    palignr         m5, m4, 13
   2813    jmp .hv0_main
   2814 .hv0_extend_left:
   2815    movif32         wq, w0m
   2816    mova            m5, [lpfq+wq+2]
   2817    pshufb          m5, [base+sgr_lshuf5]
   2818    jmp .hv0_main
   2819 .hv0_bottom:
   2820 %if ARCH_X86_64
   2821    lea             wq, [r4-2]
   2822 %else
   2823    mov         hvsrcm, lpfq
   2824 %endif
   2825    test         edgeb, 1 ; LR_HAVE_LEFT
   2826    jz .hv0_extend_left
   2827    movif32         wq, w0m
   2828 %if ARCH_X86_32
   2829    jmp .hv0_loop_start
   2830 %endif
   2831 .hv0_loop:
   2832    movif32       lpfq, hvsrcm
   2833 .hv0_loop_start:
   2834    movu            m5, [lpfq+wq-1]
   2835 .hv0_main:
   2836    test         edgeb, 2 ; LR_HAVE_RIGHT
   2837 %if ARCH_X86_32
   2838    pxor            m8, m8
   2839 %else
   2840    SWAP            m8, m6
   2841 %endif
   2842    jnz .hv0_have_right
   2843    cmp             wd, -10
   2844    jl .hv0_have_right
   2845    call .extend_right
   2846 .hv0_have_right:
   2847    punpcklbw       m4, m5, m8
   2848    punpckhbw       m5, m8
   2849    palignr         m3, m5, m4, 2
   2850    palignr         m0, m5, m4, 4
   2851    movif32         t3, t3m
   2852    paddw           m1, m3, m0
   2853    punpcklwd       m2, m3, m0
   2854    pmaddwd         m2, m2
   2855    punpckhwd       m3, m0
   2856    pmaddwd         m3, m3
   2857    palignr         m0, m5, m4, 6
   2858    paddw           m1, m0             ; h sum3
   2859    punpcklwd       m7, m0, m8
   2860    pmaddwd         m7, m7
   2861    punpckhwd       m0, m8
   2862 %if ARCH_X86_64
   2863    SWAP            m6, m8
   2864 %endif
   2865    pmaddwd         m0, m0
   2866    paddd           m2, m7             ; h sumsq3
   2867    palignr         m5, m4, 8
   2868    punpcklwd       m7, m5, m4
   2869    paddw           m8, m4, m5
   2870    pmaddwd         m7, m7
   2871    punpckhwd       m5, m4
   2872    pmaddwd         m5, m5
   2873    paddd           m3, m0
   2874    paddw           m8, m1             ; h sum5
   2875    paddd           m7, m2             ; h sumsq5
   2876    paddd           m5, m3
   2877    mova [t3+wq*4+400*8+ 8], m8
   2878    mova [t3+wq*4+400*0+ 8], m7
   2879    mova [t3+wq*4+400*0+24], m5
   2880    paddw           m8, [t1+wq*2+400* 0]
   2881    paddd           m7, [t1+wq*2+400* 2]
   2882    paddd           m5, [t1+wq*2+400* 4]
   2883    mova [t1+wq*2+400* 0], m8
   2884    mova [t1+wq*2+400* 2], m7
   2885    mova [t1+wq*2+400* 4], m5
   2886    paddw           m0, m1, [t1+wq*2+400* 6]
   2887    paddd           m4, m2, [t1+wq*2+400* 8]
   2888    paddd           m5, m3, [t1+wq*2+400*10]
   2889    mova [t1+wq*2+400* 6], m1
   2890    mova [t1+wq*2+400* 8], m2
   2891    mova [t1+wq*2+400*10], m3
   2892    paddw           m1, m0, [t2+wq*2+400* 6]
   2893    paddd           m2, m4, [t2+wq*2+400* 8]
   2894    paddd           m3, m5, [t2+wq*2+400*10]
   2895    mova [t2+wq*2+400* 6], m0
   2896    mova [t2+wq*2+400* 8], m4
   2897    mova [t2+wq*2+400*10], m5
   2898    pslld           m4, m2, 3
   2899    pslld           m5, m3, 3
   2900    paddd           m4, m2             ; a3 * 9
   2901    paddd           m5, m3
   2902    movaps          m7, [base+pf_256]
   2903 %if ARCH_X86_32
   2904    pxor            m2, m2
   2905    punpcklwd       m0, m1, m2
   2906    punpckhwd       m1, m2
   2907 %else
   2908    punpcklwd       m0, m1, m6         ; b3
   2909    punpckhwd       m1, m6
   2910 %endif
   2911    SGR_CALC_X      m3, m2, m0, m1, m4, m5, m14, m11, m7
   2912    punpcklwd       m2, m3, m3
   2913    mova [t4+wq*2+400*2+ 4], m3
   2914    punpckhwd       m3, m3
   2915    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   2916    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   2917    paddd           m1, m10
   2918    psrld           m0, 12
   2919    psrld           m1, 12
   2920    mova [t3+wq*4+400*4+ 8], m0
   2921    mova [t3+wq*4+400*4+24], m1
   2922    add             wq, 8
   2923    jl .hv0_loop
   2924    ret
   2925 ALIGN function_align
   2926 .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
   2927 %if ARCH_X86_64
   2928    lea             wq, [r4-2]
   2929 %else
   2930    mov         hvsrcm, lpfq
   2931 %endif
   2932    test         edgeb, 1 ; LR_HAVE_LEFT
   2933    jz .hv1_extend_left
   2934    movif32      leftq, leftm
   2935    movddup         m4, [leftq-4]
   2936    movif32         wq, w0m
   2937    mova            m5, [lpfq+wq+2]
   2938    add         leftmp, 4
   2939    palignr         m5, m4, 13
   2940    jmp .hv1_main
   2941 .hv1_extend_left:
   2942    movif32         wq, w0m
   2943    mova            m5, [lpfq+wq+2]
   2944    pshufb          m5, [base+sgr_lshuf5]
   2945    jmp .hv1_main
   2946 .hv1_bottom:
   2947 %if ARCH_X86_64
   2948    lea             wq, [r4-2]
   2949 %else
   2950    mov         hvsrcm, lpfq
   2951 %endif
   2952    test         edgeb, 1 ; LR_HAVE_LEFT
   2953    jz .hv1_extend_left
   2954    movif32         wq, w0m
   2955 %if ARCH_X86_32
   2956    jmp .hv1_loop_start
   2957 %endif
   2958 .hv1_loop:
   2959    movif32       lpfq, hvsrcm
   2960 .hv1_loop_start:
   2961    movu            m5, [lpfq+wq-1]
   2962 .hv1_main:
   2963    test         edgeb, 2 ; LR_HAVE_RIGHT
   2964 %if ARCH_X86_32
   2965    pxor            m8, m8
   2966 %else
   2967    SWAP            m8, m6
   2968 %endif
   2969    jnz .hv1_have_right
   2970    cmp             wd, -10
   2971    jl .hv1_have_right
   2972    call .extend_right
   2973 .hv1_have_right:
   2974    punpcklbw       m4, m5, m8
   2975    punpckhbw       m5, m8
   2976    palignr         m7, m5, m4, 2
   2977    palignr         m3, m5, m4, 4
   2978    paddw           m2, m7, m3
   2979    punpcklwd       m0, m7, m3
   2980    pmaddwd         m0, m0
   2981    punpckhwd       m7, m3
   2982    pmaddwd         m7, m7
   2983    palignr         m3, m5, m4, 6
   2984    paddw           m2, m3             ; h sum3
   2985    punpcklwd       m1, m3, m8
   2986    pmaddwd         m1, m1
   2987    punpckhwd       m3, m8
   2988 %if ARCH_X86_64
   2989    SWAP            m6, m8
   2990 %endif
   2991    pmaddwd         m3, m3
   2992    paddd           m0, m1             ; h sumsq3
   2993    palignr         m5, m4, 8
   2994    punpckhwd       m1, m4, m5
   2995    paddw           m8, m4, m5
   2996    pmaddwd         m1, m1
   2997    punpcklwd       m4, m5
   2998    pmaddwd         m4, m4
   2999    paddd           m7, m3
   3000    paddw           m5, m2, [t2+wq*2+400* 6]
   3001    mova [t2+wq*2+400* 6], m2
   3002    paddw           m8, m2             ; h sum5
   3003    paddd           m2, m0, [t2+wq*2+400* 8]
   3004    paddd           m3, m7, [t2+wq*2+400*10]
   3005    mova [t2+wq*2+400* 8], m0
   3006    mova [t2+wq*2+400*10], m7
   3007    paddd           m4, m0             ; h sumsq5
   3008    paddd           m1, m7
   3009    pslld           m0, m2, 3
   3010    pslld           m7, m3, 3
   3011    paddd           m2, m0             ; a3 * 9
   3012    paddd           m3, m7
   3013 %if ARCH_X86_32
   3014    mova      [esp+20], m8
   3015    pxor            m8, m8
   3016 %else
   3017    SWAP            m8, m6
   3018 %endif
   3019    punpcklwd       m0, m5, m8         ; b3
   3020    punpckhwd       m5, m8
   3021    SGR_CALC_X      m8, m7, m0, m5, m2, m3, m14, m11, [base+pf_256]
   3022    movif32         t3, t3m
   3023    punpcklwd       m2, m8, m8
   3024    mova [t4+wq*2+400*4+ 4], m8
   3025    punpckhwd       m8, m8
   3026    MUL_32X16X2     m0, m5, m2, m8, m3, m7
   3027    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   3028    paddd           m5, m10
   3029    psrld           m0, 12
   3030    psrld           m5, 12
   3031    mova [t3+wq*4+400*8+ 8], m0
   3032    mova [t3+wq*4+400*8+24], m5
   3033 %if ARCH_X86_32
   3034    mova            m8, [esp+20]
   3035 %else
   3036    SWAP            m6, m8
   3037    pxor            m6, m6
   3038 %endif
   3039    paddw           m5, m8, [t2+wq*2+400*0]
   3040    paddd           m2, m4, [t2+wq*2+400*2]
   3041    paddd           m3, m1, [t2+wq*2+400*4]
   3042    paddw           m5, [t1+wq*2+400*0]
   3043    paddd           m2, [t1+wq*2+400*2]
   3044    paddd           m3, [t1+wq*2+400*4]
   3045    mova [t2+wq*2+400*0], m8
   3046    pslld           m0, m2, 4
   3047    mova [t2+wq*2+400*2], m4
   3048    pslld           m8, m3, 4
   3049    mova [t2+wq*2+400*4], m1
   3050    pslld           m4, m2, 3
   3051    paddd           m2, m0
   3052    pslld           m0, m3, 3
   3053    paddd           m3, m8
   3054    paddd           m2, m4             ; a5 * 25
   3055    paddd           m3, m0
   3056 %if ARCH_X86_32
   3057    pxor            m7, m7
   3058    punpcklwd       m0, m5, m7
   3059    punpckhwd       m5, m7
   3060 %else
   3061    punpcklwd       m0, m5, m6         ; b5
   3062    punpckhwd       m5, m6
   3063 %endif
   3064    movaps          m8, [base+pf_256]
   3065    SGR_CALC_X      m1, m4, m0, m5, m2, m3, m13, m12, m8
   3066    punpcklwd       m2, m1, m1
   3067    mova   [t4+wq*2+4], m1
   3068    punpckhwd       m1, m1
   3069    MUL_32X16X2     m0, m5, m2, m1, m3, m4
   3070    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
   3071    paddd           m5, m10
   3072    psrld           m0, 12
   3073    psrld           m5, 12
   3074    mova  [t3+wq*4+ 8], m0
   3075    mova  [t3+wq*4+24], m5
   3076    add             wq, 8
   3077    jl .hv1_loop
   3078    mov            r10, t2
   3079    mov             t2, t1
   3080    mov             t1, r10
   3081    ret
   3082 .v0: ; vertical boxsums + ab3 (even rows)
   3083 %if ARCH_X86_64
   3084    lea             wq, [r4-2]
   3085 %else
   3086    mov             wd, w0m
   3087 %endif
   3088    movaps          m8, [base+pf_256]
   3089 .v0_loop:
   3090    mova            m0, [t1+wq*2+400* 6]
   3091    mova            m4, [t1+wq*2+400* 8]
   3092    mova            m5, [t1+wq*2+400*10]
   3093    paddw           m0, m0
   3094    paddd           m4, m4
   3095    paddd           m5, m5
   3096    paddw           m1, m0, [t2+wq*2+400* 6]
   3097    paddd           m2, m4, [t2+wq*2+400* 8]
   3098    paddd           m3, m5, [t2+wq*2+400*10]
   3099    mova [t2+wq*2+400* 6], m0
   3100    mova [t2+wq*2+400* 8], m4
   3101    mova [t2+wq*2+400*10], m5
   3102    pslld           m4, m2, 3
   3103    pslld           m5, m3, 3
   3104    paddd           m4, m2             ; a3 * 9
   3105    paddd           m5, m3
   3106 %if ARCH_X86_32
   3107    pxor            m7, m7
   3108    punpcklwd       m0, m1, m7
   3109    punpckhwd       m1, m7
   3110 %else
   3111    punpcklwd       m0, m1, m6         ; b3
   3112    punpckhwd       m1, m6
   3113 %endif
   3114    SGR_CALC_X      m3, m2, m0, m1, m4, m5, m14, m11, m8
   3115    punpcklwd       m2, m3, m3
   3116    mova [t4+wq*2+400*2+4], m3
   3117    punpckhwd       m3, m3
   3118    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   3119    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   3120    paddd           m1, m10
   3121    psrld           m0, 12
   3122    psrld           m1, 12
   3123    mova            m3, [t1+wq*2+400*0]
   3124    mova            m4, [t1+wq*2+400*2]
   3125    mova            m5, [t1+wq*2+400*4]
   3126    mova [t3+wq*4+400*8+ 8], m3
   3127    mova [t3+wq*4+400*0+ 8], m4
   3128    mova [t3+wq*4+400*0+24], m5
   3129    paddw           m3, m3 ; cc5
   3130    paddd           m4, m4
   3131    paddd           m5, m5
   3132    mova [t1+wq*2+400*0], m3
   3133    mova [t1+wq*2+400*2], m4
   3134    mova [t1+wq*2+400*4], m5
   3135    mova [t3+wq*4+400*4+ 8], m0
   3136    mova [t3+wq*4+400*4+24], m1
   3137    add             wq, 8
   3138    jl .v0_loop
   3139    ret
   3140 .v1: ; vertical boxsums + ab (odd rows)
   3141 %if ARCH_X86_64
   3142    lea             wq, [r4-2]
   3143 %else
   3144    mov             wd, w0m
   3145 %endif
   3146 .v1_loop:
   3147    mova            m4, [t1+wq*2+400* 6]
   3148    mova            m5, [t1+wq*2+400* 8]
   3149    mova            m7, [t1+wq*2+400*10]
   3150    paddw           m8, m4, [t2+wq*2+400* 6]
   3151    paddd           m2, m5, [t2+wq*2+400* 8]
   3152    paddd           m3, m7, [t2+wq*2+400*10]
   3153    mova [t2+wq*2+400* 6], m4
   3154    mova [t2+wq*2+400* 8], m5
   3155    mova [t2+wq*2+400*10], m7
   3156    pslld           m4, m2, 3
   3157    pslld           m5, m3, 3
   3158    paddd           m4, m2             ; a3 * 9
   3159    paddd           m5, m3
   3160    movaps          m1, [base+pf_256]
   3161 %if ARCH_X86_32
   3162    pxor            m7, m7
   3163    punpcklwd       m0, m8, m7
   3164    punpckhwd       m8, m7
   3165 %else
   3166    punpcklwd       m0, m8, m6         ; b3
   3167    punpckhwd       m8, m6
   3168 %endif
   3169    SGR_CALC_X      m3, m2, m0, m8, m4, m5, m14, m11, m1
   3170    punpcklwd       m2, m3, m3
   3171    mova [t4+wq*2+400*4+4], m3
   3172    punpckhwd       m3, m3
   3173    MUL_32X16X2     m0, m8, m2, m3, m4, m5
   3174    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   3175    paddd           m8, m10
   3176    psrld           m0, 12
   3177    psrld           m8, 12
   3178    mova            m4, [t3+wq*4+400*8+ 8]
   3179    mova            m5, [t3+wq*4+400*0+ 8]
   3180    mova            m7, [t3+wq*4+400*0+24]
   3181    paddw           m1, m4, [t2+wq*2+400*0]
   3182    paddd           m2, m5, [t2+wq*2+400*2]
   3183    paddd           m3, m7, [t2+wq*2+400*4]
   3184    paddw           m1, [t1+wq*2+400*0]
   3185    paddd           m2, [t1+wq*2+400*2]
   3186    paddd           m3, [t1+wq*2+400*4]
   3187    mova [t2+wq*2+400*0], m4
   3188    mova [t2+wq*2+400*2], m5
   3189    mova [t2+wq*2+400*4], m7
   3190    pslld           m4, m2, 4
   3191    mova [t3+wq*4+400*8+ 8], m0
   3192    pslld           m5, m3, 4
   3193    mova [t3+wq*4+400*8+24], m8
   3194    pslld           m7, m2, 3
   3195    paddd           m2, m4
   3196    pslld           m4, m3, 3
   3197    paddd           m3, m5
   3198    paddd           m2, m7             ; a5 * 25
   3199    paddd           m3, m4
   3200    movaps          m8, [base+pf_256]
   3201 %if ARCH_X86_32
   3202    pxor            m7, m7
   3203    punpcklwd       m0, m1, m7
   3204    punpckhwd       m1, m7
   3205 %else
   3206    punpcklwd       m0, m1, m6         ; b5
   3207    punpckhwd       m1, m6
   3208 %endif
   3209    SGR_CALC_X      m5, m4, m0, m1, m2, m3, m13, m12, m8
   3210    punpcklwd       m4, m5, m5
   3211    mova   [t4+wq*2+4], m5
   3212    punpckhwd       m5, m5
   3213    MUL_32X16X2     m0, m1, m4, m5, m2, m3
   3214    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
   3215    paddd           m1, m10
   3216    psrld           m0, 12
   3217    psrld           m1, 12
   3218    mova  [t3+wq*4+ 8], m0
   3219    mova  [t3+wq*4+24], m1
   3220    add             wq, 8
   3221    jl .v1_loop
   3222    mov            r10, t2
   3223    mov             t2, t1
   3224    mov             t1, r10
   3225    ret
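            ; the 565/444/343 names refer to the neighbour weights of the
            ; self-guided filter: each ab5 row is weighted 5,6,5 horizontally,
            ; and the ab3 rows are weighted 3,4,3 or 4,4,4, which together
            ; form the 3x3 kernel  3 4 3 / 4 4 4 / 3 4 3.  .prep_n precomputes
            ; these weighted sums for the rows above the first output row so
            ; that .n0/.n1 only need to fold in one new row per output line.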
   3226 .prep_n: ; initial neighbor setup
   3227    movif64         wq, r4
   3228    movif32         wd, w1m
   3229 .prep_n_loop:
   3230    movu            m0, [t4+wq*2+400*0+ 2]
   3231    movu            m1, [t3+wq*4+400*0+ 4]
   3232    movu            m2, [t3+wq*4+400*0+20]
   3233    movu            m7, [t4+wq*2+400*0+ 4]
   3234    movu            m8, [t3+wq*4+400*0+ 8]
   3235    paddw           m3, m0, [t4+wq*2+400*0+ 0]
   3236    paddd           m4, m1, [t3+wq*4+400*0+ 0]
   3237    paddd           m5, m2, [t3+wq*4+400*0+16]
   3238    paddw           m3, m7
   3239    paddd           m4, m8
   3240    movu            m7, [t3+wq*4+400*0+24]
   3241    paddw           m0, m3
   3242    paddd           m1, m4
   3243    psllw           m3, 2
   3244    pslld           m4, 2
   3245    paddd           m5, m7
   3246    paddd           m2, m5
   3247    pslld           m5, 2
   3248    paddw           m0, m3               ; a5 565
   3249    paddd           m1, m4               ; b5 565
   3250    paddd           m2, m5
   3251    mova [t4+wq*2+400* 6+ 0], m0
   3252    mova [t3+wq*4+400*12+ 0], m1
   3253    mova [t3+wq*4+400*12+16], m2
   3254    movu            m0, [t4+wq*2+400*2+ 4]
   3255    movu            m1, [t3+wq*4+400*4+ 8]
   3256    movu            m2, [t3+wq*4+400*4+24]
   3257    movu            m3, [t4+wq*2+400*2+ 2]
   3258    movu            m4, [t3+wq*4+400*4+ 4]
   3259    movu            m5, [t3+wq*4+400*4+20]
   3260    paddw           m0, [t4+wq*2+400*2+ 0]
   3261    paddd           m1, [t3+wq*4+400*4+ 0]
   3262    paddd           m2, [t3+wq*4+400*4+16]
   3263    paddw           m3, m0
   3264    paddd           m4, m1
   3265    paddd           m5, m2
   3266    psllw           m3, 2                ; a3[-1] 444
   3267    pslld           m4, 2                ; b3[-1] 444
   3268    pslld           m5, 2
   3269    psubw           m3, m0               ; a3[-1] 343
   3270    psubd           m4, m1               ; b3[-1] 343
   3271    psubd           m5, m2
   3272    mova [t4+wq*2+400* 8+ 0], m3
   3273    mova [t3+wq*4+400*16+ 0], m4
   3274    mova [t3+wq*4+400*16+16], m5
   3275    movu            m0, [t4+wq*2+400*4+ 4]
   3276    movu            m1, [t3+wq*4+400*8+ 8]
   3277    movu            m2, [t3+wq*4+400*8+24]
   3278    movu            m3, [t4+wq*2+400*4+ 2]
   3279    movu            m4, [t3+wq*4+400*8+ 4]
   3280    movu            m5, [t3+wq*4+400*8+20]
   3281    paddw           m0, [t4+wq*2+400*4+ 0]
   3282    paddd           m1, [t3+wq*4+400*8+ 0]
   3283    paddd           m2, [t3+wq*4+400*8+16]
   3284    paddw           m3, m0
   3285    paddd           m4, m1
   3286    paddd           m5, m2
   3287    psllw           m3, 2                 ; a3[ 0] 444
   3288    pslld           m4, 2                 ; b3[ 0] 444
   3289    pslld           m5, 2
   3290    mova [t4+wq*2+400*10+ 0], m3
   3291    mova [t3+wq*4+400*20+ 0], m4
   3292    mova [t3+wq*4+400*20+16], m5
   3293    psubw           m3, m0                ; a3[ 0] 343
   3294    psubd           m4, m1                ; b3[ 0] 343
   3295    psubd           m5, m2
   3296    mova [t4+wq*2+400*12+ 0], m3
   3297    mova [t3+wq*4+400*24+ 0], m4
   3298    mova [t3+wq*4+400*24+16], m5
   3299    add             wq, 8
   3300    jl .prep_n_loop
   3301    ret
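            ; even output rows: a new 565 row (ab5) and a new 343/444 column
            ; (ab3) are rotated into the running neighbour sums, then each
            ; pixel is filtered: the two b - a*src terms are shifted into
            ; matching word halves and merged with the mask in m9, so a single
            ; pmaddwd against the sgrproj weight pair (held in m15) applies
            ; both weights at once; the result is rounded with 4096, shifted
            ; right by 13, added back to the source pixel and packed to 8-bit.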
   3302 ALIGN function_align
   3303 .n0: ; neighbor + output (even rows)
   3304    movif64         wq, r4
   3305    movif32         wd, w1m
   3306 .n0_loop:
   3307    movu            m0, [t4+wq*2+ 4]
   3308    movu            m2, [t4+wq*2+ 2]
   3309    paddw           m0, [t4+wq*2+ 0]
   3310    paddw           m0, m2
   3311    paddw           m2, m0
   3312    psllw           m0, 2
   3313    paddw           m0, m2               ; a5
   3314    movu            m4, [t3+wq*4+ 8]
   3315    movu            m5, [t3+wq*4+24]
   3316    movu            m1, [t3+wq*4+ 4]
   3317    movu            m3, [t3+wq*4+20]
   3318    paddd           m4, [t3+wq*4+ 0]
   3319    paddd           m5, [t3+wq*4+16]
   3320    paddd           m4, m1
   3321    paddd           m5, m3
   3322    paddd           m1, m4
   3323    paddd           m3, m5
   3324    pslld           m4, 2
   3325    pslld           m5, 2
   3326    paddd           m4, m1               ; b5
   3327    paddd           m5, m3
   3328    movu            m2, [t4+wq*2+400* 6]
   3329    paddw           m2, m0
   3330    mova [t4+wq*2+400* 6], m0
   3331    paddd           m0, m4, [t3+wq*4+400*12+ 0]
   3332    paddd           m1, m5, [t3+wq*4+400*12+16]
   3333    mova [t3+wq*4+400*12+ 0], m4
   3334    mova [t3+wq*4+400*12+16], m5
   3335    mova [rsp+16+ARCH_X86_32*4], m1
   3336    movu            m3, [t4+wq*2+400*2+4]
   3337    movu            m5, [t4+wq*2+400*2+2]
   3338    paddw           m3, [t4+wq*2+400*2+0]
   3339    paddw           m5, m3
   3340    psllw           m5, 2                ; a3[ 1] 444
   3341    psubw           m4, m5, m3           ; a3[ 1] 343
   3342    movu            m3, [t4+wq*2+400* 8]
   3343    paddw           m3, [t4+wq*2+400*10]
   3344    paddw           m3, m4
   3345    mova [t4+wq*2+400* 8], m4
   3346    mova [t4+wq*2+400*10], m5
   3347    movu            m1, [t3+wq*4+400*4+ 8]
   3348    movu            m5, [t3+wq*4+400*4+ 4]
   3349    movu            m7, [t3+wq*4+400*4+24]
   3350    movu            m8, [t3+wq*4+400*4+20]
   3351    paddd           m1, [t3+wq*4+400*4+ 0]
   3352    paddd           m7, [t3+wq*4+400*4+16]
   3353    paddd           m5, m1
   3354    paddd           m8, m7
   3355    pslld           m5, 2                ; b3[ 1] 444
   3356    pslld           m8, 2
   3357    psubd           m4, m5, m1           ; b3[ 1] 343
   3358 %if ARCH_X86_32
   3359    mova      [esp+52], m8
   3360    psubd           m8, m7
   3361 %else
   3362    psubd           m6, m8, m7
   3363    SWAP            m8, m6
   3364 %endif
   3365    paddd           m1, m4, [t3+wq*4+400*16+ 0]
   3366    paddd           m7, m8, [t3+wq*4+400*16+16]
   3367    paddd           m1, [t3+wq*4+400*20+ 0]
   3368    paddd           m7, [t3+wq*4+400*20+16]
   3369    mova [t3+wq*4+400*16+ 0], m4
   3370    mova [t3+wq*4+400*16+16], m8
   3371    mova [t3+wq*4+400*20+ 0], m5
   3372 %if ARCH_X86_32
   3373    mova            m8, [esp+52]
   3374 %else
   3375    SWAP            m8, m6
   3376    pxor            m6, m6
   3377 %endif
   3378    mova [t3+wq*4+400*20+16], m8
   3379    mova [rsp+32+ARCH_X86_32*4], m7
   3380    movq            m4, [dstq+wq]
   3381    punpcklbw       m4, m6
   3382    punpcklwd       m5, m4, m6
   3383    punpcklwd       m7, m2, m6
   3384    pmaddwd         m7, m5               ; a5 * src
   3385    punpcklwd       m8, m3, m6
   3386    pmaddwd         m8, m5               ; a3 * src
   3387    punpckhwd       m5, m4, m6
   3388    punpckhwd       m2, m6
   3389    pmaddwd         m2, m5
   3390    punpckhwd       m3, m6
   3391    pmaddwd         m3, m5
   3392    psubd           m0, m7               ; b5 - a5 * src + (1 << 8) - (src << 13)
   3393    psubd           m1, m8               ; b3 - a3 * src + (1 << 8) - (src << 13)
   3394    psrld           m0, 9
   3395    pslld           m1, 7
   3396    pand            m0, m9
   3397    pandn           m8, m9, m1
   3398    por             m0, m8
   3399    mova            m1, [rsp+16+ARCH_X86_32*4]
   3400    psubd           m1, m2
   3401    mova            m2, [rsp+32+ARCH_X86_32*4]
   3402    psubd           m2, m3
   3403    mova            m3, [base+pd_4096]
   3404    psrld           m1, 9
   3405    pslld           m2, 7
   3406    pand            m1, m9
   3407    pandn           m5, m9, m2
   3408    por             m1, m5
   3409    pmaddwd         m0, m15
   3410    pmaddwd         m1, m15
   3411    paddd           m0, m3
   3412    paddd           m1, m3
   3413    psrad           m0, 13
   3414    psrad           m1, 13
   3415    packssdw        m0, m1
   3416    paddw           m0, m4
   3417    packuswb        m0, m0
   3418    movq     [dstq+wq], m0
   3419    add             wq, 8
   3420    jl .n0_loop
   3421    add           dstq, stridemp
   3422    ret
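            ; odd output rows: the ab5 values stored by .n0 are reused as-is
            ; (a single 565 row instead of the two-row sum, hence the shift by
            ; 8 rather than 9); only the 3x3 neighbour sums advance by one row
            ; before the same weighting, rounding and packing as in .n0.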
   3423 ALIGN function_align
   3424 .n1: ; neighbor + output (odd rows)
   3425    movif64         wq, r4
   3426    movif32         wd, w1m
   3427 .n1_loop:
   3428    movu            m3, [t4+wq*2+400*4+4]
   3429    movu            m5, [t4+wq*2+400*4+2]
   3430    paddw           m3, [t4+wq*2+400*4+0]
   3431    paddw           m5, m3
   3432    psllw           m5, 2                ; a3[ 1] 444
   3433    psubw           m4, m5, m3           ; a3[ 1] 343
   3434    paddw           m3, m4, [t4+wq*2+400*12]
   3435    paddw           m3, [t4+wq*2+400*10]
   3436    mova [t4+wq*2+400*10], m5
   3437    mova [t4+wq*2+400*12], m4
   3438    movu            m1, [t3+wq*4+400*8+ 8]
   3439    movu            m5, [t3+wq*4+400*8+ 4]
   3440    movu            m7, [t3+wq*4+400*8+24]
   3441    movu            m8, [t3+wq*4+400*8+20]
   3442    paddd           m1, [t3+wq*4+400*8+ 0]
   3443    paddd           m7, [t3+wq*4+400*8+16]
   3444    paddd           m5, m1
   3445    paddd           m8, m7
   3446    pslld           m5, 2                ; b3[ 1] 444
   3447    pslld           m8, 2
   3448    psubd           m4, m5, m1           ; b3[ 1] 343
   3449    psubd           m0, m8, m7
   3450    paddd           m1, m4, [t3+wq*4+400*24+ 0]
   3451    paddd           m7, m0, [t3+wq*4+400*24+16]
   3452    paddd           m1, [t3+wq*4+400*20+ 0]
   3453    paddd           m7, [t3+wq*4+400*20+16]
   3454    mova [t3+wq*4+400*20+ 0], m5
   3455    mova [t3+wq*4+400*20+16], m8
   3456    mova [t3+wq*4+400*24+ 0], m4
   3457    mova [t3+wq*4+400*24+16], m0
   3458    movq            m5, [dstq+wq]
   3459    mova            m2, [t4+wq*2+400* 6]
   3460    punpcklbw       m5, m6
   3461    punpcklwd       m4, m5, m6
   3462    punpcklwd       m8, m2, m6
   3463    pmaddwd         m8, m4               ; a5 * src
   3464    punpcklwd       m0, m3, m6
   3465    pmaddwd         m0, m4               ; a3 * src
   3466    punpckhwd       m4, m5, m6
   3467    punpckhwd       m2, m6
   3468    pmaddwd         m2, m4
   3469    punpckhwd       m3, m6
   3470    pmaddwd         m3, m4
   3471    psubd           m1, m0               ; b3 - a3 * src + (1 << 8) - (src << 13)
   3472    mova            m0, [t3+wq*4+400*12+ 0]
   3473    psubd           m0, m8               ; b5 - a5 * src + (1 << 8) - (src << 13)
   3474    mova            m4, [t3+wq*4+400*12+16]
   3475    psubd           m4, m2
   3476    psubd           m7, m3
   3477    pslld           m1, 7
   3478    psrld           m0, 8
   3479    psrld           m4, 8
   3480    pslld           m7, 7
   3481    pandn           m3, m9, m1
   3482    pand            m0, m9
   3483    por             m0, m3
   3484    pand            m4, m9
   3485    pandn           m2, m9, m7
   3486    por             m2, m4
   3487    mova            m1, [base+pd_4096]
   3488    pmaddwd         m0, m15
   3489    pmaddwd         m2, m15
   3490    paddd           m0, m1
   3491    paddd           m2, m1
   3492    psrad           m0, 13
   3493    psrad           m2, 13
   3494    packssdw        m0, m2
   3495    paddw           m0, m5
   3496    packuswb        m0, m0
   3497    movq     [dstq+wq], m0
   3498    add             wq, 8
   3499    jl .n1_loop
   3500    add           dstq, stridemp
   3501    movif32       dstm, dstq
   3502    ret