looprestoration16_sse.asm


      1 ; Copyright © 2021, VideoLAN and dav1d authors
      2 ; Copyright © 2021, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 SECTION_RODATA
     30 
     31 wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
     32 wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
     33 wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
     34 wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
     35 wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
     36 wiener_lshuf5: db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
     37 wiener_lshuf7: db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
     38 sgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
     39 sgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
     40 pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
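        ; pshufb control vectors: indices come in byte pairs because pixels are
        ; 16-bit here, and the *lshuf* tables duplicate the leftmost pixel(s)
        ; for rows that have no LR_HAVE_LEFT padding.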
     41 
     42 pb_m14_m13:    times 8 db -14,-13
     43 pb_m10_m9:     times 8 db -10, -9
     44 pb_m6_m5:      times 8 db  -6, -5
     45 pb_m2_m1:      times 8 db  -2, -1
     46 pb_2_3:        times 8 db   2,  3
     47 pb_6_7:        times 8 db   6,  7
     48 pw_256:        times 8 dw 256
     49 pw_1023:       times 8 dw 1023
     50 pw_164_24:     times 4 dw 164, 24
     51 pw_455_24:     times 4 dw 455, 24
     52 pd_8:          times 4 dd 8
     53 pd_4096:       times 4 dd 4096
     54 pd_34816:      times 4 dd 34816
     55 pd_m262128:    times 4 dd -262128
     56 pd_0xffff:     times 4 dd 0xffff
     57 pd_0xfffffff0: times 4 dd 0xfffffff0
     58 pf_256:        times 4 dd 256.0
     59 
     60 wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
     61 wiener_round:  dd 1049600, 1048832
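        ; selected by pixel_max >> 11 (0 = 10 bpc, 1 = 12 bpc): the first word
        ; pair of wiener_shifts upshifts the horizontal filter coefs, the second
        ; is the final pmulhw scale, and wiener_round is the matching vertical
        ; rounding bias (its 1 << 20 term cancels the offset the horizontal
        ; pass bakes into the intermediates).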
     62 
     63 SECTION .text
     64 
     65 %macro movif64 2 ; dst, src
     66 %if ARCH_X86_64
     67    mov             %1, %2
     68 %endif
     69 %endmacro
     70 
     71 %macro movif32 2 ; dst, src
     72 %if ARCH_X86_32
     73    mov             %1, %2
     74 %endif
     75 %endmacro
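        ; movif64/movif32 expand to a mov only on the named architecture,
        ; letting the register-starved 32-bit path keep values in stack slots
        ; while sharing the same code body with the 64-bit path.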
     76 
     77 INIT_XMM ssse3
     78 %if ARCH_X86_32
     79 DECLARE_REG_TMP 5, 6
     80 %if STACK_ALIGNMENT < 16
     81  %assign extra_stack 13*16
     82 %else
     83  %assign extra_stack 12*16
     84 %endif
     85 cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \
     86                              dst, stride, left, lpf, w, flt
     87 %if STACK_ALIGNMENT < 16
     88  %define lpfm        dword [esp+calloff+16*12+ 0]
     89  %define wm          dword [esp+calloff+16*12+ 4]
     90  %define hd          dword [esp+calloff+16*12+ 8]
     91  %define edgeb        byte [esp+calloff+16*12+12]
     92  %define edged       dword [esp+calloff+16*12+12]
     93 %else
     94  %define hd dword r5m
     95  %define edgeb byte r7m
     96 %endif
     97 %define PICmem dword [esp+calloff+4*0]
     98 %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
     99 %define t1m    dword [esp+calloff+4*2]
    100 %define t2m    dword [esp+calloff+4*3]
    101 %define t3m    dword [esp+calloff+4*4]
    102 %define t4m    dword [esp+calloff+4*5]
    103 %define t5m    dword [esp+calloff+4*6]
    104 %define t6m    dword [esp+calloff+4*7]
    105 %define t2 t2m
    106 %define t3 t3m
    107 %define t4 t4m
    108 %define t5 t5m
    109 %define t6 t6m
    110 %define  m8 [esp+calloff+16*2]
    111 %define  m9 [esp+calloff+16*3]
    112 %define m10 [esp+calloff+16*4]
    113 %define m11 [esp+calloff+16*5]
    114 %define m12 [esp+calloff+16*6]
    115 %define m13 [esp+calloff+16*7]
    116 %define m14 [esp+calloff+16*8]
    117 %define m15 [esp+calloff+16*9]
    118 %define r10 r4
    119 %define base t0-wiener_shifts
    120 %assign calloff 0
    121 %if STACK_ALIGNMENT < 16
    122    mov             wd, [rstk+stack_offset+20]
    123    mov             wm, wd
    124    mov             r5, [rstk+stack_offset+24]
    125    mov             hd, r5
    126    mov             r5, [rstk+stack_offset+32]
    127    mov          edged, r5 ; edge
    128 %endif
    129 %else
    130 DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
    131 cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
    132                                                     w, h, edge, flt
    133 %define base
    134 %endif
    135 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    136    movifnidn       wd, wm
    137 %endif
    138 %if ARCH_X86_64
    139    mov           fltq, r6mp
    140    movifnidn       hd, hm
    141    mov          edged, r7m
    142    mov            t3d, r8m ; pixel_max
    143    movq           m13, [fltq]
    144    movq           m15, [fltq+16]
    145 %else
    146 %if STACK_ALIGNMENT < 16
    147    mov             t0, [rstk+stack_offset+28]
    148    mov             t1, [rstk+stack_offset+36] ; pixel_max
    149    movq            m1, [t0]    ; fx
    150    movq            m3, [t0+16] ; fy
    151    LEA             t0, wiener_shifts
    152 %else
    153    mov           fltq, r6m
    154    movq            m1, [fltq]
    155    movq            m3, [fltq+16]
    156    LEA             t0, wiener_shifts
    157    mov             t1, r8m ; pixel_max
    158 %endif
    159    mov         PICmem, t0
    160 %endif
    161    mova            m6, [base+wiener_shufA]
    162    mova            m7, [base+wiener_shufB]
    163 %if ARCH_X86_64
    164    lea             t4, [wiener_shifts]
    165    add             wd, wd
    166    pshufd         m12, m13, q0000 ; x0 x1
    167    pshufd         m13, m13, q1111 ; x2 x3
    168    pshufd         m14, m15, q0000 ; y0 y1
    169    pshufd         m15, m15, q1111 ; y2 y3
    170    mova            m8, [wiener_shufC]
    171    mova            m9, [wiener_shufD]
    172    add           lpfq, wq
    173    lea             t1, [rsp+wq+16]
    174    add           dstq, wq
    175    neg             wq
    176    shr            t3d, 11
    177 %define base t4-wiener_shifts
    178    movd           m10, [base+wiener_round+t3*4]
    179    movq           m11, [base+wiener_shifts+t3*8]
    180    pshufd         m10, m10, q0000
    181    pshufd          m0, m11, q0000
    182    pshufd         m11, m11, q1111
    183    pmullw         m12, m0 ; upshift filter coefs to make the
    184    pmullw         m13, m0 ; horizontal downshift constant
    185 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
    186 %define lpfm [rsp]
    187 %define base
    188 %define wiener_lshuf7_mem [wiener_lshuf7]
    189 %define pd_m262128_mem [pd_m262128]
    190 %else
    191    add             wd, wd
    192    mova            m4, [base+wiener_shufC]
    193    mova            m5, [base+wiener_shufD]
    194    pshufd          m0, m1, q0000
    195    pshufd          m1, m1, q1111
    196    pshufd          m2, m3, q0000
    197    pshufd          m3, m3, q1111
    198    mova            m8, m4
    199    mova            m9, m5
    200    mova           m14, m2
    201    mova           m15, m3
    202    shr             t1, 11
    203    add           lpfq, wq
    204    mova            m3, [base+pd_m262128]
    205    movd            m4, [base+wiener_round+t1*4]
    206    movq            m5, [base+wiener_shifts+t1*8]
    207    lea             t1, [esp+extra_stack+wq+16]
    208    add           dstq, wq
    209    neg             wq
    210    pshufd          m4, m4, q0000
    211    pshufd          m2, m5, q0000
    212    pshufd          m5, m5, q1111
    213    mov             wm, wq
    214    pmullw          m0, m2
    215    pmullw          m1, m2
    216    mova            m2, [base+wiener_lshuf7]
    217 %define pd_m262128_mem [esp+calloff+16*10]
    218    mova pd_m262128_mem, m3
    219    mova           m10, m4
    220    mova           m11, m5
    221    mova           m12, m0
    222    mova           m13, m1
    223 %define wiener_lshuf7_mem [esp+calloff+16*11]
    224    mova wiener_lshuf7_mem, m2
    225 %endif
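        ; t1-t6 point into six 384*2-byte rows of horizontally filtered
        ; intermediates for the 7-tap vertical pass; at the top edge some
        ; pointers alias the same row, and in the main loop the oldest row is
        ; consumed and then overwritten in place through t0.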
    226    test         edgeb, 4 ; LR_HAVE_TOP
    227    jz .no_top
    228    call .h_top
    229    add           lpfq, strideq
    230    mov             t6, t1
    231    mov             t5, t1
    232    add             t1, 384*2
    233    call .h_top
    234    lea            r10, [lpfq+strideq*4]
    235    mov           lpfq, dstq
    236    mov             t4, t1
    237    add             t1, 384*2
    238    add            r10, strideq
    239    mov           lpfm, r10 ; below
    240    call .h
    241    mov             t3, t1
    242    mov             t2, t1
    243    dec             hd
    244    jz .v1
    245    add           lpfq, strideq
    246    add             t1, 384*2
    247    call .h
    248    mov             t2, t1
    249    dec             hd
    250    jz .v2
    251    add           lpfq, strideq
    252    add             t1, 384*2
    253    call .h
    254    dec             hd
    255    jz .v3
    256 .main:
    257    lea             t0, [t1+384*2]
    258 .main_loop:
    259    call .hv
    260    dec             hd
    261    jnz .main_loop
    262    test         edgeb, 8 ; LR_HAVE_BOTTOM
    263    jz .v3
    264    mov           lpfq, lpfm
    265    call .hv_bottom
    266    add           lpfq, strideq
    267    call .hv_bottom
    268 .v1:
    269    call .v
    270    RET
    271 .no_top:
    272    lea            r10, [lpfq+strideq*4]
    273    mov           lpfq, dstq
    274    lea            r10, [r10+strideq*2]
    275    mov           lpfm, r10
    276    call .h
    277    mov             t6, t1
    278    mov             t5, t1
    279    mov             t4, t1
    280    mov             t3, t1
    281    mov             t2, t1
    282    dec             hd
    283    jz .v1
    284    add           lpfq, strideq
    285    add             t1, 384*2
    286    call .h
    287    mov             t2, t1
    288    dec             hd
    289    jz .v2
    290    add           lpfq, strideq
    291    add             t1, 384*2
    292    call .h
    293    dec             hd
    294    jz .v3
    295    lea             t0, [t1+384*2]
    296    call .hv
    297    dec             hd
    298    jz .v3
    299    add             t0, 384*8
    300    call .hv
    301    dec             hd
    302    jnz .main
    303 .v3:
    304    call .v
    305    movif32         wq, wm
    306 .v2:
    307    call .v
    308    movif32         wq, wm
    309    jmp .v1
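        ; .extend_right pads past the last valid pixel: byte indices derived
        ; from the negative width counter are clamped against pb_0to15 so that
        ; pshufb replicates the rightmost valid pixels into m3-m5.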
    310 .extend_right:
    311 %assign stack_offset stack_offset+8
    312 %assign calloff 8
    313    movif32         t0, PICmem
    314    pxor            m0, m0
    315    movd            m1, wd
    316    mova            m2, [base+pb_0to15]
    317    pshufb          m1, m0
    318    mova            m0, [base+pb_6_7]
    319    psubb           m0, m1
    320    pminub          m0, m2
    321    pshufb          m3, m0
    322    mova            m0, [base+pb_m2_m1]
    323    psubb           m0, m1
    324    pminub          m0, m2
    325    pshufb          m4, m0
    326    mova            m0, [base+pb_m10_m9]
    327    psubb           m0, m1
    328    pminub          m0, m2
    329    pshufb          m5, m0
    330    movif32         t0, t0m
    331    ret
    332 %assign stack_offset stack_offset-4
    333 %assign calloff 4
    334 .h:
    335    movif64         wq, r4
    336    movif32         wq, wm
    337    test         edgeb, 1 ; LR_HAVE_LEFT
    338    jz .h_extend_left
    339    movq            m3, [leftq]
    340    movhps          m3, [lpfq+wq]
    341    add          leftq, 8
    342    jmp .h_main
    343 .h_extend_left:
    344    mova            m3, [lpfq+wq]         ; avoid accessing memory located
    345    pshufb          m3, wiener_lshuf7_mem ; before the start of the buffer
    346    jmp .h_main
    347 .h_top:
    348    movif64         wq, r4
    349    test         edgeb, 1 ; LR_HAVE_LEFT
    350    jz .h_extend_left
    351 .h_loop:
    352    movu            m3, [lpfq+wq-8]
    353 .h_main:
    354    mova            m4, [lpfq+wq+0]
    355    movu            m5, [lpfq+wq+8]
    356    test         edgeb, 2 ; LR_HAVE_RIGHT
    357    jnz .h_have_right
    358    cmp             wd, -20
    359    jl .h_have_right
    360    call .extend_right
    361 .h_have_right:
    362    pshufb          m0, m3, m6
    363    pshufb          m1, m4, m7
    364    paddw           m0, m1
    365    pshufb          m3, m8
    366    pmaddwd         m0, m12
    367    pshufb          m1, m4, m9
    368    paddw           m3, m1
    369    pshufb          m1, m4, m6
    370    pmaddwd         m3, m13
    371    pshufb          m2, m5, m7
    372    paddw           m1, m2
    373    mova            m2, pd_m262128_mem ; (1 << 4) - (1 << 18)
    374    pshufb          m4, m8
    375    pmaddwd         m1, m12
    376    pshufb          m5, m9
    377    paddw           m4, m5
    378    pmaddwd         m4, m13
    379    paddd           m0, m2
    380    paddd           m1, m2
    381    paddd           m0, m3
    382    paddd           m1, m4
    383    psrad           m0, 4
    384    psrad           m1, 4
    385    packssdw        m0, m1
    386    psraw           m0, 1
    387    mova       [t1+wq], m0
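        ; the stored intermediate is (sum + (1 << 4) - (1 << 18)) >> 5: the
        ; large negative bias centres it near -(1 << 13) so packssdw cannot
        ; saturate, and the vertical pass rounding (wiener_round) adds it back.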
    388    add             wq, 16
    389    jl .h_loop
    390    movif32         wq, wm
    391    ret
    392 ALIGN function_align
    393 .hv:
    394    add           lpfq, strideq
    395    movif64         wq, r4
    396    movif32        t0m, t0
    397    movif32        t1m, t1
    398    test         edgeb, 1 ; LR_HAVE_LEFT
    399    jz .hv_extend_left
    400    movq            m3, [leftq]
    401    movhps          m3, [lpfq+wq]
    402    add          leftq, 8
    403    jmp .hv_main
    404 .hv_extend_left:
    405    mova            m3, [lpfq+wq]
    406    pshufb          m3, wiener_lshuf7_mem
    407    jmp .hv_main
    408 .hv_bottom:
    409    movif64         wq, r4
    410    movif32        t0m, t0
    411    movif32        t1m, t1
    412    test         edgeb, 1 ; LR_HAVE_LEFT
    413    jz .hv_extend_left
    414 .hv_loop:
    415    movu            m3, [lpfq+wq-8]
    416 .hv_main:
    417    mova            m4, [lpfq+wq+0]
    418    movu            m5, [lpfq+wq+8]
    419    test         edgeb, 2 ; LR_HAVE_RIGHT
    420    jnz .hv_have_right
    421    cmp             wd, -20
    422    jl .hv_have_right
    423    call .extend_right
    424 .hv_have_right:
    425    movif32         t1, t4m
    426    movif32         t0, t2m
    427    pshufb          m0, m3, m6
    428    pshufb          m1, m4, m7
    429    paddw           m0, m1
    430    pshufb          m3, m8
    431    pmaddwd         m0, m12
    432    pshufb          m1, m4, m9
    433    paddw           m3, m1
    434    pshufb          m1, m4, m6
    435    pmaddwd         m3, m13
    436    pshufb          m2, m5, m7
    437    paddw           m1, m2
    438    mova            m2, pd_m262128_mem
    439    pshufb          m4, m8
    440    pmaddwd         m1, m12
    441    pshufb          m5, m9
    442    paddw           m4, m5
    443    pmaddwd         m4, m13
    444    paddd           m0, m2
    445    paddd           m1, m2
    446 %if ARCH_X86_64
    447    mova            m2, [t4+wq]
    448    paddw           m2, [t2+wq]
    449    mova            m5, [t3+wq]
    450 %else
    451    mova            m2, [t1+wq]
    452    paddw           m2, [t0+wq]
    453    mov             t1, t3m
    454    mov             t0, t5m
    455    mova            m5, [t1+wq]
    456    mov             t1, t1m
    457 %endif
    458    paddd           m0, m3
    459    paddd           m1, m4
    460    psrad           m0, 4
    461    psrad           m1, 4
    462    packssdw        m0, m1
    463 %if ARCH_X86_64
    464    mova            m4, [t5+wq]
    465    paddw           m4, [t1+wq]
    466    psraw           m0, 1
    467    paddw           m3, m0, [t6+wq]
    468 %else
    469    mova            m4, [t0+wq]
    470    paddw           m4, [t1+wq]
    471    mov             t0, t0m
    472    mov             t1, t6m
    473    psraw           m0, 1
    474    paddw           m3, m0, [t1+wq]
    475 %endif
    476    mova       [t0+wq], m0
    477    punpcklwd       m0, m2, m5
    478    pmaddwd         m0, m15
    479    punpckhwd       m2, m5
    480    pmaddwd         m2, m15
    481    punpcklwd       m1, m3, m4
    482    pmaddwd         m1, m14
    483    punpckhwd       m3, m4
    484    pmaddwd         m3, m14
    485    paddd           m0, m10
    486    paddd           m2, m10
    487    paddd           m0, m1
    488    paddd           m2, m3
    489    psrad           m0, 6
    490    psrad           m2, 6
    491    packssdw        m0, m2
    492    pmulhw          m0, m11
    493    pxor            m1, m1
    494    pmaxsw          m0, m1
    495    mova     [dstq+wq], m0
    496    add             wq, 16
    497    jl .hv_loop
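        ; rotate the ring buffer: each t pointer moves one row up and the old
        ; t6 row, already consumed above, becomes the next write target t0.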
    498 %if ARCH_X86_64
    499    mov             t6, t5
    500    mov             t5, t4
    501    mov             t4, t3
    502    mov             t3, t2
    503    mov             t2, t1
    504    mov             t1, t0
    505    mov             t0, t6
    506 %else
    507    mov             r4, t5m
    508    mov             t1, t4m
    509    mov            t6m, r4
    510    mov            t5m, t1
    511    mov             r4, t3m
    512    mov             t1, t2m
    513    mov            t4m, r4
    514    mov            t3m, t1
    515    mov             r4, t1m
    516    mov             t1, t0
    517    mov            t2m, r4
    518    mov             t0, t6m
    519    mov             wq, wm
    520 %endif
    521    add           dstq, strideq
    522    ret
    523 .v:
    524    movif64         wq, r4
    525    movif32        t0m, t0
    526    movif32        t1m, t1
    527 .v_loop:
    528 %if ARCH_X86_64
    529    mova            m1, [t4+wq]
    530    paddw           m1, [t2+wq]
    531    mova            m2, [t3+wq]
    532    mova            m4, [t1+wq]
    533    paddw           m3, m4, [t6+wq]
    534    paddw           m4, [t5+wq]
    535 %else
    536    mov             t0, t4m
    537    mov             t1, t2m
    538    mova            m1, [t0+wq]
    539    paddw           m1, [t1+wq]
    540    mov             t0, t3m
    541    mov             t1, t1m
    542    mova            m2, [t0+wq]
    543    mova            m4, [t1+wq]
    544    mov             t0, t6m
    545    mov             t1, t5m
    546    paddw           m3, m4, [t0+wq]
    547    paddw           m4, [t1+wq]
    548 %endif
    549    punpcklwd       m0, m1, m2
    550    pmaddwd         m0, m15
    551    punpckhwd       m1, m2
    552    pmaddwd         m1, m15
    553    punpcklwd       m2, m3, m4
    554    pmaddwd         m2, m14
    555    punpckhwd       m3, m4
    556    pmaddwd         m3, m14
    557    paddd           m0, m10
    558    paddd           m1, m10
    559    paddd           m0, m2
    560    paddd           m1, m3
    561    psrad           m0, 6
    562    psrad           m1, 6
    563    packssdw        m0, m1
    564    pmulhw          m0, m11
    565    pxor            m1, m1
    566    pmaxsw          m0, m1
    567    mova     [dstq+wq], m0
    568    add             wq, 16
    569    jl .v_loop
    570 %if ARCH_X86_64
    571    mov             t6, t5
    572    mov             t5, t4
    573    mov             t4, t3
    574    mov             t3, t2
    575    mov             t2, t1
    576 %else
    577    mov             t0, t5m
    578    mov             t1, t4m
    579    mov             r4, t3m
    580    mov            t6m, t0
    581    mov            t5m, t1
    582    mov            t4m, r4
    583    mov             r4, t2m
    584    mov             t1, t1m
    585    mov             t0, t0m
    586    mov            t3m, r4
    587    mov            t2m, t1
    588 %endif
    589    add           dstq, strideq
    590    ret
    591 
    592 %if ARCH_X86_32
    593 %if STACK_ALIGNMENT < 16
    594  %assign stack_size 12*16+384*8
    595 %else
    596  %assign stack_size 11*16+384*8
    597 %endif
    598 cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \
    599                                                    lpf, w, flt
    600 %if STACK_ALIGNMENT < 16
    601  %define lpfm        dword [esp+calloff+4*6]
    602  %define wm          dword [esp+calloff+4*7]
    603  %define hd          dword [esp+calloff+16*10+0]
    604  %define edgeb        byte [esp+calloff+16*10+4]
    605  %define edged       dword [esp+calloff+16*10+4]
    606 %else
    607  %define hd dword r5m
    608  %define edgeb byte r7m
    609 %endif
    610 %define PICmem dword [esp+calloff+4*0]
    611 %define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
    612 %define t1m    dword [esp+calloff+4*2]
    613 %define t2m    dword [esp+calloff+4*3]
    614 %define t3m    dword [esp+calloff+4*4]
    615 %define t4m    dword [esp+calloff+4*5]
    616 %define t2 t2m
    617 %define t3 t3m
    618 %define t4 t4m
    619 %define  m8 [esp+calloff+16*2]
    620 %define  m9 [esp+calloff+16*3]
    621 %define m10 [esp+calloff+16*4]
    622 %define m11 [esp+calloff+16*5]
    623 %define m12 [esp+calloff+16*6]
    624 %define m13 [esp+calloff+16*7]
    625 %define m14 [esp+calloff+16*8]
    626 %define m15 [esp+calloff+16*9]
    627 %define base t0-wiener_shifts
    628 %assign calloff 0
    629 %if STACK_ALIGNMENT < 16
    630    mov             wd, [rstk+stack_offset+20]
    631    mov             wm, wd
    632    mov             r5, [rstk+stack_offset+24]
    633    mov             hd, r5
    634    mov             r5, [rstk+stack_offset+32]
    635    mov          edged, r5 ; edge
    636 %endif
    637 %else
    638 cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \
    639                                                   w, h, edge, flt
    640 %define base
    641 %endif
    642 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    643    movifnidn       wd, wm
    644 %endif
    645 %if ARCH_X86_64
    646    mov           fltq, r6mp
    647    movifnidn       hd, hm
    648    mov          edged, r7m
    649    mov            t3d, r8m ; pixel_max
    650    movq           m12, [fltq]
    651    movq           m14, [fltq+16]
    652 %else
    653 %if STACK_ALIGNMENT < 16
    654    mov             t0, [rstk+stack_offset+28]
    655    mov             t1, [rstk+stack_offset+36] ; pixel_max
    656    movq            m1, [t0]    ; fx
    657    movq            m3, [t0+16] ; fy
    658    LEA             t0, wiener_shifts
    659 %else
    660    mov           fltq, r6m
    661    movq            m1, [fltq]
    662    movq            m3, [fltq+16]
    663    LEA             t0, wiener_shifts
    664    mov             t1, r8m ; pixel_max
    665 %endif
    666    mov         PICmem, t0
    667 %endif
    668    mova            m5, [base+wiener_shufE]
    669    mova            m6, [base+wiener_shufB]
    670    mova            m7, [base+wiener_shufD]
    671 %if ARCH_X86_64
    672    lea             t4, [wiener_shifts]
    673    add             wd, wd
    674    punpcklwd      m11, m12, m12
    675    pshufd         m11, m11, q1111 ; x1
    676    pshufd         m12, m12, q1111 ; x2 x3
    677    punpcklwd      m13, m14, m14
    678    pshufd         m13, m13, q1111 ; y1
    679    pshufd         m14, m14, q1111 ; y2 y3
    680    shr            t3d, 11
    681    mova            m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    682    add           lpfq, wq
    683    lea             t1, [rsp+wq+16]
    684    add           dstq, wq
    685    neg             wq
    686 %define base t4-wiener_shifts
    687    movd            m9, [base+wiener_round+t3*4]
    688    movq           m10, [base+wiener_shifts+t3*8]
    689    pshufd          m9, m9, q0000
    690    pshufd          m0, m10, q0000
    691    pshufd         m10, m10, q1111
    692    mova           m15, [wiener_lshuf5]
    693    pmullw         m11, m0
    694    pmullw         m12, m0
    695 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
    696 %define lpfm [rsp]
    697 %define base
    698 %else
    699    add             wd, wd
    700    punpcklwd       m0, m1, m1
    701    pshufd          m0, m0, q1111 ; x1
    702    pshufd          m1, m1, q1111 ; x2 x3
    703    punpcklwd       m2, m3, m3
    704    pshufd          m2, m2, q1111 ; y1
    705    pshufd          m3, m3, q1111 ; y2 y3
    706    mova            m4, [base+pd_m262128] ; (1 << 4) - (1 << 18)
    707    mova           m13, m2
    708    mova           m14, m3
    709    mova            m8, m4
    710    shr             t1, 11
    711    add           lpfq, wq
    712    movd            m2, [base+wiener_round+t1*4]
    713    movq            m3, [base+wiener_shifts+t1*8]
    714 %if STACK_ALIGNMENT < 16
    715    lea             t1, [esp+16*11+wq+16]
    716 %else
    717    lea             t1, [esp+16*10+wq+16]
    718 %endif
    719    add           dstq, wq
    720    neg             wq
    721    pshufd          m2, m2, q0000
    722    pshufd          m4, m3, q0000
    723    pshufd          m3, m3, q1111
    724    mov             wm, wq
    725    pmullw          m0, m4
    726    pmullw          m1, m4
    727    mova            m4, [base+wiener_lshuf5]
    728    mova            m9, m2
    729    mova           m10, m3
    730    mova           m11, m0
    731    mova           m12, m1
    732    mova           m15, m4
    733 %endif
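        ; wiener_filter5 follows the same structure as wiener_filter7 above,
        ; with one tap less per side: only four rows of history (t1-t4) in a
        ; 384*8-byte ring buffer.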
    734    test         edgeb, 4 ; LR_HAVE_TOP
    735    jz .no_top
    736    call .h_top
    737    add           lpfq, strideq
    738    mov             t4, t1
    739    add             t1, 384*2
    740    call .h_top
    741    lea            r10, [lpfq+strideq*4]
    742    mov           lpfq, dstq
    743    mov             t3, t1
    744    add             t1, 384*2
    745    add            r10, strideq
    746    mov           lpfm, r10 ; below
    747    call .h
    748    mov             t2, t1
    749    dec             hd
    750    jz .v1
    751    add           lpfq, strideq
    752    add             t1, 384*2
    753    call .h
    754    dec             hd
    755    jz .v2
    756 .main:
    757    mov             t0, t4
    758 .main_loop:
    759    call .hv
    760    dec             hd
    761    jnz .main_loop
    762    test         edgeb, 8 ; LR_HAVE_BOTTOM
    763    jz .v2
    764    mov           lpfq, lpfm
    765    call .hv_bottom
    766    add           lpfq, strideq
    767    call .hv_bottom
    768 .end:
    769    RET
    770 .no_top:
    771    lea            r10, [lpfq+strideq*4]
    772    mov           lpfq, dstq
    773    lea            r10, [r10+strideq*2]
    774    mov           lpfm, r10
    775    call .h
    776    mov             t4, t1
    777    mov             t3, t1
    778    mov             t2, t1
    779    dec             hd
    780    jz .v1
    781    add           lpfq, strideq
    782    add             t1, 384*2
    783    call .h
    784    dec             hd
    785    jz .v2
    786    lea             t0, [t1+384*2]
    787    call .hv
    788    dec             hd
    789    jz .v2
    790    add             t0, 384*6
    791    call .hv
    792    dec             hd
    793    jnz .main
    794 .v2:
    795    call .v
    796 %if ARCH_X86_64
    797    mov             t4, t3
    798    mov             t3, t2
    799    mov             t2, t1
    800 %else
    801    mov             t0, t3m
    802    mov             r4, t2m
    803    mov             t1, t1m
    804    mov            t4m, t0
    805    mov            t3m, r4
    806    mov            t2m, t1
    807    mov             wq, wm
    808 %endif
    809    add           dstq, strideq
    810 .v1:
    811    call .v
    812    jmp .end
    813 .extend_right:
    814 %assign stack_offset stack_offset+8
    815 %assign calloff 8
    816    movif32         t0, PICmem
    817    pxor            m1, m1
    818    movd            m2, wd
    819    mova            m0, [base+pb_2_3]
    820    pshufb          m2, m1
    821    mova            m1, [base+pb_m6_m5]
    822    psubb           m0, m2
    823    psubb           m1, m2
    824    mova            m2, [base+pb_0to15]
    825    pminub          m0, m2
    826    pminub          m1, m2
    827    pshufb          m3, m0
    828    pshufb          m4, m1
    829    ret
    830 %assign stack_offset stack_offset-4
    831 %assign calloff 4
    832 .h:
    833    movif64         wq, r4
    834    movif32         wq, wm
    835    test         edgeb, 1 ; LR_HAVE_LEFT
    836    jz .h_extend_left
    837    mova            m4, [lpfq+wq]
    838    movd            m3, [leftq+4]
    839    pslldq          m4, 4
    840    por             m3, m4
    841    add          leftq, 8
    842    jmp .h_main
    843 .h_extend_left:
    844    mova            m3, [lpfq+wq] ; avoid accessing memory located
    845    pshufb          m3, m15       ; before the start of the buffer
    846    jmp .h_main
    847 .h_top:
    848    movif64         wq, r4
    849    movif32         wq, wm
    850    test         edgeb, 1 ; LR_HAVE_LEFT
    851    jz .h_extend_left
    852 .h_loop:
    853    movu            m3, [lpfq+wq-4]
    854 .h_main:
    855    movu            m4, [lpfq+wq+4]
    856    test         edgeb, 2 ; LR_HAVE_RIGHT
    857    jnz .h_have_right
    858    cmp             wd, -18
    859    jl .h_have_right
    860    call .extend_right
    861 .h_have_right:
    862    pshufb          m0, m3, m5
    863    pmaddwd         m0, m11
    864    pshufb          m1, m4, m5
    865    pmaddwd         m1, m11
    866    pshufb          m2, m3, m6
    867    pshufb          m3, m7
    868    paddw           m2, m3
    869    pshufb          m3, m4, m6
    870    pmaddwd         m2, m12
    871    pshufb          m4, m7
    872    paddw           m3, m4
    873    pmaddwd         m3, m12
    874    paddd           m0, m8
    875    paddd           m1, m8
    876    paddd           m0, m2
    877    paddd           m1, m3
    878    psrad           m0, 4
    879    psrad           m1, 4
    880    packssdw        m0, m1
    881    psraw           m0, 1
    882    mova       [t1+wq], m0
    883    add             wq, 16
    884    jl .h_loop
    885    movif32         wq, wm
    886    ret
    887 ALIGN function_align
    888 .hv:
    889    add           lpfq, strideq
    890    movif64         wq, r4
    891    movif32        t0m, t0
    892    movif32        t1m, t1
    893    test         edgeb, 1 ; LR_HAVE_LEFT
    894    jz .hv_extend_left
    895    mova            m4, [lpfq+wq]
    896    movd            m3, [leftq+4]
    897    pslldq          m4, 4
    898    por             m3, m4
    899    add          leftq, 8
    900    jmp .hv_main
    901 .hv_extend_left:
    902    mova            m3, [lpfq+wq]
    903    pshufb          m3, m15
    904    jmp .hv_main
    905 .hv_bottom:
    906    movif64         wq, r4
    907    movif32        t0m, t0
    908    movif32        t1m, t1
    909    test         edgeb, 1 ; LR_HAVE_LEFT
    910    jz .hv_extend_left
    911 .hv_loop:
    912    movu            m3, [lpfq+wq-4]
    913 .hv_main:
    914    movu            m4, [lpfq+wq+4]
    915    test         edgeb, 2 ; LR_HAVE_RIGHT
    916    jnz .hv_have_right
    917    cmp             wd, -18
    918    jl .hv_have_right
    919    call .extend_right
    920 .hv_have_right:
    921    movif32         t1, t1m
    922    movif32         t0, t3m
    923    pshufb          m0, m3, m5
    924    pmaddwd         m0, m11
    925    pshufb          m1, m4, m5
    926    pmaddwd         m1, m11
    927    pshufb          m2, m3, m6
    928    pshufb          m3, m7
    929    paddw           m2, m3
    930    pshufb          m3, m4, m6
    931    pmaddwd         m2, m12
    932    pshufb          m4, m7
    933    paddw           m3, m4
    934    pmaddwd         m3, m12
    935    paddd           m0, m8
    936    paddd           m1, m8
    937    paddd           m0, m2
    938 %if ARCH_X86_64
    939    mova            m2, [t3+wq]
    940    paddw           m2, [t1+wq]
    941    paddd           m1, m3
    942    mova            m4, [t2+wq]
    943 %else
    944    mova            m2, [t0+wq]
    945    mov             t0, t2m
    946    paddw           m2, [t1+wq]
    947    mov             t1, t4m
    948    paddd           m1, m3
    949    mova            m4, [t0+wq]
    950    mov             t0, t0m
    951 %endif
    952    punpckhwd       m3, m2, m4
    953    pmaddwd         m3, m14
    954    punpcklwd       m2, m4
    955 %if ARCH_X86_64
    956    mova            m4, [t4+wq]
    957 %else
    958    mova            m4, [t1+wq]
    959 %endif
    960    psrad           m0, 4
    961    psrad           m1, 4
    962    packssdw        m0, m1
    963    pmaddwd         m2, m14
    964    psraw           m0, 1
    965    mova       [t0+wq], m0
    966    punpckhwd       m1, m0, m4
    967    pmaddwd         m1, m13
    968    punpcklwd       m0, m4
    969    pmaddwd         m0, m13
    970    paddd           m3, m9
    971    paddd           m2, m9
    972    paddd           m1, m3
    973    paddd           m0, m2
    974    psrad           m1, 6
    975    psrad           m0, 6
    976    packssdw        m0, m1
    977    pmulhw          m0, m10
    978    pxor            m1, m1
    979    pmaxsw          m0, m1
    980    mova     [dstq+wq], m0
    981    add             wq, 16
    982    jl .hv_loop
    983 %if ARCH_X86_64
    984    mov             t4, t3
    985    mov             t3, t2
    986    mov             t2, t1
    987    mov             t1, t0
    988    mov             t0, t4
    989 %else
    990    mov             r4, t3m
    991    mov             t1, t2m
    992    mov            t4m, r4
    993    mov            t3m, t1
    994    mov             r4, t1m
    995    mov             t1, t0
    996    mov            t2m, r4
    997    mov             t0, t4m
    998    mov             wq, wm
    999 %endif
   1000    add           dstq, strideq
   1001    ret
   1002 .v:
   1003    movif64         wq, r4
   1004    movif32        t1m, t1
   1005 .v_loop:
   1006 %if ARCH_X86_64
   1007    mova            m0, [t1+wq]
   1008    paddw           m2, m0, [t3+wq]
   1009    mova            m1, [t2+wq]
   1010    mova            m4, [t4+wq]
   1011 %else
   1012    mov             t0, t3m
   1013    mova            m0, [t1+wq]
   1014    mov             t1, t2m
   1015    paddw           m2, m0, [t0+wq]
   1016    mov             t0, t4m
   1017    mova            m1, [t1+wq]
   1018    mova            m4, [t0+wq]
   1019 %endif
   1020    punpckhwd       m3, m2, m1
   1021    pmaddwd         m3, m14
   1022    punpcklwd       m2, m1
   1023    pmaddwd         m2, m14
   1024    punpckhwd       m1, m0, m4
   1025    pmaddwd         m1, m13
   1026    punpcklwd       m0, m4
   1027    pmaddwd         m0, m13
   1028    paddd           m3, m9
   1029    paddd           m2, m9
   1030    paddd           m1, m3
   1031    paddd           m0, m2
   1032    psrad           m1, 6
   1033    psrad           m0, 6
   1034    packssdw        m0, m1
   1035    pmulhw          m0, m10
   1036    pxor            m1, m1
   1037    pmaxsw          m0, m1
   1038    mova     [dstq+wq], m0
   1039    add             wq, 16
   1040 %if ARCH_X86_64
   1041    jl .v_loop
   1042 %else
   1043    jge .v_end
   1044    mov             t1, t1m
   1045    jmp .v_loop
   1046 .v_end:
   1047 %endif
   1048    ret
   1049 
   1050 %macro MUL_32X16X2 6 ; dst[1-2], src[1-2], tmp[1-2]
   1051    pmulhuw         %5, %1, %3
   1052    pmulhuw         %6, %2, %4
   1053    pmullw          %1, %3
   1054    pmullw          %2, %4
   1055    pslld           %5, 16
   1056    pslld           %6, 16
   1057    paddd           %1, %5
   1058    paddd           %2, %6
   1059 %endmacro
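        ; 32-bit x 16-bit multiply (keeping the low 32 bits of each dword lane)
        ; composed from pmullw and pmulhuw, since no packed 32-bit multiply is
        ; available before SSE4.1.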
   1060 
   1061 %macro SGR_CALC_X 10 ; BB_dst, BB_src, b, tmp, an[1-2], zero, s, b_mul, pf_256
   1062    punpcklwd       %4, %3, %7
   1063    punpckhwd       %3, %7
   1064    pmaddwd         %4, %4                 ; b * b
   1065    pmaddwd         %3, %3
   1066    punpcklwd       %1, %2, %7             ; BB
   1067    punpckhwd       %2, %7
   1068    psubd           %5, %4                 ; a * n - b * b
   1069    psubd           %6, %3
   1070    pcmpgtd         %4, %5, %7
   1071    pcmpgtd         %3, %6, %7
   1072    pand            %5, %4                 ; p
   1073    pand            %6, %3
   1074    MUL_32X16X2     %5, %6, %8, %8, %4, %3 ; p * s
   1075    paddw           %5, %9
   1076    paddw           %6, %9
   1077    psrld           %5, 20                 ; z + 1
   1078    psrld           %6, 20
   1079    cvtdq2ps        %5, %5
   1080    cvtdq2ps        %6, %6
   1081    pmaddwd         %1, %9                 ; BB * 164
   1082    pmaddwd         %2, %9
   1083    rcpps           %3, %5                 ; 1 / (z + 1)
   1084    rcpps           %4, %6
   1085    cmpltps         %5, %10
   1086    cmpltps         %6, %10
   1087    mulps           %3, %10                ; 256 / (z + 1)
   1088    mulps           %4, %10
   1089    packssdw        %5, %6
   1090    cvtps2dq        %3, %3
   1091    cvtps2dq        %4, %4
   1092    psrlw           %5, 8                  ; z < 255 ? 255 : 0
   1093    packssdw        %3, %4
   1094    pminsw          %3, %5                 ; x
   1095 %endmacro
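        ; per-pixel surface-fit coefficients: p = max(a*n - b*b, 0), z is
        ; derived from p * s, and x = 256 / (z + 1) is computed with rcpps in
        ; place of a reciprocal table, capped at 255 and forced to 0 once
        ; z >= 255; BB * 164 feeds the final b value.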
   1096 
   1097 %if ARCH_X86_32
   1098 DECLARE_REG_TMP 0, 1, 2, 3, 5
   1099 %if STACK_ALIGNMENT < 16
   1100  %assign extra_stack 5*16
   1101 %else
   1102  %assign extra_stack 3*16
   1103 %endif
   1104 cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \
   1105                              dst, stride, left, lpf, w
   1106 %if STACK_ALIGNMENT < 16
   1107  %define dstm         dword [esp+calloff+16*0+4*6]
   1108  %define stridemp     dword [esp+calloff+16*0+4*7]
   1109  %define leftm        dword [esp+calloff+16*3+4*0]
   1110  %define lpfm         dword [esp+calloff+16*3+4*1]
   1111  %define w0m          dword [esp+calloff+16*3+4*2]
   1112  %define hd           dword [esp+calloff+16*3+4*3]
   1113  %define edgeb         byte [esp+calloff+16*3+4*4]
   1114  %define edged        dword [esp+calloff+16*3+4*4]
   1115  %define leftmp leftm
   1116 %else
   1117  %define w0m wm
   1118  %define hd dword r5m
   1119  %define edgeb  byte r7m
   1120  %define edged dword r7m
   1121 %endif
   1122 %define hvsrcm dword [esp+calloff+4*0]
   1123 %define w1m    dword [esp+calloff+4*1]
   1124 %define t0m    dword [esp+calloff+4*2]
   1125 %define t2m    dword [esp+calloff+4*3]
   1126 %define t3m    dword [esp+calloff+4*4]
   1127 %define t4m    dword [esp+calloff+4*5]
   1128 %define  m8 [base+pd_8]
   1129 %define  m9 [base+pd_0xfffffff0]
   1130 %define m10 [esp+calloff+16*2]
   1131 %define m11 [base+pw_164_24]
   1132 %define m12 [base+sgr_lshuf5]
   1133 %define m13 [base+pd_34816]
   1134 %define m14 [base+pw_1023]
   1135 %define m15 [base+pf_256]
   1136 %define r10 r4
   1137 %define base r6-pw_455_24
   1138 %assign calloff 0
   1139 %if STACK_ALIGNMENT < 16
   1140    mov        strideq, [rstk+stack_offset+ 8]
   1141    mov          leftq, [rstk+stack_offset+12]
   1142    mov           lpfq, [rstk+stack_offset+16]
   1143    mov             wd, [rstk+stack_offset+20]
   1144    mov           dstm, dstq
   1145    mov       stridemp, strideq
   1146    mov          leftm, leftq
   1147    mov             r1, [rstk+stack_offset+24]
   1148    mov             r2, [rstk+stack_offset+32]
   1149    mov           lpfm, lpfq
   1150    mov             hd, r1
   1151    mov          edged, r2
   1152 %endif
   1153 %else
   1154 cglobal sgr_filter_5x5_16bpc, 4, 13, 16, -400*24-16, dst, stride, left, lpf, \
   1155                                                     w, h, edge, params
   1156 %endif
   1157 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16
   1158    movifnidn       wd, wm
   1159 %endif
   1160 %if ARCH_X86_64
   1161    mov        paramsq, r6mp
   1162    movifnidn       hd, hm
   1163    add             wd, wd
   1164    mov          edged, r7m
   1165    movu           m10, [paramsq]
   1166    mova           m12, [sgr_lshuf5]
   1167    add           lpfq, wq
   1168    mova            m8, [pd_8]
   1169    lea             t1, [rsp+wq+20]
   1170    mova            m9, [pd_0xfffffff0]
   1171    add           dstq, wq
   1172    lea             t3, [rsp+wq*2+400*12+16]
   1173    mova           m11, [pw_164_24]
   1174    lea             t4, [rsp+wq+400*20+16]
   1175    pshufhw         m7, m10, q0000
   1176    pshufb         m10, [pw_256]  ; s0
   1177    punpckhqdq      m7, m7        ; w0
   1178    neg             wq
   1179    mova           m13, [pd_34816]  ; (1 << 11) + (1 << 15)
   1180    pxor            m6, m6
   1181    mova           m14, [pw_1023]
   1182    psllw           m7, 4
   1183    movaps         m15, [pf_256]
   1184 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
   1185 %define lpfm        [rsp]
   1186 %else
   1187    mov             r1, [rstk+stack_offset+28] ; params
   1188    LEA             r6, pw_455_24
   1189    add             wd, wd
   1190    movu            m1, [r1]
   1191    add           lpfm, wq
   1192    lea             t1, [rsp+extra_stack+wq+20]
   1193    add           dstq, wq
   1194    lea             t3, [rsp+extra_stack+wq*2+400*12+16]
   1195    mov           dstm, dstq
   1196    lea             t4, [rsp+extra_stack+wq+400*20+16]
   1197    mov            t3m, t3
   1198    pshufhw         m7, m1, q0000
   1199    mov            t4m, t4
   1200    pshufb          m1, [base+pw_256] ; s0
   1201    punpckhqdq      m7, m7            ; w0
   1202    psllw           m7, 4
   1203    neg             wq
   1204    mova           m10, m1
   1205    pxor            m6, m6
   1206    mov            w1m, wd
   1207    sub             wd, 4
   1208    mov           lpfq, lpfm
   1209    mov            w0m, wd
   1210 %define strideq r5
   1211 %endif
   1212    test         edgeb, 4 ; LR_HAVE_TOP
   1213    jz .no_top
   1214    call .h_top
   1215    add           lpfq, stridemp
   1216    mov             t2, t1
   1217    call .top_fixup
   1218    add             t1, 400*6
   1219    call .h_top
   1220    movif32    strideq, stridemp
   1221    lea            r10, [lpfq+strideq*4]
   1222    mov           lpfq, dstq
   1223    add            r10, strideq
   1224    mov           lpfm, r10 ; below
   1225    movif32        t0m, t2
   1226    mov             t0, t2
   1227    dec             hd
   1228    jz .height1
   1229    or           edged, 16
   1230    call .h
   1231 .main:
   1232    add           lpfq, stridemp
   1233    movif32         t4, t4m
   1234    call .hv
   1235    call .prep_n
   1236    sub             hd, 2
   1237    jl .extend_bottom
   1238 .main_loop:
   1239    movif32       lpfq, hvsrcm
   1240    add           lpfq, stridemp
   1241 %if ARCH_X86_64
   1242    test            hb, hb
   1243 %else
   1244    mov             r4, hd
   1245    test            r4, r4
   1246 %endif
   1247    jz .odd_height
   1248    call .h
   1249    add           lpfq, stridemp
   1250    call .hv
   1251    movif32       dstq, dstm
   1252    call .n0
   1253    call .n1
   1254    sub             hd, 2
   1255    movif32         t0, t0m
   1256    jge .main_loop
   1257    test         edgeb, 8 ; LR_HAVE_BOTTOM
   1258    jz .extend_bottom
   1259    mov           lpfq, lpfm
   1260    call .h_top
   1261    add           lpfq, stridemp
   1262    call .hv_bottom
   1263 .end:
   1264    movif32       dstq, dstm
   1265    call .n0
   1266    call .n1
   1267 .end2:
   1268    RET
   1269 .height1:
   1270    movif32         t4, t4m
   1271    call .hv
   1272    call .prep_n
   1273    jmp .odd_height_end
   1274 .odd_height:
   1275    call .hv
   1276    movif32       dstq, dstm
   1277    call .n0
   1278    call .n1
   1279 .odd_height_end:
   1280    call .v
   1281    movif32       dstq, dstm
   1282    call .n0
   1283    jmp .end2
   1284 .extend_bottom:
   1285    call .v
   1286    jmp .end
   1287 .no_top:
   1288    movif32    strideq, stridemp
   1289    lea            r10, [lpfq+strideq*4]
   1290    mov           lpfq, dstq
   1291    lea            r10, [r10+strideq*2]
   1292    mov           lpfm, r10
   1293    call .h
   1294    lea             t2, [t1+400*6]
   1295    call .top_fixup
   1296    dec             hd
   1297    jz .no_top_height1
   1298    or           edged, 16
   1299    mov             t0, t1
   1300    mov             t1, t2
   1301    movif32        t0m, t0
   1302    jmp .main
   1303 .no_top_height1:
   1304    movif32         t3, t3m
   1305    movif32         t4, t4m
   1306    call .v
   1307    call .prep_n
   1308    jmp .odd_height_end
   1309 .extend_right:
   1310    movd            m0, wd
   1311    movd            m1, [lpfq-2]
   1312    mova            m2, [base+pw_256]
   1313    mova            m3, [base+pb_m14_m13]
   1314    pshufb          m0, m6
   1315    pshufb          m1, m2
   1316    psubb           m2, m0
   1317    psubb           m3, m0
   1318    mova            m0, [base+pb_0to15]
   1319    pcmpgtb         m2, m0
   1320    pcmpgtb         m3, m0
   1321    pand            m4, m2
   1322    pand            m5, m3
   1323    pandn           m2, m1
   1324    pandn           m3, m1
   1325    por             m4, m2
   1326    por             m5, m3
   1327    ret
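        ; right-edge padding for the box sums: [lpfq-2] appears to hold the
        ; last valid pixel of the row, and byte masks built from the width
        ; counter choose between loaded data and that replicated pixel.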
   1328 %assign stack_offset stack_offset+4
   1329 %assign calloff 4
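        ; .h computes, per position, the sum and sum of squares of five
        ; consecutive pixels; once edge bit 16 marks y > 0, the previous row's
        ; partials already in t1 are accumulated so rows are handled in
        ; vertical pairs.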
   1330 .h: ; horizontal boxsum
   1331 %if ARCH_X86_64
   1332    lea             wq, [r4-4]
   1333 %else
   1334 %define leftq r4
   1335 %endif
   1336    test         edgeb, 1 ; LR_HAVE_LEFT
   1337    jz .h_extend_left
   1338    movif32      leftq, leftm
   1339    movddup         m5, [leftq]
   1340    movif32         wq, w0m
   1341    mova            m4, [lpfq+wq+4]
   1342    add         leftmp, 8
   1343    palignr         m4, m5, 10
   1344    jmp .h_main
   1345 .h_extend_left:
   1346    movif32         wq, w0m
   1347    mova            m4, [lpfq+wq+4]
   1348    pshufb          m4, m12
   1349    jmp .h_main
   1350 .h_top:
   1351 %if ARCH_X86_64
   1352    lea             wq, [r4-4]
   1353 %endif
   1354    test         edgeb, 1 ; LR_HAVE_LEFT
   1355    jz .h_extend_left
   1356    movif32         wq, w0m
   1357 .h_loop:
   1358    movu            m4, [lpfq+wq- 2]
   1359 .h_main:
   1360    movu            m5, [lpfq+wq+14]
   1361    test         edgeb, 2 ; LR_HAVE_RIGHT
   1362    jnz .h_have_right
   1363    cmp             wd, -20
   1364    jl .h_have_right
   1365    call .extend_right
   1366 .h_have_right:
   1367    palignr         m2, m5, m4, 2
   1368    paddw           m0, m4, m2
   1369    palignr         m3, m5, m4, 6
   1370    paddw           m0, m3
   1371    punpcklwd       m1, m2, m3
   1372    pmaddwd         m1, m1
   1373    punpckhwd       m2, m3
   1374    pmaddwd         m2, m2
   1375    palignr         m5, m4, 8
   1376    paddw           m0, m5
   1377    punpcklwd       m3, m4, m5
   1378    pmaddwd         m3, m3
   1379    paddd           m1, m3
   1380    punpckhwd       m3, m4, m5
   1381    pmaddwd         m3, m3
   1382    shufps          m4, m5, q2121
   1383    paddw           m0, m4             ; sum
   1384    punpcklwd       m5, m4, m6
   1385    pmaddwd         m5, m5
   1386    punpckhwd       m4, m6
   1387    pmaddwd         m4, m4
   1388    paddd           m2, m3
   1389    test         edgeb, 16             ; y > 0
   1390    jz .h_loop_end
   1391    paddw           m0, [t1+wq+400*0]
   1392    paddd           m1, [t1+wq+400*2]
   1393    paddd           m2, [t1+wq+400*4]
   1394 .h_loop_end:
   1395    paddd           m1, m5             ; sumsq
   1396    paddd           m2, m4
   1397    mova [t1+wq+400*0], m0
   1398    mova [t1+wq+400*2], m1
   1399    mova [t1+wq+400*4], m2
   1400    add             wq, 16
   1401    jl .h_loop
   1402    ret
   1403 .top_fixup:
   1404 %if ARCH_X86_64
   1405    lea             wq, [r4-4]
   1406 %else
   1407    mov             wd, w0m
   1408 %endif
    1409 .top_fixup_loop: ; the sums of the first row need to be doubled since there is no row above
   1410    mova            m0, [t1+wq+400*0]
   1411    mova            m1, [t1+wq+400*2]
   1412    mova            m2, [t1+wq+400*4]
   1413    paddw           m0, m0
   1414    paddd           m1, m1
   1415    paddd           m2, m2
   1416    mova [t2+wq+400*0], m0
   1417    mova [t2+wq+400*2], m1
   1418    mova [t2+wq+400*4], m2
   1419    add             wq, 16
   1420    jl .top_fixup_loop
   1421    ret
   1422 ALIGN function_align
   1423 .hv: ; horizontal boxsum + vertical boxsum + ab
   1424 %if ARCH_X86_64
   1425    lea             wq, [r4-4]
   1426 %else
   1427    mov         hvsrcm, lpfq
   1428 %endif
   1429    test         edgeb, 1 ; LR_HAVE_LEFT
   1430    jz .hv_extend_left
   1431    movif32      leftq, leftm
   1432    movddup         m5, [leftq]
   1433    movif32         wq, w0m
   1434    mova            m4, [lpfq+wq+4]
   1435    add         leftmp, 8
   1436    palignr         m4, m5, 10
   1437    jmp .hv_main
   1438 .hv_extend_left:
   1439    movif32         wq, w0m
   1440    mova            m4, [lpfq+wq+4]
   1441    pshufb          m4, m12
   1442    jmp .hv_main
   1443 .hv_bottom:
   1444 %if ARCH_X86_64
   1445    lea             wq, [r4-4]
   1446 %else
   1447    mov         hvsrcm, lpfq
   1448 %endif
   1449    test         edgeb, 1 ; LR_HAVE_LEFT
   1450    jz .hv_extend_left
   1451    movif32         wq, w0m
   1452 %if ARCH_X86_32
   1453    jmp .hv_loop_start
   1454 %endif
   1455 .hv_loop:
   1456    movif32       lpfq, hvsrcm
   1457 .hv_loop_start:
   1458    movu            m4, [lpfq+wq- 2]
   1459 .hv_main:
   1460    movu            m5, [lpfq+wq+14]
   1461    test         edgeb, 2 ; LR_HAVE_RIGHT
   1462    jnz .hv_have_right
   1463    cmp             wd, -20
   1464    jl .hv_have_right
   1465    call .extend_right
   1466 .hv_have_right:
   1467    movif32         t3, hd
   1468    palignr         m3, m5, m4, 2
   1469    paddw           m0, m4, m3
   1470    palignr         m1, m5, m4, 6
   1471    paddw           m0, m1
   1472    punpcklwd       m2, m3, m1
   1473    pmaddwd         m2, m2
   1474    punpckhwd       m3, m1
   1475    pmaddwd         m3, m3
   1476    palignr         m5, m4, 8
   1477    paddw           m0, m5
   1478    punpcklwd       m1, m4, m5
   1479    pmaddwd         m1, m1
   1480    paddd           m2, m1
   1481    punpckhwd       m1, m4, m5
   1482    pmaddwd         m1, m1
   1483    shufps          m4, m5, q2121
   1484    paddw           m0, m4            ; h sum
   1485    punpcklwd       m5, m4, m6
   1486    pmaddwd         m5, m5
   1487    punpckhwd       m4, m6
   1488    pmaddwd         m4, m4
   1489    paddd           m3, m1
   1490    paddd           m2, m5            ; h sumsq
   1491    paddd           m3, m4
   1492    paddw           m1, m0, [t1+wq+400*0]
   1493    paddd           m4, m2, [t1+wq+400*2]
   1494    paddd           m5, m3, [t1+wq+400*4]
   1495 %if ARCH_X86_64
   1496    test            hd, hd
   1497 %else
   1498    test            t3, t3
   1499 %endif
   1500    jz .hv_last_row
   1501 .hv_main2:
   1502    paddw           m1, [t2+wq+400*0] ; hv sum
   1503    paddd           m4, [t2+wq+400*2] ; hv sumsq
   1504    paddd           m5, [t2+wq+400*4]
   1505    mova [t0+wq+400*0], m0
   1506    mova [t0+wq+400*2], m2
   1507    mova [t0+wq+400*4], m3
   1508    psrlw           m3, m1, 1
   1509    paddd           m4, m8
   1510    pavgw           m3, m6             ; (b + 2) >> 2
   1511    paddd           m5, m8
   1512    pand            m4, m9             ; ((a + 8) >> 4) << 4
   1513    pand            m5, m9
   1514    psrld           m2, m4, 4
   1515    psrld           m0, m5, 4
   1516    paddd           m2, m4
   1517    psrld           m4, 1
   1518    paddd           m0, m5
   1519    psrld           m5, 1
   1520    paddd           m4, m2             ; a * 25
   1521    paddd           m5, m0
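        ; a * 25 via shifts and adds: with k = (a + 8) >> 4, the code forms
        ; 17*k + 8*k = 25*k; prescaling by 16 presumably keeps the subsequent
        ; products within the 32-bit range used by SGR_CALC_X.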
   1522    movif32         t3, t3m
   1523    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m6, m10, m11, m15
   1524    punpcklwd       m2, m3, m3
   1525    mova     [t4+wq+4], m3
   1526    punpckhwd       m3, m3
   1527    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   1528    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
   1529    paddd           m1, m13
   1530    psrld           m0, 12             ; b
   1531    psrld           m1, 12
   1532    mova  [t3+wq*2+ 8], m0
   1533    mova  [t3+wq*2+24], m1
   1534    add             wq, 16
   1535    jl .hv_loop
   1536    mov             t2, t1
   1537    mov             t1, t0
   1538    mov             t0, t2
   1539    movif32        t0m, t0
   1540    ret
   1541 .hv_last_row: ; esoteric edge case for odd heights
   1542    mova [t1+wq+400*0], m1
   1543    paddw           m1, m0
   1544    mova [t1+wq+400*2], m4
   1545    paddd           m4, m2
   1546    mova [t1+wq+400*4], m5
   1547    paddd           m5, m3
   1548    jmp .hv_main2
   1549 .v: ; vertical boxsum + ab
   1550 %if ARCH_X86_64
   1551    lea             wq, [r4-4]
   1552 %else
   1553    mov             wd, w0m
   1554 %endif
   1555 .v_loop:
   1556    mova            m0, [t1+wq+400*0]
   1557    mova            m2, [t1+wq+400*2]
   1558    mova            m3, [t1+wq+400*4]
   1559    paddw           m1, m0, [t2+wq+400*0]
   1560    paddd           m4, m2, [t2+wq+400*2]
   1561    paddd           m5, m3, [t2+wq+400*4]
   1562    paddw           m0, m0
   1563    paddd           m2, m2
   1564    paddd           m3, m3
   1565    paddw           m1, m0             ; hv sum
   1566    paddd           m4, m2             ; hv sumsq
   1567    paddd           m5, m3
   1568    psrlw           m3, m1, 1
   1569    paddd           m4, m8
   1570    pavgw           m3, m6             ; (b + 2) >> 2
   1571    paddd           m5, m8
   1572    pand            m4, m9             ; ((a + 8) >> 4) << 4
   1573    pand            m5, m9
   1574    psrld           m2, m4, 4
   1575    psrld           m0, m5, 4
   1576    paddd           m2, m4
   1577    psrld           m4, 1
   1578    paddd           m0, m5
   1579    psrld           m5, 1
   1580    paddd           m4, m2             ; a * 25
   1581    paddd           m5, m0
   1582    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m6, m10, m11, m15
   1583    punpcklwd       m2, m3, m3
   1584    mova     [t4+wq+4], m3
   1585    punpckhwd       m3, m3
   1586    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   1587    paddd           m0, m13            ; x * b * 164 + (1 << 11) + (1 << 15)
   1588    paddd           m1, m13
   1589    psrld           m0, 12             ; b
   1590    psrld           m1, 12
   1591    mova  [t3+wq*2+ 8], m0
   1592    mova  [t3+wq*2+24], m1
   1593    add             wq, 16
   1594    jl .v_loop
   1595    ret
   1596 .prep_n: ; initial neighbor setup
   1597    movif64         wq, r4
   1598    movif32         wd, w1m
   1599 .prep_n_loop:
   1600    movu            m0, [t4+wq*1+ 2]
   1601    movu            m3, [t4+wq*1+ 4]
   1602    movu            m1, [t3+wq*2+ 4]
   1603    movu            m4, [t3+wq*2+ 8]
   1604    movu            m2, [t3+wq*2+20]
   1605    movu            m5, [t3+wq*2+24]
   1606    paddw           m3, m0
   1607    paddd           m4, m1
   1608    paddd           m5, m2
   1609    paddw           m3, [t4+wq*1+ 0]
   1610    paddd           m4, [t3+wq*2+ 0]
   1611    paddd           m5, [t3+wq*2+16]
   1612    paddw           m0, m3
   1613    psllw           m3, 2
   1614    paddd           m1, m4
   1615    pslld           m4, 2
   1616    paddd           m2, m5
   1617    pslld           m5, 2
   1618    paddw           m0, m3             ; a 565
   1619    paddd           m1, m4             ; b 565
   1620    paddd           m2, m5
   1621    mova [t4+wq*1+400*2+ 0], m0
   1622    mova [t3+wq*2+400*4+ 0], m1
   1623    mova [t3+wq*2+400*4+16], m2
   1624    add             wq, 16
   1625    jl .prep_n_loop
   1626    ret
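; Scalar model of the "565" weighting prepared above (l/c/r are the three
; horizontal neighbors of an a or b value; a sketch only):
;   v565 = 5*(l + c + r) + c    ; i.e. weights 5,6,5 with the center at 6
; The 5x5 kernel only computes a/b on every second row, so this cached row
; is later combined with a freshly weighted one in .n0 to cover both
; vertical neighbors.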
   1627 ALIGN function_align
   1628 .n0: ; neighbor + output (even rows)
   1629    movif64         wq, r4
   1630    movif32         wd, w1m
   1631 .n0_loop:
   1632    movu            m0, [t4+wq*1+ 2]
   1633    movu            m3, [t4+wq*1+ 4]
   1634    movu            m1, [t3+wq*2+ 4]
   1635    movu            m4, [t3+wq*2+ 8]
   1636    movu            m2, [t3+wq*2+20]
   1637    movu            m5, [t3+wq*2+24]
   1638    paddw           m3, m0
   1639    paddd           m4, m1
   1640    paddd           m5, m2
   1641    paddw           m3, [t4+wq*1+ 0]
   1642    paddd           m4, [t3+wq*2+ 0]
   1643    paddd           m5, [t3+wq*2+16]
   1644    paddw           m0, m3
   1645    psllw           m3, 2
   1646    paddd           m1, m4
   1647    pslld           m4, 2
   1648    paddd           m2, m5
   1649    pslld           m5, 2
   1650    paddw           m0, m3             ; a 565
   1651    paddd           m1, m4             ; b 565
   1652    paddd           m2, m5
   1653    paddw           m3, m0, [t4+wq*1+400*2+ 0]
   1654    paddd           m4, m1, [t3+wq*2+400*4+ 0]
   1655    paddd           m5, m2, [t3+wq*2+400*4+16]
   1656    mova [t4+wq*1+400*2+ 0], m0
   1657    mova [t3+wq*2+400*4+ 0], m1
   1658    mova [t3+wq*2+400*4+16], m2
   1659    mova            m0, [dstq+wq]
   1660    punpcklwd       m1, m0, m6          ; src
   1661    punpcklwd       m2, m3, m6          ; a
   1662    pmaddwd         m2, m1              ; a * src
   1663    punpckhwd       m1, m0, m6
   1664    punpckhwd       m3, m6
   1665    pmaddwd         m3, m1
   1666    psubd           m4, m2              ; b - a * src + (1 << 8)
   1667    psubd           m5, m3
   1668    psrad           m4, 9
   1669    psrad           m5, 9
   1670    packssdw        m4, m5
   1671    pmulhrsw        m4, m7
   1672    paddw           m0, m4
   1673    pmaxsw          m0, m6
   1674    pminsw          m0, m14
   1675    mova     [dstq+wq], m0
   1676    add             wq, 16
   1677    jl .n0_loop
   1678    add           dstq, stridemp
   1679    ret
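; Per pixel, the blend above amounts to (a sketch using this file's names,
; with w the sgrproj weight preloaded into m7):
;   v   = (b - a * src + bias) >> 9    ; >> 8 in .n1 below
;   dst = clamp(src + ((v * w + (1 << 14)) >> 15), 0, pixel_max)
; pmulhrsw supplies the (v * w + (1 << 14)) >> 15 step, the weight having
; been pre-shifted left in the prologue so the product lands in the right
; range for 16bpc.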
   1680 ALIGN function_align
   1681 .n1: ; neighbor + output (odd rows)
   1682    movif64         wq, r4
   1683    movif32         wd, w1m
   1684 .n1_loop:
   1685    mova            m0, [dstq+wq]
   1686    mova            m3, [t4+wq*1+400*2+ 0]
   1687    mova            m4, [t3+wq*2+400*4+ 0]
   1688    mova            m5, [t3+wq*2+400*4+16]
   1689    punpcklwd       m1, m0, m6          ; src
   1690    punpcklwd       m2, m3, m6          ; a
   1691    pmaddwd         m2, m1
   1692    punpckhwd       m1, m0, m6
   1693    punpckhwd       m3, m6
   1694    pmaddwd         m3, m1
   1695    psubd           m4, m2              ; b - a * src + (1 << 7)
   1696    psubd           m5, m3
   1697    psrad           m4, 8
   1698    psrad           m5, 8
   1699    packssdw        m4, m5
   1700    pmulhrsw        m4, m7
   1701    paddw           m0, m4
   1702    pmaxsw          m0, m6
   1703    pminsw          m0, m14
   1704    mova     [dstq+wq], m0
   1705    add             wq, 16
   1706    jl .n1_loop
   1707    add           dstq, stridemp
   1708    movif32       dstm, dstq
   1709    ret
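; .n0 shifts by 9 where .n1 shifts by 8: even rows sum two 565-weighted
; row pairs (the cached one plus the fresh one), so their a/b terms carry
; one extra factor of two relative to the single cached pair reused here.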
   1710 
   1711 %if ARCH_X86_32
   1712 %if STACK_ALIGNMENT < 16
   1713  %assign extra_stack 4*16
   1714 %else
   1715  %assign extra_stack 2*16
   1716 %endif
   1717 cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \
   1718                              dst, stride, left, lpf, w
   1719 %if STACK_ALIGNMENT < 16
   1720  %define dstm         dword [esp+calloff+16*2+4*0]
   1721  %define stridemp     dword [esp+calloff+16*2+4*1]
   1722  %define leftm        dword [esp+calloff+16*2+4*2]
   1723  %define lpfm         dword [esp+calloff+16*2+4*3]
   1724  %define w0m          dword [esp+calloff+16*2+4*4]
   1725  %define hd           dword [esp+calloff+16*2+4*5]
   1726  %define edgeb         byte [esp+calloff+16*2+4*6]
   1727  %define edged        dword [esp+calloff+16*2+4*6]
   1728  %define leftmp leftm
   1729 %else
   1730  %define w0m wm
   1731  %define hd dword r5m
   1732  %define edgeb  byte r7m
   1733  %define edged dword r7m
   1734 %endif
   1735 %define hvsrcm dword [esp+calloff+4*0]
   1736 %define w1m    dword [esp+calloff+4*1]
   1737 %define t3m    dword [esp+calloff+4*2]
   1738 %define t4m    dword [esp+calloff+4*3]
   1739 %define  m8 [base+pd_8]
   1740 %define  m9 [esp+calloff+16*1]
   1741 %define m10 [base+pw_455_24]
   1742 %define m11 [base+pd_34816]
   1743 %define m12 [base+sgr_lshuf3]
   1744 %define m13 [base+pw_1023]
   1745 %define m14 [base+pf_256]
   1746 %define base r6-pw_455_24
   1747 %assign calloff 0
   1748 %if STACK_ALIGNMENT < 16
   1749    mov        strideq, [rstk+stack_offset+ 8]
   1750    mov          leftq, [rstk+stack_offset+12]
   1751    mov           lpfq, [rstk+stack_offset+16]
   1752    mov             wd, [rstk+stack_offset+20]
   1753    mov           dstm, dstq
   1754    mov       stridemp, strideq
   1755    mov          leftm, leftq
   1756    mov             r1, [rstk+stack_offset+24]
   1757    mov             r2, [rstk+stack_offset+32]
   1758    mov           lpfm, lpfq
   1759    mov             hd, r1
   1760    mov          edged, r2
   1761 %endif
   1762 %else
   1763 cglobal sgr_filter_3x3_16bpc, 4, 13, 15, -400*42-8, dst, stride, left, lpf, \
   1764                                                    w, h, edge, params
   1765 %endif
   1766 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16
   1767    movifnidn       wd, wm
   1768 %endif
   1769 %if ARCH_X86_64
   1770    mov        paramsq, r6mp
   1771    movifnidn       hd, hm
   1772    add             wd, wd
   1773    mov          edged, r7m
   1774    movq            m9, [paramsq+4]
   1775    add           lpfq, wq
   1776    lea             t1, [rsp+wq+12]
   1777    mova            m8, [pd_8]
   1778    add           dstq, wq
   1779    lea             t3, [rsp+wq*2+400*12+8]
   1780    mova           m10, [pw_455_24]
   1781    lea             t4, [rsp+wq+400*32+8]
   1782    mova           m11, [pd_34816]
   1783    pshuflw         m7, m9, q3333
   1784    pshufb          m9, [pw_256]  ; s1
   1785    punpcklqdq      m7, m7        ; w1
   1786    neg             wq
   1787    pxor            m6, m6
   1788    mova           m13, [pw_1023]
   1789    psllw           m7, 4
   1790    mova           m12, [sgr_lshuf3]
   1791    movaps         m14, [pf_256]
   1792 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
   1793 %define lpfm [rsp]
   1794 %else
   1795    mov             r1, [rstk+stack_offset+28] ; params
   1796    LEA             r6, pw_455_24
   1797    add             wd, wd
   1798    movq            m1, [r1+4]
   1799    add           lpfm, wq
   1800    lea             t1, [rsp+extra_stack+wq+20]
   1801    add           dstq, wq
   1802    lea             t3, [rsp+extra_stack+wq*2+400*12+16]
   1803    mov           dstm, dstq
   1804    lea             t4, [rsp+extra_stack+wq+400*32+16]
   1805    mov            t3m, t3
   1806    pshuflw         m7, m1, q3333
   1807    mov            t4m, t4
   1808    pshufb          m1, [base+pw_256] ; s1
   1809    punpcklqdq      m7, m7            ; w1
   1810    psllw           m7, 4
   1811    neg             wq
   1812    mova            m9, m1
   1813    pxor            m6, m6
   1814    mov            w1m, wd
   1815    sub             wd, 4
   1816    mov           lpfq, lpfm
   1817    mov            w0m, wd
   1818 %define strideq r5
   1819 %endif
   1820    test         edgeb, 4 ; LR_HAVE_TOP
   1821    jz .no_top
   1822    call .h_top
   1823    add           lpfq, stridemp
   1824    mov             t2, t1
   1825    add             t1, 400*6
   1826    call .h_top
   1827    movif32    strideq, stridemp
   1828    lea            r10, [lpfq+strideq*4]
   1829    mov           lpfq, dstq
   1830    add            r10, strideq
   1831    mov           lpfm, r10 ; below
   1832    movif32         t4, t4m
   1833    call .hv0
   1834 .main:
   1835    dec             hd
   1836    jz .height1
   1837    movif32       lpfq, hvsrcm
   1838    add           lpfq, stridemp
   1839    call .hv1
   1840    call .prep_n
   1841    sub             hd, 2
   1842    jl .extend_bottom
   1843 .main_loop:
   1844    movif32       lpfq, hvsrcm
   1845    add           lpfq, stridemp
   1846    call .hv0
   1847 %if ARCH_X86_64
   1848    test            hb, hb
   1849 %else
   1850    mov             r4, hd
   1851    test            r4, r4
   1852 %endif
   1853    jz .odd_height
   1854    movif32       lpfq, hvsrcm
   1855    add           lpfq, stridemp
   1856    call .hv1
   1857    call .n0
   1858    call .n1
   1859    sub             hd, 2
   1860    jge .main_loop
   1861    test         edgeb, 8 ; LR_HAVE_BOTTOM
   1862    jz .extend_bottom
   1863    mov           lpfq, lpfm
   1864    call .hv0_bottom
   1865    movif32       lpfq, hvsrcm
   1866    add           lpfq, stridemp
   1867    call .hv1_bottom
   1868 .end:
   1869    call .n0
   1870    call .n1
   1871 .end2:
   1872    RET
   1873 .height1:
   1874    call .v1
   1875    call .prep_n
   1876    jmp .odd_height_end
   1877 .odd_height:
   1878    call .v1
   1879    call .n0
   1880    call .n1
   1881 .odd_height_end:
   1882    call .v0
   1883    call .v1
   1884    call .n0
   1885    jmp .end2
   1886 .extend_bottom:
   1887    call .v0
   1888    call .v1
   1889    jmp .end
   1890 .no_top:
   1891    movif32    strideq, stridemp
   1892    lea            r10, [lpfq+strideq*4]
   1893    mov           lpfq, dstq
   1894    lea            r10, [r10+strideq*2]
   1895    mov           lpfm, r10
   1896    call .h
   1897 %if ARCH_X86_64
   1898    lea             wq, [r4-4]
   1899 %else
   1900    mov             wq, w0m
   1901    mov         hvsrcm, lpfq
   1902 %endif
   1903    lea             t2, [t1+400*6]
   1904 .top_fixup_loop:
   1905    mova            m0, [t1+wq+400*0]
   1906    mova            m1, [t1+wq+400*2]
   1907    mova            m2, [t1+wq+400*4]
   1908    mova [t2+wq+400*0], m0
   1909    mova [t2+wq+400*2], m1
   1910    mova [t2+wq+400*4], m2
   1911    add             wq, 16
   1912    jl .top_fixup_loop
   1913    movif32         t3, t3m
   1914    movif32         t4, t4m
   1915    call .v0
   1916    jmp .main
   1917 .extend_right:
   1918    movd            m1, wd
   1919    movd            m5, [lpfq-2]
   1920    mova            m2, [base+pw_256]
   1921    mova            m3, [base+pb_0to15]
   1922    pshufb          m1, m6
   1923    pshufb          m5, m2
   1924    psubb           m2, m1
   1925    pcmpgtb         m2, m3
   1926    pand            m4, m2
   1927    pandn           m2, m5
   1928    por             m4, m2
   1929    ret
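; .extend_right builds a per-byte lane mask by comparing the remaining
; width (broadcast from wd) against pb_0to15, then blends a broadcast of
; the last valid pixel ([lpfq-2]) into the lanes of m4 that fall past the
; right edge, i.e. replicate-padding inside the register.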
   1930 %assign stack_offset stack_offset+4
   1931 %assign calloff 4
   1932 .h: ; horizontal boxsum
   1933 %if ARCH_X86_64
   1934    lea             wq, [r4-4]
   1935 %else
   1936 %define leftq r4
   1937 %endif
   1938    test         edgeb, 1 ; LR_HAVE_LEFT
   1939    jz .h_extend_left
   1940    movif32      leftq, leftm
   1941    movddup         m5, [leftq]
   1942    movif32         wq, w0m
   1943    mova            m4, [lpfq+wq+4]
   1944    add         leftmp, 8
   1945    palignr         m4, m5, 12
   1946    jmp .h_main
   1947 .h_extend_left:
   1948    movif32         wq, w0m
   1949    mova            m4, [lpfq+wq+4]
   1950    pshufb          m4, m12
   1951    jmp .h_main
   1952 .h_top:
   1953 %if ARCH_X86_64
   1954    lea             wq, [r4-4]
   1955 %endif
   1956    test         edgeb, 1 ; LR_HAVE_LEFT
   1957    jz .h_extend_left
   1958    movif32         wq, w0m
   1959 .h_loop:
   1960    movu            m4, [lpfq+wq+ 0]
   1961 .h_main:
   1962    movu            m5, [lpfq+wq+16]
   1963    test         edgeb, 2 ; LR_HAVE_RIGHT
   1964    jnz .h_have_right
   1965    cmp             wd, -18
   1966    jl .h_have_right
   1967    call .extend_right
   1968 .h_have_right:
   1969    palignr         m0, m5, m4, 2
   1970    paddw           m1, m4, m0
   1971    punpcklwd       m2, m4, m0
   1972    pmaddwd         m2, m2
   1973    punpckhwd       m3, m4, m0
   1974    pmaddwd         m3, m3
   1975    palignr         m5, m4, 4
   1976    paddw           m1, m5             ; sum
   1977    punpcklwd       m4, m5, m6
   1978    pmaddwd         m4, m4
   1979    punpckhwd       m5, m6
   1980    pmaddwd         m5, m5
   1981    paddd           m2, m4             ; sumsq
   1982    paddd           m3, m5
   1983    mova [t1+wq+400*0], m1
   1984    mova [t1+wq+400*2], m2
   1985    mova [t1+wq+400*4], m3
   1986    add             wq, 16
   1987    jl .h_loop
   1988    ret
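; One .h iteration above computes, per output column (s = source row;
; a scalar sketch of the SIMD flow):
;   sum[x]   = s[x] + s[x+1] + s[x+2]
;   sumsq[x] = s[x]^2 + s[x+1]^2 + s[x+2]^2
; Each pmaddwd squares and sums two interleaved taps at once; t1 receives
; the word sums plus the low/high dword halves of the squared sums.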
   1989 ALIGN function_align
   1990 .hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
   1991 %if ARCH_X86_64
   1992    lea             wq, [r4-4]
   1993 %else
   1994    mov         hvsrcm, lpfq
   1995 %endif
   1996    test         edgeb, 1 ; LR_HAVE_LEFT
   1997    jz .hv0_extend_left
   1998    movif32      leftq, leftm
   1999    movddup         m5, [leftq]
   2000    movif32         wq, w0m
   2001    mova            m4, [lpfq+wq+4]
   2002    add         leftmp, 8
   2003    palignr         m4, m5, 12
   2004    jmp .hv0_main
   2005 .hv0_extend_left:
   2006    movif32         wq, w0m
   2007    mova            m4, [lpfq+wq+4]
   2008    pshufb          m4, m12
   2009    jmp .hv0_main
   2010 .hv0_bottom:
   2011 %if ARCH_X86_64
   2012    lea             wq, [r4-4]
   2013 %else
   2014    mov         hvsrcm, lpfq
   2015 %endif
   2016    test         edgeb, 1 ; LR_HAVE_LEFT
   2017    jz .hv0_extend_left
   2018    movif32         wq, w0m
   2019 %if ARCH_X86_32
   2020    jmp .hv0_loop_start
   2021 %endif
   2022 .hv0_loop:
   2023    movif32       lpfq, hvsrcm
   2024 .hv0_loop_start:
   2025    movu            m4, [lpfq+wq+ 0]
   2026 .hv0_main:
   2027    movu            m5, [lpfq+wq+16]
   2028    test         edgeb, 2 ; LR_HAVE_RIGHT
   2029    jnz .hv0_have_right
   2030    cmp             wd, -18
   2031    jl .hv0_have_right
   2032    call .extend_right
   2033 .hv0_have_right:
   2034    palignr         m0, m5, m4, 2
   2035    paddw           m1, m4, m0
   2036    punpcklwd       m2, m4, m0
   2037    pmaddwd         m2, m2
   2038    punpckhwd       m3, m4, m0
   2039    pmaddwd         m3, m3
   2040    palignr         m5, m4, 4
   2041    paddw           m1, m5             ; sum
   2042    punpcklwd       m4, m5, m6
   2043    pmaddwd         m4, m4
   2044    punpckhwd       m5, m6
   2045    pmaddwd         m5, m5
   2046    paddd           m2, m4             ; sumsq
   2047    paddd           m3, m5
   2048    paddw           m0, m1, [t1+wq+400*0]
   2049    paddd           m4, m2, [t1+wq+400*2]
   2050    paddd           m5, m3, [t1+wq+400*4]
   2051    mova [t1+wq+400*0], m1
   2052    mova [t1+wq+400*2], m2
   2053    mova [t1+wq+400*4], m3
   2054    paddw           m1, m0, [t2+wq+400*0]
   2055    paddd           m2, m4, [t2+wq+400*2]
   2056    paddd           m3, m5, [t2+wq+400*4]
   2057    mova [t2+wq+400*0], m0
   2058    mova [t2+wq+400*2], m4
   2059    mova [t2+wq+400*4], m5
   2060    paddd           m2, m8
   2061    paddd           m3, m8
   2062    psrld           m2, 4              ; (a + 8) >> 4
   2063    psrld           m3, 4
   2064    pslld           m4, m2, 3
   2065    pslld           m5, m3, 3
   2066    paddd           m4, m2             ; ((a + 8) >> 4) * 9
   2067    paddd           m5, m3
   2068    psrlw           m3, m1, 1
   2069    pavgw           m3, m6             ; (b + 2) >> 2
   2070    movif32         t3, t3m
   2071    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m6, m9, m10, m14
   2072    punpcklwd       m2, m3, m3
   2073    mova     [t4+wq+4], m3
   2074    punpckhwd       m3, m3
   2075    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   2076    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
   2077    paddd           m1, m11
   2078    psrld           m0, 12
   2079    psrld           m1, 12
   2080    mova  [t3+wq*2+ 8], m0
   2081    mova  [t3+wq*2+24], m1
   2082    add             wq, 16
   2083    jl .hv0_loop
   2084    ret
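; The ab step above mirrors the 5x5 one with box-of-9 constants
; (a scalar sketch, names per the inline comments):
;   y  = (a + 8) >> 4
;   a9 = (y << 3) + y           ; == y * 9
;   bs = (b + 2) >> 2
; and the stored b term is (x * b * 455 + (1 << 11) + (1 << 15)) >> 12,
; 455 = round(4096/9) being the sgrproj one_by_x constant for this box.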
   2085 ALIGN function_align
   2086 .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
   2087 %if ARCH_X86_64
   2088    lea             wq, [r4-4]
   2089 %else
   2090    mov         hvsrcm, lpfq
   2091 %endif
   2092    test         edgeb, 1 ; LR_HAVE_LEFT
   2093    jz .hv1_extend_left
   2094    movif32      leftq, leftm
   2095    movddup         m5, [leftq]
   2096    movif32         wq, w0m
   2097    mova            m4, [lpfq+wq+4]
   2098    add         leftmp, 8
   2099    palignr         m4, m5, 12
   2100    jmp .hv1_main
   2101 .hv1_extend_left:
   2102    movif32         wq, w0m
   2103    mova            m4, [lpfq+wq+4]
   2104    pshufb          m4, m12
   2105    jmp .hv1_main
   2106 .hv1_bottom:
   2107 %if ARCH_X86_64
   2108    lea             wq, [r4-4]
   2109 %else
   2110    mov         hvsrcm, lpfq
   2111 %endif
   2112    test         edgeb, 1 ; LR_HAVE_LEFT
   2113    jz .hv1_extend_left
   2114    movif32         wq, w0m
   2115 %if ARCH_X86_32
   2116    jmp .hv1_loop_start
   2117 %endif
   2118 .hv1_loop:
   2119    movif32       lpfq, hvsrcm
   2120 .hv1_loop_start:
   2121    movu            m4, [lpfq+wq+ 0]
   2122 .hv1_main:
   2123    movu            m5, [lpfq+wq+16]
   2124    test         edgeb, 2 ; LR_HAVE_RIGHT
   2125    jnz .hv1_have_right
   2126    cmp             wd, -18
   2127    jl .hv1_have_right
   2128    call .extend_right
   2129 .hv1_have_right:
   2130    palignr         m1, m5, m4, 2
   2131    paddw           m0, m4, m1
   2132    punpcklwd       m2, m4, m1
   2133    pmaddwd         m2, m2
   2134    punpckhwd       m3, m4, m1
   2135    pmaddwd         m3, m3
   2136    palignr         m5, m4, 4
   2137    paddw           m0, m5             ; h sum
   2138    punpcklwd       m1, m5, m6
   2139    pmaddwd         m1, m1
   2140    punpckhwd       m5, m6
   2141    pmaddwd         m5, m5
   2142    paddd           m2, m1             ; h sumsq
   2143    paddd           m3, m5
   2144    paddw           m1, m0, [t2+wq+400*0]
   2145    paddd           m4, m2, [t2+wq+400*2]
   2146    paddd           m5, m3, [t2+wq+400*4]
   2147    mova [t2+wq+400*0], m0
   2148    mova [t2+wq+400*2], m2
   2149    mova [t2+wq+400*4], m3
   2150    paddd           m4, m8
   2151    paddd           m5, m8
   2152    psrld           m4, 4              ; (a + 8) >> 4
   2153    psrld           m5, 4
   2154    pslld           m2, m4, 3
   2155    pslld           m3, m5, 3
   2156    paddd           m4, m2             ; ((a + 8) >> 4) * 9
   2157    paddd           m5, m3
   2158    psrlw           m3, m1, 1
   2159    pavgw           m3, m6             ; (b + 2) >> 2
   2160    movif32         t3, t3m
   2161    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m6, m9, m10, m14
   2162    punpcklwd       m2, m3, m3
   2163    mova [t4+wq*1+400*2+ 4], m3
   2164    punpckhwd       m3, m3
   2165    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   2166    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
   2167    paddd           m1, m11
   2168    psrld           m0, 12
   2169    psrld           m1, 12
   2170    mova [t3+wq*2+400*4+ 8], m0
   2171    mova [t3+wq*2+400*4+24], m1
   2172    add             wq, 16
   2173    jl .hv1_loop
   2174    mov            r10, t2
   2175    mov             t2, t1
   2176    mov             t1, r10
   2177    ret
   2178 .v0: ; vertical boxsums + ab (even rows)
   2179 %if ARCH_X86_64
   2180    lea             wq, [r4-4]
   2181 %else
   2182    mov             wd, w0m
   2183 %endif
   2184 .v0_loop:
   2185    mova            m0, [t1+wq+400*0]
   2186    mova            m4, [t1+wq+400*2]
   2187    mova            m5, [t1+wq+400*4]
   2188    paddw           m0, m0
   2189    paddd           m4, m4
   2190    paddd           m5, m5
   2191    paddw           m1, m0, [t2+wq+400*0]
   2192    paddd           m2, m4, [t2+wq+400*2]
   2193    paddd           m3, m5, [t2+wq+400*4]
   2194    mova [t2+wq+400*0], m0
   2195    mova [t2+wq+400*2], m4
   2196    mova [t2+wq+400*4], m5
   2197    paddd           m2, m8
   2198    paddd           m3, m8
   2199    psrld           m2, 4              ; (a + 8) >> 4
   2200    psrld           m3, 4
   2201    pslld           m4, m2, 3
   2202    pslld           m5, m3, 3
   2203    paddd           m4, m2             ; ((a + 8) >> 4) * 9
   2204    paddd           m5, m3
   2205    psrlw           m3, m1, 1
   2206    pavgw           m3, m6             ; (b + 2) >> 2
   2207    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m6, m9, m10, m14
   2208    punpcklwd       m2, m3, m3
   2209    mova [t4+wq*1+400*0+ 4], m3
   2210    punpckhwd       m3, m3
   2211    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   2212    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
   2213    paddd           m1, m11
   2214    psrld           m0, 12
   2215    psrld           m1, 12
   2216    mova [t3+wq*2+400*0+ 8], m0
   2217    mova [t3+wq*2+400*0+24], m1
   2218    add             wq, 16
   2219    jl .v0_loop
   2220    ret
   2221 .v1: ; vertical boxsums + ab (odd rows)
   2222 %if ARCH_X86_64
   2223    lea             wq, [r4-4]
   2224 %else
   2225    mov             wd, w0m
   2226 %endif
   2227 .v1_loop:
   2228    mova            m0, [t1+wq+400*0]
   2229    mova            m4, [t1+wq+400*2]
   2230    mova            m5, [t1+wq+400*4]
   2231    paddw           m1, m0, [t2+wq+400*0]
   2232    paddd           m2, m4, [t2+wq+400*2]
   2233    paddd           m3, m5, [t2+wq+400*4]
   2234    mova [t2+wq+400*0], m0
   2235    mova [t2+wq+400*2], m4
   2236    mova [t2+wq+400*4], m5
   2237    paddd           m2, m8
   2238    paddd           m3, m8
   2239    psrld           m2, 4              ; (a + 8) >> 4
   2240    psrld           m3, 4
   2241    pslld           m4, m2, 3
   2242    pslld           m5, m3, 3
   2243    paddd           m4, m2             ; ((a + 8) >> 4) * 9
   2244    paddd           m5, m3
   2245    psrlw           m3, m1, 1
   2246    pavgw           m3, m6             ; (b + 2) >> 2
   2247    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m6, m9, m10, m14
   2248    punpcklwd       m2, m3, m3
   2249    mova [t4+wq*1+400*2+ 4], m3
   2250    punpckhwd       m3, m3
   2251    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   2252    paddd           m0, m11            ; x * b * 455 + (1 << 11) + (1 << 15)
   2253    paddd           m1, m11
   2254    psrld           m0, 12
   2255    psrld           m1, 12
   2256    mova [t3+wq*2+400*4+ 8], m0
   2257    mova [t3+wq*2+400*4+24], m1
   2258    add             wq, 16
   2259    jl .v1_loop
   2260    mov            r10, t2
   2261    mov             t2, t1
   2262    mov             t1, r10
   2263    ret
   2264 .prep_n: ; initial neighbor setup
   2265    movif64         wq, r4
   2266    movif32         wd, w1m
   2267 .prep_n_loop:
   2268    movu            m0, [t4+wq*1+400*0+ 4]
   2269    movu            m1, [t3+wq*2+400*0+ 8]
   2270    movu            m2, [t3+wq*2+400*0+24]
   2271    movu            m3, [t4+wq*1+400*0+ 2]
   2272    movu            m4, [t3+wq*2+400*0+ 4]
   2273    movu            m5, [t3+wq*2+400*0+20]
   2274    paddw           m0, [t4+wq*1+400*0+ 0]
   2275    paddd           m1, [t3+wq*2+400*0+ 0]
   2276    paddd           m2, [t3+wq*2+400*0+16]
   2277    paddw           m3, m0
   2278    paddd           m4, m1
   2279    paddd           m5, m2
   2280    psllw           m3, 2                ; a[-1] 444
   2281    pslld           m4, 2                ; b[-1] 444
   2282    pslld           m5, 2
   2283    psubw           m3, m0               ; a[-1] 343
   2284    psubd           m4, m1               ; b[-1] 343
   2285    psubd           m5, m2
   2286    mova [t4+wq*1+400*4], m3
   2287    mova [t3+wq*2+400*8+ 0], m4
   2288    mova [t3+wq*2+400*8+16], m5
   2289    movu            m0, [t4+wq*1+400*2+ 4]
   2290    movu            m1, [t3+wq*2+400*4+ 8]
   2291    movu            m2, [t3+wq*2+400*4+24]
   2292    movu            m3, [t4+wq*1+400*2+ 2]
   2293    movu            m4, [t3+wq*2+400*4+ 4]
   2294    movu            m5, [t3+wq*2+400*4+20]
   2295    paddw           m0, [t4+wq*1+400*2+ 0]
   2296    paddd           m1, [t3+wq*2+400*4+ 0]
   2297    paddd           m2, [t3+wq*2+400*4+16]
   2298    paddw           m3, m0
   2299    paddd           m4, m1
   2300    paddd           m5, m2
   2301    psllw           m3, 2                 ; a[ 0] 444
   2302    pslld           m4, 2                 ; b[ 0] 444
   2303    pslld           m5, 2
   2304    mova [t4+wq*1+400* 6], m3
   2305    mova [t3+wq*2+400*12+ 0], m4
   2306    mova [t3+wq*2+400*12+16], m5
   2307    psubw           m3, m0                ; a[ 0] 343
   2308    psubd           m4, m1                ; b[ 0] 343
   2309    psubd           m5, m2
   2310    mova [t4+wq*1+400* 8], m3
   2311    mova [t3+wq*2+400*16+ 0], m4
   2312    mova [t3+wq*2+400*16+16], m5
   2313    add             wq, 16
   2314    jl .prep_n_loop
   2315    ret
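; Scalar model of the 343/444 weighting prepared above (l/c/r again the
; three horizontal a/b taps; a sketch only):
;   v444 = 4*(l + c + r)
;   v343 = v444 - (l + r) = 3*l + 4*c + 3*r
; The 3x3 kernel runs on every row; .n0/.n1 below combine v343 from the
; rows above and below with v444 from the current row.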
   2316 ALIGN function_align
   2317 .n0: ; neighbor + output (even rows)
   2318    movif64         wq, r4
   2319    movif32         wd, w1m
   2320 .n0_loop:
   2321    movu            m3, [t4+wq*1+400*0+4]
   2322    movu            m1, [t4+wq*1+400*0+2]
   2323    paddw           m3, [t4+wq*1+400*0+0]
   2324    paddw           m1, m3
   2325    psllw           m1, 2                ; a[ 1] 444
   2326    psubw           m2, m1, m3           ; a[ 1] 343
   2327    paddw           m3, m2, [t4+wq*1+400*4]
   2328    paddw           m3, [t4+wq*1+400*6]
   2329    mova [t4+wq*1+400*4], m2
   2330    mova [t4+wq*1+400*6], m1
   2331    movu            m4, [t3+wq*2+400*0+8]
   2332    movu            m1, [t3+wq*2+400*0+4]
   2333    paddd           m4, [t3+wq*2+400*0+0]
   2334    paddd           m1, m4
   2335    pslld           m1, 2                ; b[ 1] 444
   2336    psubd           m2, m1, m4           ; b[ 1] 343
   2337    paddd           m4, m2, [t3+wq*2+400* 8+ 0]
   2338    paddd           m4, [t3+wq*2+400*12+ 0]
   2339    mova [t3+wq*2+400* 8+ 0], m2
   2340    mova [t3+wq*2+400*12+ 0], m1
   2341    movu            m5, [t3+wq*2+400*0+24]
   2342    movu            m1, [t3+wq*2+400*0+20]
   2343    paddd           m5, [t3+wq*2+400*0+16]
   2344    paddd           m1, m5
   2345    pslld           m1, 2
   2346    psubd           m2, m1, m5
   2347    paddd           m5, m2, [t3+wq*2+400* 8+16]
   2348    paddd           m5, [t3+wq*2+400*12+16]
   2349    mova [t3+wq*2+400* 8+16], m2
   2350    mova [t3+wq*2+400*12+16], m1
   2351    mova            m0, [dstq+wq]
   2352    punpcklwd       m1, m0, m6
   2353    punpcklwd       m2, m3, m6
   2354    pmaddwd         m2, m1               ; a * src
   2355    punpckhwd       m1, m0, m6
   2356    punpckhwd       m3, m6
   2357    pmaddwd         m3, m1
   2358    psubd           m4, m2               ; b - a * src + (1 << 8)
   2359    psubd           m5, m3
   2360    psrad           m4, 9
   2361    psrad           m5, 9
   2362    packssdw        m4, m5
   2363    pmulhrsw        m4, m7
   2364    paddw           m0, m4
   2365    pmaxsw          m0, m6
   2366    pminsw          m0, m13
   2367    mova     [dstq+wq], m0
   2368    add             wq, 16
   2369    jl .n0_loop
   2370    add           dstq, stridemp
   2371    ret
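; The stores above roll the weighting pipeline forward: the fresh 343 row
; replaces the oldest cached one and the fresh 444 row is parked for the
; following pass, so each output pixel sees 343 (row-1) + 444 (row) +
; 343 (row+1) before the same b - a * src blend used by the 5x5 kernel.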
   2372 ALIGN function_align
   2373 .n1: ; neighbor + output (odd rows)
   2374    movif64         wq, r4
   2375    movif32         wd, w1m
   2376 .n1_loop:
   2377    movu            m3, [t4+wq*1+400*2+4]
   2378    movu            m1, [t4+wq*1+400*2+2]
   2379    paddw           m3, [t4+wq*1+400*2+0]
   2380    paddw           m1, m3
   2381    psllw           m1, 2                ; a[ 1] 444
   2382    psubw           m2, m1, m3           ; a[ 1] 343
   2383    paddw           m3, m2, [t4+wq*1+400*6]
   2384    paddw           m3, [t4+wq*1+400*8]
   2385    mova [t4+wq*1+400*6], m1
   2386    mova [t4+wq*1+400*8], m2
   2387    movu            m4, [t3+wq*2+400*4+8]
   2388    movu            m1, [t3+wq*2+400*4+4]
   2389    paddd           m4, [t3+wq*2+400*4+0]
   2390    paddd           m1, m4
   2391    pslld           m1, 2                ; b[ 1] 444
   2392    psubd           m2, m1, m4           ; b[ 1] 343
   2393    paddd           m4, m2, [t3+wq*2+400*12+ 0]
   2394    paddd           m4, [t3+wq*2+400*16+ 0]
   2395    mova [t3+wq*2+400*12+ 0], m1
   2396    mova [t3+wq*2+400*16+ 0], m2
   2397    movu            m5, [t3+wq*2+400*4+24]
   2398    movu            m1, [t3+wq*2+400*4+20]
   2399    paddd           m5, [t3+wq*2+400*4+16]
   2400    paddd           m1, m5
   2401    pslld           m1, 2
   2402    psubd           m2, m1, m5
   2403    paddd           m5, m2, [t3+wq*2+400*12+16]
   2404    paddd           m5, [t3+wq*2+400*16+16]
   2405    mova [t3+wq*2+400*12+16], m1
   2406    mova [t3+wq*2+400*16+16], m2
   2407    mova            m0, [dstq+wq]
   2408    punpcklwd       m1, m0, m6
   2409    punpcklwd       m2, m3, m6
   2410    pmaddwd         m2, m1               ; a * src
   2411    punpckhwd       m1, m0, m6
   2412    punpckhwd       m3, m6
   2413    pmaddwd         m3, m1
   2414    psubd           m4, m2               ; b - a * src + (1 << 8)
   2415    psubd           m5, m3
   2416    psrad           m4, 9
   2417    psrad           m5, 9
   2418    packssdw        m4, m5
   2419    pmulhrsw        m4, m7
   2420    paddw           m0, m4
   2421    pmaxsw          m0, m6
   2422    pminsw          m0, m13
   2423    mova     [dstq+wq], m0
   2424    add             wq, 16
   2425    jl .n1_loop
   2426    add           dstq, stridemp
   2427    movif32       dstm, dstq
   2428    ret
   2429 
   2430 %if ARCH_X86_32
   2431 %if STACK_ALIGNMENT < 16
   2432  %assign extra_stack 10*16
   2433 %else
   2434  %assign extra_stack 8*16
   2435 %endif
   2436 cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \
   2437                              dst, stride, left, lpf, w
   2438 %if STACK_ALIGNMENT < 16
   2439  %define dstm         dword [esp+calloff+16*8+4*0]
   2440  %define stridemp     dword [esp+calloff+16*8+4*1]
   2441  %define leftm        dword [esp+calloff+16*8+4*2]
   2442  %define lpfm         dword [esp+calloff+16*8+4*3]
   2443  %define w0m          dword [esp+calloff+16*8+4*4]
   2444  %define hd           dword [esp+calloff+16*8+4*5]
   2445  %define edgeb         byte [esp+calloff+16*8+4*6]
   2446  %define edged        dword [esp+calloff+16*8+4*6]
   2447  %define leftmp leftm
   2448 %else
   2449  %define w0m wm
   2450  %define hd dword r5m
   2451  %define edgeb  byte r7m
   2452  %define edged dword r7m
   2453 %endif
   2454 %define hvsrcm dword [esp+calloff+4*0]
   2455 %define w1m    dword [esp+calloff+4*1]
   2456 %define t3m    dword [esp+calloff+4*2]
   2457 %define t4m    dword [esp+calloff+4*3]
   2458 %xdefine m8 m6
   2459 %define  m9 [base+pd_8]
   2460 %define m10 [base+pd_34816]
   2461 %define m11 [base+pw_455_24]
   2462 %define m12 [base+pw_164_24]
   2463 %define m13 [esp+calloff+16*4]
   2464 %define m14 [esp+calloff+16*5]
   2465 %define m15 [esp+calloff+16*6]
   2466 %define  m6 [esp+calloff+16*7]
   2467 %define base r6-pw_455_24
   2468 %assign calloff 0
   2469 %if STACK_ALIGNMENT < 16
   2470    mov        strideq, [rstk+stack_offset+ 8]
   2471    mov          leftq, [rstk+stack_offset+12]
   2472    mov           lpfq, [rstk+stack_offset+16]
   2473    mov             wd, [rstk+stack_offset+20]
   2474    mov           dstm, dstq
   2475    mov       stridemp, strideq
   2476    mov          leftm, leftq
   2477    mov             r1, [rstk+stack_offset+24]
   2478    mov             r2, [rstk+stack_offset+32]
   2479    mov           lpfm, lpfq
   2480    mov             hd, r1
   2481    mov          edged, r2
   2482 %endif
   2483 %else
   2484 cglobal sgr_filter_mix_16bpc, 4, 13, 16, -400*66-40, dst, stride, left, lpf, \
   2485                                                     w, h, edge, params
   2486 %endif
   2487 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16
   2488    movifnidn       wd, wm
   2489 %endif
   2490 %if ARCH_X86_64
   2491    mov        paramsq, r6mp
   2492    movifnidn       hd, hm
   2493    add             wd, wd
   2494    mov          edged, r7m
   2495    mova           m14, [paramsq]
   2496    add           lpfq, wq
   2497    mova            m9, [pd_8]
   2498    lea             t1, [rsp+wq+44]
   2499    mova           m10, [pd_34816]
   2500    add           dstq, wq
   2501    mova           m11, [pw_455_24]
   2502    lea             t3, [rsp+wq*2+400*24+40]
   2503    mova           m12, [pw_164_24]
   2504    lea             t4, [rsp+wq+400*52+40]
   2505    neg             wq
   2506    pshufd         m15, m14, q2222 ; w0 w1
   2507    punpcklwd      m14, m14
   2508    pshufd         m13, m14, q0000 ; s0
   2509    pshufd         m14, m14, q2222 ; s1
   2510    pxor            m6, m6
   2511    psllw          m15, 2
   2512 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
   2513 %define lpfm [rsp]
   2514 %else
   2515    mov             r1, [rstk+stack_offset+28] ; params
   2516    LEA             r6, pw_455_24
   2517    add             wd, wd
   2518    mova            m2, [r1]
   2519    add           lpfm, wq
   2520    lea             t1, [rsp+extra_stack+wq+52]
   2521    add           dstq, wq
   2522    lea             t3, [rsp+extra_stack+wq*2+400*24+48]
   2523    mov           dstm, dstq
   2524    lea             t4, [rsp+extra_stack+wq+400*52+48]
   2525    mov            t3m, t3
   2526    mov            t4m, t4
   2527    neg             wq
   2528    pshuflw         m0, m2, q0000
   2529    pshuflw         m1, m2, q2222
   2530    pshufhw         m2, m2, q1010
   2531    punpcklqdq      m0, m0 ; s0
   2532    punpcklqdq      m1, m1 ; s1
   2533    punpckhqdq      m2, m2 ; w0 w1
   2534    mov            w1m, wd
   2535    pxor            m3, m3
   2536    psllw           m2, 2
   2537    mova           m13, m0
   2538    mova           m14, m1
   2539    sub             wd, 4
   2540    mova           m15, m2
   2541    mova            m6, m3
   2542    mov           lpfq, lpfm
   2543    mov            w0m, wd
   2544 %define strideq r5
   2545 %endif
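; Unlike the single-box kernels, params here carries two strengths (s0 for
; the 5x5 box, s1 for the 3x3 box) and the two sgrproj weights packed as
; "w0 w1"; the weights stay interleaved in m15 (shifted left by 2),
; presumably so the output blend can apply both filters' corrections in
; one multiply-add per half.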
   2546    test         edgeb, 4 ; LR_HAVE_TOP
   2547    jz .no_top
   2548    call .h_top
   2549    add           lpfq, stridemp
   2550    mov             t2, t1
   2551 %if ARCH_X86_64
   2552    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup
   2553 %else
   2554    mov             wq, w0m
   2555    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop
   2556 %endif
   2557    add             t1, 400*12
   2558    call .h_top
   2559    movif32    strideq, stridemp
   2560    lea            r10, [lpfq+strideq*4]
   2561    mov           lpfq, dstq
   2562    add            r10, strideq
   2563    mov           lpfm, r10 ; below
   2564    movif32         t4, t4m
   2565    call .hv0
   2566 .main:
   2567    dec             hd
   2568    jz .height1
   2569    movif32       lpfq, hvsrcm
   2570    add           lpfq, stridemp
   2571    call .hv1
   2572    call .prep_n
   2573    sub             hd, 2
   2574    jl .extend_bottom
   2575 .main_loop:
   2576    movif32       lpfq, hvsrcm
   2577    add           lpfq, stridemp
   2578    call .hv0
   2579 %if ARCH_X86_64
   2580    test            hd, hd
   2581 %else
   2582    mov             r4, hd
   2583    test            r4, r4
   2584 %endif
   2585    jz .odd_height
   2586    movif32       lpfq, hvsrcm
   2587    add           lpfq, stridemp
   2588    call .hv1
   2589    call .n0
   2590    call .n1
   2591    sub             hd, 2
   2592    jge .main_loop
   2593    test         edgeb, 8 ; LR_HAVE_BOTTOM
   2594    jz .extend_bottom
   2595    mov           lpfq, lpfm
   2596    call .hv0_bottom
   2597    movif32       lpfq, hvsrcm
   2598    add           lpfq, stridemp
   2599    call .hv1_bottom
   2600 .end:
   2601    call .n0
   2602    call .n1
   2603 .end2:
   2604    RET
   2605 .height1:
   2606    call .v1
   2607    call .prep_n
   2608    jmp .odd_height_end
   2609 .odd_height:
   2610    call .v1
   2611    call .n0
   2612    call .n1
   2613 .odd_height_end:
   2614    call .v0
   2615    call .v1
   2616    call .n0
   2617    jmp .end2
   2618 .extend_bottom:
   2619    call .v0
   2620    call .v1
   2621    jmp .end
   2622 .no_top:
   2623    movif32    strideq, stridemp
   2624    lea            r10, [lpfq+strideq*4]
   2625    mov           lpfq, dstq
   2626    lea            r10, [r10+strideq*2]
   2627    mov           lpfm, r10
   2628    call .h
   2629 %if ARCH_X86_64
   2630    lea             wq, [r4-4]
   2631 %else
   2632    mov             wq, w0m
   2633    mov         hvsrcm, lpfq
   2634 %endif
   2635    lea             t2, [t1+400*12]
   2636 .top_fixup_loop:
   2637    mova            m0, [t1+wq+400* 0]
   2638    mova            m1, [t1+wq+400* 2]
   2639    mova            m2, [t1+wq+400* 4]
   2640    paddw           m0, m0
   2641    mova            m3, [t1+wq+400* 6]
   2642    paddd           m1, m1
   2643    mova            m4, [t1+wq+400* 8]
   2644    paddd           m2, m2
   2645    mova            m5, [t1+wq+400*10]
   2646    mova [t2+wq+400* 0], m0
   2647    mova [t2+wq+400* 2], m1
   2648    mova [t2+wq+400* 4], m2
   2649    mova [t2+wq+400* 6], m3
   2650    mova [t2+wq+400* 8], m4
   2651    mova [t2+wq+400*10], m5
   2652    add             wq, 16
   2653    jl .top_fixup_loop
   2654    movif32         t3, t3m
   2655    movif32         t4, t4m
   2656    call .v0
   2657    jmp .main
   2658 .h: ; horizontal boxsum
   2659 %assign stack_offset stack_offset+4
   2660 %assign calloff 4
   2661 %if ARCH_X86_64
   2662    lea             wq, [r4-4]
   2663 %else
   2664 %define leftq r4
   2665 %endif
   2666    test         edgeb, 1 ; LR_HAVE_LEFT
   2667    jz .h_extend_left
   2668    movif32      leftq, leftm
   2669    movddup         m5, [leftq]
   2670    movif32         wq, w0m
   2671    mova            m4, [lpfq+wq+4]
   2672    add         leftmp, 8
   2673    palignr         m4, m5, 10
   2674    jmp .h_main
   2675 .h_extend_left:
   2676    movif32         wq, w0m
   2677    mova            m4, [lpfq+wq+4]
   2678    pshufb          m4, [base+sgr_lshuf5]
   2679    jmp .h_main
   2680 .h_top:
   2681 %if ARCH_X86_64
   2682    lea             wq, [r4-4]
   2683 %endif
   2684    test         edgeb, 1 ; LR_HAVE_LEFT
   2685    jz .h_extend_left
   2686    movif32         wq, w0m
   2687 .h_loop:
   2688    movu            m4, [lpfq+wq- 2]
   2689 .h_main:
   2690    movu            m5, [lpfq+wq+14]
   2691    test         edgeb, 2 ; LR_HAVE_RIGHT
   2692    jnz .h_have_right
   2693    cmp             wd, -20
   2694    jl .h_have_right
   2695 %if ARCH_X86_32
   2696    pxor            m8, m8
   2697 %endif
   2698    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
   2699 .h_have_right:
   2700    palignr         m3, m5, m4, 2
   2701    palignr         m0, m5, m4, 4
   2702    paddw           m1, m3, m0
   2703    punpcklwd       m2, m3, m0
   2704    pmaddwd         m2, m2
   2705    punpckhwd       m3, m0
   2706    pmaddwd         m3, m3
   2707    palignr         m0, m5, m4, 6
   2708    paddw           m1, m0             ; sum3
   2709    punpcklwd       m7, m0, m6
   2710    pmaddwd         m7, m7
   2711    punpckhwd       m0, m6
   2712    pmaddwd         m0, m0
   2713    paddd           m2, m7             ; sumsq3
   2714    palignr         m5, m4, 8
   2715    punpcklwd       m7, m5, m4
   2716    paddd           m3, m0
   2717    paddw           m0, m4, m5
   2718    pmaddwd         m7, m7
   2719    punpckhwd       m5, m4
   2720    pmaddwd         m5, m5
   2721    mova [t1+wq+400* 6], m1
   2722    mova [t1+wq+400* 8], m2
   2723    mova [t1+wq+400*10], m3
   2724    paddw           m0, m1             ; sum5
   2725    paddd           m7, m2             ; sumsq5
   2726    paddd           m5, m3
   2727    mova [t1+wq+400* 0], m0
   2728    mova [t1+wq+400* 2], m7
   2729    mova [t1+wq+400* 4], m5
   2730    add             wq, 16
   2731    jl .h_loop
   2732    ret
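; The mix kernel shares one horizontal pass between both boxes: the inner
; three taps form sum3/sumsq3, and the two outer taps are added on top to
; get sum5/sumsq5, so each t1 line holds both sets: sum5/sumsq5 at
; 400*0..400*4 and sum3/sumsq3 at 400*6..400*10.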
   2733 ALIGN function_align
   2734 .hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
   2735 %if ARCH_X86_64
   2736    lea             wq, [r4-4]
   2737 %else
   2738    mov         hvsrcm, lpfq
   2739 %endif
   2740    test         edgeb, 1 ; LR_HAVE_LEFT
   2741    jz .hv0_extend_left
   2742    movif32      leftq, leftm
   2743    movddup         m5, [leftq]
   2744    movif32         wq, w0m
   2745    mova            m4, [lpfq+wq+4]
   2746    add         leftmp, 8
   2747    palignr         m4, m5, 10
   2748    jmp .hv0_main
   2749 .hv0_extend_left:
   2750    movif32         wq, w0m
   2751    mova            m4, [lpfq+wq+4]
   2752    pshufb          m4, [base+sgr_lshuf5]
   2753    jmp .hv0_main
   2754 .hv0_bottom:
   2755 %if ARCH_X86_64
   2756    lea             wq, [r4-4]
   2757 %else
   2758    mov         hvsrcm, lpfq
   2759 %endif
   2760    test         edgeb, 1 ; LR_HAVE_LEFT
   2761    jz .hv0_extend_left
   2762    movif32         wq, w0m
   2763 %if ARCH_X86_32
   2764    jmp .hv0_loop_start
   2765 %endif
   2766 .hv0_loop:
   2767    movif32       lpfq, hvsrcm
   2768 .hv0_loop_start:
   2769    movu            m4, [lpfq+wq- 2]
   2770 .hv0_main:
   2771    movu            m5, [lpfq+wq+14]
   2772    test         edgeb, 2 ; LR_HAVE_RIGHT
   2773    jnz .hv0_have_right
   2774    cmp             wd, -20
   2775    jl .hv0_have_right
   2776 %if ARCH_X86_32
   2777    pxor            m8, m8
   2778 %endif
   2779    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
   2780 .hv0_have_right:
   2781    palignr         m3, m5, m4, 2
   2782    palignr         m0, m5, m4, 4
   2783    movif32         t3, t3m
   2784    paddw           m1, m3, m0
   2785    punpcklwd       m2, m3, m0
   2786    pmaddwd         m2, m2
   2787    punpckhwd       m3, m0
   2788    pmaddwd         m3, m3
   2789    palignr         m0, m5, m4, 6
   2790    paddw           m1, m0             ; h sum3
   2791    punpcklwd       m7, m0, m6
   2792    pmaddwd         m7, m7
   2793    punpckhwd       m0, m6
   2794    pmaddwd         m0, m0
   2795    paddd           m2, m7             ; h sumsq3
   2796    palignr         m5, m4, 8
   2797    punpcklwd       m7, m5, m4
   2798    paddd           m3, m0
   2799    paddw           m0, m4, m5
   2800    pmaddwd         m7, m7
   2801    punpckhwd       m5, m4
   2802    pmaddwd         m5, m5
   2803    paddw           m0, m1             ; h sum5
   2804    paddd           m7, m2             ; h sumsq5
   2805    paddd           m5, m3
   2806    mova [t3+wq*2+400*8+ 8], m0
   2807    mova [t3+wq*2+400*0+ 8], m7
   2808    mova [t3+wq*2+400*0+24], m5
   2809    paddw           m0, [t1+wq+400* 0]
   2810    paddd           m7, [t1+wq+400* 2]
   2811    paddd           m5, [t1+wq+400* 4]
   2812    mova [t1+wq+400* 0], m0
   2813    mova [t1+wq+400* 2], m7
   2814    mova [t1+wq+400* 4], m5
   2815    paddw           m0, m1, [t1+wq+400* 6]
   2816    paddd           m4, m2, [t1+wq+400* 8]
   2817    paddd           m5, m3, [t1+wq+400*10]
   2818    mova [t1+wq+400* 6], m1
   2819    mova [t1+wq+400* 8], m2
   2820    mova [t1+wq+400*10], m3
   2821    paddw           m1, m0, [t2+wq+400* 6]
   2822    paddd           m2, m4, [t2+wq+400* 8]
   2823    paddd           m3, m5, [t2+wq+400*10]
   2824    mova [t2+wq+400* 6], m0
   2825    mova [t2+wq+400* 8], m4
   2826    mova [t2+wq+400*10], m5
   2827    paddd           m2, m9
   2828    paddd           m3, m9
   2829    movaps          m8, [base+pf_256]
   2830    psrld           m2, 4              ; (a3 + 8) >> 4
   2831    psrld           m3, 4
   2832    pslld           m4, m2, 3
   2833    pslld           m5, m3, 3
   2834    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
   2835    paddd           m5, m3
   2836    psrlw           m3, m1, 1
   2837 %if ARCH_X86_32
   2838    pxor            m7, m7
   2839    pavgw           m3, m7
   2840    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m7, m14, m11, m8
   2841 %else
   2842    pavgw           m3, m6             ; (b3 + 2) >> 2
   2843    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m6, m14, m11, m8
   2844 %endif
   2845    punpcklwd       m2, m3, m3
   2846    mova [t4+wq*1+400*2+ 4], m3
   2847    punpckhwd       m3, m3
   2848    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   2849    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   2850    paddd           m1, m10
   2851    psrld           m0, 12
   2852    psrld           m1, 12
   2853    mova [t3+wq*2+400*4+ 8], m0
   2854    mova [t3+wq*2+400*4+24], m1
   2855    add             wq, 16
   2856    jl .hv0_loop
   2857    ret
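; Even rows only finalize the 3x3 ab terms here; the single-row sum5 and
; sumsq5 values are parked in t3 and accumulated into t1 so that the next
; odd row (.hv1) or the tail pass (.v1) can fold them into the two-row
; 5x5 totals.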
   2858 ALIGN function_align
   2859 .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
   2860 %if ARCH_X86_64
   2861    lea             wq, [r4-4]
   2862 %else
   2863    mov         hvsrcm, lpfq
   2864 %endif
   2865    test         edgeb, 1 ; LR_HAVE_LEFT
   2866    jz .hv1_extend_left
   2867    movif32      leftq, leftm
   2868    movddup         m5, [leftq]
   2869    movif32         wq, w0m
   2870    mova            m4, [lpfq+wq+4]
   2871    add         leftmp, 8
   2872    palignr         m4, m5, 10
   2873    jmp .hv1_main
   2874 .hv1_extend_left:
   2875    movif32         wq, w0m
   2876    mova            m4, [lpfq+wq+4]
   2877    pshufb          m4, [base+sgr_lshuf5]
   2878    jmp .hv1_main
   2879 .hv1_bottom:
   2880 %if ARCH_X86_64
   2881    lea             wq, [r4-4]
   2882 %else
   2883    mov         hvsrcm, lpfq
   2884 %endif
   2885    test         edgeb, 1 ; LR_HAVE_LEFT
   2886    jz .hv1_extend_left
   2887    movif32         wq, w0m
   2888 %if ARCH_X86_32
   2889    jmp .hv1_loop_start
   2890 %endif
   2891 .hv1_loop:
   2892    movif32       lpfq, hvsrcm
   2893 .hv1_loop_start:
   2894    movu            m4, [lpfq+wq- 2]
   2895 .hv1_main:
   2896    movu            m5, [lpfq+wq+14]
   2897    test         edgeb, 2 ; LR_HAVE_RIGHT
   2898    jnz .hv1_have_right
   2899    cmp             wd, -20
   2900    jl .hv1_have_right
   2901 %if ARCH_X86_32
   2902    pxor            m8, m8
   2903 %endif
   2904    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
   2905 .hv1_have_right:
   2906    palignr         m7, m5, m4, 2
   2907    palignr         m3, m5, m4, 4
   2908    paddw           m2, m7, m3
   2909    punpcklwd       m0, m7, m3
   2910    pmaddwd         m0, m0
   2911    punpckhwd       m7, m3
   2912    pmaddwd         m7, m7
   2913    palignr         m3, m5, m4, 6
   2914    paddw           m2, m3             ; h sum3
   2915    punpcklwd       m1, m3, m6
   2916    pmaddwd         m1, m1
   2917    punpckhwd       m3, m6
   2918    pmaddwd         m3, m3
   2919    paddd           m0, m1             ; h sumsq3
   2920    palignr         m5, m4, 8
   2921    punpckhwd       m1, m4, m5
   2922    paddw           m8, m4, m5
   2923    pmaddwd         m1, m1
   2924    punpcklwd       m4, m5
   2925    pmaddwd         m4, m4
   2926    paddd           m7, m3
   2927    paddw           m5, m2, [t2+wq+400* 6]
   2928    mova [t2+wq+400* 6], m2
   2929    paddw           m8, m2             ; h sum5
   2930    paddd           m2, m0, [t2+wq+400* 8]
   2931    paddd           m3, m7, [t2+wq+400*10]
   2932    mova [t2+wq+400* 8], m0
   2933    mova [t2+wq+400*10], m7
   2934    paddd           m4, m0             ; h sumsq5
   2935    paddd           m1, m7
   2936    paddd           m2, m9
   2937    paddd           m3, m9
   2938    psrld           m2, 4              ; (a3 + 8) >> 4
   2939    psrld           m3, 4
   2940    pslld           m0, m2, 3
   2941    pslld           m7, m3, 3
   2942    paddd           m2, m0             ; ((a3 + 8) >> 4) * 9
   2943    paddd           m3, m7
   2944    psrlw           m7, m5, 1
   2945    pavgw           m7, m6             ; (b3 + 2) >> 2
   2946 %if ARCH_X86_32
   2947    mova      [esp+20], m8
   2948    mov             t3, t3m
   2949    SGR_CALC_X      m0, m5, m7, m8, m2, m3, m6, m14, m11, [base+pf_256]
   2950 %else
   2951    SGR_CALC_X      m0, m5, m7, m12, m2, m3, m6, m14, m11, [base+pf_256]
   2952 %endif
   2953    punpcklwd       m2, m7, m7
   2954    mova [t4+wq*1+400*4+4], m7
   2955    punpckhwd       m7, m7
   2956 %if ARCH_X86_32
   2957    MUL_32X16X2     m0, m5, m2, m7, m3, m8
   2958    mova            m8, [esp+20]
   2959 %else
   2960    MUL_32X16X2     m0, m5, m2, m7, m3, m12
   2961    mova           m12, [pw_164_24]
   2962 %endif
   2963    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   2964    paddd           m5, m10
   2965    psrld           m0, 12
   2966    psrld           m5, 12
   2967    mova [t3+wq*2+400*8+ 8], m0
   2968    mova [t3+wq*2+400*8+24], m5
   2969    paddw           m5, m8, [t2+wq+400*0]
   2970    paddd           m2, m4, [t2+wq+400*2]
   2971    paddd           m3, m1, [t2+wq+400*4]
   2972    paddw           m5, [t1+wq+400*0]
   2973    paddd           m2, [t1+wq+400*2]
   2974    paddd           m3, [t1+wq+400*4]
   2975    mova [t2+wq+400*0], m8
   2976    paddd           m2, m9
   2977    paddd           m3, m9
   2978    psrld           m2, 4              ; (a5 + 8) >> 4
   2979    psrld           m3, 4
   2980    mova [t2+wq+400*2], m4
   2981    pslld           m8, m2, 4
   2982    mova [t2+wq+400*4], m1
   2983    pslld           m4, m3, 4
   2984    paddd           m8, m2
   2985    pslld           m2, 3
   2986    paddd           m4, m3
   2987    pslld           m3, 3
   2988    paddd           m2, m8             ; ((a5 + 8) >> 4) * 25
   2989    paddd           m3, m4
   2990    psrlw           m1, m5, 1
   2991 %if ARCH_X86_32
   2992    pxor            m7, m7
   2993    pavgw           m1, m7
   2994    SGR_CALC_X      m0, m5, m1, m4, m2, m3, m7, m13, m12, [base+pf_256]
   2995 %else
   2996    movaps          m8, [base+pf_256]
   2997    pavgw           m1, m6             ; (b5 + 2) >> 2
   2998    SGR_CALC_X      m0, m5, m1, m4, m2, m3, m6, m13, m12, m8
   2999 %endif
   3000    punpcklwd       m2, m1, m1
   3001    mova [t4+wq*1+400*0+ 4], m1
   3002    punpckhwd       m1, m1
   3003    MUL_32X16X2     m0, m5, m2, m1, m3, m4
   3004    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
   3005    paddd           m5, m10
   3006    psrld           m0, 12
   3007    psrld           m5, 12
   3008    mova [t3+wq*2+400*0+ 8], m0
   3009    mova [t3+wq*2+400*0+24], m5
   3010    add             wq, 16
   3011    jl .hv1_loop
   3012    mov            r10, t2
   3013    mov             t2, t1
   3014    mov             t1, r10
   3015    ret
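; Odd rows complete both filters in one loop: the first half emits the
; 3x3 ab pair (t3+400*8 / t4+400*4), after which the accumulated two-row
; sum5/sumsq5 feed the 5x5 ab pair (t3+400*0 / t4+400*0) using the 164
; scale in place of 455.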
   3016 .v0: ; vertical boxsums + ab3 (even rows)
   3017 %if ARCH_X86_64
   3018    lea             wq, [r4-4]
   3019 %else
   3020    mov             wd, w0m
   3021 %endif
   3022    movaps          m8, [base+pf_256]
   3023 .v0_loop:
   3024    mova            m0, [t1+wq+400* 6]
   3025    mova            m4, [t1+wq+400* 8]
   3026    mova            m5, [t1+wq+400*10]
   3027    paddw           m0, m0
   3028    paddd           m4, m4
   3029    paddd           m5, m5
   3030    paddw           m1, m0, [t2+wq+400* 6]
   3031    paddd           m2, m4, [t2+wq+400* 8]
   3032    paddd           m3, m5, [t2+wq+400*10]
   3033    mova [t2+wq+400* 6], m0
   3034    mova [t2+wq+400* 8], m4
   3035    mova [t2+wq+400*10], m5
   3036    paddd           m2, m9
   3037    paddd           m3, m9
   3038    psrld           m2, 4              ; (a3 + 8) >> 4
   3039    psrld           m3, 4
   3040    pslld           m4, m2, 3
   3041    pslld           m5, m3, 3
   3042    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
   3043    paddd           m5, m3
   3044    psrlw           m3, m1, 1
   3045 %if ARCH_X86_32
   3046    pxor            m7, m7
   3047    pavgw           m3, m7
   3048    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m7, m14, m11, m8
   3049 %else
   3050    pavgw           m3, m6             ; (b3 + 2) >> 2
   3051    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m6, m14, m11, m8
   3052 %endif
   3053    punpcklwd       m2, m3, m3
   3054    mova [t4+wq*1+400*2+4], m3
   3055    punpckhwd       m3, m3
   3056    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   3057    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   3058    paddd           m1, m10
   3059    psrld           m0, 12
   3060    psrld           m1, 12
   3061    mova            m3, [t1+wq+400*0]
   3062    mova            m4, [t1+wq+400*2]
   3063    mova            m5, [t1+wq+400*4]
   3064    mova [t3+wq*2+400*8+ 8], m3
   3065    mova [t3+wq*2+400*0+ 8], m4
   3066    mova [t3+wq*2+400*0+24], m5
   3067    paddw           m3, m3 ; cc5 = sum5 doubled (even row counted twice)
   3068    paddd           m4, m4
   3069    paddd           m5, m5
   3070    mova [t1+wq+400*0], m3
   3071    mova [t1+wq+400*2], m4
   3072    mova [t1+wq+400*4], m5
   3073    mova [t3+wq*2+400*4+ 8], m0
   3074    mova [t3+wq*2+400*4+24], m1
   3075    add             wq, 16
   3076    jl .v0_loop
   3077    ret
   3078 .v1: ; vertical boxsums + ab (odd rows)
   3079 %if ARCH_X86_64
   3080    lea             wq, [r4-4]
   3081 %else
   3082    mov             wd, w0m
   3083 %endif
   3084    movaps          m8, [base+pf_256]
   3085 .v1_loop:
   3086    mova            m4, [t1+wq+400* 6]
   3087    mova            m5, [t1+wq+400* 8]
   3088    mova            m7, [t1+wq+400*10]
   3089    paddw           m1, m4, [t2+wq+400* 6]
   3090    paddd           m2, m5, [t2+wq+400* 8]
   3091    paddd           m3, m7, [t2+wq+400*10]
   3092    mova [t2+wq+400* 6], m4
   3093    mova [t2+wq+400* 8], m5
   3094    mova [t2+wq+400*10], m7
   3095    paddd           m2, m9
   3096    paddd           m3, m9
   3097    psrld           m2, 4              ; (a3 + 8) >> 4
   3098    psrld           m3, 4
   3099    pslld           m4, m2, 3
   3100    pslld           m5, m3, 3
   3101    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
   3102    paddd           m5, m3
   3103    psrlw           m3, m1, 1
   3104 %if ARCH_X86_32
   3105    pxor            m7, m7
   3106    pavgw           m3, m7
   3107    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m7, m14, m11, m8
   3108 %else
   3109    pavgw           m3, m6             ; (b3 + 2) >> 2
   3110    SGR_CALC_X      m0, m1, m3, m2, m4, m5, m6, m14, m11, m8
   3111 %endif
   3112    punpcklwd       m2, m3, m3
   3113    mova [t4+wq*1+400*4+4], m3
   3114    punpckhwd       m3, m3
   3115    MUL_32X16X2     m0, m1, m2, m3, m4, m5
   3116    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   3117    paddd           m1, m10
   3118    psrld           m0, 12
   3119    psrld           m1, 12
   3120    mova            m4, [t3+wq*2+400*8+ 8]
   3121    mova            m5, [t3+wq*2+400*0+ 8]
   3122    mova            m7, [t3+wq*2+400*0+24]
   3123    mova [t3+wq*2+400*8+ 8], m0
   3124    mova [t3+wq*2+400*8+24], m1
   3125    paddw           m1, m4, [t2+wq+400*0]
   3126    paddd           m2, m5, [t2+wq+400*2]
   3127    paddd           m3, m7, [t2+wq+400*4]
   3128    paddw           m1, [t1+wq+400*0]
   3129    paddd           m2, [t1+wq+400*2]
   3130    paddd           m3, [t1+wq+400*4]
   3131    mova [t2+wq+400*0], m4
   3132    mova [t2+wq+400*2], m5
   3133    mova [t2+wq+400*4], m7
   3134    paddd           m2, m9
   3135    paddd           m3, m9
   3136    psrld           m2, 4              ; (a5 + 8) >> 4
   3137    psrld           m3, 4
   3138    pslld           m4, m2, 4
   3139    pslld           m5, m3, 4
   3140    paddd           m4, m2             ; ((a5 + 8) >> 4) * 17
   3141    pslld           m2, 3
   3142    paddd           m5, m3
   3143    pslld           m3, 3
   3144    paddd           m2, m4             ; ((a5 + 8) >> 4) * 25
   3145    paddd           m3, m5
   3146    psrlw           m5, m1, 1
   3147 %if ARCH_X86_32
   3148    pxor            m7, m7
   3149    pavgw           m5, m7
   3150    SGR_CALC_X      m0, m1, m5, m4, m2, m3, m7, m13, m12, m8
   3151 %else
   3152    pavgw           m5, m6             ; (b5 + 2) >> 2
   3153    SGR_CALC_X      m0, m1, m5, m4, m2, m3, m6, m13, m12, m8
   3154 %endif
   3155    punpcklwd       m4, m5, m5
   3156    mova [t4+wq*1+400*0+ 4], m5
   3157    punpckhwd       m5, m5
   3158    MUL_32X16X2     m0, m1, m4, m5, m2, m3
   3159    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
   3160    paddd           m1, m10
   3161    psrld           m0, 12
   3162    psrld           m1, 12
   3163    mova [t3+wq*2+400*0+ 8], m0
   3164    mova [t3+wq*2+400*0+24], m1
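           ; 164/4096 ~= 1/25, matching the 5x5 box area.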
   3165    add             wq, 16
   3166    jl .v1_loop
   3167    mov            r10, t2
   3168    mov             t2, t1
   3169    mov             t1, r10            ; rotate the two box-sum row buffers
   3170    ret
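           ; The neighbor passes weight adjacent a/b entries for the final
           ; blend: a 5-6-5 (565) pattern for the 5x5 filter, and
           ; alternating 3-4-3 / 4-4-4 (343/444) rows for the 3x3 filter.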
   3171 .prep_n: ; initial neighbor setup
   3172    movif64         wq, r4
   3173    movif32         wd, w1m
   3174 .prep_n_loop:
   3175    movu            m0, [t4+wq*1+400*0+ 2]
   3176    movu            m1, [t3+wq*2+400*0+ 4]
   3177    movu            m2, [t3+wq*2+400*0+20]
   3178    movu            m3, [t4+wq*1+400*0+ 4]
   3179    movu            m4, [t3+wq*2+400*0+ 8]
   3180    paddw           m3, [t4+wq*1+400*0+ 0]
   3181    paddd           m4, [t3+wq*2+400*0+ 0]
   3182    paddd           m5, m2, [t3+wq*2+400*0+16]
   3183    movu            m7, [t3+wq*2+400*0+24]
   3184    paddw           m3, m0
   3185    paddd           m4, m1
   3186    paddd           m5, m7
   3187    paddw           m0, m3
   3188    paddd           m1, m4
   3189    paddd           m2, m5
   3190    psllw           m3, 2
   3191    pslld           m4, 2
   3192    pslld           m5, 2
   3193    paddw           m0, m3               ; a5 565
   3194    paddd           m1, m4               ; b5 565
   3195    paddd           m2, m5
   3196    mova [t4+wq*1+400* 6+ 0], m0
   3197    mova [t3+wq*2+400*12+ 0], m1
   3198    mova [t3+wq*2+400*12+16], m2
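           ; The 565 weighting above is shift-add math: with
           ; s = n0 + n1 + n2 the three neighbor sums,
           ; 4*s + (s + n1) = 5*n0 + 6*n1 + 5*n2.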
   3199    movu            m0, [t4+wq*1+400*2+ 4]
   3200    movu            m1, [t3+wq*2+400*4+ 8]
   3201    movu            m2, [t3+wq*2+400*4+24]
   3202    movu            m3, [t4+wq*1+400*2+ 2]
   3203    movu            m4, [t3+wq*2+400*4+ 4]
   3204    movu            m5, [t3+wq*2+400*4+20]
   3205    paddw           m0, [t4+wq*1+400*2+ 0]
   3206    paddd           m1, [t3+wq*2+400*4+ 0]
   3207    paddd           m2, [t3+wq*2+400*4+16]
   3208    paddw           m3, m0
   3209    paddd           m4, m1
   3210    paddd           m5, m2
   3211    psllw           m3, 2                ; a3[-1] 444
   3212    pslld           m4, 2                ; b3[-1] 444
   3213    pslld           m5, 2
   3214    psubw           m3, m0               ; a3[-1] 343
   3215    psubd           m4, m1               ; b3[-1] 343
   3216    psubd           m5, m2
   3217    mova [t4+wq*1+400* 8+ 0], m3
   3218    mova [t3+wq*2+400*16+ 0], m4
   3219    mova [t3+wq*2+400*16+16], m5
   3220    movu            m0, [t4+wq*1+400*4+ 4]
   3221    movu            m1, [t3+wq*2+400*8+ 8]
   3222    movu            m2, [t3+wq*2+400*8+24]
   3223    movu            m3, [t4+wq*1+400*4+ 2]
   3224    movu            m4, [t3+wq*2+400*8+ 4]
   3225    movu            m5, [t3+wq*2+400*8+20]
   3226    paddw           m0, [t4+wq*1+400*4+ 0]
   3227    paddd           m1, [t3+wq*2+400*8+ 0]
   3228    paddd           m2, [t3+wq*2+400*8+16]
   3229    paddw           m3, m0
   3230    paddd           m4, m1
   3231    paddd           m5, m2
   3232    psllw           m3, 2                 ; a3[ 0] 444
   3233    pslld           m4, 2                 ; b3[ 0] 444
   3234    pslld           m5, 2
   3235    mova [t4+wq*1+400*10+ 0], m3
   3236    mova [t3+wq*2+400*20+ 0], m4
   3237    mova [t3+wq*2+400*20+16], m5
   3238    psubw           m3, m0                ; a3[ 0] 343
   3239    psubd           m4, m1                ; b3[ 0] 343
   3240    psubd           m5, m2
   3241    mova [t4+wq*1+400*12+ 0], m3
   3242    mova [t3+wq*2+400*24+ 0], m4
   3243    mova [t3+wq*2+400*24+16], m5
   3244    add             wq, 16
   3245    jl .prep_n_loop
   3246    ret
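           ; Even rows (.n0) rebuild the 5x5 565 sums (valid for two rows)
           ; and advance the rolling 343/444 sums of the 3x3 filter; odd
           ; rows (.n1) reuse the stored 5x5 sums and only advance the 3x3
           ; ones. Both blend the filter outputs with the source, roughly:
           ; dst = clip((src * 8192 + 4096 + w5 * (b5 - a5 * src)
           ;             + w3 * (b3 - a3 * src)) >> 13, 0, max_pixel)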
   3247 ALIGN function_align
   3248 .n0: ; neighbor + output (even rows)
   3249    movif64         wq, r4
   3250    movif32         wd, w1m
   3251 .n0_loop:
   3252    movu            m0, [t4+wq*1+ 4]
   3253    movu            m2, [t4+wq*1+ 2]
   3254    paddw           m0, [t4+wq*1+ 0]
   3255    paddw           m0, m2
   3256    paddw           m2, m0
   3257    psllw           m0, 2
   3258    paddw           m0, m2               ; a5
   3259    movu            m4, [t3+wq*2+ 8]
   3260    movu            m5, [t3+wq*2+24]
   3261    movu            m1, [t3+wq*2+ 4]
   3262    movu            m3, [t3+wq*2+20]
   3263    paddd           m4, [t3+wq*2+ 0]
   3264    paddd           m5, [t3+wq*2+16]
   3265    paddd           m4, m1
   3266    paddd           m5, m3
   3267    paddd           m1, m4
   3268    paddd           m3, m5
   3269    pslld           m4, 2
   3270    pslld           m5, 2
   3271    paddd           m4, m1               ; b5
   3272    paddd           m5, m3
   3273    movu            m2, [t4+wq*1+400* 6]
   3274    paddw           m2, m0
   3275    mova [t4+wq*1+400* 6], m0
   3276    paddd           m0, m4, [t3+wq*2+400*12+ 0]
   3277    paddd           m1, m5, [t3+wq*2+400*12+16]
   3278    mova [t3+wq*2+400*12+ 0], m4
   3279    mova [t3+wq*2+400*12+16], m5
   3280    mova [rsp+16+ARCH_X86_32*4], m1
   3281    movu            m3, [t4+wq*1+400*2+4]
   3282    movu            m5, [t4+wq*1+400*2+2]
   3283    paddw           m3, [t4+wq*1+400*2+0]
   3284    paddw           m5, m3
   3285    psllw           m5, 2                ; a3[ 1] 444
   3286    psubw           m4, m5, m3           ; a3[ 1] 343
   3287    movu            m3, [t4+wq*1+400* 8]
   3288    paddw           m3, [t4+wq*1+400*10]
   3289    paddw           m3, m4
   3290    mova [t4+wq*1+400* 8], m4
   3291    mova [t4+wq*1+400*10], m5
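           ; rolling 3x3 neighbor total: 343[row-1] + 444[row] +
           ; 343[row+1], with the freshly weighted row overwriting the
           ; oldest stored pair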
   3292    movu            m1, [t3+wq*2+400*4+ 8]
   3293    movu            m5, [t3+wq*2+400*4+ 4]
   3294    movu            m7, [t3+wq*2+400*4+24]
   3295    movu            m8, [t3+wq*2+400*4+20]
   3296    paddd           m1, [t3+wq*2+400*4+ 0]
   3297    paddd           m7, [t3+wq*2+400*4+16]
   3298    paddd           m5, m1
   3299    paddd           m8, m7
   3300    pslld           m5, 2                ; b3[ 1] 444
   3301    pslld           m8, 2
   3302    psubd           m4, m5, m1           ; b3[ 1] 343
   3303 %if ARCH_X86_32
   3304    mova      [esp+52], m8
   3305    psubd           m8, m7
   3306 %else
   3307    psubd           m6, m8, m7
   3308    SWAP            m8, m6
   3309 %endif
   3310    paddd           m1, m4, [t3+wq*2+400*16+ 0]
   3311    paddd           m7, m8, [t3+wq*2+400*16+16]
   3312    paddd           m1, [t3+wq*2+400*20+ 0]
   3313    paddd           m7, [t3+wq*2+400*20+16]
   3314    mova [t3+wq*2+400*16+ 0], m4
   3315    mova [t3+wq*2+400*16+16], m8
   3316    mova [t3+wq*2+400*20+ 0], m5
   3317 %if ARCH_X86_32
   3318    mova            m8, [esp+52]
   3319 %else
   3320    SWAP            m8, m6
   3321    pxor            m6, m6
   3322 %endif
   3323    mova [t3+wq*2+400*20+16], m8
   3324    mova [rsp+32+ARCH_X86_32*4], m7
   3325    movu            m5, [dstq+wq]
   3326    punpcklwd       m4, m5, m6
   3327    punpcklwd       m7, m2, m6
   3328    pmaddwd         m7, m4               ; a5 * src
   3329    punpcklwd       m8, m3, m6
   3330    pmaddwd         m8, m4               ; a3 * src
   3331    punpckhwd       m5, m6
   3332    punpckhwd       m2, m6
   3333    pmaddwd         m2, m5
   3334    punpckhwd       m3, m6
   3335    pmaddwd         m3, m5
   3336    pslld           m4, 13
   3337    pslld           m5, 13
   3338    psubd           m0, m7               ; b5 - a5 * src + (1 << 8)
   3339    psubd           m1, m8               ; b3 - a3 * src + (1 << 8)
   3340    mova            m7, [base+pd_0xffff]
   3341    psrld           m0, 9
   3342    pslld           m1, 7
   3343    pand            m0, m7
   3344    pandn           m8, m7, m1
   3345    por             m0, m8
   3346    mova            m1, [rsp+16+ARCH_X86_32*4]
   3347    mova            m8, [rsp+32+ARCH_X86_32*4]
   3348    psubd           m1, m2
   3349    psubd           m8, m3
   3350    mova            m2, [base+pd_4096]
   3351    psrld           m1, 9
   3352    pslld           m8, 7
   3353    pand            m1, m7
   3354    pandn           m7, m8
   3355    por             m1, m7
   3356    pmaddwd         m0, m15
   3357    pmaddwd         m1, m15
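           ; The 5x5 delta sits in the low word of each dword and the 3x3
           ; delta in the high word, so a single pmaddwd applies both
           ; packed sgr weights from m15 at once.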
   3358 %if ARCH_X86_32
   3359    pxor            m7, m7
   3360 %else
   3361    SWAP            m7, m6
   3362 %endif
   3363    paddd           m4, m2
   3364    paddd           m5, m2
   3365    paddd           m0, m4
   3366    paddd           m1, m5
   3367    psrad           m0, 8
   3368    psrad           m1, 8
   3369    packssdw        m0, m1               ; clip
   3370    pmaxsw          m0, m7
   3371    psrlw           m0, 5
   3372    mova     [dstq+wq], m0
   3373    add             wq, 16
   3374    jl .n0_loop
   3375    add           dstq, stridemp
   3376    ret
   3377 %if ARCH_X86_64
   3378    SWAP            m6, m7
   3379 %endif
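           ; SWAP is an assemble-time register renaming evaluated in
           ; program order, so placing it after the ret restores the m6/m7
           ; mapping changed inside .n0 before .n1 is assembled.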
   3380 ALIGN function_align
   3381 .n1: ; neighbor + output (odd rows)
   3382    movif64         wq, r4
   3383    movif32         wd, w1m
   3384 .n1_loop:
   3385    movu            m3, [t4+wq*1+400*4+4]
   3386    movu            m5, [t4+wq*1+400*4+2]
   3387    paddw           m3, [t4+wq*1+400*4+0]
   3388    paddw           m5, m3
   3389    psllw           m5, 2                ; a3[ 1] 444
   3390    psubw           m4, m5, m3           ; a3[ 1] 343
   3391    paddw           m3, m4, [t4+wq*1+400*12]
   3392    paddw           m3, [t4+wq*1+400*10]
   3393    mova [t4+wq*1+400*10], m5
   3394    mova [t4+wq*1+400*12], m4
   3395    movu            m1, [t3+wq*2+400*8+ 8]
   3396    movu            m5, [t3+wq*2+400*8+ 4]
   3397    movu            m7, [t3+wq*2+400*8+24]
   3398    movu            m8, [t3+wq*2+400*8+20]
   3399    paddd           m1, [t3+wq*2+400*8+ 0]
   3400    paddd           m7, [t3+wq*2+400*8+16]
   3401    paddd           m5, m1
   3402    paddd           m8, m7
   3403    pslld           m5, 2                ; b3[ 1] 444
   3404    pslld           m8, 2
   3405    psubd           m4, m5, m1           ; b3[ 1] 343
   3406    psubd           m0, m8, m7
   3407    paddd           m1, m4, [t3+wq*2+400*24+ 0]
   3408    paddd           m7, m0, [t3+wq*2+400*24+16]
   3409    paddd           m1, [t3+wq*2+400*20+ 0]
   3410    paddd           m7, [t3+wq*2+400*20+16]
   3411    mova [t3+wq*2+400*20+ 0], m5
   3412    mova [t3+wq*2+400*20+16], m8
   3413    mova [t3+wq*2+400*24+ 0], m4
   3414    mova [t3+wq*2+400*24+16], m0
   3415    mova            m5, [dstq+wq]
   3416    mova            m2, [t4+wq*1+400* 6]
   3417    punpcklwd       m4, m5, m6
   3418    punpcklwd       m8, m2, m6
   3419    pmaddwd         m8, m4               ; a5 * src
   3420    punpcklwd       m0, m3, m6
   3421    pmaddwd         m0, m4               ; a3 * src
   3422    punpckhwd       m5, m6
   3423    punpckhwd       m2, m6
   3424    pmaddwd         m2, m5
   3425    punpckhwd       m3, m6
   3426    pmaddwd         m3, m5
   3427    psubd           m1, m0               ; b3 - a3 * src + (1 << 8)
   3428    pslld           m4, 13
   3429    pslld           m5, 13
   3430    mova            m0, [t3+wq*2+400*12+ 0]
   3431    psubd           m0, m8               ; b5 - a5 * src + (1 << 8)
   3432    mova            m8, [t3+wq*2+400*12+16]
   3433    psubd           m8, m2
   3434    psubd           m7, m3
   3435    mova            m2, [base+pd_0xffff]
   3436    pslld           m1, 7
   3437    psrld           m0, 8
   3438    psrld           m8, 8
   3439    pslld           m7, 7
   3440    pand            m0, m2
   3441    pandn           m3, m2, m1
   3442    por             m0, m3
   3443    pand            m8, m2
   3444    pandn           m2, m7
   3445    por             m2, m8
   3446    mova            m1, [base+pd_4096]
   3447    pmaddwd         m0, m15
   3448    pmaddwd         m2, m15
   3449 %if ARCH_X86_64
   3450    SWAP            m7, m6
   3451 %endif
   3452    pxor            m7, m7
   3453    paddd           m4, m1
   3454    paddd           m5, m1
   3455    paddd           m0, m4
   3456    paddd           m2, m5
   3457    psrad           m0, 8
   3458    psrad           m2, 8
   3459    packssdw        m0, m2              ; clip
   3460    pmaxsw          m0, m7
   3461    psrlw           m0, 5
   3462    mova     [dstq+wq], m0
   3463    add             wq, 16
   3464    jl .n1_loop
   3465    add           dstq, stridemp
   3466    movif32       dstm, dstq
   3467    ret