tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

looprestoration_avx512.asm (68693B)


      1 ; Copyright © 2021, VideoLAN and dav1d authors
      2 ; Copyright © 2021, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 SECTION_RODATA 32
     32 
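        ; the wiener_shuf* tables exploit the symmetry of the 7-tap wiener
        ; filter: pixels equidistant from the center share a coefficient, so
        ; each vpdpbusd below accumulates four taps from one coefficient pair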
     33 wiener_shufA:  db  1,  2,  7,  6,  3,  4,  9,  8,  5,  6, 11, 10,  7,  8, 13, 12
     34 wiener_shufB:  db  2,  3,  8,  7,  4,  5, 10,  9,  6,  7, 12, 11,  8,  9, 14, 13
     35 wiener_shufC:  db  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11
     36 wiener_shufD:  db  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12
     37 wiener_perm32: db  1,  9,  3, 11,  5, 13,  7, 15, 33, 41, 35, 43, 37, 45, 39, 47
     38               db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63
     39 sgr_shuf:      db 128, 1, -1,  2,132,  3, -1,  4,136,  5, -1,  6,140,  7, -1,  8
     40               db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1,  0,128
     41 sgr_mix_perm:  db  1,  3,  5,  7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
     42 r_ext_mask:    times 68 db -1
     43               times  4 db  0
     44 wiener_x_shuf: db  0,  2, -1,  0
     45 wiener_x_add:  db  0,  1,127,  0
     46 
     47 pw_61448:      times 2 dw 61448
     48 pw_164_455:    dw 164, 455
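        ; 164 ~= (1 << 12) / 25 and 455 ~= (1 << 12) / 9, the one_by_x
        ; factors for the 5x5 and 3x3 boxes; interleaving the box sums with
        ; zeros in the low or high word lets pmaddwd select either factor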
     49 pd_m16380:     dd -16380
     50 pd_m4096:      dd -4096
     51 pd_m25:        dd -25
     52 pd_m9:         dd -9
     53 pd_34816:      dd 34816
     54 pd_8421376:    dd 8421376
     55 
     56 cextern sgr_x_by_x
     57 
     58 SECTION .text
     59 
     60 DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
     61 
     62 INIT_ZMM avx512icl
     63 cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \
     64                                                    w, h, edge, flt
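        ; t0-t6 point into a ring of 384*2-byte row buffers on the stack:
        ; .h runs the horizontal filter for one row into the newest buffer,
        ; and .hv/.v combine seven buffered rows with the vertical y taps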
     65    mov           fltq, r6mp
     66    mov             wd, wm
     67    movifnidn       hd, hm
     68    mov          edged, r7m
     69    vbroadcasti32x4 m6, [wiener_shufA]
     70    vbroadcasti32x4 m7, [wiener_shufB]
     71    mov           r10d, 0xfffe
     72    vbroadcasti32x4 m8, [wiener_shufC]
     73    vbroadcasti32x4 m9, [wiener_shufD]
     74    kmovw           k1, r10d
     75    vpbroadcastd    m0, [wiener_x_shuf]
     76    vpbroadcastd    m1, [wiener_x_add]
     77    mov            r10, 0xaaaaaaaaaaaaaaaa
     78    vpbroadcastd   m11, [fltq+ 0]
     79    vpbroadcastd   m12, [fltq+ 4]
     80    kmovq           k2, r10
     81    vpbroadcastd   m10, [pd_m16380]
     82    packsswb       m11, m11 ; x0   x1   x0   x1
     83    vpbroadcastd   m14, [fltq+16]
     84    pshufb         m12, m0
     85    vpbroadcastd   m15, [fltq+20]
     86    paddb          m12, m1  ; x2   x3+1 x2   127
     87    vpbroadcastd   m13, [pd_8421376]
     88    psllw          m14, 5   ; y0 y1
     89    psllw          m15, 5   ; y2 y3
     90    cmp             wd, 32  ; the minimum lr unit size for chroma in 4:2:0 is 32
     91    jle .w32                ; pixels, so we need a special case for small widths
     92    lea             t1, [rsp+wq*2+16]
     93    add           lpfq, wq
     94    add           dstq, wq
     95    neg             wq
     96    test         edgeb, 4 ; LR_HAVE_TOP
     97    jz .no_top
     98    call .h_top
     99    add           lpfq, strideq
    100    mov             t6, t1
    101    mov             t5, t1
    102    add             t1, 384*2
    103    call .h_top
    104    lea            r10, [lpfq+strideq*4]
    105    mov           lpfq, dstq
    106    mov             t4, t1
    107    add             t1, 384*2
    108    add            r10, strideq
    109    mov          [rsp], r10 ; below
    110    call .h
    111    mov             t3, t1
    112    mov             t2, t1
    113    dec             hd
    114    jz .v1
    115    add           lpfq, strideq
    116    add             t1, 384*2
    117    call .h
    118    mov             t2, t1
    119    dec             hd
    120    jz .v2
    121    add           lpfq, strideq
    122    add             t1, 384*2
    123    call .h
    124    dec             hd
    125    jz .v3
    126 .main:
    127    lea             t0, [t1+384*2]
    128 .main_loop:
    129    call .hv
    130    dec             hd
    131    jnz .main_loop
    132    test         edgeb, 8 ; LR_HAVE_BOTTOM
    133    jz .v3
    134    mov           lpfq, [rsp]
    135    call .hv_bottom
    136    add           lpfq, strideq
    137    call .hv_bottom
    138 .v1:
    139    call .v
    140    RET
    141 .no_top:
    142    lea            r10, [lpfq+strideq*4]
    143    mov           lpfq, dstq
    144    lea            r10, [r10+strideq*2]
    145    mov          [rsp], r10
    146    call .h
    147    mov             t6, t1
    148    mov             t5, t1
    149    mov             t4, t1
    150    mov             t3, t1
    151    mov             t2, t1
    152    dec             hd
    153    jz .v1
    154    add           lpfq, strideq
    155    add             t1, 384*2
    156    call .h
    157    mov             t2, t1
    158    dec             hd
    159    jz .v2
    160    add           lpfq, strideq
    161    add             t1, 384*2
    162    call .h
    163    dec             hd
    164    jz .v3
    165    lea             t0, [t1+384*2]
    166    call .hv
    167    dec             hd
    168    jz .v3
    169    add             t0, 384*8
    170    call .hv
    171    dec             hd
    172    jnz .main
    173 .v3:
    174    call .v
    175 .v2:
    176    call .v
    177    jmp .v1
    178 .h:
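        ; horizontal pass: filters 64 pixels per iteration with the x taps
        ; via vpdpbusd and stores the >> 3 intermediates as words to t1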
    179    mov            r10, wq
    180    test         edgeb, 1 ; LR_HAVE_LEFT
    181    jz .h_extend_left
    182    movd          xm16, [leftq]
    183    vmovdqu32  m16{k1}, [lpfq+r10-4]
    184    add          leftq, 4
    185    jmp .h_main
    186 .h_extend_left:
    187    vpbroadcastb  xm16, [lpfq+r10]   ; the masked load ensures that no exception
    188    vmovdqu32  m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory
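        ; k1 = 0xfffe: dword 0 keeps the broadcast pixel while dwords 1-15
        ; are loaded from lpf; the masked-off first dword cannot fault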
    189    jmp .h_main
    190 .h_top:
    191    mov            r10, wq
    192    test         edgeb, 1 ; LR_HAVE_LEFT
    193    jz .h_extend_left
    194 .h_loop:
    195    movu           m16, [lpfq+r10-4]
    196 .h_main:
    197    movu           m17, [lpfq+r10+4]
    198    test         edgeb, 2 ; LR_HAVE_RIGHT
    199    jnz .h_have_right
    200    cmp           r10d, -66
    201    jl .h_have_right
    202    push            r0
    203    lea             r0, [r_ext_mask+65]
    204    vpbroadcastb    m0, [lpfq-1]
    205    vpternlogd     m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
    206    vpternlogd     m17, m0, [r0+r10+8], 0xe4
    207    pop             r0
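        ; right edge: r_ext_mask is -1 for in-range bytes and 0 past the end,
        ; so the 0xe4 ternlog (c ? a : b) keeps loaded pixels while in range
        ; and substitutes the broadcast rightmost pixel beyond it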
    208 .h_have_right:
    209    pshufb          m4, m16, m6
    210    mova            m0, m10
    211    vpdpbusd        m0, m4, m11
    212    pshufb          m4, m16, m7
    213    mova            m2, m10
    214    vpdpbusd        m2, m4, m11
    215    pshufb          m4, m17, m6
    216    mova            m1, m10
    217    vpdpbusd        m1, m4, m11
    218    pshufb          m4, m17, m7
    219    mova            m3, m10
    220    vpdpbusd        m3, m4, m11
    221    pshufb          m4, m16, m8
    222    vpdpbusd        m0, m4, m12
    223    pshufb         m16, m9
    224    vpdpbusd        m2, m16, m12
    225    pshufb          m4, m17, m8
    226    vpdpbusd        m1, m4, m12
    227    pshufb         m17, m9
    228    vpdpbusd        m3, m17, m12
    229    packssdw        m0, m2
    230    packssdw        m1, m3
    231    psraw           m0, 3
    232    psraw           m1, 3
    233    mova [t1+r10*2+ 0], m0
    234    mova [t1+r10*2+64], m1
    235    add            r10, 64
    236    jl .h_loop
    237    ret
    238 ALIGN function_align
    239 .hv:
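        ; same horizontal filter as .h, fused with the vertical filter: the
        ; new row plus the six buffered rows t1-t6 are weighted with the
        ; y taps and packed straight to dst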
    240    add           lpfq, strideq
    241    mov            r10, wq
    242    test         edgeb, 1 ; LR_HAVE_LEFT
    243    jz .hv_extend_left
    244    movd          xm16, [leftq]
    245    vmovdqu32  m16{k1}, [lpfq+r10-4]
    246    add          leftq, 4
    247    jmp .hv_main
    248 .hv_extend_left:
    249    vpbroadcastb  xm16, [lpfq+r10]
    250    vmovdqu32  m16{k1}, [lpfq+r10-4]
    251    jmp .hv_main
    252 .hv_bottom:
    253    mov            r10, wq
    254    test         edgeb, 1 ; LR_HAVE_LEFT
    255    jz .hv_extend_left
    256 .hv_loop:
    257    movu           m16, [lpfq+r10-4]
    258 .hv_main:
    259    movu           m17, [lpfq+r10+4]
    260    test         edgeb, 2 ; LR_HAVE_RIGHT
    261    jnz .hv_have_right
    262    cmp           r10d, -66
    263    jl .hv_have_right
    264    push            r0
    265    lea             r0, [r_ext_mask+65]
    266    vpbroadcastb    m0, [lpfq-1]
    267    vpternlogd     m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
    268    vpternlogd     m17, m0, [r0+r10+8], 0xe4
    269    pop             r0
    270 .hv_have_right:
    271    pshufb          m4, m16, m6
    272    mova            m0, m10
    273    vpdpbusd        m0, m4, m11
    274    pshufb          m4, m16, m7
    275    mova            m2, m10
    276    vpdpbusd        m2, m4, m11
    277    pshufb          m4, m17, m6
    278    mova            m1, m10
    279    vpdpbusd        m1, m4, m11
    280    pshufb          m4, m17, m7
    281    mova            m3, m10
    282    vpdpbusd        m3, m4, m11
    283    pshufb          m4, m16, m8
    284    vpdpbusd        m0, m4, m12
    285    pshufb         m16, m9
    286    vpdpbusd        m2, m16, m12
    287    pshufb          m4, m17, m8
    288    vpdpbusd        m1, m4, m12
    289    pshufb         m17, m9
    290    vpdpbusd        m3, m17, m12
    291    packssdw        m0, m2
    292    packssdw        m1, m3
    293    psraw           m0, 3
    294    psraw           m1, 3
    295    mova           m16, [t4+r10*2]
    296    paddw          m16, [t2+r10*2]
    297    mova            m3, [t3+r10*2]
    298    mova           m17, [t4+r10*2+64]
    299    paddw          m17, [t2+r10*2+64]
    300    mova            m5, [t3+r10*2+64]
    301    punpcklwd       m4, m16, m3
    302    mova            m2, m13
    303    vpdpwssd        m2, m4, m15
    304    punpcklwd      m18, m17, m5
    305    mova            m4, m13
    306    vpdpwssd        m4, m18, m15
    307    punpckhwd      m16, m3
    308    mova            m3, m13
    309    vpdpwssd        m3, m16, m15
    310    punpckhwd      m17, m5
    311    mova            m5, m13
    312    vpdpwssd        m5, m17, m15
    313    mova           m17, [t5+r10*2]
    314    paddw          m17, [t1+r10*2]
    315    paddw          m16, m0, [t6+r10*2]
    316    mova           m19, [t5+r10*2+64]
    317    paddw          m19, [t1+r10*2+64]
    318    paddw          m18, m1, [t6+r10*2+64]
    319    mova [t0+r10*2+ 0], m0
    320    mova [t0+r10*2+64], m1
    321    punpcklwd       m0, m16, m17
    322    vpdpwssd        m2, m0, m14
    323    punpcklwd       m1, m18, m19
    324    vpdpwssd        m4, m1, m14
    325    punpckhwd      m16, m17
    326    vpdpwssd        m3, m16, m14
    327    punpckhwd      m18, m19
    328    vpdpwssd        m5, m18, m14
    329    packuswb        m2, m4
    330    psrlw           m2, 8
    331    vpackuswb   m2{k2}, m3, m5
    332    movu    [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
    333    add            r10, 64 ; function is used for chroma as well, and in some
    334    jl .hv_loop            ; esoteric edge cases chroma dst pointers may only
    335    mov             t6, t5 ; have a 32-byte alignment despite having a width
    336    mov             t5, t4 ; larger than 32, so use an unaligned store here.
    337    mov             t4, t3
    338    mov             t3, t2
    339    mov             t2, t1
    340    mov             t1, t0
    341    mov             t0, t6
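        ; rotate the ring buffer; note that t0 ends up aliasing t6, which is
        ; safe because .hv reads [t6] before overwriting [t0] at each offset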
    342    add           dstq, strideq
    343    ret
    344 .v:
    345    mov            r10, wq
    346 .v_loop:
    347    mova            m4, [t4+r10*2+ 0]
    348    paddw           m4, [t2+r10*2+ 0]
    349    mova            m1, [t3+r10*2+ 0]
    350    mova            m5, [t4+r10*2+64]
    351    paddw           m5, [t2+r10*2+64]
    352    mova            m3, [t3+r10*2+64]
    353    punpcklwd       m6, m4, m1
    354    mova            m0, m13
    355    vpdpwssd        m0, m6, m15
    356    punpcklwd       m6, m5, m3
    357    mova            m2, m13
    358    vpdpwssd        m2, m6, m15
    359    punpckhwd       m4, m1
    360    mova            m1, m13
    361    vpdpwssd        m1, m4, m15
    362    punpckhwd       m5, m3
    363    mova            m3, m13
    364    vpdpwssd        m3, m5, m15
    365    mova            m5, [t1+r10*2+ 0]
    366    paddw           m4, m5, [t6+r10*2+ 0]
    367    paddw           m5, [t5+r10*2+ 0]
    368    mova            m7, [t1+r10*2+64]
    369    paddw           m6, m7, [t6+r10*2+64]
    370    paddw           m7, [t5+r10*2+64]
    371    punpcklwd       m8, m4, m5
    372    vpdpwssd        m0, m8, m14
    373    punpcklwd       m8, m6, m7
    374    vpdpwssd        m2, m8, m14
    375    punpckhwd       m4, m5
    376    vpdpwssd        m1, m4, m14
    377    punpckhwd       m6, m7
    378    vpdpwssd        m3, m6, m14
    379    packuswb        m0, m2
    380    psrlw           m0, 8
    381    vpackuswb   m0{k2}, m1, m3
    382    movu    [dstq+r10], m0
    383    add            r10, 64
    384    jl .v_loop
    385    mov             t6, t5
    386    mov             t5, t4
    387    mov             t4, t3
    388    mov             t3, t2
    389    mov             t2, t1
    390    add           dstq, strideq
    391    ret
    392 .w32:
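        ; w <= 32 path: a single 32-pixel column, so the row buffers shrink
        ; to 32*2 bytes and each call filters a whole row at once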
    393    lea            r10, [r_ext_mask+73]
    394    mova          ym18, [wiener_perm32]
    395    lea             t1, [rsp+16]
    396    sub            r10, wq
    397    test         edgeb, 4 ; LR_HAVE_TOP
    398    jz .w32_no_top
    399    call .w32_h_top
    400    add           lpfq, strideq
    401    mov             t6, t1
    402    mov             t5, t1
    403    add             t1, 32*2
    404    call .w32_h_top
    405    lea             r9, [lpfq+strideq*4]
    406    mov           lpfq, dstq
    407    mov             t4, t1
    408    add             t1, 32*2
    409    add             r9, strideq
    410    mov          [rsp], r9 ; below
    411    call .w32_h
    412    mov             t3, t1
    413    mov             t2, t1
    414    dec             hd
    415    jz .w32_v1
    416    add           lpfq, strideq
    417    add             t1, 32*2
    418    call .w32_h
    419    mov             t2, t1
    420    dec             hd
    421    jz .w32_v2
    422    add           lpfq, strideq
    423    add             t1, 32*2
    424    call .w32_h
    425    dec             hd
    426    jz .w32_v3
    427 .w32_main:
    428    lea             t0, [t1+32*2]
    429 .w32_main_loop:
    430    call .w32_hv
    431    dec             hd
    432    jnz .w32_main_loop
    433    test         edgeb, 8 ; LR_HAVE_BOTTOM
    434    jz .w32_v3
    435    mov           lpfq, [rsp]
    436    call .w32_hv_bottom
    437    add           lpfq, strideq
    438    call .w32_hv_bottom
    439 .w32_v1:
    440    call .w32_v
    441    RET
    442 .w32_no_top:
    443    lea             r9, [lpfq+strideq*4]
    444    mov           lpfq, dstq
    445    lea             r9, [r9+strideq*2]
    446    mov          [rsp], r9
    447    call .w32_h
    448    mov             t6, t1
    449    mov             t5, t1
    450    mov             t4, t1
    451    mov             t3, t1
    452    mov             t2, t1
    453    dec             hd
    454    jz .w32_v1
    455    add           lpfq, strideq
    456    add             t1, 32*2
    457    call .w32_h
    458    mov             t2, t1
    459    dec             hd
    460    jz .w32_v2
    461    add           lpfq, strideq
    462    add             t1, 32*2
    463    call .w32_h
    464    dec             hd
    465    jz .w32_v3
    466    lea             t0, [t1+32*2]
    467    call .w32_hv
    468    dec             hd
    469    jz .w32_v3
    470    add             t0, 32*8
    471    call .w32_hv
    472    dec             hd
    473    jnz .w32_main
    474 .w32_v3:
    475    call .w32_v
    476 .w32_v2:
    477    call .w32_v
    478    jmp .w32_v1
    479 .w32_h:
    480    test         edgeb, 1 ; LR_HAVE_LEFT
    481    jz .w32_h_extend_left
    482    movd          xm16, [leftq]
    483    vmovdqu32 ym16{k1}, [lpfq-4]
    484    add          leftq, 4
    485    jmp .w32_h_main
    486 .w32_h_extend_left:
    487    vpbroadcastb  xm16, [lpfq]   ; the masked load ensures that no exception
    488    vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory
    489    jmp .w32_h_main
    490 .w32_h_top:
    491    test         edgeb, 1 ; LR_HAVE_LEFT
    492    jz .w32_h_extend_left
    493    movu          ym16, [lpfq-4]
    494 .w32_h_main:
    495    vinserti32x8   m16, [lpfq+4], 1
    496    test         edgeb, 2 ; LR_HAVE_RIGHT
    497    jnz .w32_h_have_right
    498    vpbroadcastb    m0, [lpfq+wq-1]
    499    movu          ym17, [r10-8]
    500    vinserti32x8   m17, [r10+0], 1
    501    vpternlogd     m16, m0, m17, 0xe4 ; c ? a : b
    502 .w32_h_have_right:
    503    pshufb          m2, m16, m6
    504    mova            m0, m10
    505    vpdpbusd        m0, m2, m11
    506    pshufb          m2, m16, m7
    507    mova            m1, m10
    508    vpdpbusd        m1, m2, m11
    509    pshufb          m2, m16, m8
    510    vpdpbusd        m0, m2, m12
    511    pshufb         m16, m9
    512    vpdpbusd        m1, m16, m12
    513    packssdw        m0, m1
    514    psraw           m0, 3
    515    mova          [t1], m0
    516    ret
    517 .w32_hv:
    518    add           lpfq, strideq
    519    test         edgeb, 1 ; LR_HAVE_LEFT
    520    jz .w32_hv_extend_left
    521    movd          xm16, [leftq]
    522    vmovdqu32 ym16{k1}, [lpfq-4]
    523    add          leftq, 4
    524    jmp .w32_hv_main
    525 .w32_hv_extend_left:
    526    vpbroadcastb  xm16, [lpfq]
    527    vmovdqu32 ym16{k1}, [lpfq-4]
    528    jmp .w32_hv_main
    529 .w32_hv_bottom:
    530    test         edgeb, 1 ; LR_HAVE_LEFT
    531    jz .w32_hv_extend_left
    532    movu          ym16, [lpfq-4]
    533 .w32_hv_main:
    534    vinserti32x8   m16, [lpfq+4], 1
    535    test         edgeb, 2 ; LR_HAVE_RIGHT
    536    jnz .w32_hv_have_right
    537    vpbroadcastb    m0, [lpfq+wq-1]
    538    movu          ym17, [r10-8]
    539    vinserti32x8   m17, [r10+0], 1
    540    vpternlogd     m16, m0, m17, 0xe4
    541 .w32_hv_have_right:
    542    mova            m3, [t4]
    543    paddw           m3, [t2]
    544    mova            m2, [t3]
    545    pshufb          m4, m16, m6
    546    mova            m0, m10
    547    vpdpbusd        m0, m4, m11
    548    pshufb          m4, m16, m7
    549    mova            m5, m10
    550    vpdpbusd        m5, m4, m11
    551    punpcklwd       m4, m3, m2
    552    mova            m1, m13
    553    vpdpwssd        m1, m4, m15
    554    punpckhwd       m3, m2
    555    mova            m2, m13
    556    vpdpwssd        m2, m3, m15
    557    pshufb          m4, m16, m8
    558    vpdpbusd        m0, m4, m12
    559    pshufb         m16, m9
    560    vpdpbusd        m5, m16, m12
    561    packssdw        m0, m5
    562    psraw           m0, 3
    563    mova            m4, [t5]
    564    paddw           m4, [t1]
    565    paddw           m3, m0, [t6]
    566    mova          [t0], m0
    567    punpcklwd       m0, m3, m4
    568    vpdpwssd        m1, m0, m14
    569    punpckhwd       m3, m4
    570    vpdpwssd        m2, m3, m14
    571    packuswb        m1, m2
    572    vpermb         m16, m18, m1
    573    mova        [dstq], ym16
    574    mov             t6, t5
    575    mov             t5, t4
    576    mov             t4, t3
    577    mov             t3, t2
    578    mov             t2, t1
    579    mov             t1, t0
    580    mov             t0, t6
    581    add           dstq, strideq
    582    ret
    583 .w32_v:
    584    mova            m2, [t4]
    585    paddw           m2, [t2]
    586    mova            m1, [t3]
    587    mova            m4, [t1]
    588    paddw           m3, m4, [t6]
    589    paddw           m4, [t5]
    590    punpcklwd       m5, m2, m1
    591    mova            m0, m13
    592    vpdpwssd        m0, m5, m15
    593    punpckhwd       m2, m1
    594    mova            m1, m13
    595    vpdpwssd        m1, m2, m15
    596    punpcklwd       m2, m3, m4
    597    vpdpwssd        m0, m2, m14
    598    punpckhwd       m3, m4
    599    vpdpwssd        m1, m3, m14
    600    packuswb        m0, m1
    601    vpermb         m16, m18, m0
    602    mova        [dstq], ym16
    603    mov             t6, t5
    604    mov             t5, t4
    605    mov             t4, t3
    606    mov             t3, t2
    607    mov             t2, t1
    608    add           dstq, strideq
    609    ret
    610 
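        ; self-guided restoration, 5x5 box. rough outline (cf. the AV1 spec):
        ; .h computes per-row box sums (sum and sum of squares), .hv/.v add
        ; them vertically, then per position:
        ;   p = sumsq * 25 - sum * sum, z ~= (p * s) >> 20
        ;   x = sgr_x_by_x[min(z, 255)], b = x * sum * 164 (164 ~= 4096 / 25)
        ; a (= x) and b are stored packed as a | (b << 12), and .prep_n/.n0/.n1
        ; apply the neighbor weighting to produce the filtered output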
    611 cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \
    612                                                   w, h, edge, params
    613    mov        paramsq, r6mp
    614    mov             wd, wm
    615    mov             hd, hm
    616    mov          edged, r7m
    617    vbroadcasti32x4 m5, [sgr_shuf+1]
    618    add           lpfq, wq
    619    vbroadcasti32x4 m6, [sgr_shuf+9]
    620    add           dstq, wq
    621    vbroadcasti32x4 m7, [sgr_shuf+3]
    622    lea             t3, [rsp+wq*4+16+416*12]
    623    vbroadcasti32x4 m8, [sgr_shuf+7]
    624    pxor            m4, m4
    625    vpbroadcastd    m9, [pd_m25]
    626    vpsubd         m11, m4, [paramsq+0] {1to16} ; -s0
    627    vpbroadcastw   m15, [paramsq+8]             ; w0
    628    lea             t1, [rsp+wq*2+20]
    629    vpbroadcastd   m10, [pw_164_455]
    630    neg             wq
    631    vpbroadcastd   m12, [pw_61448]              ; (15 << 12) + (1 << 3)
    632    mov           r10d, 0xfe
    633    vpbroadcastd   m13, [pd_m4096]
    634    kmovb           k1, r10d
    635    vpbroadcastd   m14, [pd_34816]              ; (1 << 11) + (1 << 15)
    636    mov            r10, 0x3333333333333333
    637    mova           m18, [sgr_x_by_x+64*0]
    638    kmovq           k2, r10
    639    mova           m19, [sgr_x_by_x+64*1]
    640    lea            r12, [r_ext_mask+75]
    641    mova           m20, [sgr_x_by_x+64*2]
    642    psllw          m15, 4
    643    mova           m21, [sgr_x_by_x+64*3]
    644    lea            r10, [lpfq+strideq*4]
    645    mova          ym22, [sgr_shuf]
    646    add            r10, strideq
    647    mov          [rsp], r10 ; below
    648    test         edgeb, 4 ; LR_HAVE_TOP
    649    jz .no_top
    650    call .h_top
    651    add           lpfq, strideq
    652    mov             t2, t1
    653    call .top_fixup
    654    add             t1, 416*6
    655    call .h_top
    656    lea            r10, [lpfq+strideq*4]
    657    mov           lpfq, dstq
    658    add            r10, strideq
    659    mov          [rsp], r10 ; below
    660    mov             t0, t2
    661    dec             hd
    662    jz .height1
    663    or           edged, 16
    664    call .h
    665 .main:
    666    add           lpfq, strideq
    667    call .hv
    668    call .prep_n
    669    sub             hd, 2
    670    jl .extend_bottom
    671 .main_loop:
    672    add           lpfq, strideq
    673    test            hd, hd
    674    jz .odd_height
    675    call .h
    676    add           lpfq, strideq
    677    call .hv
    678    call .n0
    679    call .n1
    680    sub             hd, 2
    681    jge .main_loop
    682    test         edgeb, 8 ; LR_HAVE_BOTTOM
    683    jz .extend_bottom
    684    mov           lpfq, [rsp]
    685    call .h_top
    686    add           lpfq, strideq
    687    call .hv_bottom
    688 .end:
    689    call .n0
    690    call .n1
    691 .end2:
    692    RET
    693 .height1:
    694    call .hv
    695    call .prep_n
    696    jmp .odd_height_end
    697 .odd_height:
    698    call .hv
    699    call .n0
    700    call .n1
    701 .odd_height_end:
    702    call .v
    703    call .n0
    704    jmp .end2
    705 .extend_bottom:
    706    call .v
    707    jmp .end
    708 .no_top:
    709    lea            r10, [lpfq+strideq*4]
    710    mov           lpfq, dstq
    711    lea            r10, [r10+strideq*2]
    712    mov          [rsp], r10
    713    call .h
    714    lea             t2, [t1+416*6]
    715    call .top_fixup
    716    dec             hd
    717    jz .no_top_height1
    718    or           edged, 16
    719    mov             t0, t1
    720    mov             t1, t2
    721    jmp .main
    722 .no_top_height1:
    723    call .v
    724    call .prep_n
    725    jmp .odd_height_end
    726 .h: ; horizontal boxsum
    727    lea            r10, [wq-2]
    728    test         edgeb, 1 ; LR_HAVE_LEFT
    729    jz .h_extend_left
    730    movd          xm17, [leftq]
    731    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    732    add          leftq, 4
    733    jmp .h_main
    734 .h_extend_left:
    735    vpbroadcastb  xm17, [lpfq+wq]
    736    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    737    jmp .h_main
    738 .h_top:
    739    lea            r10, [wq-2]
    740    test         edgeb, 1 ; LR_HAVE_LEFT
    741    jz .h_extend_left
    742 .h_loop:
    743    movu          ym17, [lpfq+r10-2]
    744 .h_main:
    745    vinserti32x8   m17, [lpfq+r10+6], 1
    746    test         edgeb, 2 ; LR_HAVE_RIGHT
    747    jnz .h_have_right
    748    cmp           r10d, -34
    749    jl .h_have_right
    750    vpbroadcastb    m0, [lpfq-1]
    751    movu          ym16, [r12+r10-8]
    752    vinserti32x8   m16, [r12+r10+0], 1
    753    vpternlogd     m17, m0, m16, 0xe4
    754 .h_have_right:
    755    pshufb          m3, m17, m5
    756    pmullw          m2, m3, m3
    757    pshufb          m1, m17, m6
    758    paddw           m0, m3, m1
    759    shufps          m3, m1, q2121
    760    paddw           m0, m3
    761    punpcklwd      m16, m3, m1
    762    punpckhwd       m3, m1
    763    punpcklwd       m1, m2, m4
    764    vpdpwssd        m1, m16, m16
    765    punpckhwd       m2, m4
    766    vpdpwssd        m2, m3, m3
    767    pshufb         m16, m17, m7
    768    paddw           m0, m16
    769    pshufb         m17, m8
    770    paddw           m0, m17              ; sum
    771    punpcklwd       m3, m16, m17
    772    vpdpwssd        m1, m3, m3           ; sumsq
    773    punpckhwd      m16, m17
    774    vpdpwssd        m2, m16, m16
    775    test         edgeb, 16 ; y > 0
    776    jz .h_loop_end
    777    paddw           m0, [t1+r10*2+416*0]
    778    paddd           m1, [t1+r10*2+416*2]
    779    paddd           m2, [t1+r10*2+416*4]
    780 .h_loop_end:
    781    mova [t1+r10*2+416*0], m0
    782    mova [t1+r10*2+416*2], m1
    783    mova [t1+r10*2+416*4], m2
    784    add            r10, 32
    785    jl .h_loop
    786    ret
    787 .top_fixup:
    788    lea            r10, [wq-2]
    789 .top_fixup_loop: ; the sums of the first row need to be doubled
    790    mova            m0, [t1+r10*2+416*0]
    791    mova            m1, [t1+r10*2+416*2]
    792    mova            m2, [t1+r10*2+416*4]
    793    paddw           m0, m0
    794    paddd           m1, m1
    795    paddd           m2, m2
    796    mova [t2+r10*2+416*0], m0
    797    mova [t2+r10*2+416*2], m1
    798    mova [t2+r10*2+416*4], m2
    799    add            r10, 32
    800    jl .top_fixup_loop
    801    ret
    802 ALIGN function_align
    803 .hv: ; horizontal boxsum + vertical boxsum + ab
    804    lea            r10, [wq-2]
    805    test         edgeb, 1 ; LR_HAVE_LEFT
    806    jz .hv_extend_left
    807    movd          xm17, [leftq]
    808    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    809    add          leftq, 4
    810    jmp .hv_main
    811 .hv_extend_left:
    812    vpbroadcastb  xm17, [lpfq+wq]
    813    vmovdqu32 ym17{k1}, [lpfq+wq-4]
    814    jmp .hv_main
    815 .hv_bottom:
    816    lea            r10, [wq-2]
    817    test         edgeb, 1 ; LR_HAVE_LEFT
    818    jz .hv_extend_left
    819 .hv_loop:
    820    movu          ym17, [lpfq+r10-2]
    821 .hv_main:
    822    vinserti32x8   m17, [lpfq+r10+6], 1
    823    test         edgeb, 2 ; LR_HAVE_RIGHT
    824    jnz .hv_have_right
    825    cmp           r10d, -34
    826    jl .hv_have_right
    827    vpbroadcastb    m0, [lpfq-1]
    828    movu          ym16, [r12+r10-8]
    829    vinserti32x8   m16, [r12+r10+0], 1
    830    vpternlogd     m17, m0, m16, 0xe4
    831 .hv_have_right:
    832    pshufb          m1, m17, m5
    833    pmullw          m3, m1, m1
    834    pshufb          m2, m17, m6
    835    paddw           m0, m1, m2
    836    shufps          m1, m2, q2121
    837    paddw           m0, m1
    838    punpcklwd      m16, m1, m2
    839    punpckhwd       m1, m2
    840    punpcklwd       m2, m3, m4
    841    vpdpwssd        m2, m16, m16
    842    punpckhwd       m3, m4
    843    vpdpwssd        m3, m1, m1
    844    pshufb         m16, m17, m7
    845    paddw           m0, m16
    846    pshufb         m17, m8
    847    paddw           m0, m17              ; h sum
    848    punpcklwd       m1, m16, m17
    849    vpdpwssd        m2, m1, m1           ; h sumsq
    850    punpckhwd      m16, m17
    851    vpdpwssd        m3, m16, m16
    852    paddw           m1, m0, [t1+r10*2+416*0]
    853    paddd          m16, m2, [t1+r10*2+416*2]
    854    paddd          m17, m3, [t1+r10*2+416*4]
    855    test            hd, hd
    856    jz .hv_last_row
    857 .hv_main2:
    858    paddd          m16, [t2+r10*2+416*2] ; hv sumsq
    859    paddd          m17, [t2+r10*2+416*4]
    860    paddw           m1, [t2+r10*2+416*0] ; hv sum
    861    mova [t0+r10*2+416*2], m2
    862    mova [t0+r10*2+416*4], m3
    863    mova [t0+r10*2+416*0], m0
    864    pmulld         m16, m9               ; -a * 25
    865    pmulld         m17, m9
    866    punpcklwd       m0, m1, m4           ; b
    867    vpdpwssd       m16, m0, m0           ; -p
    868    punpckhwd       m1, m4
    869    vpdpwssd       m17, m1, m1
    870    pmaddwd         m0, m10              ; b * 164
    871    pmaddwd         m1, m10
    872    pmulld         m16, m11              ; p * s
    873    pmulld         m17, m11
    874    vpalignr   m17{k2}, m16, m16, 2
    875    mova           m16, m20
    876    paddusw        m17, m12
    877    psraw          m17, 4                ; min(z, 255) - 256
    878    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
    879    vpmovb2m        k3, m17
    880    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
    881    vmovdqu8   m17{k3}, m16              ; x
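        ; branchless 256-entry table lookup: vpermi2b indexes the low half of
        ; sgr_x_by_x (m18:m19), vpermt2b the high half (m20:m21), and the
        ; index sign bit (extracted with vpmovb2m) selects between the two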
    882    pandn          m16, m13, m17
    883    psrld          m17, 16
    884    pmulld          m0, m16
    885    pmulld          m1, m17
    886    paddd           m0, m14              ; x * b * 164 + (1 << 11) + (1 << 15)
    887    paddd           m1, m14
    888    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
    889    vpternlogd     m17, m1, m13, 0xd8
    890    mova          [t3+r10*4+  8], m16    ; The neighbor calculations require
    891    mova          [t3+r10*4+ 24], xm17   ; 13 bits for a and 21 bits for b.
    892    vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but
    893    mova          [t3+r10*4+ 72], m17    ; that gets us most of the way.
    894    vextracti128  [t3+r10*4+ 72], ym16, 1
    895    vextracti32x4 [t3+r10*4+104], m16, 3
    896    add            r10, 32
    897    jl .hv_loop
    898    mov             t2, t1
    899    mov             t1, t0
    900    mov             t0, t2
    901    ret
    902 .hv_last_row: ; esoteric edge case for odd heights
    903    mova [t1+r10*2+416*0], m1
    904    paddw              m1, m0
    905    mova [t1+r10*2+416*2], m16
    906    paddd             m16, m2
    907    mova [t1+r10*2+416*4], m17
    908    paddd             m17, m3
    909    jmp .hv_main2
    910 .v: ; vertical boxsum + ab
    911    lea            r10, [wq-2]
    912 .v_loop:
    913    mova            m2, [t1+r10*2+416*2]
    914    paddd          m16, m2, [t2+r10*2+416*2]
    915    mova            m3, [t1+r10*2+416*4]
    916    paddd          m17, m3, [t2+r10*2+416*4]
    917    paddd           m2, m2
    918    paddd           m3, m3
    919    paddd          m16, m2               ; hv sumsq
    920    paddd          m17, m3
    921    pmulld         m16, m9               ; -a * 25
    922    pmulld         m17, m9
    923    mova            m0, [t1+r10*2+416*0]
    924    paddw           m1, m0, [t2+r10*2+416*0]
    925    paddw           m0, m0
    926    paddw           m1, m0               ; hv sum
    927    punpcklwd       m0, m1, m4           ; b
    928    vpdpwssd       m16, m0, m0           ; -p
    929    punpckhwd       m1, m4
    930    vpdpwssd       m17, m1, m1
    931    pmaddwd         m0, m10              ; b * 164
    932    pmaddwd         m1, m10
    933    pmulld         m16, m11              ; p * s
    934    pmulld         m17, m11
    935    vpalignr   m17{k2}, m16, m16, 2
    936    mova           m16, m20
    937    paddusw        m17, m12
    938    psraw          m17, 4                ; min(z, 255) - 256
    939    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
    940    vpmovb2m        k3, m17
    941    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
    942    vmovdqu8   m17{k3}, m16              ; x
    943    pandn          m16, m13, m17
    944    psrld          m17, 16
    945    pmulld          m0, m16
    946    pmulld          m1, m17
    947    paddd           m0, m14              ; x * b * 164 + (1 << 11) + (1 << 15)
    948    paddd           m1, m14
    949    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
    950    vpternlogd     m17, m1, m13, 0xd8
    951    mova          [t3+r10*4+  8], m16
    952    mova          [t3+r10*4+ 24], xm17
    953    vextracti32x4 [t3+r10*4+ 56], m17, 2
    954    mova          [t3+r10*4+ 72], m17
    955    vextracti128  [t3+r10*4+ 72], ym16, 1
    956    vextracti32x4 [t3+r10*4+104], m16, 3
    957    add            r10, 32
    958    jl .v_loop
    959    ret
    960 .prep_n: ; initial neighbor setup
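        ; the 5x5 box computes a/b on even rows only; each ab row gets the
        ; (5,6,5) horizontal weighting here, and .n0/.n1 combine two such
        ; rows vertically for even output rows and reuse one for odd rows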
    961    mov            r10, wq
    962 .prep_n_loop:
    963    movu            m0, [t3+r10*4+ 4]
    964    movu            m1, [t3+r10*4+68]
    965    paddd           m2, m0, [t3+r10*4+ 0]
    966    paddd           m3, m1, [t3+r10*4+64]
    967    paddd           m2, [t3+r10*4+ 8]
    968    paddd           m3, [t3+r10*4+72]
    969    paddd           m0, m2
    970    pslld           m2, 2
    971    paddd           m1, m3
    972    pslld           m3, 2
    973    paddd           m2, m0                ; ab 565
    974    paddd           m3, m1
    975    pandn           m0, m13, m2           ; a
    976    psrld           m2, 12                ; b
    977    pandn           m1, m13, m3
    978    psrld           m3, 12
    979    mova [t3+r10*4+416*4+ 0], m0
    980    mova [t3+r10*4+416*8+ 0], m2
    981    mova [t3+r10*4+416*4+64], m1
    982    mova [t3+r10*4+416*8+64], m3
    983    add            r10, 32
    984    jl .prep_n_loop
    985    ret
    986 ALIGN function_align
    987 .n0: ; neighbor + output (even rows)
    988    mov            r10, wq
    989 .n0_loop:
    990    movu           m16, [t3+r10*4+ 4]
    991    movu           m17, [t3+r10*4+68]
    992    paddd           m0, m16, [t3+r10*4+ 0]
    993    paddd           m1, m17, [t3+r10*4+64]
    994    paddd           m0, [t3+r10*4+ 8]
    995    paddd           m1, [t3+r10*4+72]
    996    paddd          m16, m0
    997    pslld           m0, 2
    998    paddd          m17, m1
    999    pslld           m1, 2
   1000    paddd           m0, m16
   1001    paddd           m1, m17
   1002    pandn          m16, m13, m0
   1003    psrld           m0, 12
   1004    pandn          m17, m13, m1
   1005    psrld           m1, 12
   1006    paddd           m2, m16, [t3+r10*4+416*4+ 0] ; a
   1007    paddd           m3, m17, [t3+r10*4+416*4+64]
   1008    mova [t3+r10*4+416*4+ 0], m16
   1009    mova [t3+r10*4+416*4+64], m17
   1010    paddd          m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8)
   1011    paddd          m17, m1, [t3+r10*4+416*8+64]
   1012    mova [t3+r10*4+416*8+ 0], m0
   1013    mova [t3+r10*4+416*8+64], m1
   1014    pmovzxbd        m0, [dstq+r10+ 0]
   1015    pmovzxbd        m1, [dstq+r10+16]
   1016    pmaddwd         m2, m0                      ; a * src
   1017    pmaddwd         m3, m1
   1018    packssdw        m0, m1
   1019    psubd          m16, m2                      ; b - a * src + (1 << 8)
   1020    psubd          m17, m3
   1021    psrad          m16, 9
   1022    psrad          m17, 9
   1023    packssdw       m16, m17
   1024    pmulhrsw       m16, m15
   1025    paddw          m16, m0
   1026    packuswb       m16, m16
   1027    vpermd         m16, m22, m16
   1028    mova    [dstq+r10], ym16
   1029    add            r10, 32
   1030    jl .n0_loop
   1031    add           dstq, strideq
   1032    ret
   1033 ALIGN function_align
   1034 .n1: ; neighbor + output (odd rows)
   1035    mov            r10, wq
   1036 .n1_loop:
   1037    pmovzxbd        m0, [dstq+r10+ 0]
   1038    pmovzxbd        m1, [dstq+r10+16]
   1039    pmaddwd         m2, m0, [t3+r10*4+416*4+ 0] ; a * src
   1040    pmaddwd         m3, m1, [t3+r10*4+416*4+64]
   1041    mova           m16, [t3+r10*4+416*8+ 0]     ; b + (1 << 7)
   1042    mova           m17, [t3+r10*4+416*8+64]
   1043    packssdw        m0, m1
   1044    psubd          m16, m2                      ; b - a * src + (1 << 7)
   1045    psubd          m17, m3
   1046    psrad          m16, 8
   1047    psrad          m17, 8
   1048    packssdw       m16, m17
   1049    pmulhrsw       m16, m15
   1050    paddw          m16, m0
   1051    packuswb       m16, m16
   1052    vpermd         m16, m22, m16
   1053    mova    [dstq+r10], ym16
   1054    add            r10, 32
   1055    jl .n1_loop
   1056    add           dstq, strideq
   1057    ret
   1058 
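        ; 3x3 box variant: same structure as the 5x5 filter above, but a/b
        ; are computed on every row (box area 9, one_by_x 455) and the
        ; neighbor pass decomposes the weighting into 343/222 row sums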
   1059 cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \
   1060                                                    w, h, edge, params
   1061    mov        paramsq, r6mp
   1062    mov             wd, wm
   1063    movifnidn       hd, hm
   1064    mov          edged, r7m
   1065    vbroadcasti32x4 m5, [sgr_shuf+3]
   1066    add           lpfq, wq
   1067    vbroadcasti32x4 m6, [sgr_shuf+5]
   1068    add           dstq, wq
   1069    vbroadcasti32x4 m7, [sgr_shuf+7]
   1070    pxor            m4, m4
   1071    vpbroadcastd    m8, [pd_m9]
   1072    vpsubd         m11, m4, [paramsq+4] {1to16} ; -s1
   1073    vpbroadcastw   m15, [paramsq+10]            ; w1
   1074    lea             t1, [rsp+wq*2+20]
   1075    vpbroadcastd   m10, [pw_164_455]
   1076    lea             t3, [rsp+wq*4+16+416*12]
   1077    vpbroadcastd   m12, [pw_61448]              ; (15 << 12) + (1 << 3)
   1078    neg             wq
   1079    vpbroadcastd   m13, [pd_m4096]
   1080    mov           r10d, 0xfe
   1081    vpbroadcastd   m14, [pd_34816]              ; (1 << 11) + (1 << 15)
   1082    kmovb           k1, r10d
   1083    mova           m18, [sgr_x_by_x+64*0]
   1084    mov            r10, 0x3333333333333333
   1085    mova           m19, [sgr_x_by_x+64*1]
   1086    kmovq           k2, r10
   1087    mova           m20, [sgr_x_by_x+64*2]
   1088    psllw          m15, 4
   1089    mova           m21, [sgr_x_by_x+64*3]
   1090    lea            r14, [r_ext_mask+75]
   1091    mova           ym9, [sgr_shuf]
   1092    test         edgeb, 4 ; LR_HAVE_TOP
   1093    jz .no_top
   1094    call .h_top
   1095    add           lpfq, strideq
   1096    mov             t2, t1
   1097    add             t1, 416*6
   1098    call .h_top
   1099    lea             t4, [lpfq+strideq*4]
   1100    mov           lpfq, dstq
   1101    add             t4, strideq
   1102    mov          [rsp], t4 ; below
   1103    mov             t0, t2
   1104    call .hv
   1105 .main:
   1106    mov             t5, t3
   1107    add             t3, 416*4
   1108    dec             hd
   1109    jz .height1
   1110    add           lpfq, strideq
   1111    call .hv
   1112    call .prep_n
   1113    dec             hd
   1114    jz .extend_bottom
   1115 .main_loop:
   1116    add           lpfq, strideq
   1117    call .hv
   1118    call .n
   1119    dec             hd
   1120    jnz .main_loop
   1121    test         edgeb, 8 ; LR_HAVE_BOTTOM
   1122    jz .extend_bottom
   1123    mov           lpfq, [rsp]
   1124    call .hv_bottom
   1125    call .n
   1126    add           lpfq, strideq
   1127    call .hv_bottom
   1128 .end:
   1129    call .n
   1130    RET
   1131 .height1:
   1132    call .v
   1133    call .prep_n
   1134    mov             t2, t1
   1135    call .v
   1136    jmp .end
   1137 .extend_bottom:
   1138    call .v
   1139    call .n
   1140    mov             t2, t1
   1141    call .v
   1142    jmp .end
   1143 .no_top:
   1144    lea             t4, [lpfq+strideq*4]
   1145    mov           lpfq, dstq
   1146    lea             t4, [t4+strideq*2]
   1147    mov          [rsp], t4
   1148    call .h
   1149    lea             t0, [t1+416*6]
   1150    mov             t2, t1
   1151    call .v
   1152    jmp .main
   1153 .h: ; horizontal boxsum
   1154    lea            r10, [wq-2]
   1155    test         edgeb, 1 ; LR_HAVE_LEFT
   1156    jz .h_extend_left
   1157    movd          xm17, [leftq]
   1158    vmovdqu32 ym17{k1}, [lpfq+wq-4]
   1159    add          leftq, 4
   1160    jmp .h_main
   1161 .h_extend_left:
   1162    vpbroadcastb  xm17, [lpfq+wq]
   1163    vmovdqu32 ym17{k1}, [lpfq+wq-4]
   1164    jmp .h_main
   1165 .h_top:
   1166    lea            r10, [wq-2]
   1167    test         edgeb, 1 ; LR_HAVE_LEFT
   1168    jz .h_extend_left
   1169 .h_loop:
   1170    movu          ym17, [lpfq+r10-2]
   1171 .h_main:
   1172    vinserti32x8   m17, [lpfq+r10+6], 1
   1173    test         edgeb, 2 ; LR_HAVE_RIGHT
   1174    jnz .h_have_right
   1175    cmp           r10d, -33
   1176    jl .h_have_right
   1177    vpbroadcastb    m0, [lpfq-1]
   1178    movu          ym16, [r14+r10-8]
   1179    vinserti32x8   m16, [r14+r10+0], 1
   1180    vpternlogd     m17, m0, m16, 0xe4
   1181 .h_have_right:
   1182    pshufb          m0, m17, m5
   1183    pmullw          m2, m0, m0
   1184    pshufb         m16, m17, m6
   1185    paddw           m0, m16
   1186    pshufb         m17, m7
   1187    paddw           m0, m17    ; sum
   1188    punpcklwd       m3, m16, m17
   1189    punpcklwd       m1, m2, m4
   1190    vpdpwssd        m1, m3, m3 ; sumsq
   1191    punpckhwd      m16, m17
   1192    punpckhwd       m2, m4
   1193    vpdpwssd        m2, m16, m16
   1194    mova [t1+r10*2+416*0], m0
   1195    mova [t1+r10*2+416*2], m1
   1196    mova [t1+r10*2+416*4], m2
   1197    add            r10, 32
   1198    jl .h_loop
   1199    ret
   1200 ALIGN function_align
   1201 .hv: ; horizontal boxsum + vertical boxsum + ab
   1202    lea            r10, [wq-2]
   1203    test         edgeb, 1 ; LR_HAVE_LEFT
   1204    jz .hv_extend_left
   1205    movd          xm17, [leftq]
   1206    vmovdqu32 ym17{k1}, [lpfq+wq-4]
   1207    add          leftq, 4
   1208    jmp .hv_main
   1209 .hv_extend_left:
   1210    vpbroadcastb  xm17, [lpfq+wq]
   1211    vmovdqu32 ym17{k1}, [lpfq+wq-4]
   1212    jmp .hv_main
   1213 .hv_bottom:
   1214    lea            r10, [wq-2]
   1215    test         edgeb, 1 ; LR_HAVE_LEFT
   1216    jz .hv_extend_left
   1217 .hv_loop:
   1218    movu          ym17, [lpfq+r10-2]
   1219 .hv_main:
   1220    vinserti32x8   m17, [lpfq+r10+6], 1
   1221    test         edgeb, 2 ; LR_HAVE_RIGHT
   1222    jnz .hv_have_right
   1223    cmp           r10d, -33
   1224    jl .hv_have_right
   1225    vpbroadcastb    m0, [lpfq-1]
   1226    movu          ym16, [r14+r10-8]
   1227    vinserti32x8   m16, [r14+r10+0], 1
   1228    vpternlogd     m17, m0, m16, 0xe4
   1229 .hv_have_right:
   1230    pshufb          m0, m17, m5
   1231    pmullw          m3, m0, m0
   1232    pshufb          m1, m17, m6
   1233    paddw           m0, m1
   1234    pshufb         m17, m7
   1235    paddw           m0, m17              ; h sum
   1236    punpcklwd      m16, m17, m1
   1237    punpcklwd       m2, m3, m4
   1238    vpdpwssd        m2, m16, m16         ; h sumsq
   1239    punpckhwd      m17, m1
   1240    punpckhwd       m3, m4
   1241    vpdpwssd        m3, m17, m17
   1242    paddw           m1, m0, [t2+r10*2+416*0]
   1243    paddw           m1, [t1+r10*2+416*0] ; hv sum
   1244    paddd          m16, m2, [t2+r10*2+416*2]
   1245    paddd          m17, m3, [t2+r10*2+416*4]
   1246    paddd          m16, [t1+r10*2+416*2] ; hv sumsq
   1247    paddd          m17, [t1+r10*2+416*4]
   1248    mova [t0+r10*2+416*0], m0
   1249    mova [t0+r10*2+416*2], m2
   1250    mova [t0+r10*2+416*4], m3
   1251    pmulld         m16, m8               ; -a * 9
   1252    pmulld         m17, m8
   1253    punpcklwd       m0, m4, m1           ; b
   1254    vpdpwssd       m16, m0, m0           ; -p
   1255    punpckhwd       m1, m4, m1
   1256    vpdpwssd       m17, m1, m1
   1257    pmaddwd         m0, m10              ; b * 455
   1258    pmaddwd         m1, m10
   1259    pmulld         m16, m11              ; p * s
   1260    pmulld         m17, m11
   1261    vpalignr   m17{k2}, m16, m16, 2
   1262    mova           m16, m20
   1263    paddusw        m17, m12
   1264    psraw          m17, 4                ; min(z, 255) - 256
   1265    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
   1266    vpmovb2m        k3, m17
   1267    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
   1268    vmovdqu8   m17{k3}, m16              ; x
   1269    pandn          m16, m13, m17
   1270    psrld          m17, 16
   1271    pmulld          m0, m16
   1272    pmulld          m1, m17
   1273    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
   1274    paddd           m1, m14
   1275    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
   1276    vpternlogd     m17, m1, m13, 0xd8
   1277    mova          [t3+r10*4+  8], m16
   1278    mova          [t3+r10*4+ 24], xm17
   1279    vextracti32x4 [t3+r10*4+ 56], m17, 2
   1280    mova          [t3+r10*4+ 72], m17
   1281    vextracti128  [t3+r10*4+ 72], ym16, 1
   1282    vextracti32x4 [t3+r10*4+104], m16, 3
   1283    add            r10, 32
   1284    jl .hv_loop
   1285    mov             t2, t1
   1286    mov             t1, t0
   1287    mov             t0, t2
   1288    ret
   1289 .v: ; vertical boxsum + ab
   1290    lea            r10, [wq-2]
   1291 .v_loop:
   1292    mova           m16, [t1+r10*2+416*2]
   1293    mova           m17, [t1+r10*2+416*4]
   1294    paddd          m16, m16
   1295    paddd          m17, m17
   1296    paddd          m16, [t2+r10*2+416*2] ; hv sumsq
   1297    paddd          m17, [t2+r10*2+416*4]
   1298    pmulld         m16, m8               ; -a * 9
   1299    pmulld         m17, m8
   1300    mova            m1, [t1+r10*2+416*0]
   1301    paddw           m1, m1
   1302    paddw           m1, [t2+r10*2+416*0] ; hv sum
   1303    punpcklwd       m0, m4, m1           ; b
   1304    vpdpwssd       m16, m0, m0           ; -p
   1305    punpckhwd       m1, m4, m1
   1306    vpdpwssd       m17, m1, m1
   1307    pmaddwd         m0, m10              ; b * 455
   1308    pmaddwd         m1, m10
   1309    pmulld         m16, m11              ; p * s
   1310    pmulld         m17, m11
   1311    vpalignr   m17{k2}, m16, m16, 2
   1312    mova           m16, m20
   1313    paddusw        m17, m12
   1314    psraw          m17, 4                ; min(z, 255) - 256
   1315    vpermt2b       m16, m17, m21         ; sgr_x_by_x[128..255]
   1316    vpmovb2m        k3, m17
   1317    vpermi2b       m17, m18, m19         ; sgr_x_by_x[  0..127]
   1318    vmovdqu8   m17{k3}, m16              ; x
   1319    pandn          m16, m13, m17
   1320    psrld          m17, 16
   1321    pmulld          m0, m16
   1322    pmulld          m1, m17
   1323    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
   1324    paddd           m1, m14
   1325    vpternlogd     m16, m0, m13, 0xd8    ; a | (b << 12)
   1326    vpternlogd     m17, m1, m13, 0xd8
   1327    mova          [t3+r10*4+  8], m16
   1328    mova          [t3+r10*4+ 24], xm17
   1329    vextracti32x4 [t3+r10*4+ 56], m17, 2
   1330    mova          [t3+r10*4+ 72], m17
   1331    vextracti128  [t3+r10*4+ 72], ym16, 1
   1332    vextracti32x4 [t3+r10*4+104], m16, 3
   1333    add            r10, 32
   1334    jl .v_loop
   1335    ret
   1336 .prep_n: ; initial neighbor setup
   1337    mov            r10, wq
   1338    mov             t4, t3
   1339    add             t3, 416*4
   1340 .prep_n_loop:
   1341    mova            m2, [t5+r10*4+0]
   1342    mova            m3, [t4+r10*4+0]
   1343    paddd           m2, [t5+r10*4+8]
   1344    paddd           m3, [t4+r10*4+8]
   1345    paddd           m0, m2, [t5+r10*4+4]
   1346    paddd           m1, m3, [t4+r10*4+4]
   1347    pslld           m0, 2
   1348    paddd           m1, m1                ; ab[ 0] 222
   1349    psubd           m0, m2                ; ab[-1] 343
   1350    mova [t3+r10*4+416*4], m1
   1351    paddd           m1, m1
   1352    mova    [t5+r10*4], m0
   1353    psubd           m1, m3                ; ab[ 0] 343
   1354    mova    [t4+r10*4], m1
   1355    add            r10, 16
   1356    jl .prep_n_loop
   1357    ret
   1358 ; a+b are packed together in a single dword, but we can't do the
   1359 ; full neighbor calculations before splitting them since we don't
   1360 ; have sufficient precision. The solution is to do the calculations
   1361 ; in two equal halves and split a and b before doing the final sum.
   1362 ALIGN function_align
   1363 .n: ; neighbor + output
   1364    mov            r10, wq
   1365 .n_loop:
   1366    mova           m16, [t3+r10*4+ 0]
   1367    paddd          m16, [t3+r10*4+ 8]
   1368    paddd          m17, m16, [t3+r10*4+ 4]
   1369    paddd          m17, m17               ; ab[+1] 222
   1370    mova            m2, [t3+r10*4+416*4+ 0]
   1371    paddd           m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
   1372    mova            m3, [t3+r10*4+416*4+64]
   1373    paddd           m1, m3, [t5+r10*4+64]
   1374    mova [t3+r10*4+416*4+ 0], m17
   1375    paddd          m17, m17
   1376    psubd          m17, m16               ; ab[+1] 343
   1377    mova [t5+r10*4+ 0], m17
   1378    paddd           m2, m17               ; ab[ 0] 222 + ab[+1] 343
   1379    mova           m16, [t3+r10*4+64]
   1380    paddd          m16, [t3+r10*4+72]
   1381    paddd          m17, m16, [t3+r10*4+68]
   1382    paddd          m17, m17
   1383    mova [t3+r10*4+416*4+64], m17
   1384    paddd          m17, m17
   1385    psubd          m17, m16
   1386    mova [t5+r10*4+64], m17
   1387    pandn          m16, m13, m0
   1388    psrld           m0, 12
   1389    paddd           m3, m17
   1390    pandn          m17, m13, m2
   1391    psrld           m2, 12
   1392    paddd          m16, m17               ; a
   1393    pandn          m17, m13, m1
   1394    psrld           m1, 12
   1395    paddd           m0, m2                ; b + (1 << 8)
   1396    pandn           m2, m13, m3
   1397    psrld           m3, 12
   1398    paddd          m17, m2
   1399    pmovzxbd        m2, [dstq+r10+ 0]
   1400    paddd           m1, m3
   1401    pmovzxbd        m3, [dstq+r10+16]
   1402    pmaddwd        m16, m2                ; a * src
   1403    pmaddwd        m17, m3
   1404    packssdw        m2, m3
   1405    psubd           m0, m16               ; b - a * src + (1 << 8)
   1406    psubd           m1, m17
   1407    psrad           m0, 9
   1408    psrad           m1, 9
   1409    packssdw        m0, m1
   1410    pmulhrsw        m0, m15
   1411    paddw           m0, m2
   1412    packuswb        m0, m0
   1413    vpermd         m16, m9, m0
   1414    mova    [dstq+r10], ym16
   1415    add            r10, 32
   1416    jl .n_loop
   1417    mov            r10, t5
   1418    mov             t5, t4
   1419    mov             t4, r10
   1420    add           dstq, strideq
   1421    ret
   1422 
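        ; mix filter: computes both the 5x5 and 3x3 box sums in a single
        ; pass and blends the two filter outputs with the weights w0 and w1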
   1423 cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \
   1424                                                  w, h, edge, params
   1425    mov        paramsq, r6mp
   1426    mov             wd, wm
   1427    movifnidn       hd, hm
   1428    mov          edged, r7m
   1429    vbroadcasti128  m5, [sgr_shuf+1]
   1430    add           lpfq, wq
   1431    vbroadcasti128  m6, [sgr_shuf+9]
   1432    add           dstq, wq
   1433    vbroadcasti128  m7, [sgr_shuf+3]
   1434    lea             t3, [rsp+wq*4+416*24+8]
   1435    vbroadcasti128  m8, [sgr_shuf+7]
   1436    pxor            m4, m4
   1437    vpbroadcastd    m9, [pd_m9]
   1438    vpsubd         m11, m4, [paramsq+0] {1to16} ; -s0
   1439    vpbroadcastd   m14, [pw_61448]
   1440    vpsubd         m12, m4, [paramsq+4] {1to16} ; -s1
   1441    vpbroadcastd   m26, [paramsq+8]             ; w0 w1
   1442    lea             t1, [rsp+wq*2+12]
   1443    vpbroadcastd   m10, [pd_m25]
   1444    neg             wq
   1445    vpbroadcastd   m13, [pw_164_455]
   1446    mov           r10d, 0xfe
   1447    vpbroadcastd   m15, [pd_34816]
   1448    kmovb           k1, r10d
   1449    mova           m20, [sgr_x_by_x+64*0]
   1450    mov            r10, 0x3333333333333333
   1451    mova           m21, [sgr_x_by_x+64*1]
   1452    kmovq           k2, r10
   1453    mova           m22, [sgr_x_by_x+64*2]
   1454    lea            r12, [r_ext_mask+75]
   1455    mova           m23, [sgr_x_by_x+64*3]
   1456    vpbroadcastd   m24, [pd_m4096]
   1457    vpbroadcastd   m25, [sgr_shuf+28]           ; 0x8000____
   1458    psllw          m26, 5
   1459    mova          xm27, [sgr_mix_perm]
   1460    test         edgeb, 4 ; LR_HAVE_TOP
   1461    jz .no_top
   1462    call .h_top
   1463    add           lpfq, strideq
   1464    mov             t2, t1
   1465    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup
   1466    add             t1, 416*12
   1467    call .h_top
   1468    lea            r10, [lpfq+strideq*4]
   1469    mov           lpfq, dstq
   1470    add            r10, strideq
   1471    mov          [rsp], r10 ; below
   1472    call .hv0
   1473 .main:
   1474    dec             hd
   1475    jz .height1
   1476    add           lpfq, strideq
   1477    call .hv1
   1478    call .prep_n
   1479    sub             hd, 2
   1480    jl .extend_bottom
   1481 .main_loop:
   1482    add           lpfq, strideq
   1483    call .hv0
   1484    test            hd, hd
   1485    jz .odd_height
   1486    add           lpfq, strideq
   1487    call .hv1
   1488    call .n0
   1489    call .n1
   1490    sub             hd, 2
   1491    jge .main_loop
   1492    test         edgeb, 8 ; LR_HAVE_BOTTOM
   1493    jz .extend_bottom
   1494    mov           lpfq, [rsp]
   1495    call .hv0_bottom
   1496    add           lpfq, strideq
   1497    call .hv1_bottom
   1498 .end:
   1499    call .n0
   1500    call .n1
   1501 .end2:
   1502    RET
   1503 .height1:
   1504    call .v1
   1505    call .prep_n
   1506    jmp .odd_height_end
   1507 .odd_height:
   1508    call .v1
   1509    call .n0
   1510    call .n1
   1511 .odd_height_end:
   1512    call .v0
   1513    call .v1
   1514    call .n0
   1515    jmp .end2
   1516 .extend_bottom:
   1517    call .v0
   1518    call .v1
   1519    jmp .end
   1520 .no_top:
   1521    lea            r10, [lpfq+strideq*4]
   1522    mov           lpfq, dstq
   1523    lea            r10, [r10+strideq*2]
   1524    mov          [rsp], r10 ; below
   1525    call .h
   1526    lea             t2, [t1+416*12]
   1527    lea            r10, [wq-2]
   1528 .top_fixup_loop:
   1529    mova            m0, [t1+r10*2+416* 0]
   1530    mova            m1, [t1+r10*2+416* 2]
   1531    mova            m2, [t1+r10*2+416* 4]
   1532    paddw           m0, m0
   1533    mova            m3, [t1+r10*2+416* 6]
   1534    paddd           m1, m1
   1535    mova           m16, [t1+r10*2+416* 8]
   1536    paddd           m2, m2
   1537    mova           m17, [t1+r10*2+416*10]
   1538    mova [t2+r10*2+416* 0], m0
   1539    mova [t2+r10*2+416* 2], m1
   1540    mova [t2+r10*2+416* 4], m2
   1541    mova [t2+r10*2+416* 6], m3
   1542    mova [t2+r10*2+416* 8], m16
   1543    mova [t2+r10*2+416*10], m17
   1544    add            r10, 32
   1545    jl .top_fixup_loop
   1546    call .v0
   1547    jmp .main
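        ; With no top edge the first row stands in for the missing rows
        ; above it: the loop above doubles its 5x5 sums into t2 while the
        ; 3x3 sums are copied once.
        ;
        ; .h computes one row of horizontal box sums: sum3/sumsq3 go to
        ; the t1 planes at 416*6/8/10, and sum5/sumsq5 (sum3 plus the two
        ; outer taps) to the planes at 416*0/2/4, with the squares
        ; accumulated by vpdpwssd on interleaved pixel pairs.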
   1548 .h: ; horizontal boxsums
   1549    lea            r10, [wq-2]
   1550    test         edgeb, 1 ; LR_HAVE_LEFT
   1551    jz .h_extend_left
   1552    movd          xm17, [leftq]
   1553    vmovdqu32 ym17{k1}, [lpfq+wq-4]
   1554    add          leftq, 4
   1555    jmp .h_main
   1556 .h_extend_left:
   1557    vpbroadcastb  xm17, [lpfq+wq]
   1558    vmovdqu32 ym17{k1}, [lpfq+wq-4]
   1559    jmp .h_main
   1560 .h_top:
   1561    lea            r10, [wq-2]
   1562    test         edgeb, 1 ; LR_HAVE_LEFT
   1563    jz .h_extend_left
   1564 .h_loop:
   1565    movu          ym17, [lpfq+r10-2]
   1566 .h_main:
   1567    vinserti32x8   m17, [lpfq+r10+6], 1
   1568    test         edgeb, 2 ; LR_HAVE_RIGHT
   1569    jnz .h_have_right
   1570    cmp           r10d, -34
   1571    jl .h_have_right
   1572    vpbroadcastb    m0, [lpfq-1]
   1573    movu          ym16, [r12+r10-8]
   1574    vinserti32x8   m16, [r12+r10+0], 1
   1575    vpternlogd     m17, m0, m16, 0xe4
   1576 .h_have_right:
   1577    pshufb          m3, m17, m5
   1578    pshufb         m18, m17, m6
   1579    shufps          m0, m3, m18, q2121
   1580    pmullw          m2, m0, m0
   1581    pshufb         m19, m17, m7
   1582    paddw           m0, m19
   1583    pshufb         m17, m8
   1584    paddw           m0, m17           ; sum3
   1585    punpcklwd      m16, m19, m17
   1586    punpcklwd       m1, m2, m4
   1587    vpdpwssd        m1, m16, m16      ; sumsq3
   1588    punpckhwd      m19, m17
   1589    punpckhwd       m2, m4
   1590    vpdpwssd        m2, m19, m19
   1591    mova [t1+r10*2+416* 6], m0
   1592    mova [t1+r10*2+416* 8], m1
   1593    mova [t1+r10*2+416*10], m2
   1594    punpcklwd      m19, m3, m18
   1595    paddw           m0, m3
   1596    vpdpwssd        m1, m19, m19      ; sumsq5
   1597    punpckhwd       m3, m18
   1598    paddw           m0, m18           ; sum5
   1599    vpdpwssd        m2, m3, m3
   1600    mova [t1+r10*2+416* 0], m0
   1601    mova [t1+r10*2+416* 2], m1
   1602    mova [t1+r10*2+416* 4], m2
   1603    add            r10, 32
   1604    jl .h_loop
   1605    ret
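        ; .hv0 handles even source rows: it folds the fresh horizontal
        ; sums into the t1/t2 row buffers, derives ab3 for the row, and
        ; stashes the raw sum5/sumsq5 of the row pair in t3 so the next
        ; odd row can finish the 5x5 window (and so .v1 can replay it
        ; when the height is odd).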
   1606 ALIGN function_align
   1607 .hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
   1608    lea            r10, [wq-2]
   1609    test         edgeb, 1 ; LR_HAVE_LEFT
   1610    jz .hv0_extend_left
   1611    movd          xm17, [leftq]
   1612    vmovdqu32 ym17{k1}, [lpfq+wq-4]
   1613    add          leftq, 4
   1614    jmp .hv0_main
   1615 .hv0_extend_left:
   1616    vpbroadcastb  xm17, [lpfq+wq]
   1617    vmovdqu32 ym17{k1}, [lpfq+wq-4]
   1618    jmp .hv0_main
   1619 .hv0_bottom:
   1620    lea            r10, [wq-2]
   1621    test         edgeb, 1 ; LR_HAVE_LEFT
   1622    jz .hv0_extend_left
   1623 .hv0_loop:
   1624    movu          ym17, [lpfq+r10-2]
   1625 .hv0_main:
   1626    vinserti32x8   m17, [lpfq+r10+6], 1
   1627    test         edgeb, 2 ; LR_HAVE_RIGHT
   1628    jnz .hv0_have_right
   1629    cmp           r10d, -34
   1630    jl .hv0_have_right
   1631    vpbroadcastb    m0, [lpfq-1]
   1632    movu          ym16, [r12+r10-8]
   1633    vinserti32x8   m16, [r12+r10+0], 1
   1634    vpternlogd     m17, m0, m16, 0xe4
   1635 .hv0_have_right:
   1636    pshufb         m18, m17, m5
   1637    pshufb         m19, m17, m6
   1638    shufps          m1, m18, m19, q2121
   1639    pmullw          m3, m1, m1
   1640    pshufb          m0, m17, m7
   1641    paddw           m1, m0
   1642    pshufb         m17, m8
   1643    paddw           m1, m17           ; sum3
   1644    punpcklwd      m16, m0, m17
   1645    punpcklwd       m2, m3, m4
   1646    vpdpwssd        m2, m16, m16      ; sumsq3
   1647    punpckhwd       m0, m17
   1648    punpckhwd       m3, m4
   1649    vpdpwssd        m3, m0, m0
   1650    paddw           m0, m1, [t1+r10*2+416* 6]
   1651    paddd          m16, m2, [t1+r10*2+416* 8]
   1652    paddd          m17, m3, [t1+r10*2+416*10]
   1653    mova [t1+r10*2+416* 6], m1
   1654    mova [t1+r10*2+416* 8], m2
   1655    mova [t1+r10*2+416*10], m3
   1656    paddw           m1, m18
   1657    paddw           m1, m19           ; sum5
   1658    mova [t3+r10*4+416*8+ 8], m1
   1659    paddw           m1, [t1+r10*2+416* 0]
   1660    mova [t1+r10*2+416* 0], m1
   1661    punpcklwd       m1, m18, m19
   1662    vpdpwssd        m2, m1, m1        ; sumsq5
   1663    punpckhwd      m18, m19
   1664    vpdpwssd        m3, m18, m18
   1665    mova [t3+r10*4+416*0+ 8], m2      ; we need a clean copy of the last row
   1666    mova [t3+r10*4+416*0+72], m3      ; in case height is odd
   1667    paddd           m2, [t1+r10*2+416* 2]
   1668    paddd           m3, [t1+r10*2+416* 4]
   1669    mova [t1+r10*2+416* 2], m2
   1670    mova [t1+r10*2+416* 4], m3
   1671    paddw           m1, m0, [t2+r10*2+416* 6]
   1672    paddd           m2, m16, [t2+r10*2+416* 8]
   1673    paddd           m3, m17, [t2+r10*2+416*10]
   1674    mova [t2+r10*2+416* 6], m0
   1675    mova [t2+r10*2+416* 8], m16
   1676    mova [t2+r10*2+416*10], m17
   1677    pmulld         m16, m2, m9        ; -a3 * 9
   1678    pmulld         m17, m3, m9
   1679    punpcklwd       m0, m4, m1        ; b3
   1680    vpdpwssd       m16, m0, m0        ; -p3
   1681    punpckhwd       m1, m4, m1
   1682    vpdpwssd       m17, m1, m1
   1683    pmulld         m16, m12           ; p3 * s1
   1684    pmulld         m17, m12
   1685    pmaddwd         m0, m13           ; b3 * 455
   1686    pmaddwd         m1, m13
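        ; Shared x = sgr_x_by_x[min(z, 255)] lookup: the dword z values
        ; are packed to words (vpalignr under k2), biased and saturated
        ; (paddusw/psraw) into index - 256, then the 256-byte table in
        ; m20-m23 is probed with two byte permutes; bit 7 of each index,
        ; extracted with vpmovb2m, selects between the two 128-byte halves.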
   1687    vpalignr   m17{k2}, m16, m16, 2
   1688    mova           m16, m22
   1689    paddusw        m17, m14
   1690    psraw          m17, 4             ; min(z3, 255) - 256
   1691    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
   1692    vpmovb2m        k3, m17
   1693    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
   1694    vmovdqu8   m17{k3}, m16           ; x3
   1695    pandn          m16, m24, m17
   1696    psrld          m17, 16
   1697    pmulld          m0, m16
   1698    pmulld          m1, m17
   1699    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   1700    paddd           m1, m15
   1701    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
   1702    vpternlogd     m17, m1, m24, 0xd8
   1703    mova          [t3+r10*4+416*4+  8], m16
   1704    mova          [t3+r10*4+416*4+ 24], xm17
   1705    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
   1706    mova          [t3+r10*4+416*4+ 72], m17
   1707    vextracti128  [t3+r10*4+416*4+ 72], ym16, 1
   1708    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
   1709    add            r10, 32
   1710    jl .hv0_loop
   1711    ret
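        ; .hv1 handles odd source rows: besides the per-row ab3 it closes
        ; the 5x5 window (this row pair plus the history in t1/t2) to
        ; produce ab5, and writes both packed a | (b << 12) planes to t3.
        ; The t1/t2 ring pointers swap on exit so the buffers rotate by
        ; one row pair.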
   1712 ALIGN function_align
   1713 .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
   1714    lea            r10, [wq-2]
   1715    test         edgeb, 1 ; LR_HAVE_LEFT
   1716    jz .hv1_extend_left
   1717    movd          xm17, [leftq]
   1718    vmovdqu32 ym17{k1}, [lpfq+wq-4]
   1719    add          leftq, 4
   1720    jmp .hv1_main
   1721 .hv1_extend_left:
   1722    vpbroadcastb  xm17, [lpfq+wq]
   1723    vmovdqu32 ym17{k1}, [lpfq+wq-4]
   1724    jmp .hv1_main
   1725 .hv1_bottom:
   1726    lea            r10, [wq-2]
   1727    test         edgeb, 1 ; LR_HAVE_LEFT
   1728    jz .hv1_extend_left
   1729 .hv1_loop:
   1730    movu          ym17, [lpfq+r10-2]
   1731 .hv1_main:
   1732    vinserti32x8   m17, [lpfq+r10+6], 1
   1733    test         edgeb, 2 ; LR_HAVE_RIGHT
   1734    jnz .hv1_have_right
   1735    cmp           r10d, -34
   1736    jl .hv1_have_right
   1737    vpbroadcastb    m0, [lpfq-1]
   1738    movu          ym16, [r12+r10-8]
   1739    vinserti32x8   m16, [r12+r10+0], 1
   1740    vpternlogd     m17, m0, m16, 0xe4
   1741 .hv1_have_right:
   1742    pshufb          m3, m17, m5
   1743    pshufb         m19, m17, m6
   1744    shufps          m2, m3, m19, q2121
   1745    pmullw          m1, m2, m2
   1746    pshufb         m18, m17, m7
   1747    paddw           m2, m18
   1748    pshufb         m17, m8
   1749    paddw           m2, m17           ; sum3
   1750    punpcklwd      m16, m17, m18
   1751    punpcklwd       m0, m1, m4
   1752    vpdpwssd        m0, m16, m16      ; sumsq3
   1753    punpckhwd      m17, m18
   1754    punpckhwd       m1, m4
   1755    vpdpwssd        m1, m17, m17
   1756    paddd          m16, m0, [t2+r10*2+416* 8]
   1757    paddd          m17, m1, [t2+r10*2+416*10]
   1758    mova [t2+r10*2+416* 8], m0
   1759    mova [t2+r10*2+416*10], m1
   1760    punpcklwd      m18, m3, m19
   1761    vpdpwssd        m0, m18, m18      ; sumsq5
   1762    punpckhwd      m18, m3, m19
   1763    vpdpwssd        m1, m18, m18
   1764    paddw           m3, m19
   1765    pmulld         m16, m9            ; -a3 * 9
   1766    pmulld         m17, m9
   1767    paddd          m18, m0, [t2+r10*2+416*2]
   1768    paddd          m19, m1, [t2+r10*2+416*4]
   1769    paddd          m18, [t1+r10*2+416*2]
   1770    paddd          m19, [t1+r10*2+416*4]
   1771    mova [t2+r10*2+416*2], m0
   1772    mova [t2+r10*2+416*4], m1
   1773    pmulld         m18, m10           ; -a5 * 25
   1774    pmulld         m19, m10
   1775    paddw           m1, m2, [t2+r10*2+416* 6]
   1776    mova [t2+r10*2+416* 6], m2
   1777    paddw           m2, m3            ; sum5
   1778    paddw           m3, m2, [t2+r10*2+416*0]
   1779    paddw           m3, [t1+r10*2+416*0]
   1780    mova [t2+r10*2+416*0], m2
   1781    punpcklwd       m0, m4, m1        ; b3
   1782    vpdpwssd       m16, m0, m0        ; -p3
   1783    punpckhwd       m1, m4, m1
   1784    vpdpwssd       m17, m1, m1
   1785    punpcklwd       m2, m3, m4        ; b5
   1786    vpdpwssd       m18, m2, m2        ; -p5
   1787    punpckhwd       m3, m4
   1788    vpdpwssd       m19, m3, m3
   1789    pmulld         m16, m12           ; p3 * s1
   1790    pmulld         m17, m12
   1791    pmulld         m18, m11           ; p5 * s0
   1792    pmulld         m19, m11
   1793    pmaddwd         m0, m13           ; b3 * 455
   1794    pmaddwd         m1, m13
   1795    pmaddwd         m2, m13           ; b5 * 164
   1796    pmaddwd         m3, m13
   1797    vpalignr   m17{k2}, m16, m16, 2
   1798    vpalignr   m19{k2}, m18, m18, 2
   1799    paddusw        m17, m14
   1800    mova           m16, m22
   1801    psraw          m17, 4             ; min(z3, 255) - 256
   1802    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
   1803    vpmovb2m        k3, m17
   1804    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
   1805    paddusw        m19, m14
   1806    mova           m18, m22
   1807    psraw          m19, 4             ; min(z5, 255) - 256
   1808    vpermt2b       m18, m19, m23      ; sgr_x_by_x[128..255]
   1809    vpmovb2m        k4, m19
   1810    vpermi2b       m19, m20, m21      ; sgr_x_by_x[  0..127]
   1811    vmovdqu8   m17{k3}, m16           ; x3
   1812    vmovdqu8   m19{k4}, m18           ; x5
   1813    pandn          m16, m24, m17
   1814    psrld          m17, 16
   1815    pmulld          m0, m16
   1816    pmulld          m1, m17
   1817    pandn          m18, m24, m19
   1818    psrld          m19, 16
   1819    pmulld          m2, m18
   1820    pmulld          m3, m19
   1821    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   1822    paddd           m1, m15
   1823    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
   1824    vpternlogd     m17, m1, m24, 0xd8
   1825    mova          [t3+r10*4+416*8+  8], m16
   1826    mova          [t3+r10*4+416*8+ 24], xm17
   1827    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
   1828    paddd           m2, m15           ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
   1829    paddd           m3, m15
   1830    mova          [t3+r10*4+416*8+ 72], m17
   1831    vextracti128  [t3+r10*4+416*8+ 72], ym16, 1
   1832    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
   1833    vpternlogd     m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
   1834    vpternlogd     m19, m3, m24, 0xd8
   1835    mova          [t3+r10*4+416*0+  8], m18
   1836    mova          [t3+r10*4+416*0+ 24], xm19
   1837    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
   1838    mova          [t3+r10*4+416*0+ 72], m19
   1839    vextracti128  [t3+r10*4+416*0+ 72], ym18, 1
   1840    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
   1841    add            r10, 32
   1842    jl .hv1_loop
   1843    mov            r10, t2
   1844    mov             t2, t1
   1845    mov             t1, r10
   1846    ret
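        ; .v0/.v1 mirror .hv0/.hv1 without consuming a new input row; they
        ; serve bottom-edge extension and odd heights by doubling the
        ; buffered sums so the last available row is counted twice.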
   1847 .v0: ; vertical boxsums + ab3 (even rows)
   1848    lea            r10, [wq-2]
   1849 .v0_loop:
   1850    mova            m2, [t1+r10*2+416* 8]
   1851    mova            m3, [t1+r10*2+416*10]
   1852    paddd           m2, m2
   1853    paddd           m3, m3
   1854    paddd          m16, m2, [t2+r10*2+416* 8]
   1855    paddd          m17, m3, [t2+r10*2+416*10]
   1856    mova            m0, [t1+r10*2+416* 6]
   1857    paddw           m0, m0
   1858    paddw           m1, m0, [t2+r10*2+416* 6]
   1859    pmulld         m16, m9            ; -a3 * 9
   1860    pmulld         m17, m9
   1861    mova [t2+r10*2+416* 6], m0
   1862    mova [t2+r10*2+416* 8], m2
   1863    mova [t2+r10*2+416*10], m3
   1864    mova            m2, [t1+r10*2+416*0]
   1865    mova            m3, [t1+r10*2+416*2]
   1866    mova           m18, [t1+r10*2+416*4]
   1867    punpcklwd       m0, m4, m1        ; b3
   1868    vpdpwssd       m16, m0, m0        ; -p3
   1869    punpckhwd       m1, m4, m1
   1870    vpdpwssd       m17, m1, m1
   1871    pmulld         m16, m12           ; p3 * s1
   1872    pmulld         m17, m12
   1873    pmaddwd         m0, m13           ; b3 * 455
   1874    pmaddwd         m1, m13
   1875    mova [t3+r10*4+416*8+ 8], m2
   1876    mova [t3+r10*4+416*0+ 8], m3
   1877    mova [t3+r10*4+416*0+72], m18
   1878    vpalignr   m17{k2}, m16, m16, 2
   1879    mova           m16, m22
   1880    paddusw        m17, m14
   1881    psraw          m17, 4             ; min(z3, 255) - 256
   1882    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
   1883    vpmovb2m        k3, m17
   1884    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
   1885    vmovdqu8   m17{k3}, m16           ; x3
   1886    pandn          m16, m24, m17
   1887    psrld          m17, 16
   1888    pmulld          m0, m16
   1889    pmulld          m1, m17
   1890    paddw           m2, m2            ; cc5
   1891    paddd           m3, m3
   1892    paddd          m18, m18
   1893    mova [t1+r10*2+416*0], m2
   1894    mova [t1+r10*2+416*2], m3
   1895    mova [t1+r10*2+416*4], m18
   1896    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   1897    paddd           m1, m15
   1898    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
   1899    vpternlogd     m17, m1, m24, 0xd8
   1900    mova          [t3+r10*4+416*4+  8], m16
   1901    mova          [t3+r10*4+416*4+ 24], xm17
   1902    vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
   1903    mova          [t3+r10*4+416*4+ 72], m17
   1904    vextracti128  [t3+r10*4+416*4+ 72], ym16, 1
   1905    vextracti32x4 [t3+r10*4+416*4+104], m16, 3
   1906    add            r10, 32
   1907    jl .v0_loop
   1908    ret
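        ; .v1 reuses the clean sum5/sumsq5 copies that .hv0/.v0 left in t3
        ; to finish the 5x5 window without re-reading the source row.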
   1909 .v1: ; vertical boxsums + ab (odd rows)
   1910    lea            r10, [wq-2]
   1911 .v1_loop:
   1912    mova            m0, [t1+r10*2+416* 8]
   1913    paddd          m16, m0, [t2+r10*2+416* 8]
   1914    mova            m1, [t1+r10*2+416*10]
   1915    paddd          m17, m1, [t2+r10*2+416*10]
   1916    mova            m2, [t3+r10*4+416*0+ 8]
   1917    paddd          m18, m2, [t2+r10*2+416* 2]
   1918    mova            m3, [t3+r10*4+416*0+72]
   1919    paddd          m19, m3, [t2+r10*2+416* 4]
   1920    paddd          m18, [t1+r10*2+416* 2]
   1921    paddd          m19, [t1+r10*2+416* 4]
   1922    mova [t2+r10*2+416* 8], m0
   1923    mova [t2+r10*2+416*10], m1
   1924    mova [t2+r10*2+416* 2], m2
   1925    mova [t2+r10*2+416* 4], m3
   1926    pmulld         m16, m9            ; -a3 * 9
   1927    pmulld         m17, m9
   1928    pmulld         m18, m10           ; -a5 * 25
   1929    pmulld         m19, m10
   1930    mova            m0, [t1+r10*2+416* 6]
   1931    paddw           m1, m0, [t2+r10*2+416* 6]
   1932    mova            m2, [t3+r10*4+416*8+ 8]
   1933    paddw           m3, m2, [t2+r10*2+416*0]
   1934    paddw           m3, [t1+r10*2+416*0]
   1935    mova [t2+r10*2+416* 6], m0
   1936    mova [t2+r10*2+416*0], m2
   1937    punpcklwd       m0, m4, m1        ; b3
   1938    vpdpwssd       m16, m0, m0        ; -p3
   1939    punpckhwd       m1, m4, m1
   1940    vpdpwssd       m17, m1, m1
   1941    punpcklwd       m2, m3, m4        ; b5
   1942    vpdpwssd       m18, m2, m2        ; -p5
   1943    punpckhwd       m3, m4
   1944    vpdpwssd       m19, m3, m3
   1945    pmulld         m16, m12           ; p3 * s1
   1946    pmulld         m17, m12
   1947    pmulld         m18, m11           ; p5 * s0
   1948    pmulld         m19, m11
   1949    pmaddwd         m0, m13           ; b3 * 455
   1950    pmaddwd         m1, m13
   1951    pmaddwd         m2, m13           ; b5 * 164
   1952    pmaddwd         m3, m13
   1953    vpalignr   m17{k2}, m16, m16, 2
   1954    vpalignr   m19{k2}, m18, m18, 2
   1955    paddusw        m17, m14
   1956    mova           m16, m22
   1957    psraw          m17, 4             ; min(z3, 255) - 256
   1958    vpermt2b       m16, m17, m23      ; sgr_x_by_x[128..255]
   1959    vpmovb2m        k3, m17
   1960    vpermi2b       m17, m20, m21      ; sgr_x_by_x[  0..127]
   1961    paddusw        m19, m14
   1962    mova           m18, m22
   1963    psraw          m19, 4             ; min(z5, 255) - 256
   1964    vpermt2b       m18, m19, m23      ; sgr_x_by_x[128..255]
   1965    vpmovb2m        k4, m19
   1966    vpermi2b       m19, m20, m21      ; sgr_x_by_x[  0..127]
   1967    vmovdqu8   m17{k3}, m16           ; x3
   1968    vmovdqu8   m19{k4}, m18           ; x5
   1969    pandn          m16, m24, m17
   1970    psrld          m17, 16
   1971    pmulld          m0, m16
   1972    pmulld          m1, m17
   1973    pandn          m18, m24, m19
   1974    psrld          m19, 16
   1975    pmulld          m2, m18
   1976    pmulld          m3, m19
   1977    paddd           m0, m15           ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   1978    paddd           m1, m15
   1979    vpternlogd     m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
   1980    vpternlogd     m17, m1, m24, 0xd8
   1981    mova          [t3+r10*4+416*8+  8], m16
   1982    mova          [t3+r10*4+416*8+ 24], xm17
   1983    vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
   1984    paddd           m2, m15           ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
   1985    paddd           m3, m15
   1986    mova          [t3+r10*4+416*8+ 72], m17
   1987    vextracti128  [t3+r10*4+416*8+ 72], ym16, 1
   1988    vextracti32x4 [t3+r10*4+416*8+104], m16, 3
   1989    vpternlogd     m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
   1990    vpternlogd     m19, m3, m24, 0xd8
   1991    mova          [t3+r10*4+416*0+  8], m18
   1992    mova          [t3+r10*4+416*0+ 24], xm19
   1993    vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
   1994    mova          [t3+r10*4+416*0+ 72], m19
   1995    vextracti128  [t3+r10*4+416*0+ 72], ym18, 1
   1996    vextracti32x4 [t3+r10*4+416*0+104], m18, 3
   1997    add            r10, 32
   1998    jl .v1_loop
   1999    mov            r10, t2
   2000    mov             t2, t1
   2001    mov             t1, r10
   2002    ret
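        ; Output-side neighbor weighting: the 5x5 a/b rows (available on
        ; every other line) are given 5,6,5 horizontal weights ("565"),
        ; and the 3x3 rows alternating 3,4,3 and 2,2,2 weights ("343" and
        ; "222") whose running sums combine so each output line sees a
        ; footprint summing to 32 for both filters. .prep_n seeds these
        ; running sums for the first output row.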
   2003 .prep_n: ; initial neighbor setup
   2004    mov            r10, wq
   2005 .prep_n_loop:
   2006    movu            m0, [t3+r10*4+416*0+4]
   2007    paddd           m1, m0, [t3+r10*4+416*0+0]
   2008    mova           m16, [t3+r10*4+416*4+0]
   2009    paddd           m1, [t3+r10*4+416*0+8]
   2010    mova           m17, [t3+r10*4+416*8+0]
   2011    paddd          m16, [t3+r10*4+416*4+8]
   2012    paddd          m17, [t3+r10*4+416*8+8]
   2013    paddd           m2, m16, [t3+r10*4+416*4+4]
   2014    paddd           m3, m17, [t3+r10*4+416*8+4]
   2015    paddd           m0, m1
   2016    pslld           m1, 2
   2017    pslld           m2, 2
   2018    paddd           m1, m0            ; ab5 565
   2019    paddd           m3, m3            ; ab3[ 0] 222
   2020    psubd           m2, m16           ; ab3[-1] 343
   2021    mova [t3+r10*4+416*20], m3
   2022    pandn           m0, m24, m1       ; a5 565
   2023    mova [t3+r10*4+416*24], m2
   2024    psrld           m1, 12            ; b5 565
   2025    mova [t3+r10*4+416*12], m0
   2026    paddd           m3, m3
   2027    mova [t3+r10*4+416*16], m1
   2028    psubd           m3, m17           ; ab3[ 0] 343
   2029    mova [t3+r10*4+416*28], m3
   2030    add            r10, 16
   2031    jl .prep_n_loop
   2032    ret
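        ; .n0 produces even output rows: it folds the incoming ab5 row
        ; into the 565 running sum, advances the 343/222 chains, then
        ; evaluates both filters as b - a * src and blends them with a
        ; single vpdpwssd against the interleaved (w0, w1) pair, so the
        ; high word of each dword becomes
        ; dst + ((w0 * t5 + w1 * t3 + (1 << 15)) >> 16), clipped by
        ; packuswb and gathered with sgr_mix_perm.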
   2033 ALIGN function_align
   2034 .n0: ; neighbor + output (even rows)
   2035    mov            r10, wq
   2036 .n0_loop:
   2037    movu            m2, [t3+r10*4+4]
   2038    paddd           m3, m2, [t3+r10*4+0]
   2039    paddd           m3, [t3+r10*4+8]
   2040    mova            m1, [t3+r10*4+416*4+0]
   2041    paddd           m2, m3
   2042    pslld           m3, 2
   2043    paddd           m1, [t3+r10*4+416*4+8]
   2044    paddd           m3, m2
   2045    pandn           m2, m24, m3
   2046    psrld           m3, 12
   2047    paddd           m0, m2, [t3+r10*4+416*12] ; a5
   2048    paddd          m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8)
   2049    mova [t3+r10*4+416*12], m2
   2050    mova [t3+r10*4+416*16], m3
   2051    paddd           m2, m1, [t3+r10*4+416*4+4]
   2052    paddd           m2, m2                    ; ab3[ 1] 222
   2053    mova            m3, [t3+r10*4+416*20]
   2054    paddd          m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343
   2055    mova [t3+r10*4+416*20], m2
   2056    paddd           m2, m2
   2057    psubd           m2, m1                    ; ab3[ 1] 343
   2058    mova [t3+r10*4+416*24], m2
   2059    paddd           m2, m3                    ; ab3[ 0] 222 + ab3[ 1] 343
   2060    pandn           m1, m24, m17
   2061    psrld          m17, 12
   2062    pandn           m3, m24, m2
   2063    psrld           m2, 12
   2064    paddd           m1, m3                    ; a3
   2065    pmovzxbd        m3, [dstq+r10]
   2066    paddd          m17, m2                    ; b3 + (1 << 8)
   2067    pmaddwd         m0, m3                    ; a5 * src
   2068    pmaddwd         m1, m3                    ; a3 * src
   2069    vpshldd         m3, m25, 16               ; (dst << 16) + (1 << 15)
   2070    psubd          m16, m0                    ; b5 - a5 * src + (1 << 8)
   2071    psubd          m17, m1                    ; b3 - a3 * src + (1 << 8)
   2072    psrld          m16, 9
   2073    pslld          m17, 7
   2074    vmovdqu8   m17{k2}, m16
   2075    vpdpwssd        m3, m17, m26
   2076    packuswb        m3, m2
   2077    vpermb         m16, m27, m3
   2078    mova    [dstq+r10], xm16
   2079    add            r10, 16
   2080    jl .n0_loop
   2081    add           dstq, strideq
   2082    ret
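        ; .n1 produces odd output rows: the 5x5 side reuses the a5/b5
        ; saved by .n0 (a single 565 row, hence the 1 << 7 rounding and
        ; the one-byte palignr merge standing in for a shift by 8), while
        ; the 3x3 chain advances one row as in .n0.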
   2083 ALIGN function_align
   2084 .n1: ; neighbor + output (odd rows)
   2085    mov            r10, wq
   2086 .n1_loop:
   2087    mova            m1, [t3+r10*4+416*8+0]
   2088    paddd           m1, [t3+r10*4+416*8+8]
   2089    paddd           m2, m1, [t3+r10*4+416*8+4]
   2090    paddd           m2, m2                    ; ab3[ 1] 222
   2091    mova            m0, [t3+r10*4+416*20]
   2092    paddd          m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343
   2093    pmovzxbd        m3, [dstq+r10]
   2094    mova [t3+r10*4+416*20], m2
   2095    paddd           m2, m2
   2096    psubd           m2, m1                    ; ab3[ 1] 343
   2097    mova [t3+r10*4+416*28], m2
   2098    paddd           m0, m2                    ; ab3[ 0] 222 + ab3[ 1] 343
   2099    pandn           m1, m24, m17
   2100    psrld          m17, 12
   2101    pandn           m2, m24, m0
   2102    psrld           m0, 12
   2103    paddd           m1, m2                    ; a3
   2104    paddd          m17, m0                    ; b3 + (1 << 8)
   2105    mova           m16, [t3+r10*4+416*16]     ; b5 + (1 << 7)
   2106    pmaddwd         m1, m3                    ; a3 * src
   2107    pmaddwd         m0, m3, [t3+r10*4+416*12] ; a5 * src
   2108    vpshldd         m3, m25, 16               ; (dst << 16) + (1 << 15)
   2109    psubd          m17, m1                    ; b3 - a3 * src + (1 << 8)
   2110    psubd          m16, m0                    ; b5 - a5 * src + (1 << 7)
   2111    pslld          m17, 7
   2112    palignr    m17{k2}, m16, m16, 1
   2113    vpdpwssd        m3, m17, m26
   2114    packuswb        m3, m3
   2115    vpermb         m16, m27, m3
   2116    mova    [dstq+r10], xm16
   2117    add            r10, 16
   2118    jl .n1_loop
   2119    add           dstq, strideq
   2120    ret
   2121 
   2122 %endif ; ARCH_X86_64