tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

looprestoration_avx2.asm (72785B)


      1 ; Copyright © 2018, VideoLAN and dav1d authors
      2 ; Copyright © 2018, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 SECTION_RODATA 32
     32 
        ; Shuffle masks and constants shared by the Wiener and SGR filters below.
        ; wiener_l_shuf replicates the leftmost pixel when LR_HAVE_LEFT is absent;
        ; the sgr_* tables are also used by the (partially visible) SGR functions.
     33 wiener_l_shuf: db  4,  4,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
     34               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
     35 wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
     36 wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
     37 wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
     38 sgr_l_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
     39 sgr_r_ext:     times 16 db 1
     40               times 16 db 9
     41 sgr_shuf:      db  1, -1,  2, -1,  3, -1,  4, -1,  5, -1,  6, -1,  7, -1,  8, -1
     42               db  9, -1, 10, -1, 11, -1, 12, -1
     43 
        ; Broadcast constants (pb_ = bytes, pw_ = words, pd_ = dwords, pf_ = float).
     44 pb_m5:         times 4 db -5
     45 pb_3:          times 4 db 3
     46 pw_5_6:        dw 5, 6
     47 pw_164_24:     dw 164, 24
     48 pw_455_24:     dw 455, 24
     49 pw_256:        times 2 dw 256
     50 pw_2056:       times 2 dw 2056
     51 pw_m16380:     times 2 dw -16380
     52 pd_25:         dd 25
     53 pd_34816:      dd 34816
     54 pd_m4096:      dd -4096
     55 pf_256:        dd 256.0
     56 
     57 cextern pb_0to63 ; 0..63 byte ramp, defined in a shared constants file
     58 
     59 SECTION .text
     60 
     61 DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
     62 
     63 INIT_YMM avx2
        ; 7-tap Wiener filter, 8 bpc. t1..t6 rotate through a stack ring buffer of
        ; horizontally filtered rows (384*2 bytes per row); .h fills one buffered
        ; row, .hv additionally performs the vertical pass and emits one dst row,
        ; and .v emits a row purely from buffered data.
        ; Argument layout is taken from the loads below (r6mp = flt coefficient
        ; pointer, r7m = edge flags) -- confirm against the C prototype.
     64 cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
     65                                                    w, h, edge, flt
     66    mov           fltq, r6mp
     67    movifnidn       hd, hm
     68    mov          edged, r7m
     69    mov             wd, wm
     70    vbroadcasti128  m6, [wiener_shufA]
     71    vpbroadcastb   m11, [fltq+ 0] ; x0 x0
     72    vbroadcasti128  m7, [wiener_shufB]
     73    vpbroadcastd   m12, [fltq+ 2]
     74    vbroadcasti128  m8, [wiener_shufC]
     75    packsswb       m12, m12       ; x1 x2
     76    vpbroadcastw   m13, [fltq+ 6] ; x3
     77    vbroadcasti128  m9, [sgr_shuf+6]
     78    add           lpfq, wq
     79    vpbroadcastd   m10, [pw_m16380]
     80    vpbroadcastd   m14, [fltq+16] ; y0 y1
     81    add           dstq, wq
     82    vpbroadcastd   m15, [fltq+20] ; y2 y3
     83    lea             t1, [rsp+wq*2+16]
     84    psllw          m14, 5
     85    neg             wq            ; wq < 0; row loops run r10 from wq up to 0
     86    psllw          m15, 5
     87    test         edgeb, 4 ; LR_HAVE_TOP
     88    jz .no_top
     89    call .h_top
     90    add           lpfq, strideq
     91    mov             t6, t1
     92    mov             t5, t1
     93    add             t1, 384*2
     94    call .h_top
     95    lea            r10, [lpfq+strideq*4]
     96    mov           lpfq, dstq
     97    mov             t4, t1
     98    add             t1, 384*2
     99    add            r10, strideq
    100    mov          [rsp], r10 ; below
    101    call .h
    102    mov             t3, t1
    103    mov             t2, t1
    104    dec             hd
    105    jz .v1
    106    add           lpfq, strideq
    107    add             t1, 384*2
    108    call .h
    109    mov             t2, t1
    110    dec             hd
    111    jz .v2
    112    add           lpfq, strideq
    113    add             t1, 384*2
    114    call .h
    115    dec             hd
    116    jz .v3
    117 .main:
    118    lea             t0, [t1+384*2]
    119 .main_loop:
    120    call .hv
    121    dec             hd
    122    jnz .main_loop
    123    test         edgeb, 8 ; LR_HAVE_BOTTOM
    124    jz .v3
    125    mov           lpfq, [rsp]
    126    call .hv_bottom
    127    add           lpfq, strideq
    128    call .hv_bottom
    129 .v1: ; flush the last buffered row(s) with vertical-only passes
    130    call .v
    131    RET
    132 .no_top:
    133    lea            r10, [lpfq+strideq*4]
    134    mov           lpfq, dstq
    135    lea            r10, [r10+strideq*2]
    136    mov          [rsp], r10
    137    call .h
    138    mov             t6, t1        ; no top edge: reuse the first row for t2-t6
    139    mov             t5, t1
    140    mov             t4, t1
    141    mov             t3, t1
    142    mov             t2, t1
    143    dec             hd
    144    jz .v1
    145    add           lpfq, strideq
    146    add             t1, 384*2
    147    call .h
    148    mov             t2, t1
    149    dec             hd
    150    jz .v2
    151    add           lpfq, strideq
    152    add             t1, 384*2
    153    call .h
    154    dec             hd
    155    jz .v3
    156    lea             t0, [t1+384*2]
    157    call .hv
    158    dec             hd
    159    jz .v3
    160    add             t0, 384*8
    161    call .hv
    162    dec             hd
    163    jnz .main
    164 .v3: ; fall through: three, then two, then one vertical flush pass
    165    call .v
    166 .v2:
    167    call .v
    168    jmp .v1
        ; Replicate the last valid pixel into lanes past the right edge.
        ; In: r10d = (negative) loop counter, m4/m5 = loaded row data.
        ; Clobbers m0-m3; shared with wiener_filter5 via a mangled call.
    169 .extend_right:
    170    movd           xm2, r10d
    171    vpbroadcastd    m0, [pb_3]
    172    vpbroadcastd    m1, [pb_m5]
    173    vpbroadcastb    m2, xm2
    174    mova            m3, [pb_0to63]
    175    psubb           m0, m2        ; build per-lane clamped shuffle indices
    176    psubb           m1, m2
    177    pminub          m0, m3
    178    pminub          m1, m3
    179    pshufb          m4, m0
    180    pshufb          m5, m1
    181    ret
    182 .h: ; horizontal 7-tap pass: filter one input row into the t1 buffer row
    183    mov            r10, wq
    184    test         edgeb, 1 ; LR_HAVE_LEFT
    185    jz .h_extend_left
    186    movd           xm4, [leftq]   ; splice the 4-pixel left[] column in
    187    vpblendd        m4, [lpfq+r10-4], 0xfe
    188    add          leftq, 4
    189    jmp .h_main
    190 .h_extend_left:
    191    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
    192    mova            m4, [lpfq+r10] ; before the start of the buffer
    193    palignr         m4, m5, 12
    194    pshufb          m4, [wiener_l_shuf]
    195    jmp .h_main
    196 .h_top: ; like .h but never reads left[] (used for the lpf edge rows)
    197    mov            r10, wq
    198    test         edgeb, 1 ; LR_HAVE_LEFT
    199    jz .h_extend_left
    200 .h_loop:
    201    movu            m4, [lpfq+r10-4]
    202 .h_main:
    203    movu            m5, [lpfq+r10+4]
    204    test         edgeb, 2 ; LR_HAVE_RIGHT
    205    jnz .h_have_right
    206    cmp           r10d, -34      ; only the final iteration can overread
    207    jl .h_have_right
    208    call .extend_right
    209 .h_have_right:
        ; Horizontal FIR: pairwise madd of shuffled taps against x0/x1/x2,
        ; plus the center tap x3 handled separately via pmullw.
    210    pshufb          m0, m4, m6
    211    pmaddubsw       m0, m11
    212    pshufb          m1, m5, m6
    213    pmaddubsw       m1, m11
    214    pshufb          m2, m4, m7
    215    pmaddubsw       m2, m12
    216    pshufb          m3, m5, m7
    217    pmaddubsw       m3, m12
    218    paddw           m0, m2
    219    pshufb          m2, m4, m8
    220    pmaddubsw       m2, m12
    221    paddw           m1, m3
    222    pshufb          m3, m5, m8
    223    pmaddubsw       m3, m12
    224    pshufb          m4, m9
    225    paddw           m0, m2
    226    pmullw          m2, m4, m13
    227    pshufb          m5, m9
    228    paddw           m1, m3
    229    pmullw          m3, m5, m13
    230    psllw           m4, 7
    231    psllw           m5, 7
    232    paddw           m4, m10       ; bias by pw_m16380 before saturating add
    233    paddw           m5, m10
    234    paddw           m0, m2
    235    vpbroadcastd    m2, [pw_2056]
    236    paddw           m1, m3
    237    paddsw          m0, m4
    238    paddsw          m1, m5
    239    psraw           m0, 3
    240    psraw           m1, 3
    241    paddw           m0, m2        ; re-bias by pw_2056 for the vertical pass
    242    paddw           m1, m2
    243    mova [t1+r10*2+ 0], m0
    244    mova [t1+r10*2+32], m1
    245    add            r10, 32
    246    jl .h_loop
    247    ret
    248 ALIGN function_align
    249 .hv: ; horizontal pass on a new row + vertical 7-tap pass emitting one dst row
    250    add           lpfq, strideq
    251    mov            r10, wq
    252    test         edgeb, 1 ; LR_HAVE_LEFT
    253    jz .hv_extend_left
    254    movd           xm4, [leftq]
    255    vpblendd        m4, [lpfq+r10-4], 0xfe
    256    add          leftq, 4
    257    jmp .hv_main
    258 .hv_extend_left:
    259    movu            m4, [lpfq+r10-4]
    260    pshufb          m4, [wiener_l_shuf]
    261    jmp .hv_main
    262 .hv_bottom: ; same as .hv but lpf already points at the bottom edge row
    263    mov            r10, wq
    264    test         edgeb, 1 ; LR_HAVE_LEFT
    265    jz .hv_extend_left
    266 .hv_loop:
    267    movu            m4, [lpfq+r10-4]
    268 .hv_main:
    269    movu            m5, [lpfq+r10+4]
    270    test         edgeb, 2 ; LR_HAVE_RIGHT
    271    jnz .hv_have_right
    272    cmp           r10d, -34
    273    jl .hv_have_right
    274    call .extend_right
    275 .hv_have_right:
        ; Horizontal FIR, identical to .h.
    276    pshufb          m0, m4, m6
    277    pmaddubsw       m0, m11
    278    pshufb          m1, m5, m6
    279    pmaddubsw       m1, m11
    280    pshufb          m2, m4, m7
    281    pmaddubsw       m2, m12
    282    pshufb          m3, m5, m7
    283    pmaddubsw       m3, m12
    284    paddw           m0, m2
    285    pshufb          m2, m4, m8
    286    pmaddubsw       m2, m12
    287    paddw           m1, m3
    288    pshufb          m3, m5, m8
    289    pmaddubsw       m3, m12
    290    pshufb          m4, m9
    291    paddw           m0, m2
    292    pmullw          m2, m4, m13
    293    pshufb          m5, m9
    294    paddw           m1, m3
    295    pmullw          m3, m5, m13
    296    psllw           m4, 7
    297    psllw           m5, 7
    298    paddw           m4, m10
    299    paddw           m5, m10
    300    paddw           m0, m2
    301    paddw           m1, m3
        ; Vertical 7-tap: sum symmetric row pairs from the ring buffer
        ; (t2+t4, t1+t5, current+t6), then madd against y coefficients.
    302    mova            m2, [t4+r10*2]
    303    paddw           m2, [t2+r10*2]
    304    mova            m3, [t3+r10*2]
    305    paddsw          m0, m4
    306    vpbroadcastd    m4, [pw_2056]
    307    paddsw          m1, m5
    308    mova            m5, [t5+r10*2]
    309    paddw           m5, [t1+r10*2]
    310    psraw           m0, 3
    311    psraw           m1, 3
    312    paddw           m0, m4
    313    paddw           m1, m4
    314    paddw           m4, m0, [t6+r10*2]
    315    mova    [t0+r10*2], m0        ; stash the new row for later iterations
    316    punpcklwd       m0, m2, m3
    317    pmaddwd         m0, m15
    318    punpckhwd       m2, m3
    319    pmaddwd         m2, m15
    320    punpcklwd       m3, m4, m5
    321    pmaddwd         m3, m14
    322    punpckhwd       m4, m5
    323    pmaddwd         m4, m14
    324    paddd           m0, m3
    325    paddd           m4, m2
    326    mova            m2, [t4+r10*2+32]
    327    paddw           m2, [t2+r10*2+32]
    328    mova            m3, [t3+r10*2+32]
    329    mova            m5, [t5+r10*2+32]
    330    paddw           m5, [t1+r10*2+32]
    331    packuswb        m0, m4
    332    paddw           m4, m1, [t6+r10*2+32]
    333    mova [t0+r10*2+32], m1
    334    punpcklwd       m1, m2, m3
    335    pmaddwd         m1, m15
    336    punpckhwd       m2, m3
    337    pmaddwd         m2, m15
    338    punpcklwd       m3, m4, m5
    339    pmaddwd         m3, m14
    340    punpckhwd       m4, m5
    341    pmaddwd         m4, m14
    342    paddd           m1, m3
    343    paddd           m2, m4
    344    packuswb        m1, m2
    345    psrlw           m0, 8         ; final round/narrow to 8-bit pixels
    346    psrlw           m1, 8
    347    packuswb        m0, m1
    348    mova    [dstq+r10], m0
    349    add            r10, 32
    350    jl .hv_loop
    351    mov             t6, t5        ; rotate the ring-buffer row pointers
    352    mov             t5, t4
    353    mov             t4, t3
    354    mov             t3, t2
    355    mov             t2, t1
    356    mov             t1, t0
    357    mov             t0, t6        ; oldest slot becomes the next write target
    358    add           dstq, strideq
    359    ret
    360 .v: ; vertical-only 7-tap pass over buffered rows; writes one dst row
    361    mov            r10, wq
    362 .v_loop:
        ; Symmetric pair sums: (t2+t4), t3 center, (t1+t5), (t1+t6 via m8/m9).
    363    mova            m2, [t4+r10*2+ 0]
    364    paddw           m2, [t2+r10*2+ 0]
    365    mova            m4, [t3+r10*2+ 0]
    366    mova            m6, [t1+r10*2+ 0]
    367    paddw           m8, m6, [t6+r10*2+ 0]
    368    paddw           m6, [t5+r10*2+ 0]
    369    mova            m3, [t4+r10*2+32]
    370    paddw           m3, [t2+r10*2+32]
    371    mova            m5, [t3+r10*2+32]
    372    mova            m7, [t1+r10*2+32]
    373    paddw           m9, m7, [t6+r10*2+32]
    374    paddw           m7, [t5+r10*2+32]
    375    punpcklwd       m0, m2, m4
    376    pmaddwd         m0, m15
    377    punpckhwd       m2, m4
    378    pmaddwd         m2, m15
    379    punpcklwd       m4, m8, m6
    380    pmaddwd         m4, m14
    381    punpckhwd       m6, m8, m6
    382    pmaddwd         m6, m14
    383    punpcklwd       m1, m3, m5
    384    pmaddwd         m1, m15
    385    punpckhwd       m3, m5
    386    pmaddwd         m3, m15
    387    punpcklwd       m5, m9, m7
    388    pmaddwd         m5, m14
    389    punpckhwd       m7, m9, m7
    390    pmaddwd         m7, m14
    391    paddd           m0, m4
    392    paddd           m2, m6
    393    paddd           m1, m5
    394    paddd           m3, m7
    395    packuswb        m0, m2
    396    packuswb        m1, m3
    397    psrlw           m0, 8         ; round/narrow to 8-bit pixels
    398    psrlw           m1, 8
    399    packuswb        m0, m1
    400    mova    [dstq+r10], m0
    401    add            r10, 32
    402    jl .v_loop
    403    mov             t6, t5        ; rotate row pointers for the next call
    404    mov             t5, t4
    405    mov             t4, t3
    406    mov             t3, t2
    407    mov             t2, t1
    408    add           dstq, strideq
    409    ret
    410 
        ; 5-tap Wiener filter, 8 bpc: same ring-buffer scheme as the 7-tap filter
        ; above, but with only four buffered rows (t1-t4) and a 384*8-byte stack
        ; area. Coefficient x0/y0 are unused for the 5-tap case (note "__ y1").
    411 cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
    412                                                  w, h, edge, flt
    413    mov           fltq, r6mp
    414    movifnidn       hd, hm
    415    mov          edged, r7m
    416    mov             wd, wm
    417    vbroadcasti128  m6, [wiener_shufB]
    418    vpbroadcastd   m12, [fltq+ 2]
    419    vbroadcasti128  m7, [wiener_shufC]
    420    packsswb       m12, m12       ; x1 x2
    421    vpbroadcastw   m13, [fltq+ 6] ; x3
    422    vbroadcasti128  m8, [sgr_shuf+6]
    423    add           lpfq, wq
    424    vpbroadcastd    m9, [pw_m16380]
    425    vpbroadcastd   m10, [pw_2056]
    426    mova           m11, [wiener_l_shuf]
    427    vpbroadcastd   m14, [fltq+16] ; __ y1
    428    add           dstq, wq
    429    vpbroadcastd   m15, [fltq+20] ; y2 y3
    430    lea             t1, [rsp+wq*2+16]
    431    psllw          m14, 5
    432    neg             wq            ; wq < 0; row loops run r10 from wq up to 0
    433    psllw          m15, 5
    434    test         edgeb, 4 ; LR_HAVE_TOP
    435    jz .no_top
    436    call .h_top
    437    add           lpfq, strideq
    438    mov             t4, t1
    439    add             t1, 384*2
    440    call .h_top
    441    lea            r10, [lpfq+strideq*4]
    442    mov           lpfq, dstq
    443    mov             t3, t1
    444    add             t1, 384*2
    445    add            r10, strideq
    446    mov          [rsp], r10 ; below
    447    call .h
    448    mov             t2, t1
    449    dec             hd
    450    jz .v1
    451    add           lpfq, strideq
    452    add             t1, 384*2
    453    call .h
    454    dec             hd
    455    jz .v2
    456 .main:
    457    mov             t0, t4
    458 .main_loop:
    459    call .hv
    460    dec             hd
    461    jnz .main_loop
    462    test         edgeb, 8 ; LR_HAVE_BOTTOM
    463    jz .v2
    464    mov           lpfq, [rsp]
    465    call .hv_bottom
    466    add           lpfq, strideq
    467    call .hv_bottom
    468 .end:
    469    RET
    470 .no_top:
    471    lea            r10, [lpfq+strideq*4]
    472    mov           lpfq, dstq
    473    lea            r10, [r10+strideq*2]
    474    mov          [rsp], r10
    475    call .h
    476    mov             t4, t1        ; no top edge: reuse the first row for t2-t4
    477    mov             t3, t1
    478    mov             t2, t1
    479    dec             hd
    480    jz .v1
    481    add           lpfq, strideq
    482    add             t1, 384*2
    483    call .h
    484    dec             hd
    485    jz .v2
    486    lea             t0, [t1+384*2]
    487    call .hv
    488    dec             hd
    489    jz .v2
    490    add             t0, 384*6
    491    call .hv
    492    dec             hd
    493    jnz .main
    494 .v2: ; flush remaining buffered rows with vertical-only passes
    495    call .v
    496    mov             t4, t3
    497    mov             t3, t2
    498    mov             t2, t1
    499    add           dstq, strideq
    500 .v1:
    501    call .v
    502    jmp .end
    503 .h: ; horizontal 5-tap pass: filter one input row into the t1 buffer row
    504    mov            r10, wq
    505    test         edgeb, 1 ; LR_HAVE_LEFT
    506    jz .h_extend_left
    507    movd           xm4, [leftq]   ; splice the 4-pixel left[] column in
    508    vpblendd        m4, [lpfq+r10-4], 0xfe
    509    add          leftq, 4
    510    jmp .h_main
    511 .h_extend_left:
    512    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
    513    mova            m4, [lpfq+r10] ; before the start of the buffer
    514    palignr         m4, m5, 12
    515    pshufb          m4, m11
    516    jmp .h_main
    517 .h_top: ; like .h but never reads left[] (used for the lpf edge rows)
    518    mov            r10, wq
    519    test         edgeb, 1 ; LR_HAVE_LEFT
    520    jz .h_extend_left
    521 .h_loop:
    522    movu            m4, [lpfq+r10-4]
    523 .h_main:
    524    movu            m5, [lpfq+r10+4]
    525    test         edgeb, 2 ; LR_HAVE_RIGHT
    526    jnz .h_have_right
    527    cmp           r10d, -33      ; only the final iteration can overread
    528    jl .h_have_right
        ; Reuse the 7-tap function's right-edge extension helper.
    529    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
    530 .h_have_right:
        ; Horizontal FIR: x1/x2 via pmaddubsw, center tap x3 via pmullw.
    531    pshufb          m0, m4, m6
    532    pmaddubsw       m0, m12
    533    pshufb          m1, m5, m6
    534    pmaddubsw       m1, m12
    535    pshufb          m2, m4, m7
    536    pmaddubsw       m2, m12
    537    pshufb          m3, m5, m7
    538    pmaddubsw       m3, m12
    539    pshufb          m4, m8
    540    paddw           m0, m2
    541    pmullw          m2, m4, m13
    542    pshufb          m5, m8
    543    paddw           m1, m3
    544    pmullw          m3, m5, m13
    545    psllw           m4, 7
    546    psllw           m5, 7
    547    paddw           m4, m9        ; bias by pw_m16380 before saturating add
    548    paddw           m5, m9
    549    paddw           m0, m2
    550    paddw           m1, m3
    551    paddsw          m0, m4
    552    paddsw          m1, m5
    553    psraw           m0, 3
    554    psraw           m1, 3
    555    paddw           m0, m10       ; re-bias by pw_2056 for the vertical pass
    556    paddw           m1, m10
    557    mova [t1+r10*2+ 0], m0
    558    mova [t1+r10*2+32], m1
    559    add            r10, 32
    560    jl .h_loop
    561    ret
    562 ALIGN function_align
    563 .hv: ; horizontal pass on a new row + vertical 5-tap pass emitting one dst row
    564    add           lpfq, strideq
    565    mov            r10, wq
    566    test         edgeb, 1 ; LR_HAVE_LEFT
    567    jz .hv_extend_left
    568    movd           xm4, [leftq]
    569    vpblendd        m4, [lpfq+r10-4], 0xfe
    570    add          leftq, 4
    571    jmp .hv_main
    572 .hv_extend_left:
    573    movu            m4, [lpfq+r10-4]
    574    pshufb          m4, m11
    575    jmp .hv_main
    576 .hv_bottom: ; same as .hv but lpf already points at the bottom edge row
    577    mov            r10, wq
    578    test         edgeb, 1 ; LR_HAVE_LEFT
    579    jz .hv_extend_left
    580 .hv_loop:
    581    movu            m4, [lpfq+r10-4]
    582 .hv_main:
    583    movu            m5, [lpfq+r10+4]
    584    test         edgeb, 2 ; LR_HAVE_RIGHT
    585    jnz .hv_have_right
    586    cmp           r10d, -33
    587    jl .hv_have_right
    588    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
    589 .hv_have_right:
        ; Horizontal FIR, identical to .h.
    590    pshufb          m0, m4, m6
    591    pmaddubsw       m0, m12
    592    pshufb          m1, m5, m6
    593    pmaddubsw       m1, m12
    594    pshufb          m2, m4, m7
    595    pmaddubsw       m2, m12
    596    pshufb          m3, m5, m7
    597    pmaddubsw       m3, m12
    598    pshufb          m4, m8
    599    paddw           m0, m2
    600    pmullw          m2, m4, m13
    601    pshufb          m5, m8
    602    paddw           m1, m3
    603    pmullw          m3, m5, m13
    604    psllw           m4, 7
    605    psllw           m5, 7
    606    paddw           m4, m9
    607    paddw           m5, m9
    608    paddw           m0, m2
    609    paddw           m1, m3
        ; Vertical 5-tap: symmetric row pairs (t1+t3), t2 center, current+t4.
    610    mova            m2, [t3+r10*2]
    611    paddw           m2, [t1+r10*2]
    612    mova            m3, [t2+r10*2]
    613    paddsw          m0, m4
    614    paddsw          m1, m5
    615    psraw           m0, 3
    616    psraw           m1, 3
    617    paddw           m0, m10
    618    paddw           m1, m10
    619    paddw           m4, m0, [t4+r10*2]
    620    mova    [t0+r10*2], m0        ; stash the new row for later iterations
    621    punpcklwd       m0, m2, m3
    622    pmaddwd         m0, m15
    623    punpckhwd       m2, m3
    624    pmaddwd         m2, m15
    625    punpcklwd       m3, m4, m4
    626    pmaddwd         m3, m14
    627    punpckhwd       m4, m4
    628    pmaddwd         m4, m14
    629    paddd           m0, m3
    630    paddd           m4, m2
    631    mova            m2, [t3+r10*2+32]
    632    paddw           m2, [t1+r10*2+32]
    633    mova            m3, [t2+r10*2+32]
    634    packuswb        m0, m4
    635    paddw           m4, m1, [t4+r10*2+32]
    636    mova [t0+r10*2+32], m1
    637    punpcklwd       m1, m2, m3
    638    pmaddwd         m1, m15
    639    punpckhwd       m2, m3
    640    pmaddwd         m2, m15
    641    punpcklwd       m3, m4, m4
    642    pmaddwd         m3, m14
    643    punpckhwd       m4, m4
    644    pmaddwd         m4, m14
    645    paddd           m1, m3
    646    paddd           m2, m4
    647    packuswb        m1, m2
    648    psrlw           m0, 8         ; final round/narrow to 8-bit pixels
    649    psrlw           m1, 8
    650    packuswb        m0, m1
    651    mova    [dstq+r10], m0
    652    add            r10, 32
    653    jl .hv_loop
    654    mov             t4, t3        ; rotate the ring-buffer row pointers
    655    mov             t3, t2
    656    mov             t2, t1
    657    mov             t1, t0
    658    mov             t0, t4        ; oldest slot becomes the next write target
    659    add           dstq, strideq
    660    ret
    661 .v: ; vertical-only 5-tap pass over buffered rows; writes one dst row
    662    mov            r10, wq
    663    psrld          m13, m14, 16 ; y1 __
    664 .v_loop:
        ; Pair sums: (t1+t3), t2 center; t1+t4 handled asymmetrically via m13.
    665    mova            m6, [t1+r10*2+ 0]
    666    paddw           m2, m6, [t3+r10*2+ 0]
    667    mova            m4, [t2+r10*2+ 0]
    668    mova            m7, [t1+r10*2+32]
    669    paddw           m3, m7, [t3+r10*2+32]
    670    mova            m5, [t2+r10*2+32]
    671    paddw           m6, [t4+r10*2+ 0]
    672    paddw           m7, [t4+r10*2+32]
    673    punpcklwd       m0, m2, m4
    674    pmaddwd         m0, m15
    675    punpckhwd       m2, m4
    676    pmaddwd         m2, m15
    677    punpcklwd       m1, m3, m5
    678    pmaddwd         m1, m15
    679    punpckhwd       m3, m5
    680    pmaddwd         m3, m15
    681    punpcklwd       m5, m7, m6
    682    pmaddwd         m4, m5, m14
    683    punpckhwd       m7, m6
    684    pmaddwd         m6, m7, m14
    685    pmaddwd         m5, m13
    686    pmaddwd         m7, m13
    687    paddd           m0, m4
    688    paddd           m2, m6
    689    paddd           m1, m5
    690    paddd           m3, m7
    691    packuswb        m0, m2
    692    packuswb        m1, m3
    693    psrlw           m0, 8         ; round/narrow to 8-bit pixels
    694    psrlw           m1, 8
    695    packuswb        m0, m1
    696    mova    [dstq+r10], m0
    697    add            r10, 32
    698    jl .v_loop
    699    ret
    700 
    701 cglobal sgr_filter_5x5_8bpc, 4, 12, 16, 400*24+16, dst, stride, left, lpf, \
    702                                                   w, h, edge, params
    703    mov        paramsq, r6mp
    704    mov             wd, wm
    705    movifnidn       hd, hm
    706    vbroadcasti128  m8, [sgr_shuf+0]
    707    mov          edged, r7m
    708    vbroadcasti128  m9, [sgr_shuf+8]
    709    add           lpfq, wq
    710    vbroadcasti128 m10, [sgr_shuf+2]
    711    add           dstq, wq
    712    vbroadcasti128 m11, [sgr_shuf+6]
    713    lea             t3, [rsp+wq*4+16+400*12]
    714    vpbroadcastw    m7, [paramsq+8] ; w0
    715    pxor            m6, m6
    716    vpbroadcastd   m12, [paramsq+0] ; s0
    717    lea             t1, [rsp+wq*2+20]
    718    vpbroadcastd   m13, [pw_164_24]
    719    neg             wq
    720    vbroadcastss   m14, [pf_256]
    721    psllw           m7, 4
    722    vpbroadcastd   m15, [pd_m4096]
    723    test         edgeb, 4 ; LR_HAVE_TOP
    724    jz .no_top
    725    call .h_top
    726    add           lpfq, strideq
    727    mov             t2, t1
    728    call .top_fixup
    729    add             t1, 400*6
    730    call .h_top
    731    lea            r10, [lpfq+strideq*4]
    732    mov           lpfq, dstq
    733    add            r10, strideq
    734    mov          [rsp], r10 ; below
    735    mov             t0, t2
    736    dec             hd
    737    jz .height1
    738    or           edged, 16
    739    call .h
    740 .main:
    741    add           lpfq, strideq
    742    call .hv
    743    call .prep_n
    744    sub             hd, 2
    745    jl .extend_bottom
    746 .main_loop:
    747    add           lpfq, strideq
    748    test            hd, hd
    749    jz .odd_height
    750    call .h
    751    add           lpfq, strideq
    752    call .hv
    753    call .n0
    754    call .n1
    755    sub             hd, 2
    756    jge .main_loop
    757    test         edgeb, 8 ; LR_HAVE_BOTTOM
    758    jz .extend_bottom
    759    mov           lpfq, [rsp]
    760    call .h_top
    761    add           lpfq, strideq
    762    call .hv_bottom
    763 .end:
    764    call .n0
    765    call .n1
    766 .end2:
    767    RET
    768 .height1:
    769    call .hv
    770    call .prep_n
    771    jmp .odd_height_end
    772 .odd_height:
    773    call .hv
    774    call .n0
    775    call .n1
    776 .odd_height_end:
    777    call .v
    778    call .n0
    779    jmp .end2
    780 .extend_bottom:
    781    call .v
    782    jmp .end
    783 .no_top:
    784    lea            r10, [lpfq+strideq*4]
    785    mov           lpfq, dstq
    786    lea            r10, [r10+strideq*2]
    787    mov          [rsp], r10
    788    call .h
    789    lea             t2, [t1+400*6]
    790    call .top_fixup
    791    dec             hd
    792    jz .no_top_height1
    793    or           edged, 16
    794    mov             t0, t1
    795    mov             t1, t2
    796    jmp .main
    797 .no_top_height1:
    798    call .v
    799    call .prep_n
    800    jmp .odd_height_end
    801 .extend_right:
    802    movd           xm2, r10d
    803    mova            m0, [sgr_r_ext]
    804    vpbroadcastb    m2, xm2
    805    psubb           m0, m2
    806    pminub          m0, [pb_0to63]
    807    pshufb          m5, m0
    808    ret
    809 .h: ; horizontal boxsum
    810    lea            r10, [wq-2]
    811    test         edgeb, 1 ; LR_HAVE_LEFT
    812    jz .h_extend_left
    813    vpbroadcastd   xm0, [leftq]
    814    mova           xm5, [lpfq+wq]
    815    palignr        xm5, xm0, 12
    816    add          leftq, 4
    817    jmp .h_main
    818 .h_extend_left:
    819    mova           xm5, [lpfq+wq]
    820    pshufb         xm5, [sgr_l_shuf]
    821    jmp .h_main
    822 .h_top:
    823    lea            r10, [wq-2]
    824    test         edgeb, 1 ; LR_HAVE_LEFT
    825    jz .h_extend_left
    826 .h_loop:
    827    movu           xm5, [lpfq+r10-2]
    828 .h_main:
    829    vinserti128     m5, [lpfq+r10+6], 1
    830    test         edgeb, 2 ; LR_HAVE_RIGHT
    831    jnz .h_have_right
    832    cmp           r10d, -18
    833    jl .h_have_right
    834    call .extend_right
    835 .h_have_right:
    836    pshufb          m3, m5, m8
    837    pmullw          m4, m3, m3
    838    pshufb          m2, m5, m9
    839    paddw           m0, m3, m2
    840    shufps          m3, m2, q2121
    841    paddw           m0, m3
    842    punpcklwd       m1, m2, m3
    843    pmaddwd         m1, m1
    844    punpckhwd       m2, m3
    845    pmaddwd         m2, m2
    846    punpcklwd       m3, m4, m6
    847    paddd           m1, m3
    848    punpckhwd       m4, m6
    849    paddd           m2, m4
    850    pshufb          m4, m5, m10
    851    paddw           m0, m4
    852    pshufb          m5, m11
    853    paddw           m0, m5 ; sum
    854    punpcklwd       m3, m4, m5
    855    pmaddwd         m3, m3
    856    punpckhwd       m4, m5
    857    pmaddwd         m4, m4
    858    test         edgeb, 16 ; y > 0
    859    jz .h_loop_end
    860    paddw           m0, [t1+r10*2+400*0]
    861    paddd           m1, [t1+r10*2+400*2]
    862    paddd           m2, [t1+r10*2+400*4]
    863 .h_loop_end:
    864    paddd           m1, m3 ; sumsq
    865    paddd           m2, m4
    866    mova [t1+r10*2+400*0], m0
    867    mova [t1+r10*2+400*2], m1
    868    mova [t1+r10*2+400*4], m2
    869    add            r10, 16
    870    jl .h_loop
    871    ret
    872 .top_fixup:
    873    lea            r10, [wq-2]
    874 .top_fixup_loop: ; the sums of the first row needs to be doubled
    875    mova            m0, [t1+r10*2+400*0]
    876    mova            m1, [t1+r10*2+400*2]
    877    mova            m2, [t1+r10*2+400*4]
    878    paddw           m0, m0
    879    paddd           m1, m1
    880    paddd           m2, m2
    881    mova [t2+r10*2+400*0], m0
    882    mova [t2+r10*2+400*2], m1
    883    mova [t2+r10*2+400*4], m2
    884    add            r10, 16
    885    jl .top_fixup_loop
    886    ret
ALIGN function_align
; Per-row worker: horizontal box sums of the new row, accumulated with the
; sums of the previous rows held in t1/t2, followed by the per-pixel a/b
; surface computation.  Results are packed as a | (b << 12) into t3.
; r10 walks from -(w-2) up to 0, 16 pixels per iteration.
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    vpbroadcastd   xm0, [leftq]          ; caller-provided left-edge pixels
    mova           xm5, [lpfq+wq]
    palignr        xm5, xm0, 12          ; prepend them before the row start
    add          leftq, 4
    jmp .hv_main
.hv_extend_left:
    mova           xm5, [lpfq+wq]
    pshufb         xm5, [sgr_l_shuf]     ; pad by replicating the leftmost pixel
    jmp .hv_main
.hv_bottom:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu           xm5, [lpfq+r10-2]
.hv_main:
    vinserti128     m5, [lpfq+r10+6], 1  ; second lane covers pixels +8..+23
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -18              ; only the last iteration can overrun
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    ; Horizontal 5-tap box sum/sumsq via the tap shuffles in m8-m11
    ; (m6 is zero, used for widening words to dwords).
    pshufb          m1, m5, m8
    pmullw          m4, m1, m1
    pshufb          m3, m5, m9
    paddw           m0, m1, m3
    shufps          m1, m3, q2121
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    punpcklwd       m1, m4, m6
    paddd           m2, m1
    punpckhwd       m4, m6
    paddd           m3, m4
    pshufb          m1, m5, m10
    paddw           m0, m1
    pshufb          m5, m11
    paddw           m0, m5               ; h sum
    punpcklwd       m4, m5, m1
    pmaddwd         m4, m4
    punpckhwd       m5, m1
    pmaddwd         m5, m5
    paddw           m1, m0, [t1+r10*2+400*0]
    paddd           m2, m4               ; h sumsq
    paddd           m3, m5
    paddd           m4, m2, [t1+r10*2+400*2]
    paddd           m5, m3, [t1+r10*2+400*4]
    test            hd, hd               ; h == 0: last row of an odd height
    jz .hv_last_row
.hv_main2:
    paddw           m1, [t2+r10*2+400*0] ; hv sum
    paddd           m4, [t2+r10*2+400*2] ; hv sumsq
    paddd           m5, [t2+r10*2+400*4]
    mova [t0+r10*2+400*0], m0            ; stash this row's h sums for reuse
    mova [t0+r10*2+400*2], m2
    mova [t0+r10*2+400*4], m3
    vpbroadcastd    m2, [pd_25]
    punpcklwd       m0, m1, m6           ; b
    punpckhwd       m1, m6
    pmulld          m4, m2               ; a * 25
    pmulld          m5, m2
    pmaddwd         m2, m0, m0           ; b * b
    pmaddwd         m3, m1, m1
    psubd           m4, m2               ; p
    psubd           m5, m3
    pmulld          m4, m12              ; p * s
    pmulld          m5, m12
    pmaddwd         m0, m13              ; b * 164
    pmaddwd         m1, m13
    paddw           m4, m13              ; word-add of a packed bias held in
    paddw           m5, m13              ; m13 -- NOTE(review): constant set
                                         ; outside this view; verify at setup
    psrld           m4, 20               ; z + 1
    psrld           m5, 20
    ; Division 256/(z+1) is done in float via rcpps, then clamped to 255.
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4               ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m14, m4
    pcmpgtd         m5, m14, m5
    mulps           m2, m14              ; 256 / (z + 1)
    mulps           m3, m14
    psrld           m4, 24               ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4               ; x
    pminsw          m3, m5
    vpbroadcastd    m4, [pd_34816]
    pmulld          m0, m2
    pmulld          m1, m3
    paddd           m0, m4               ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m4
    pand            m0, m15
    pand            m1, m15
    por             m0, m2               ; a | (b << 12)
    por             m1, m3
    mova         [t3+r10*4+ 8], xm0      ; The neighbor calculations require
    vextracti128 [t3+r10*4+40], m0, 1    ; 13 bits for a and 21 bits for b.
    mova         [t3+r10*4+24], xm1      ; Packing them allows for 12+20, but
    vextracti128 [t3+r10*4+56], m1, 1    ; that gets us most of the way.
    add            r10, 16
    jl .hv_loop
    mov             t2, t1               ; rotate the row-sum ring buffers
    mov             t1, t0
    mov             t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    ; The final row stands in for the missing row below it: store the
    ; current sums into t1 and double the new row's contribution before
    ; rejoining the normal path at .hv_main2.
    mova [t1+r10*2+400*0], m1
    paddw              m1, m0
    mova [t1+r10*2+400*2], m4
    paddd              m4, m2
    mova [t1+r10*2+400*4], m5
    paddd              m5, m3
    jmp .hv_main2
; Vertical-only pass: no new input row is read; the stored horizontal sums
; in t1 are doubled (the current row substitutes for the missing next row)
; and combined with t2, then the same a/b computation as in .hv runs.
.v: ; vertical boxsum + ab
    lea            r10, [wq-2]
.v_loop:
    mova            m0, [t1+r10*2+400*0] ; h sum
    mova            m2, [t1+r10*2+400*2] ; h sumsq lo
    mova            m3, [t1+r10*2+400*4] ; h sumsq hi
    paddw           m1, m0, [t2+r10*2+400*0]
    paddd           m4, m2, [t2+r10*2+400*2]
    paddd           m5, m3, [t2+r10*2+400*4]
    paddw           m0, m0               ; double the current row
    paddd           m2, m2
    paddd           m3, m3
    paddw           m1, m0               ; hv sum
    paddd           m4, m2               ; hv sumsq
    paddd           m5, m3
    vpbroadcastd    m2, [pd_25]
    punpcklwd       m0, m1, m6           ; b
    punpckhwd       m1, m6
    pmulld          m4, m2               ; a * 25
    pmulld          m5, m2
    pmaddwd         m2, m0, m0           ; b * b
    pmaddwd         m3, m1, m1
    psubd           m4, m2               ; p
    psubd           m5, m3
    pmulld          m4, m12              ; p * s
    pmulld          m5, m12
    pmaddwd         m0, m13              ; b * 164
    pmaddwd         m1, m13
    paddw           m4, m13              ; packed rounding bias (see .hv)
    paddw           m5, m13
    psrld           m4, 20               ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4               ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m14, m4
    pcmpgtd         m5, m14, m5
    mulps           m2, m14              ; 256 / (z + 1)
    mulps           m3, m14
    psrld           m4, 24               ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4               ; x
    pminsw          m3, m5
    vpbroadcastd    m4, [pd_34816]
    pmulld          m0, m2
    pmulld          m1, m3
    paddd           m0, m4               ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m4
    pand            m0, m15
    pand            m1, m15
    por             m0, m2               ; a | (b << 12)
    por             m1, m3
    mova         [t3+r10*4+ 8], xm0
    vextracti128 [t3+r10*4+40], m0, 1
    mova         [t3+r10*4+24], xm1
    vextracti128 [t3+r10*4+56], m1, 1
    add            r10, 16
    jl .v_loop
    ret
; Combine each packed a|b value with its two horizontal neighbors using
; weights 5,6,5 (sum3 = l+m+r; result = 4*sum3 + (m + sum3) = 5l+6m+5r),
; then split into a (masked) and b (shifted) planes stored after t3.
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    movu            m0, [t3+r10*4+ 4]    ; middle element m
    movu            m1, [t3+r10*4+36]
    paddd           m2, m0, [t3+r10*4+ 0] ; + left neighbor
    paddd           m3, m1, [t3+r10*4+32]
    paddd           m2, [t3+r10*4+ 8]    ; + right neighbor -> sum3
    paddd           m3, [t3+r10*4+40]
    paddd           m0, m2               ; m + sum3
    pslld           m2, 2                ; 4 * sum3
    paddd           m1, m3
    pslld           m3, 2
    paddd           m2, m0                ; ab 565
    paddd           m3, m1
    pandn           m0, m15, m2           ; a
    psrld           m2, 12                ; b
    pandn           m1, m15, m3
    psrld           m3, 12
    mova [t3+r10*4+400*4+ 0], m0
    mova [t3+r10*4+400*8+ 0], m2
    mova [t3+r10*4+400*4+32], m1
    mova [t3+r10*4+400*8+32], m3
    add            r10, 16
    jl .prep_n_loop
    ret
ALIGN function_align
; Even rows: recompute the 565-weighted neighbor sums for the new row,
; add them to the previous row's stored sums (two-row box), apply the
; a*src+b filter to 16 destination pixels and write them back.
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    ; Same 5,6,5 horizontal weighting as .prep_n.
    movu            m0, [t3+r10*4+ 4]
    movu            m1, [t3+r10*4+36]
    paddd           m2, m0, [t3+r10*4+ 0]
    paddd           m3, m1, [t3+r10*4+32]
    paddd           m2, [t3+r10*4+ 8]
    paddd           m3, [t3+r10*4+40]
    paddd           m0, m2
    pslld           m2, 2
    paddd           m1, m3
    pslld           m3, 2
    paddd           m2, m0
    paddd           m3, m1
    pandn           m0, m15, m2
    psrld           m2, 12
    pandn           m1, m15, m3
    psrld           m3, 12
    ; Two-row sums: add the previous row's values, then replace the
    ; stored values with this row's so odd rows (.n1) can reuse them.
    paddd           m4, m0, [t3+r10*4+400*4+ 0] ; a
    paddd           m5, m1, [t3+r10*4+400*4+32]
    mova [t3+r10*4+400*4+ 0], m0
    mova [t3+r10*4+400*4+32], m1
    paddd           m0, m2, [t3+r10*4+400*8+ 0] ; b
    paddd           m1, m3, [t3+r10*4+400*8+32]
    mova [t3+r10*4+400*8+ 0], m2
    mova [t3+r10*4+400*8+32], m3
    pmovzxbd        m2, [dstq+r10+0]     ; 8-bit source pixels -> dwords
    pmovzxbd        m3, [dstq+r10+8]
    pmaddwd         m4, m2 ; a * src
    pmaddwd         m5, m3
    packssdw        m2, m3
    psubd           m0, m4 ; b - a * src + (1 << 8)
    psubd           m1, m5
    psrad           m0, 9
    psrad           m1, 9
    packssdw        m0, m1
    pmulhrsw        m0, m7               ; apply filter weight w0
    paddw           m0, m2               ; add back the source pixels
    vextracti128   xm1, m0, 1
    packuswb       xm0, xm1
    pshufd         xm0, xm0, q3120       ; undo lane interleave from pack
    mova    [dstq+r10], xm0
    add            r10, 16
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
; Odd rows: no neighbor work needed -- reuse the a/b sums stored by .n0
; directly (single-row weighting, hence the smaller shift of 8 vs 9).
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    pmovzxbd        m2, [dstq+r10+0]     ; 8-bit source pixels -> dwords
    pmovzxbd        m3, [dstq+r10+8]
    pmaddwd         m4, m2, [t3+r10*4+400*4+ 0] ; a * src
    pmaddwd         m5, m3, [t3+r10*4+400*4+32]
    mova            m0, [t3+r10*4+400*8+ 0]     ; b
    mova            m1, [t3+r10*4+400*8+32]
    packssdw        m2, m3
    psubd           m0, m4                      ; b - a * src + (1 << 7)
    psubd           m1, m5
    psrad           m0, 8
    psrad           m1, 8
    packssdw        m0, m1
    pmulhrsw        m0, m7               ; apply filter weight w0
    paddw           m0, m2               ; add back the source pixels
    vextracti128   xm1, m0, 1
    packuswb       xm0, xm1
    pshufd         xm0, xm0, q3120       ; undo lane interleave from pack
    mova    [dstq+r10], xm0
    add            r10, 16
    jl .n1_loop
    add           dstq, strideq
    ret
   1171 
; SGR 3x3 box filter, 8 bpc. Register layout (set up below):
;   m7  = w1 (filter weight)     m8-m10 = horizontal tap shuffles
;   m11 = s1                     m12    = pw_455_24 (mult + bias)
;   m13 = 256.0f                 m14    = pd_34816 rounding
;   m15 = pd_m4096 a/b pack mask m6     = zero
;   t1/t2 = row-sum ring buffers on the stack, t3 = packed a|b surface.
cglobal sgr_filter_3x3_8bpc, 4, 14, 16, -400*28-16, dst, stride, left, lpf, \
                                                   w, h, edge, params
    mov        paramsq, r6mp
    mov             wd, wm
    movifnidn       hd, hm
    vbroadcasti128  m8, [sgr_shuf+2]
    mov          edged, r7m
    vbroadcasti128  m9, [sgr_shuf+4]
    add           lpfq, wq               ; point at the row end; negative wq
    vbroadcasti128 m10, [sgr_shuf+6]     ; indexes back from it
    add           dstq, wq
    vpbroadcastw    m7, [paramsq+10] ; w1
    lea             t3, [rsp+wq*4+16+400*12]
    vpbroadcastd   m11, [paramsq+ 4] ; s1
    pxor            m6, m6
    vpbroadcastd   m12, [pw_455_24]
    lea             t1, [rsp+wq*2+20]
    vbroadcastss   m13, [pf_256]
    neg             wq
    vpbroadcastd   m14, [pd_34816] ; (1 << 11) + (1 << 15)
    psllw           m7, 4
    vpbroadcastd   m15, [pd_m4096]
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    add             t1, 400*6
    call .h_top
    lea             t4, [lpfq+strideq*4]
    mov           lpfq, dstq
    add             t4, strideq
    mov          [rsp], t4 ; below
    mov             t0, t2
    call .hv
.main:
    mov             t5, t3
    add             t3, 400*4
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv
    call .prep_n
    dec             hd
    jz .extend_bottom
.main_loop:
    add           lpfq, strideq
    call .hv
    call .n
    dec             hd
    jnz .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]            ; switch to the saved below-row ptr
    call .hv_bottom
    call .n
    add           lpfq, strideq
    call .hv_bottom
.end:
    call .n
    RET
.height1:
    call .v
    call .prep_n
    mov             t2, t1
    call .v
    jmp .end
.extend_bottom:
    ; No bottom edge available: replay the last row via the v-only pass.
    call .v
    call .n
    mov             t2, t1
    call .v
    jmp .end
.no_top:
    lea             t4, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea             t4, [t4+strideq*2]
    mov          [rsp], t4
    call .h
    lea             t0, [t1+400*6]
    mov             t2, t1
    call .v
    jmp .main
; Horizontal 3-tap box sum/sumsq of one row into t1. Shares the
; right-edge extension helper with the 5x5 filter.
.h: ; horizontal boxsum
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastd   xm0, [leftq]          ; caller-provided left-edge pixels
    mova           xm5, [lpfq+wq]
    palignr        xm5, xm0, 12
    add          leftq, 4
    jmp .h_main
.h_extend_left:
    mova           xm5, [lpfq+wq]
    pshufb         xm5, [sgr_l_shuf]     ; pad by replicating the leftmost pixel
    jmp .h_main
.h_top:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu           xm5, [lpfq+r10-2]
.h_main:
    vinserti128     m5, [lpfq+r10+6], 1
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -17              ; narrower filter than 5x5 -> -17
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.h_have_right:
    pshufb          m0, m5, m8
    pmullw          m2, m0, m0
    pshufb          m4, m5, m9
    paddw           m0, m4
    pshufb          m5, m10
    paddw           m0, m5 ; sum
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m3
    punpckhwd       m4, m5
    pmaddwd         m4, m4
    punpcklwd       m1, m2, m6           ; widen squared center taps
    punpckhwd       m2, m6
    mova [t1+r10*2+400*0], m0
    paddd           m1, m3 ; sumsq
    paddd           m2, m4
    mova [t1+r10*2+400*2], m1
    mova [t1+r10*2+400*4], m2
    add            r10, 16
    jl .h_loop
    ret
ALIGN function_align
; Per-row worker for the 3x3 filter: horizontal box sums of the new row,
; vertical accumulation over three rows (t0/t1/t2), then the a/b surface
; computation packed as a | (b << 12) into t3.
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    vpbroadcastd   xm0, [leftq]          ; caller-provided left-edge pixels
    mova           xm5, [lpfq+wq]
    palignr        xm5, xm0, 12
    add          leftq, 4
    jmp .hv_main
.hv_extend_left:
    mova           xm5, [lpfq+wq]
    pshufb         xm5, [sgr_l_shuf]     ; pad by replicating the leftmost pixel
    jmp .hv_main
.hv_bottom:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu           xm5, [lpfq+r10-2]
.hv_main:
    vinserti128     m5, [lpfq+r10+6], 1
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -17
    jl .hv_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv_have_right:
    ; Horizontal 3-tap sums via the tap shuffles m8-m10 (m6 = zero).
    pshufb          m0, m5, m8
    pmullw          m3, m0, m0
    pshufb          m1, m5, m9
    paddw           m0, m1
    pshufb          m5, m10
    paddw           m0, m5               ; h sum
    punpcklwd       m4, m5, m1
    pmaddwd         m4, m4
    punpckhwd       m5, m1
    pmaddwd         m5, m5
    paddw           m1, m0, [t2+r10*2+400*0]
    paddw           m1, [t1+r10*2+400*0] ; hv sum
    punpcklwd       m2, m3, m6
    punpckhwd       m3, m6
    paddd           m4, m2               ; h sumsq
    paddd           m5, m3
    paddd           m2, m4, [t2+r10*2+400*2]
    paddd           m3, m5, [t2+r10*2+400*4]
    paddd           m2, [t1+r10*2+400*2] ; hv sumsq
    paddd           m3, [t1+r10*2+400*4]
    mova [t0+r10*2+400*0], m0            ; stash this row's h sums for reuse
    punpcklwd       m0, m1, m6           ; b
    punpckhwd       m1, m6
    mova [t0+r10*2+400*2], m4
    pslld           m4, m2, 3
    mova [t0+r10*2+400*4], m5
    pslld           m5, m3, 3
    paddd           m4, m2               ; a * 9 (= a*8 + a)
    pmaddwd         m2, m0, m0           ; b * b
    paddd           m5, m3
    pmaddwd         m3, m1, m1
    psubd           m4, m2               ; p
    psubd           m5, m3
    pmulld          m4, m11              ; p * s
    pmulld          m5, m11
    pmaddwd         m0, m12              ; b * 455
    pmaddwd         m1, m12
    paddw           m4, m12              ; m12 doubles as a packed rounding bias
    paddw           m5, m12
    psrld           m4, 20               ; z + 1
    psrld           m5, 20
    ; Division 256/(z+1) done in float via rcpps, clamped to 255.
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4               ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m13, m4
    pcmpgtd         m5, m13, m5
    mulps           m2, m13              ; 256 / (z + 1)
    mulps           m3, m13
    psrld           m4, 24               ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4               ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    pand            m0, m15
    pand            m1, m15
    por             m0, m2               ; a | (b << 12)
    por             m1, m3
    mova         [t3+r10*4+ 8], xm0
    vextracti128 [t3+r10*4+40], m0, 1
    mova         [t3+r10*4+24], xm1
    vextracti128 [t3+r10*4+56], m1, 1
    add            r10, 16
    jl .hv_loop
    mov             t2, t1               ; rotate the row-sum ring buffers
    mov             t1, t0
    mov             t0, t2
    ret
; Vertical-only pass: no new input row; the stored t1 sums are doubled
; (current row substitutes for the missing next row) and combined with t2,
; then the same a/b computation as .hv runs.
.v: ; vertical boxsum + ab
    lea            r10, [wq-2]
.v_loop:
    mova            m1, [t1+r10*2+400*0]
    paddw           m1, m1
    paddw           m1, [t2+r10*2+400*0] ; hv sum
    mova            m2, [t1+r10*2+400*2]
    mova            m3, [t1+r10*2+400*4]
    paddd           m2, m2
    paddd           m3, m3
    paddd           m2, [t2+r10*2+400*2] ; hv sumsq
    paddd           m3, [t2+r10*2+400*4]
    punpcklwd       m0, m1, m6           ; b
    punpckhwd       m1, m6
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2               ; a * 9 (= a*8 + a)
    pmaddwd         m2, m0, m0           ; b * b
    paddd           m5, m3
    pmaddwd         m3, m1, m1
    psubd           m4, m2               ; p
    psubd           m5, m3
    pmulld          m4, m11              ; p * s
    pmulld          m5, m11
    pmaddwd         m0, m12              ; b * 455
    pmaddwd         m1, m12
    paddw           m4, m12              ; packed rounding bias (see .hv)
    paddw           m5, m12
    psrld           m4, 20               ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4               ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m13, m4
    pcmpgtd         m5, m13, m5
    mulps           m2, m13              ; 256 / (z + 1)
    mulps           m3, m13
    psrld           m4, 24               ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4               ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    paddd           m0, m14              ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    pand            m0, m15
    pand            m1, m15
    por             m0, m2               ; a | (b << 12)
    por             m1, m3
    mova         [t3+r10*4+ 8], xm0
    vextracti128 [t3+r10*4+40], m0, 1
    mova         [t3+r10*4+24], xm1
    vextracti128 [t3+r10*4+56], m1, 1
    add            r10, 16
    jl .v_loop
    ret
; Initial setup of the 3x3 neighbor weighting. Each row of packed a|b
; values is combined horizontally with weights 3,4,3 ("343") or 2,2,2
; ("222"): sum3 = l+m+r; 222 = 2*sum3; 343 = 4*(m+sum3) - sum3.
; t5/t4 hold the 343 sums of the previous/current rows; the 222 sums are
; kept after t3 for the next row's pass.
.prep_n: ; initial neighbor setup
    mov            r10, wq
    mov             t4, t3
    add             t3, 400*4
.prep_n_loop:
    mova            m2, [t5+r10*4+0]
    mova            m3, [t4+r10*4+0]
    paddd           m2, [t5+r10*4+8]
    paddd           m3, [t4+r10*4+8]
    paddd           m0, m2, [t5+r10*4+4]
    paddd           m1, m3, [t4+r10*4+4]
    pslld           m0, 2
    paddd           m1, m1                ; ab[ 0] 222
    psubd           m0, m2                ; ab[-1] 343
    mova [t3+r10*4+400*4], m1
    paddd           m1, m1
    mova    [t5+r10*4], m0
    psubd           m1, m3                ; ab[ 0] 343
    mova    [t4+r10*4], m1
    add            r10, 8
    jl .prep_n_loop
    ret
; a+b are packed together in a single dword, but we can't do the
; full neighbor calculations before splitting them since we don't
; have sufficient precision. The solution is to do the calculations
; in two equal halves and split a and b before doing the final sum.
ALIGN function_align
; Neighbor weighting for one output row: combine the previous row's 343
; sums (t5), the current row's 222 sums, and the next row's 343 sums,
; then apply the a*src+b filter and write 16 pixels.
.n: ; neighbor + output
    mov            r10, wq
.n_loop:
    ; First half (low 8 dwords): build the next row's 222/343 sums.
    mova            m4, [t3+r10*4+ 0]
    paddd           m4, [t3+r10*4+ 8]
    paddd           m5, m4, [t3+r10*4+ 4]
    paddd           m5, m5                ; ab[+1] 222
    mova            m2, [t3+r10*4+400*4+ 0]
    paddd           m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
    mova            m3, [t3+r10*4+400*4+32]
    paddd           m1, m3, [t5+r10*4+32]
    mova [t3+r10*4+400*4+ 0], m5
    paddd           m5, m5
    psubd           m5, m4                ; ab[+1] 343
    mova [t5+r10*4+ 0], m5
    paddd           m2, m5                ; ab[ 0] 222 + ab[+1] 343
    ; Second half (high 8 dwords): same computation.
    mova            m4, [t3+r10*4+32]
    paddd           m4, [t3+r10*4+40]
    paddd           m5, m4, [t3+r10*4+36]
    paddd           m5, m5
    mova [t3+r10*4+400*4+32], m5
    paddd           m5, m5
    psubd           m5, m4
    mova [t5+r10*4+32], m5
    ; Split each half into a (masked) and b (shifted), then sum halves.
    pandn           m4, m15, m0
    psrld           m0, 12
    paddd           m3, m5
    pandn           m5, m15, m2
    psrld           m2, 12
    paddd           m4, m5                ; a
    pandn           m5, m15, m1
    psrld           m1, 12
    paddd           m0, m2                ; b + (1 << 8)
    pandn           m2, m15, m3
    psrld           m3, 12
    paddd           m5, m2
    pmovzxbd        m2, [dstq+r10+0]      ; 8-bit source pixels -> dwords
    paddd           m1, m3
    pmovzxbd        m3, [dstq+r10+8]
    pmaddwd         m4, m2                ; a * src
    pmaddwd         m5, m3
    packssdw        m2, m3
    psubd           m0, m4                ; b - a * src + (1 << 8)
    psubd           m1, m5
    psrad           m0, 9
    psrad           m1, 9
    packssdw        m0, m1
    pmulhrsw        m0, m7                ; apply filter weight w1
    paddw           m0, m2                ; add back the source pixels
    vextracti128   xm1, m0, 1
    packuswb       xm0, xm1
    pshufd         xm0, xm0, q3120        ; undo lane interleave from pack
    mova    [dstq+r10], xm0
    add            r10, 16
    jl .n_loop
    mov            r10, t5                ; swap the 343 row buffers
    mov             t5, t4
    mov             t4, r10
    add           dstq, strideq
    ret
   1549 
; Mixed SGR filter: runs the 5x5 and 3x3 box filters together over the
; same rows (even/odd row phases hv0/hv1), blending both with w0/w1.
; Register layout (set up below):
;   m9-m12 = horizontal tap shuffles   m13 = s0   m14 = s1
;   m15    = packed w0|w1 weights      m7  = zero
; Reuses helpers from the 5x5 function (top_fixup, extend_right).
cglobal sgr_filter_mix_8bpc, 4, 12, 16, 400*56+8, dst, stride, left, lpf, \
                                                 w, h, edge, params
    mov        paramsq, r6mp
    mov             wd, wm
    movifnidn       hd, hm
    mov          edged, r7m
    vbroadcasti128  m9, [sgr_shuf+0]
    vbroadcasti128 m10, [sgr_shuf+8]
    add           lpfq, wq               ; point at row end; negative wq
    vbroadcasti128 m11, [sgr_shuf+2]     ; indexes back from it
    vbroadcasti128 m12, [sgr_shuf+6]
    add           dstq, wq
    vpbroadcastd   m15, [paramsq+8] ; w0 w1
    lea             t3, [rsp+wq*4+400*24+8]
    vpbroadcastd   m13, [paramsq+0] ; s0
    pxor            m7, m7
    vpbroadcastd   m14, [paramsq+4] ; s1
    lea             t1, [rsp+wq*2+12]
    neg             wq
    psllw          m15, 2 ; to reuse existing pd_m4096 register for rounding
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
    add             t1, 400*12
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    ; Two rows per iteration: even (hv0) then odd (hv1) phase.
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]            ; switch to the saved below-row ptr
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    ; No bottom edge available: replay the last rows via v-only passes.
    call .v0
    call .v1
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea             t2, [t1+400*12]
    lea            r10, [wq-2]
.top_fixup_loop:
    ; First-row fixup (see 5x5 .top_fixup): the 5-tap sums (400*0..4) are
    ; doubled into t2; the 3-tap sums (400*6..10) are copied as-is.
    mova            m0, [t1+r10*2+400* 0]
    mova            m1, [t1+r10*2+400* 2]
    mova            m2, [t1+r10*2+400* 4]
    paddw           m0, m0
    mova            m3, [t1+r10*2+400* 6]
    paddd           m1, m1
    mova            m4, [t1+r10*2+400* 8]
    paddd           m2, m2
    mova            m5, [t1+r10*2+400*10]
    mova [t2+r10*2+400* 0], m0
    mova [t2+r10*2+400* 2], m1
    mova [t2+r10*2+400* 4], m2
    mova [t2+r10*2+400* 6], m3
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    add            r10, 16
    jl .top_fixup_loop
    call .v0
    jmp .main
; Horizontal box sums of one row for BOTH filters: the 3-tap sums/sumsq
; (sum3) are stored at t1+400*6..10, and the two outer taps are added on
; top to form the 5-tap sums (sum5) at t1+400*0..4.
.h: ; horizontal boxsums
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastd   xm0, [leftq]          ; caller-provided left-edge pixels
    mova           xm5, [lpfq+wq]
    palignr        xm5, xm0, 12
    add          leftq, 4
    jmp .h_main
.h_extend_left:
    mova           xm5, [lpfq+wq]
    pshufb         xm5, [sgr_l_shuf]     ; pad by replicating the leftmost pixel
    jmp .h_main
.h_top:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu           xm5, [lpfq+r10-2]
.h_main:
    vinserti128     m5, [lpfq+r10+6], 1
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -18
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.h_have_right:
    pshufb          m6, m5, m9           ; outer taps (5-tap only)
    pshufb          m4, m5, m10
    paddw           m8, m6, m4
    shufps          m0, m6, m4, q2121
    pmullw          m3, m0, m0
    pshufb          m2, m5, m11          ; inner taps (shared 3-tap)
    paddw           m0, m2
    pshufb          m5, m12
    paddw           m0, m5 ; sum3
    punpcklwd       m1, m2, m5
    pmaddwd         m1, m1
    punpckhwd       m2, m5
    pmaddwd         m2, m2
    punpcklwd       m5, m6, m4
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    punpcklwd       m4, m3, m7           ; widen squared center taps
    paddd           m1, m4 ; sumsq3
    punpckhwd       m3, m7
    paddd           m2, m3
    mova [t1+r10*2+400* 6], m0
    mova [t1+r10*2+400* 8], m1
    mova [t1+r10*2+400*10], m2
    paddw           m8, m0 ; sum5
    paddd           m5, m1 ; sumsq5
    paddd           m6, m2
    mova [t1+r10*2+400* 0], m8
    mova [t1+r10*2+400* 2], m5
    mova [t1+r10*2+400* 4], m6
    add            r10, 16
    jl .h_loop
    ret
   1718 ALIGN function_align
   1719 .hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
   1720    lea            r10, [wq-2]
   1721    test         edgeb, 1 ; LR_HAVE_LEFT
   1722    jz .hv0_extend_left
   1723    vpbroadcastd   xm0, [leftq]          ; 4 pixels from the left-edge buffer
   1724    mova           xm5, [lpfq+wq]
   1725    palignr        xm5, xm0, 12          ; prepend the left pixels to the row
   1726    add          leftq, 4
   1727    jmp .hv0_main
   1728 .hv0_extend_left:
   1729    mova           xm5, [lpfq+wq]
   1730    pshufb         xm5, [sgr_l_shuf]     ; NOTE(review): presumably replicates the leftmost pixel - shuffle table not visible here
   1731    jmp .hv0_main
   1732 .hv0_bottom:
   1733    lea            r10, [wq-2]
   1734    test         edgeb, 1 ; LR_HAVE_LEFT
   1735    jz .hv0_extend_left
   1736 .hv0_loop:
   1737    movu           xm5, [lpfq+r10-2]
   1738 .hv0_main:
   1739    vinserti128     m5, [lpfq+r10+6], 1  ; second 8-pixel half in the upper 128-bit lane
   1740    test         edgeb, 2 ; LR_HAVE_RIGHT
   1741    jnz .hv0_have_right
   1742    cmp           r10d, -18
   1743    jl .hv0_have_right                   ; enough valid pixels remain, no padding needed
   1744    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
   1745 .hv0_have_right:
   ; horizontal box sums: m9-m12 hold shuffle masks producing shifted copies of
   ; the source row (set up by the function prologue, outside this view)
   1746    pshufb          m6, m5, m9
   1747    pshufb          m4, m5, m10
   1748    paddw           m8, m6, m4
   1749    shufps          m1, m6, m4, q2121
   1750    pmullw          m0, m1, m1
   1751    pshufb          m3, m5, m11
   1752    paddw           m1, m3
   1753    pshufb          m5, m12
   1754    paddw           m1, m5 ; sum3
   1755    punpcklwd       m2, m3, m5
   1756    pmaddwd         m2, m2
   1757    punpckhwd       m3, m5
   1758    pmaddwd         m3, m3
   1759    punpcklwd       m5, m6, m4
   1760    pmaddwd         m5, m5
   1761    punpckhwd       m6, m4
   1762    pmaddwd         m6, m6
   1763    punpcklwd       m4, m0, m7           ; m7 = 0: zero-extend words to dwords
   1764    paddd           m2, m4 ; sumsq3
   1765    punpckhwd       m0, m7
   1766    paddd           m3, m0
   1767    paddw           m8, m1 ; sum5
   1768    paddd           m5, m2 ; sumsq5
   1769    paddd           m6, m3
   1770    mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row
   1771    mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd
   1772    mova [t3+r10*4+400*0+40], m6
   ; accumulate this row's sums vertically into the t1/t2 ring buffers
   1773    paddw           m8, [t1+r10*2+400* 0]
   1774    paddd           m5, [t1+r10*2+400* 2]
   1775    paddd           m6, [t1+r10*2+400* 4]
   1776    mova [t1+r10*2+400* 0], m8
   1777    mova [t1+r10*2+400* 2], m5
   1778    mova [t1+r10*2+400* 4], m6
   1779    paddw           m0, m1, [t1+r10*2+400* 6]
   1780    paddd           m4, m2, [t1+r10*2+400* 8]
   1781    paddd           m5, m3, [t1+r10*2+400*10]
   1782    mova [t1+r10*2+400* 6], m1
   1783    mova [t1+r10*2+400* 8], m2
   1784    mova [t1+r10*2+400*10], m3
   1785    paddw           m1, m0, [t2+r10*2+400* 6]
   1786    paddd           m2, m4, [t2+r10*2+400* 8]
   1787    paddd           m3, m5, [t2+r10*2+400*10]
   1788    mova [t2+r10*2+400* 6], m0
   1789    mova [t2+r10*2+400* 8], m4
   1790    mova [t2+r10*2+400*10], m5
   ; ab3: p3 = a3*9 - b3*b3, z3 = (p3*s1) >> 20,
   ; x3 = min(256/(z3+1), 255) via a float reciprocal approximation
   1791    vpbroadcastd    m8, [pw_455_24]
   1792    punpcklwd       m0, m1, m7           ; b3
   1793    vbroadcastss    m6, [pf_256]
   1794    punpckhwd       m1, m7
   1795    pslld           m4, m2, 3
   1796    pslld           m5, m3, 3
   1797    paddd           m4, m2               ; a3 * 9
   1798    pmaddwd         m2, m0, m0           ; b3 * b3
   1799    paddd           m5, m3
   1800    pmaddwd         m3, m1, m1
   1801    psubd           m4, m2               ; p3
   1802    psubd           m5, m3
   1803    pmulld          m4, m14              ; p3 * s1
   1804    pmulld          m5, m14
   1805    pmaddwd         m0, m8               ; b3 * 455
   1806    pmaddwd         m1, m8
   1807    paddw           m4, m8               ; packed rounding bias before the >>20
   1808    paddw           m5, m8
   1809    vpbroadcastd    m8, [pd_34816]
   1810    psrld           m4, 20               ; z3 + 1
   1811    psrld           m5, 20
   1812    cvtdq2ps        m4, m4
   1813    cvtdq2ps        m5, m5
   1814    rcpps           m2, m4               ; 1 / (z3 + 1)
   1815    rcpps           m3, m5
   1816    pcmpgtd         m4, m6, m4           ; 256.0 > z3+1 (positive floats compare as ints)
   1817    pcmpgtd         m5, m6, m5
   1818    mulps           m2, m6               ; 256 / (z3 + 1)
   1819    mulps           m3, m6
   1820    vpbroadcastd    m6, [pd_m4096]
   1821    psrld           m4, 24               ; z3 < 255 ? 255 : 0
   1822    psrld           m5, 24
   1823    cvtps2dq        m2, m2
   1824    cvtps2dq        m3, m3
   1825    pminsw          m2, m4               ; x3
   1826    pminsw          m3, m5
   1827    pmulld          m0, m2
   1828    pmulld          m1, m3
   1829    paddd           m0, m8               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   1830    paddd           m1, m8
   1831    pand            m0, m6
   1832    pand            m1, m6
   1833    por             m0, m2               ; a3 | (b3 << 12)
   1834    por             m1, m3
   ; store ab3 deinterleaved back into row order
   1835    mova         [t3+r10*4+400*4+ 8], xm0
   1836    vextracti128 [t3+r10*4+400*4+40], m0, 1
   1837    mova         [t3+r10*4+400*4+24], xm1
   1838    vextracti128 [t3+r10*4+400*4+56], m1, 1
   1839    add            r10, 16
   1840    jl .hv0_loop
   1841    ret
   1842 ALIGN function_align
   1843 .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
   1844    lea            r10, [wq-2]
   1845    test         edgeb, 1 ; LR_HAVE_LEFT
   1846    jz .hv1_extend_left
   1847    vpbroadcastd   xm0, [leftq]          ; 4 pixels from the left-edge buffer
   1848    mova           xm5, [lpfq+wq]
   1849    palignr        xm5, xm0, 12          ; prepend the left pixels to the row
   1850    add          leftq, 4
   1851    jmp .hv1_main
   1852 .hv1_extend_left:
   1853    mova           xm5, [lpfq+wq]
   1854    pshufb         xm5, [sgr_l_shuf]     ; NOTE(review): presumably replicates the leftmost pixel - shuffle table not visible here
   1855    jmp .hv1_main
   1856 .hv1_bottom:
   1857    lea            r10, [wq-2]
   1858    test         edgeb, 1 ; LR_HAVE_LEFT
   1859    jz .hv1_extend_left
   1860 .hv1_loop:
   1861    movu           xm5, [lpfq+r10-2]
   1862 .hv1_main:
   1863    vinserti128     m5, [lpfq+r10+6], 1  ; second 8-pixel half in the upper 128-bit lane
   1864    test         edgeb, 2 ; LR_HAVE_RIGHT
   1865    jnz .hv1_have_right
   1866    cmp           r10d, -18
   1867    jl .hv1_have_right                   ; enough valid pixels remain, no padding needed
   1868    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
   1869 .hv1_have_right:
   ; horizontal box sums from shifted copies of the source row (masks in m9-m12)
   1870    pshufb          m6, m5, m9
   1871    pshufb          m3, m5, m10
   1872    paddw           m8, m6, m3
   1873    shufps          m2, m6, m3, q2121
   1874    pmullw          m1, m2, m2
   1875    pshufb          m0, m5, m11
   1876    paddw           m2, m0
   1877    pshufb          m5, m12
   1878    paddw           m2, m5 ; sum3
   1879    punpcklwd       m4, m5, m0
   1880    pmaddwd         m4, m4
   1881    punpckhwd       m5, m0
   1882    pmaddwd         m5, m5
   1883    punpcklwd       m0, m6, m3
   1884    pmaddwd         m0, m0
   1885    punpckhwd       m6, m3
   1886    pmaddwd         m6, m6
   1887    punpcklwd       m3, m1, m7           ; m7 = 0: zero-extend words to dwords
   1888    paddd           m4, m3 ; sumsq3
   1889    punpckhwd       m1, m7
   1890    paddd           m5, m1
   1891    paddw           m1, m2, [t2+r10*2+400* 6]
   1892    mova [t2+r10*2+400* 6], m2
   1893    paddw           m8, m2 ; sum5
   1894    paddd           m2, m4, [t2+r10*2+400* 8]
   1895    paddd           m3, m5, [t2+r10*2+400*10]
   1896    mova [t2+r10*2+400* 8], m4
   1897    mova [t2+r10*2+400*10], m5
   ; ab3 for this row (same scheme as .hv0, constants temporarily live in m9)
   1898    vpbroadcastd    m9, [pw_455_24]
   1899    paddd           m4, m0 ; sumsq5
   1900    paddd           m5, m6
   1901    punpcklwd       m0, m1, m7           ; b3
   1902    punpckhwd       m1, m7
   1903    pslld           m6, m2, 3
   1904    pslld           m7, m3, 3            ; m7 is clobbered here; re-zeroed below
   1905    paddd           m6, m2               ; a3 * 9
   1906    pmaddwd         m2, m0, m0           ; b3 * b3
   1907    paddd           m7, m3
   1908    pmaddwd         m3, m1, m1
   1909    psubd           m6, m2               ; p3
   1910    psubd           m7, m3
   1911    pmulld          m6, m14              ; p3 * s1
   1912    pmulld          m7, m14
   1913    pmaddwd         m0, m9               ; b3 * 455
   1914    pmaddwd         m1, m9
   1915    paddw           m6, m9               ; packed rounding bias before the >>20
   1916    paddw           m7, m9
   1917    vbroadcastss    m9, [pf_256]
   1918    psrld           m6, 20               ; z3 + 1
   1919    psrld           m7, 20
   1920    cvtdq2ps        m6, m6
   1921    cvtdq2ps        m7, m7
   1922    rcpps           m2, m6               ; 1 / (z3 + 1)
   1923    rcpps           m3, m7
   1924    pcmpgtd         m6, m9, m6           ; 256.0 > z3+1 (positive floats compare as ints)
   1925    pcmpgtd         m7, m9, m7
   1926    mulps           m2, m9               ; 256 / (z3 + 1)
   1927    mulps           m3, m9
   1928    vpbroadcastd    m9, [pd_34816]
   1929    psrld           m6, 24               ; z3 < 255 ? 255 : 0
   1930    psrld           m7, 24
   1931    cvtps2dq        m2, m2
   1932    cvtps2dq        m3, m3
   1933    pminsw          m2, m6               ; x3
   1934    vpbroadcastd    m6, [pd_m4096]
   1935    pminsw          m3, m7
   1936    pmulld          m0, m2
   1937    pmulld          m1, m3
   1938    paddd           m0, m9               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   1939    paddd           m1, m9
   1940    pand            m0, m6
   1941    pand            m7, m6, m1
   1942    por             m0, m2               ; a3 | (b3 << 12)
   1943    por             m7, m3
   ; finish the two-row 5x5 sums (current + previous rows from t1/t2)
   1944    paddw           m1, m8, [t2+r10*2+400*0]
   1945    paddd           m2, m4, [t2+r10*2+400*2]
   1946    paddd           m3, m5, [t2+r10*2+400*4]
   1947    paddw           m1, [t1+r10*2+400*0]
   1948    paddd           m2, [t1+r10*2+400*2]
   1949    paddd           m3, [t1+r10*2+400*4]
   1950    mova [t2+r10*2+400*0], m8
   1951    mova [t2+r10*2+400*2], m4
   1952    mova [t2+r10*2+400*4], m5
   1953    mova         [t3+r10*4+400*8+ 8], xm0
   1954    vextracti128 [t3+r10*4+400*8+40], m0, 1
   1955    mova         [t3+r10*4+400*8+24], xm7
   1956    vextracti128 [t3+r10*4+400*8+56], m7, 1
   ; ab5: p5 = a5*25 - b5*b5, z5 = (p5*s0) >> 20, x5 clamped like x3 above
   1957    vpbroadcastd    m4, [pd_25]
   1958    pxor            m7, m7               ; restore the zero register
   1959    vpbroadcastd    m8, [pw_164_24]
   1960    punpcklwd       m0, m1, m7           ; b5
   1961    punpckhwd       m1, m7
   1962    pmulld          m2, m4               ; a5 * 25
   1963    pmulld          m3, m4
   1964    pmaddwd         m4, m0, m0           ; b5 * b5
   1965    pmaddwd         m5, m1, m1
   1966    psubd           m2, m4               ; p5
   1967    psubd           m3, m5
   1968    pmulld          m2, m13              ; p5 * s0
   1969    pmulld          m3, m13
   1970    pmaddwd         m0, m8               ; b5 * 164
   1971    pmaddwd         m1, m8
   1972    paddw           m2, m8               ; packed rounding bias before the >>20
   1973    paddw           m3, m8
   1974    vbroadcastss    m8, [pf_256]
   1975    psrld           m2, 20               ; z5 + 1
   1976    psrld           m3, 20
   1977    cvtdq2ps        m2, m2
   1978    cvtdq2ps        m3, m3
   1979    rcpps           m4, m2               ; 1 / (z5 + 1)
   1980    rcpps           m5, m3
   1981    pcmpgtd         m2, m8, m2           ; 256.0 > z5+1 (positive floats compare as ints)
   1982    pcmpgtd         m3, m8, m3
   1983    mulps           m4, m8               ; 256 / (z5 + 1)
   1984    mulps           m5, m8
   1985    psrld           m2, 24               ; z5 < 255 ? 255 : 0
   1986    psrld           m3, 24
   1987    cvtps2dq        m4, m4
   1988    cvtps2dq        m5, m5
   1989    pminsw          m4, m2               ; x5
   1990    pminsw          m5, m3
   1991    pmulld          m0, m4
   1992    pmulld          m1, m5
   1993    paddd           m0, m9               ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
   1994    paddd           m1, m9
   1995    vbroadcasti128  m9, [sgr_shuf]       ; restore m9, repurposed for constants above
   1996    pand            m0, m6
   1997    pand            m1, m6
   1998    por             m0, m4               ; a5 | (b5 << 12)
   1999    por             m1, m5
   2000    mova         [t3+r10*4+400*0+ 8], xm0
   2001    vextracti128 [t3+r10*4+400*0+40], m0, 1
   2002    mova         [t3+r10*4+400*0+24], xm1
   2003    vextracti128 [t3+r10*4+400*0+56], m1, 1
   2004    add            r10, 16
   2005    jl .hv1_loop
   ; swap the t1/t2 row ring-buffer pointers for the next row pair
   2006    mov            r10, t2
   2007    mov             t2, t1
   2008    mov             t1, r10
   2009    ret
   2010 .v0: ; vertical boxsums + ab3 (even rows)
   2011    lea            r10, [wq-2]
   2012    vpbroadcastd    m6, [pd_34816]
   2013 .v0_loop:
   ; NOTE(review): sums are doubled, i.e. the buffered row is counted twice -
   ; presumably because no new input row is available here; confirm vs caller
   2014    mova            m0, [t1+r10*2+400* 6]
   2015    mova            m4, [t1+r10*2+400* 8]
   2016    mova            m5, [t1+r10*2+400*10]
   2017    paddw           m0, m0
   2018    paddd           m4, m4
   2019    paddd           m5, m5
   2020    paddw           m1, m0, [t2+r10*2+400* 6]
   2021    paddd           m2, m4, [t2+r10*2+400* 8]
   2022    paddd           m3, m5, [t2+r10*2+400*10]
   2023    mova [t2+r10*2+400* 6], m0
   2024    mova [t2+r10*2+400* 8], m4
   2025    mova [t2+r10*2+400*10], m5
   ; ab3: identical scheme to .hv0 (p3 = a3*9 - b3*b3, x3 via rcpps)
   2026    vpbroadcastd    m8, [pw_455_24]
   2027    punpcklwd       m0, m1, m7           ; b3
   2028    punpckhwd       m1, m7
   2029    pslld           m4, m2, 3
   2030    pslld           m5, m3, 3
   2031    paddd           m4, m2               ; a3 * 9
   2032    pmaddwd         m2, m0, m0           ; b3 * b3
   2033    paddd           m5, m3
   2034    pmaddwd         m3, m1, m1
   2035    psubd           m4, m2               ; p3
   2036    psubd           m5, m3
   2037    pmulld          m4, m14              ; p3 * s1
   2038    pmulld          m5, m14
   2039    pmaddwd         m0, m8               ; b3 * 455
   2040    pmaddwd         m1, m8
   2041    paddw           m4, m8               ; packed rounding bias before the >>20
   2042    paddw           m5, m8
   2043    vbroadcastss    m8, [pf_256]
   2044    psrld           m4, 20               ; z3 + 1
   2045    psrld           m5, 20
   2046    cvtdq2ps        m4, m4
   2047    cvtdq2ps        m5, m5
   2048    rcpps           m2, m4               ; 1 / (z3 + 1)
   2049    rcpps           m3, m5
   2050    pcmpgtd         m4, m8, m4           ; 256.0 > z3+1 (positive floats compare as ints)
   2051    pcmpgtd         m5, m8, m5
   2052    mulps           m2, m8               ; 256 / (z3 + 1)
   2053    mulps           m3, m8
   2054    vpbroadcastd    m8, [pd_m4096]
   2055    psrld           m4, 24               ; z3 < 255 ? 255 : 0
   2056    psrld           m5, 24
   2057    cvtps2dq        m2, m2
   2058    cvtps2dq        m3, m3
   2059    pminsw          m2, m4               ; x3
   2060    pminsw          m3, m5
   2061    pmulld          m0, m2
   2062    pmulld          m1, m3
   2063    paddd           m0, m6               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   2064    paddd           m1, m6
   2065    pand            m0, m8
   2066    pand            m1, m8
   2067    por             m0, m2               ; a3 | (b3 << 12)
   2068    por             m1, m3
   ; keep a clean copy of the 5x5 sums in t3, then double them in t1
   2069    mova            m2, [t1+r10*2+400*0]
   2070    mova            m3, [t1+r10*2+400*2]
   2071    mova            m4, [t1+r10*2+400*4]
   2072    mova [t3+r10*4+400*8+ 8], m2
   2073    mova [t3+r10*4+400*0+ 8], m3
   2074    mova [t3+r10*4+400*0+40], m4
   2075    paddw           m2, m2               ; cc5
   2076    paddd           m3, m3
   2077    paddd           m4, m4
   2078    mova [t1+r10*2+400*0], m2
   2079    mova [t1+r10*2+400*2], m3
   2080    mova [t1+r10*2+400*4], m4
   ; store ab3 deinterleaved back into row order
   2081    mova         [t3+r10*4+400*4+ 8], xm0
   2082    vextracti128 [t3+r10*4+400*4+40], m0, 1
   2083    mova         [t3+r10*4+400*4+24], xm1
   2084    vextracti128 [t3+r10*4+400*4+56], m1, 1
   2085    add            r10, 16
   2086    jl .v0_loop
   2087    ret
   2088 .v1: ; vertical boxsums + ab (odd rows)
   2089    lea            r10, [wq-2]
   2090 .v1_loop:
   2091    mova            m4, [t1+r10*2+400* 6]
   2092    mova            m5, [t1+r10*2+400* 8]
   2093    mova            m6, [t1+r10*2+400*10]
   2094    paddw           m1, m4, [t2+r10*2+400* 6]
   2095    paddd           m2, m5, [t2+r10*2+400* 8]
   2096    paddd           m3, m6, [t2+r10*2+400*10]
   2097    mova [t2+r10*2+400* 6], m4
   2098    mova [t2+r10*2+400* 8], m5
   2099    mova [t2+r10*2+400*10], m6
   ; ab3: identical scheme to .hv0 (p3 = a3*9 - b3*b3, x3 via rcpps)
   2100    vpbroadcastd    m8, [pw_455_24]
   2101    punpcklwd       m0, m1, m7           ; b3
   2102    punpckhwd       m1, m7
   2103    pslld           m4, m2, 3
   2104    pslld           m5, m3, 3
   2105    paddd           m4, m2               ; a3 * 9
   2106    pmaddwd         m2, m0, m0           ; b3 * b3
   2107    paddd           m5, m3
   2108    pmaddwd         m3, m1, m1
   2109    psubd           m4, m2               ; p3
   2110    psubd           m5, m3
   2111    pmulld          m4, m14              ; p3 * s1
   2112    pmulld          m5, m14
   2113    pmaddwd         m0, m8               ; b3 * 455
   2114    pmaddwd         m1, m8
   2115    paddw           m4, m8               ; packed rounding bias before the >>20
   2116    paddw           m5, m8
   2117    vbroadcastss    m8, [pf_256]
   2118    psrld           m4, 20               ; z3 + 1
   2119    psrld           m5, 20
   2120    cvtdq2ps        m4, m4
   2121    cvtdq2ps        m5, m5
   2122    rcpps           m2, m4               ; 1 / (z3 + 1)
   2123    rcpps           m3, m5
   2124    pcmpgtd         m4, m8, m4           ; 256.0 > z3+1 (positive floats compare as ints)
   2125    pcmpgtd         m5, m8, m5
   2126    mulps           m2, m8               ; 256 / (z3 + 1)
   2127    mulps           m3, m8
   2128    vpbroadcastd    m8, [pd_m4096]
   2129    psrld           m4, 24               ; z3 < 255 ? 255 : 0
   2130    psrld           m5, 24
   2131    cvtps2dq        m2, m2
   2132    cvtps2dq        m3, m3
   2133    pminsw          m2, m4               ; x3
   2134    vpbroadcastd    m4, [pd_34816]
   2135    pminsw          m3, m5
   2136    pmulld          m0, m2
   2137    pmulld          m1, m3
   2138    paddd           m0, m4               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
   2139    paddd           m1, m4
   2140    pand            m0, m8
   2141    pand            m8, m1               ; second half assembled in m8
   2142    por             m0, m2               ; a3 | (b3 << 12)
   2143    por             m8, m3
   ; reload the clean copy of the last row's 5x5 sums saved by .hv0/.v0
   2144    mova            m4, [t3+r10*4+400*8+ 8]
   2145    mova            m5, [t3+r10*4+400*0+ 8]
   2146    mova            m6, [t3+r10*4+400*0+40]
   2147    paddw           m1, m4, [t2+r10*2+400*0]
   2148    paddd           m2, m5, [t2+r10*2+400*2]
   2149    paddd           m3, m6, [t2+r10*2+400*4]
   2150    paddw           m1, [t1+r10*2+400*0]
   2151    paddd           m2, [t1+r10*2+400*2]
   2152    paddd           m3, [t1+r10*2+400*4]
   2153    mova [t2+r10*2+400*0], m4
   2154    mova [t2+r10*2+400*2], m5
   2155    mova [t2+r10*2+400*4], m6
   2156    vpbroadcastd    m4, [pd_25]
   2157    mova         [t3+r10*4+400*8+ 8], xm0
   2158    vextracti128 [t3+r10*4+400*8+40], m0, 1
   2159    mova         [t3+r10*4+400*8+24], xm8
   2160    vextracti128 [t3+r10*4+400*8+56], m8, 1
   ; ab5: p5 = a5*25 - b5*b5, x5 via the same rcpps approximation
   2161    vpbroadcastd    m8, [pw_164_24]
   2162    punpcklwd       m0, m1, m7           ; b5
   2163    vbroadcastss    m6, [pf_256]
   2164    punpckhwd       m1, m7
   2165    pmulld          m2, m4               ; a5 * 25
   2166    pmulld          m3, m4
   2167    pmaddwd         m4, m0, m0           ; b5 * b5
   2168    pmaddwd         m5, m1, m1
   2169    psubd           m2, m4               ; p5
   2170    psubd           m3, m5
   2171    pmulld          m2, m13              ; p5 * s0
   2172    pmulld          m3, m13
   2173    pmaddwd         m0, m8               ; b5 * 164
   2174    pmaddwd         m1, m8
   2175    paddw           m2, m8               ; packed rounding bias before the >>20
   2176    paddw           m3, m8
   2177    vpbroadcastd    m8, [pd_34816]
   2178    psrld           m2, 20               ; z5 + 1
   2179    psrld           m3, 20
   2180    cvtdq2ps        m2, m2
   2181    cvtdq2ps        m3, m3
   2182    rcpps           m4, m2               ; 1 / (z5 + 1)
   2183    rcpps           m5, m3
   2184    pcmpgtd         m2, m6, m2           ; 256.0 > z5+1 (positive floats compare as ints)
   2185    pcmpgtd         m3, m6, m3
   2186    mulps           m4, m6               ; 256 / (z5 + 1)
   2187    mulps           m5, m6
   2188    vpbroadcastd    m6, [pd_m4096]
   2189    psrld           m2, 24               ; z5 < 255 ? 255 : 0
   2190    psrld           m3, 24
   2191    cvtps2dq        m4, m4
   2192    cvtps2dq        m5, m5
   2193    pminsw          m4, m2               ; x5
   2194    pminsw          m5, m3
   2195    pmulld          m0, m4
   2196    pmulld          m1, m5
   2197    paddd           m0, m8               ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
   2198    paddd           m1, m8
   2199    pand            m0, m6
   2200    pand            m1, m6
   2201    por             m0, m4               ; a5 | (b5 << 12)
   2202    por             m1, m5
   2203    mova         [t3+r10*4+400*0+ 8], xm0
   2204    vextracti128 [t3+r10*4+400*0+40], m0, 1
   2205    mova         [t3+r10*4+400*0+24], xm1
   2206    vextracti128 [t3+r10*4+400*0+56], m1, 1
   2207    add            r10, 16
   2208    jl .v1_loop
   ; swap the t1/t2 row ring-buffer pointers for the next row pair
   2209    mov            r10, t2
   2210    mov             t2, t1
   2211    mov             t1, r10
   2212    ret
   2213 .prep_n: ; initial neighbor setup
   ; builds the vertically-weighted neighbor sums: "565" = 5-6-5 weighting of
   ; the ab5 rows, "343"/"222" the per-row weightings of the ab3 rows.
   ; m6 holds the pd_m4096 mask (pandn extracts a, psrld 12 extracts b).
   2214    mov            r10, wq
   2215 .prep_n_loop:
   2216    movu            m0, [t3+r10*4+400*0+4]
   2217    paddd           m1, m0, [t3+r10*4+400*0+0]
   2218    mova            m4, [t3+r10*4+400*4+0]
   2219    paddd           m1, [t3+r10*4+400*0+8]
   2220    mova            m5, [t3+r10*4+400*8+0]
   2221    paddd           m4, [t3+r10*4+400*4+8]
   2222    paddd           m5, [t3+r10*4+400*8+8]
   2223    paddd           m2, m4, [t3+r10*4+400*4+4]
   2224    paddd           m3, m5, [t3+r10*4+400*8+4]
   2225    paddd           m0, m1
   2226    pslld           m1, 2
   2227    pslld           m2, 2
   2228    paddd           m1, m0                ; ab5 565
   2229    paddd           m3, m3                ; ab3[ 0] 222
   2230    psubd           m2, m4                ; ab3[-1] 343
   2231    mova [t3+r10*4+400*20], m3
   2232    pandn           m0, m6, m1            ; a5 565
   2233    mova [t3+r10*4+400*24], m2
   2234    psrld           m1, 12                ; b5 565
   2235    mova [t3+r10*4+400*12], m0
   2236    paddd           m3, m3
   2237    mova [t3+r10*4+400*16], m1
   2238    psubd           m3, m5                ; ab3[ 0] 343
   2239    mova [t3+r10*4+400*28], m3
   2240    add            r10, 8
   2241    jl .prep_n_loop
   2242    ret
   2243 ALIGN function_align
   2244 .n0: ; neighbor + output (even rows)
   2245    mov            r10, wq
   2246 .n0_loop:
   ; horizontal 1-2-1 * vertical weighting of ab5, split into a5 / b5 halves
   2247    movu            m0, [t3+r10*4+4]
   2248    paddd           m4, m0, [t3+r10*4+0]
   2249    paddd           m4, [t3+r10*4+8]
   2250    paddd           m0, m4
   2251    pslld           m4, 2
   2252    paddd           m4, m0
   2253    pandn           m0, m6, m4           ; m6 = pd_m4096: low 12 bits = a5
   2254    psrld           m4, 12               ; high bits = b5
   2255    paddd           m2, m0, [t3+r10*4+400*12] ; a5
   2256    mova [t3+r10*4+400*12], m0
   2257    paddd           m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
   2258    mova [t3+r10*4+400*16], m4
   ; rotate the 343/222 ab3 neighbor buffers forward by one row
   2259    mova            m3, [t3+r10*4+400*4+0]
   2260    paddd           m3, [t3+r10*4+400*4+8]
   2261    paddd           m5, m3, [t3+r10*4+400*4+4]
   2262    paddd           m5, m5                    ; ab3[ 1] 222
   2263    mova            m4, [t3+r10*4+400*20]
   2264    paddd           m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343
   2265    mova [t3+r10*4+400*20], m5
   2266    paddd           m5, m5
   2267    psubd           m5, m3                    ; ab3[ 1] 343
   2268    mova [t3+r10*4+400*24], m5
   2269    paddd           m4, m5                    ; ab3[ 0] 222 + ab3[ 1] 343
   2270    pandn           m3, m6, m1
   2271    psrld           m1, 12
   2272    pandn           m5, m6, m4
   2273    psrld           m4, 12
   2274    paddd           m3, m5                    ; a3
   2275    paddd           m1, m4                    ; b3 + (1 << 8)
   2276    pmovzxbd        m4, [dstq+r10]            ; 8 src pixels, zero-extended to dwords
   2277    pmaddwd         m2, m4                    ; a5 * src
   2278    pmaddwd         m3, m4                    ; a3 * src
   2279    psubd           m0, m2                    ; b5 - a5 * src + (1 << 8)
   2280    psubd           m1, m3                    ; b3 - a3 * src + (1 << 8)
   2281    psrld           m0, 9
   2282    pslld           m1, 7
   2283    pblendw         m0, m1, 0xaa              ; interleave the 5x5/3x3 terms into word lanes
   2284    pmaddwd         m0, m15                   ; NOTE(review): m15 presumably holds the sgr w0/w1 weights - set up outside this view
   2285    psubd           m0, m6                    ; m6 = pd_m4096, so this adds 1 << 12 (rounding for the >>13)
   2286    psrad           m0, 13
   2287    paddd           m0, m4                    ; add back the source pixel
   2288    vextracti128   xm1, m0, 1
   2289    packssdw       xm0, xm1
   2290    packuswb       xm0, xm0                   ; clip to u8
   2291    movq    [dstq+r10], xm0
   2292    add            r10, 8
   2293    jl .n0_loop
   2294    add           dstq, strideq
   2295    ret
   2296 ALIGN function_align
   2297 .n1: ; neighbor + output (odd rows)
   ; odd rows reuse the a5/b5 sums computed in .n0; only ab3 is rotated here.
   ; the "-a" comments indicate the stored a values are negated.
   2298    mov            r10, wq
   2299 .n1_loop:
   2300    mova            m3, [t3+r10*4+400*8+0]
   2301    paddd           m3, [t3+r10*4+400*8+8]
   2302    paddd           m5, m3, [t3+r10*4+400*8+4]
   2303    paddd           m5, m5                    ; ab3[ 1] 222
   2304    mova            m4, [t3+r10*4+400*20]
   2305    paddd           m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343
   2306    mova [t3+r10*4+400*20], m5
   2307    paddd           m5, m5
   2308    psubd           m5, m3                    ; ab3[ 1] 343
   2309    mova [t3+r10*4+400*28], m5
   2310    paddd           m4, m5                    ; ab3[ 0] 222 + ab3[ 1] 343
   2311    pandn           m3, m6, m1                ; m6 = pd_m4096 mask
   2312    psrld           m1, 12
   2313    pandn           m5, m6, m4
   2314    psrld           m4, 12
   2315    paddd           m3, m5                    ; -a3
   2316    paddd           m1, m4                    ;  b3 + (1 << 8)
   2317    pmovzxbd        m4, [dstq+r10]            ; 8 src pixels, zero-extended to dwords
   2318    pmaddwd         m2, m4, [t3+r10*4+400*12] ; -a5 * src
   2319    mova            m0, [t3+r10*4+400*16]     ;  b5 + (1 << 7)
   2320    pmaddwd         m3, m4                    ; -a3 * src
   2321    psubd           m0, m2                    ; a5 * src + b5 + (1 << 7)
   2322    psubd           m1, m3                    ; a3 * src + b3 + (1 << 8)
   2323    psrld           m0, 8
   2324    pslld           m1, 7
   2325    pblendw         m0, m1, 0xaa              ; interleave the 5x5/3x3 terms into word lanes
   2326    pmaddwd         m0, m15                   ; NOTE(review): m15 presumably holds the sgr w0/w1 weights - set up outside this view
   2327    psubd           m0, m6                    ; m6 = pd_m4096, so this adds 1 << 12 (rounding for the >>13)
   2328    psrad           m0, 13
   2329    paddd           m0, m4                    ; add back the source pixel
   2330    vextracti128   xm1, m0, 1
   2331    packssdw       xm0, xm1
   2332    packuswb       xm0, xm0                   ; clip to u8
   2333    movq    [dstq+r10], xm0
   2334    add            r10, 8
   2335    jl .n1_loop
   2336    add           dstq, strideq
   2337    ret
   2338 
   2339 %endif ; ARCH_X86_64