tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

looprestoration16_avx2.asm (84243B)


      1 ; Copyright © 2021, VideoLAN and dav1d authors
      2 ; Copyright © 2021, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 SECTION_RODATA 32
     32 
; pshufb masks: the *_lshuf* tables pad the leftmost pixels of a row when
; LR_HAVE_LEFT is unset (see the *_extend_left paths below); the
; wiener_shuf* tables gather neighboring 16-bit pixels into tap order
; for pmaddwd in the horizontal filter passes.
     33 sgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
     34 sgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
     35 wiener_lshuf5: db  4,  5,  4,  5,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
     36               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
     37 wiener_lshuf7: db  8,  9,  8,  9,  8,  9,  8,  9,  8,  9, 10, 11, 12, 13, 14, 15
     38               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
     39 wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
     40 wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
     41 wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
     42 wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
     43 wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
     44 
     45 wiener_hshift: dw 4, 4, 1, 1 ; coef upshift, indexed by pixel_max>>11 (10bpc, 12bpc)
     46 wiener_vshift: dw 1024, 1024, 4096, 4096 ; pmulhuw output scale, same indexing
     47 wiener_round:  dd 1049600, 1048832 ; vertical rounding bias, same indexing
     48 
     49 pb_m10_m9:     times 2 db -10, -9
     50 pb_m6_m5:      times 2 db  -6, -5
     51 pb_m2_m1:      times 2 db  -2, -1
     52 pb_2_3:        times 2 db   2,  3
     53 pb_6_7:        times 2 db   6,  7
     54 pw_1023:       times 2 dw 1023
     55 pw_164_24:     dw 164, 24
     56 pw_455_24:     dw 455, 24
     57 pd_8:          dd 8
     58 pd_25:         dd 25
     59 pd_4096:       dd 4096
     60 pd_34816:      dd 34816
     61 pd_m262128:    dd -262128
     62 pf_256:        dd 256.0
     63 
; sgr_lshuf5's byte pattern 0,1,0,1,... reads as little-endian words of
; 0x0100 = 256, so it doubles as a pw_256 constant.
     64 %define pw_256 sgr_lshuf5
     65 
     66 cextern pb_0to63
     67 
     68 SECTION .text
     69 
     70 DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
     71 
     72 INIT_YMM avx2
; 7-tap Wiener filter, high bitdepth (10/12 bpc per the pixel_max>>11 table
; indexing). Arguments: dst, stride, left (left-edge pixels), lpf (rows
; above/below the filtered region), w, h, edge flags, flt (filter coefs).
; edge bits: 1=LR_HAVE_LEFT, 2=LR_HAVE_RIGHT, 4=LR_HAVE_TOP, 8=LR_HAVE_BOTTOM.
; t0-t6 hold pointers to horizontally-filtered rows in a stack ring buffer;
; the vertical 7-tap pass combines those rows into one output row per step.
     73 cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
     74                                                     w, h, edge, flt
     75 %define base t4-wiener_hshift
     76    mov           fltq, r6mp
     77    movifnidn       wd, wm
     78    movifnidn       hd, hm
     79    mov          edged, r7m
     80    mov            t3d, r8m ; pixel_max
     81    vbroadcasti128  m6, [wiener_shufA]
     82    vpbroadcastd   m12, [fltq+ 0] ; x0 x1
     83    lea             t4, [wiener_hshift]
     84    vbroadcasti128  m7, [wiener_shufB]
     85    add             wd, wd ; pixel width -> byte width (2 bytes/px)
     86    vpbroadcastd   m13, [fltq+ 4] ; x2 x3
     87    shr            t3d, 11 ; pixel_max>>11: 0 for 10bpc, 1 for 12bpc
     88    vpbroadcastd   m14, [fltq+16] ; y0 y1
     89    add           lpfq, wq
     90    vpbroadcastd   m15, [fltq+20] ; y2 y3
     91    add           dstq, wq
     92    vbroadcasti128  m8, [wiener_shufC]
     93    lea             t1, [rsp+wq+16] ; first intermediate row buffer
     94    vbroadcasti128  m9, [wiener_shufD]
     95    neg             wq ; negative byte offset; per-row r10 counts up to 0
     96    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
     97    vpbroadcastd   m10, [base+wiener_round+t3*4]
     98    vpbroadcastd   m11, [base+wiener_vshift+t3*4]
     99    pmullw         m12, m0 ; upshift filter coefs to make the
    100    pmullw         m13, m0 ; horizontal downshift constant
    101    test         edgeb, 4 ; LR_HAVE_TOP
    102    jz .no_top
    103    call .h_top
    104    add           lpfq, strideq
    105    mov             t6, t1 ; rows above the top reuse row t1 (padding)
    106    mov             t5, t1
    107    add             t1, 384*2
    108    call .h_top
    109    lea            r10, [lpfq+strideq*4]
    110    mov           lpfq, dstq
    111    mov             t4, t1
    112    add             t1, 384*2
    113    add            r10, strideq
    114    mov          [rsp], r10 ; below
    115    call .h
    116    mov             t3, t1
    117    mov             t2, t1
    118    dec             hd
    119    jz .v1
    120    add           lpfq, strideq
    121    add             t1, 384*2
    122    call .h
    123    mov             t2, t1
    124    dec             hd
    125    jz .v2
    126    add           lpfq, strideq
    127    add             t1, 384*2
    128    call .h
    129    dec             hd
    130    jz .v3
    131 .main:
    132    lea             t0, [t1+384*2]
    133 .main_loop:
    134    call .hv
    135    dec             hd
    136    jnz .main_loop
    137    test         edgeb, 8 ; LR_HAVE_BOTTOM
    138    jz .v3
    139    mov           lpfq, [rsp]
    140    call .hv_bottom
    141    add           lpfq, strideq
    142    call .hv_bottom
    143 .v1: ; 1 row left: vertical pass only
    144    call .v
    145    RET
    146 .no_top: ; LR_HAVE_TOP unset: pad upwards with the first row
    147    lea            r10, [lpfq+strideq*4]
    148    mov           lpfq, dstq
    149    lea            r10, [r10+strideq*2]
    150    mov          [rsp], r10
    151    call .h
    152    mov             t6, t1
    153    mov             t5, t1
    154    mov             t4, t1
    155    mov             t3, t1
    156    mov             t2, t1
    157    dec             hd
    158    jz .v1
    159    add           lpfq, strideq
    160    add             t1, 384*2
    161    call .h
    162    mov             t2, t1
    163    dec             hd
    164    jz .v2
    165    add           lpfq, strideq
    166    add             t1, 384*2
    167    call .h
    168    dec             hd
    169    jz .v3
    170    lea             t0, [t1+384*2]
    171    call .hv
    172    dec             hd
    173    jz .v3
    174    add             t0, 384*8
    175    call .hv
    176    dec             hd
    177    jnz .main
    178 .v3: ; 3 rows left
    179    call .v
    180 .v2: ; 2 rows left
    181    call .v
    182    jmp .v1
; Right-edge padding: build per-lane shuffle indices from the remaining
; byte count in r10 and clamp them against pb_0to63 so that lanes past the
; right edge of the image repeat valid pixels in m3/m4/m5. Clobbers m0-m2.
    183 .extend_right:
    184    movd           xm1, r10d
    185    vpbroadcastd    m0, [pb_6_7]
    186    mova            m2, [pb_0to63]
    187    vpbroadcastb    m1, xm1
    188    psubb           m0, m1
    189    pminub          m0, m2
    190    pshufb          m3, m0
    191    vpbroadcastd    m0, [pb_m2_m1]
    192    psubb           m0, m1
    193    pminub          m0, m2
    194    pshufb          m4, m0
    195    vpbroadcastd    m0, [pb_m10_m9]
    196    psubb           m0, m1
    197    pminub          m0, m2
    198    pshufb          m5, m0
    199    ret
    200 .h: ; horizontal pass: filter one input row into [t1]
    201    mov            r10, wq
    202    test         edgeb, 1 ; LR_HAVE_LEFT
    203    jz .h_extend_left
    204    movq           xm3, [leftq]
    205    vpblendd        m3, [lpfq+r10-8], 0xfc
    206    add          leftq, 8
    207    jmp .h_main
    208 .h_extend_left:
    209    vbroadcasti128  m3, [lpfq+r10] ; avoid accessing memory located
    210    mova            m4, [lpfq+r10] ; before the start of the buffer
    211    shufpd          m3, m4, 0x05
    212    pshufb          m3, [wiener_lshuf7]
    213    jmp .h_main2
    214 .h_top:
    215    mov            r10, wq
    216    test         edgeb, 1 ; LR_HAVE_LEFT
    217    jz .h_extend_left
    218 .h_loop:
    219    movu            m3, [lpfq+r10-8]
    220 .h_main:
    221    mova            m4, [lpfq+r10+0]
    222 .h_main2:
    223    movu            m5, [lpfq+r10+8]
    224    test         edgeb, 2 ; LR_HAVE_RIGHT
    225    jnz .h_have_right
    226    cmp           r10d, -36
    227    jl .h_have_right ; still >36 bytes from the right edge; no padding yet
    228    call .extend_right
    229 .h_have_right:
    230    pshufb          m0, m3, m6
    231    pshufb          m1, m4, m7
    232    paddw           m0, m1
    233    pshufb          m3, m8
    234    pmaddwd         m0, m12
    235    pshufb          m1, m4, m9
    236    paddw           m3, m1
    237    pshufb          m1, m4, m6
    238    pmaddwd         m3, m13
    239    pshufb          m2, m5, m7
    240    paddw           m1, m2
    241    vpbroadcastd    m2, [pd_m262128] ; (1 << 4) - (1 << 18)
    242    pshufb          m4, m8
    243    pmaddwd         m1, m12
    244    pshufb          m5, m9
    245    paddw           m4, m5
    246    pmaddwd         m4, m13
    247    paddd           m0, m2
    248    paddd           m1, m2
    249    paddd           m0, m3
    250    paddd           m1, m4
    251    psrad           m0, 4
    252    psrad           m1, 4
    253    packssdw        m0, m1
    254    psraw           m0, 1
    255    mova      [t1+r10], m0
    256    add            r10, 32
    257    jl .h_loop
    258    ret
    259 ALIGN function_align
    260 .hv: ; horizontal pass into [t0] + vertical 7-tap over t0-t6 -> dst
    261    add           lpfq, strideq
    262    mov            r10, wq
    263    test         edgeb, 1 ; LR_HAVE_LEFT
    264    jz .hv_extend_left
    265    movq           xm3, [leftq]
    266    vpblendd        m3, [lpfq+r10-8], 0xfc
    267    add          leftq, 8
    268    jmp .hv_main
    269 .hv_extend_left:
    270    movu            m3, [lpfq+r10-8]
    271    pshufb          m3, [wiener_lshuf7]
    272    jmp .hv_main
    273 .hv_bottom:
    274    mov            r10, wq
    275    test         edgeb, 1 ; LR_HAVE_LEFT
    276    jz .hv_extend_left
    277 .hv_loop:
    278    movu            m3, [lpfq+r10-8]
    279 .hv_main:
    280    mova            m4, [lpfq+r10+0]
    281    movu            m5, [lpfq+r10+8]
    282    test         edgeb, 2 ; LR_HAVE_RIGHT
    283    jnz .hv_have_right
    284    cmp           r10d, -36
    285    jl .hv_have_right
    286    call .extend_right
    287 .hv_have_right:
    288    pshufb          m0, m3, m6
    289    pshufb          m1, m4, m7
    290    paddw           m0, m1
    291    pshufb          m3, m8
    292    pmaddwd         m0, m12
    293    pshufb          m1, m4, m9
    294    paddw           m3, m1
    295    pshufb          m1, m4, m6
    296    pmaddwd         m3, m13
    297    pshufb          m2, m5, m7
    298    paddw           m1, m2
    299    vpbroadcastd    m2, [pd_m262128]
    300    pshufb          m4, m8
    301    pmaddwd         m1, m12
    302    pshufb          m5, m9
    303    paddw           m4, m5
    304    pmaddwd         m4, m13
    305    paddd           m0, m2
    306    paddd           m1, m2
    307    mova            m2, [t4+r10]
    308    paddw           m2, [t2+r10]
    309    mova            m5, [t3+r10]
    310    paddd           m0, m3
    311    paddd           m1, m4
    312    psrad           m0, 4
    313    psrad           m1, 4
    314    packssdw        m0, m1
    315    mova            m4, [t5+r10]
    316    paddw           m4, [t1+r10]
    317    psraw           m0, 1
    318    paddw           m3, m0, [t6+r10]
    319    mova      [t0+r10], m0
    320    punpcklwd       m0, m2, m5
    321    pmaddwd         m0, m15
    322    punpckhwd       m2, m5
    323    pmaddwd         m2, m15
    324    punpcklwd       m1, m3, m4
    325    pmaddwd         m1, m14
    326    punpckhwd       m3, m4
    327    pmaddwd         m3, m14
    328    paddd           m0, m10
    329    paddd           m2, m10
    330    paddd           m0, m1
    331    paddd           m2, m3
    332    psrad           m0, 5
    333    psrad           m2, 5
    334    packusdw        m0, m2
    335    pmulhuw         m0, m11
    336    mova    [dstq+r10], m0
    337    add            r10, 32
    338    jl .hv_loop
    339    mov             t6, t5 ; rotate the row ring buffer
    340    mov             t5, t4
    341    mov             t4, t3
    342    mov             t3, t2
    343    mov             t2, t1
    344    mov             t1, t0
    345    mov             t0, t6
    346    add           dstq, strideq
    347    ret
    348 .v: ; vertical pass only (rows already buffered in t1-t6)
    349    mov            r10, wq
    350 .v_loop:
    351    mova            m1, [t4+r10]
    352    paddw           m1, [t2+r10]
    353    mova            m2, [t3+r10]
    354    mova            m4, [t1+r10]
    355    paddw           m3, m4, [t6+r10]
    356    paddw           m4, [t5+r10]
    357    punpcklwd       m0, m1, m2
    358    pmaddwd         m0, m15
    359    punpckhwd       m1, m2
    360    pmaddwd         m1, m15
    361    punpcklwd       m2, m3, m4
    362    pmaddwd         m2, m14
    363    punpckhwd       m3, m4
    364    pmaddwd         m3, m14
    365    paddd           m0, m10
    366    paddd           m1, m10
    367    paddd           m0, m2
    368    paddd           m1, m3
    369    psrad           m0, 5
    370    psrad           m1, 5
    371    packusdw        m0, m1
    372    pmulhuw         m0, m11
    373    mova    [dstq+r10], m0
    374    add            r10, 32
    375    jl .v_loop
    376    mov             t6, t5 ; shift the ring buffer down one row
    377    mov             t5, t4
    378    mov             t4, t3
    379    mov             t3, t2
    380    mov             t2, t1
    381    add           dstq, strideq
    382    ret
    383 
; 5-tap Wiener filter, high bitdepth. Arguments: dst, stride, left, lpf,
; w, h, edge flags, flt (filter coefs).
; edge bits: 1=LR_HAVE_LEFT, 2=LR_HAVE_RIGHT, 4=LR_HAVE_TOP, 8=LR_HAVE_BOTTOM.
; t0-t4 hold pointers to horizontally-filtered rows in a stack ring buffer;
; the vertical 5-tap pass combines those rows into one output row per step.
    384 cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
    385                                                   w, h, edge, flt
    386 %define base t4-wiener_hshift
    387    mov           fltq, r6mp
    388    movifnidn       wd, wm
    389    movifnidn       hd, hm
    390    mov          edged, r7m
    391    mov            t3d, r8m ; pixel_max
    392    vbroadcasti128  m5, [wiener_shufE]
    393    vpbroadcastw   m11, [fltq+ 2] ; x1
    394    vbroadcasti128  m6, [wiener_shufB]
    395    lea             t4, [wiener_hshift]
    396    vbroadcasti128  m7, [wiener_shufD]
    397    add             wd, wd ; pixel width -> byte width (2 bytes/px)
    398    vpbroadcastd   m12, [fltq+ 4] ; x2 x3
    399    shr            t3d, 11 ; pixel_max>>11: 0 for 10bpc, 1 for 12bpc
    400    vpbroadcastd    m8, [pd_m262128] ; (1 << 4) - (1 << 18)
    401    add           lpfq, wq
    402    vpbroadcastw   m13, [fltq+18] ; y1
    403    add           dstq, wq
    404    vpbroadcastd   m14, [fltq+20] ; y2 y3
    405    lea             t1, [rsp+wq+16] ; first intermediate row buffer
    406    neg             wq ; negative byte offset; per-row r10 counts up to 0
    407    vpbroadcastd    m0, [base+wiener_hshift+t3*4]
    408    vpbroadcastd    m9, [base+wiener_round+t3*4]
    409    vpbroadcastd   m10, [base+wiener_vshift+t3*4]
    410    mova           m15, [wiener_lshuf5]
    411    pmullw         m11, m0 ; upshift filter coefs (same scheme as filter7)
    412    pmullw         m12, m0
    413    test         edgeb, 4 ; LR_HAVE_TOP
    414    jz .no_top
    415    call .h_top
    416    add           lpfq, strideq
    417    mov             t4, t1
    418    add             t1, 384*2
    419    call .h_top
    420    lea            r10, [lpfq+strideq*4]
    421    mov           lpfq, dstq
    422    mov             t3, t1
    423    add             t1, 384*2
    424    add            r10, strideq
    425    mov          [rsp], r10 ; below
    426    call .h
    427    mov             t2, t1
    428    dec             hd
    429    jz .v1
    430    add           lpfq, strideq
    431    add             t1, 384*2
    432    call .h
    433    dec             hd
    434    jz .v2
    435 .main:
    436    mov             t0, t4 ; new rows overwrite the oldest ring-buffer slot
    437 .main_loop:
    438    call .hv
    439    dec             hd
    440    jnz .main_loop
    441    test         edgeb, 8 ; LR_HAVE_BOTTOM
    442    jz .v2
    443    mov           lpfq, [rsp]
    444    call .hv_bottom
    445    add           lpfq, strideq
    446    call .hv_bottom
    447 .end:
    448    RET
    449 .no_top: ; LR_HAVE_TOP unset: pad upwards with the first row
    450    lea            r10, [lpfq+strideq*4]
    451    mov           lpfq, dstq
    452    lea            r10, [r10+strideq*2]
    453    mov          [rsp], r10
    454    call .h
    455    mov             t4, t1
    456    mov             t3, t1
    457    mov             t2, t1
    458    dec             hd
    459    jz .v1
    460    add           lpfq, strideq
    461    add             t1, 384*2
    462    call .h
    463    dec             hd
    464    jz .v2
    465    lea             t0, [t1+384*2]
    466    call .hv
    467    dec             hd
    468    jz .v2
    469    add             t0, 384*6
    470    call .hv
    471    dec             hd
    472    jnz .main
    473 .v2: ; 2 rows left: vertical passes only
    474    call .v
    475    mov             t4, t3
    476    mov             t3, t2
    477    mov             t2, t1
    478    add           dstq, strideq
    479 .v1: ; 1 row left
    480    call .v
    481    jmp .end
; Right-edge padding: clamp per-lane shuffle indices (derived from r10)
; against pb_0to63 so lanes past the right edge in m3/m4 repeat valid
; pixels. Clobbers m0-m2.
    482 .extend_right:
    483    movd           xm2, r10d
    484    vpbroadcastd    m0, [pb_2_3]
    485    vpbroadcastd    m1, [pb_m6_m5]
    486    vpbroadcastb    m2, xm2
    487    psubb           m0, m2
    488    psubb           m1, m2
    489    mova            m2, [pb_0to63]
    490    pminub          m0, m2
    491    pminub          m1, m2
    492    pshufb          m3, m0
    493    pshufb          m4, m1
    494    ret
    495 .h: ; horizontal pass: filter one input row into [t1]
    496    mov            r10, wq
    497    test         edgeb, 1 ; LR_HAVE_LEFT
    498    jz .h_extend_left
    499    movd           xm3, [leftq+4]
    500    vpblendd        m3, [lpfq+r10-4], 0xfe
    501    add          leftq, 8
    502    jmp .h_main
    503 .h_extend_left:
    504    vbroadcasti128  m4, [lpfq+r10] ; avoid accessing memory located
    505    mova            m3, [lpfq+r10] ; before the start of the buffer
    506    palignr         m3, m4, 12
    507    pshufb          m3, m15
    508    jmp .h_main
    509 .h_top:
    510    mov            r10, wq
    511    test         edgeb, 1 ; LR_HAVE_LEFT
    512    jz .h_extend_left
    513 .h_loop:
    514    movu            m3, [lpfq+r10-4]
    515 .h_main:
    516    movu            m4, [lpfq+r10+4]
    517    test         edgeb, 2 ; LR_HAVE_RIGHT
    518    jnz .h_have_right
    519    cmp           r10d, -34
    520    jl .h_have_right ; still >34 bytes from the right edge; no padding yet
    521    call .extend_right
    522 .h_have_right:
    523    pshufb          m0, m3, m5
    524    pmaddwd         m0, m11
    525    pshufb          m1, m4, m5
    526    pmaddwd         m1, m11
    527    pshufb          m2, m3, m6
    528    pshufb          m3, m7
    529    paddw           m2, m3
    530    pshufb          m3, m4, m6
    531    pmaddwd         m2, m12
    532    pshufb          m4, m7
    533    paddw           m3, m4
    534    pmaddwd         m3, m12
    535    paddd           m0, m8
    536    paddd           m1, m8
    537    paddd           m0, m2
    538    paddd           m1, m3
    539    psrad           m0, 4
    540    psrad           m1, 4
    541    packssdw        m0, m1
    542    psraw           m0, 1
    543    mova      [t1+r10], m0
    544    add            r10, 32
    545    jl .h_loop
    546    ret
    547 ALIGN function_align
    548 .hv: ; horizontal pass into [t0] + vertical 5-tap over t0-t4 -> dst
    549    add           lpfq, strideq
    550    mov            r10, wq
    551    test         edgeb, 1 ; LR_HAVE_LEFT
    552    jz .hv_extend_left
    553    movd           xm3, [leftq+4]
    554    vpblendd        m3, [lpfq+r10-4], 0xfe
    555    add          leftq, 8
    556    jmp .hv_main
    557 .hv_extend_left:
    558    movu            m3, [lpfq+r10-4]
    559    pshufb          m3, m15
    560    jmp .hv_main
    561 .hv_bottom:
    562    mov            r10, wq
    563    test         edgeb, 1 ; LR_HAVE_LEFT
    564    jz .hv_extend_left
    565 .hv_loop:
    566    movu            m3, [lpfq+r10-4]
    567 .hv_main:
    568    movu            m4, [lpfq+r10+4]
    569    test         edgeb, 2 ; LR_HAVE_RIGHT
    570    jnz .hv_have_right
    571    cmp           r10d, -34
    572    jl .hv_have_right
    573    call .extend_right
    574 .hv_have_right:
    575    pshufb          m0, m3, m5
    576    pmaddwd         m0, m11
    577    pshufb          m1, m4, m5
    578    pmaddwd         m1, m11
    579    pshufb          m2, m3, m6
    580    pshufb          m3, m7
    581    paddw           m2, m3
    582    pshufb          m3, m4, m6
    583    pmaddwd         m2, m12
    584    pshufb          m4, m7
    585    paddw           m3, m4
    586    pmaddwd         m3, m12
    587    paddd           m0, m8
    588    paddd           m1, m8
    589    paddd           m0, m2
    590    mova            m2, [t3+r10]
    591    paddw           m2, [t1+r10]
    592    paddd           m1, m3
    593    mova            m4, [t2+r10]
    594    punpckhwd       m3, m2, m4
    595    pmaddwd         m3, m14
    596    punpcklwd       m2, m4
    597    mova            m4, [t4+r10]
    598    psrad           m0, 4
    599    psrad           m1, 4
    600    packssdw        m0, m1
    601    pmaddwd         m2, m14
    602    psraw           m0, 1
    603    mova      [t0+r10], m0
    604    punpckhwd       m1, m0, m4
    605    pmaddwd         m1, m13
    606    punpcklwd       m0, m4
    607    pmaddwd         m0, m13
    608    paddd           m3, m9
    609    paddd           m2, m9
    610    paddd           m1, m3
    611    paddd           m0, m2
    612    psrad           m1, 5
    613    psrad           m0, 5
    614    packusdw        m0, m1
    615    pmulhuw         m0, m10
    616    mova    [dstq+r10], m0
    617    add            r10, 32
    618    jl .hv_loop
    619    mov             t4, t3 ; rotate the row ring buffer
    620    mov             t3, t2
    621    mov             t2, t1
    622    mov             t1, t0
    623    mov             t0, t4
    624    add           dstq, strideq
    625    ret
    626 .v: ; vertical pass only (rows already buffered in t1-t4)
    627    mov            r10, wq
    628 .v_loop:
    629    mova            m0, [t1+r10]
    630    paddw           m2, m0, [t3+r10]
    631    mova            m1, [t2+r10]
    632    mova            m4, [t4+r10]
    633    punpckhwd       m3, m2, m1
    634    pmaddwd         m3, m14
    635    punpcklwd       m2, m1
    636    pmaddwd         m2, m14
    637    punpckhwd       m1, m0, m4
    638    pmaddwd         m1, m13
    639    punpcklwd       m0, m4
    640    pmaddwd         m0, m13
    641    paddd           m3, m9
    642    paddd           m2, m9
    643    paddd           m1, m3
    644    paddd           m0, m2
    645    psrad           m1, 5
    646    psrad           m0, 5
    647    packusdw        m0, m1
    648    pmulhuw         m0, m10
    649    mova    [dstq+r10], m0
    650    add            r10, 32
    651    jl .v_loop
    652    ret
    653 
    654 cglobal sgr_filter_5x5_16bpc, 4, 14, 16, 400*24+16, dst, stride, left, lpf, \
    655                                                    w, h, edge, params
    656 %define base r13-pb_m10_m9
    657    movifnidn       wd, wm
    658    mov        paramsq, r6mp
    659    lea            r13, [pb_m10_m9]
    660    movifnidn       hd, hm
    661    mov          edged, r7m
    662    vpbroadcastw    m7, [paramsq+8] ; w0
    663    add             wd, wd
    664    vpbroadcastd    m8, [base+pd_8]
    665    add           lpfq, wq
    666    vpbroadcastd    m9, [base+pd_25]
    667    add           dstq, wq
    668    mova          xm10, [base+sgr_lshuf5]
    669    lea             t3, [rsp+wq*2+400*12+16]
    670    vpbroadcastd   m11, [paramsq+0] ; s0
    671    lea             t4, [rsp+wq+400*20+16]
    672    vpbroadcastd   m12, [base+pw_164_24]
    673    lea             t1, [rsp+wq+20]
    674    vbroadcastss   m13, [base+pf_256]
    675    neg             wq
    676    vpbroadcastd   m14, [base+pd_34816] ; (1 << 11) + (1 << 15)
    677    pxor            m6, m6
    678    vpbroadcastd   m15, [base+pw_1023]
    679    psllw           m7, 4
    680    test         edgeb, 4 ; LR_HAVE_TOP
    681    jz .no_top
    682    call .h_top
    683    add           lpfq, strideq
    684    mov             t2, t1
    685    call .top_fixup
    686    add             t1, 400*6
    687    call .h_top
    688    lea            r10, [lpfq+strideq*4]
    689    mov           lpfq, dstq
    690    add            r10, strideq
    691    mov          [rsp], r10 ; below
    692    mov             t0, t2
    693    dec             hd
    694    jz .height1
    695    or           edged, 16
    696    call .h
    697 .main:
    698    add           lpfq, strideq
    699    call .hv
    700    call .prep_n
    701    sub             hd, 2
    702    jl .extend_bottom
    703 .main_loop:
    704    add           lpfq, strideq
    705    test            hd, hd
    706    jz .odd_height
    707    call .h
    708    add           lpfq, strideq
    709    call .hv
    710    call .n0
    711    call .n1
    712    sub             hd, 2
    713    jge .main_loop
    714    test         edgeb, 8 ; LR_HAVE_BOTTOM
    715    jz .extend_bottom
    716    mov           lpfq, [rsp]
    717    call .h_top
    718    add           lpfq, strideq
    719    call .hv_bottom
    720 .end:
    721    call .n0
    722    call .n1
    723 .end2:
    724    RET
    725 .height1:
    726    call .hv
    727    call .prep_n
    728    jmp .odd_height_end
    729 .odd_height:
    730    call .hv
    731    call .n0
    732    call .n1
    733 .odd_height_end:
    734    call .v
    735    call .n0
    736    jmp .end2
    737 .extend_bottom:
    738    call .v
    739    jmp .end
    740 .no_top:
    741    lea            r10, [lpfq+strideq*4]
    742    mov           lpfq, dstq
    743    lea            r10, [r10+strideq*2]
    744    mov          [rsp], r10
    745    call .h
    746    lea             t2, [t1+400*6]
    747    call .top_fixup
    748    dec             hd
    749    jz .no_top_height1
    750    or           edged, 16
    751    mov             t0, t1
    752    mov             t1, t2
    753    jmp .main
    754 .no_top_height1:
    755    call .v
    756    call .prep_n
    757    jmp .odd_height_end
    758 .extend_right:
    759    vpbroadcastw    m0, [lpfq-2]
    760    movu            m1, [r13+r10+ 0]
    761    movu            m2, [r13+r10+16]
    762    vpblendvb       m4, m0, m1
    763    vpblendvb       m5, m0, m2
    764    ret
    765 .h: ; horizontal boxsum
    766    lea            r10, [wq-4]
    767    test         edgeb, 1 ; LR_HAVE_LEFT
    768    jz .h_extend_left
    769    vpbroadcastq   xm5, [leftq]
    770    vinserti128     m5, [lpfq+wq], 1
    771    mova            m4, [lpfq+wq]
    772    add          leftq, 8
    773    palignr         m4, m5, 10
    774    jmp .h_main
    775 .h_extend_left:
    776    mova           xm4, [lpfq+wq]
    777    pshufb         xm4, xm10
    778    vinserti128     m4, [lpfq+wq+10], 1
    779    jmp .h_main
    780 .h_top:
    781    lea            r10, [wq-4]
    782    test         edgeb, 1 ; LR_HAVE_LEFT
    783    jz .h_extend_left
    784 .h_loop:
    785    movu            m4, [lpfq+r10- 2]
    786 .h_main:
    787    movu            m5, [lpfq+r10+14]
    788    test         edgeb, 2 ; LR_HAVE_RIGHT
    789    jnz .h_have_right
    790    cmp           r10d, -36
    791    jl .h_have_right
    792    call .extend_right
    793 .h_have_right:
    794    palignr         m2, m5, m4, 2
    795    paddw           m0, m4, m2
    796    palignr         m3, m5, m4, 6
    797    paddw           m0, m3
    798    punpcklwd       m1, m2, m3
    799    pmaddwd         m1, m1
    800    punpckhwd       m2, m3
    801    pmaddwd         m2, m2
    802    shufpd          m5, m4, m5, 0x05
    803    paddw           m0, m5
    804    punpcklwd       m3, m4, m5
    805    pmaddwd         m3, m3
    806    paddd           m1, m3
    807    punpckhwd       m3, m4, m5
    808    pmaddwd         m3, m3
    809    shufps          m4, m5, q2121
    810    paddw           m0, m4             ; sum
    811    punpcklwd       m5, m4, m6
    812    pmaddwd         m5, m5
    813    punpckhwd       m4, m6
    814    pmaddwd         m4, m4
    815    paddd           m2, m3
    816    test         edgeb, 16             ; y > 0
    817    jz .h_loop_end
    818    paddw           m0, [t1+r10+400*0]
    819    paddd           m1, [t1+r10+400*2]
    820    paddd           m2, [t1+r10+400*4]
    821 .h_loop_end:
    822    paddd           m1, m5             ; sumsq
    823    paddd           m2, m4
    824    mova [t1+r10+400*0], m0
    825    mova [t1+r10+400*2], m1
    826    mova [t1+r10+400*4], m2
    827    add            r10, 32
    828    jl .h_loop
    829    ret
    830 .top_fixup:
    831    lea            r10, [wq-4]
    832 .top_fixup_loop: ; the sums of the first row needs to be doubled
    833    mova            m0, [t1+r10+400*0]
    834    mova            m1, [t1+r10+400*2]
    835    mova            m2, [t1+r10+400*4]
    836    paddw           m0, m0
    837    paddd           m1, m1
    838    paddd           m2, m2
    839    mova [t2+r10+400*0], m0
    840    mova [t2+r10+400*2], m1
    841    mova [t2+r10+400*4], m2
    842    add            r10, 32
    843    jl .top_fixup_loop
    844    ret
    845 ALIGN function_align
;------------------------------------------------------------------------------
; .hv (5x5 box): horizontal boxsum of one input row fused with the vertical
; boxsum and the a/b coefficient computation.
; r10 walks the row in 32-byte steps as a negative offset from the row end
; (wq was negated by the prologue). t1/t2 hold previous rows' sums; t0
; receives this row's sums and the three pointers are rotated on exit.
; Results: packed x values -> [t4], b values -> [t3].
; NOTE(review): m6 is assumed all-zero and m8..m15 to hold the constants set
; up by the function prologue (outside this excerpt); the inline comments
; (a * 25, b * 164, ...) reflect that setup -- confirm against the prologue.
;------------------------------------------------------------------------------
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    ; left edge available: splice the 4 left-padding pixels in front of the row
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv_main
.hv_extend_left:
    ; no left edge: replicate the leftmost pixels via the sgr_lshuf5 shuffle
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv_main
.hv_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu            m4, [lpfq+r10- 2]
.hv_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp           r10d, -36             ; only the last iteration may overread
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    ; 5-tap horizontal sums: combine the five shifted copies of the row
    palignr         m3, m5, m4, 2
    paddw           m0, m4, m3
    palignr         m1, m5, m4, 6
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    shufpd          m5, m4, m5, 0x05
    paddw           m0, m5
    punpcklwd       m1, m4, m5
    pmaddwd         m1, m1
    paddd           m2, m1
    punpckhwd       m1, m4, m5
    pmaddwd         m1, m1
    shufps          m4, m5, q2121
    paddw           m0, m4            ; h sum
    punpcklwd       m5, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m4, m6
    pmaddwd         m4, m4
    paddd           m3, m1
    paddd           m2, m5            ; h sumsq
    paddd           m3, m4
    ; vertical accumulation with the two previously stored rows
    paddw           m1, m0, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    test            hd, hd
    jz .hv_last_row
.hv_main2:
    paddw           m1, [t2+r10+400*0] ; hv sum
    paddd           m4, [t2+r10+400*2] ; hv sumsq
    paddd           m5, [t2+r10+400*4]
    mova [t0+r10+400*0], m0
    mova [t0+r10+400*2], m2
    mova [t0+r10+400*4], m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6             ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9             ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2             ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2             ; clamp so the variance can't go negative
    pmaxud          m5, m3
    psubd           m4, m2             ; p
    psubd           m5, m3
    pmulld          m4, m11            ; p * s
    pmulld          m5, m11
    pmaddwd         m0, m12            ; b * 164
    pmaddwd         m1, m12
    paddw           m4, m12            ; m12 doubles as the rounding bias here
    paddw           m5, m12
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    ; approximate 256/(z+1) with rcpps; positive IEEE floats order the same
    ; as their integer bit patterns, so pcmpgtd against 256.0 gives a z<255
    ; mask without leaving the SIMD domain
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m13, m4
    pcmpgtd         m5, m13, m5
    mulps           m2, m13            ; 256 / (z + 1)
    mulps           m3, m13
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m14            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    mova    [t4+r10+4], m2
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova         [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova         [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add            r10, 32
    jl .hv_loop
    ; rotate the row-sum buffers: swap t0/t1, t2 = previous t1
    mov             t2, t1
    mov             t1, t0
    mov             t0, t2
    ret
.hv_last_row: ; esoteric edge case for odd heights
    ; no further row follows: double the current row so the vertical sum
    ; still covers the nominal window, then rejoin the main path
    mova [t1+r10+400*0], m1
    paddw            m1, m0
    mova [t1+r10+400*2], m4
    paddd            m4, m2
    mova [t1+r10+400*4], m5
    paddd            m5, m3
    jmp .hv_main2
;------------------------------------------------------------------------------
; .v (5x5 box): vertical-only variant of .hv -- no new input row is read;
; the row stored in t1 is counted twice (paddw m0, m0 below) to emulate row
; replication at the frame edge, then combined with t2 and fed through the
; same a/b computation as .hv. Outputs x -> [t4], b -> [t3].
; NOTE(review): register constants m6/m8..m15 as in .hv (set by prologue,
; outside this excerpt).
;------------------------------------------------------------------------------
.v: ; vertical boxsum + ab
    lea            r10, [wq-4]
.v_loop:
    mova            m0, [t1+r10+400*0]
    mova            m2, [t1+r10+400*2]
    mova            m3, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    paddw           m0, m0              ; count the t1 row twice (edge replication)
    paddd           m2, m2
    paddd           m3, m3
    paddw           m1, m0             ; hv sum
    paddd           m4, m2             ; hv sumsq
    paddd           m5, m3
    psrlw           m3, m1, 1
    paddd           m4, m8
    pavgw           m3, m6             ; (b + 2) >> 2
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    punpcklwd       m2, m3, m6
    psrld           m5, 4
    punpckhwd       m3, m6
    pmulld          m4, m9             ; a * 25
    pmulld          m5, m9
    pmaddwd         m2, m2             ; b * b
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    pmaxud          m5, m3
    psubd           m4, m2             ; p
    psubd           m5, m3
    pmulld          m4, m11            ; p * s
    pmulld          m5, m11
    pmaddwd         m0, m12            ; b * 164
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    ; reciprocal approximation of 256/(z+1), see the comment in .hv
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m13, m4
    pcmpgtd         m5, m13, m5
    mulps           m2, m13            ; 256 / (z + 1)
    mulps           m3, m13
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m14            ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    mova    [t4+r10+4], m2
    psrld           m0, 12             ; b
    psrld           m1, 12
    mova         [t3+r10*2+ 8], xm0
    vextracti128 [t3+r10*2+40], m0, 1
    mova         [t3+r10*2+24], xm1
    vextracti128 [t3+r10*2+56], m1, 1
    add            r10, 32
    jl .v_loop
    ret
;------------------------------------------------------------------------------
; .prep_n (5x5): initial neighbor setup. For each pixel, builds the weighted
; horizontal neighbor sums of x (from t4) and b (from t3): with the 3-tap sum
; s = l+c+r, the result is c + 5*s (the "565" row weights; two such rows are
; later combined per output row by .n0/.n1). Stored at +400*2 / +400*4 so
; .n0 can pair them with the following row.
;------------------------------------------------------------------------------
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]   ; m3 = 3-tap sum s
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3              ; c + s
    psllw           m3, 2               ; 4*s
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    add            r10, 32
    jl .prep_n_loop
    ret
   1074 ALIGN function_align
;------------------------------------------------------------------------------
; .n0 (5x5): even output rows. Computes this row's 565 weights (same math as
; .prep_n), adds them to the stored previous-row weights to get the full
; two-row neighborhood, stores the fresh weights for the next (odd) row,
; then applies the filter: dst += ((b - a*src) >> 9) * w via pmulhrsw.
; NOTE(review): m7 holds the pre-shifted sgr weight and m15 the pixel max --
; set by the function prologue outside this excerpt.
;------------------------------------------------------------------------------
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    movu            m0, [t4+r10*1+ 2]
    movu            m1, [t3+r10*2+ 4]
    movu            m2, [t3+r10*2+36]
    paddw           m3, m0, [t4+r10*1+ 0]
    paddd           m4, m1, [t3+r10*2+ 0]
    paddd           m5, m2, [t3+r10*2+32]
    paddw           m3, [t4+r10*1+ 4]
    paddd           m4, [t3+r10*2+ 8]
    paddd           m5, [t3+r10*2+40]
    paddw           m0, m3
    psllw           m3, 2
    paddd           m1, m4
    pslld           m4, 2
    paddd           m2, m5
    pslld           m5, 2
    paddw           m0, m3             ; a 565
    paddd           m1, m4             ; b 565
    paddd           m2, m5
    ; combine with the previous row's 565 weights, then replace them
    paddw           m3, m0, [t4+r10*1+400*2+ 0]
    paddd           m4, m1, [t3+r10*2+400*4+ 0]
    paddd           m5, m2, [t3+r10*2+400*4+32]
    mova [t4+r10*1+400*2+ 0], m0
    mova [t3+r10*2+400*4+ 0], m1
    mova [t3+r10*2+400*4+32], m2
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1              ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    ; reorder the two dword halves so they match the word lanes after packssdw
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2              ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7              ; apply sgr weight, with rounding
    paddw           m0, m1
    pmaxsw          m0, m6              ; clamp to [0, bitdepth_max]
    pminsw          m0, m15
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n0_loop
    add           dstq, strideq
    ret
   1125 ALIGN function_align
;------------------------------------------------------------------------------
; .n1 (5x5): odd output rows. No new weights are computed; the 565 weights
; stored by .n0 are reused directly (single-row contribution, hence the
; smaller >> 8 instead of >> 9), then the same weighted output as .n0.
;------------------------------------------------------------------------------
.n1: ; neighbor + output (odd rows)
    mov            r10, wq
.n1_loop:
    mova            m0, [dstq+r10]
    mova            m3, [t4+r10*1+400*2+ 0]
    mova            m4, [t3+r10*2+400*4+ 0]
    mova            m5, [t3+r10*2+400*4+32]
    punpcklwd       m1, m0, m6          ; src
    punpcklwd       m2, m3, m6          ; a
    pmaddwd         m2, m1              ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2              ; b - a * src + (1 << 7)
    psubd           m4, m3
    psrad           m1, 8
    psrad           m4, 8
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6              ; clamp to [0, bitdepth_max]
    pminsw          m0, m15
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n1_loop
    add           dstq, strideq
    ret
   1155 
;------------------------------------------------------------------------------
; void sgr_filter_3x3_16bpc(pixel *dst, ptrdiff_t stride, const pixel *left,
;                           const pixel *lpf, int w, int h, int edge,
;                           const SgrParams *params)  [prototype inferred from
;                           register usage -- confirm against C header]
; Self-guided restoration, 3x3 (radius-1) box, 16 bpc, AVX2.
; Entry: widens w to bytes (add wd, wd), biases lpf/dst by wq and negates wq
; so all row loops index with negative offsets. Stack scratch: t1 (current
; row sums), t2 (previous row sums), t3 (b values), t4 (x values).
; Register constants for the helpers below:
;   m6  = 0                   m7  = w1 << 4 (sgr weight)
;   m8  = pd_8                m9  = s1 (sgr strength)
;   m10 = sgr_lshuf3          m11 = pw_455_24
;   m12 = 256.0f (pf_256)     m13 = pd_34816
;   m14 = pw_1023 (pixel max)
; Control flow processes two rows per .main_loop iteration (.hv0 even,
; .hv1 odd), with dedicated tails for missing top/bottom edges and odd
; heights.
;------------------------------------------------------------------------------
cglobal sgr_filter_3x3_16bpc, 4, 14, 15, 400*42+8, dst, stride, left, lpf, \
                                                  w, h, edge, params
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [pb_m10_m9]
    add             wd, wd              ; 16-bit pixels: width in bytes
    movifnidn       hd, hm
    mov          edged, r7m
    vpbroadcastw    m7, [paramsq+10] ; w1
    add           lpfq, wq
    vpbroadcastd    m8, [base+pd_8]
    add           dstq, wq
    vpbroadcastd    m9, [paramsq+ 4] ; s1
    lea             t3, [rsp+wq*2+400*12+8]
    mova          xm10, [base+sgr_lshuf3]
    lea             t4, [rsp+wq+400*32+8]
    vpbroadcastd   m11, [base+pw_455_24]
    lea             t1, [rsp+wq+12]
    vbroadcastss   m12, [base+pf_256]
    neg             wq
    vpbroadcastd   m13, [base+pd_34816]
    pxor            m6, m6
    vpbroadcastd   m14, [base+pw_1023]
    psllw           m7, 4
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; two rows above the unit come from the lpf buffer
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    add             t1, 400*6
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop:
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]           ; bottom rows come from the lpf buffer
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    ; no bottom edge: replicate the last row vertically via .v0/.v1
    call .v0
    call .v1
    jmp .end
.no_top:
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea            r10, [wq-4]
    lea             t2, [t1+400*6]
.top_fixup_loop:
    ; duplicate the first row's sums into t2 to fake the missing row above
    mova            m0, [t1+r10+400*0]
    mova            m1, [t1+r10+400*2]
    mova            m2, [t1+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m1
    mova [t2+r10+400*4], m2
    add            r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
;------------------------------------------------------------------------------
; .extend_right: right-edge padding for the .h/.hv* loaders. Broadcasts the
; last valid pixel and blends it over the out-of-bounds lanes of m4/m5,
; using position-dependent masks loaded from the pb_m10_m9 table (indexed by
; the loop counter r10, so the blend boundary tracks the current column).
;------------------------------------------------------------------------------
.extend_right:
    vpbroadcastw    m0, [lpfq-2]        ; last valid pixel of the row
    movu            m1, [r13+r10+ 2]    ; blend masks (r13 = pb_m10_m9)
    movu            m2, [r13+r10+18]
    vpblendvb       m4, m0, m1
    vpblendvb       m5, m0, m2
    ret
;------------------------------------------------------------------------------
; .h (3x3): horizontal 3-tap boxsum of one row, no vertical accumulation.
; Stores sum -> t1+400*0 and sumsq -> t1+400*2/400*4. Used for the priming
; rows (.h_top) and the first row in the .no_top path.
; m6 = 0, m10 = sgr_lshuf3 (left-edge replication shuffle).
;------------------------------------------------------------------------------
.h: ; horizontal boxsum
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    ; left edge available: splice the left-padding pixels in front of the row
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .h_main
.h_extend_left:
    ; no left edge: replicate the leftmost pixel via the shuffle in xm10
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10+ 0]
.h_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -34             ; only the last iteration may overread
    jl .h_have_right
    call .extend_right
.h_have_right:
    ; 3-tap sums from three shifted copies of the row
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5             ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4             ; sumsq
    paddd           m3, m5
    mova [t1+r10+400*0], m1
    mova [t1+r10+400*2], m2
    mova [t1+r10+400*4], m3
    add            r10, 32
    jl .h_loop
    ret
   1314 ALIGN function_align
;------------------------------------------------------------------------------
; .hv0 (3x3, even rows): horizontal boxsum of a new row, then the 3-row
; vertical sum (new row + t1 + t2) and the a/b coefficient computation.
; The new row's sums replace t1; the 2-row partials replace t2 for use by
; the following .hv1. Outputs x -> t4+400*0, b -> t3+400*0.
; Constants per the function header: m6=0, m8=pd_8, m9=s1, m11=pw_455_24,
; m12=256.0f, m13=pd_34816.
;------------------------------------------------------------------------------
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .hv0_main
.hv0_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10+ 0]
.hv0_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -34
    jl .hv0_have_right
    call .extend_right
.hv0_have_right:
    palignr         m0, m5, m4, 2
    paddw           m1, m4, m0
    punpcklwd       m2, m4, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m0
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m1, m5             ; sum
    punpcklwd       m4, m5, m6
    pmaddwd         m4, m4
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m4             ; sumsq
    paddd           m3, m5
    ; 2-row partials with t1, then 3-row totals with t2; store the new
    ; row into t1 and the 2-row partials into t2 for the next .hv1
    paddw           m0, m1, [t1+r10+400*0]
    paddd           m4, m2, [t1+r10+400*2]
    paddd           m5, m3, [t1+r10+400*4]
    mova [t1+r10+400*0], m1
    mova [t1+r10+400*2], m2
    mova [t1+r10+400*4], m3
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2             ; clamp so the variance can't go negative
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11            ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11            ; m11 doubles as the rounding bias here
    paddw           m5, m11
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    ; approximate 256/(z+1) with rcpps; positive IEEE floats order the same
    ; as their integer bit patterns, so pcmpgtd against 256.0 gives a z<255 mask
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12            ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m2
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .hv0_loop
    ret
   1427 ALIGN function_align
;------------------------------------------------------------------------------
; .hv1 (3x3, odd rows): horizontal boxsum of a new row, combined with the
; 2-row partials left in t2 by .hv0 to form the 3-row totals; the new row's
; sums replace t2, and t1/t2 are swapped on exit so the ping-pong continues.
; Outputs x -> t4+400*2, b -> t3+400*4 (the "odd row" slots).
; Constants per the function header: m6=0, m8=pd_8, m9=s1, m11=pw_455_24,
; m12=256.0f, m13=pd_34816.
;------------------------------------------------------------------------------
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 12
    jmp .hv1_main
.hv1_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, xm10
    vinserti128     m4, [lpfq+wq+12], 1
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10+ 0]
.hv1_main:
    movu            m5, [lpfq+r10+16]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -34
    jl .hv1_have_right
    call .extend_right
.hv1_have_right:
    palignr         m1, m5, m4, 2
    paddw           m0, m4, m1
    punpcklwd       m2, m4, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m4, m1
    pmaddwd         m3, m3
    palignr         m5, m4, 4
    paddw           m0, m5             ; h sum
    punpcklwd       m1, m5, m6
    pmaddwd         m1, m1
    punpckhwd       m5, m6
    pmaddwd         m5, m5
    paddd           m2, m1             ; h sumsq
    paddd           m3, m5
    ; 3-row totals from the 2-row partials in t2; store this row into t2
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m4, m2, [t2+r10+400*2]
    paddd           m5, m3, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m2
    mova [t2+r10+400*4], m3
    paddd           m4, m8
    paddd           m5, m8
    psrld           m4, 4              ; (a + 8) >> 4
    psrld           m5, 4
    pslld           m2, m4, 3
    pslld           m3, m5, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11            ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    ; reciprocal approximation of 256/(z+1), see the comment in .hv0
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12            ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*2 +4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .hv1_loop
    ; swap t1/t2 for the next row pair
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
;------------------------------------------------------------------------------
; .v0 (3x3, even rows): vertical-only variant of .hv0 -- no new row is read;
; the row in t1 is doubled (edge replication) and combined with t2 to form
; the 3-row totals, then fed through the same a/b computation.
; Outputs x -> t4+400*0, b -> t3+400*0 (the "even row" slots).
;------------------------------------------------------------------------------
.v0: ; vertical boxsums + ab (even rows)
    lea            r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m0, m0              ; count the t1 row twice (edge replication)
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11            ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    ; reciprocal approximation of 256/(z+1), see the comment in .hv0
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12            ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m2
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .v0_loop
    ret
;------------------------------------------------------------------------------
; .v1 (3x3, odd rows): vertical-only variant of .hv1 -- combines the t1 row
; with the partials in t2 (no new input), stores t1's row into t2, swaps
; t1/t2 on exit. Outputs x -> t4+400*2, b -> t3+400*4 (the "odd row" slots).
;------------------------------------------------------------------------------
.v1: ; vertical boxsums + ab (odd rows)
    lea            r10, [wq-4]
.v1_loop:
    mova            m0, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    paddw           m1, m0, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    mova [t2+r10+400*0], m0
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    paddd           m2, m8
    paddd           m3, m8
    psrld           m2, 4              ; (a + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m6             ; (b + 2) >> 2
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m2
    punpckhwd       m3, m6
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m6         ; b
    punpckhwd       m1, m6
    pmaxud          m4, m2
    psubd           m4, m2             ; p
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m9             ; p * s
    pmulld          m5, m9
    pmaddwd         m0, m11            ; b * 455
    pmaddwd         m1, m11
    paddw           m4, m11
    paddw           m5, m11
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    ; reciprocal approximation of 256/(z+1), see the comment in .hv0
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m12, m4
    pcmpgtd         m5, m12, m5
    mulps           m2, m12            ; 256 / (z + 1)
    mulps           m3, m12
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m13            ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m13
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .v1_loop
    ; swap t1/t2 for the next row pair
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
;------------------------------------------------------------------------------
; .prep_n (3x3): initial neighbor setup. For the first two rows, builds the
; 343/444 horizontal neighbor weights of x (t4) and b (t3): with the 3-tap
; sum s = l+c+r, 4*s is the "444" weight and 4*s - s' (see the psub below)
; the "343" weight. Stores them in the per-row slots consumed by .n0/.n1.
; Note the 16-byte step: x rows are half the width of the 32-bit b rows.
;------------------------------------------------------------------------------
.prep_n: ; initial neighbor setup
    mov            r10, wq
.prep_n_loop:
    mova           xm0, [t4+r10*1+400*0+0]
    paddw          xm0, [t4+r10*1+400*0+4]
    paddw          xm2, xm0, [t4+r10*1+400*0+2]
    mova            m1, [t3+r10*2+400*0+0]
    paddd           m1, [t3+r10*2+400*0+8]
    paddd           m3, m1, [t3+r10*2+400*0+4]
    psllw          xm2, 2                ; a[-1] 444
    pslld           m3, 2                ; b[-1] 444
    psubw          xm2, xm0              ; a[-1] 343
    psubd           m3, m1               ; b[-1] 343
    mova [t4+r10*1+400* 4], xm2
    mova [t3+r10*2+400* 8], m3
    mova           xm0, [t4+r10*1+400*2+0]
    paddw          xm0, [t4+r10*1+400*2+4]
    paddw          xm2, xm0, [t4+r10*1+400*2+2]
    mova            m1, [t3+r10*2+400*4+0]
    paddd           m1, [t3+r10*2+400*4+8]
    paddd           m3, m1, [t3+r10*2+400*4+4]
    psllw          xm2, 2                 ; a[ 0] 444
    pslld           m3, 2                 ; b[ 0] 444
    mova [t4+r10*1+400* 6], xm2
    mova [t3+r10*2+400*12], m3
    psubw          xm2, xm0               ; a[ 0] 343
    psubd           m3, m1                ; b[ 0] 343
    mova [t4+r10*1+400* 8], xm2
    mova [t3+r10*2+400*16], m3
    add            r10, 16
    jl .prep_n_loop
    ret
   1713 ALIGN function_align
;------------------------------------------------------------------------------
; .n0 (3x3, even rows): computes the next row's 343/444 weights, combines
; them with the stored weights of the two rows above (the 3-4-3 vertical
; weighting), updates the stored slots, then applies the filter:
; dst += ((b - a*src) >> 9) * w via pmulhrsw, clamped to [0, pixel max].
; m7 = w1 << 4, m14 = pw_1023 (per the function header).
;------------------------------------------------------------------------------
.n0: ; neighbor + output (even rows)
    mov            r10, wq
.n0_loop:
    ; a weights (x, 16-bit)
    mova            m3, [t4+r10*1+400*0+0]
    paddw           m3, [t4+r10*1+400*0+4]
    paddw           m1, m3, [t4+r10*1+400*0+2]
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*4]
    paddw           m3, [t4+r10*1+400*6]
    mova [t4+r10*1+400*4], m2
    mova [t4+r10*1+400*6], m1
    ; b weights (32-bit), low half
    mova            m4, [t3+r10*2+400*0+0]
    paddd           m4, [t3+r10*2+400*0+8]
    paddd           m1, m4, [t3+r10*2+400*0+4]
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m4           ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400* 8+ 0]
    paddd           m4, [t3+r10*2+400*12+ 0]
    mova [t3+r10*2+400* 8+ 0], m2
    mova [t3+r10*2+400*12+ 0], m1
    ; b weights (32-bit), high half
    mova            m5, [t3+r10*2+400*0+32]
    paddd           m5, [t3+r10*2+400*0+40]
    paddd           m1, m5, [t3+r10*2+400*0+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400* 8+32]
    paddd           m5, [t3+r10*2+400*12+32]
    mova [t3+r10*2+400* 8+32], m2
    mova [t3+r10*2+400*12+32], m1
    ; weighted output
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    ; reorder the two dword halves so they match the word lanes after packssdw
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2               ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7               ; apply sgr weight, with rounding
    paddw           m0, m1
    pmaxsw          m0, m6               ; clamp to [0, bitdepth_max]
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
; Odd-row twin of .n0: identical math, but reads the odd-row a/b slots
; (t4+400*2 / t3+400*4) and rotates the 400*6/8 (t4) and 400*12/16 (t3)
; ring entries, with the 444/343 store order swapped relative to .n0 so the
; two routines alternate correctly.
    mov            r10, wq
.n1_loop:
    mova            m3, [t4+r10*1+400*2+0]
    paddw           m3, [t4+r10*1+400*2+4]
    paddw           m1, m3, [t4+r10*1+400*2+2]
    psllw           m1, 2                ; a[ 1] 444
    psubw           m2, m1, m3           ; a[ 1] 343
    paddw           m3, m2, [t4+r10*1+400*6]
    paddw           m3, [t4+r10*1+400*8]
    mova [t4+r10*1+400*6], m1
    mova [t4+r10*1+400*8], m2
    mova            m4, [t3+r10*2+400*4+0]
    paddd           m4, [t3+r10*2+400*4+8]
    paddd           m1, m4, [t3+r10*2+400*4+4]
    pslld           m1, 2                ; b[ 1] 444
    psubd           m2, m1, m4           ; b[ 1] 343
    paddd           m4, m2, [t3+r10*2+400*12+ 0]
    paddd           m4, [t3+r10*2+400*16+ 0]
    mova [t3+r10*2+400*12+ 0], m1
    mova [t3+r10*2+400*16+ 0], m2
    mova            m5, [t3+r10*2+400*4+32]
    paddd           m5, [t3+r10*2+400*4+40]
    paddd           m1, m5, [t3+r10*2+400*4+36]
    pslld           m1, 2
    psubd           m2, m1, m5
    paddd           m5, m2, [t3+r10*2+400*12+32]
    paddd           m5, [t3+r10*2+400*16+32]
    mova [t3+r10*2+400*12+32], m1
    mova [t3+r10*2+400*16+32], m2
    ; output stage, same as .n0
    mova            m0, [dstq+r10]
    punpcklwd       m1, m0, m6
    punpcklwd       m2, m3, m6
    pmaddwd         m2, m1               ; a * src
    punpckhwd       m1, m0, m6
    punpckhwd       m3, m6
    pmaddwd         m3, m1
    vinserti128     m1, m4, xm5, 1
    vperm2i128      m4, m5, 0x31
    psubd           m1, m2               ; b - a * src + (1 << 8)
    psubd           m4, m3
    psrad           m1, 9
    psrad           m4, 9
    packssdw        m1, m4
    pmulhrsw        m1, m7
    paddw           m0, m1
    pmaxsw          m0, m6
    pminsw          m0, m14
    mova    [dstq+r10], m0
    add            r10, 32
    jl .n1_loop
    add           dstq, strideq
    ret
   1821 
;------------------------------------------------------------------------------
; void sgr_filter_mix_16bpc_avx2(pixel *dst, ptrdiff_t stride,
;                                const pixel (*left)[4], const pixel *lpf,
;                                int w, int h, enum LrEdgeFlags edge,
;                                const LooprestorationParams *params)
; Combined 5x5 + 3x3 self-guided restoration. 400*66+8 bytes of stack scratch:
;   t1/t2 = current/previous row boxsum banks (sum3/sumsq3 + sum5/sumsq5),
;   t3/t4 = b (32-bit) and a (16-bit) surrogate planes.
; Broadcast constants: m9=pd_8 rounding, m10=pd_34816 = (1<<11)+(1<<15) bias,
; m11=256.0f (rcpps scaling), m12=pw_455_24, m13=s0, m14=s1, m15=4*(w0,w1).
; wq is byte-doubled and negated so all row loops run r10 from -width to 0.
cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \
                                                   w, h, edge, params
    movifnidn       wd, wm
    mov        paramsq, r6mp
    lea            r13, [pb_m10_m9]       ; r13 = base for RODATA addressing
    add             wd, wd                ; width in bytes (2 bytes/pixel)
    movifnidn       hd, hm
    mov          edged, r7m
    add           lpfq, wq                ; point at row end; negative wq indexes back
    vpbroadcastd   m15, [paramsq+8] ; w0 w1
    add           dstq, wq
    vpbroadcastd   m13, [paramsq+0] ; s0
    lea             t3, [rsp+wq*2+400*24+8]
    vpbroadcastd   m14, [paramsq+4] ; s1
    lea             t4, [rsp+wq+400*52+8]
    vpbroadcastd    m9, [base+pd_8]
    lea             t1, [rsp+wq+12]
    vpbroadcastd   m10, [base+pd_34816]
    neg             wq
    vbroadcastss   m11, [base+pf_256]
    pxor            m7, m7
    vpbroadcastd   m12, [base+pw_455_24]
    psllw          m15, 2
    test         edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    ; with a real top edge: seed t1/t2 from the two rows above the unit
    call .h_top
    add           lpfq, strideq
    mov             t2, t1
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
    add             t1, 400*12
    call .h_top
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    add            r10, strideq
    mov          [rsp], r10 ; below
    call .hv0
.main:
    dec             hd
    jz .height1
    add           lpfq, strideq
    call .hv1
    call .prep_n
    sub             hd, 2
    jl .extend_bottom
.main_loop: ; two rows per iteration: even (.hv0/.n0) then odd (.hv1/.n1)
    add           lpfq, strideq
    call .hv0
    test            hd, hd
    jz .odd_height
    add           lpfq, strideq
    call .hv1
    call .n0
    call .n1
    sub             hd, 2
    jge .main_loop
    test         edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov           lpfq, [rsp]            ; saved "below" pointer
    call .hv0_bottom
    add           lpfq, strideq
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1: ; unit is a single row
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height: ; last (unpaired) row of an odd-height unit
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom: ; no bottom edge pixels: replicate via vertical-only passes
    call .v0
    call .v1
    jmp .end
.no_top: ; no top edge: duplicate the first row's boxsums into t2 (doubled)
    lea            r10, [lpfq+strideq*4]
    mov           lpfq, dstq
    lea            r10, [r10+strideq*2]
    mov          [rsp], r10
    call .h
    lea            r10, [wq-4]
    lea             t2, [t1+400*12]
.top_fixup_loop:
    mova            m0, [t1+r10+400* 0]
    mova            m1, [t1+r10+400* 2]
    mova            m2, [t1+r10+400* 4]
    paddw           m0, m0               ; 5x5 sums count the row twice
    mova            m3, [t1+r10+400* 6]
    paddd           m1, m1
    mova            m4, [t1+r10+400* 8]
    paddd           m2, m2
    mova            m5, [t1+r10+400*10]
    mova [t2+r10+400* 0], m0
    mova [t2+r10+400* 2], m1
    mova [t2+r10+400* 4], m2
    mova [t2+r10+400* 6], m3
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    add            r10, 32
    jl .top_fixup_loop
    call .v0
    jmp .main
.h: ; horizontal boxsum
; Computes per-pixel horizontal 3-tap (sum3/sumsq3) and 5-tap (sum5/sumsq5)
; box sums of one input row into the t1 scratch bank. The 5-tap results reuse
; the 3-tap partials (sum5 = sum3 + the two outer taps).
; Entry points: .h (handles LR_HAVE_LEFT / left-column data), .h_top (same but
; for rows above the unit). Clobbers m0-m8; m7 must be zero on entry.
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    vpbroadcastq   xm5, [leftq]          ; splice 4 left-edge pixels before the row
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .h_main
.h_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [base+sgr_lshuf5] ; replicate first pixel leftwards
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .h_main
.h_top:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu            m4, [lpfq+r10- 2]
.h_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp           r10d, -36               ; only the final vectors can overrun
    jl .h_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.h_have_right:
    palignr         m3, m5, m4, 2        ; src[x-1]
    palignr         m0, m5, m4, 4        ; src[x]
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2               ; squares via self pmaddwd
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6        ; src[x+1]
    paddw           m1, m0             ; sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6             ; sumsq3
    shufpd          m6, m4, m5, 0x05     ; src[x+2] lane-crossed with src[x-2]
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw           m8, m1             ; sum5
    paddd           m5, m2             ; sumsq5
    paddd           m6, m3
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    add            r10, 32
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
; Even-row pass: horizontal 3/5-tap boxsums of the new row (same layout as
; .h), accumulation into the vertical running sums in t1/t2, and computation
; of the 3x3 surrogate pair (a3, b3): p3 = 9*sse - sum^2, x3 = 256/(z3+1)
; approximated with rcpps, b3' = x3 * b3 * 455 + bias. Results land in
; t4 (a3, packed words) and t3 (b3, dwords). 5-tap sums are only accumulated
; here; the ab5 math happens in .hv1/.v1.
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv0_main
.hv0_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [base+sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu            m4, [lpfq+r10- 2]
.hv0_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -36
    jl .hv0_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv0_have_right:
    palignr         m3, m5, m4, 2
    palignr         m0, m5, m4, 4
    paddw           m1, m3, m0
    punpcklwd       m2, m3, m0
    pmaddwd         m2, m2
    punpckhwd       m3, m0
    pmaddwd         m3, m3
    palignr         m0, m5, m4, 6
    paddw           m1, m0             ; h sum3
    punpcklwd       m6, m0, m7
    pmaddwd         m6, m6
    punpckhwd       m0, m7
    pmaddwd         m0, m0
    paddd           m2, m6             ; h sumsq3
    shufpd          m6, m4, m5, 0x05
    punpcklwd       m5, m6, m4
    paddw           m8, m4, m6
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    paddd           m3, m0
    paddw           m8, m1             ; h sum5
    paddd           m5, m2             ; h sumsq5
    paddd           m6, m3
    mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4?
    mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd
    mova [t3+r10*2+400*0+40], m6
    ; accumulate 5-tap vertical sums (row pair) into t1
    paddw           m8, [t1+r10+400* 0]
    paddd           m5, [t1+r10+400* 2]
    paddd           m6, [t1+r10+400* 4]
    mova [t1+r10+400* 0], m8
    mova [t1+r10+400* 2], m5
    mova [t1+r10+400* 4], m6
    ; 3-tap vertical sums over the current + two previous rows (t1, t2)
    paddw           m0, m1, [t1+r10+400* 6]
    paddd           m4, m2, [t1+r10+400* 8]
    paddd           m5, m3, [t1+r10+400*10]
    mova [t1+r10+400* 6], m1
    mova [t1+r10+400* 8], m2
    mova [t1+r10+400*10], m3
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    ; ab3: variance p3 = 9*a - b^2, then x3 = 256/(z3+1) via rcpps
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2             ; clamp so the subtraction can't go negative
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12            ; NOTE(review): pw_455_24 doubles as rounding term
    paddw           m5, m12
    psrld           m4, 20             ; z3 + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z3 + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11            ; 256 / (z3 + 1)
    mulps           m3, m11
    psrld           m4, 24             ; z3 < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x3
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
; Odd-row pass: horizontal boxsums like .hv0, then BOTH surrogate pairs:
; ab3 for this row (stored to t4+400*4 / t3+400*8) and ab5 for the completed
; 5-row window (stored to t4+400*0 / t3+400*0, using pd_25 / pw_164_24 and
; scale s0 instead of 9 / pw_455_24 / s1). Rotates the t1/t2 row banks before
; returning. Temporarily repurposes m7 for rcpps and re-zeroes it after.
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastq   xm5, [leftq]
    vinserti128     m5, [lpfq+wq], 1
    mova            m4, [lpfq+wq]
    add          leftq, 8
    palignr         m4, m5, 10
    jmp .hv1_main
.hv1_extend_left:
    mova           xm4, [lpfq+wq]
    pshufb         xm4, [base+sgr_lshuf5]
    vinserti128     m4, [lpfq+wq+10], 1
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-4]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu            m4, [lpfq+r10- 2]
.hv1_main:
    movu            m5, [lpfq+r10+14]
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -36
    jl .hv1_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
.hv1_have_right:
    palignr         m6, m5, m4, 2
    palignr         m3, m5, m4, 4
    paddw           m2, m6, m3
    punpcklwd       m0, m6, m3
    pmaddwd         m0, m0
    punpckhwd       m6, m3
    pmaddwd         m6, m6
    palignr         m3, m5, m4, 6
    paddw           m2, m3             ; h sum3
    punpcklwd       m1, m3, m7
    pmaddwd         m1, m1
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    paddd           m0, m1             ; h sumsq3
    shufpd          m1, m4, m5, 0x05
    punpckhwd       m5, m4, m1
    paddw           m8, m4, m1
    pmaddwd         m5, m5
    punpcklwd       m4, m1
    pmaddwd         m4, m4
    paddd           m6, m3
    ; 3-tap vertical window (this row + t2 history)
    paddw           m1, m2, [t2+r10+400* 6]
    mova [t2+r10+400* 6], m2
    paddw           m8, m2             ; h sum5
    paddd           m2, m0, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 8], m0
    mova [t2+r10+400*10], m6
    paddd           m4, m0             ; h sumsq5
    paddd           m5, m6
    ; ab3 for this row (same scheme as .hv0)
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m0, m2, 3
    pslld           m6, m3, 3
    paddd           m2, m0             ; ((a3 + 8) >> 4) * 9
    paddd           m3, m6
    psrlw           m6, m1, 1
    pavgw           m6, m7             ; (b3 + 2) >> 2
    punpcklwd       m0, m6, m7
    pmaddwd         m0, m0
    punpckhwd       m6, m7
    pmaddwd         m6, m6
    pmaxud          m2, m0
    psubd           m2, m0             ; p3
    pmaxud          m3, m6
    psubd           m3, m6
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmulld          m2, m14            ; p3 * s1
    pmulld          m3, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddw           m2, m12
    paddw           m3, m12
    psrld           m2, 20             ; z + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m6, m2             ; 1 / (z + 1)
    rcpps           m7, m3             ; m7 clobbered here; re-zeroed below
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m6, m11            ; 256 / (z + 1)
    mulps           m7, m11
    psrld           m2, 24             ; z < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m6, m6
    cvtps2dq        m7, m7
    pminsw          m6, m2             ; x
    pminsw          m7, m3
    pmulld          m0, m6
    packssdw        m6, m7
    pmulld          m7, m1
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m7, m10
    psrld           m0, 12
    psrld           m7, 12
    ; 5-tap vertical window over three stored row pairs
    paddw           m1, m8, [t2+r10+400*0]
    paddd           m2, m4, [t2+r10+400*2]
    paddd           m3, m5, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m8
    mova [t2+r10+400*2], m4
    mova [t2+r10+400*4], m5
    mova         [t4+r10*1+400*4 +4], m6
    mova         [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova         [t3+r10*2+400*8+24], xm7
    vextracti128 [t3+r10*2+400*8+56], m7, 1
    ; ab5: p5 = 25*a - b^2, constants differ from the 3x3 path
    vpbroadcastd    m4, [base+pd_25]
    vpbroadcastd    m6, [base+pw_164_24]
    pxor            m7, m7             ; restore the all-zero convention
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4             ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7             ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7         ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4             ; p5
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13            ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m6             ; b5 * 164
    pmaddwd         m1, m6
    paddw           m2, m6
    paddw           m3, m6
    psrld           m2, 20             ; z5 + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m4, m2             ; 1 / (z5 + 1)
    rcpps           m5, m3
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m4, m11            ; 256 / (z5 + 1)
    mulps           m5, m11
    psrld           m2, 24             ; z5 < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m4, m4
    cvtps2dq        m5, m5
    pminsw          m4, m2             ; x5
    pminsw          m5, m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m4
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .hv1_loop
    mov            r10, t2              ; rotate the two row banks
    mov             t2, t1
    mov             t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
; Vertical-only even-row pass, used where no new input row is read (no-top
; setup and bottom extension): the missing row is simulated by doubling the
; sums already in t1, then ab3 is computed exactly as in .hv0. The clean
; last-row 5-tap copy is refreshed in t3 and the t1 5-tap sums are doubled.
    lea            r10, [wq-4]
.v0_loop:
    mova            m0, [t1+r10+400* 6]
    mova            m4, [t1+r10+400* 8]
    mova            m5, [t1+r10+400*10]
    paddw           m0, m0               ; double: row counted twice
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10+400* 6]
    paddd           m2, m4, [t2+r10+400* 8]
    paddd           m3, m5, [t2+r10+400*10]
    mova [t2+r10+400* 6], m0
    mova [t2+r10+400* 8], m4
    mova [t2+r10+400*10], m5
    ; ab3, identical math to .hv0
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11            ; 256 / (z + 1)
    mulps           m3, m11
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    ; refresh the clean 5-tap copy and double the running 5-tap sums
    mova            m3, [t1+r10+400*0]
    mova            m4, [t1+r10+400*2]
    mova            m5, [t1+r10+400*4]
    mova [t3+r10*2+400*8+ 8], m3
    mova [t3+r10*2+400*0+ 8], m4
    mova [t3+r10*2+400*0+40], m5
    paddw           m3, m3 ; cc5
    paddd           m4, m4
    paddd           m5, m5
    mova [t1+r10+400*0], m3
    mova [t1+r10+400*2], m4
    mova [t1+r10+400*4], m5
    mova         [t4+r10*1+400*2+ 4], m2
    mova         [t3+r10*2+400*4+ 8], xm0
    vextracti128 [t3+r10*2+400*4+40], m0, 1
    mova         [t3+r10*2+400*4+24], xm1
    vextracti128 [t3+r10*2+400*4+56], m1, 1
    add            r10, 32
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
; Vertical-only odd-row pass (no new input row): builds the 3-tap window from
; t1/t2 and computes ab3, then rebuilds the 5-tap window from the clean row
; copies stashed in t3 by .hv0/.v0 and computes ab5. Same constants and
; fixed-point scheme as .hv1. Rotates t1/t2 before returning.
    lea            r10, [wq-4]
.v1_loop:
    mova            m4, [t1+r10+400* 6]
    mova            m5, [t1+r10+400* 8]
    mova            m6, [t1+r10+400*10]
    paddw           m1, m4, [t2+r10+400* 6]
    paddd           m2, m5, [t2+r10+400* 8]
    paddd           m3, m6, [t2+r10+400*10]
    mova [t2+r10+400* 6], m4
    mova [t2+r10+400* 8], m5
    mova [t2+r10+400*10], m6
    ; ab3
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a3 + 8) >> 4
    psrld           m3, 4
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2             ; ((a3 + 8) >> 4) * 9
    paddd           m5, m3
    psrlw           m3, m1, 1
    pavgw           m3, m7             ; (b3 + 2) >> 2
    punpcklwd       m2, m3, m7
    pmaddwd         m2, m2
    punpckhwd       m3, m7
    pmaddwd         m3, m3
    punpcklwd       m0, m1, m7         ; b3
    punpckhwd       m1, m7
    pmaxud          m4, m2
    psubd           m4, m2             ; p3
    pmaxud          m5, m3
    psubd           m5, m3
    pmulld          m4, m14            ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m12            ; b3 * 455
    pmaddwd         m1, m12
    paddw           m4, m12
    paddw           m5, m12
    psrld           m4, 20             ; z + 1
    psrld           m5, 20
    cvtdq2ps        m4, m4
    cvtdq2ps        m5, m5
    rcpps           m2, m4             ; 1 / (z + 1)
    rcpps           m3, m5
    pcmpgtd         m4, m11, m4
    pcmpgtd         m5, m11, m5
    mulps           m2, m11            ; 256 / (z + 1)
    mulps           m3, m11
    psrld           m4, 24             ; z < 255 ? 255 : 0
    psrld           m5, 24
    cvtps2dq        m2, m2
    cvtps2dq        m3, m3
    pminsw          m2, m4             ; x
    pminsw          m3, m5
    pmulld          m0, m2
    pmulld          m1, m3
    packssdw        m2, m3
    paddd           m0, m10            ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m8, m1, 12         ; keep in m8: m1 is reused below
    mova [t4+r10*1+400*4+4], m2
    ; 5-tap window from the clean single-row copies in t3
    mova            m4, [t3+r10*2+400*8+ 8]
    mova            m5, [t3+r10*2+400*0+ 8]
    mova            m6, [t3+r10*2+400*0+40]
    paddw           m1, m4, [t2+r10+400*0]
    paddd           m2, m5, [t2+r10+400*2]
    paddd           m3, m6, [t2+r10+400*4]
    paddw           m1, [t1+r10+400*0]
    paddd           m2, [t1+r10+400*2]
    paddd           m3, [t1+r10+400*4]
    mova [t2+r10+400*0], m4
    mova [t2+r10+400*2], m5
    mova [t2+r10+400*4], m6
    mova         [t3+r10*2+400*8+ 8], xm0
    vextracti128 [t3+r10*2+400*8+40], m0, 1
    mova         [t3+r10*2+400*8+24], xm8
    vextracti128 [t3+r10*2+400*8+56], m8, 1
    ; ab5
    vpbroadcastd    m4, [base+pd_25]
    vpbroadcastd    m6, [base+pw_164_24]
    paddd           m2, m9
    paddd           m3, m9
    psrld           m2, 4              ; (a5 + 8) >> 4
    psrld           m3, 4
    pmulld          m2, m4             ; ((a5 + 8) >> 4) * 25
    pmulld          m3, m4
    psrlw           m5, m1, 1
    pavgw           m5, m7             ; (b5 + 2) >> 2
    punpcklwd       m4, m5, m7
    pmaddwd         m4, m4
    punpckhwd       m5, m7
    pmaddwd         m5, m5
    punpcklwd       m0, m1, m7         ; b5
    punpckhwd       m1, m7
    pmaxud          m2, m4
    psubd           m2, m4             ; p5
    pmaxud          m3, m5
    psubd           m3, m5
    pmulld          m2, m13            ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m6             ; b5 * 164
    pmaddwd         m1, m6
    paddw           m2, m6
    paddw           m3, m6
    psrld           m2, 20             ; z5 + 1
    psrld           m3, 20
    cvtdq2ps        m2, m2
    cvtdq2ps        m3, m3
    rcpps           m4, m2             ; 1 / (z5 + 1)
    rcpps           m5, m3
    pcmpgtd         m2, m11, m2
    pcmpgtd         m3, m11, m3
    mulps           m4, m11            ; 256 / (z5 + 1)
    mulps           m5, m11
    psrld           m2, 24             ; z5 < 255 ? 255 : 0
    psrld           m3, 24
    cvtps2dq        m4, m4
    cvtps2dq        m5, m5
    pminsw          m4, m2             ; x5
    pminsw          m5, m3
    pmulld          m0, m4
    pmulld          m1, m5
    packssdw        m4, m5
    paddd           m0, m10            ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m10
    psrld           m0, 12
    psrld           m1, 12
    mova         [t4+r10*1+400*0+ 4], m4
    mova         [t3+r10*2+400*0+ 8], xm0
    vextracti128 [t3+r10*2+400*0+40], m0, 1
    mova         [t3+r10*2+400*0+24], xm1
    vextracti128 [t3+r10*2+400*0+56], m1, 1
    add            r10, 32
    jl .v1_loop
    mov            r10, t2              ; rotate the two row banks
    mov             t2, t1
    mov             t1, r10
    ret
   2533 .prep_n: ; initial neighbor setup
           ; Precomputes the neighbor-weighted sums consumed by .n0/.n1:
           ;   5x5 pass: 5,6,5-weighted horizontal sums of a5 (words, in t4)
           ;             and b5 (dwords, in t3) -> t4+400*6 / t3+400*12.
           ;   3x3 pass: "444" (4,4,4) and "343" (3,4,3) horizontal sums for
           ;             the previous row (a3[-1]/b3[-1]) and current row
           ;             (a3[ 0]/b3[ 0]) ring-buffer slots.
           ; wq is a negative offset: the loop adds 16 and repeats while r10 < 0,
           ; so all t3/t4 accesses index backwards from the end of the row.
   2534    mov            r10, wq
   2535 .prep_n_loop:
           ; 5,6,5 kernel as center + 5*(left+center+right):
           ; xm0 = c, xm2 = l+c+r, result = c + 5*(l+c+r) = 5l+6c+5r
   2536    movu           xm0, [t4+r10*1+400*0+2]
   2537    paddw          xm2, xm0, [t4+r10*1+400*0+0]
   2538    paddw          xm2, [t4+r10*1+400*0+4]
   2539    movu            m1, [t3+r10*2+400*0+4]
   2540    paddd           m3, m1, [t3+r10*2+400*0+0]
   2541    paddd           m3, [t3+r10*2+400*0+8]
   2542    paddw          xm0, xm2
   2543    paddd           m1, m3
   2544    psllw          xm2, 2
   2545    pslld           m3, 2
   2546    paddw          xm0, xm2              ; a5 565
   2547    paddd           m1, m3               ; b5 565
   2548    mova [t4+r10*1+400* 6], xm0
   2549    mova [t3+r10*2+400*12], m1
           ; 3,4,3 kernel as 4*(l+c+r) - (l+r):
           ; xm0 = l+r, xm2 = l+c+r, result = 4*(l+c+r)-(l+r) = 3l+4c+3r
   2550    mova           xm0, [t4+r10*1+400*2+0]
   2551    paddw          xm0, [t4+r10*1+400*2+4]
   2552    paddw          xm2, xm0, [t4+r10*1+400*2+2]
   2553    mova            m1, [t3+r10*2+400*4+0]
   2554    paddd           m1, [t3+r10*2+400*4+8]
   2555    paddd           m3, m1, [t3+r10*2+400*4+4]
   2556    psllw          xm2, 2                ; a3[-1] 444
   2557    pslld           m3, 2                ; b3[-1] 444
   2558    psubw          xm2, xm0              ; a3[-1] 343
   2559    psubd           m3, m1               ; b3[-1] 343
   2560    mova [t4+r10*1+400* 8], xm2
   2561    mova [t3+r10*2+400*16], m3
           ; current row: store both the 444 and 343 variants (the 444 form is
           ; re-read by .n0 when the ring buffer is advanced)
   2562    mova           xm0, [t4+r10*1+400*4+0]
   2563    paddw          xm0, [t4+r10*1+400*4+4]
   2564    paddw          xm2, xm0, [t4+r10*1+400*4+2]
   2565    mova            m1, [t3+r10*2+400*8+0]
   2566    paddd           m1, [t3+r10*2+400*8+8]
   2567    paddd           m3, m1, [t3+r10*2+400*8+4]
   2568    psllw          xm2, 2                 ; a3[ 0] 444
   2569    pslld           m3, 2                 ; b3[ 0] 444
   2570    mova [t4+r10*1+400*10], xm2
   2571    mova [t3+r10*2+400*20], m3
   2572    psubw          xm2, xm0               ; a3[ 0] 343
   2573    psubd           m3, m1                ; b3[ 0] 343
   2574    mova [t4+r10*1+400*12], xm2
   2575    mova [t3+r10*2+400*24], m3
   2576    add            r10, 16
   2577    jl .prep_n_loop
   2578    ret
   2579 ALIGN function_align
   2580 .n0: ; neighbor + output (even rows)
           ; Builds the current row's 5,6,5 a5/b5 sums, adds them to the stored
           ; previous sums for the vertical 565 total (so even rows carry a
           ; doubled accumulation), advances the 3x3 343/444 ring buffers,
           ; then combines both passes with the weights in m15 and writes one
           ; row of output pixels. Clobbers m0-m6; advances dstq by strideq.
   2581    mov            r10, wq
   2582    vpbroadcastd    m6, [base+pd_4096]   ; 1<<12: rounding for the total >>13 (psrad 7 + psrlw 6)
   2583 .n0_loop:
           ; a5/b5: center + 5*(l+c+r) == 5,6,5 horizontal kernel
   2584    movu           xm2, [t4+r10*1+2]
   2585    paddw          xm0, xm2, [t4+r10*1+0]
   2586    paddw          xm0, [t4+r10*1+4]
   2587    paddw          xm2, xm0
   2588    psllw          xm0, 2
   2589    paddw          xm0, xm2              ; a5
   2590    movu            m1, [t3+r10*2+4]
   2591    paddd           m4, m1, [t3+r10*2+0]
   2592    paddd           m4, [t3+r10*2+8]
   2593    paddd           m1, m4
   2594    pslld           m4, 2
   2595    paddd           m4, m1               ; b5
           ; vertical 565: current + previous row's sum; the current sum is
           ; stored for reuse by the following .n1/.n0 calls
   2596    paddw          xm2, xm0, [t4+r10*1+400* 6]
   2597    mova [t4+r10*1+400* 6], xm0
   2598    paddd           m0, m4, [t3+r10*2+400*12]
   2599    mova [t3+r10*2+400*12], m4
           ; 3x3 ring: new row's 444/343 (4*(l+c+r) and 4*(l+c+r)-(l+r)),
           ; summed with the two previous entries; slots 400*8/400*10 advance
   2600    mova           xm3, [t4+r10*1+400*2+0]
   2601    paddw          xm3, [t4+r10*1+400*2+4]
   2602    paddw          xm5, xm3, [t4+r10*1+400*2+2]
   2603    psllw          xm5, 2                ; a3[ 1] 444
   2604    psubw          xm4, xm5, xm3         ; a3[ 1] 343
   2605    paddw          xm3, xm4, [t4+r10*1+400* 8]
   2606    paddw          xm3, [t4+r10*1+400*10]
   2607    mova [t4+r10*1+400* 8], xm4
   2608    mova [t4+r10*1+400*10], xm5
   2609    mova            m1, [t3+r10*2+400*4+0]
   2610    paddd           m1, [t3+r10*2+400*4+8]
   2611    paddd           m5, m1, [t3+r10*2+400*4+4]
   2612    pslld           m5, 2                ; b3[ 1] 444
   2613    psubd           m4, m5, m1           ; b3[ 1] 343
   2614    paddd           m1, m4, [t3+r10*2+400*16]
   2615    paddd           m1, [t3+r10*2+400*20]
   2616    mova [t3+r10*2+400*16], m4
   2617    mova [t3+r10*2+400*20], m5
           ; output: t = b - a*src per pass; even rows hold doubled 565 sums
           ; (cur+prev above), hence >>9 here vs >>8 in .n1
   2618    pmovzxwd        m4, [dstq+r10]
   2619    pmovzxwd        m2, xm2              ; a5
   2620    pmovzxwd        m3, xm3              ; a3
   2621    pmaddwd         m2, m4               ; a5 * src
   2622    pmaddwd         m3, m4               ; a3 * src
   2623    pslld           m4, 13
   2624    psubd           m0, m2               ; b5 - a5 * src + (1 << 8)
   2625    psubd           m1, m3               ; b3 - a3 * src + (1 << 8)
   2626    psrld           m0, 9
   2627    pslld           m1, 7
           ; interleave the two pass results as words so one pmaddwd applies
           ; both weights (m15: weight pair, presumably set by the caller —
           ; not visible in this chunk)
   2628    pblendw         m0, m1, 0xaa
   2629    pmaddwd         m0, m15
   2630    paddd           m4, m6
   2631    paddd           m0, m4
   2632    psrad           m0, 7
   2633    vextracti128   xm1, m0, 1
   2634    packusdw       xm0, xm1              ; clip
   2635    psrlw          xm0, 6
   2636    mova    [dstq+r10], xm0
   2637    add            r10, 16
   2638    jl .n0_loop
   2639    add           dstq, strideq
   2640    ret
   2641 ALIGN function_align
   2642 .n1: ; neighbor + output (odd rows)
           ; Odd rows reuse the 565 a5/b5 sums stored by .n0 (no new 5x5 row
           ; is computed), so only the 3x3 343/444 ring buffers are advanced
           ; here before the weighted combine. Clobbers m0-m6; advances dstq.
   2643    mov            r10, wq
   2644    vpbroadcastd    m6, [base+pd_4096]   ; 1<<12: rounding for the total >>13 (psrad 7 + psrlw 6)
   2645 .n1_loop:
           ; 3x3 ring: 444 = 4*(l+c+r), 343 = 4*(l+c+r)-(l+r); sum the new
           ; 343 with the two stored entries, then rotate the slots
   2646    mova           xm3, [t4+r10*1+400*4+0]
   2647    paddw          xm3, [t4+r10*1+400*4+4]
   2648    paddw          xm5, xm3, [t4+r10*1+400*4+2]
   2649    psllw          xm5, 2                ; a3[ 1] 444
   2650    psubw          xm4, xm5, xm3         ; a3[ 1] 343
   2651    paddw          xm3, xm4, [t4+r10*1+400*12]
   2652    paddw          xm3, [t4+r10*1+400*10]
   2653    mova [t4+r10*1+400*10], xm5
   2654    mova [t4+r10*1+400*12], xm4
   2655    mova            m1, [t3+r10*2+400*8+0]
   2656    paddd           m1, [t3+r10*2+400*8+8]
   2657    paddd           m5, m1, [t3+r10*2+400*8+4]
   2658    pslld           m5, 2                ; b3[ 1] 444
   2659    psubd           m4, m5, m1           ; b3[ 1] 343
   2660    paddd           m1, m4, [t3+r10*2+400*24]
   2661    paddd           m1, [t3+r10*2+400*20]
   2662    mova [t3+r10*2+400*20], m5
   2663    mova [t3+r10*2+400*24], m4
           ; output: single (not doubled) 565 sum on odd rows, hence >>8
           ; below vs >>9 in .n0
   2664    pmovzxwd        m4, [dstq+r10]
   2665    pmovzxwd        m2, [t4+r10*1+400* 6] ; a5 565 (stored by .n0)
   2666    pmovzxwd        m3, xm3
   2667    mova            m0, [t3+r10*2+400*12] ; b5 565 (stored by .n0)
   2668    pmaddwd         m2, m4               ; a5 * src
   2669    pmaddwd         m3, m4               ; a3 * src
   2670    pslld           m4, 13
   2671    psubd           m0, m2               ; b5 - a5 * src + (1 << 8)
   2672    psubd           m1, m3               ; b3 - a3 * src + (1 << 8)
   2673    psrld           m0, 8
   2674    pslld           m1, 7
           ; word-interleave both passes, weight with one pmaddwd (m15:
           ; weight pair set outside this routine)
   2675    pblendw         m0, m1, 0xaa
   2676    pmaddwd         m0, m15
   2677    paddd           m4, m6
   2678    paddd           m0, m4
   2679    psrad           m0, 7
   2680    vextracti128   xm1, m0, 1
   2681    packusdw       xm0, xm1              ; clip
   2682    psrlw          xm0, 6
   2683    mova    [dstq+r10], xm0
   2684    add            r10, 16
   2685    jl .n1_loop
   2686    add           dstq, strideq
   2687    ret
   2688 
   2689 %endif ; ARCH_X86_64