tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mc16_avx2.asm (230035B)


      1 ; Copyright © 2021, VideoLAN and dav1d authors
      2 ; Copyright © 2021, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
SECTION_RODATA 64

; dav1d_obmc_masks[] * -512
const obmc_masks_avx2
           dw      0,      0,  -9728,      0, -12800,  -7168,  -2560,      0
           dw -14336, -11264,  -8192,  -5632,  -3584,  -1536,      0,      0
           dw -15360, -13824, -12288, -10752,  -9216,  -7680,  -6144,  -5120
           dw  -4096,  -3072,  -2048,  -1536,      0,      0,      0,      0
           dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240
           dw  -9728,  -8704,  -8192,  -7168,  -6656,  -6144,  -5632,  -4608
           dw  -4096,  -3584,  -3072,  -2560,  -2048,  -2048,  -1536,  -1024
           dw      0,      0,      0,      0,      0,      0,      0,      0

; Shuffle-control vectors (pshufb/vpermd masks) used by the subpel,
; resize and blend kernels elsewhere in this file.
deint_shuf:     dd 0,  4,  1,  5,  2,  6,  3,  7
subpel_h_shufA: db 0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
subpel_h_shufB: db 4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
subpel_h_shuf2: db 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
subpel_s_shuf2: db 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
subpel_s_shuf8: db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
rescale_mul:    dd 0,  1,  2,  3,  4,  5,  6,  7
rescale_mul2:   dd 0,  1,  4,  5,  2,  3,  6,  7
resize_shuf:    db 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
               db 8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
wswap:          db 2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
bdct_lb_q: times 8 db 0
          times 8 db 4
          times 8 db 8
          times 8 db 12

; Rounding / scaling constants.  Where two (or two pairs of) values are
; given, the first is for 10-bit and the second for 12-bit content; the
; dispatch code selects between them with (bitdepth_max >> 11) as index
; (see e.g. the put_bilin_h_rnd load in put_bilin_16bpc below).
prep_mul:         dw 16, 16, 4, 4
put_bilin_h_rnd:  dw 8, 8, 10, 10
put_8tap_h_rnd:   dd 34, 40
s_8tap_h_rnd:     dd 2, 8
s_8tap_h_sh:      dd 2, 4
put_s_8tap_v_rnd: dd 512, 128
put_s_8tap_v_sh:  dd 10, 8
prep_8tap_1d_rnd: dd     8 - (8192 <<  4)
prep_8tap_2d_rnd: dd    32 - (8192 <<  5)
warp8x8t_rnd:     dd 16384 - (8192 << 15)
warp8x8_shift:    dd  5,  3
warp8x8_rnd:      dw   4096,   4096,  16384,  16384
bidir_rnd:        dw -16400, -16400, -16388, -16388
bidir_mul:        dw   2048,   2048,   8192,   8192

; Alias common constants to existing data to avoid duplicating rodata.
%define pw_16 prep_mul
%define pd_512 put_s_8tap_v_rnd

pw_2:          times 2 dw 2
pw_64:         times 2 dw 64
pw_2048:       times 2 dw 2048
pw_8192:       times 2 dw 8192
pw_27615:      times 2 dw 27615
pw_32766:      times 2 dw 32766
pw_m512:       times 2 dw -512
pd_32:         dd 32
pd_63:         dd 63
pd_64:         dd 64
pd_32768:      dd 32768
pd_65538:      dd 65538
pd_m524256:    dd -524256 ; -(8192 << 6) + 32
pd_0x3ff:      dd 0x3ff
pq_0x40000000: dq 0x40000000
              dd 0
     95 
; BIDIR_JMP_TABLE name, isa, w0, w1, ...
; Emits one dword per width argument, each holding the offset of the
; .w<N> entry point of <name>_16bpc_<isa> relative to the table.  The
; exported <name>_<isa>_table symbol is biased by -2*w0 so the caller's
; tzcnt-derived log2(w) index lands on the first entry without extra
; adjustment.
%macro BIDIR_JMP_TABLE 2-*
   %xdefine %1_%2_table (%%table - 2*%3)
   %xdefine %%base %1_%2_table
   %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
   %%table:
   %rep %0 - 2
       dd %%prefix %+ .w%3 - %%base
       %rotate 1
   %endrep
%endmacro

BIDIR_JMP_TABLE avg,        avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx2,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      avx2,    4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    avx2, 2, 4, 8, 16, 32, 64, 128
    116 
; BASE_JMP_TABLE name, isa, w0, w1, ...
; Like BIDIR_JMP_TABLE but with word-sized entries, and offsets taken
; relative to a base label (<name>_<isa>, defined below as the .put/.prep
; internal labels) rather than to the table itself.  The *_table symbol
; is biased by -w0 for direct log2(w) indexing.
%macro BASE_JMP_TABLE 3-*
   %xdefine %1_%2_table (%%table - %3)
   %xdefine %%base %1_%2
   %%table:
   %rep %0 - 2
       dw %%base %+ _w%3 - %%base
       %rotate 1
   %endrep
%endmacro

; Base labels the word offsets above/below are measured from.
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep)

BASE_JMP_TABLE put,  avx2, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx2,    4, 8, 16, 32, 64, 128
    132 
; HV_JMP_TABLE name, filter, isa, types, w0, w1, ...
; Emits up to three word-offset jump tables (one per filter direction)
; for <name>_<filter>_16bpc_<isa>.  <types> is a bitmask selecting which
; tables to generate: bit 0 = .h_w<N>, bit 1 = .v_w<N>, bit 2 = .hv_w<N>.
; Each table symbol is biased by -w0 for direct log2(w) indexing, and the
; trailing %rotate 4 after the h and v tables restores the original
; argument order for the next %rep pass.
%macro HV_JMP_TABLE 5-*
   %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
   %xdefine %%base %1_%3
   %assign %%types %4
   %if %%types & 1
       %xdefine %1_%2_h_%3_table  (%%h  - %5)
       %%h:
       %rep %0 - 4
           dw %%prefix %+ .h_w%5 - %%base
           %rotate 1
       %endrep
       %rotate 4
   %endif
   %if %%types & 2
       %xdefine %1_%2_v_%3_table  (%%v  - %5)
       %%v:
       %rep %0 - 4
           dw %%prefix %+ .v_w%5 - %%base
           %rotate 1
       %endrep
       %rotate 4
   %endif
   %if %%types & 4
       %xdefine %1_%2_hv_%3_table (%%hv - %5)
       %%hv:
       %rep %0 - 4
           dw %%prefix %+ .hv_w%5 - %%base
           %rotate 1
       %endrep
   %endif
%endmacro

; types = 7: generate h, v and hv tables for the bilin functions.
HV_JMP_TABLE put,  bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx2, 7,    4, 8, 16, 32, 64, 128
    167 
; SCALED_JMP_TABLE name, isa, w0, w1, ...
; Emits three word-offset jump tables for the scaled MC functions:
; the generic .w<N> table, plus .dy1_w<N> and .dy2_w<N> tables for the
; specialized vertical-step (dy == 1024 / 2048) paths.  Offsets are
; relative to the mangled function symbol; each table symbol is biased
; by -w0 for direct log2(w) indexing.  The %rotate 2 after each %rep
; (which itself rotated %0-2 times) restores the argument order.
%macro SCALED_JMP_TABLE 2-*
   %xdefine %1_%2_table (%%table - %3)
   %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
   %rep %0 - 2
       dw %%base %+ .w%3 - %%base
       %rotate 1
   %endrep
   %rotate 2
%%dy_1024:
   %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
   %rep %0 - 2
       dw %%base %+ .dy1_w%3 - %%base
       %rotate 1
   %endrep
   %rotate 2
%%dy_2048:
   %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
   %rep %0 - 2
       dw %%base %+ .dy2_w%3 - %%base
       %rotate 1
   %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, avx2,   4, 8, 16, 32, 64, 128

; table_offset(put, _bilin_h) expands to put_bilin_h_avx2_table - put_avx2,
; i.e. the displacement of a jump table from the function's base label.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

cextern mc_subpel_filters
; Bias by -8 so filter indices (which start at 1) address the table directly.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter
    202 
    203 SECTION .text
    204 
INIT_XMM avx2
;-----------------------------------------------------------------------
; put_bilin_16bpc(pixel *dst, ptrdiff_t ds, const pixel *src,
;                 ptrdiff_t ss, int w, int h, int mx, int my
;                 [, int bitdepth_max in r8m])
; Copy or bilinearly filter a block of 16-bit pixels.
;   mx == 0 && my == 0 : plain copy          (.put)
;   mx != 0, my == 0   : horizontal filter   (.h)
;   mx == 0, my != 0   : vertical filter     (.v)
;   mx != 0, my != 0   : separable 2-D       (.hv)
; Dispatch is through per-width jump tables holding word offsets
; relative to r7 (= put_avx2, the .put label); wd is log2(w) via tzcnt.
; m4/m5 hold the (16-mx, mx) horizontal weights, m5/m6 the my weight.
;-----------------------------------------------------------------------
cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
   mov                mxyd, r6m ; mx
   lea                  r7, [put_avx2]
%if UNIX64
   DECLARE_REG_TMP 8
   %define org_w r8d
   mov                 r8d, wd
%else
   DECLARE_REG_TMP 7
   %define org_w wm
%endif
   tzcnt                wd, wm
   movifnidn            hd, hm
   test               mxyd, mxyd
   jnz .h
   mov                mxyd, r7m ; my
   test               mxyd, mxyd
   jnz .v
; Plain copy: no filtering, just width-specialized load/store loops.
.put:
   movzx                wd, word [r7+wq*2+table_offset(put,)]
   add                  wq, r7
   jmp                  wq
.put_w2:
   mov                 r6d, [srcq+ssq*0]
   mov                 r7d, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   mov        [dstq+dsq*0], r6d
   mov        [dstq+dsq*1], r7d
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .put_w2
   RET
.put_w4:
   mov                  r6, [srcq+ssq*0]
   mov                  r7, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   mov        [dstq+dsq*0], r6
   mov        [dstq+dsq*1], r7
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .put_w4
   RET
.put_w8:
   movu                 m0, [srcq+ssq*0]
   movu                 m1, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   mova       [dstq+dsq*0], m0
   mova       [dstq+dsq*1], m1
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .put_w8
   RET
; Widths >= 16 use full 256-bit ymm registers from here on.
INIT_YMM avx2
.put_w16:
   movu                 m0, [srcq+ssq*0]
   movu                 m1, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   mova       [dstq+dsq*0], m0
   mova       [dstq+dsq*1], m1
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .put_w16
   RET
.put_w32:
   movu                 m0, [srcq+ssq*0+32*0]
   movu                 m1, [srcq+ssq*0+32*1]
   movu                 m2, [srcq+ssq*1+32*0]
   movu                 m3, [srcq+ssq*1+32*1]
   lea                srcq, [srcq+ssq*2]
   mova  [dstq+dsq*0+32*0], m0
   mova  [dstq+dsq*0+32*1], m1
   mova  [dstq+dsq*1+32*0], m2
   mova  [dstq+dsq*1+32*1], m3
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .put_w32
   RET
.put_w64:
   movu                 m0, [srcq+32*0]
   movu                 m1, [srcq+32*1]
   movu                 m2, [srcq+32*2]
   movu                 m3, [srcq+32*3]
   add                srcq, ssq
   mova        [dstq+32*0], m0
   mova        [dstq+32*1], m1
   mova        [dstq+32*2], m2
   mova        [dstq+32*3], m3
   add                dstq, dsq
   dec                  hd
   jg .put_w64
   RET
.put_w128:
   movu                 m0, [srcq+32*0]
   movu                 m1, [srcq+32*1]
   movu                 m2, [srcq+32*2]
   movu                 m3, [srcq+32*3]
   mova        [dstq+32*0], m0
   mova        [dstq+32*1], m1
   mova        [dstq+32*2], m2
   mova        [dstq+32*3], m3
   movu                 m0, [srcq+32*4]
   movu                 m1, [srcq+32*5]
   movu                 m2, [srcq+32*6]
   movu                 m3, [srcq+32*7]
   add                srcq, ssq
   mova        [dstq+32*4], m0
   mova        [dstq+32*5], m1
   mova        [dstq+32*6], m2
   mova        [dstq+32*7], m3
   add                dstq, dsq
   dec                  hd
   jg .put_w128
   RET
; Horizontal filter setup: m5 = mx, m4 = 16-mx (broadcast words).
; Result per pixel: (a*(16-mx) + b*mx + rnd) >> 4.
.h:
   movd                xm5, mxyd
   mov                mxyd, r7m ; my
   vpbroadcastd         m4, [pw_16]
   vpbroadcastw         m5, xm5
   psubw                m4, m5
   test               mxyd, mxyd
   jnz .hv
   ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
   movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)]
   mov                 r6d, r8m ; bitdepth_max
   add                  wq, r7
   shr                 r6d, 11
   ; m3 = rounding constant, selected by bitdepth (10- vs 12-bit).
   vpbroadcastd         m3, [r7-put_avx2+put_bilin_h_rnd+r6*4]
   jmp                  wq
.h_w2:
   movq                xm1, [srcq+ssq*0]
   movhps              xm1, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   pmullw              xm0, xm4, xm1
   psrlq               xm1, 16      ; shift in the +1 neighbor pixels
   pmullw              xm1, xm5
   paddw               xm0, xm3
   paddw               xm0, xm1
   psrlw               xm0, 4
   movd       [dstq+dsq*0], xm0
   pextrd     [dstq+dsq*1], xm0, 2
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .h_w2
   RET
.h_w4:
   movq                xm0, [srcq+ssq*0]
   movhps              xm0, [srcq+ssq*1]
   movq                xm1, [srcq+ssq*0+2]  ; +2 bytes = +1 pixel
   movhps              xm1, [srcq+ssq*1+2]
   lea                srcq, [srcq+ssq*2]
   pmullw              xm0, xm4
   pmullw              xm1, xm5
   paddw               xm0, xm3
   paddw               xm0, xm1
   psrlw               xm0, 4
   movq       [dstq+dsq*0], xm0
   movhps     [dstq+dsq*1], xm0
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .h_w4
   RET
.h_w8:
   movu                xm0, [srcq+ssq*0]
   vinserti128          m0, [srcq+ssq*1], 1
   movu                xm1, [srcq+ssq*0+2]
   vinserti128          m1, [srcq+ssq*1+2], 1
   lea                srcq, [srcq+ssq*2]
   pmullw               m0, m4
   pmullw               m1, m5
   paddw                m0, m3
   paddw                m0, m1
   psrlw                m0, 4
   mova         [dstq+dsq*0], xm0
   vextracti128 [dstq+dsq*1], m0, 1
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .h_w8
   RET
.h_w16:
   pmullw               m0, m4, [srcq+ssq*0]
   pmullw               m1, m5, [srcq+ssq*0+2]
   paddw                m0, m3
   paddw                m0, m1
   pmullw               m1, m4, [srcq+ssq*1]
   pmullw               m2, m5, [srcq+ssq*1+2]
   lea                srcq, [srcq+ssq*2]
   paddw                m1, m3
   paddw                m1, m2
   psrlw                m0, 4
   psrlw                m1, 4
   mova       [dstq+dsq*0], m0
   mova       [dstq+dsq*1], m1
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .h_w16
   RET
.h_w32:
   pmullw               m0, m4, [srcq+32*0]
   pmullw               m1, m5, [srcq+32*0+2]
   paddw                m0, m3
   paddw                m0, m1
   pmullw               m1, m4, [srcq+32*1]
   pmullw               m2, m5, [srcq+32*1+2]
   add                srcq, ssq
   paddw                m1, m3
   paddw                m1, m2
   psrlw                m0, 4
   psrlw                m1, 4
   mova        [dstq+32*0], m0
   mova        [dstq+32*1], m1
   add                dstq, dsq
   dec                  hd
   jg .h_w32
   RET
; w64/w128: walk each row right-to-left in 32-pixel chunks, with r6
; counting down the remaining pixels (addresses are r6*2 biased).
.h_w64:
.h_w128:
   movifnidn           t0d, org_w
.h_w64_loop0:
   mov                 r6d, t0d
.h_w64_loop:
   pmullw               m0, m4, [srcq+r6*2-32*1]
   pmullw               m1, m5, [srcq+r6*2-32*1+2]
   paddw                m0, m3
   paddw                m0, m1
   pmullw               m1, m4, [srcq+r6*2-32*2]
   pmullw               m2, m5, [srcq+r6*2-32*2+2]
   paddw                m1, m3
   paddw                m1, m2
   psrlw                m0, 4
   psrlw                m1, 4
   mova   [dstq+r6*2-32*1], m0
   mova   [dstq+r6*2-32*2], m1
   sub                 r6d, 32
   jg .h_w64_loop
   add                srcq, ssq
   add                dstq, dsq
   dec                  hd
   jg .h_w64_loop0
   RET
; Vertical filter: m5 = my << 11, so pmulhrsw(b-a, m5) computes
; round((b-a)*my/16); result = a + that, i.e. bilinear between rows.
.v:
   movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
   shl                mxyd, 11
   movd                xm5, mxyd
   add                  wq, r7
   vpbroadcastw         m5, xm5
   jmp                  wq
.v_w2:
   movd                xm0, [srcq+ssq*0]
.v_w2_loop:
   movd                xm1, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   punpckldq           xm2, xm0, xm1   ; rows 0,1
   movd                xm0, [srcq+ssq*0]
   punpckldq           xm1, xm0        ; rows 1,2
   psubw               xm1, xm2
   pmulhrsw            xm1, xm5
   paddw               xm1, xm2
   movd       [dstq+dsq*0], xm1
   pextrd     [dstq+dsq*1], xm1, 1
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .v_w2_loop
   RET
.v_w4:
   movq                xm0, [srcq+ssq*0]
.v_w4_loop:
   movq                xm1, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   punpcklqdq          xm2, xm0, xm1   ; rows 0,1
   movq                xm0, [srcq+ssq*0]
   punpcklqdq          xm1, xm0        ; rows 1,2
   psubw               xm1, xm2
   pmulhrsw            xm1, xm5
   paddw               xm1, xm2
   movq       [dstq+dsq*0], xm1
   movhps     [dstq+dsq*1], xm1
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .v_w4_loop
   RET
.v_w8:
   movu                xm0, [srcq+ssq*0]
.v_w8_loop:
   vbroadcasti128       m1, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   vpblendd             m2, m0, m1, 0xf0 ; rows 0,1 in low/high lanes
   vbroadcasti128       m0, [srcq+ssq*0]
   vpblendd             m1, m0, 0xf0     ; rows 1,2
   psubw                m1, m2
   pmulhrsw             m1, m5
   paddw                m1, m2
   mova         [dstq+dsq*0], xm1
   vextracti128 [dstq+dsq*1], m1, 1
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .v_w8_loop
   RET
.v_w32:
   movu                 m0, [srcq+ssq*0+32*0]
   movu                 m1, [srcq+ssq*0+32*1]
.v_w32_loop:
   movu                 m2, [srcq+ssq*1+32*0]
   movu                 m3, [srcq+ssq*1+32*1]
   lea                srcq, [srcq+ssq*2]
   psubw                m4, m2, m0
   pmulhrsw             m4, m5
   paddw                m4, m0
   movu                 m0, [srcq+ssq*0+32*0]
   mova  [dstq+dsq*0+32*0], m4
   psubw                m4, m3, m1
   pmulhrsw             m4, m5
   paddw                m4, m1
   movu                 m1, [srcq+ssq*0+32*1]
   mova  [dstq+dsq*0+32*1], m4
   psubw                m4, m0, m2
   pmulhrsw             m4, m5
   paddw                m4, m2
   mova  [dstq+dsq*1+32*0], m4
   psubw                m4, m1, m3
   pmulhrsw             m4, m5
   paddw                m4, m3
   mova  [dstq+dsq*1+32*1], m4
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .v_w32_loop
   RET
; w16/w64/w128: process 16-pixel columns; r6d packs the counters as
; ((w/16)-1) << 8 | h (t0*8 = 16*w), so r6b restores h per column and
; the 1<<8 subtraction counts columns.  r4/r7 save the column base ptrs.
.v_w16:
.v_w64:
.v_w128:
   movifnidn           t0d, org_w
   add                 t0d, t0d
   mov                  r4, srcq
   lea                 r6d, [hq+t0*8-256]
   mov                  r7, dstq
.v_w16_loop0:
   movu                 m0, [srcq+ssq*0]
.v_w16_loop:
   movu                 m3, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   psubw                m1, m3, m0
   pmulhrsw             m1, m5
   paddw                m1, m0
   movu                 m0, [srcq+ssq*0]
   psubw                m2, m0, m3
   pmulhrsw             m2, m5
   paddw                m2, m3
   mova       [dstq+dsq*0], m1
   mova       [dstq+dsq*1], m2
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .v_w16_loop
   add                  r4, 32
   add                  r7, 32
   movzx                hd, r6b
   mov                srcq, r4
   mov                dstq, r7
   sub                 r6d, 1<<8
   jg .v_w16_loop0
   RET
; 2-D filter: horizontal pass as in .h but with 2 fractional bits kept
; (>> 2 instead of >> 4); vertical pass via pmulhw with my << 11, then
; a final pmulhrsw by m7 (2048 for 10-bit, 8192 for 12-bit) to round.
.hv:
   movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
   WIN64_SPILL_XMM       8
   shl                mxyd, 11
   vpbroadcastd         m3, [pw_2]
   movd                xm6, mxyd
   vpbroadcastd         m7, [pw_8192]
   add                  wq, r7
   vpbroadcastw         m6, xm6
   test          dword r8m, 0x800 ; 12-bit content?
   jnz .hv_12bpc
   ; 10-bit: scale the h weights up by 4 and use the smaller final round.
   psllw                m4, 2
   psllw                m5, 2
   vpbroadcastd         m7, [pw_2048]
.hv_12bpc:
   jmp                  wq
.hv_w2:
   vpbroadcastq        xm1, [srcq+ssq*0]
   pmullw              xm0, xm4, xm1
   psrlq               xm1, 16
   pmullw              xm1, xm5
   paddw               xm0, xm3
   paddw               xm0, xm1
   psrlw               xm0, 2      ; xm0 = h-filtered row 0
.hv_w2_loop:
   movq                xm2, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   movhps              xm2, [srcq+ssq*0]
   pmullw              xm1, xm4, xm2
   psrlq               xm2, 16
   pmullw              xm2, xm5
   paddw               xm1, xm3
   paddw               xm1, xm2
   psrlw               xm1, 2              ; 1 _ 2 _
   shufpd              xm2, xm0, xm1, 0x01 ; 0 _ 1 _
   mova                xm0, xm1
   psubw               xm1, xm2
   paddw               xm1, xm1
   pmulhw              xm1, xm6    ; (b-a)*my/16 (truncated)
   paddw               xm1, xm2
   pmulhrsw            xm1, xm7    ; final round + shift
   movd       [dstq+dsq*0], xm1
   pextrd     [dstq+dsq*1], xm1, 2
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .hv_w2_loop
   RET
.hv_w4:
   pmullw              xm0, xm4, [srcq+ssq*0-8] ; -8 keeps row 0 in the high half
   pmullw              xm1, xm5, [srcq+ssq*0-6]
   paddw               xm0, xm3
   paddw               xm0, xm1
   psrlw               xm0, 2
.hv_w4_loop:
   movq                xm1, [srcq+ssq*1]
   movq                xm2, [srcq+ssq*1+2]
   lea                srcq, [srcq+ssq*2]
   movhps              xm1, [srcq+ssq*0]
   movhps              xm2, [srcq+ssq*0+2]
   pmullw              xm1, xm4
   pmullw              xm2, xm5
   paddw               xm1, xm3
   paddw               xm1, xm2
   psrlw               xm1, 2              ; 1 2
   shufpd              xm2, xm0, xm1, 0x01 ; 0 1
   mova                xm0, xm1
   psubw               xm1, xm2
   paddw               xm1, xm1
   pmulhw              xm1, xm6
   paddw               xm1, xm2
   pmulhrsw            xm1, xm7
   movq       [dstq+dsq*0], xm1
   movhps     [dstq+dsq*1], xm1
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .hv_w4_loop
   RET
.hv_w8:
   pmullw              xm0, xm4, [srcq+ssq*0]
   pmullw              xm1, xm5, [srcq+ssq*0+2]
   paddw               xm0, xm3
   paddw               xm0, xm1
   psrlw               xm0, 2
   vinserti128          m0, xm0, 1  ; duplicate row 0 into both lanes
.hv_w8_loop:
   movu                xm1, [srcq+ssq*1]
   movu                xm2, [srcq+ssq*1+2]
   lea                srcq, [srcq+ssq*2]
   vinserti128          m1, [srcq+ssq*0], 1
   vinserti128          m2, [srcq+ssq*0+2], 1
   pmullw               m1, m4
   pmullw               m2, m5
   paddw                m1, m3
   paddw                m1, m2
   psrlw                m1, 2            ; 1 2
   vperm2i128           m2, m0, m1, 0x21 ; 0 1
   mova                 m0, m1
   psubw                m1, m2
   paddw                m1, m1
   pmulhw               m1, m6
   paddw                m1, m2
   pmulhrsw             m1, m7
   mova         [dstq+dsq*0], xm1
   vextracti128 [dstq+dsq*1], m1, 1
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .hv_w8_loop
   RET
; Wide hv: 16-pixel columns, same packed counter scheme as .v_w16
; (r6d = ((w/16)-1) << 8 | h; r4/r7 save the column base pointers).
.hv_w16:
.hv_w32:
.hv_w64:
.hv_w128:
%if UNIX64
   lea                 r6d, [r8*2-32]
%else
   mov                 r6d, wm
   lea                 r6d, [r6*2-32]
%endif
   mov                  r4, srcq
   lea                 r6d, [hq+r6*8]
   mov                  r7, dstq
.hv_w16_loop0:
   pmullw               m0, m4, [srcq+ssq*0]
   pmullw               m1, m5, [srcq+ssq*0+2]
   paddw                m0, m3
   paddw                m0, m1
   psrlw                m0, 2       ; m0 = h-filtered previous row
.hv_w16_loop:
   pmullw               m1, m4, [srcq+ssq*1]
   pmullw               m2, m5, [srcq+ssq*1+2]
   lea                srcq, [srcq+ssq*2]
   paddw                m1, m3
   paddw                m1, m2
   psrlw                m1, 2
   psubw                m2, m1, m0
   paddw                m2, m2
   pmulhw               m2, m6
   paddw                m2, m0
   pmulhrsw             m2, m7
   mova       [dstq+dsq*0], m2
   pmullw               m0, m4, [srcq+ssq*0]
   pmullw               m2, m5, [srcq+ssq*0+2]
   paddw                m0, m3
   paddw                m0, m2
   psrlw                m0, 2
   psubw                m2, m0, m1
   paddw                m2, m2
   pmulhw               m2, m6
   paddw                m2, m1
   pmulhrsw             m2, m7
   mova       [dstq+dsq*1], m2
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .hv_w16_loop
   add                  r4, 32
   add                  r7, 32
   movzx                hd, r6b
   mov                srcq, r4
   mov                dstq, r7
   sub                 r6d, 1<<8
   jg .hv_w16_loop0
   RET
    727 
    728 cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    729    movifnidn          mxyd, r5m ; mx
    730    lea                  r6, [prep_avx2]
    731 %if UNIX64
    732    DECLARE_REG_TMP 7
    733    %define org_w r7d
    734 %else
    735    DECLARE_REG_TMP 6
    736    %define org_w r5m
    737 %endif
    738    mov               org_w, wd
    739    tzcnt                wd, wm
    740    movifnidn            hd, hm
    741    test               mxyd, mxyd
    742    jnz .h
    743    mov                mxyd, r6m ; my
    744    test               mxyd, mxyd
    745    jnz .v
    746 .prep:
    747    movzx                wd, word [r6+wq*2+table_offset(prep,)]
    748    mov                 r5d, r7m ; bitdepth_max
    749    vpbroadcastd         m5, [r6-prep_avx2+pw_8192]
    750    add                  wq, r6
    751    shr                 r5d, 11
    752    vpbroadcastd         m4, [r6-prep_avx2+prep_mul+r5*4]
    753    lea            stride3q, [strideq*3]
    754    jmp                  wq
    ; ------------------------------------------------------------------
    ; prep copy paths (no subpel filtering): convert pixels into the
    ; signed intermediate format: tmp[x] = px[x] * m4 - m5.
    ; m4 = per-bitdepth multiplier loaded from prep_mul in the dispatch
    ; code above; m5 was initialized before this excerpt -- presumably
    ; the PREP_BIAS offset (NOTE(review): set outside this chunk; verify).
    ; One .prep_wN entry per block width N, each looping over rows.
    ; ------------------------------------------------------------------
    755 .prep_w4:
    ; w=4: pack 4 rows (8 bytes each) into a single 32-byte register
    756    movq                xm0, [srcq+strideq*0]
    757    movhps              xm0, [srcq+strideq*1]
    758    vpbroadcastq         m1, [srcq+strideq*2]
    759    vpbroadcastq         m2, [srcq+stride3q ]
    760    lea                srcq, [srcq+strideq*4]
    761    vpblendd             m0, m1, 0x30
    762    vpblendd             m0, m2, 0xc0
    763    pmullw               m0, m4
    764    psubw                m0, m5
    765    mova             [tmpq], m0
    766    add                tmpq, 32
    767    sub                  hd, 4
    768    jg .prep_w4
    769    RET
    770 .prep_w8:
    ; w=8: two rows per ymm register, 4 rows per iteration
    771    movu                xm0, [srcq+strideq*0]
    772    vinserti128          m0, [srcq+strideq*1], 1
    773    movu                xm1, [srcq+strideq*2]
    774    vinserti128          m1, [srcq+stride3q ], 1
    775    lea                srcq, [srcq+strideq*4]
    776    pmullw               m0, m4
    777    pmullw               m1, m4
    778    psubw                m0, m5
    779    psubw                m1, m5
    780    mova        [tmpq+32*0], m0
    781    mova        [tmpq+32*1], m1
    782    add                tmpq, 32*2
    783    sub                  hd, 4
    784    jg .prep_w8
    785    RET
    786 .prep_w16:
    ; w=16: one full ymm register per row, 4 rows per iteration
    787    pmullw               m0, m4, [srcq+strideq*0]
    788    pmullw               m1, m4, [srcq+strideq*1]
    789    pmullw               m2, m4, [srcq+strideq*2]
    790    pmullw               m3, m4, [srcq+stride3q ]
    791    lea                srcq, [srcq+strideq*4]
    792    psubw                m0, m5
    793    psubw                m1, m5
    794    psubw                m2, m5
    795    psubw                m3, m5
    796    mova        [tmpq+32*0], m0
    797    mova        [tmpq+32*1], m1
    798    mova        [tmpq+32*2], m2
    799    mova        [tmpq+32*3], m3
    800    add                tmpq, 32*4
    801    sub                  hd, 4
    802    jg .prep_w16
    803    RET
    804 .prep_w32:
    ; w=32: two ymm registers per row, 2 rows per iteration
    805    pmullw               m0, m4, [srcq+strideq*0+32*0]
    806    pmullw               m1, m4, [srcq+strideq*0+32*1]
    807    pmullw               m2, m4, [srcq+strideq*1+32*0]
    808    pmullw               m3, m4, [srcq+strideq*1+32*1]
    809    lea                srcq, [srcq+strideq*2]
    810    psubw                m0, m5
    811    psubw                m1, m5
    812    psubw                m2, m5
    813    psubw                m3, m5
    814    mova        [tmpq+32*0], m0
    815    mova        [tmpq+32*1], m1
    816    mova        [tmpq+32*2], m2
    817    mova        [tmpq+32*3], m3
    818    add                tmpq, 32*4
    819    sub                  hd, 2
    820    jg .prep_w32
    821    RET
    822 .prep_w64:
    ; w=64: four ymm registers per row, 1 row per iteration
    823    pmullw               m0, m4, [srcq+32*0]
    824    pmullw               m1, m4, [srcq+32*1]
    825    pmullw               m2, m4, [srcq+32*2]
    826    pmullw               m3, m4, [srcq+32*3]
    827    add                srcq, strideq
    828    psubw                m0, m5
    829    psubw                m1, m5
    830    psubw                m2, m5
    831    psubw                m3, m5
    832    mova        [tmpq+32*0], m0
    833    mova        [tmpq+32*1], m1
    834    mova        [tmpq+32*2], m2
    835    mova        [tmpq+32*3], m3
    836    add                tmpq, 32*4
    837    dec                  hd
    838    jg .prep_w64
    839    RET
    840 .prep_w128:
    ; w=128: eight ymm registers per row (two 4-register passes),
    ; 1 row per iteration
    841    pmullw               m0, m4, [srcq+32*0]
    842    pmullw               m1, m4, [srcq+32*1]
    843    pmullw               m2, m4, [srcq+32*2]
    844    pmullw               m3, m4, [srcq+32*3]
    845    psubw                m0, m5
    846    psubw                m1, m5
    847    psubw                m2, m5
    848    psubw                m3, m5
    849    mova        [tmpq+32*0], m0
    850    mova        [tmpq+32*1], m1
    851    mova        [tmpq+32*2], m2
    852    mova        [tmpq+32*3], m3
    853    pmullw               m0, m4, [srcq+32*4]
    854    pmullw               m1, m4, [srcq+32*5]
    855    pmullw               m2, m4, [srcq+32*6]
    856    pmullw               m3, m4, [srcq+32*7]
    ; tmpq is advanced early; the second half stores use negative offsets
    857    add                tmpq, 32*8
    858    add                srcq, strideq
    859    psubw                m0, m5
    860    psubw                m1, m5
    861    psubw                m2, m5
    862    psubw                m3, m5
    863    mova        [tmpq-32*4], m0
    864    mova        [tmpq-32*3], m1
    865    mova        [tmpq-32*2], m2
    866    mova        [tmpq-32*1], m3
    867    dec                  hd
    868    jg .prep_w128
    869    RET
    ; ------------------------------------------------------------------
    ; horizontal bilinear prep:
    ;   tmp[x] = (m4*px[x] + m5*px[x+1] - m3) >> 2
    ; where m4 = 16-mx, m5 = mx and m3 = 32766 (bias). For non-12bpc
    ; input both weights are pre-shifted left by 2, keeping extra
    ; intermediate precision through the same >> 2.
    ; ------------------------------------------------------------------
    870 .h:
    871    movd                xm5, mxyd
    872    mov                mxyd, r6m ; my
    873    vpbroadcastd         m4, [pw_16]
    874    vpbroadcastw         m5, xm5
    875    vpbroadcastd         m3, [pw_32766]
    876    psubw                m4, m5       ; m4 = 16 - mx
    877    test          dword r7m, 0x800    ; bitdepth_max bit: set for 12bpc
    878    jnz .h_12bpc
    879    psllw                m4, 2        ; 10bpc: scale weights up by 4
    880    psllw                m5, 2
    881 .h_12bpc:
    882    test               mxyd, mxyd     ; my != 0 -> 2-pass h+v path
    883    jnz .hv
    884    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
    885    add                  wq, r6
    886    lea            stride3q, [strideq*3]
    887    jmp                  wq
    888 .h_w4:
    ; w=4: 4 rows per iteration; build px[x] and px[x+1] vectors with
    ; qword packing and byte shifts instead of separate unaligned loads
    889    movu                xm1, [srcq+strideq*0]
    890    vinserti128          m1, [srcq+strideq*2], 1
    891    movu                xm2, [srcq+strideq*1]
    892    vinserti128          m2, [srcq+stride3q ], 1
    893    lea                srcq, [srcq+strideq*4]
    894    punpcklqdq           m0, m1, m2
    895    psrldq               m1, 2
    896    pslldq               m2, 6
    897    pmullw               m0, m4
    898    vpblendd             m1, m2, 0xcc
    899    pmullw               m1, m5
    900    psubw                m0, m3
    901    paddw                m0, m1
    902    psraw                m0, 2
    903    mova             [tmpq], m0
    904    add                tmpq, 32
    905    sub                  hd, 4
    906    jg .h_w4
    907    RET
    908 .h_w8:
    ; w=8: 2 rows per iteration; px[x+1] comes from a +2 byte offset load
    909    movu                xm0, [srcq+strideq*0]
    910    vinserti128          m0, [srcq+strideq*1], 1
    911    movu                xm1, [srcq+strideq*0+2]
    912    vinserti128          m1, [srcq+strideq*1+2], 1
    913    lea                srcq, [srcq+strideq*2]
    914    pmullw               m0, m4
    915    pmullw               m1, m5
    916    psubw                m0, m3
    917    paddw                m0, m1
    918    psraw                m0, 2
    919    mova             [tmpq], m0
    920    add                tmpq, 32
    921    sub                  hd, 2
    922    jg .h_w8
    923    RET
    924 .h_w16:
    ; w=16: one row per register, 2 rows per iteration
    925    pmullw               m0, m4, [srcq+strideq*0]
    926    pmullw               m1, m5, [srcq+strideq*0+2]
    927    psubw                m0, m3
    928    paddw                m0, m1
    929    pmullw               m1, m4, [srcq+strideq*1]
    930    pmullw               m2, m5, [srcq+strideq*1+2]
    931    lea                srcq, [srcq+strideq*2]
    932    psubw                m1, m3
    933    paddw                m1, m2
    934    psraw                m0, 2
    935    psraw                m1, 2
    936    mova        [tmpq+32*0], m0
    937    mova        [tmpq+32*1], m1
    938    add                tmpq, 32*2
    939    sub                  hd, 2
    940    jg .h_w16
    941    RET
    942 .h_w32:
    943 .h_w64:
    944 .h_w128:
    ; wide widths share one path: inner loop walks 32 pixels (two
    ; registers) at a time from the right edge (r3 counts pixels down)
    945    movifnidn           t0d, org_w
    946 .h_w32_loop0:
    947    mov                 r3d, t0d
    948 .h_w32_loop:
    949    pmullw               m0, m4, [srcq+r3*2-32*1]
    950    pmullw               m1, m5, [srcq+r3*2-32*1+2]
    951    psubw                m0, m3
    952    paddw                m0, m1
    953    pmullw               m1, m4, [srcq+r3*2-32*2]
    954    pmullw               m2, m5, [srcq+r3*2-32*2+2]
    955    psubw                m1, m3
    956    paddw                m1, m2
    957    psraw                m0, 2
    958    psraw                m1, 2
    959    mova   [tmpq+r3*2-32*1], m0
    960    mova   [tmpq+r3*2-32*2], m1
    961    sub                 r3d, 32
    962    jg .h_w32_loop
    963    add                srcq, strideq
    964    lea                tmpq, [tmpq+t0*2]
    965    dec                  hd
    966    jg .h_w32_loop0
    967    RET
    ; ------------------------------------------------------------------
    ; vertical bilinear prep:
    ;   tmp[x] = (m4*px[row] + m5*px[row+1] - m3) >> 2
    ; m4 = 16-my, m5 = my, m3 = 32766; weights << 2 for non-12bpc,
    ; mirroring the .h setup above.
    ; ------------------------------------------------------------------
    968 .v:
    969    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    970    movd                xm5, mxyd
    971    vpbroadcastd         m4, [pw_16]
    972    vpbroadcastw         m5, xm5
    973    vpbroadcastd         m3, [pw_32766]
    974    add                  wq, r6
    975    lea            stride3q, [strideq*3]
    976    psubw                m4, m5       ; m4 = 16 - my
    977    test          dword r7m, 0x800    ; 12bpc?
    978    jnz .v_12bpc
    979    psllw                m4, 2        ; 10bpc: scale weights up by 4
    980    psllw                m5, 2
    981 .v_12bpc:
    982    jmp                  wq
    983 .v_w4:
    ; w=4: keep last row in m0 across iterations; assemble
    ; rows 0-3 and rows 1-4 vectors via blends (lane layout in comments)
    984    movq                xm0, [srcq+strideq*0]
    985 .v_w4_loop:
    986    vpbroadcastq         m2, [srcq+strideq*2]
    987    vpbroadcastq        xm1, [srcq+strideq*1]
    988    vpblendd             m2, m0, 0x03 ; 0 2 2 2
    989    vpbroadcastq         m0, [srcq+stride3q ]
    990    lea                srcq, [srcq+strideq*4]
    991    vpblendd             m1, m0, 0xf0 ; 1 1 3 3
    992    vpbroadcastq         m0, [srcq+strideq*0]
    993    vpblendd             m1, m2, 0x33 ; 0 1 2 3
    994    vpblendd             m0, m2, 0x0c ; 4 2 4 4
    995    punpckhqdq           m2, m1, m0   ; 1 2 3 4
    996    pmullw               m1, m4
    997    pmullw               m2, m5
    998    psubw                m1, m3
    999    paddw                m1, m2
   1000    psraw                m1, 2
   1001    mova             [tmpq], m1
   1002    add                tmpq, 32
   1003    sub                  hd, 4
   1004    jg .v_w4_loop
   1005    RET
   1006 .v_w8:
   ; w=8: 2 rows per iteration, reusing the previous bottom row in m0
   1007    movu                xm0, [srcq+strideq*0]
   1008 .v_w8_loop:
   1009    vbroadcasti128       m2, [srcq+strideq*1]
   1010    lea                srcq, [srcq+strideq*2]
   1011    vpblendd             m1, m0, m2, 0xf0 ; 0 1
   1012    vbroadcasti128       m0, [srcq+strideq*0]
   1013    vpblendd             m2, m0, 0xf0     ; 1 2
   1014    pmullw               m1, m4
   1015    pmullw               m2, m5
   1016    psubw                m1, m3
   1017    paddw                m1, m2
   1018    psraw                m1, 2
   1019    mova             [tmpq], m1
   1020    add                tmpq, 32
   1021    sub                  hd, 2
   1022    jg .v_w8_loop
   1023    RET
   1024 .v_w16:
   ; w=16: full-register rows, carry the bottom row in m0
   1025    movu                 m0, [srcq+strideq*0]
   1026 .v_w16_loop:
   1027    movu                 m2, [srcq+strideq*1]
   1028    lea                srcq, [srcq+strideq*2]
   1029    pmullw               m0, m4
   1030    pmullw               m1, m5, m2
   1031    psubw                m0, m3
   1032    paddw                m1, m0
   1033    movu                 m0, [srcq+strideq*0]
   1034    psraw                m1, 2
   1035    pmullw               m2, m4
   1036    mova        [tmpq+32*0], m1
   1037    pmullw               m1, m5, m0
   1038    psubw                m2, m3
   1039    paddw                m1, m2
   1040    psraw                m1, 2
   1041    mova        [tmpq+32*1], m1
   1042    add                tmpq, 32*2
   1043    sub                  hd, 2
   1044    jg .v_w16_loop
   1045    RET
   1046 .v_w32:
   1047 .v_w64:
   1048 .v_w128:
   ; wide widths: process the block in 32-byte (16-pixel) column tiles.
   ; r7d = org_w*2 (tmp row pitch in bytes); r6d packs the remaining
   ; tile count in its upper bits with h in its low byte (restored by
   ; movzx hd, r6b each tile). r3/r5 save the per-tile src/tmp bases.
   1049 %if WIN64
   1050    PUSH                 r7
   1051 %endif
   1052    movifnidn           r7d, org_w
   1053    add                 r7d, r7d
   1054    mov                  r3, srcq
   1055    lea                 r6d, [hq+r7*8-256]
   1056    mov                  r5, tmpq
   1057 .v_w32_loop0:
   1058    movu                 m0, [srcq+strideq*0]
   1059 .v_w32_loop:
   1060    movu                 m2, [srcq+strideq*1]
   1061    lea                srcq, [srcq+strideq*2]
   1062    pmullw               m0, m4
   1063    pmullw               m1, m5, m2
   1064    psubw                m0, m3
   1065    paddw                m1, m0
   1066    movu                 m0, [srcq+strideq*0]
   1067    psraw                m1, 2
   1068    pmullw               m2, m4
   1069    mova        [tmpq+r7*0], m1
   1070    pmullw               m1, m5, m0
   1071    psubw                m2, m3
   1072    paddw                m1, m2
   1073    psraw                m1, 2
   1074    mova        [tmpq+r7*1], m1
   1075    lea                tmpq, [tmpq+r7*2]
   1076    sub                  hd, 2
   1077    jg .v_w32_loop
   1078    add                  r3, 32      ; next 16-pixel column tile
   1079    add                  r5, 32
   1080    movzx                hd, r6b     ; restore row count
   1081    mov                srcq, r3
   1082    mov                tmpq, r5
   1083    sub                 r6d, 1<<8    ; one tile done
   1084    jg .v_w32_loop0
   1085 %if WIN64
   1086    POP                  r7
   1087 %endif
   1088    RET
   ; -------------------------------------------------------------------
   ; 2-pass (h then v) bilinear prep. Pass 1 is the same horizontal
   ; filter as .h above (result still biased, >> 2). Pass 2 blends
   ; vertically adjacent pass-1 rows with pmulhrsw:
   ;   out = prev + pmulhrsw(cur - prev, my << 11)
   ; pmulhrsw computes (a*b*2 + 0x8000) >> 16, so with b = my << 11
   ; this equals prev + ((cur - prev)*my + 8) >> 4, i.e. a rounded
   ; my/16 linear interpolation.
   ; -------------------------------------------------------------------
   1089 .hv:
   1090    WIN64_SPILL_XMM       7
   1091    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
   1092    shl                mxyd, 11     ; my << 11 for pmulhrsw (see above)
   1093    movd                xm6, mxyd
   1094    add                  wq, r6
   1095    lea            stride3q, [strideq*3]
   1096    vpbroadcastw         m6, xm6
   1097    jmp                  wq
   1098 .hv_w4:
   ; prologue: horizontally filter row 0 only, broadcast as "prev"
   1099    movu                xm1, [srcq+strideq*0]
   1100 %if WIN64
   1101    movaps         [rsp+24], xmm7    ; xmm7 used as scratch below
   1102 %endif
   1103    pmullw              xm0, xm4, xm1
   1104    psrldq              xm1, 2
   1105    pmullw              xm1, xm5
   1106    psubw               xm0, xm3
   1107    paddw               xm0, xm1
   1108    psraw               xm0, 2
   1109    vpbroadcastq         m0, xm0
   1110 .hv_w4_loop:
   ; pass 1 on rows 1-4 (same packing trick as .h_w4)
   1111    movu                xm1, [srcq+strideq*1]
   1112    vinserti128          m1, [srcq+stride3q ], 1
   1113    movu                xm2, [srcq+strideq*2]
   1114    lea                srcq, [srcq+strideq*4]
   1115    vinserti128          m2, [srcq+strideq*0], 1
   1116    punpcklqdq           m7, m1, m2
   1117    psrldq               m1, 2
   1118    pslldq               m2, 6
   1119    pmullw               m7, m4
   1120    vpblendd             m1, m2, 0xcc
   1121    pmullw               m1, m5
   1122    psubw                m7, m3
   1123    paddw                m1, m7
   1124    psraw                m1, 2         ; 1 2 3 4
   ; pass 2: rotate in the carried row 0 to form the "prev" vector
   1125    vpblendd             m0, m1, 0x3f
   1126    vpermq               m2, m0, q2103 ; 0 1 2 3
   1127    mova                 m0, m1
   1128    psubw                m1, m2
   1129    pmulhrsw             m1, m6
   1130    paddw                m1, m2
   1131    mova             [tmpq], m1
   1132    add                tmpq, 32
   1133    sub                  hd, 4
   1134    jg .hv_w4_loop
   1135 %if WIN64
   1136    movaps             xmm7, [rsp+24]
   1137 %endif
   1138    RET
   1139 .hv_w8:
   ; prologue: filter row 0, duplicate across both 128-bit lanes
   1140    pmullw              xm0, xm4, [srcq+strideq*0]
   1141    pmullw              xm1, xm5, [srcq+strideq*0+2]
   1142    psubw               xm0, xm3
   1143    paddw               xm0, xm1
   1144    psraw               xm0, 2
   1145    vinserti128          m0, xm0, 1
   1146 .hv_w8_loop:
   1147    movu                xm1, [srcq+strideq*1]
   1148    movu                xm2, [srcq+strideq*1+2]
   1149    lea                srcq, [srcq+strideq*2]
   1150    vinserti128          m1, [srcq+strideq*0], 1
   1151    vinserti128          m2, [srcq+strideq*0+2], 1
   1152    pmullw               m1, m4
   1153    pmullw               m2, m5
   1154    psubw                m1, m3
   1155    paddw                m1, m2
   1156    psraw                m1, 2            ; 1 2
   1157    vperm2i128           m2, m0, m1, 0x21 ; 0 1  (shift rows down one)
   1158    mova                 m0, m1
   1159    psubw                m1, m2
   1160    pmulhrsw             m1, m6
   1161    paddw                m1, m2
   1162    mova             [tmpq], m1
   1163    add                tmpq, 32
   1164    sub                  hd, 2
   1165    jg .hv_w8_loop
   1166    RET
   1167 .hv_w16:
   1168 .hv_w32:
   1169 .hv_w64:
   1170 .hv_w128:
   ; wide widths: same 16-pixel column tiling as .v_w32 (r7d = row
   ; pitch, r6d = packed tile count | h, r3/r5 = tile base pointers)
   1171 %if WIN64
   1172    PUSH                 r7
   1173 %endif
   1174    movifnidn           r7d, org_w
   1175    add                 r7d, r7d
   1176    mov                  r3, srcq
   1177    lea                 r6d, [hq+r7*8-256]
   1178    mov                  r5, tmpq
   1179 .hv_w16_loop0:
   ; filter row 0 of this tile as the initial "prev" (m0)
   1180    pmullw               m0, m4, [srcq]
   1181    pmullw               m1, m5, [srcq+2]
   1182    psubw                m0, m3
   1183    paddw                m0, m1
   1184    psraw                m0, 2
   1185 .hv_w16_loop:
   1186    pmullw               m1, m4, [srcq+strideq*1]
   1187    pmullw               m2, m5, [srcq+strideq*1+2]
   1188    lea                srcq, [srcq+strideq*2]
   1189    psubw                m1, m3
   1190    paddw                m1, m2
   1191    psraw                m1, 2
   1192    psubw                m2, m1, m0
   1193    pmulhrsw             m2, m6
   1194    paddw                m2, m0
   1195    mova        [tmpq+r7*0], m2
   1196    pmullw               m0, m4, [srcq+strideq*0]
   1197    pmullw               m2, m5, [srcq+strideq*0+2]
   1198    psubw                m0, m3
   1199    paddw                m0, m2
   1200    psraw                m0, 2
   1201    psubw                m2, m0, m1
   1202    pmulhrsw             m2, m6
   1203    paddw                m2, m1
   1204    mova        [tmpq+r7*1], m2
   1205    lea                tmpq, [tmpq+r7*2]
   1206    sub                  hd, 2
   1207    jg .hv_w16_loop
   1208    add                  r3, 32
   1209    add                  r5, 32
   1210    movzx                hd, r6b
   1211    mov                srcq, r3
   1212    mov                tmpq, r5
   1213    sub                 r6d, 1<<8
   1214    jg .hv_w16_loop0
   1215 %if WIN64
   1216    POP                  r7
   1217 %endif
   1218    RET
   1219 
   1220 ; int8_t subpel_filters[5][15][8]
   ; Packed subpel_filters row offsets, one constant per filter type:
   ;   high 16 bits = offset (set_index * 15 rows) of the primary set,
   ;   low  16 bits = offset of the alternate set selected for small
   ;                  block dimensions (see mxb/myb handling below).
   ; Combined with mx/my * 0x010101 at function entry so individual
   ; bytes/words of mxd/myd become ready-made table indices.
   1221 %assign FILTER_REGULAR (0*15 << 16) | 3*15
   1222 %assign FILTER_SMOOTH  (1*15 << 16) | 4*15
   1223 %assign FILTER_SHARP   (2*15 << 16) | 3*15
   1224 
   ; FN: emit a tiny entry stub %1_%2_16bpc that loads the packed
   ; horizontal (%3) and vertical (%4) filter-type constants into
   ; t0d/t1d, then tail-jumps to the shared implementation %5.
   ; When %5 is omitted the stub is placed immediately before the
   ; shared cglobal and simply falls through into it.
   1225 %macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
   1226 cglobal %1_%2_16bpc
   1227    mov                 t0d, FILTER_%3
   1228 %ifidn %3, %4
   1229    mov                 t1d, t0d     ; same type both directions
   1230 %else
   1231    mov                 t1d, FILTER_%4
   1232 %endif
   1233 %if %0 == 5 ; skip the jump in the last filter
   1234    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
   1235 %endif
   1236 %endmacro
   1237 
   ; t0/t1 are the scratch registers the FN stubs use to hand the
   ; packed filter ids to the shared implementation; chosen per ABI so
   ; they don't clash with argument registers (mapping provided by
   ; x86inc's DECLARE_REG_TMP -- NOTE(review): defined outside this file).
   1238 %if WIN64
   1239 DECLARE_REG_TMP 4, 5
   1240 %else
   1241 DECLARE_REG_TMP 7, 8
   1242 %endif
   1243 
   ; Instantiate the put_8tap_<type>_16bpc entry points. The first
   ; three stubs jump to put_6tap_16bpc; "regular" omits the jump
   ; target and falls through into the cglobal that follows.
   1244 %define PUT_8TAP_FN FN put_8tap,
   1245 PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
   1246 PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
   1247 PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
   1248 PUT_8TAP_FN regular,        REGULAR, REGULAR
   1249 
   ; put_6tap_16bpc: subpel motion-compensated copy (continues past
   ; this excerpt). mx/my are replicated into 3 bytes (*0x010101) and
   ; offset by the packed filter ids from the FN stub, so the byte and
   ; word fields of mxd/myd directly index subpel_filters variants.
   1250 cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
   1251 %define base r8-put_avx2
   1252    imul                mxd, mxm, 0x010101
   1253    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
   1254    imul                myd, mym, 0x010101
   1255    add                 myd, t1d ; 6tap_v, my, 4tap_v
   1256    lea                  r8, [put_avx2]
   1257    movifnidn            wd, wm
   1258    movifnidn            hd, hm
   1259    test                mxd, 0xf00    ; horizontal subpel present?
   1260    jnz .h
   1261    test                myd, 0xf00    ; vertical subpel present?
   1262    jnz .v
   1263 .put:
   ; no subpel at all: dispatch to a plain copy via the width table
   1264    tzcnt                wd, wd
   1265    movzx                wd, word [r8+wq*2+table_offset(put,)]
   1266    add                  wq, r8
   1267 %if WIN64
   1268    pop                  r8
   1269 %endif
   1270    jmp                  wq
   1271 .h_w2:
   ; w=2 horizontal: 4-tap filter (bytes 2..5 of the 8-tap row, hence
   ; the +2 table offset). m4 (rounding) and m5 (pixel-max clamp) are
   ; loaded at .h before dispatching here.
   1272    movzx               mxd, mxb     ; low byte = small-w filter id
   1273    sub                srcq, 2
   1274    mova                xm2, [subpel_h_shuf2]
   1275    vpbroadcastd        xm3, [base+subpel_filters+mxq*8+2]
   1276    pmovsxbw            xm3, xm3
   1277 .h_w2_loop:
   1278    movu                xm0, [srcq+ssq*0]
   1279    movu                xm1, [srcq+ssq*1]
   1280    lea                srcq, [srcq+ssq*2]
   1281    pshufb              xm0, xm2
   1282    pshufb              xm1, xm2
   1283    pmaddwd             xm0, xm3
   1284    pmaddwd             xm1, xm3
   1285    phaddd              xm0, xm1
   1286    paddd               xm0, xm4
   1287    psrad               xm0, 6
   1288    packusdw            xm0, xm0
   1289    pminsw              xm0, xm5     ; clamp to bitdepth max
   1290    movd       [dstq+dsq*0], xm0
   1291    pextrd     [dstq+dsq*1], xm0, 1
   1292    lea                dstq, [dstq+dsq*2]
   1293    sub                  hd, 2
   1294    jg .h_w2_loop
   1295    RET
   1296 .h_w4:
   ; w=4 horizontal: 2 rows per iteration, two pmaddwd tap pairs
   1297    movzx               mxd, mxb
   1298    sub                srcq, 2
   1299    pmovsxbw            xm3, [base+subpel_filters+mxq*8]
   1300    WIN64_SPILL_XMM       8
   1301    vbroadcasti128       m6, [subpel_h_shufA]
   1302    vbroadcasti128       m7, [subpel_h_shufB]
   1303    pshufd              xm3, xm3, q2211   ; split taps into two pairs
   1304    vpbroadcastq         m2, xm3
   1305    vpermq               m3, m3, q1111
   1306 .h_w4_loop:
   1307    movu                xm1, [srcq+ssq*0]
   1308    vinserti128          m1, [srcq+ssq*1], 1
   1309    lea                srcq, [srcq+ssq*2]
   1310    pshufb               m0, m1, m6 ; 0 1 1 2 2 3 3 4
   1311    pshufb               m1, m7     ; 2 3 3 4 4 5 5 6
   1312    pmaddwd              m0, m2
   1313    pmaddwd              m1, m3
   1314    paddd                m0, m4
   1315    paddd                m0, m1
   1316    psrad                m0, 6
   1317    vextracti128        xm1, m0, 1
   1318    packusdw            xm0, xm1
   1319    pminsw              xm0, xm5
   1320    movq       [dstq+dsq*0], xm0
   1321    movhps     [dstq+dsq*1], xm0
   1322    lea                dstq, [dstq+dsq*2]
   1323    sub                  hd, 2
   1324    jg .h_w4_loop
   1325    RET
   1326 .h:
   ; horizontal-only path. r8m = bitdepth_max: broadcast as the clamp
   ; (m5) and, >> 11, used to index the per-bitdepth rounding constant.
   1327    test                myd, 0xf00
   1328    jnz .hv
   1329    mov                 r7d, r8m
   1330    vpbroadcastw         m5, r8m
   1331    shr                 r7d, 11
   1332    vpbroadcastd         m4, [base+put_8tap_h_rnd+r7*4]
   1333    cmp                  wd, 4
   1334    je .h_w4
   1335    jl .h_w2
   ; w >= 8: true 6-tap filter; the +1 table offset skips the first of
   ; the 8 stored taps, giving taps 1..6
   1336    WIN64_SPILL_XMM      11
   1337    shr                 mxd, 16     ; high word = 6-tap filter id
   1338    sub                srcq, 4
   1339    vpbroadcastq         m0, [base+subpel_filters+1+mxq*8]
   1340    vbroadcasti128       m6, [base+subpel_h_shufA]
   1341    punpcklbw            m0, m0
   1342    psraw                m0, 8 ; sign-extend
   1343    pshufd               m7, m0, q0000   ; tap pair 0
   1344    pshufd               m8, m0, q1111   ; tap pair 1
   1345    pshufd               m9, m0, q2222   ; tap pair 2
   1346    sub                  wd, 16
   1347    jge .h_w16
   1348 .h_w8:
   ; 6-tap horizontal filter for 16 pixels ("a" = low 8, "b" = high 8)
   ; built from three overlapping loads; shufpd forms the shifted
   ; middle windows. Result is rounded (m4), >> 6 and clamped (m5).
   1349 %macro PUT_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
   1350    pshufb              m%1, m6        ; 01 12 23 34
   1351    pshufb              m%2, m6        ; 45 56 67 78
   1352    pmaddwd             m%4, m7, m%1   ; a0
   1353    pshufb              m%3, m6        ; 89 9a ab bc
   1354    pmaddwd             m%5, m9, m%2   ; a2
   1355    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
   1356    paddd               m%4, m%5       ; a0+a2
   1357    pmaddwd             m%5, m7, m%2   ; b0
   1358    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
   1359    pmaddwd             m%3, m9        ; b2
   1360    pmaddwd             m%1, m8        ; a1
   1361    pmaddwd             m%2, m8        ; b1
   1362    paddd               m%3, m%5       ; b0+b2
   1363    paddd               m%4, m4
   1364    paddd               m%3, m4
   1365    paddd               m%1, m%4
   1366    paddd               m%2, m%3
   1367    psrad               m%1, 6
   1368    psrad               m%2, 6
   1369    packusdw            m%1, m%2
   1370    pminsw              m%1, m5
   1371 %endmacro
   ; w=8: two rows per iteration, one PUT_6TAP_H invocation
   1372    movu                xm0, [srcq+ssq*0+ 0]
   1373    vinserti128          m0, [srcq+ssq*1+ 0], 1
   1374    movu                xm2, [srcq+ssq*0+16]
   1375    vinserti128          m2, [srcq+ssq*1+16], 1
   1376    shufpd               m1, m0, m2, 0x05
   1377    lea                srcq, [srcq+ssq*2]
   1378    PUT_6TAP_H            0, 1, 2, 3, 10
   1379    mova         [dstq+dsq*0], xm0
   1380    vextracti128 [dstq+dsq*1], m0, 1
   1381    lea                dstq, [dstq+dsq*2]
   1382    sub                  hd, 2
   1383    jg .h_w8
   1384    RET
   1385 .h_w16:
   ; w >= 16: one row at a time, 16 pixels per inner iteration,
   ; walking right-to-left with r6 as the pixel countdown
   1386    mov                 r6d, wd
   1387 .h_w16_loop:
   1388    movu                 m0, [srcq+r6*2+ 0]
   1389    movu                 m1, [srcq+r6*2+ 8]
   1390    movu                 m2, [srcq+r6*2+16]
   1391    PUT_6TAP_H            0, 1, 2, 3, 10
   1392    mova        [dstq+r6*2], m0
   1393    sub                 r6d, 16
   1394    jge .h_w16_loop
   1395    add                srcq, ssq
   1396    add                dstq, dsq
   1397    dec                  hd
   1398    jg .h_w16
   1399    RET
   1400 .v:
   ; vertical-only path. Filter id: high word of myd = full 6-tap id,
   ; low byte = alternate id used when h < 6 (cmovs picks it).
   ; r6 = -ssq so the two rows above srcq can be addressed directly.
   1401    movzx               mxd, myb
   1402    shr                 myd, 16
   1403    cmp                  hd, 6
   1404    cmovs               myd, mxd
   1405    vpbroadcastq         m0, [base+subpel_filters+1+myq*8]
   1406    WIN64_SPILL_XMM      10, 12
   1407    vpbroadcastd         m5, [pd_32]    ; rounding before >> 6
   1408    vpbroadcastw         m6, r8m        ; pixel-max clamp
   1409    punpcklbw            m0, m0
   1410    mov                  r6, ssq
   1411    psraw                m0, 8 ; sign-extend
   1412    neg                  r6
   1413    pshufd               m7, m0, q0000  ; tap pair 0
   1414    pshufd               m8, m0, q1111  ; tap pair 1
   1415    pshufd               m9, m0, q2222  ; tap pair 2
   1416    cmp                  wd, 4
   1417    jg .v_w8
   1418    je .v_w4
   1419 .v_w2:
   ; w=2: keep a sliding window of interleaved row pairs (01 12 / 23 34)
   1420    movd                xm2, [srcq+r6 *2]
   1421    pinsrd              xm2, [srcq+r6 *1], 1
   1422    pinsrd              xm2, [srcq+ssq*0], 2
   1423    pinsrd              xm2, [srcq+ssq*1], 3 ; 0 1 2 3
   1424    lea                srcq, [srcq+ssq*2]
   1425    movd                xm0, [srcq+ssq*0]
   1426    palignr             xm3, xm0, xm2, 4     ; 1 2 3 4
   1427    punpcklwd           xm1, xm2, xm3        ; 01 12
   1428    punpckhwd           xm2, xm3             ; 23 34
   1429 .v_w2_loop:
   1430    movd                xm3, [srcq+ssq*1]
   1431    pmaddwd             xm4, xm7, xm1        ; a0 b0
   1432    mova                xm1, xm2
   1433    pmaddwd             xm2, xm8             ; a1 b1
   1434    lea                srcq, [srcq+ssq*2]
   1435    paddd               xm4, xm2
   1436    punpckldq           xm2, xm0, xm3        ; 4 5
   1437    movd                xm0, [srcq+ssq*0]
   1438    punpckldq           xm3, xm0             ; 5 6
   1439    punpcklwd           xm2, xm3             ; 45 56
   1440    pmaddwd             xm3, xm9, xm2        ; a2 b2
   1441    paddd               xm4, xm5
   1442    paddd               xm4, xm3
   1443    psrad               xm4, 6
   1444    packusdw            xm4, xm4
   1445    pminsw              xm4, xm6
   1446    movd       [dstq+dsq*0], xm4
   1447    pextrd     [dstq+dsq*1], xm4, 1
   1448    lea                dstq, [dstq+dsq*2]
   1449    sub                  hd, 2
   1450    jg .v_w2_loop
   1451    RET
   1452 .v_w4:
   ; w=4: same sliding-pair scheme using ymm blends, 2 rows/iteration
   1453    movq                xm1, [srcq+r6 *2]
   1454    vpbroadcastq         m3, [srcq+r6 *1]
   1455    vpbroadcastq         m2, [srcq+ssq*0]
   1456    vpbroadcastq         m4, [srcq+ssq*1]
   1457    lea                srcq, [srcq+ssq*2]
   1458    vpbroadcastq         m0, [srcq+ssq*0]
   1459    vpblendd             m1, m3, 0x30
   1460    vpblendd             m3, m2, 0x30
   1461    punpcklwd            m1, m3     ; 01 12
   1462    vpblendd             m2, m4, 0x30
   1463    vpblendd             m4, m0, 0x30
   1464    punpcklwd            m2, m4     ; 23 34
   1465 .v_w4_loop:
   1466    vpbroadcastq         m3, [srcq+ssq*1]
   1467    pmaddwd              m4, m7, m1 ; a0 b0
   1468    mova                 m1, m2
   1469    pmaddwd              m2, m8     ; a1 b1
   1470    lea                srcq, [srcq+ssq*2]
   1471    paddd                m4, m2
   1472    vpblendd             m2, m0, m3, 0x30
   1473    vpbroadcastq         m0, [srcq+ssq*0]
   1474    vpblendd             m3, m0, 0x30
   1475    punpcklwd            m2, m3     ; 45 56
   1476    pmaddwd              m3, m9, m2 ; a2 b2
   1477    paddd                m4, m5
   1478    paddd                m4, m3
   1479    psrad                m4, 6
   1480    vextracti128        xm3, m4, 1
   1481    packusdw            xm4, xm3
   1482    pminsw              xm4, xm6
   1483    movq       [dstq+dsq*0], xm4
   1484    movhps     [dstq+dsq*1], xm4
   1485    lea                dstq, [dstq+dsq*2]
   1486    sub                  hd, 2
   1487    jg .v_w4_loop
   1488    RET
   1489 .v_w8:
   ; w >= 8: process the block in 8-pixel column strips. wd packs the
   ; strip count (upper bits) with h (low byte, restored via movzx).
   1490    shl                  wd, 5
   1491    WIN64_PUSH_XMM       12
   1492    lea                  wd, [hq+wq-256]
   1493 .v_w8_loop0:
   ; prime the sliding window with rows -2..2 (r6 = -ssq), pairing
   ; rows across the two 128-bit lanes via shufpd
   1494    vbroadcasti128       m3, [srcq+r6 *2]
   1495    vbroadcasti128       m4, [srcq+r6 *1]
   1496    lea                  r7, [srcq+ssq*2]
   1497    vbroadcasti128       m0, [srcq+ssq*0]
   1498    vbroadcasti128       m1, [srcq+ssq*1]
   1499    mov                  r8, dstq
   1500    vbroadcasti128       m2, [r7+ssq*0]
   1501    shufpd               m3, m0, 0x0c
   1502    shufpd               m4, m1, 0x0c
   1503    punpcklwd            m1, m3, m4 ; 01
   1504    punpckhwd            m3, m4     ; 23
   1505    shufpd               m0, m2, 0x0c
   1506    punpcklwd            m2, m4, m0 ; 12
   1507    punpckhwd            m4, m0     ; 34
   1508 .v_w8_loop:
   1509    vbroadcasti128       m5, [r7+ssq*1]
   1510    pmaddwd             m10, m7, m1 ; a0
   1511    lea                  r7, [r7+ssq*2]
   1512    pmaddwd             m11, m7, m2 ; b0
   1513    mova                 m1, m3
   1514    pmaddwd              m3, m8     ; a1
   1515    mova                 m2, m4
   1516    pmaddwd              m4, m8     ; b1
   1517    paddd               m10, m3
   1518    vbroadcasti128       m3, [r7+ssq*0]
   1519    paddd               m11, m4
   1520    shufpd               m4, m0, m5, 0x0d
   1521    shufpd               m0, m5, m3, 0x0c
   1522    punpcklwd            m3, m4, m0 ; 45
   1523    punpckhwd            m4, m0     ; 56
   1524    pmaddwd              m5, m9, m3 ; a2
   1525    paddd               m10, m5
   1526    pmaddwd              m5, m9, m4 ; b2
   1527    paddd                m5, m11
   ; rounding trick: psrad 5 followed by pavgw with zero, i.e.
   ; ((x>>5)+1)>>1 == (x+32)>>6 -- gives the rounded >> 6 without a
   ; pd_32 add (m5 is reused as scratch in this loop)
   1528    psrad               m10, 5
   1529    psrad                m5, 5
   1530    packusdw            m10, m5
   1531    pxor                 m5, m5
   1532    pavgw                m5, m10
   1533    pminsw               m5, m6     ; clamp to bitdepth max
   1534    vpermq               m5, m5, q3120
   1535    mova         [r8+dsq*0], xm5
   1536    vextracti128 [r8+dsq*1], m5, 1
   1537    lea                  r8, [r8+dsq*2]
   1538    sub                  hd, 2
   1539    jg .v_w8_loop
   1540    add                srcq, 16     ; next 8-pixel column strip
   1541    add                dstq, 16
   1542    movzx                hd, wb     ; restore row count
   1543    sub                  wd, 1<<8    ; one strip done
   1544    jg .v_w8_loop0
   1545    RET
   1546 .hv:
   ; combined h+v path: horizontal pass feeds a vertical 6-tap pass;
   ; both stages round with pd_512 and shift by 10 (intermediate
   ; precision kept 4 bits higher than the h-only path's >> 6).
   1547    WIN64_SPILL_XMM      12, 16
   1548    vpbroadcastd        m10, [pd_512]
   1549    vpbroadcastw        m11, r8m    ; pixel-max clamp
   1550    cmp                  wd, 4
   1551    jg .hv_w8
   ; w <= 4: 4-tap horizontal (+2 offset = middle taps), 6-tap vertical
   1552    movzx               mxd, mxb
   1553    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
   1554    movzx               mxd, myb
   1555    shr                 myd, 16
   1556    cmp                  hd, 6
   1557    cmovs               myd, mxd    ; h < 6: alternate vertical filter
   1558    vpbroadcastq         m1, [base+subpel_filters+1+myq*8]
   1559    mov                  r6, ssq
   1560    sub                srcq, 2
   1561    neg                  r6         ; -ssq, to reach rows above srcq
   ; expand h coefficients to words as coef << 8 (zero in low byte)
   1562    pxor                 m6, m6
   1563    punpcklbw            m6, m0
   1564    punpcklbw            m1, m1
   1565    psraw                m1, 8 ; sign-extend
   1566    test          dword r8m, 0x800
   1567    jz .hv_10bit
   ; 12bpc: shift precision from the h stage to the v stage
   1568    psraw                m6, 2
   1569    psllw                m1, 2
   1570 .hv_10bit:
   1571    pshufd               m7, m1, q0000  ; vertical tap pair 0
   1572    pshufd               m8, m1, q1111  ; vertical tap pair 1
   1573    pshufd               m9, m1, q2222  ; vertical tap pair 2
   1574    cmp                  wd, 4
   1575    je .hv_w4
   ; w=2: prologue filters rows -2..2; lane layout noted per line
   1576    vbroadcasti128       m5, [subpel_h_shuf2]
   1577    vbroadcasti128       m0, [srcq+ssq*0]
   1578    vinserti128          m2, m0, [srcq+r6*2], 1 ; 2 0
   1579    movu                xm1, [srcq+ssq*1]
   1580    vinserti128          m1, [srcq+r6 *1], 1    ; 3 1
   1581    lea                srcq, [srcq+ssq*2]
   1582    vinserti128          m0, [srcq+ssq*0], 0    ; 4 2
   1583    REPX    {pshufb  x, m5}, m2, m1, m0
   1584    REPX    {pmaddwd x, m6}, m2, m1, m0
   1585    phaddd               m2, m1
   1586    phaddd               m1, m0
   1587    paddd                m2, m10
   1588    paddd                m1, m10
   1589    psrad                m2, 10
   1590    psrad                m1, 10
   1591    packssdw             m2, m1       ; 2 3 3 4   0 1 1 2
   1592    punpckhqdq           m0, m2, m2
   1593    punpcklwd            m2, m0       ; 23 34
   1594    vextracti128        xm1, m2, 1    ; 01 12
   1595 .hv_w2_loop:
   ; per iteration: h-filter two new rows, then 3-pair vertical madd
   1596    movu                xm3, [srcq+ssq*1]
   1597    lea                srcq, [srcq+ssq*2]
   1598    movu                xm4, [srcq+ssq*0]
   1599    pshufb              xm3, xm5
   1600    pshufb              xm4, xm5
   1601    pmaddwd             xm3, xm6
   1602    pmaddwd             xm4, xm6
   1603    phaddd              xm3, xm4
   1604    pmaddwd             xm4, xm7, xm1 ; a0 b0
   1605    mova                xm1, xm2
   1606    pmaddwd             xm2, xm8      ; a1 b1
   1607    paddd               xm4, xm2
   1608    paddd               xm3, xm10
   1609    psrad               xm3, 10
   1610    packssdw            xm3, xm3
   1611    palignr             xm2, xm3, xm0, 12
   1612    mova                xm0, xm3
   1613    punpcklwd           xm2, xm0      ; 45 56
   1614    pmaddwd             xm3, xm9, xm2 ; a2 b2
   1615    paddd               xm4, xm10
   1616    paddd               xm4, xm3
   1617    psrad               xm4, 10
   1618    packusdw            xm4, xm4
   1619    pminsw              xm4, xm11     ; clamp to bitdepth max
   1620    movd       [dstq+dsq*0], xm4
   1621    pextrd     [dstq+dsq*1], xm4, 1
   1622    lea                dstq, [dstq+dsq*2]
   1623    sub                  hd, 2
   1624    jg .hv_w2_loop
   1625    RET
        ; -----------------------------------------------------------------
        ; put_6tap_16bpc .hv, w == 4: 4-tap H filter via two shuffles
        ; (m12/m13) and two coefficient pairs (m5/m6), 6-tap V filter via
        ; m7-m9.  Two output rows per iteration, one per ymm lane.
        ; -----------------------------------------------------------------
   1626 .hv_w4:
   1627    WIN64_PUSH_XMM       14
   1628    vbroadcasti128      m12, [subpel_h_shufA]
   1629    pshufd               m5, m6, q0000      ; H coeff pair 0
   1630    vbroadcasti128      m13, [subpel_h_shufB]
   1631    pshufd               m6, m6, q1111      ; H coeff pair 1
        ; load rows 0-4 (r6 = -ssq, so r6*2 / r6*1 address the rows above srcq)
   1632    movu                xm2, [srcq+r6 *2]
   1633    vinserti128          m2, [srcq+r6 *1], 1 ; 0 1
   1634    movu                xm0, [srcq+ssq*0]
   1635    vinserti128          m0, [srcq+ssq*1], 1 ; 2 3
   1636    lea                srcq, [srcq+ssq*2]
   1637    movu                xm3, [srcq+ssq*0]    ; 4
   1638    pshufb               m1, m2, m12
   1639    pmaddwd              m1, m5
   1640    pshufb               m2, m13
   1641    pmaddwd              m2, m6
   1642    pshufb               m4, m0, m12
   1643    pmaddwd              m4, m5
   1644    pshufb               m0, m13
   1645    pmaddwd              m0, m6
   1646    paddd                m2, m1
   1647    pshufb              xm1, xm3, xm12
   1648    pmaddwd             xm1, xm5
   1649    pshufb              xm3, xm13
   1650    pmaddwd             xm3, xm6
   1651    paddd                m0, m4
   1652    paddd                m2, m10             ; m10 = rounding constant (set up out of view)
   1653    paddd               xm1, xm10
   1654    paddd                m0, m10
   1655    paddd               xm3, xm1
   1656    REPX      {psrad x, 10}, m2, m0, xm3
        ; interleave rows into the 01/12, 23/34 pair layout the V loop expects
   1657    packssdw             m2, m0     ; 0 2   1 3
   1658    packssdw            xm0, xm3    ; 2 4
   1659    vperm2i128           m0, m2, 0x03
   1660    punpcklwd            m1, m2, m0 ; 01 12
   1661    punpckhwd            m2, m0     ; 23 34
   1662 .hv_w4_loop:
   1663    movu                xm3, [srcq+ssq*1]
   1664    lea                srcq, [srcq+ssq*2]
   1665    vinserti128          m3, [srcq+ssq*0], 1
   1666    pmaddwd              m4, m7, m1 ; a0 b0
   1667    mova                 m1, m2
   1668    pmaddwd              m2, m8     ; a1 b1
   1669    paddd                m4, m2
        ; H-filter the two new rows
   1670    pshufb               m2, m3, m12
   1671    pmaddwd              m2, m5
   1672    pshufb               m3, m13
   1673    pmaddwd              m3, m6
   1674    paddd                m2, m10
   1675    paddd                m3, m2
   1676    psrad                m3, 10
   1677    packssdw             m3, m3     ; 5 5   6 6
   1678    vperm2i128           m2, m0, m3, 0x21
   1679    mova                 m0, m3
   1680    punpckhwd            m2, m3     ; 45 56
   1681    pmaddwd              m3, m9, m2 ; a2 b2
   1682    paddd                m4, m10
   1683    paddd                m4, m3
   1684    psrad                m4, 10
   1685    vextracti128        xm3, m4, 1
   1686    packusdw            xm4, xm3
   1687    pminsw              xm4, xm11   ; clamp to pixel max
   1688    movq       [dstq+dsq*0], xm4
   1689    movhps     [dstq+dsq*1], xm4
   1690    lea                dstq, [dstq+dsq*2]
   1691    sub                  hd, 2
   1692    jg .hv_w4_loop
   1693    RET
        ; -----------------------------------------------------------------
        ; put_6tap_16bpc .hv, w >= 8: processed in 8-pixel-wide columns.
        ; Loop counter packing: wd = (w<<5) + h - 256, so the low byte is
        ; the row count (reloaded via movzx per column) and the upper bits
        ; count remaining 8-pixel columns (sub wd, 1<<8 per column).
        ; -----------------------------------------------------------------
   1694 .hv_w8:
   1695    WIN64_PUSH_XMM       16, 12
   1696    shr                 mxd, 16
   1697    vbroadcasti128      m12, [subpel_h_shufA]
   1698    vpbroadcastq         m2, [base+subpel_filters+1+mxq*8]
   1699    movzx               mxd, myb
   1700    shr                 myd, 16
   1701    cmp                  hd, 6
   1702    cmovs               myd, mxd       ; h < 6: take filter index from my's low byte
   1703    pmovsxbw            xm1, [base+subpel_filters+1+myq*8]
   1704    shl                  wd, 5
   1705    mov                  r6, ssq
   1706    sub                srcq, 4
   1707    pxor                 m0, m0
   1708    neg                  r6             ; r6 = -ssq
   1709    punpcklbw            m0, m2
   1710    lea                  wd, [hq+wq-256]
        ; bit 11 of bitdepth_max (r8m) set => 12bpc; rebalance coefficient precision
   1711    test          dword r8m, 0x800
   1712    jz .hv_w8_10bit
   1713    psraw                m0, 2
   1714    psllw               xm1, 2
   1715 .hv_w8_10bit:
   1716    pshufd               m7, m0, q0000
   1717    pshufd               m8, m0, q1111
        ; spill the vertical coefficients into the caller's argument home
        ; space on the stack (r4m shadow space on Win64 / r6m slot on SysV)
        ; to free up vector registers for the loop
   1718 %if WIN64
   1719    %define v_mul (rsp+stack_offset+40) ; r4m
   1720 %else
   1721    %define v_mul (rsp+stack_offset+ 8) ; r6m
   1722 %endif
   1723    mova            [v_mul], xm1
   1724    pshufd               m9, m0, q2222
   1725 .hv_w8_loop0:
        ; per-column setup: H-filter rows -2..2 (relative to the first
        ; output row), two rows per ymm register
   1726    vbroadcasti128       m0, [srcq+ssq*0+ 0]
   1727    vinserti128          m3, m0, [srcq+r6*2+ 0], 0
   1728    lea                  r7, [srcq+ssq*2]
   1729    vbroadcasti128       m2, [srcq+ssq*0+16]
   1730    vinserti128          m1, m2, [srcq+r6*2+16], 0
   1731    mov                  r8, dstq
   1732    vinserti128          m0, [r7  +ssq*0+ 0], 1
   1733    vinserti128          m2, [r7  +ssq*0+16], 1
   1734    shufpd               m4, m3, m1, 0x05
        ; H-filter two rows at once: %1/%2/%3 hold src+0/+8/+16, results
        ; land packed in %1; m7/m8/m9 = coeff pairs, m10 = rounding
   1735 %macro PUT_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
   1736    pshufb              m%1, m12       ; 01 12 23 34
   1737    pshufb              m%2, m12       ; 45 56 67 78
   1738    pmaddwd             m%4, m7, m%1   ; a0
   1739    pshufb              m%3, m12       ; 89 9a ab bc
   1740    pmaddwd             m%5, m9, m%2   ; a2
   1741    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
   1742    paddd               m%4, m%5       ; a0+a2
   1743    pmaddwd             m%5, m7, m%2   ; b0
   1744    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
   1745    pmaddwd             m%3, m9        ; b2
   1746    pmaddwd             m%1, m8        ; a1
   1747    pmaddwd             m%2, m8        ; b1
   1748    paddd               m%3, m%5       ; b0+b2
   1749    paddd               m%4, m10
   1750    paddd               m%3, m10
   1751    paddd               m%1, m%4
   1752    paddd               m%2, m%3
   1753    psrad               m%1, 10
   1754    psrad               m%2, 10
   1755    packssdw            m%1, m%2
   1756 %endmacro
   1757    PUT_6TAP_HV_H         3, 4, 1, 5, 6  ; 0 2
   1758    movu                xm4, [srcq+r6 *1+ 0]
   1759    vinserti128          m4, [srcq+ssq*1+ 0], 1
   1760    shufpd               m1, m0, m2, 0x05
   1761    PUT_6TAP_HV_H         0, 1, 2, 5, 6  ; 2 4
   1762    movu                xm2, [srcq+r6 *1+16]
   1763    vinserti128          m2, [srcq+ssq*1+16], 1
   1764    shufpd               m1, m4, m2, 0x05
   1765    PUT_6TAP_HV_H         4, 1, 2, 5, 6  ; 1 3
        ; transpose lane order, then build the 01/12/23/34 row-pair operands
   1766    vpermq               m3, m3, q3120
   1767    vpermq               m4, m4, q3120
   1768    vpermq               m0, m0, q3120
   1769    punpcklwd            m1, m3, m4      ; 01
   1770    punpckhwd            m3, m4          ; 23
   1771    punpcklwd            m2, m4, m0      ; 12
   1772    punpckhwd            m4, m0          ; 34
   1773 .hv_w8_loop:
        ; reload V coeff pairs from the stack spill each iteration
   1774    vpbroadcastd        m15, [v_mul+4*0]
   1775    vpbroadcastd        m13, [v_mul+4*1]
   1776    movu                xm5, [r7+ssq*1+ 0]
   1777    movu                xm6, [r7+ssq*1+16]
   1778    lea                  r7, [r7+ssq*2]
   1779    pmaddwd             m14, m15, m1     ; a0
   1780    pmaddwd             m15, m2          ; b0
   1781    vinserti128          m5, [r7+ssq*0+ 0], 1
   1782    vinserti128          m6, [r7+ssq*0+16], 1
   1783    mova                 m1, m3
   1784    pmaddwd              m3, m13         ; a1
   1785    mova                 m2, m4
   1786    pmaddwd              m4, m13         ; b1
   1787    paddd               m14, m3
   1788    shufpd               m3, m5, m6, 0x05
   1789    paddd               m15, m4
   1790    PUT_6TAP_HV_H         5, 3, 6, 4, 13 ; 5 6
   1791    vpbroadcastd         m6, [v_mul+4*2]
   1792    vpermq               m5, m5, q3120
   1793    shufpd               m4, m0, m5, 0x05
   1794    mova                 m0, m5
   1795    punpcklwd            m3, m4, m5      ; 45
   1796    punpckhwd            m4, m5          ; 56
   1797    pmaddwd              m5, m6, m3      ; a2
   1798    pmaddwd              m6, m4          ; b2
   1799    paddd               m14, m10
   1800    paddd               m15, m10
   1801    paddd                m5, m14
   1802    paddd                m6, m15
   1803    psrad                m5, 10
   1804    psrad                m6, 10
   1805    packusdw             m5, m6
   1806    pminsw               m5, m11         ; clamp to pixel max
   1807    vpermq               m5, m5, q3120
   1808    mova         [r8+dsq*0], xm5
   1809    vextracti128 [r8+dsq*1], m5, 1
   1810    lea                  r8, [r8+dsq*2]
   1811    sub                  hd, 2
   1812    jg .hv_w8_loop
        ; advance to the next 8-pixel column; low byte of wd restores h
   1813    add                srcq, 16
   1814    add                dstq, 16
   1815    movzx                hd, wb
   1816    sub                  wd, 1<<8
   1817    jg .hv_w8_loop0
   1818    RET
   1819 
        ; Entry points for each horizontal/vertical filter-type combination
        ; sharing the sharp 8-tap code path.  NOTE(review): PUT_8TAP_FN is
        ; defined earlier in the file (not visible here); the entries with a
        ; 4th argument appear to alias into that named function, while the
        ; final one without it precedes the actual cglobal body below —
        ; confirm against the macro definition.
   1820 PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
   1821 PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
   1822 PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
   1823 PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
   1824 PUT_8TAP_FN sharp,          SHARP,   SHARP
   1825 
        ; -----------------------------------------------------------------
        ; put_8tap_16bpc(dst, ds, src, ss, w, h, mx, my) — 8-tap subpel put.
        ; mx/my are combined with the filter-type id (t0d/t1d, set by the
        ; PUT_8TAP_FN entry) so one table lookup selects the coefficients.
        ; If neither mx nor my has a subpel fraction, dispatch to the
        ; cheaper 6tap implementation's copy/filter paths.
        ; -----------------------------------------------------------------
   1826 cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
   1827 %define base r8-put_avx2
   1828    imul                mxd, mxm, 0x010101
   1829    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
   1830    imul                myd, mym, 0x010101
   1831    add                 myd, t1d ; 8tap_v, my, 4tap_v
   1832    lea                  r8, [put_avx2]
   1833    movifnidn            wd, wm
   1834    movifnidn            hd, hm
   1835    test                mxd, 0xf00       ; horizontal subpel fraction present?
   1836    jnz .h
   1837    test                myd, 0xf00       ; vertical subpel fraction present?
   1838    jz mangle(private_prefix %+ _put_6tap_16bpc_avx2).put
   1839 .v:
        ; vertical-only filtering
   1840    movzx               mxd, myb
   1841    shr                 myd, 16
   1842    cmp                  hd, 6
   1843    cmovs               myd, mxd         ; h < 6: take filter index from my's low byte
   1844    vpbroadcastq         m0, [base+subpel_filters+myq*8]
   1845    WIN64_SPILL_XMM      12, 15
   1846    vpbroadcastd         m6, [pd_32]      ; rounding: (x+32)>>6
   1847    vpbroadcastw         m7, r8m          ; pixel max for clamping
   1848    lea                  r6, [ssq*3]
   1849    sub                srcq, r6           ; back up 3 rows for the 8-tap window
   1850    punpcklbw            m0, m0
   1851    psraw                m0, 8 ; sign-extend
   1852    pshufd               m8, m0, q0000    ; 4 coefficient pairs in m8-m11
   1853    pshufd               m9, m0, q1111
   1854    pshufd              m10, m0, q2222
   1855    pshufd              m11, m0, q3333
   1856    cmp                  wd, 4
   1857    jg .v_w8
   1858    je .v_w4
   1859 .v_w2:
        ; load rows 0-6 and form the 01/12, 23/34, 45/56 word pairs
   1860    movd                xm2, [srcq+ssq*0]
   1861    pinsrd              xm2, [srcq+ssq*1], 1
   1862    pinsrd              xm2, [srcq+ssq*2], 2
   1863    pinsrd              xm2, [srcq+r6   ], 3 ; 0 1 2 3
   1864    lea                srcq, [srcq+ssq*4]
   1865    movd                xm3, [srcq+ssq*0]
   1866    vpbroadcastd        xm1, [srcq+ssq*1]
   1867    vpbroadcastd        xm0, [srcq+ssq*2]
   1868    add                srcq, r6
   1869    vpblendd            xm3, xm1, 0x02       ; 4 5
   1870    vpblendd            xm1, xm0, 0x02       ; 5 6
   1871    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
   1872    punpcklwd           xm3, xm1             ; 45 56
   1873    punpcklwd           xm1, xm2, xm4        ; 01 12
   1874    punpckhwd           xm2, xm4             ; 23 34
   1875 .v_w2_loop:
   1876    vpbroadcastd        xm4, [srcq+ssq*0]
   1877    pmaddwd             xm5, xm8, xm1        ; a0 b0
   1878    mova                xm1, xm2
   1879    pmaddwd             xm2, xm9             ; a1 b1
   1880    paddd               xm5, xm6
   1881    paddd               xm5, xm2
   1882    mova                xm2, xm3
   1883    pmaddwd             xm3, xm10            ; a2 b2
   1884    paddd               xm5, xm3
   1885    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
   1886    vpbroadcastd        xm0, [srcq+ssq*1]
   1887    lea                srcq, [srcq+ssq*2]
   1888    vpblendd            xm4, xm0, 0x02       ; 7 8
   1889    punpcklwd           xm3, xm4             ; 67 78
   1890    pmaddwd             xm4, xm11, xm3       ; a3 b3
   1891    paddd               xm5, xm4
   1892    psrad               xm5, 6
   1893    packusdw            xm5, xm5
   1894    pminsw              xm5, xm7             ; clamp to pixel max
   1895    movd       [dstq+dsq*0], xm5
   1896    pextrd     [dstq+dsq*1], xm5, 1
   1897    lea                dstq, [dstq+dsq*2]
   1898    sub                  hd, 2
   1899    jg .v_w2_loop
   1900    RET
        ; -----------------------------------------------------------------
        ; put_8tap_16bpc .v, w == 4: same sliding 8-tap window as .v_w2 but
        ; full ymm registers, two output rows per iteration (one per lane).
        ; -----------------------------------------------------------------
   1901 .v_w4:
        ; load rows 0-6 and build the 01/12, 23/34, 45/56 pairs in m1-m3
   1902    movq                xm1, [srcq+ssq*0]
   1903    vpbroadcastq         m0, [srcq+ssq*1]
   1904    vpbroadcastq         m2, [srcq+ssq*2]
   1905    vpbroadcastq         m4, [srcq+r6   ]
   1906    lea                srcq, [srcq+ssq*4]
   1907    vpbroadcastq         m3, [srcq+ssq*0]
   1908    vpbroadcastq         m5, [srcq+ssq*1]
   1909    vpblendd             m1, m0, 0x30
   1910    vpblendd             m0, m2, 0x30
   1911    punpcklwd            m1, m0      ; 01 12
   1912    vpbroadcastq         m0, [srcq+ssq*2]
   1913    add                srcq, r6
   1914    vpblendd             m2, m4, 0x30
   1915    vpblendd             m4, m3, 0x30
   1916    punpcklwd            m2, m4      ; 23 34
   1917    vpblendd             m3, m5, 0x30
   1918    vpblendd             m5, m0, 0x30
   1919    punpcklwd            m3, m5      ; 45 56
   1920 .v_w4_loop:
   1921    vpbroadcastq         m4, [srcq+ssq*0]
   1922    pmaddwd              m5, m8, m1  ; a0 b0
   1923    mova                 m1, m2
   1924    pmaddwd              m2, m9      ; a1 b1
   1925    paddd                m5, m6      ; + pd_32 rounding
   1926    paddd                m5, m2
   1927    mova                 m2, m3
   1928    pmaddwd              m3, m10     ; a2 b2
   1929    paddd                m5, m3
   1930    vpblendd             m3, m0, m4, 0x30
   1931    vpbroadcastq         m0, [srcq+ssq*1]
   1932    lea                srcq, [srcq+ssq*2]
   1933    vpblendd             m4, m0, 0x30
   1934    punpcklwd            m3, m4      ; 67 78
   1935    pmaddwd              m4, m11, m3 ; a3 b3
   1936    paddd                m5, m4
   1937    psrad                m5, 6
   1938    vextracti128        xm4, m5, 1
   1939    packusdw            xm5, xm4
   1940    pminsw              xm5, xm7    ; clamp to pixel max
   1941    movq       [dstq+dsq*0], xm5
   1942    movhps     [dstq+dsq*1], xm5
   1943    lea                dstq, [dstq+dsq*2]
   1944    sub                  hd, 2
   1945    jg .v_w4_loop
   1946    RET
        ; -----------------------------------------------------------------
        ; put_8tap_16bpc .v, w >= 8: 8-pixel columns.  wd packs the column
        ; count in the upper bits and h in the low byte (see movzx hd, wb).
        ; -----------------------------------------------------------------
   1947 .v_w8:
   1948    shl                  wd, 5
   1949    WIN64_PUSH_XMM       15
   1950    lea                  wd, [hq+wq-256]
   1951 .v_w8_loop0:
        ; load rows 0-6 and build the 01..56 row pairs in m1-m6
   1952    vbroadcasti128       m4, [srcq+ssq*0]
   1953    vbroadcasti128       m5, [srcq+ssq*1]
   1954    lea                  r7, [srcq+ssq*4]
   1955    vbroadcasti128       m0, [srcq+r6   ]
   1956    vbroadcasti128       m6, [srcq+ssq*2]
   1957    mov                  r8, dstq
   1958    vbroadcasti128       m1, [r7+ssq*0]
   1959    vbroadcasti128       m2, [r7+ssq*1]
   1960    vbroadcasti128       m3, [r7+ssq*2]
   1961    add                  r7, r6
   1962    shufpd               m4, m0, 0x0c
   1963    shufpd               m5, m1, 0x0c
   1964    punpcklwd            m1, m4, m5 ; 01
   1965    punpckhwd            m4, m5     ; 34
   1966    shufpd               m6, m2, 0x0c
   1967    punpcklwd            m2, m5, m6 ; 12
   1968    punpckhwd            m5, m6     ; 45
   1969    shufpd               m0, m3, 0x0c
   1970    punpcklwd            m3, m6, m0 ; 23
   1971    punpckhwd            m6, m0     ; 56
   1972 .v_w8_loop:
   1973    vbroadcasti128      m14, [r7+ssq*0]
   1974    pmaddwd             m12, m8, m1  ; a0
   1975    pmaddwd             m13, m8, m2  ; b0
   1976    mova                 m1, m3
   1977    mova                 m2, m4
   1978    pmaddwd              m3, m9      ; a1
   1979    pmaddwd              m4, m9      ; b1
   1980    paddd               m12, m3
   1981    paddd               m13, m4
   1982    mova                 m3, m5
   1983    mova                 m4, m6
   1984    pmaddwd              m5, m10     ; a2
   1985    pmaddwd              m6, m10     ; b2
   1986    paddd               m12, m5
   1987    vbroadcasti128       m5, [r7+ssq*1]
   1988    lea                  r7, [r7+ssq*2]
   1989    paddd               m13, m6
   1990    shufpd               m6, m0, m14, 0x0d
   1991    shufpd               m0, m14, m5, 0x0c
   1992    punpcklwd            m5, m6, m0  ; 67
   1993    punpckhwd            m6, m0      ; 78
   1994    pmaddwd             m14, m11, m5 ; a3
   1995    paddd               m12, m14
   1996    pmaddwd             m14, m11, m6 ; b3
   1997    paddd               m13, m14
        ; rounding trick: no register is free for pd_32 here, so instead of
        ; (x+32)>>6 do x>>5 followed by pavgw against zero, i.e.
        ; ((x>>5)+1)>>1, which gives the same result for the non-negative
        ; values that survive the unsigned pack
   1998    psrad               m12, 5
   1999    psrad               m13, 5
   2000    packusdw            m12, m13
   2001    pxor                m13, m13
   2002    pavgw               m12, m13
   2003    pminsw              m12, m7      ; clamp to pixel max
   2004    vpermq              m12, m12, q3120
   2005    mova         [r8+dsq*0], xm12
   2006    vextracti128 [r8+dsq*1], m12, 1
   2007    lea                  r8, [r8+dsq*2]
   2008    sub                  hd, 2
   2009    jg .v_w8_loop
        ; next 8-pixel column; low byte of wd restores h
   2010    add                srcq, 16
   2011    add                dstq, 16
   2012    movzx                hd, wb
   2013    sub                  wd, 1<<8
   2014    jg .v_w8_loop0
   2015    RET
        ; -----------------------------------------------------------------
        ; put_8tap_16bpc .h: horizontal-only filtering.  w < 8 reuses the
        ; 6tap implementation's narrow paths; w >= 8 uses the 8-tap
        ; PUT_8TAP_H macro below.
        ; -----------------------------------------------------------------
   2016 .h:
   2017    RESET_STACK_STATE
   2018    test                myd, 0xf00       ; vertical subpel too? -> .hv
   2019    jnz .hv
   2020    mov                 r7d, r8m
   2021    vpbroadcastw         m5, r8m         ; pixel max for clamping
   2022    shr                 r7d, 11          ; bitdepth_max>>11: 0 for 10bpc, 1 for 12bpc
   2023    vpbroadcastd         m4, [base+put_8tap_h_rnd+r7*4]
   2024    cmp                  wd, 4
   2025    jl mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w2
   2026    je mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w4
   2027    WIN64_SPILL_XMM      13
   2028    shr                 mxd, 16
   2029    sub                srcq, 6            ; back up 3 pixels for the 8-tap window
   2030    vpbroadcastq         m0, [base+subpel_filters+mxq*8]
   2031    vbroadcasti128       m6, [subpel_h_shufA]
   2032    vbroadcasti128       m7, [subpel_h_shufB]
   2033    punpcklbw            m0, m0
   2034    psraw                m0, 8 ; sign-extend
   2035    pshufd               m8, m0, q0000    ; 4 coefficient pairs in m8-m11
   2036    pshufd               m9, m0, q1111
   2037    pshufd              m10, m0, q2222
   2038    pshufd              m11, m0, q3333
   2039    sub                  wd, 16
   2040    jge .h_w16
   2041 .h_w8:
        ; filter 16 output pixels (abcd + efgh halves) from 24 input pixels
        ; held in %1/%2/%3 (src+0/+8/+16); result packed+clamped in %1
   2042 %macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
   2043    pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
   2044    pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
   2045    pmaddwd             m%5, m9, m%4   ; abcd1
   2046    pmaddwd             m%1, m8        ; abcd0
   2047    pshufb              m%2, m7        ; 6 7 7 8 8 9 9 a
   2048    shufpd              m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
   2049    paddd               m%5, m4
   2050    paddd               m%1, m%5
   2051    pmaddwd             m%5, m11, m%2  ; abcd3
   2052    paddd               m%1, m%5
   2053    pmaddwd             m%5, m10, m%4  ; abcd2
   2054    pshufb              m%3, m7        ; a b b c c d d e
   2055    pmaddwd             m%4, m8        ; efgh0
   2056    paddd               m%1, m%5
   2057    pmaddwd             m%5, m9, m%2   ; efgh1
   2058    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
   2059    pmaddwd             m%3, m11       ; efgh3
   2060    pmaddwd             m%2, m10       ; efgh2
   2061    paddd               m%4, m4
   2062    paddd               m%4, m%5
   2063    paddd               m%3, m%4
   2064    paddd               m%2, m%3
   2065    psrad               m%1, 6
   2066    psrad               m%2, 6
   2067    packusdw            m%1, m%2
   2068    pminsw              m%1, m5
   2069 %endmacro
        ; w == 8: two rows per iteration, one per ymm lane
   2070    movu                xm0, [srcq+ssq*0+ 0]
   2071    vinserti128          m0, [srcq+ssq*1+ 0], 1
   2072    movu                xm2, [srcq+ssq*0+16]
   2073    vinserti128          m2, [srcq+ssq*1+16], 1
   2074    lea                srcq, [srcq+ssq*2]
   2075    shufpd               m1, m0, m2, 0x05
   2076    PUT_8TAP_H            0, 1, 2, 3, 12
   2077    mova         [dstq+dsq*0], xm0
   2078    vextracti128 [dstq+dsq*1], m0, 1
   2079    lea                dstq, [dstq+dsq*2]
   2080    sub                  hd, 2
   2081    jg .h_w8
   2082    RET
   2083 .h_w16:
        ; w >= 16: one row per pass, 16 pixels per inner iteration,
        ; iterating r6 from w-16 down to 0
   2084    mov                 r6d, wd
   2085 .h_w16_loop:
   2086    movu                 m0, [srcq+r6*2+ 0]
   2087    movu                 m1, [srcq+r6*2+ 8]
   2088    movu                 m2, [srcq+r6*2+16]
   2089    PUT_8TAP_H            0, 1, 2, 3, 12
   2090    mova        [dstq+r6*2], m0
   2091    sub                 r6d, 16
   2092    jge .h_w16_loop
   2093    add                srcq, ssq
   2094    add                dstq, dsq
   2095    dec                  hd
   2096    jg .h_w16
   2097    RET
        ; -----------------------------------------------------------------
        ; put_8tap_16bpc .hv: combined horizontal + vertical filtering.
        ; w <= 4 uses a 4-tap H filter (subpel_filters+2 picks the middle
        ; taps) with an 8-tap V filter; intermediates kept at 10-bit shift
        ; (pd_512 rounding, psrad 10).
        ; -----------------------------------------------------------------
   2098 .hv:
   2099    WIN64_SPILL_XMM      16
   2100    vpbroadcastw        m15, r8m         ; pixel max for clamping
   2101    cmp                  wd, 4
   2102    jg .hv_w8
   2103    movzx               mxd, mxb
   2104    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
   2105    movzx               mxd, myb
   2106    shr                 myd, 16
   2107    cmp                  hd, 6
   2108    cmovs               myd, mxd         ; h < 6: take filter index from my's low byte
   2109    vpbroadcastq         m1, [base+subpel_filters+myq*8]
   2110    vpbroadcastd         m6, [pd_512]
   2111    lea                  r6, [ssq*3]
   2112    sub                srcq, 2
   2113    sub                srcq, r6           ; back up 3 rows for the 8-tap V window
   2114    pxor                 m7, m7
   2115    punpcklbw            m7, m0
   2116    punpcklbw            m1, m1
   2117    psraw                m1, 8 ; sign-extend
        ; bit 11 of bitdepth_max (r8m) set => 12bpc; rebalance H/V precision
   2118    test          dword r8m, 0x800
   2119    jz .hv_10bit
   2120    psraw                m7, 2
   2121    psllw                m1, 2
   2122 .hv_10bit:
   2123    pshufd              m11, m1, q0000   ; 4 V coefficient pairs in m11-m14
   2124    pshufd              m12, m1, q1111
   2125    pshufd              m13, m1, q2222
   2126    pshufd              m14, m1, q3333
   2127    cmp                  wd, 4
   2128    je .hv_w4
        ; ---- w == 2: H-filter rows 0-6 ----
   2129    vbroadcasti128       m9, [subpel_h_shuf2]
   2130    vbroadcasti128       m1, [srcq+r6   ]    ; 3 3
   2131    movu                xm3, [srcq+ssq*2]
   2132    movu                xm0, [srcq+ssq*0]
   2133    movu                xm2, [srcq+ssq*1]
   2134    lea                srcq, [srcq+ssq*4]
   2135    vinserti128          m3, [srcq+ssq*0], 1 ; 2 4
   2136    vinserti128          m0, [srcq+ssq*1], 1 ; 0 5
   2137    vinserti128          m2, [srcq+ssq*2], 1 ; 1 6
   2138    add                srcq, r6
   2139    pshufb               m1, m9
   2140    pshufb               m3, m9
   2141    pshufb               m0, m9
   2142    pshufb               m2, m9
   2143    pmaddwd              m1, m7
   2144    pmaddwd              m3, m7
   2145    pmaddwd              m0, m7
   2146    pmaddwd              m2, m7
   2147    phaddd               m1, m3
   2148    phaddd               m0, m2
   2149    paddd                m1, m6
   2150    paddd                m0, m6
   2151    psrad                m1, 10
   2152    psrad                m0, 10
   2153    packssdw             m1, m0         ; 3 2 0 1
        ; rearrange rows into the 01/12, 23/34, 45/56 word pairs
   2154    vextracti128        xm0, m1, 1      ; 3 4 5 6
   2155    pshufd              xm2, xm1, q1301 ; 2 3 1 2
   2156    pshufd              xm3, xm0, q2121 ; 4 5 4 5
   2157    punpckhwd           xm1, xm2        ; 01 12
   2158    punpcklwd           xm2, xm0        ; 23 34
   2159    punpckhwd           xm3, xm0        ; 45 56
   2160 .hv_w2_loop:
   2161    movu                xm4, [srcq+ssq*0]
   2162    movu                xm5, [srcq+ssq*1]
   2163    lea                srcq, [srcq+ssq*2]
   2164    pshufb              xm4, xm9
   2165    pshufb              xm5, xm9
   2166    pmaddwd             xm4, xm7
   2167    pmaddwd             xm5, xm7
   2168    phaddd              xm4, xm5
   2169    pmaddwd             xm5, xm11, xm1 ; a0 b0
   2170    mova                xm1, xm2
   2171    pmaddwd             xm2, xm12      ; a1 b1
   2172    paddd               xm5, xm2
   2173    mova                xm2, xm3
   2174    pmaddwd             xm3, xm13      ; a2 b2
   2175    paddd               xm5, xm3
   2176    paddd               xm4, xm6
   2177    psrad               xm4, 10
   2178    packssdw            xm4, xm4
        ; slide the two new rows in behind xm0 (previous rows)
   2179    palignr             xm3, xm4, xm0, 12
   2180    mova                xm0, xm4
   2181    punpcklwd           xm3, xm0       ; 67 78
   2182    pmaddwd             xm4, xm14, xm3 ; a3 b3
   2183    paddd               xm5, xm6
   2184    paddd               xm5, xm4
   2185    psrad               xm5, 10
   2186    packusdw            xm5, xm5
   2187    pminsw              xm5, xm15      ; clamp to pixel max
   2188    movd       [dstq+dsq*0], xm5
   2189    pextrd     [dstq+dsq*1], xm5, 1
   2190    lea                dstq, [dstq+dsq*2]
   2191    sub                  hd, 2
   2192    jg .hv_w2_loop
   2193    RET
        ; -----------------------------------------------------------------
        ; put_8tap_16bpc .hv, w == 4: 4-tap H (m7/m8 pairs via shufA/shufB)
        ; feeding an 8-tap V filter (m11-m14); two rows per iteration.
        ; -----------------------------------------------------------------
   2194 .hv_w4:
   2195    vbroadcasti128       m9, [subpel_h_shufA]
   2196    vbroadcasti128      m10, [subpel_h_shufB]
   2197    pshufd               m8, m7, q1111      ; H coeff pair 1
   2198    pshufd               m7, m7, q0000      ; H coeff pair 0
        ; load and H-filter rows 0-6
   2199    movu                xm1, [srcq+ssq*0]
   2200    vinserti128          m1, [srcq+ssq*1], 1     ; 0 1
   2201    vbroadcasti128       m0, [srcq+r6   ]
   2202    vinserti128          m2, m0, [srcq+ssq*2], 0 ; 2 3
   2203    lea                srcq, [srcq+ssq*4]
   2204    vinserti128          m0, [srcq+ssq*0], 1     ; 3 4
   2205    movu                xm3, [srcq+ssq*1]
   2206    vinserti128          m3, [srcq+ssq*2], 1     ; 5 6
   2207    add                srcq, r6
   2208    pshufb               m4, m1, m9
   2209    pshufb               m1, m10
   2210    pmaddwd              m4, m7
   2211    pmaddwd              m1, m8
   2212    pshufb               m5, m2, m9
   2213    pshufb               m2, m10
   2214    pmaddwd              m5, m7
   2215    pmaddwd              m2, m8
   2216    paddd                m4, m6             ; + pd_512 rounding
   2217    paddd                m1, m4
   2218    pshufb               m4, m0, m9
   2219    pshufb               m0, m10
   2220    pmaddwd              m4, m7
   2221    pmaddwd              m0, m8
   2222    paddd                m5, m6
   2223    paddd                m2, m5
   2224    pshufb               m5, m3, m9
   2225    pshufb               m3, m10
   2226    pmaddwd              m5, m7
   2227    pmaddwd              m3, m8
   2228    paddd                m4, m6
   2229    paddd                m4, m0
   2230    paddd                m5, m6
   2231    paddd                m5, m3
        ; interleave the 32-bit H results into packed 16-bit row pairs by
        ; shifting/blending even and odd words (pslld/psrld by 6/10 keep
        ; the same net 10-bit downshift as psrad 10 + pack)
   2232    vperm2i128           m0, m1, m2, 0x21
   2233    psrld                m1, 10
   2234    psrld                m2, 10
   2235    vperm2i128           m3, m4, m5, 0x21
   2236    pslld                m4, 6
   2237    pslld                m5, 6
   2238    pblendw              m2, m4, 0xaa ; 23 34
   2239    pslld                m0, 6
   2240    pblendw              m1, m0, 0xaa ; 01 12
   2241    psrld                m3, 10
   2242    pblendw              m3, m5, 0xaa ; 45 56
   2243    psrad                m0, m5, 16
   2244 .hv_w4_loop:
   2245    movu                xm4, [srcq+ssq*0]
   2246    vinserti128          m4, [srcq+ssq*1], 1
   2247    lea                srcq, [srcq+ssq*2]
   2248    pmaddwd              m5, m11, m1   ; a0 b0
   2249    mova                 m1, m2
   2250    pmaddwd              m2, m12       ; a1 b1
   2251    paddd                m5, m6
   2252    paddd                m5, m2
   2253    mova                 m2, m3
   2254    pmaddwd              m3, m13       ; a2 b2
   2255    paddd                m5, m3
        ; H-filter the two new rows
   2256    pshufb               m3, m4, m9
   2257    pshufb               m4, m10
   2258    pmaddwd              m3, m7
   2259    pmaddwd              m4, m8
   2260    paddd                m3, m6
   2261    paddd                m4, m3
   2262    psrad                m4, 10
   2263    packssdw             m0, m4        ; _ 7 6 8
   2264    vpermq               m3, m0, q1122 ; _ 6 _ 7
   2265    punpckhwd            m3, m0        ; 67 78
   2266    mova                 m0, m4
   2267    pmaddwd              m4, m14, m3   ; a3 b3
   2268    paddd                m4, m5
   2269    psrad                m4, 10
   2270    vextracti128        xm5, m4, 1
   2271    packusdw            xm4, xm5
   2272    pminsw              xm4, xm15      ; clamp to pixel max
   2273    movq       [dstq+dsq*0], xm4
   2274    movhps     [dstq+dsq*1], xm4
   2275    lea                dstq, [dstq+dsq*2]
   2276    sub                  hd, 2
   2277    jg .hv_w4_loop
   2278    RET
        ; -----------------------------------------------------------------
        ; put_8tap_16bpc .hv, w >= 8: full 8-tap H and V filters, processed
        ; in 8-pixel columns (w<<5 + h packed into wd as elsewhere).
        ; NOTE(review): this path continues past the end of this chunk; the
        ; remainder of .hv_w8_loop is not visible here.
        ; -----------------------------------------------------------------
   2279 .hv_w8:
   2280    shr                 mxd, 16
   2281    vpbroadcastq         m2, [base+subpel_filters+mxq*8]
   2282    movzx               mxd, myb
   2283    shr                 myd, 16
   2284    cmp                  hd, 6
   2285    cmovs               myd, mxd       ; h < 6: take filter index from my's low byte
   2286    pmovsxbw            xm1, [base+subpel_filters+myq*8]
   2287    shl                  wd, 5
   2288    lea                  r6, [ssq*3]
   2289    sub                srcq, 6          ; back up 3 pixels for the H window
   2290    pxor                 m0, m0
   2291    sub                srcq, r6         ; back up 3 rows for the V window
   2292    punpcklbw            m0, m2
   2293    lea                  wd, [hq+wq-256]
        ; bit 11 of bitdepth_max (r8m) set => 12bpc; rebalance coefficient precision
   2294    test          dword r8m, 0x800
   2295    jz .hv_w8_10bit
   2296    psraw                m0, 2
   2297    psllw               xm1, 2
   2298 .hv_w8_10bit:
   2299    pshufd              m11, m0, q0000  ; H coefficient pairs in m11-m14
   2300    pshufd              m12, m0, q1111
   2301    mova            [v_mul], xm1        ; spill V coeffs to the stack slot defined above
   2302    pshufd              m13, m0, q2222
   2303    pshufd              m14, m0, q3333
   2304 .hv_w8_loop0:
        ; H-filter two rows: %1/%2/%3 hold src+0/+8/+16, packed result in %1
   2305 %macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
   2306    pshufb               m2, m%1, m9   ; 2 3 3 4 4 5 5 6
   2307    pshufb              m%1, m8        ; 0 1 1 2 2 3 3 4
   2308    pmaddwd              m3, m12, m2
   2309    pmaddwd             m%1, m11
   2310    pshufb              m%2, m9        ; 6 7 7 8 8 9 9 a
   2311    shufpd               m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
   2312    paddd                m3, m10
   2313    paddd               m%1, m3
   2314    pmaddwd              m3, m14, m%2
   2315    paddd               m%1, m3
   2316    pmaddwd              m3, m13, m2
   2317    pshufb              m%3, m9        ; a b b c c d d e
   2318    pmaddwd              m2, m11
   2319    paddd               m%1, m3
   2320    pmaddwd              m3, m12, m%2
   2321    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
   2322    pmaddwd             m%3, m14
   2323    pmaddwd             m%2, m13
   2324    paddd                m2, m10
   2325    paddd                m2, m3
   2326    paddd               m%3, m2
   2327    paddd               m%2, m%3
   2328    psrad               m%1, 10
   2329    psrad               m%2, 10
   2330    packssdw            m%1, m%2
   2331 %endmacro
        ; per-column setup: H-filter rows 0-6; shuffle masks and rounding
        ; constant are loaded into m8/m9/m10 here since all of m11-m15 hold
        ; coefficients / the clamp value
   2332    movu                xm4, [srcq+r6 *1+ 0]
   2333    vbroadcasti128       m8, [subpel_h_shufA]
   2334    lea                  r7, [srcq+ssq*4]
   2335    movu                xm6, [srcq+r6 *1+ 8]
   2336    vbroadcasti128       m9, [subpel_h_shufB]
   2337    mov                  r8, dstq
   2338    movu                xm0, [srcq+r6 *1+16]
   2339    vpbroadcastd        m10, [pd_512]
   2340    movu                xm5, [srcq+ssq*0+ 0]
   2341    vinserti128          m5, [r7  +ssq*0+ 0], 1
   2342    movu                xm1, [srcq+ssq*0+16]
   2343    vinserti128          m1, [r7  +ssq*0+16], 1
   2344    shufpd               m7, m5, m1, 0x05
        ; row 3 only needs one row of output, so run the macro at xmm width
   2345    INIT_XMM avx2
   2346    PUT_8TAP_HV_H         4, 6, 0    ; 3
   2347    INIT_YMM avx2
   2348    PUT_8TAP_HV_H         5, 7, 1    ; 0 4
   2349    movu                xm0, [srcq+ssq*2+ 0]
   2350    vinserti128          m0, [srcq+r6 *2+ 0], 1
   2351    movu                xm1, [srcq+ssq*2+16]
   2352    vinserti128          m1, [srcq+r6 *2+16], 1
   2353    shufpd               m7, m0, m1, 0x05
   2354    PUT_8TAP_HV_H         0, 7, 1    ; 2 6
   2355    movu                xm6, [srcq+ssq*1+ 0]
   2356    movu                xm1, [srcq+ssq*1+16]
   2357    vinserti128          m6, [r7  +ssq*1+ 0], 1
   2358    vinserti128          m1, [r7  +ssq*1+16], 1
   2359    add                  r7, r6
   2360    shufpd               m7, m6, m1, 0x05
   2361    PUT_8TAP_HV_H         6, 7, 1    ; 1 5
        ; transpose lane order and build the 01..56 row pairs in m1-m6
   2362    vpermq               m4, m4, q1100
   2363    vpermq               m5, m5, q3120
   2364    vpermq               m6, m6, q3120
   2365    vpermq               m7, m0, q3120
   2366    punpcklwd            m3, m7, m4  ; 23
   2367    punpckhwd            m4, m5      ; 34
   2368    punpcklwd            m1, m5, m6  ; 01
   2369    punpckhwd            m5, m6      ; 45
   2370    punpcklwd            m2, m6, m7  ; 12
   2371    punpckhwd            m6, m7      ; 56
   2372 .hv_w8_loop:
        ; reload V coefficient pairs from the stack spill each iteration
   2373    vpbroadcastd         m9, [v_mul+4*0]
   2374    vpbroadcastd         m7, [v_mul+4*1]
   2375    vpbroadcastd        m10, [v_mul+4*2]
   2376    pmaddwd              m8, m9, m1  ; a0
   2377    pmaddwd              m9, m2      ; b0
   2378    mova                 m1, m3
   2379    mova                 m2, m4
   2380    pmaddwd              m3, m7      ; a1
   2381    pmaddwd              m4, m7      ; b1
   2382    paddd                m8, m3
   2383    paddd                m9, m4
   2384    mova                 m3, m5
   2385    mova                 m4, m6
   2386    pmaddwd              m5, m10     ; a2
   2387    pmaddwd              m6, m10     ; b2
   2388    paddd                m8, m5
   2389    paddd                m9, m6
   2390    movu                xm5, [r7+ssq*0]
   2391    vinserti128          m5, [r7+ssq*1], 1
   2392    vbroadcasti128       m7, [subpel_h_shufA]
   2393    vbroadcasti128      m10, [subpel_h_shufB]
   2394    movu                xm6, [r7+ssq*0+16]
   2395    vinserti128          m6, [r7+ssq*1+16], 1
   2396    vextracti128       [r8], m0, 1
   2397    pshufb               m0, m5, m7  ; 01
   2398    pshufb               m5, m10     ; 23
   2399    pmaddwd              m0, m11
   2400    pmaddwd              m5, m12
   2401    paddd                m0, m5
   2402    pshufb               m5, m6, m7  ; 89
   2403    pshufb               m6, m10     ; ab
   2404    pmaddwd              m5, m13
   2405    pmaddwd              m6, m14
   2406    paddd                m6, m5
   2407    movu                xm5, [r7+ssq*0+8]
   2408    vinserti128          m5, [r7+ssq*1+8], 1
   2409    lea                  r7, [r7+ssq*2]
   2410    pshufb               m7, m5, m7
   2411    pshufb               m5, m10
   2412    pmaddwd             m10, m13, m7
   2413    pmaddwd              m7, m11
   2414    paddd                m0, m10
   2415    vpbroadcastd        m10, [pd_512]
   2416    paddd                m6, m7
   2417    pmaddwd              m7, m14, m5
   2418    pmaddwd              m5, m12
   2419    paddd                m0, m7
   2420    paddd                m5, m6
   2421    vbroadcasti128       m6, [r8]
   2422    paddd                m8, m10
   2423    paddd                m9, m10
   2424    paddd                m0, m10
   2425    paddd                m5, m10
   2426    vpbroadcastd        m10, [v_mul+4*3]
   2427    psrad                m0, 10
   2428    psrad                m5, 10
   2429    packssdw             m0, m5
   2430    vpermq               m7, m0, q3120 ; 7 8
   2431    shufpd               m6, m7, 0x04  ; 6 7
   2432    punpcklwd            m5, m6, m7    ; 67
   2433    punpckhwd            m6, m7        ; 78
   2434    pmaddwd              m7, m10, m5   ; a3
   2435    pmaddwd             m10, m6        ; b3
   2436    paddd                m7, m8
   2437    paddd                m9, m10
   2438    psrad                m7, 10
   2439    psrad                m9, 10
   2440    packusdw             m7, m9
   2441    pminsw               m7, m15
   2442    vpermq               m7, m7, q3120
   2443    mova         [r8+dsq*0], xm7
   2444    vextracti128 [r8+dsq*1], m7, 1
   2445    lea                  r8, [r8+dsq*2]
   2446    sub                  hd, 2
   2447    jg .hv_w8_loop
   2448    add                srcq, 16
   2449    add                dstq, 16
   2450    movzx                hd, wb
   2451    sub                  wd, 1<<8
   2452    jg .hv_w8_loop0
   2453    RET
   2454 
; t0/t1 temporary-register mapping differs per ABI; t0d/t1d carry the
; packed filter-type bases consumed by the prep entry points below
; (added to mx/my to select the subpel filter set).
   2455 %if WIN64
   2456 DECLARE_REG_TMP 6, 4
   2457 %else
   2458 DECLARE_REG_TMP 6, 7
   2459 %endif
   2460 
   2461 %define PREP_8TAP_FN FN prep_8tap,
; One prep entry point per (horizontal, vertical) filter-type pair.
; NOTE(review): the FN macro is defined elsewhere in the file; the
; trailing argument appears to name the shared implementation these
; aliases jump to. Pairs that never need a SHARP (8-tap) filter reuse
; prep_6tap_16bpc below; "regular" (no target) is the base entry itself.
   2462 PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
   2463 PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
   2464 PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
   2465 PREP_8TAP_FN regular,        REGULAR, REGULAR
   2466 
;-----------------------------------------------------------------------
; prep_6tap_16bpc(int16_t *tmp, const pixel *src, ptrdiff_t ss,
;                 int w, int h, int mx, int my)
; 16bpc AVX2 "prep" (intermediate-format) subpel MC using 6-tap filters
; (4-tap variants for w == 4 / h == 4). mx/my hold the fractional
; position; if both fractional parts are zero, control falls through to
; the plain copy/scale path (.prep). r7m = bitdepth_max (see .prep).
;-----------------------------------------------------------------------
   2467 cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
   2468 %define base r7-prep_avx2
   2469    imul                mxd, mxm, 0x010101
   2470    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
   2471    imul                myd, mym, 0x010101
   2472    add                 myd, t1d ; 6tap_v, my, 4tap_v
   2473    lea                  r7, [prep_avx2]
   2474    movifnidn            hd, hm
   2475    test                mxd, 0xf00
   2476    jnz .h
   2477    test                myd, 0xf00
   2478    jnz .v
; no filtering in either direction: plain copy/scale, dispatched by
; width through a jump table.
   2479 .prep:
   2480    tzcnt                wd, wd
   2481    mov                 r6d, r7m ; bitdepth_max
   2482    movzx                wd, word [r7+wq*2+table_offset(prep,)]
   2483    vpbroadcastd         m5, [r7-prep_avx2+pw_8192]
   2484    shr                 r6d, 11 ; 10bpc (max 0x3ff) -> 0, 12bpc (max 0xfff) -> 1
   2485    add                  wq, r7
   2486    vpbroadcastd         m4, [base+prep_mul+r6*4]
   2487    lea                  r6, [ssq*3]
   2488 %if WIN64
   2489    pop                  r7
   2490 %endif
   2491    jmp                  wq
; horizontal-only, w == 4: 4-tap filter (mxb = 4tap_h index)
   2492 .h_w4:
   2493    movzx               mxd, mxb
   2494    sub                srcq, 2
   2495    pmovsxbw            xm0, [base+subpel_filters+mxq*8]
   2496    vbroadcasti128       m3, [subpel_h_shufA]
   2497    lea                  r6, [ssq*3]
   2498    vbroadcasti128       m4, [subpel_h_shufB]
   2499    WIN64_SPILL_XMM       8
   2500    pshufd              xm0, xm0, q2211 ; duplicate the two middle coefficient pairs (4-tap)
   2501    test          dword r7m, 0x800
   2502    jnz .h_w4_12bpc
   2503    psllw               xm0, 2 ; 10bpc: coefficients *= 4
   2504 .h_w4_12bpc:
   2505    vpbroadcastq         m6, xm0
   2506    vpermq               m7, m0, q1111
; 4 rows per iteration: rows 0/2 in m1, rows 1/3 in m2
   2507 .h_w4_loop:
   2508    movu                xm1, [srcq+ssq*0]
   2509    vinserti128          m1, [srcq+ssq*2], 1
   2510    movu                xm2, [srcq+ssq*1]
   2511    vinserti128          m2, [srcq+r6 *1], 1
   2512    lea                srcq, [srcq+ssq*4]
   2513    pshufb               m0, m1, m3 ; 0 1 1 2 2 3 3 4
   2514    pshufb               m1, m4     ; 2 3 3 4 4 5 5 6
   2515    pmaddwd              m0, m6
   2516    pmaddwd              m1, m7
   2517    paddd                m0, m5
   2518    paddd                m0, m1
   2519    pshufb               m1, m2, m3
   2520    pshufb               m2, m4
   2521    pmaddwd              m1, m6
   2522    pmaddwd              m2, m7
   2523    paddd                m1, m5
   2524    paddd                m1, m2
   2525    psrad                m0, 4
   2526    psrad                m1, 4
   2527    packssdw             m0, m1
   2528    mova             [tmpq], m0
   2529    add                tmpq, 32
   2530    sub                  hd, 4
   2531    jg .h_w4_loop
   2532    RET
; horizontal-only entry: dispatch on w, set up 6-tap coefficients
   2533 .h:
   2534    test                myd, 0xf00
   2535    jnz .hv
   2536    vpbroadcastd         m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
   2537    cmp                  wd, 4
   2538    je .h_w4
   2539    shr                 mxd, 16
   2540    sub                srcq, 4
   2541    vpbroadcastq         m0, [base+subpel_filters+1+mxq*8] ; +1: start at tap 1; only 3 coeff pairs (6 taps) are used below
   2542    WIN64_SPILL_XMM      10
   2543    vbroadcasti128       m6, [subpel_h_shufA]
   2544    punpcklbw            m0, m0
   2545    psraw                m0, 8 ; sign-extend
   2546    test          dword r7m, 0x800
   2547    jnz .h_12bpc
   2548    psllw                m0, 2 ; 10bpc: coefficients *= 4
   2549 .h_12bpc:
   2550    pshufd               m7, m0, q0000
   2551    pshufd               m8, m0, q1111
   2552    pshufd               m9, m0, q2222
   2553    cmp                  wd, 8
   2554    jg .h_w16
; w == 8: two rows per iteration (low/high 128-bit lanes)
   2555 .h_w8:
   2556    movu                xm0, [srcq+ssq*0+ 0]
   2557    vinserti128          m0, [srcq+ssq*1+ 0], 1
   2558    movu                xm2, [srcq+ssq*0+16]
   2559    vinserti128          m2, [srcq+ssq*1+16], 1
   2560    lea                srcq, [srcq+ssq*2]
   2561    shufpd               m1, m0, m2, 0x05
; 6-tap filter 2x8 horizontal outputs with coeff pairs m7/m8/m9,
; round with m5, >> 4, pack to signed words into m%1.
   2562 %macro PREP_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
   2563    pshufb              m%1, m6        ; 01 12 23 34
   2564    pshufb              m%2, m6        ; 45 56 67 78
   2565    pmaddwd             m%4, m7, m%1   ; a0
   2566    pshufb              m%3, m6        ; 89 9a ab bc
   2567    pmaddwd             m%5, m9, m%2   ; a2
   2568    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
   2569    paddd               m%4, m%5       ; a0+a2
   2570    pmaddwd             m%5, m7, m%2   ; b0
   2571    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
   2572    pmaddwd             m%3, m9        ; b2
   2573    pmaddwd             m%1, m8        ; a1
   2574    pmaddwd             m%2, m8        ; b1
   2575    paddd               m%3, m%5       ; b0+b2
   2576    paddd               m%4, m5
   2577    paddd               m%3, m5
   2578    paddd               m%1, m%4
   2579    paddd               m%2, m%3
   2580    psrad               m%1, 4
   2581    psrad               m%2, 4
   2582    packssdw            m%1, m%2
   2583 %endmacro
   2584    PREP_6TAP_H           0, 1, 2, 3, 4
   2585    mova             [tmpq], m0
   2586    add                tmpq, 32
   2587    sub                  hd, 2
   2588    jg .h_w8
   2589    RET
; w >= 16: one row at a time, 16 pixels (32 output bytes) per inner
; iteration, walking right-to-left with r6 counting down.
   2590 .h_w16:
   2591    add                  wd, wd ; wd = row size in bytes
   2592 .h_w16_loop0:
   2593    mov                 r6d, wd
   2594 .h_w16_loop:
   2595    movu                 m0, [srcq+r6-32]
   2596    movu                 m1, [srcq+r6-24]
   2597    movu                 m2, [srcq+r6-16]
   2598    PREP_6TAP_H           0, 1, 2, 3, 4
   2599    mova       [tmpq+r6-32], m0
   2600    sub                 r6d, 32
   2601    jg .h_w16_loop
   2602    add                srcq, ssq
   2603    add                tmpq, wq
   2604    dec                  hd
   2605    jg .h_w16_loop0
   2606    RET
; vertical-only 6-tap: r6 = -ss so [srcq+r6*2]/[srcq+r6*1] address the
; two rows above the current position.
   2607 .v:
   2608    movzx               mxd, myb
   2609    shr                 myd, 16
   2610    cmp                  hd, 4
   2611    cmove               myd, mxd ; h == 4: use the 4-tap filter index (low byte)
   2612    vpbroadcastq         m0, [base+subpel_filters+1+myq*8]
   2613    WIN64_SPILL_XMM       9, 12
   2614    vpbroadcastd         m5, [prep_8tap_1d_rnd]
   2615    mov                  r6, ssq
   2616    punpcklbw            m0, m0
   2617    neg                  r6
   2618    psraw                m0, 8 ; sign-extend
   2619    test          dword r7m, 0x800
   2620    jnz .v_12bpc
   2621    psllw                m0, 2 ; 10bpc: coefficients *= 4
   2622 .v_12bpc:
   2623    pshufd               m6, m0, q0000
   2624    pshufd               m7, m0, q1111
   2625    pshufd               m8, m0, q2222
   2626    cmp                  wd, 4
   2627    jg .v_w8
; w == 4: adjacent rows interleaved as word pairs so each pmaddwd
; computes one coefficient pair for two output rows at once.
   2628 .v_w4:
   2629    movq                xm1, [srcq+r6 *2]
   2630    vpbroadcastq         m3, [srcq+r6 *1]
   2631    vpbroadcastq         m2, [srcq+ssq*0]
   2632    vpbroadcastq         m4, [srcq+ssq*1]
   2633    lea                srcq, [srcq+ssq*2]
   2634    vpbroadcastq         m0, [srcq+ssq*0]
   2635    vpblendd             m1, m3, 0x30
   2636    vpblendd             m3, m2, 0x30
   2637    punpcklwd            m1, m3     ; 01 12
   2638    vpblendd             m2, m4, 0x30
   2639    vpblendd             m4, m0, 0x30
   2640    punpcklwd            m2, m4     ; 23 34
   2641 .v_w4_loop:
   2642    vpbroadcastq         m3, [srcq+ssq*1]
   2643    lea                srcq, [srcq+ssq*2]
   2644    pmaddwd              m4, m6, m1 ; a0 b0
   2645    mova                 m1, m2
   2646    pmaddwd              m2, m7     ; a1 b1
   2647    paddd                m4, m2
   2648    vpblendd             m2, m0, m3, 0x30
   2649    vpbroadcastq         m0, [srcq+ssq*0]
   2650    vpblendd             m3, m0, 0x30
   2651    punpcklwd            m2, m3     ; 45 56
   2652    pmaddwd              m3, m8, m2 ; a2 b2
   2653    paddd                m4, m5
   2654    paddd                m4, m3
   2655    psrad                m4, 4
   2656    vextracti128        xm3, m4, 1
   2657    packssdw            xm4, xm3
   2658    mova             [tmpq], xm4
   2659    add                tmpq, 16
   2660    sub                  hd, 2
   2661    jg .v_w4_loop
   2662    RET
; w >= 8: process one 8-pixel column strip per outer iteration
   2663 .v_w8:
   2664    WIN64_PUSH_XMM       12
   2665 %if WIN64
   2666    push                 r8
   2667 %endif
   2668    mov                 r8d, wd ; r8 = w (pixels); tmp row stride = r8*2 bytes
   2669    shl                  wd, 5
   2670    lea                  wd, [hq+wq-256] ; packed counter: strip count in bits 8+, h in low byte
   2671 .v_w8_loop0:
   2672    vbroadcasti128       m3, [srcq+r6 *2]
   2673    vbroadcasti128       m4, [srcq+r6 *1]
   2674    lea                  r5, [srcq+ssq*2]
   2675    vbroadcasti128       m0, [srcq+ssq*0]
   2676    vbroadcasti128       m1, [srcq+ssq*1]
   2677    mov                  r7, tmpq
   2678    vbroadcasti128       m2, [r5+ssq*0]
   2679    shufpd               m3, m0, 0x0c
   2680    shufpd               m4, m1, 0x0c
   2681    punpcklwd            m1, m3, m4 ; 01
   2682    punpckhwd            m3, m4     ; 23
   2683    shufpd               m0, m2, 0x0c
   2684    punpcklwd            m2, m4, m0 ; 12
   2685    punpckhwd            m4, m0     ; 34
   2686 .v_w8_loop:
   2687    vbroadcasti128       m9, [r5+ssq*1]
   2688    pmaddwd             m10, m6, m1 ; a0
   2689    lea                  r5, [r5+ssq*2]
   2690    pmaddwd             m11, m6, m2 ; b0
   2691    mova                 m1, m3
   2692    pmaddwd              m3, m7     ; a1
   2693    mova                 m2, m4
   2694    pmaddwd              m4, m7     ; b1
   2695    paddd               m10, m5
   2696    paddd               m11, m5
   2697    paddd               m10, m3
   2698    vbroadcasti128       m3, [r5+ssq*0]
   2699    paddd               m11, m4
   2700    shufpd               m4, m0, m9, 0x0d
   2701    shufpd               m0, m9, m3, 0x0c
   2702    punpcklwd            m3, m4, m0 ; 45
   2703    punpckhwd            m4, m0     ; 56
   2704    pmaddwd              m9, m8, m3 ; a2
   2705    paddd               m10, m9
   2706    pmaddwd              m9, m8, m4 ; b2
   2707    paddd               m11, m9
   2708    psrad               m10, 4
   2709    psrad               m11, 4
   2710    packssdw            m10, m11
   2711    vpermq              m10, m10, q3120
   2712    mova          [r7+r8*0], xm10
   2713    vextracti128  [r7+r8*2], m10, 1
   2714    lea                  r7, [r7+r8*4]
   2715    sub                  hd, 2
   2716    jg .v_w8_loop
   2717    add                srcq, 16
   2718    add                tmpq, 16
   2719    movzx                hd, wb ; reload h from low byte
   2720    sub                  wd, 1<<8 ; one strip done
   2721    jg .v_w8_loop0
   2722 %if WIN64
   2723    pop                  r8
   2724 %endif
   2725    RET
; horizontal+vertical (2D) path: both passes round with m7
; (prep_8tap_2d_rnd) and shift by 6.
   2726 .hv:
   2727    WIN64_SPILL_XMM      13, 15
   2728    vpbroadcastd         m7, [prep_8tap_2d_rnd]
   2729    vbroadcasti128       m8, [subpel_h_shufA]
   2730    cmp                  wd, 4
   2731    jg .hv_w8
   2732    movzx               mxd, mxb
   2733    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2] ; middle taps (4-tap horizontal)
   2734    movzx               mxd, myb
   2735    shr                 myd, 16
   2736    cmp                  hd, 4
   2737    cmove               myd, mxd ; h == 4: 4-tap vertical filter index
   2738    vpbroadcastq         m1, [base+subpel_filters+1+myq*8]
   2739    mov                  r6, ssq
   2740    sub                srcq, 2
   2741    pxor                 m6, m6
   2742    neg                  r6
   2743    punpcklbw            m6, m0 ; h coeffs in high bytes (coeff << 8)
   2744    punpcklbw            m1, m1
   2745    psraw                m6, 4  ; -> coeff << 4
   2746    psraw                m1, 8  ; sign-extend v coeffs
   2747    test          dword r7m, 0x800
   2748    jz .hv_w4_10bit
   2749    psraw                m6, 2  ; 12bpc: smaller coeff scale; NOTE(review): presumably to keep the >>6 intermediates in 16-bit range
   2750 .hv_w4_10bit:
   2751    pshufd              m10, m1, q0000
   2752    pshufd              m11, m1, q1111
   2753    pshufd              m12, m1, q2222
; w == 4 2D: filter the first 5 rows horizontally, then run the
; vertical 6-tap on interleaved row pairs, 2 output rows per loop.
   2754 .hv_w4:
   2755    movu                xm2, [srcq+r6 *2]
   2756    vinserti128          m2, [srcq+r6 *1], 1 ; 0 1
   2757    pshufd               m5, m6, q0000
   2758    vbroadcasti128       m9, [base+subpel_h_shufB]
   2759    movu                xm0, [srcq+ssq*0]
   2760    pshufd               m6, m6, q1111
   2761    vinserti128          m0, [srcq+ssq*1], 1 ; 2 3
   2762    lea                srcq, [srcq+ssq*2]
   2763    movu                xm3, [srcq+ssq*0]    ; 4
   2764    pshufb               m1, m2, m8
   2765    pmaddwd              m1, m5
   2766    pshufb               m2, m9
   2767    pmaddwd              m2, m6
   2768    pshufb               m4, m0, m8
   2769    pmaddwd              m4, m5
   2770    pshufb               m0, m9
   2771    pmaddwd              m0, m6
   2772    paddd                m2, m1
   2773    pshufb              xm1, xm3, xm8
   2774    pmaddwd             xm1, xm5
   2775    pshufb              xm3, xm9
   2776    pmaddwd             xm3, xm6
   2777    paddd                m0, m4
   2778    paddd                m2, m7
   2779    paddd               xm1, xm7
   2780    paddd                m0, m7
   2781    paddd               xm3, xm1
   2782    REPX       {psrad x, 6}, m2, m0, xm3
   2783    packssdw             m2, m0      ; 0 2   1 3
   2784    packssdw            xm0, xm3     ; 2 4
   2785    vperm2i128           m0, m2, 0x03
   2786    punpcklwd            m1, m2, m0  ; 01 12
   2787    punpckhwd            m2, m0      ; 23 34
   2788 .hv_w4_loop:
   2789    movu                xm3, [srcq+ssq*1]
   2790    lea                srcq, [srcq+ssq*2]
   2791    vinserti128          m3, [srcq+ssq*0], 1
   2792    pmaddwd              m4, m10, m1 ; a0 b0
   2793    mova                 m1, m2
   2794    pmaddwd              m2, m11     ; a1 b1
   2795    paddd                m4, m2
   2796    pshufb               m2, m3, m8
   2797    pmaddwd              m2, m5
   2798    pshufb               m3, m9
   2799    pmaddwd              m3, m6
   2800    paddd                m2, m7
   2801    paddd                m3, m2
   2802    psrad                m3, 6
   2803    packssdw             m3, m3      ; 5 5   6 6
   2804    vperm2i128           m2, m0, m3, 0x21
   2805    mova                 m0, m3
   2806    punpckhwd            m2, m3      ; 45 56
   2807    pmaddwd              m3, m12, m2 ; a2 b2
   2808    paddd                m4, m7
   2809    paddd                m4, m3
   2810    psrad                m4, 6
   2811    vextracti128        xm3, m4, 1
   2812    packssdw            xm4, xm3
   2813    mova             [tmpq], xm4
   2814    add                tmpq, 16
   2815    sub                  hd, 2
   2816    jg .hv_w4_loop
   2817    RET
; w >= 8 2D: horizontal coeffs in m10-m12; vertical coeffs spilled to
; [v_mul] (NOTE(review): v_mul is a stack-slot alias defined earlier
; in the file) and re-broadcast per loop to free registers.
   2818 .hv_w8:
   2819    shr                 mxd, 16
   2820    vpbroadcastq         m2, [base+subpel_filters+1+mxq*8]
   2821    movzx               mxd, myb
   2822    shr                 myd, 16
   2823    cmp                  hd, 4
   2824    cmove               myd, mxd ; h == 4: 4-tap vertical filter index
   2825    pmovsxbw            xm1, [base+subpel_filters+1+myq*8]
   2826    WIN64_PUSH_XMM       15
   2827 %if WIN64
   2828    PUSH                 r8
   2829 %endif
   2830    mov                 r8d, wd ; r8 = w (pixels); tmp row stride = r8*2 bytes
   2831    shl                  wd, 5
   2832    mov                  r6, ssq
   2833    sub                srcq, 4
   2834    neg                  r6
   2835    lea                  wd, [hq+wq-256] ; packed counter: strip count in bits 8+, h in low byte
   2836    pxor                 m0, m0
   2837    punpcklbw            m0, m2 ; h coeffs in high bytes (coeff << 8)
   2838    psraw                m0, 4  ; -> coeff << 4
   2839    test          dword r7m, 0x800
   2840    jz .hv_w8_10bit
   2841    psraw                m0, 2  ; 12bpc: smaller coeff scale
   2842 .hv_w8_10bit:
   2843    pshufd              m10, m0, q0000
   2844    pshufd              m11, m0, q1111
   2845    mova            [v_mul], xm1
   2846    pshufd              m12, m0, q2222
   2847 .hv_w8_loop0:
   2848    vbroadcasti128       m0, [srcq+ssq*0+ 0]
   2849    vinserti128          m3, m0, [srcq+r6*2+ 0], 0
   2850    lea                  r5, [srcq+ssq*2]
   2851    vbroadcasti128       m2, [srcq+ssq*0+16]
   2852    vinserti128          m1, m2, [srcq+r6*2+16], 0
   2853    mov                  r7, tmpq
   2854    vinserti128          m0, [r5  +ssq*0+ 0], 1
   2855    vinserti128          m2, [r5  +ssq*0+16], 1
   2856    shufpd               m4, m3, m1, 0x05
; Same shape as PREP_6TAP_H but with the 2D rounding (m7) and the
; 2D intermediate shift (>> 6); coeff pairs come from m10-m12.
   2857 %macro PREP_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
   2858    pshufb              m%1, m8        ; 01 12 23 34
   2859    pshufb              m%2, m8        ; 45 56 67 78
   2860    pmaddwd             m%4, m10, m%1  ; a0
   2861    pshufb              m%3, m8        ; 89 9a ab bc
   2862    pmaddwd             m%5, m12, m%2  ; a2
   2863    shufpd              m%1, m%2, 0x05 ; 23 34 45 56
   2864    paddd               m%4, m%5       ; a0+a2
   2865    pmaddwd             m%5, m10, m%2  ; b0
   2866    shufpd              m%2, m%3, 0x05 ; 67 78 89 9a
   2867    pmaddwd             m%3, m12       ; b2
   2868    pmaddwd             m%1, m11       ; a1
   2869    pmaddwd             m%2, m11       ; b1
   2870    paddd               m%3, m%5       ; b0+b2
   2871    paddd               m%4, m7
   2872    paddd               m%3, m7
   2873    paddd               m%1, m%4
   2874    paddd               m%2, m%3
   2875    psrad               m%1, 6
   2876    psrad               m%2, 6
   2877    packssdw            m%1, m%2
   2878 %endmacro
   2879    PREP_6TAP_HV_H        3, 4, 1, 5, 6  ; 0 2
   2880    movu                xm4, [srcq+r6 *1+ 0]
   2881    vinserti128          m4, [srcq+ssq*1+ 0], 1
   2882    shufpd               m1, m0, m2, 0x05
   2883    PREP_6TAP_HV_H        0, 1, 2, 5, 6  ; 2 4
   2884    movu                xm2, [srcq+r6 *1+16]
   2885    vinserti128          m2, [srcq+ssq*1+16], 1
   2886    shufpd               m1, m4, m2, 0x05
   2887    PREP_6TAP_HV_H        4, 1, 2, 5, 6  ; 1 3
   2888    vpermq               m3, m3, q3120
   2889    vpermq               m4, m4, q3120
   2890    vpermq               m0, m0, q3120
   2891    punpcklwd            m1, m3, m4     ; 01
   2892    punpckhwd            m3, m4         ; 23
   2893    punpcklwd            m2, m4, m0     ; 12
   2894    punpckhwd            m4, m0         ; 34
   2895 .hv_w8_loop:
   2896    vpbroadcastd        m14, [v_mul+4*0]
   2897    vpbroadcastd         m9, [v_mul+4*1]
   2898    movu                xm5, [r5+ssq*1+ 0]
   2899    movu                xm6, [r5+ssq*1+16]
   2900    lea                  r5, [r5+ssq*2]
   2901    pmaddwd             m13, m14, m1    ; a0
   2902    pmaddwd             m14, m2         ; b0
   2903    vinserti128          m5, [r5+ssq*0+ 0], 1
   2904    vinserti128          m6, [r5+ssq*0+16], 1
   2905    mova                 m1, m3
   2906    pmaddwd              m3, m9         ; a1
   2907    mova                 m2, m4
   2908    pmaddwd              m4, m9         ; b1
   2909    paddd               m13, m3
   2910    shufpd               m3, m5, m6, 0x05
   2911    paddd               m14, m4
   2912    PREP_6TAP_HV_H        5, 3, 6, 4, 9 ; 5 6
   2913    vpbroadcastd         m6, [v_mul+4*2]
   2914    vpermq               m5, m5, q3120
   2915    shufpd               m4, m0, m5, 0x05
   2916    mova                 m0, m5
   2917    punpcklwd            m3, m4, m5     ; 45
   2918    punpckhwd            m4, m5         ; 56
   2919    pmaddwd              m5, m6, m3     ; a2
   2920    pmaddwd              m6, m4         ; b2
   2921    paddd               m13, m7
   2922    paddd               m14, m7
   2923    paddd                m5, m13
   2924    paddd                m6, m14
   2925    psrad                m5, 6
   2926    psrad                m6, 6
   2927    packssdw             m5, m6
   2928    vpermq               m5, m5, q3120
   2929    mova          [r7+r8*0], xm5
   2930    vextracti128  [r7+r8*2], m5, 1
   2931    lea                  r7, [r7+r8*4]
   2932    sub                  hd, 2
   2933    jg .hv_w8_loop
   2934    add                srcq, 16
   2935    add                tmpq, 16
   2936    movzx                hd, wb ; reload h from low byte
   2937    sub                  wd, 1<<8 ; one strip done
   2938    jg .hv_w8_loop0
   2939 %if WIN64
   2940    POP                  r8
   2941 %endif
   2942    RET
   2943 
; Filter-type pairs that include SHARP need a true 8-tap filter; these
; aliases target the full 8-tap implementation (prep_8tap_16bpc), with
; "sharp" (no target argument) being that base entry itself.
   2944 PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
   2945 PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
   2946 PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
   2947 PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
   2948 PREP_8TAP_FN sharp,          SHARP,   SHARP
   2949 
   2950 cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
   2951 %define base r7-prep_avx2
   2952    imul                mxd, mxm, 0x010101
   2953    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
   2954    imul                myd, mym, 0x010101
   2955    add                 myd, t1d ; 8tap_v, my, 4tap_v
   2956    lea                  r7, [prep_avx2]
   2957    movifnidn            hd, hm
   2958    test                mxd, 0xf00
   2959    jnz .h
   2960    test                myd, 0xf00
   2961    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx2).prep
   2962 .v:
   2963    movzx               mxd, myb
   2964    shr                 myd, 16
   2965    cmp                  hd, 4
   2966    cmove               myd, mxd
   2967    vpbroadcastq         m0, [base+subpel_filters+myq*8]
   2968    WIN64_SPILL_XMM      12, 15
   2969    vpbroadcastd         m7, [prep_8tap_1d_rnd]
   2970    lea                  r6, [strideq*3]
   2971    punpcklbw            m0, m0
   2972    sub                srcq, r6
   2973    psraw                m0, 8 ; sign-extend
   2974    test          dword r7m, 0x800
   2975    jnz .v_12bpc
   2976    psllw                m0, 2
   2977 .v_12bpc:
   2978    pshufd               m8, m0, q0000
   2979    pshufd               m9, m0, q1111
   2980    pshufd              m10, m0, q2222
   2981    pshufd              m11, m0, q3333
   2982    cmp                  wd, 4
   2983    jg .v_w8
   2984 .v_w4:
   2985    movq                xm1, [srcq+strideq*0]
   2986    vpbroadcastq         m0, [srcq+strideq*1]
   2987    vpbroadcastq         m2, [srcq+strideq*2]
   2988    vpbroadcastq         m4, [srcq+r6       ]
   2989    lea                srcq, [srcq+strideq*4]
   2990    vpbroadcastq         m3, [srcq+strideq*0]
   2991    vpbroadcastq         m5, [srcq+strideq*1]
   2992    vpblendd             m1, m0, 0x30
   2993    vpblendd             m0, m2, 0x30
   2994    punpcklwd            m1, m0      ; 01 12
   2995    vpbroadcastq         m0, [srcq+strideq*2]
   2996    add                srcq, r6
   2997    vpblendd             m2, m4, 0x30
   2998    vpblendd             m4, m3, 0x30
   2999    punpcklwd            m2, m4      ; 23 34
   3000    vpblendd             m3, m5, 0x30
   3001    vpblendd             m5, m0, 0x30
   3002    punpcklwd            m3, m5      ; 45 56
   3003 .v_w4_loop:
   3004    vpbroadcastq         m4, [srcq+strideq*0]
   3005    pmaddwd              m5, m8, m1  ; a0 b0
   3006    mova                 m1, m2
   3007    pmaddwd              m2, m9      ; a1 b1
   3008    paddd                m5, m7
   3009    paddd                m5, m2
   3010    mova                 m2, m3
   3011    pmaddwd              m3, m10     ; a2 b2
   3012    paddd                m5, m3
   3013    vpblendd             m3, m0, m4, 0x30
   3014    vpbroadcastq         m0, [srcq+strideq*1]
   3015    lea                srcq, [srcq+strideq*2]
   3016    vpblendd             m4, m0, 0x30
   3017    punpcklwd            m3, m4      ; 67 78
   3018    pmaddwd              m4, m11, m3 ; a3 b3
   3019    paddd                m5, m4
   3020    psrad                m5, 4
   3021    vextracti128        xm4, m5, 1
   3022    packssdw            xm5, xm4
   3023    mova             [tmpq], xm5
   3024    add                tmpq, 16
   3025    sub                  hd, 2
   3026    jg .v_w4_loop
   3027    RET
   3028 .v_w8:
   3029 %if WIN64
   3030    WIN64_PUSH_XMM       15
   3031    push                 r8
   3032 %endif
   3033    mov                 r8d, wd
   3034    shl                  wd, 5
   3035    lea                  wd, [hq+wq-256]
   3036 .v_w8_loop0:
   3037    vbroadcasti128       m4, [srcq+strideq*0]
   3038    vbroadcasti128       m5, [srcq+strideq*1]
   3039    lea                  r5, [srcq+strideq*4]
   3040    vbroadcasti128       m0, [srcq+r6       ]
   3041    vbroadcasti128       m6, [srcq+strideq*2]
   3042    mov                  r7, tmpq
   3043    vbroadcasti128       m1, [r5+strideq*0]
   3044    vbroadcasti128       m2, [r5+strideq*1]
   3045    vbroadcasti128       m3, [r5+strideq*2]
    ; Tail of the 8-tap vertical prep path for 8-pixel column strips.
    ; (Entered from the .v_w8 setup above this chunk: m0-m3 hold packed
    ;  source rows, m8-m11 the vertical tap pairs — presumed from the
    ;  usage below; confirm against the setup code.)
    add                  r5, r6
    ; Interleave rows into word pairs so each pmaddwd applies two taps:
    shufpd               m4, m0, 0x0c
    shufpd               m5, m1, 0x0c
    punpcklwd            m1, m4, m5 ; 01
    punpckhwd            m4, m5     ; 34
    shufpd               m6, m2, 0x0c
    punpcklwd            m2, m5, m6 ; 12
    punpckhwd            m5, m6     ; 45
    shufpd               m0, m3, 0x0c
    punpcklwd            m3, m6, m0 ; 23
    punpckhwd            m6, m0     ; 56
.v_w8_loop:
    ; Two output rows ("a" and "b") per iteration; row pairs are rotated
    ; through m1-m6 so only two new source rows are loaded each pass.
    vbroadcasti128      m14, [r5+strideq*0]
    pmaddwd             m12, m8, m1  ; a0
    pmaddwd             m13, m8, m2  ; b0
    mova                 m1, m3
    mova                 m2, m4
    pmaddwd              m3, m9      ; a1
    pmaddwd              m4, m9      ; b1
    paddd               m12, m7     ; m7 = rounding bias (set up before this chunk)
    paddd               m13, m7
    paddd               m12, m3
    paddd               m13, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddwd              m5, m10     ; a2
    pmaddwd              m6, m10     ; b2
    paddd               m12, m5
    vbroadcasti128       m5, [r5+strideq*1]
    lea                  r5, [r5+strideq*2]
    paddd               m13, m6
    ; Build the 67/78 pairs from the two freshly loaded rows:
    shufpd               m6, m0, m14, 0x0d
    shufpd               m0, m14, m5, 0x0c
    punpcklwd            m5, m6, m0  ; 67
    punpckhwd            m6, m0      ; 78
    pmaddwd             m14, m11, m5 ; a3
    paddd               m12, m14
    pmaddwd             m14, m11, m6 ; b3
    paddd               m13, m14
    psrad               m12, 4      ; back to 16-bit intermediate precision
    psrad               m13, 4
    packssdw            m12, m13
    vpermq              m12, m12, q3120
    mova          [r7+r8*0], xm12
    vextracti128  [r7+r8*2], m12, 1
    lea                  r7, [r7+r8*4]
    sub                  hd, 2
    jg .v_w8_loop
    ; Advance to the next 8-pixel column strip; h lives in wd's low byte
    ; (packed in the setup above — presumed from the movzx/sub pattern).
    add                srcq, 16
    add                tmpq, 16
    movzx                hd, wb
    sub                  wd, 1<<8
    jg .v_w8_loop0
%if WIN64
    pop                  r8
%endif
    RET
.h: ; horizontal-only subpel filtering
    test                myd, 0xf00
    jnz .hv                        ; vertical fraction too -> 2-D path
    vpbroadcastd         m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
    cmp                  wd, 4
    je mangle(private_prefix %+ _prep_6tap_16bpc_avx2).h_w4 ; w4 reuses the 6-tap code
    shr                 mxd, 16
    sub                srcq, 6    ; back up 3 pixels (left taps of the 8-tap filter)
    vpbroadcastq         m0, [base+subpel_filters+mxq*8]
    WIN64_SPILL_XMM      12
    vbroadcasti128       m6, [subpel_h_shufA]
    vbroadcasti128       m7, [subpel_h_shufB]
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    test          dword r7m, 0x800
    jnz .h_12bpc
    psllw                m0, 2  ; 10 bpc: scale taps up by 4
.h_12bpc:
    ; Split the 8 taps into four coefficient pairs for pmaddwd:
    pshufd               m8, m0, q0000
    pshufd               m9, m0, q1111
    pshufd              m10, m0, q2222
    pshufd              m11, m0, q3333
    cmp                  wd, 8
    jg .h_w16
.h_w8:
%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
; 8-tap horizontal filter of 16 pixels as two 8-pixel halves
; ("abcd" and "efgh").  Inputs: m5 = rounding bias; m6/m7 = the
; shufA/shufB pixel-pair shuffles; m8-m11 = tap pairs 0-3.
; Arg 1 receives the packed signed-16-bit result; args 2-3 are
; consumed and args 4-5 are scratch.
    pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
    pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
    pmaddwd             m%5, m9, m%4   ; abcd1
    pmaddwd             m%1, m8        ; abcd0
    pshufb              m%2, m7        ; 6 7 7 8 8 9 9 a
    shufpd              m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd               m%5, m5
    paddd               m%1, m%5
    pmaddwd             m%5, m11, m%2  ; abcd3
    paddd               m%1, m%5
    pmaddwd             m%5, m10, m%4  ; abcd2
    pshufb              m%3, m7        ; a b b c c d d e
    pmaddwd             m%4, m8        ; efgh0
    paddd               m%1, m%5
    pmaddwd             m%5, m9, m%2   ; efgh1
    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd             m%3, m11       ; efgh3
    pmaddwd             m%2, m10       ; efgh2
    paddd               m%4, m5
    paddd               m%4, m%5
    paddd               m%3, m%4
    paddd               m%2, m%3
    psrad               m%1, 4
    psrad               m%2, 4
    packssdw            m%1, m%2
%endmacro
    ; w == 8: filter two rows per iteration; each row is loaded as three
    ; overlapping 8-pixel groups (offsets 0/8/16 bytes).
    movu                xm0, [srcq+strideq*0+ 0]
    vinserti128          m0, [srcq+strideq*1+ 0], 1
    movu                xm2, [srcq+strideq*0+16]
    vinserti128          m2, [srcq+strideq*1+16], 1
    lea                srcq, [srcq+strideq*2]
    shufpd               m1, m0, m2, 0x05 ; middle 8 pixels of each row
    PREP_8TAP_H           0, 1, 2, 3, 4
    mova             [tmpq], m0
    add                tmpq, 32
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16: ; w >= 16: walk each row right-to-left in 16-pixel chunks
    add                  wd, wd   ; width in bytes
.h_w16_loop0:
    mov                 r6d, wd
.h_w16_loop:
    movu                 m0, [srcq+r6-32]
    movu                 m1, [srcq+r6-24]
    movu                 m2, [srcq+r6-16]
    PREP_8TAP_H           0, 1, 2, 3, 4
    mova       [tmpq+r6-32], m0
    sub                 r6d, 32
    jg .h_w16_loop
    add                srcq, strideq
    add                tmpq, wq
    dec                  hd
    jg .h_w16_loop0
    RET
.hv: ; 2-D (horizontal + vertical) subpel filtering
    WIN64_SPILL_XMM      16
    vpbroadcastd        m15, [prep_8tap_2d_rnd]
    cmp                  wd, 4
    jg .hv_w8
    ; w == 4: 4-tap horizontal combined with 8-tap vertical.
    movzx               mxd, mxb
    vpbroadcastd         m0, [base+subpel_filters+mxq*8+2]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd   ; short h: use the alternate packed filter index
    vpbroadcastq         m1, [base+subpel_filters+myq*8]
    lea                  r6, [strideq*3]
    sub                srcq, 2        ; 1 pixel left (4-tap horizontal)
    pxor                 m7, m7
    sub                srcq, r6       ; 3 rows up (8-tap vertical)
    punpcklbw            m7, m0
    punpcklbw            m1, m1
    psraw                m7, 4        ; horizontal taps as words, scaled by 16
    psraw                m1, 8        ; vertical taps, sign-extended
    test          dword r7m, 0x800
    jz .hv_w4_10bit
    psraw                m7, 2        ; 12 bpc: horizontal taps scaled by 4 instead
.hv_w4_10bit:
    ; Vertical tap pairs:
    pshufd              m11, m1, q0000
    pshufd              m12, m1, q1111
    pshufd              m13, m1, q2222
    pshufd              m14, m1, q3333
.hv_w4:
    vbroadcasti128       m9, [subpel_h_shufA]
    vbroadcasti128      m10, [subpel_h_shufB]
    pshufd               m8, m7, q1111
    pshufd               m7, m7, q0000
    ; Load and horizontally filter the first 7 rows (0-6):
    movu                xm1, [srcq+strideq*0]
    vinserti128          m1, [srcq+strideq*1], 1     ; 0 1
    vbroadcasti128       m0, [srcq+r6       ]
    vinserti128          m2, m0, [srcq+strideq*2], 0 ; 2 3
    lea                srcq, [srcq+strideq*4]
    vinserti128          m0, [srcq+strideq*0], 1     ; 3 4
    movu                xm3, [srcq+strideq*1]
    vinserti128          m3, [srcq+strideq*2], 1     ; 5 6
    add                srcq, r6
    pshufb               m4, m1, m9
    pshufb               m1, m10
    pmaddwd              m4, m7
    pmaddwd              m1, m8
    pshufb               m5, m2, m9
    pshufb               m2, m10
    pmaddwd              m5, m7
    pmaddwd              m2, m8
    paddd                m4, m15
    paddd                m1, m4
    pshufb               m4, m0, m9
    pshufb               m0, m10
    pmaddwd              m4, m7
    pmaddwd              m0, m8
    paddd                m5, m15
    paddd                m2, m5
    pshufb               m5, m3, m9
    pshufb               m3, m10
    pmaddwd              m5, m7
    pmaddwd              m3, m8
    paddd                m4, m15
    paddd                m4, m0
    paddd                m5, m15
    paddd                m5, m3
    ; Merge the filtered rows into interleaved 01/12, 23/34, 45/56 pairs:
    vperm2i128           m0, m1, m2, 0x21
    psrld                m1, 6
    psrld                m2, 6
    vperm2i128           m3, m4, m5, 0x21
    pslld                m4, 10
    pslld                m5, 10
    pblendw              m2, m4, 0xaa ; 23 34
    pslld                m0, 10
    pblendw              m1, m0, 0xaa ; 01 12
    psrld                m3, 6
    pblendw              m3, m5, 0xaa ; 45 56
    psrad                m0, m5, 16
.hv_w4_loop:
    movu                xm4, [srcq+strideq*0]
    vinserti128          m4, [srcq+strideq*1], 1
    lea                srcq, [srcq+strideq*2]
    pmaddwd              m5, m11, m1   ; a0 b0
    mova                 m1, m2
    pmaddwd              m2, m12       ; a1 b1
    paddd                m5, m15
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, m13       ; a2 b2
    paddd                m5, m3
    ; Horizontal filter of the two new rows:
    pshufb               m3, m4, m9
    pshufb               m4, m10
    pmaddwd              m3, m7
    pmaddwd              m4, m8
    paddd                m3, m15
    paddd                m4, m3
    psrad                m4, 6
    packssdw             m0, m4        ; _ 7 6 8
    vpermq               m3, m0, q1122 ; _ 6 _ 7
    punpckhwd            m3, m0        ; 67 78
    mova                 m0, m4
    pmaddwd              m4, m14, m3   ; a3 b3
    paddd                m4, m5
    psrad                m4, 6
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
    mova             [tmpq], xm4
    add                tmpq, 16
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8: ; w >= 8: 8-tap in both directions, processed in 8-pixel column strips
    shr                 mxd, 16
    vpbroadcastq         m2, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd   ; short h: use the alternate packed filter index
    pmovsxbw            xm1, [base+subpel_filters+myq*8]
%if WIN64
    PUSH                 r8
%endif
    mov                 r8d, wd
    shl                  wd, 5
    lea                  r6, [strideq*3]
    sub                srcq, 6
    sub                srcq, r6
    lea                  wd, [hq+wq-256]  ; pack h (low byte) with the strip count (bits 8+)
    pxor                 m0, m0
    punpcklbw            m0, m2
    psraw                m0, 4            ; horizontal taps as words, scaled by 16
    test          dword r7m, 0x800
    jz .hv_w8_10bit
    psraw                m0, 2            ; 12 bpc: scaled by 4 instead
.hv_w8_10bit:
    ; Horizontal tap pairs in m11-m14; vertical taps spilled to the stack:
    pshufd              m11, m0, q0000
    pshufd              m12, m0, q1111
    mova            [v_mul], xm1
    pshufd              m13, m0, q2222
    pshufd              m14, m0, q3333
.hv_w8_loop0:
%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
; Horizontal 8-tap pass for the 2-D path.  Same data layout as
; PREP_8TAP_H but with m8/m9 as the shufA/shufB shuffles, m11-m14 as
; the tap pairs, m15 as the 2-D rounding bias and a 6-bit intermediate
; shift.  m2/m3 are clobbered as scratch; arg 1 gets the packed result.
    pshufb               m2, m%1, m9   ; 2 3 3 4 4 5 5 6
    pshufb              m%1, m8        ; 0 1 1 2 2 3 3 4
    pmaddwd              m3, m12, m2
    pmaddwd             m%1, m11
    pshufb              m%2, m9        ; 6 7 7 8 8 9 9 a
    shufpd               m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd                m3, m15
    paddd               m%1, m3
    pmaddwd              m3, m14, m%2
    paddd               m%1, m3
    pmaddwd              m3, m13, m2
    pshufb              m%3, m9        ; a b b c c d d e
    pmaddwd              m2, m11
    paddd               m%1, m3
    pmaddwd              m3, m12, m%2
    shufpd              m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd             m%3, m14
    pmaddwd             m%2, m13
    paddd                m2, m15
    paddd                m2, m3
    paddd                m2, m%3
    paddd                m2, m%2
    psrad               m%1, 6
    psrad                m2, 6
    packssdw            m%1, m2
%endmacro
    ; Per-strip setup: horizontally filter the first 7 rows (0-6) and
    ; interleave them into the 01/12/23/34/45/56 vertical pairs.
    movu                xm4, [srcq+r6       + 0]
    vbroadcasti128       m8, [subpel_h_shufA]
    lea                  r5, [srcq+strideq*4]
    movu                xm6, [srcq+r6       + 8]
    vbroadcasti128       m9, [subpel_h_shufB]
    mov                  r7, tmpq
    movu                xm0, [srcq+r6       +16]
    movu                xm5, [srcq+strideq*0+ 0]
    vinserti128          m5, [r5  +strideq*0+ 0], 1
    movu                xm1, [srcq+strideq*0+16]
    vinserti128          m1, [r5  +strideq*0+16], 1
    shufpd               m7, m5, m1, 0x05
    INIT_XMM avx2
    PREP_8TAP_HV_H        4, 6, 0    ; 3
    INIT_YMM avx2
    PREP_8TAP_HV_H        5, 7, 1    ; 0 4
    movu                xm0, [srcq+strideq*2+ 0]
    vinserti128          m0, [srcq+r6     *2+ 0], 1
    movu                xm1, [srcq+strideq*2+16]
    vinserti128          m1, [srcq+r6     *2+16], 1
    shufpd               m7, m0, m1, 0x05
    PREP_8TAP_HV_H        0, 7, 1    ; 2 6
    movu                xm6, [srcq+strideq*1+ 0]
    movu                xm1, [srcq+strideq*1+16]
    vinserti128          m6, [r5  +strideq*1+ 0], 1
    vinserti128          m1, [r5  +strideq*1+16], 1
    add                  r5, r6
    shufpd               m7, m6, m1, 0x05
    PREP_8TAP_HV_H        6, 7, 1    ; 1 5
    vpermq               m4, m4, q1100
    vpermq               m5, m5, q3120
    vpermq               m6, m6, q3120
    vpermq               m7, m0, q3120
    punpcklwd            m3, m7, m4  ; 23
    punpckhwd            m4, m5      ; 34
    punpcklwd            m1, m5, m6  ; 01
    punpckhwd            m5, m6      ; 45
    punpcklwd            m2, m6, m7  ; 12
    punpckhwd            m6, m7      ; 56
.hv_w8_loop:
    ; Vertical taps are reloaded from [v_mul] each pass because every
    ; ymm register is needed for the interleaved horizontal filtering.
    vpbroadcastd         m9, [v_mul+4*0]
    vpbroadcastd         m7, [v_mul+4*1]
    vpbroadcastd        m10, [v_mul+4*2]
    pmaddwd              m8, m9, m1  ; a0
    pmaddwd              m9, m2      ; b0
    mova                 m1, m3
    mova                 m2, m4
    pmaddwd              m3, m7      ; a1
    pmaddwd              m4, m7      ; b1
    paddd                m8, m15
    paddd                m9, m15
    paddd                m8, m3
    paddd                m9, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddwd              m5, m10     ; a2
    pmaddwd              m6, m10     ; b2
    paddd                m8, m5
    paddd                m9, m6
    movu                xm5, [r5+strideq*0]
    vinserti128          m5, [r5+strideq*1], 1
    vbroadcasti128       m7, [subpel_h_shufA]
    vbroadcasti128      m10, [subpel_h_shufB]
    movu                xm6, [r5+strideq*0+16]
    vinserti128          m6, [r5+strideq*1+16], 1
    vextracti128       [r7], m0, 1  ; spill previous row to the dst slot; reloaded below
    pshufb               m0, m5, m7  ; 01
    pshufb               m5, m10     ; 23
    pmaddwd              m0, m11
    pmaddwd              m5, m12
    paddd                m0, m15
    paddd                m0, m5
    pshufb               m5, m6, m7  ; 89
    pshufb               m6, m10     ; ab
    pmaddwd              m5, m13
    pmaddwd              m6, m14
    paddd                m5, m15
    paddd                m6, m5
    movu                xm5, [r5+strideq*0+8]
    vinserti128          m5, [r5+strideq*1+8], 1
    lea                  r5, [r5+strideq*2]
    pshufb               m7, m5, m7
    pshufb               m5, m10
    pmaddwd             m10, m13, m7
    pmaddwd              m7, m11
    paddd                m0, m10
    paddd                m6, m7
    pmaddwd              m7, m14, m5
    pmaddwd              m5, m12
    paddd                m0, m7
    paddd                m5, m6
    vbroadcasti128       m6, [r7]    ; reload the spilled previous row
    vpbroadcastd        m10, [v_mul+4*3]
    psrad                m0, 6
    psrad                m5, 6
    packssdw             m0, m5
    vpermq               m7, m0, q3120 ; 7 8
    shufpd               m6, m7, 0x04  ; 6 7
    punpcklwd            m5, m6, m7    ; 67
    punpckhwd            m6, m7        ; 78
    pmaddwd              m7, m10, m5   ; a3
    pmaddwd             m10, m6        ; b3
    paddd                m7, m8
    paddd                m9, m10
    psrad                m7, 6
    psrad                m9, 6
    packssdw             m7, m9
    vpermq               m7, m7, q3120
    mova          [r7+r8*0], xm7
    vextracti128  [r7+r8*2], m7, 1
    lea                  r7, [r7+r8*4]
    sub                  hd, 2
    jg .hv_w8_loop
    ; Next 8-pixel column strip; h is in wd's low byte (packed above).
    add                srcq, 16
    add                tmpq, 16
    movzx                hd, wb
    sub                  wd, 1<<8
    jg .hv_w8_loop0
%if WIN64
    POP                  r8
%endif
    RET
   3474 
; mov that is only emitted in the prep variant (no-op when assembling put).
%macro movifprep 2
%if isprep
    mov %1, %2
%endif
%endmacro
   3480 
; Redefine register name r<arg1> (and its q/d width forms) as an alias
; of r<arg2>.  Assembly-time only; used to share put/prep code paths.
%macro REMAP_REG 2
%xdefine r%1  r%2
%xdefine r%1q r%2q
%xdefine r%1d r%2d
%endmacro
   3486 
; For prep only: shift every register name down by one (r14->r13 ...
; r1->r0) so the shared scaled-MC body can be written once for both
; put and prep.  The original r14 definition is saved for restoring.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
%if isprep
 %xdefine r14_save r14
 %assign %%i 14
 %rep 14
  %assign %%j %%i-1
  REMAP_REG %%i, %%j
  %assign %%i %%i-1
 %endrep
%endif
%endmacro
   3498 
; Inverse of MCT_8TAP_SCALED_REMAP_REGS_TO_PREV: remap r1..r13 back up
; by one and restore the saved r14 definition.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
%if isprep
 %assign %%i 1
 %rep 13
  %assign %%j %%i+1
  REMAP_REG %%i, %%j
  %assign %%i %%i+1
 %endrep
 %xdefine r14 r14_save
 %undef r14_save
%endif
%endmacro
   3511 
; Return from a scaled-MC function: the default register mapping must
; be in effect for RET's epilogue.  When arg 1 is nonzero the prep
; remapping is re-applied afterwards (assembly-time only) so code
; following this macro keeps using the remapped names.
%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
%if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
%endif
%endmacro
   3519 
%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd
; Horizontal pass for scaled MC: gathers eight per-column source windows
; (pixel offsets in r4/r6/r7/r9/r10/r11/r13/rX) from two consecutive
; rows and filters them with the per-column taps in m12-m15.  The rows
; are rounded with m10 and shifted by xm11, then packed into m<arg1>.
; The optional 9th arg reloads the rounding constant from the stack
; (used when m10 was clobbered by the caller).
    movu               xm%1, [srcq+ r4*2]
    movu               xm%2, [srcq+ r6*2]
    movu               xm%3, [srcq+ r7*2]
    movu               xm%4, [srcq+ r9*2]
    vinserti128         m%1, [srcq+r10*2], 1
    vinserti128         m%2, [srcq+r11*2], 1
    vinserti128         m%3, [srcq+r13*2], 1
    vinserti128         m%4, [srcq+ rX*2], 1
    add                srcq, ssq
    movu               xm%5, [srcq+ r4*2]
    movu               xm%6, [srcq+ r6*2]
    movu               xm%7, [srcq+ r7*2]
    movu               xm%8, [srcq+ r9*2]
    vinserti128         m%5, [srcq+r10*2], 1
    vinserti128         m%6, [srcq+r11*2], 1
    vinserti128         m%7, [srcq+r13*2], 1
    vinserti128         m%8, [srcq+ rX*2], 1
    add                srcq, ssq
    pmaddwd             m%1, m12
    pmaddwd             m%2, m13
    pmaddwd             m%3, m14
    pmaddwd             m%4, m15
    pmaddwd             m%5, m12
    pmaddwd             m%6, m13
    pmaddwd             m%7, m14
    pmaddwd             m%8, m15
    phaddd              m%1, m%2
%if %9
    mova                m10, [rsp+0x00]
%endif
    phaddd              m%3, m%4
    phaddd              m%5, m%6
    phaddd              m%7, m%8
    ; Reduce each 8-tap dot product to one dword per column:
    phaddd              m%1, m%3
    phaddd              m%5, m%7
    paddd               m%1, m10
    paddd               m%5, m10
    psrad               m%1, xm11
    psrad               m%5, xm11
    packssdw            m%1, m%5
%endmacro
   3562 
; Scaled (dx/dy-stepped) 8-tap motion compensation for 16 bpc.
; Arg 1 selects the put (to dst with stride) or prep (to tmp buffer)
; variant; the shared body relies on the register remapping macros
; above.  This preamble loads the per-bitdepth rounding/shift constants
; and dispatches on log2(w) via the jump table.
%macro MC_8TAP_SCALED 1
%ifidn %1, put
%assign isput  1
%assign isprep 0
cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
%xdefine base_reg r12
    mov                 r7d, pxmaxm
%else
%assign isput  0
%assign isprep 1
cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
 %define tmp_stridem qword [rsp+0xd0]
%xdefine base_reg r11
%endif
    lea            base_reg, [%1_8tap_scaled_16bpc_avx2]
%define base base_reg-%1_8tap_scaled_16bpc_avx2
    tzcnt                wd, wm
    vpbroadcastd         m8, dxm
%if isprep && UNIX64
    movd               xm10, mxd
    vpbroadcastd        m10, xm10
    mov                 r5d, t0d
DECLARE_REG_TMP 5, 7
    mov                 r6d, pxmaxm
%else
    vpbroadcastd        m10, mxm
%if isput
    vpbroadcastw        m11, pxmaxm
%else
    mov                 r6d, pxmaxm
%endif
%endif
    mov                 dyd, dym
%if isput
%if WIN64
    mov                 r8d, hm
 DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
 %define hm r5m
 %define dxm r8m
%else
 DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
 %define hm r6m
%endif
%define dsm [rsp+0x98]
%define rX r1
%define rXd r1d
%else ; prep
%if WIN64
    mov                 r7d, hm
 DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
 %define hm r4m
 %define dxm r7m
%else
 DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
 %define hm [rsp+0x98]
%endif
MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
%define rX r14
%define rXd r14d
%endif
    shr                 r7d, 11  ; pxmax >> 11: 0 for 10 bpc, 1 for 12 bpc
    vpbroadcastd         m6, [base+pd_0x3ff]
    vpbroadcastd        m12, [base+s_8tap_h_rnd+r7*4]
    movd                xm7, [base+s_8tap_h_sh+r7*4]
%if isput
    vpbroadcastd        m13, [base+put_s_8tap_v_rnd+r7*4]
    pinsrd              xm7, [base+put_s_8tap_v_sh+r7*4], 2 ; v shift in dword 2
%else
    vpbroadcastd        m13, [base+pd_m524256]
%endif
    pxor                 m9, m9
    lea                ss3q, [ssq*3]
    movzx               r7d, t1b
    shr                 t1d, 16
    cmp                  hd, 6
    cmovs               t1d, r7d  ; h < 6: use the alternate packed filter index
    sub                srcq, ss3q
    ; dy of exactly 1.0 or 2.0 pixels per row has specialized paths:
    cmp                 dyd, 1024
    je .dy1
    cmp                 dyd, 2048
    je .dy2
    movzx                wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%if isput
.w2: ; 2-pixel-wide put: two per-column mx phases stepped by dx
    mov                 myd, mym
    movzx               t0d, t0b
    sub                srcq, 2
    movd               xm15, t0d
    punpckldq            m8, m9, m8
    paddd               m10, m8 ; mx+dx*[0,1]
    vpbroadcastd       xm14, [base+pq_0x40000000+2]
    vpbroadcastd       xm15, xm15
    pand                xm8, xm10, xm6
    psrld               xm8, 6
    paddd              xm15, xm8
    movd                r4d, xm15
    pextrd              r6d, xm15, 1
    vbroadcasti128       m5, [base+bdct_lb_q]
    vbroadcasti128       m6, [base+subpel_s_shuf2]
    vpbroadcastd       xm15, [base+subpel_filters+r4*8+2]
    vpbroadcastd        xm4, [base+subpel_filters+r6*8+2]
    pcmpeqd             xm8, xm9
    psrld               m10, 10
    paddd               m10, m10
    movu                xm0, [srcq+ssq*0]
    movu                xm1, [srcq+ssq*1]
    movu                xm2, [srcq+ssq*2]
    movu                xm3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pshufb              m10, m5
    paddb               m10, m6
    vpblendd           xm15, xm4, 0xa
    pblendvb           xm15, xm14, xm8
    pmovsxbw            m15, xm15
    ; Horizontal filter of the first 8 rows:
    vinserti128          m0, [srcq+ssq*0], 1 ; 0 4
    vinserti128          m1, [srcq+ssq*1], 1 ; 1 5
    vinserti128          m2, [srcq+ssq*2], 1 ; 2 6
    vinserti128          m3, [srcq+ss3q ], 1 ; 3 7
    lea                srcq, [srcq+ssq*4]
    REPX    {pshufb x, m10}, m0, m1, m2, m3
    REPX   {pmaddwd x, m15}, m0, m1, m2, m3
    phaddd               m0, m1
    phaddd               m2, m3
    paddd                m0, m12
    paddd                m2, m12
    psrad                m0, xm7
    psrad                m2, xm7
    packssdw             m0, m2             ; 0 1 2 3  4 5 6 7
    vextracti128        xm1, m0, 1
    palignr             xm2, xm1, xm0, 4    ; 1 2 3 4
    punpcklwd           xm3, xm0, xm2       ; 01 12
    punpckhwd           xm0, xm2            ; 23 34
    pshufd              xm4, xm1, q0321     ; 5 6 7 _
    punpcklwd           xm2, xm1, xm4       ; 45 56
    punpckhwd           xm4, xm1, xm4       ; 67 __
.w2_loop:
    ; Select the vertical filter for the current subpixel phase;
    ; phase 0 uses the identity filter (single tap of 64).
    and                 myd, 0x3ff
    mov                 r6d, 64 << 24
    mov                 r4d, myd
    shr                 r4d, 6
    lea                 r4d, [t1+r4]
    cmovnz              r6q, [base+subpel_filters+r4*8]
    movq               xm14, r6q
    pmovsxbw           xm14, xm14
    pshufd              xm8, xm14, q0000
    pshufd              xm9, xm14, q1111
    pmaddwd             xm5, xm3, xm8
    pmaddwd             xm6, xm0, xm9
    pshufd              xm8, xm14, q2222
    pshufd             xm14, xm14, q3333
    paddd               xm5, xm6
    pmaddwd             xm6, xm2, xm8
    pmaddwd             xm8, xm4, xm14
    psrldq              xm9, xm7, 8         ; vertical shift (dword 2 of xm7)
    paddd               xm5, xm6
    paddd               xm5, xm13
    paddd               xm5, xm8
    psrad               xm5, xm9
    packusdw            xm5, xm5
    pminsw              xm5, xm11           ; clamp to pxmax
    movd             [dstq], xm5
    add                dstq, dsq
    dec                  hd
    jz .ret
    ; Step my by dy; advance 1 source row (bit 10) or 2 rows otherwise.
    add                 myd, dyd
    test                myd, ~0x3ff
    jz .w2_loop
    movu                xm5, [srcq]
    test                myd, 0x400
    jz .w2_skip_line
    add                srcq, ssq
    shufps              xm3, xm0, q1032     ; 01 12
    shufps              xm0, xm2, q1032     ; 23 34
    shufps              xm2, xm4, q1032     ; 45 56
    pshufb              xm5, xm10
    pmaddwd             xm5, xm15
    phaddd              xm5, xm5
    paddd               xm5, xm12
    psrad               xm5, xm7
    packssdw            xm5, xm5
    palignr             xm1, xm5, xm1, 12
    punpcklqdq          xm1, xm1            ; 6 7 6 7
    punpcklwd           xm4, xm1, xm5       ; 67 __
    jmp .w2_loop
.w2_skip_line:
    ; Advanced two rows: shift the pair history by a full row pair.
    movu                xm6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova                xm3, xm0            ; 01 12
    mova                xm0, xm2            ; 23 34
    pshufb              xm5, xm10
    pshufb              xm6, xm10
    pmaddwd             xm5, xm15
    pmaddwd             xm6, xm15
    phaddd              xm5, xm6
    paddd               xm5, xm12
    psrad               xm5, xm7
    packssdw            xm5, xm5            ; 6 7 6 7
    palignr             xm1, xm5, xm1, 8    ; 4 5 6 7
    pshufd              xm5, xm1, q0321     ; 5 6 7 _
    punpcklwd           xm2, xm1, xm5       ; 45 56
    punpckhwd           xm4, xm1, xm5       ; 67 __
    jmp .w2_loop
%endif
   3768 .w4:
   3769    mov                 myd, mym
   3770    mova         [rsp+0x00], m12
   3771 %if isput
   3772    mova         [rsp+0x20], xm13
   3773 %else
   3774    SWAP                m11, m13
   3775 %endif
   3776    mova         [rsp+0x30], xm7
   3777    vbroadcasti128       m7, [base+rescale_mul]
   3778    movzx               t0d, t0b
   3779    sub                srcq, 2
   3780    movd               xm15, t0d
   3781    pmaddwd              m8, m7
   3782    vpbroadcastq         m2, [base+pq_0x40000000+1]
   3783    vpbroadcastd       xm15, xm15
   3784    SWAP                m13, m10
   3785    paddd               m13, m8 ; mx+dx*[0-3]
   3786    pand                 m6, m13
   3787    psrld                m6, 6
   3788    paddd              xm15, xm6
   3789    movd                r4d, xm15
   3790    pextrd              r6d, xm15, 1
   3791    pextrd             r11d, xm15, 2
   3792    pextrd             r13d, xm15, 3
   3793    vbroadcasti128       m5, [base+bdct_lb_q+ 0]
   3794    vbroadcasti128       m1, [base+bdct_lb_q+16]
   3795    vbroadcasti128       m0, [base+subpel_s_shuf2]
   3796    vpbroadcastd       xm14, [base+subpel_filters+r4*8+2]
   3797    vpbroadcastd        xm7, [base+subpel_filters+r6*8+2]
   3798    vpbroadcastd       xm15, [base+subpel_filters+r11*8+2]
   3799    vpbroadcastd        xm8, [base+subpel_filters+r13*8+2]
   3800    pcmpeqd              m6, m9
   3801    punpckldq           m10, m6, m6
   3802    punpckhdq            m6, m6
   3803    psrld               m13, 10
   3804    paddd               m13, m13
   3805    vpblendd           xm14, xm7, 0xa
   3806    vpblendd           xm15, xm8, 0xa
   3807    pmovsxbw            m14, xm14
   3808    pmovsxbw            m15, xm15
   3809    pblendvb            m14, m2, m10
   3810    pblendvb            m15, m2, m6
   3811    pextrd               r4, xm13, 2
   3812    pshufb              m12, m13, m5
   3813    pshufb              m13, m1
   3814    lea                  r6, [r4+ssq*1]
   3815    lea                 r11, [r4+ssq*2]
   3816    lea                 r13, [r4+ss3q ]
   3817    movu                xm7, [srcq+ssq*0]
   3818    movu                xm9, [srcq+ssq*1]
   3819    movu                xm8, [srcq+ssq*2]
   3820    movu               xm10, [srcq+ss3q ]
   3821    movu                xm1, [srcq+r4   ]
   3822    movu                xm3, [srcq+r6   ]
   3823    movu                xm2, [srcq+r11  ]
   3824    movu                xm4, [srcq+r13  ]
   3825    lea                srcq, [srcq+ssq*4]
   3826    vinserti128          m7, [srcq+ssq*0], 1
   3827    vinserti128          m9, [srcq+ssq*1], 1
   3828    vinserti128          m8, [srcq+ssq*2], 1
   3829    vinserti128         m10, [srcq+ss3q ], 1
   3830    vinserti128          m1, [srcq+r4   ], 1
   3831    vinserti128          m3, [srcq+r6   ], 1
   3832    vinserti128          m2, [srcq+r11  ], 1
   3833    vinserti128          m4, [srcq+r13  ], 1
   3834    lea                srcq, [srcq+ssq*4]
   3835    vpbroadcastb         m5, xm13
   3836    psubb               m13, m5
   3837    paddb               m12, m0
   3838    paddb               m13, m0
   3839    REPX    {pshufb x, m12}, m7, m9, m8, m10
   3840    REPX   {pmaddwd x, m14}, m7, m9, m8, m10
   3841    REPX    {pshufb x, m13}, m1, m2, m3, m4
   3842    REPX   {pmaddwd x, m15}, m1, m2, m3, m4
   3843    mova                 m5, [rsp+0x00]
   3844    movd                xm6, [rsp+0x30]
   3845    phaddd               m7, m1
   3846    phaddd               m9, m3
   3847    phaddd               m8, m2
   3848    phaddd              m10, m4
   3849    REPX      {paddd x, m5}, m7, m9, m8, m10
   3850    REPX     {psrad x, xm6}, m7, m9, m8, m10
   3851    packssdw             m7, m9                 ; 0 1  4 5
   3852    packssdw             m8, m10                ; 2 3  6 7
   3853    vextracti128        xm9, m7, 1              ; 4 5
   3854    vextracti128        xm3, m8, 1              ; 6 7
   3855    shufps              xm4, xm7, xm8, q1032    ; 1 2
   3856    shufps              xm5, xm8, xm9, q1032    ; 3 4
   3857    shufps              xm6, xm9, xm3, q1032    ; 5 6
   3858    psrldq             xm10, xm3, 8             ; 7 _
   3859    punpcklwd           xm0, xm7, xm4   ; 01
   3860    punpckhwd           xm7, xm4        ; 12
   3861    punpcklwd           xm1, xm8, xm5   ; 23
   3862    punpckhwd           xm8, xm5        ; 34
   3863    punpcklwd           xm2, xm9, xm6   ; 45
   3864    punpckhwd           xm9, xm6        ; 56
   3865    punpcklwd           xm3, xm10       ; 67
   3866    mova         [rsp+0x40], xm7
   3867    mova         [rsp+0x50], xm8
   3868    mova         [rsp+0x60], xm9
           ; .w4_loop: vertical 8-tap pass for the w==4 scaled path.
           ; The low 10 bits of myd are the vertical subpel position; after
           ; shr 6 the integer filter index selects from subpel_filters.
           ; r11q starts as 64<<24 (identity tap) and is replaced via cmovnz
           ; when the subpel fraction is nonzero (and sets ZF on zero).
           ; xm0-xm3 hold interleaved row pairs (01/23/45/67 from the setup
           ; above); [rsp+0x40..0x60] cache the 12/34/56 pairs for row shifts.
   3869 .w4_loop:
   3870    and                 myd, 0x3ff
   3871    mov                r11d, 64 << 24
   3872    mov                r13d, myd
   3873    shr                r13d, 6
   3874    lea                r13d, [t1+r13]
   3875    cmovnz             r11q, [base+subpel_filters+r13*8]
   3876    movq                xm9, r11q
   3877    pmovsxbw            xm9, xm9
           ; Broadcast the 4 coefficient pairs and accumulate taps 0-7.
   3878    pshufd              xm7, xm9, q0000
   3879    pshufd              xm8, xm9, q1111
   3880    pmaddwd             xm4, xm0, xm7
   3881    pmaddwd             xm5, xm1, xm8
   3882    pshufd              xm7, xm9, q2222
   3883    pshufd              xm9, xm9, q3333
   3884    pmaddwd             xm6, xm2, xm7
   3885    pmaddwd             xm8, xm3, xm9
   3886 %if isput
           ; put: reload vertical rounding constant and shift saved on the stack.
   3887    mova                xm7, [rsp+0x20]
   3888    movd                xm9, [rsp+0x38]
   3889 %else
   3890    SWAP                 m7, m11
   3891 %endif
   3892    paddd               xm4, xm5
   3893    paddd               xm6, xm8
   3894    paddd               xm4, xm6
   3895    paddd               xm4, xm7
   3896 %if isput
           ; put: shift, pack to unsigned words, clamp to pixel max (xm11).
   3897    psrad               xm4, xm9
   3898    packusdw            xm4, xm4
   3899    pminuw              xm4, xm11
   3900    movq             [dstq], xm4
   3901    add                dstq, dsq
   3902 %else
           ; prep: fixed >>6 to intermediate precision, signed pack.
   3903    SWAP                m11, m7
   3904    psrad               xm4, 6
   3905    packssdw            xm4, xm4
   3906    movq             [tmpq], xm4
   3907    add                tmpq, 8
   3908 %endif
   3909    dec                  hd
   3910    jz .ret
           ; Step the vertical position; if the integer part did not advance,
           ; the current row window is reused as-is.
   3911    add                 myd, dyd
   3912    test                myd, ~0x3ff
   3913    jz .w4_loop
   3914    mova                xm8, [rsp+0x00]
   3915    movd                xm9, [rsp+0x30]
   3916    movu                xm4, [srcq]
   3917    movu                xm5, [srcq+r4]
           ; bit 10 of the stepped position: set -> refill one source row here,
           ; clear -> refill two rows in .w4_skip_line.
   3918    test                myd, 0x400
   3919    jz .w4_skip_line
           ; Shift the cached row-pair window down by one and filter the new
           ; row horizontally (shuffle, madd, round, shift, pack).
   3920    mova                xm0, [rsp+0x40]
   3921    mova         [rsp+0x40], xm1
   3922    mova                xm1, [rsp+0x50]
   3923    mova         [rsp+0x50], xm2
   3924    mova                xm2, [rsp+0x60]
   3925    mova         [rsp+0x60], xm3
   3926    pshufb              xm4, xm12
   3927    pshufb              xm5, xm13
   3928    pmaddwd             xm4, xm14
   3929    pmaddwd             xm5, xm15
   3930    phaddd              xm4, xm5
   3931    paddd               xm4, xm8
   3932    psrad               xm4, xm9
   3933    packssdw            xm4, xm4
   3934    punpcklwd           xm3, xm10, xm4
   3935    mova               xm10, xm4
   3936    add                srcq, ssq
   3937    jmp .w4_loop
           ; .w4_skip_line: advance two source rows at once; window shifts by
           ; one full pair instead of one row.
   3938 .w4_skip_line:
   3939    movu                xm6, [srcq+ssq*1]
   3940    movu                xm7, [srcq+r6]
   3941    movu                 m0, [rsp+0x50]
   3942    pshufb              xm4, xm12
   3943    pshufb              xm6, xm12
   3944    pshufb              xm5, xm13
   3945    pshufb              xm7, xm13
   3946    pmaddwd             xm4, xm14
   3947    pmaddwd             xm6, xm14
   3948    pmaddwd             xm5, xm15
   3949    pmaddwd             xm7, xm15
   3950    mova         [rsp+0x40], m0
   3951    phaddd              xm4, xm5
   3952    phaddd              xm6, xm7
   3953    paddd               xm4, xm8
   3954    paddd               xm6, xm8
   3955    psrad               xm4, xm9
   3956    psrad               xm6, xm9
   3957    packssdw            xm4, xm6
   3958    punpcklwd           xm9, xm10, xm4
   3959    mova         [rsp+0x60], xm9
   3960    psrldq             xm10, xm4, 8
   3961    mova                xm0, xm1
   3962    mova                xm1, xm2
   3963    mova                xm2, xm3
   3964    punpcklwd           xm3, xm4, xm10
   3965    lea                srcq, [srcq+ssq*2]
   3966    jmp .w4_loop
           ; Register-name bookkeeping for the assembler only (no code emitted
           ; after the unconditional jmp above).
   3967    SWAP                m10, m13
   3968 %if isprep
   3969    SWAP                m13, m11
   3970 %endif
           ; .w8/.w16/.w32/.w64/.w128: width dispatch for the generic path.
           ; [rsp+0x80] = number of 8-pixel-wide column tiles to process;
           ; prep additionally sets the tmp row stride in bytes.
   3971 .w8:
   3972    mov    dword [rsp+0x80], 1
   3973    movifprep   tmp_stridem, 16
   3974    jmp .w_start
   3975 .w16:
   3976    mov    dword [rsp+0x80], 2
   3977    movifprep   tmp_stridem, 32
   3978    jmp .w_start
   3979 .w32:
   3980    mov    dword [rsp+0x80], 4
   3981    movifprep   tmp_stridem, 64
   3982    jmp .w_start
   3983 .w64:
   3984    mov    dword [rsp+0x80], 8
   3985    movifprep   tmp_stridem, 128
   3986    jmp .w_start
   3987 .w128:
   3988    mov    dword [rsp+0x80], 16
   3989    movifprep   tmp_stridem, 256
           ; .w_start: common setup for all tile widths. Saves the per-frame
           ; state (filter base t0d, srcq, dst/tmp pointer, h) to the stack so
           ; .hloop_prep can restore it for each subsequent column tile.
   3990 .w_start:
   3991    SWAP                m10, m12, m1
   3992    SWAP                m11, m7
   3993    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
   3994 %if isput
   3995    movifnidn           dsm, dsq
   3996    mova         [rsp+0xb0], xm7
   3997 %endif
   3998    mova         [rsp+0x00], m10
   3999    mova         [rsp+0x20], m13
   4000    shr                 t0d, 16
           ; Back up 6 bytes (3 pixels) so the 8-tap window is centered.
   4001    sub                srcq, 6
   4002    pmaddwd              m8, [base+rescale_mul2]
   4003    movd               xm15, t0d
   4004    mov          [rsp+0x84], t0d
   4005    mov          [rsp+0x88], srcq
   4006    mov          [rsp+0x90], r0q ; dstq / tmpq
   4007 %if UNIX64
   4008    mov                  hm, hd
   4009 %endif
   4010    shl           dword dxm, 3 ; dx*8
   4011    vpbroadcastd        m15, xm15
   4012    paddd                m1, m8 ; mx+dx*[0-7]
   4013    jmp .hloop
           ; .hloop_prep: move to the next 8-wide column tile, or return when
           ; the tile counter at [rsp+0x80] reaches zero.
   4014 .hloop_prep:
   4015    dec    dword [rsp+0x80]
   4016    jz .ret
   4017    add    qword [rsp+0x90], 16
   4018    mov                  hd, hm
   4019    vpbroadcastd         m8, dxm
   4020    vpbroadcastd         m6, [base+pd_0x3ff]
   4021    paddd                m1, m8, [rsp+0x40]
   4022    vpbroadcastd        m15, [rsp+0x84]
   4023    pxor                 m9, m9
   4024    mov                srcq, [rsp+0x88]
   4025    mov                 r0q, [rsp+0x90] ; dstq / tmpq
           ; .hloop: horizontal pass for one 8-pixel column tile.
           ; m1 holds mx+dx*[0-7] (per-column horizontal positions).
           ; fraction = m1 & 0x3ff; filter index = fraction >> 6, added to the
           ; broadcast filter-type base in m15. m5 becomes a per-column mask of
           ; zero filter indices, used below to substitute the replacement
           ; pattern from pq_0x40000000 via pblendvb.
           ; NOTE(review): pq_0x40000000 appears to encode the filter used for
           ; fraction==0 columns - confirm against the constant's definition.
   4026 .hloop:
   4027    vpbroadcastq        xm2, [base+pq_0x40000000]
   4028    pand                 m5, m1, m6
   4029    psrld                m5, 6
   4030    paddd               m15, m5
   4031    pcmpeqd              m5, m9
           ; Scatter the 8 dword filter indices into scalar regs r4/r6/r7/r9/
           ; r10/r11/r13/rX for table loads.
   4032    vextracti128        xm7, m15, 1
   4033    movq                 r6, xm15
   4034    pextrq               r9, xm15, 1
   4035    movq                r11, xm7
   4036    pextrq               rX, xm7, 1
   4037    mov                 r4d, r6d
   4038    shr                  r6, 32
   4039    mov                 r7d, r9d
   4040    shr                  r9, 32
   4041    mov                r10d, r11d
   4042    shr                 r11, 32
   4043    mov                r13d, rXd
   4044    shr                  rX, 32
   4045    mova         [rsp+0x40], m1
           ; Gather 8 horizontal 8-tap filters (one per column) into m12-m15.
   4046    movq               xm12, [base+subpel_filters+ r4*8]
   4047    movq               xm13, [base+subpel_filters+ r6*8]
   4048    movhps             xm12, [base+subpel_filters+ r7*8]
   4049    movhps             xm13, [base+subpel_filters+ r9*8]
   4050    movq               xm14, [base+subpel_filters+r10*8]
   4051    movq               xm15, [base+subpel_filters+r11*8]
   4052    movhps             xm14, [base+subpel_filters+r13*8]
   4053    movhps             xm15, [base+subpel_filters+ rX*8]
           ; Integer pixel offsets per column = position >> 10; spill them to
           ; [rsp+0xa0] so .vloop can reload r4/r6/r7/r9 after they are reused.
   4054    psrld                m1, 10
   4055    vextracti128        xm7, m1, 1
   4056    vextracti128        xm6, m5, 1
   4057    movq         [rsp+0xa0], xm1
   4058    movq         [rsp+0xa8], xm7
   4059    movq                 r6, xm1
   4060    pextrq              r11, xm1, 1
   4061    movq                 r9, xm7
   4062    pextrq               rX, xm7, 1
   4063    mov                 r4d, r6d
   4064    shr                  r6, 32
   4065    mov                r10d, r11d
   4066    shr                 r11, 32
   4067    mov                 r7d, r9d
   4068    shr                  r9, 32
   4069    mov                r13d, rXd
   4070    shr                  rX, 32
           ; Expand the zero-fraction mask to byte granularity per filter slot
           ; and blend in the replacement filter where it applies.
   4071    pshufd              xm4, xm5, q2200
   4072    pshufd              xm5, xm5, q3311
   4073    pshufd              xm7, xm6, q2200
   4074    pshufd              xm6, xm6, q3311
   4075    pblendvb           xm12, xm2, xm4
   4076    pblendvb           xm13, xm2, xm5
   4077    pblendvb           xm14, xm2, xm7
   4078    pblendvb           xm15, xm2, xm6
   4079    pmovsxbw            m12, xm12
   4080    pmovsxbw            m13, xm13
   4081    pmovsxbw            m14, xm14
   4082    pmovsxbw            m15, xm15
           ; Horizontally filter the first 8 source rows into m0-m3.
   4083    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
   4084    mova        [rsp+0x60], m0
   4085    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
   4086    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
   4087    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
   4088    mova                 m0, [rsp+0x60]
   4089    vbroadcasti128       m9, [base+subpel_s_shuf8]
   4090    mov                 myd, mym
   4091    mov                 dyd, dym
   4092    pshufb               m0, m9     ; 01a 01b
   4093    pshufb               m1, m9     ; 23a 23b
   4094    pshufb               m2, m9     ; 45a 45b
   4095    pshufb               m3, m9     ; 67a 67b
           ; .vloop: per-output-row vertical 8-tap filter; the vertical filter
           ; is reloaded from subpel_filters each row (identity 64<<24 when the
           ; fraction is zero, selected via cmovnz).
   4096 .vloop:
   4097    and                 myd, 0x3ff
   4098    mov                 r6d, 64 << 24
   4099    mov                 r4d, myd
   4100    shr                 r4d, 6
   4101    lea                 r4d, [t1+r4]
   4102    cmovnz              r6q, [base+subpel_filters+r4*8]
   4103    movq                xm9, r6q
   4104    punpcklqdq          xm9, xm9
   4105    pmovsxbw             m9, xm9
   4106    pshufd               m8, m9, q0000
   4107    pshufd               m7, m9, q1111
   4108    pmaddwd              m4, m0, m8
   4109    pmaddwd              m5, m1, m7
   4110    pshufd               m8, m9, q2222
   4111    pshufd               m9, m9, q3333
   4112    pmaddwd              m6, m2, m8
   4113    pmaddwd              m7, m3, m9
   4114 %if isput
   4115    psrldq              xm8, xm11, 8
   4116 %endif
   4117    paddd                m4, [rsp+0x20]
   4118    paddd                m6, m7
   4119    paddd                m4, m5
   4120    paddd                m4, m6
   4121 %if isput
           ; put: variable shift, unsigned pack, clamp to pixel max.
   4122    psrad                m4, xm8
   4123    vextracti128        xm5, m4, 1
   4124    packusdw            xm4, xm5
   4125    pminsw              xm4, [rsp+0xb0]
   4126    mova             [dstq], xm4
   4127    add                dstq, dsm
   4128 %else
           ; prep: fixed >>6, signed pack to the intermediate buffer.
   4129    psrad                m4, 6
   4130    vextracti128        xm5, m4, 1
   4131    packssdw            xm4, xm5
   4132    mova             [tmpq], xm4
   4133    add                tmpq, tmp_stridem
   4134 %endif
   4135    dec                  hd
   4136    jz .hloop_prep
   4137    add                 myd, dyd
   4138    test                myd, ~0x3ff
   4139    jz .vloop
           ; Integer row advanced: restore per-column pixel offsets and refill.
           ; bit 10 clear -> two new rows needed (.skip_line); set -> one row.
   4140    test                myd, 0x400
   4141    mov          [rsp+0x60], myd
   4142    mov                 r4d, [rsp+0xa0]
   4143    mov                 r6d, [rsp+0xa4]
   4144    mov                 r7d, [rsp+0xa8]
   4145    mov                 r9d, [rsp+0xac]
   4146    jz .skip_line
           ; One-row refill: wswap rotates the 16-bit lanes of m0-m3 so the
           ; newly filtered row (shifted left 16) can be merged via pblendw.
   4147    vbroadcasti128       m9, [base+wswap]
   4148    movu                xm4, [srcq+ r4*2]
   4149    movu                xm5, [srcq+ r6*2]
   4150    movu                xm6, [srcq+ r7*2]
   4151    movu                xm7, [srcq+ r9*2]
   4152    vinserti128          m4, [srcq+r10*2], 1
   4153    vinserti128          m5, [srcq+r11*2], 1
   4154    vinserti128          m6, [srcq+r13*2], 1
   4155    vinserti128          m7, [srcq+ rX*2], 1
   4156    add                srcq, ssq
   4157    mov                 myd, [rsp+0x60]
   4158    mov                 dyd, dym
   4159    pshufb               m0, m9
   4160    pshufb               m1, m9
   4161    pshufb               m2, m9
   4162    pshufb               m3, m9
   4163    pmaddwd              m4, m12
   4164    pmaddwd              m5, m13
   4165    pmaddwd              m6, m14
   4166    pmaddwd              m7, m15
   4167    phaddd               m4, m5
   4168    phaddd               m6, m7
   4169    phaddd               m4, m6
   4170    paddd                m4, m10
   4171    psrad                m4, xm11
   4172    pslld                m4, 16
   4173    pblendw              m0, m1, 0xaa
   4174    pblendw              m1, m2, 0xaa
   4175    pblendw              m2, m3, 0xaa
   4176    pblendw              m3, m4, 0xaa
   4177    jmp .vloop
           ; .skip_line: two-row refill - shift the row window down by one
           ; pair and horizontally filter a fresh pair into m3.
   4178 .skip_line:
   4179    mova                 m0, m1
   4180    mova                 m1, m2
   4181    mova                 m2, m3
   4182    MC_8TAP_SCALED_H      3, 10, 4, 5, 6, 7, 8, 9, 1
   4183    vbroadcasti128       m9, [base+subpel_s_shuf8]
   4184    mov                 myd, [rsp+0x60]
   4185    mov                 dyd, dym
   4186    pshufb               m3, m9
   4187    jmp .vloop
           ; Assembler-only register-name bookkeeping (unreachable code point).
   4188    SWAP                 m1, m12, m10
   4189    SWAP                 m7, m11
           ; .dy1: specialization for dy==1 (vertical step of exactly one row
           ; per output row) - the vertical filter is constant, so it is
           ; loaded once instead of per row. Dispatch by width.
   4190 .dy1:
   4191    movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
   4192    add                  wq, base_reg
   4193    jmp                  wq
   4194 %if isput
           ; .dy1_w2: w==2, put-only (prep has no 2-wide case).
   4195 .dy1_w2:
   4196    mov                 myd, mym
   4197    movzx               t0d, t0b
   4198    sub                srcq, 2
   4199    movd               xm15, t0d
   4200    punpckldq            m8, m9, m8
   4201    paddd               m10, m8 ; mx+dx*[0-1]
   4202    vpbroadcastd       xm14, [base+pq_0x40000000+2]
   4203    vpbroadcastd       xm15, xm15
           ; Horizontal filter indices for the two columns.
   4204    pand                xm8, xm10, xm6
   4205    psrld               xm8, 6
   4206    paddd              xm15, xm8
   4207    movd                r4d, xm15
   4208    pextrd              r6d, xm15, 1
   4209    vbroadcasti128       m5, [base+bdct_lb_q]
   4210    vbroadcasti128       m6, [base+subpel_s_shuf2]
   4211    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
   4212    vpbroadcastd         m4, [base+subpel_filters+r6*8+2]
   4213    pcmpeqd             xm8, xm9
   4214    psrld               m10, 10
   4215    paddd               m10, m10
           ; Load the first 4 source rows, then 3 more (7 rows total for the
           ; initial 8-tap window minus the row produced in the loop).
   4216    movu                xm0, [srcq+ssq*0]
   4217    movu                xm1, [srcq+ssq*1]
   4218    movu                xm2, [srcq+ssq*2]
   4219    movu                xm3, [srcq+ss3q ]
   4220    lea                srcq, [srcq+ssq*4]
           ; Constant vertical filter: identity 64<<24 unless fraction != 0.
   4221    shr                 myd, 6
   4222    mov                 r4d, 64 << 24
   4223    lea                 myd, [t1+myq]
   4224    cmovnz              r4q, [base+subpel_filters+myq*8]
   4225    pshufb              m10, m5
   4226    paddb               m10, m6
   4227    vpblendd           xm15, xm4, 0xa
   4228    pblendvb           xm15, xm14, xm8
   4229    pmovsxbw            m15, xm15
   4230    vinserti128          m0, [srcq+ssq*0], 1
   4231    vinserti128          m1, [srcq+ssq*1], 1
   4232    vinserti128          m2, [srcq+ssq*2], 1
   4233    add                srcq, ss3q
           ; Expand the 8 vertical coefficients into 4 pair-broadcast regs.
   4234    movq                xm6, r4q
   4235    pmovsxbw            xm6, xm6
   4236    pshufd              xm8, xm6, q0000
   4237    pshufd              xm9, xm6, q1111
   4238    pshufd             xm14, xm6, q2222
   4239    pshufd              xm6, xm6, q3333
           ; Horizontal pass over the 7 preloaded rows.
   4240    REPX    {pshufb x, m10}, m0, m1, m2
   4241    pshufb              xm3, xm10
   4242    REPX   {pmaddwd x, m15}, m0, m1, m2
   4243    pmaddwd             xm3, xm15
   4244    phaddd               m0, m1
   4245    phaddd               m2, m3
   4246    paddd                m0, m12
   4247    paddd                m2, m12
   4248    psrad                m0, xm7
   4249    psrad                m2, xm7
   4250    packssdw             m0, m2
           ; Interleave rows into sliding pairs: 01/12, 23/34, 45/56.
   4251    vextracti128        xm1, m0, 1
   4252    palignr             xm2, xm1, xm0, 4
   4253    pshufd              xm4, xm1, q2121
   4254    punpcklwd           xm3, xm0, xm2       ; 01 12
   4255    punpckhwd           xm0, xm2            ; 23 34
   4256    punpcklwd           xm2, xm1, xm4       ; 45 56
           ; .dy1_w2_loop: two output rows per iteration; one new source row
           ; pair is filtered horizontally and rotated into the window.
   4257 .dy1_w2_loop:
   4258    movu                xm1, [srcq+ssq*0]
   4259    movu                xm5, [srcq+ssq*1]
   4260    lea                srcq, [srcq+ssq*2]
   4261    pshufb              xm1, xm10
   4262    pshufb              xm5, xm10
   4263    pmaddwd             xm1, xm15
   4264    pmaddwd             xm5, xm15
   4265    phaddd              xm1, xm5
   4266    pmaddwd             xm5, xm3, xm8
   4267    mova                xm3, xm0
   4268    pmaddwd             xm0, xm9
   4269    paddd               xm1, xm12
   4270    psrad               xm1, xm7
   4271    packssdw            xm1, xm1
   4272    paddd               xm5, xm0
   4273    mova                xm0, xm2
   4274    pmaddwd             xm2, xm14
   4275    paddd               xm5, xm2
   4276    palignr             xm2, xm1, xm4, 12
   4277    punpcklwd           xm2, xm1            ; 67 78
   4278    pmaddwd             xm4, xm2, xm6
   4279    paddd               xm5, xm13
   4280    paddd               xm5, xm4
   4281    mova                xm4, xm1
   4282    psrldq              xm1, xm7, 8
   4283    psrad               xm5, xm1
           ; Pack, clamp to pixel max, and store 2 output rows of 2 pixels.
   4284    packusdw            xm5, xm5
   4285    pminsw              xm5, xm11
   4286    movd       [dstq+dsq*0], xm5
   4287    pextrd     [dstq+dsq*1], xm5, 1
   4288    lea                dstq, [dstq+dsq*2]
   4289    sub                  hd, 2
   4290    jg .dy1_w2_loop
   4291    RET
   4292 %endif
           ; .dy1_w4: w==4 with dy==1. Horizontal rounding/shift constants are
           ; spilled to [rsp+0x00]/[rsp+0x40] (and the put pixel-max to
           ; [rsp+0x50]) so their registers can be reused.
   4293 .dy1_w4:
   4294    mov                 myd, mym
   4295 %if isput
   4296    mova         [rsp+0x50], xm11
   4297 %endif
   4298    mova         [rsp+0x00], m12
   4299    mova         [rsp+0x20], m13
   4300    mova         [rsp+0x40], xm7
   4301    vbroadcasti128       m7, [base+rescale_mul]
   4302    movzx               t0d, t0b
   4303    sub                srcq, 2
   4304    movd               xm15, t0d
   4305    pmaddwd              m8, m7
   4306    vpbroadcastq         m2, [base+pq_0x40000000+1]
   4307    vpbroadcastd       xm15, xm15
   4308    SWAP                m13, m10
   4309    paddd               m13, m8 ; mx+dx*[0-3]
           ; Per-column filter indices for the 4 columns.
   4310    pand                 m6, m13
   4311    psrld                m6, 6
   4312    paddd              xm15, xm6
   4313    movd                r4d, xm15
   4314    pextrd              r6d, xm15, 1
   4315    pextrd             r11d, xm15, 2
   4316    pextrd             r13d, xm15, 3
   4317    vbroadcasti128       m5, [base+bdct_lb_q+ 0]
   4318    vbroadcasti128       m1, [base+bdct_lb_q+16]
   4319    vbroadcasti128       m4, [base+subpel_s_shuf2]
   4320    vpbroadcastd       xm14, [base+subpel_filters+r4*8+2]
   4321    vpbroadcastd        xm7, [base+subpel_filters+r6*8+2]
   4322    vpbroadcastd       xm15, [base+subpel_filters+r11*8+2]
   4323    vpbroadcastd        xm8, [base+subpel_filters+r13*8+2]
           ; Zero-fraction masks for columns 0/1 (m10) and 2/3 (m6), used to
           ; blend in the pq_0x40000000 replacement pattern.
   4324    pcmpeqd              m6, m9
   4325    punpckldq           m10, m6, m6
   4326    punpckhdq            m6, m6
   4327    psrld               m13, 10
   4328    paddd               m13, m13
   4329    vpblendd           xm14, xm7, 0xa
   4330    vpblendd           xm15, xm8, 0xa
   4331    pmovsxbw            m14, xm14
   4332    pmovsxbw            m15, xm15
   4333    pblendvb            m14, m2, m10
   4334    pblendvb            m15, m2, m6
           ; r4/r6/r11/r13 become per-column byte offsets into the source row.
   4335    pextrd               r4, xm13, 2
   4336    pshufb              m12, m13, m5
   4337    pshufb              m13, m1
   4338    lea                  r6, [r4+ssq*2]
   4339    lea                 r11, [r4+ssq*1]
   4340    lea                 r13, [r4+ss3q ]
           ; Preload 7 source rows (0-6) into m0-m3 / m7-m10 halves.
   4341    movu                xm0, [srcq+ssq*0]
   4342    movu                xm7, [srcq+r4   ]
   4343    movu                xm1, [srcq+ssq*2]
   4344    movu                xm8, [srcq+r6   ]
   4345    vinserti128          m0, [srcq+ssq*1], 1 ; 0 1
   4346    vinserti128          m7, [srcq+r11  ], 1
   4347    vinserti128          m1, [srcq+ss3q ], 1 ; 2 3
   4348    vinserti128          m8, [srcq+r13  ], 1
   4349    lea                srcq, [srcq+ssq*4]
   4350    movu                xm2, [srcq+ssq*0]
   4351    movu                xm9, [srcq+r4   ]
   4352    movu                xm3, [srcq+ssq*2]    ; 6 _
   4353    movu               xm10, [srcq+r6   ]
   4354    vinserti128          m2, [srcq+ssq*1], 1 ; 4 5
   4355    vinserti128          m9, [srcq+r11  ], 1
   4356    lea                srcq, [srcq+ss3q ]
   4357    vpbroadcastb         m5, xm13
   4358    psubb               m13, m5
   4359    paddb               m12, m4
   4360    paddb               m13, m4
           ; Horizontal pass over the 7 rows: shuffle, madd, round, shift.
   4361    mova                 m5, [rsp+0x00]
   4362    movd                xm6, [rsp+0x40]
   4363    pshufb               m0, m12
   4364    pshufb               m1, m12
   4365    pmaddwd              m0, m14
   4366    pmaddwd              m1, m14
   4367    pshufb               m7, m13
   4368    pshufb               m8, m13
   4369    pmaddwd              m7, m15
   4370    pmaddwd              m8, m15
   4371    pshufb               m2, m12
   4372    pshufb              xm3, xm12
   4373    pmaddwd              m2, m14
   4374    pmaddwd             xm3, xm14
   4375    pshufb               m9, m13
   4376    pshufb             xm10, xm13
   4377    pmaddwd              m9, m15
   4378    pmaddwd            xm10, xm15
   4379    phaddd               m0, m7
   4380    phaddd               m1, m8
   4381    phaddd               m2, m9
   4382    phaddd              xm3, xm10
   4383    paddd                m0, m5
   4384    paddd                m1, m5
   4385    paddd                m2, m5
   4386    paddd               xm3, xm5
   4387    psrad                m0, xm6
   4388    psrad                m1, xm6
   4389    psrad                m2, xm6
   4390    psrad               xm3, xm6
           ; Build sliding row pairs 01/12, 23/34, 45/56 via lane shifts and
           ; 16-bit blends.
   4391    vperm2i128           m4, m0, m1, 0x21 ; 1 2
   4392    vperm2i128           m5, m1, m2, 0x21 ; 3 4
   4393    vperm2i128           m6, m2, m3, 0x21 ; 5 6
           ; Constant vertical filter (dy==1): identity unless fraction != 0.
   4394    shr                 myd, 6
   4395    mov                r13d, 64 << 24
   4396    lea                 myd, [t1+myq]
   4397    cmovnz             r13q, [base+subpel_filters+myq*8]
   4398    pslld                m4, 16
   4399    pslld                m5, 16
   4400    pslld                m6, 16
   4401    pblendw              m0, m4, 0xaa ; 01 12
   4402    pblendw              m1, m5, 0xaa ; 23 34
   4403    pblendw              m2, m6, 0xaa ; 45 56
   4404    movq               xm10, r13q
   4405    punpcklqdq         xm10, xm10
   4406    pmovsxbw            m10, xm10
   4407    pshufd               m7, m10, q0000
   4408    pshufd               m8, m10, q1111
   4409    pshufd               m9, m10, q2222
   4410    pshufd              m10, m10, q3333
           ; .dy1_w4_loop: two output rows per iteration; filters two new
           ; source rows horizontally and slides the 67/78 pair into place.
   4411 .dy1_w4_loop:
   4412    movu               xm11, [srcq+ssq*0]
   4413    movu                xm6, [srcq+r4   ]
   4414    vinserti128         m11, [srcq+ssq*1], 1
   4415    vinserti128          m6, [srcq+r11  ], 1
   4416    lea                srcq, [srcq+ssq*2]
   4417    pmaddwd              m4, m0, m7
   4418    pmaddwd              m5, m1, m8
   4419    pshufb              m11, m12
   4420    pshufb               m6, m13
   4421    pmaddwd             m11, m14
   4422    pmaddwd              m6, m15
   4423    paddd                m4, [rsp+0x20]
   4424    phaddd              m11, m6
   4425    pmaddwd              m6, m2, m9
   4426    paddd               m11, [rsp+0x00]
   4427    psrad               m11, [rsp+0x40]
   4428    mova                 m0, m1
   4429    mova                 m1, m2
   4430    paddd                m5, m6
   4431    paddd                m4, m5
   4432    vinserti128          m2, m3, xm11, 1
   4433    pslld                m3, m11, 16
   4434    pblendw              m2, m3, 0xaa   ; 67 78
   4435    pmaddwd              m5, m2, m10
   4436    vextracti128        xm3, m11, 1
   4437    paddd                m4, m5
   4438 %if isput
           ; put: shift by saved vertical shift, pack, clamp, store 2x4.
   4439    psrad                m4, [rsp+0x48]
   4440    vextracti128        xm5, m4, 1
   4441    packusdw            xm4, xm5
   4442    pminsw              xm4, [rsp+0x50]
   4443    movq       [dstq+dsq*0], xm4
   4444    movhps     [dstq+dsq*1], xm4
   4445    lea                dstq, [dstq+dsq*2]
   4446 %else
           ; prep: fixed >>6, signed pack to the intermediate buffer.
   4447    psrad                m4, 6
   4448    vextracti128        xm5, m4, 1
   4449    packssdw            xm4, xm5
   4450    mova             [tmpq], xm4
   4451    add                tmpq, 16
   4452 %endif
   4453    sub                  hd, 2
   4454    jg .dy1_w4_loop
   4455    MC_8TAP_SCALED_RET
           ; Assembler-only register-name bookkeeping (after the return).
   4456    SWAP                 m10, m13
           ; .dy1_w8/.../.dy1_w128: width dispatch for dy==1.
           ; [rsp+0xa0] = number of 8-wide column tiles; prep also sets the
           ; tmp stride.
   4457 .dy1_w8:
   4458    mov    dword [rsp+0xa0], 1
   4459    movifprep   tmp_stridem, 16
   4460    jmp .dy1_w_start
   4461 .dy1_w16:
   4462    mov    dword [rsp+0xa0], 2
   4463    movifprep   tmp_stridem, 32
   4464    jmp .dy1_w_start
   4465 .dy1_w32:
   4466    mov    dword [rsp+0xa0], 4
   4467    movifprep   tmp_stridem, 64
   4468    jmp .dy1_w_start
   4469 .dy1_w64:
   4470    mov    dword [rsp+0xa0], 8
   4471    movifprep   tmp_stridem, 128
   4472    jmp .dy1_w_start
   4473 .dy1_w128:
   4474    mov    dword [rsp+0xa0], 16
   4475    movifprep   tmp_stridem, 256
           ; .dy1_w_start: common setup. Unlike the generic .w_start, the
           ; vertical filter is constant (dy==1) and its 8 coefficients are
           ; stored once at [rsp+0x50] for broadcast in .dy1_hloop.
   4476 .dy1_w_start:
   4477    SWAP                m10, m12, m1
   4478    SWAP                m11, m7
   4479    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
   4480    mov                 myd, mym
   4481 %if isput
   4482 %define dsm [rsp+0xb8]
   4483    movifnidn           dsm, dsq
   4484    mova         [rsp+0xc0], xm7
   4485 %else
   4486 %if UNIX64
   4487  %define hm [rsp+0xb8]
   4488 %endif
   4489 %endif
   4490    mova         [rsp+0x00], m10
   4491    mova         [rsp+0x20], m13
   4492    mova         [rsp+0x40], xm11
   4493    shr                 t0d, 16
   4494    sub                srcq, 6
           ; Constant vertical filter: identity 64<<24 unless fraction != 0.
   4495    shr                 myd, 6
   4496    mov                 r4d, 64 << 24
   4497    lea                 myd, [t1+myq]
   4498    cmovnz              r4q, [base+subpel_filters+myq*8]
   4499    pmaddwd              m8, [base+rescale_mul2]
   4500    movd               xm15, t0d
   4501    mov          [rsp+0xa4], t0d
   4502    mov          [rsp+0xa8], srcq
   4503    mov          [rsp+0xb0], r0q ; dstq / tmpq
   4504 %if UNIX64
   4505    mov                  hm, hd
   4506 %endif
   4507    shl           dword dxm, 3 ; dx*8
   4508    vpbroadcastd        m15, xm15
   4509    paddd                m1, m8 ; mx+dx*[0-7]
   4510    movq                xm0, r4q
   4511    pmovsxbw            xm0, xm0
   4512    mova         [rsp+0x50], xm0
   4513    jmp .dy1_hloop
           ; .dy1_hloop_prep: next 8-wide column tile, or return when done.
   4514 .dy1_hloop_prep:
   4515    dec    dword [rsp+0xa0]
   4516    jz .ret
   4517    add    qword [rsp+0xb0], 16
   4518    mov                  hd, hm
   4519    vpbroadcastd         m8, dxm
   4520    vpbroadcastd         m6, [base+pd_0x3ff]
   4521    paddd                m1, m8, [rsp+0x60]
   4522    vpbroadcastd        m15, [rsp+0xa4]
   4523    pxor                 m9, m9
   4524    mov                srcq, [rsp+0xa8]
   4525    mov                 r0q, [rsp+0xb0] ; dstq / tmpq
   4526    mova                m10, [rsp+0x00]
   4527    mova               xm11, [rsp+0x40]
           ; .dy1_hloop: horizontal pass for one column tile; same filter
           ; gather / zero-fraction blend scheme as the generic .hloop above.
   4528 .dy1_hloop:
   4529    vpbroadcastq        xm2, [base+pq_0x40000000]
   4530    pand                 m5, m1, m6
   4531    psrld                m5, 6
   4532    paddd               m15, m5
   4533    pcmpeqd              m5, m9
   4534    vextracti128        xm7, m15, 1
   4535    movq                 r6, xm15
   4536    pextrq               r9, xm15, 1
   4537    movq                r11, xm7
   4538    pextrq               rX, xm7, 1
   4539    mov                 r4d, r6d
   4540    shr                  r6, 32
   4541    mov                 r7d, r9d
   4542    shr                  r9, 32
   4543    mov                r10d, r11d
   4544    shr                 r11, 32
   4545    mov                r13d, rXd
   4546    shr                  rX, 32
   4547    mova         [rsp+0x60], m1
           ; Gather 8 per-column horizontal filters into m12-m15.
   4548    movq               xm12, [base+subpel_filters+ r4*8]
   4549    movq               xm13, [base+subpel_filters+ r6*8]
   4550    movhps             xm12, [base+subpel_filters+ r7*8]
   4551    movhps             xm13, [base+subpel_filters+ r9*8]
   4552    movq               xm14, [base+subpel_filters+r10*8]
   4553    movq               xm15, [base+subpel_filters+r11*8]
   4554    movhps             xm14, [base+subpel_filters+r13*8]
   4555    movhps             xm15, [base+subpel_filters+ rX*8]
           ; Per-column integer pixel offsets = position >> 10.
   4556    psrld                m1, 10
   4557    vextracti128        xm7, m1, 1
   4558    vextracti128        xm6, m5, 1
   4559    movq                 r6, xm1
   4560    pextrq              r11, xm1, 1
   4561    movq                 r9, xm7
   4562    pextrq               rX, xm7, 1
   4563    mov                 r4d, r6d
   4564    shr                  r6, 32
   4565    mov                r10d, r11d
   4566    shr                 r11, 32
   4567    mov                 r7d, r9d
   4568    shr                  r9, 32
   4569    mov                r13d, rXd
   4570    shr                  rX, 32
   4571    pshufd              xm4, xm5, q2200
   4572    pshufd              xm5, xm5, q3311
   4573    pshufd              xm7, xm6, q2200
   4574    pshufd              xm6, xm6, q3311
   4575    pblendvb           xm12, xm2, xm4
   4576    pblendvb           xm13, xm2, xm5
   4577    pblendvb           xm14, xm2, xm7
   4578    pblendvb           xm15, xm2, xm6
   4579    pmovsxbw            m12, xm12
   4580    pmovsxbw            m13, xm13
   4581    pmovsxbw            m14, xm14
   4582    pmovsxbw            m15, xm15
           ; Horizontally filter the first 8 rows, then broadcast the constant
           ; vertical coefficient pairs from [rsp+0x50] into m8-m11.
   4583    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
   4584    mova         [rsp+0x80], m0
   4585    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
   4586    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
   4587    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
   4588    mova                 m0, [rsp+0x80]
   4589    vbroadcasti128       m7, [base+subpel_s_shuf8]
   4590    vpbroadcastd         m8, [rsp+0x50]
   4591    vpbroadcastd         m9, [rsp+0x54]
   4592    vpbroadcastd        m10, [rsp+0x58]
   4593    vpbroadcastd        m11, [rsp+0x5c]
   4594    pshufb               m0, m7     ; 01a 01b
   4595    pshufb               m1, m7     ; 23a 23b
   4596    pshufb               m2, m7     ; 45a 45b
   4597    pshufb               m3, m7     ; 67a 67b
           ; .dy1_vloop: one output row per iteration with the constant
           ; vertical filter in m8-m11; always refills exactly one source row
           ; (dy==1), merging it into the m0-m3 window with wswap + pblendw.
   4598 .dy1_vloop:
   4599    pmaddwd              m4, m0, m8
   4600    pmaddwd              m5, m1, m9
   4601    pmaddwd              m6, m2, m10
   4602    pmaddwd              m7, m3, m11
   4603    paddd                m4, [rsp+0x20]
   4604    paddd                m6, m7
   4605    paddd                m4, m5
   4606    paddd                m4, m6
   4607 %if isput
           ; put: shift by saved vertical shift, pack, clamp to pixel max.
   4608    psrad                m4, [rsp+0x48]
   4609    vextracti128        xm5, m4, 1
   4610    packusdw            xm4, xm5
   4611    pminsw              xm4, [rsp+0xc0]
   4612    mova             [dstq], xm4
   4613    add                dstq, dsm
   4614 %else
           ; prep: fixed >>6, signed pack to the intermediate buffer.
   4615    psrad                m4, 6
   4616    vextracti128        xm5, m4, 1
   4617    packssdw            xm4, xm5
   4618    mova             [tmpq], xm4
   4619    add                tmpq, tmp_stridem
   4620 %endif
   4621    dec                  hd
   4622    jz .dy1_hloop_prep
           ; Rotate 16-bit lanes so the freshly filtered row (shifted left 16)
           ; can be blended in with pblendw 0xaa.
   4623    vbroadcasti128       m7, [base+wswap]
   4624    pshufb               m0, m7
   4625    pshufb               m1, m7
   4626    pshufb               m2, m7
   4627    pshufb               m3, m7
           ; Load the next row at the 8 per-column offsets and filter it.
   4628    movu                xm4, [srcq+ r4*2]
   4629    movu                xm5, [srcq+ r6*2]
   4630    movu                xm6, [srcq+ r7*2]
   4631    movu                xm7, [srcq+ r9*2]
   4632    vinserti128          m4, [srcq+r10*2], 1
   4633    vinserti128          m5, [srcq+r11*2], 1
   4634    vinserti128          m6, [srcq+r13*2], 1
   4635    vinserti128          m7, [srcq+ rX*2], 1
   4636    add                srcq, ssq
   4637    pmaddwd              m4, m12
   4638    pmaddwd              m5, m13
   4639    pmaddwd              m6, m14
   4640    pmaddwd              m7, m15
   4641    phaddd               m4, m5
   4642    phaddd               m6, m7
   4643    phaddd               m4, m6
   4644    paddd                m4, [rsp+0x00]
   4645    psrad                m4, [rsp+0x40]
   4646    pslld                m4, 16
   4647    pblendw              m0, m1, 0xaa
   4648    pblendw              m1, m2, 0xaa
   4649    pblendw              m2, m3, 0xaa
   4650    pblendw              m3, m4, 0xaa
   4651    jmp .dy1_vloop
           ; Assembler-only register-name bookkeeping (unreachable code point).
   4652    SWAP                 m1, m12, m10
   4653    SWAP                 m7, m11
           ; .dy2: specialization for dy==2 (vertical step of exactly two rows
           ; per output row). Dispatch by width.
   4654 .dy2:
   4655    movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
   4656    add                  wq, base_reg
   4657    jmp                  wq
   4658 %if isput
   4659 .dy2_w2:
   4660    mov                 myd, mym
   4661    movzx               t0d, t0b
   4662    sub                srcq, 2
   4663    movd               xm15, t0d
   4664    punpckldq            m8, m9, m8
   4665    paddd               m10, m8 ; mx+dx*[0-1]
   4666    vpbroadcastd       xm14, [base+pq_0x40000000+2]
   4667    vpbroadcastd       xm15, xm15
   4668    pand                xm8, xm10, xm6
   4669    psrld               xm8, 6
   4670    paddd              xm15, xm8
   4671    movd                r4d, xm15
   4672    pextrd              r6d, xm15, 1
   4673    vbroadcasti128       m5, [base+bdct_lb_q]
   4674    vbroadcasti128       m6, [base+subpel_s_shuf2]
   4675    vpbroadcastd       xm15, [base+subpel_filters+r4*8+2]
   4676    vpbroadcastd        xm4, [base+subpel_filters+r6*8+2]
   4677    pcmpeqd             xm8, xm9
   4678    psrld               m10, 10
   4679    paddd               m10, m10
   4680    movu                xm0, [srcq+ssq*0]
   4681    movu                xm1, [srcq+ssq*2]
   4682    movu                xm2, [srcq+ssq*4]
   4683    pshufb              m10, m5
   4684    paddb               m10, m6
   4685    vpblendd           xm15, xm4, 0xa
   4686    pblendvb           xm15, xm14, xm8
   4687    pmovsxbw            m15, xm15
   4688    vinserti128          m0, [srcq+ssq*1], 1 ; 0 1
   4689    vinserti128          m1, [srcq+ss3q ], 1 ; 2 3
   4690    lea                srcq, [srcq+ssq*4]
   4691    vinserti128          m2, [srcq+ssq*1], 1 ; 4 5
   4692    lea                srcq, [srcq+ssq*2]
   4693    shr                 myd, 6
   4694    mov                 r4d, 64 << 24
   4695    lea                 myd, [t1+myq]
   4696    cmovnz              r4q, [base+subpel_filters+myq*8]
   4697    pshufb               m0, m10
   4698    pshufb               m1, m10
   4699    pshufb               m2, m10
   4700    pmaddwd              m0, m15
   4701    pmaddwd              m1, m15
   4702    pmaddwd              m2, m15
   4703    movq                xm6, r4q
   4704    pmovsxbw            xm6, xm6
   4705    phaddd               m0, m1
   4706    phaddd               m1, m2
   4707    paddd                m0, m12
   4708    paddd                m1, m12
   4709    psrad                m0, xm7
   4710    psrad                m1, xm7
   4711    packssdw             m0, m1             ; 0 2 2 4  1 3 3 5
   4712    vextracti128        xm1, m0, 1
   4713    pshufd              xm8, xm6, q0000
   4714    pshufd              xm9, xm6, q1111
   4715    pshufd             xm14, xm6, q2222
   4716    pshufd              xm6, xm6, q3333
   4717    punpcklwd           xm2, xm0, xm1       ; 01 23
   4718    punpckhwd           xm1, xm0, xm1       ; 23 45
   4719 .dy2_w2_loop:
   4720    movu                xm3, [srcq+ssq*0]
   4721    movu                xm5, [srcq+ssq*2]
   4722    vinserti128          m3, [srcq+ssq*1], 1 ; 6 7
   4723    vinserti128          m5, [srcq+ss3q ], 1 ; 8 9
   4724    lea                srcq, [srcq+ssq*4]
   4725    pmaddwd             xm4, xm2, xm8
   4726    pmaddwd             xm1, xm9
   4727    pshufb               m3, m10
   4728    pshufb               m5, m10
   4729    pmaddwd              m3, m15
   4730    pmaddwd              m5, m15
   4731    phaddd               m3, m5
   4732    paddd               xm4, xm1
   4733    paddd                m3, m12
   4734    psrad                m3, xm7
   4735    packssdw             m3, m3
   4736    pshufd               m3, m3, q2100
   4737    palignr              m0, m3, m0, 12     ; 4 6 6 8  5 7 7 9
   4738    vextracti128        xm1, m0, 1
   4739    punpcklwd           xm2, xm0, xm1       ; 45 67
   4740    punpckhwd           xm1, xm0, xm1       ; 67 89
   4741    pmaddwd             xm3, xm2, xm14
   4742    pmaddwd             xm5, xm1, xm6
   4743    paddd               xm4, xm13
   4744    paddd               xm4, xm3
   4745    psrldq              xm3, xm7, 8
   4746    paddd               xm4, xm5
   4747    psrad               xm4, xm3
   4748    packusdw            xm4, xm4
   4749    pminsw              xm4, xm11
   4750    movd       [dstq+dsq*0], xm4
   4751    pextrd     [dstq+dsq*1], xm4, 1
   4752    lea                dstq, [dstq+dsq*2]
   4753    sub                  hd, 2
   4754    jg .dy2_w2_loop
   4755    RET
   4756 %endif
   4757 .dy2_w4:
   4758    mov                 myd, mym
   4759 %if isput
   4760    mova         [rsp+0x50], xm11
   4761 %endif
   4762    mova         [rsp+0x00], m12
   4763    mova         [rsp+0x20], m13
   4764    mova         [rsp+0x40], xm7
   4765    vbroadcasti128       m7, [base+rescale_mul]
   4766    movzx               t0d, t0b
   4767    sub                srcq, 2
   4768    movd               xm15, t0d
   4769    pmaddwd              m8, m7
   4770    vpbroadcastq         m2, [base+pq_0x40000000+1]
   4771    vpbroadcastd       xm15, xm15
   4772    SWAP                m13, m10
   4773    paddd               m13, m8 ; mx+dx*[0-3]
   4774    pand                 m6, m13
   4775    psrld                m6, 6
   4776    paddd              xm15, xm6
   4777    movd                r4d, xm15
   4778    pextrd              r6d, xm15, 1
   4779    pextrd             r11d, xm15, 2
   4780    pextrd             r13d, xm15, 3
   4781    vbroadcasti128       m5, [base+bdct_lb_q+ 0]
   4782    vbroadcasti128       m1, [base+bdct_lb_q+16]
   4783    vbroadcasti128       m4, [base+subpel_s_shuf2]
   4784    vpbroadcastd       xm14, [base+subpel_filters+r4*8+2]
   4785    vpbroadcastd        xm7, [base+subpel_filters+r6*8+2]
   4786    vpbroadcastd       xm15, [base+subpel_filters+r11*8+2]
   4787    vpbroadcastd        xm8, [base+subpel_filters+r13*8+2]
   4788    shr                 myd, 6
   4789    mov                r13d, 64 << 24
   4790    lea                 myd, [t1+myq]
   4791    cmovnz             r13q, [base+subpel_filters+myq*8]
   4792    pcmpeqd              m6, m9
   4793    punpckldq           m11, m6, m6
   4794    punpckhdq            m6, m6
   4795    psrld               m13, 10
   4796    paddd               m13, m13
   4797    vpblendd           xm14, xm7, 0xa
   4798    vpblendd           xm15, xm8, 0xa
   4799    pmovsxbw            m14, xm14
   4800    pmovsxbw            m15, xm15
   4801    movq               xm10, r13q
   4802    pblendvb            m14, m2, m11
   4803    pblendvb            m15, m2, m6
   4804    pextrd               r4, xm13, 2
   4805    pshufb              m12, m13, m5
   4806    pshufb              m13, m1
   4807    lea                  r6, [r4+ssq*1]
   4808    lea                 r11, [r4+ssq*2]
   4809    lea                 r13, [r4+ss3q ]
   4810    movu                xm0, [srcq+ssq*0]
   4811    movu                xm7, [srcq+r4   ]
   4812    movu                xm1, [srcq+ssq*1]
   4813    movu                xm8, [srcq+r6   ]
   4814    vinserti128          m0, [srcq+ssq*2], 1 ; 0 2
   4815    vinserti128          m7, [srcq+r11  ], 1
   4816    vinserti128          m1, [srcq+ss3q ], 1 ; 1 3
   4817    vinserti128          m8, [srcq+r13  ], 1
   4818    lea                srcq, [srcq+ssq*4]
   4819    movu                xm2, [srcq+ssq*0]
   4820    movu                xm9, [srcq+r4   ]
   4821    vinserti128          m2, [srcq+ssq*1], 1 ; 4 5
   4822    vinserti128          m9, [srcq+r6   ], 1
   4823    lea                srcq, [srcq+ssq*2]
   4824    vpbroadcastb         m5, xm13
   4825    psubb               m13, m5
   4826    paddb               m12, m4
   4827    paddb               m13, m4
   4828    mova                 m5, [rsp+0x00]
   4829    movd                xm6, [rsp+0x40]
   4830    pshufb               m0, m12
   4831    pshufb               m1, m12
   4832    pshufb               m2, m12
   4833    pmaddwd              m0, m14
   4834    pmaddwd              m1, m14
   4835    pmaddwd              m2, m14
   4836    pshufb               m7, m13
   4837    pshufb               m8, m13
   4838    pshufb               m9, m13
   4839    pmaddwd              m7, m15
   4840    pmaddwd              m8, m15
   4841    pmaddwd              m9, m15
   4842    punpcklqdq         xm10, xm10
   4843    pmovsxbw            m10, xm10
   4844    phaddd               m0, m7
   4845    phaddd               m1, m8
   4846    phaddd               m2, m9
   4847    paddd                m0, m5
   4848    paddd                m1, m5
   4849    paddd                m2, m5
   4850    psrad                m0, xm6
   4851    psrad                m1, xm6
   4852    psrad                m2, xm6
   4853    vperm2i128           m3, m0, m2, 0x21 ; 2 4
   4854    vperm2i128           m2, m1, 0x13     ; 3 5
   4855    pshufd               m7, m10, q0000
   4856    pshufd               m8, m10, q1111
   4857    pshufd               m9, m10, q2222
   4858    pshufd              m10, m10, q3333
   4859    packssdw             m0, m3 ; 0 2  2 4
   4860    packssdw             m1, m2 ; 1 3  3 5
   4861    punpckhwd            m2, m0, m1 ; 23 45
   4862    punpcklwd            m0, m1     ; 01 23
   4863 .dy2_w4_loop:
   4864    movu                xm1, [srcq+ssq*0]
   4865    movu                xm6, [srcq+r4   ]
   4866    movu                xm3, [srcq+ssq*1]
   4867    movu               xm11, [srcq+r6   ]
   4868    vinserti128          m1, [srcq+ssq*2], 1 ; 6 8
   4869    vinserti128          m6, [srcq+r11  ], 1
   4870    vinserti128          m3, [srcq+ss3q ], 1 ; 7 9
   4871    vinserti128         m11, [srcq+r13  ], 1
   4872    lea                srcq, [srcq+ssq*4]
   4873    pmaddwd              m4, m0, m7
   4874    pmaddwd              m5, m2, m8
   4875    pshufb               m1, m12
   4876    pshufb               m3, m12
   4877    pmaddwd              m1, m14
   4878    pmaddwd              m3, m14
   4879    mova                 m0, [rsp+0x00]
   4880    pshufb               m6, m13
   4881    pshufb              m11, m13
   4882    pmaddwd              m6, m15
   4883    pmaddwd             m11, m15
   4884    paddd                m4, m5
   4885    movd                xm5, [rsp+0x40]
   4886    phaddd               m1, m6
   4887    phaddd               m3, m11
   4888    paddd                m1, m0
   4889    paddd                m3, m0
   4890    psrad                m1, xm5
   4891    psrad                m3, xm5
   4892    pslld                m3, 16
   4893    pblendw              m1, m3, 0xaa     ; 67 89
   4894    vperm2i128           m0, m2, m1, 0x21 ; 45 67
   4895    paddd                m4, [rsp+0x20]
   4896    mova                 m2, m1
   4897    pmaddwd              m5, m0, m9
   4898    pmaddwd              m6, m2, m10
   4899    paddd                m4, m5
   4900    paddd                m4, m6
   4901 %if isput
   4902    psrad                m4, [rsp+0x48]
   4903    vextracti128        xm5, m4, 1
   4904    packusdw            xm4, xm5
   4905    pminsw              xm4, [rsp+0x50]
   4906    movq       [dstq+dsq*0], xm4
   4907    movhps     [dstq+dsq*1], xm4
   4908    lea                dstq, [dstq+dsq*2]
   4909 %else
   4910    psrad                m4, 6
   4911    vextracti128        xm5, m4, 1
   4912    packssdw            xm4, xm5
   4913    mova             [tmpq], xm4
   4914    add                tmpq, 16
   4915 %endif
   4916    sub                  hd, 2
   4917    jg .dy2_w4_loop
   4918    MC_8TAP_SCALED_RET
   4919    SWAP                m10, m13
   4920 .dy2_w8:
   4921    mov    dword [rsp+0xa0], 1
   4922    movifprep   tmp_stridem, 16
   4923    jmp .dy2_w_start
   4924 .dy2_w16:
   4925    mov    dword [rsp+0xa0], 2
   4926    movifprep   tmp_stridem, 32
   4927    jmp .dy2_w_start
   4928 .dy2_w32:
   4929    mov    dword [rsp+0xa0], 4
   4930    movifprep   tmp_stridem, 64
   4931    jmp .dy2_w_start
   4932 .dy2_w64:
   4933    mov    dword [rsp+0xa0], 8
   4934    movifprep   tmp_stridem, 128
   4935    jmp .dy2_w_start
   4936 .dy2_w128:
   4937    mov    dword [rsp+0xa0], 16
   4938    movifprep   tmp_stridem, 256
   4939 .dy2_w_start:
   4940    SWAP                m10, m12, m1
   4941    SWAP                m11, m7
   4942    ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
   4943    mov                 myd, mym
   4944 %if isput
   4945    movifnidn           dsm, dsq
   4946    mova         [rsp+0xc0], xm7
   4947 %endif
   4948    mova         [rsp+0x00], m10
   4949    mova         [rsp+0x20], m13
   4950    mova         [rsp+0x40], xm11
   4951    shr                 t0d, 16
   4952    sub                srcq, 6
   4953    shr                 myd, 6
   4954    mov                 r4d, 64 << 24
   4955    lea                 myd, [t1+myq]
   4956    cmovnz              r4q, [base+subpel_filters+myq*8]
   4957    pmaddwd              m8, [base+rescale_mul2]
   4958    movd               xm15, t0d
   4959    mov          [rsp+0xa4], t0d
   4960    mov          [rsp+0xa8], srcq
   4961    mov          [rsp+0xb0], r0q ; dstq / tmpq
   4962 %if UNIX64
   4963    mov                  hm, hd
   4964 %endif
   4965    shl           dword dxm, 3 ; dx*8
   4966    vpbroadcastd        m15, xm15
   4967    paddd                m1, m8 ; mx+dx*[0-7]
   4968    movq                xm0, r4q
   4969    pmovsxbw            xm0, xm0
   4970    mova         [rsp+0x50], xm0
   4971    jmp .dy2_hloop
   4972 .dy2_hloop_prep:
   4973    dec    dword [rsp+0xa0]
   4974    jz .ret
   4975    add    qword [rsp+0xb0], 16
   4976    mov                  hd, hm
   4977    vpbroadcastd         m8, dxm
   4978    vpbroadcastd         m6, [base+pd_0x3ff]
   4979    paddd                m1, m8, [rsp+0x60]
   4980    vpbroadcastd        m15, [rsp+0xa4]
   4981    pxor                 m9, m9
   4982    mov                srcq, [rsp+0xa8]
   4983    mov                 r0q, [rsp+0xb0] ; dstq / tmpq
   4984    mova                m10, [rsp+0x00]
   4985    mova               xm11, [rsp+0x40]
   4986 .dy2_hloop:
   4987    vpbroadcastq        xm2, [base+pq_0x40000000]
   4988    pand                 m5, m1, m6
   4989    psrld                m5, 6
   4990    paddd               m15, m5
   4991    pcmpeqd              m5, m9
   4992    vextracti128        xm7, m15, 1
   4993    movq                 r6, xm15
   4994    pextrq               r9, xm15, 1
   4995    movq                r11, xm7
   4996    pextrq               rX, xm7, 1
   4997    mov                 r4d, r6d
   4998    shr                  r6, 32
   4999    mov                 r7d, r9d
   5000    shr                  r9, 32
   5001    mov                r10d, r11d
   5002    shr                 r11, 32
   5003    mov                r13d, rXd
   5004    shr                  rX, 32
   5005    mova         [rsp+0x60], m1
   5006    movq               xm12, [base+subpel_filters+ r4*8]
   5007    movq               xm13, [base+subpel_filters+ r6*8]
   5008    movhps             xm12, [base+subpel_filters+ r7*8]
   5009    movhps             xm13, [base+subpel_filters+ r9*8]
   5010    movq               xm14, [base+subpel_filters+r10*8]
   5011    movq               xm15, [base+subpel_filters+r11*8]
   5012    movhps             xm14, [base+subpel_filters+r13*8]
   5013    movhps             xm15, [base+subpel_filters+ rX*8]
   5014    psrld                m1, 10
   5015    vextracti128        xm7, m1, 1
   5016    vextracti128        xm6, m5, 1
   5017    movq                 r6, xm1
   5018    pextrq              r11, xm1, 1
   5019    movq                 r9, xm7
   5020    pextrq               rX, xm7, 1
   5021    mov                 r4d, r6d
   5022    shr                  r6, 32
   5023    mov                r10d, r11d
   5024    shr                 r11, 32
   5025    mov                 r7d, r9d
   5026    shr                  r9, 32
   5027    mov                r13d, rXd
   5028    shr                  rX, 32
   5029    pshufd              xm4, xm5, q2200
   5030    pshufd              xm5, xm5, q3311
   5031    pshufd              xm7, xm6, q2200
   5032    pshufd              xm6, xm6, q3311
   5033    pblendvb           xm12, xm2, xm4
   5034    pblendvb           xm13, xm2, xm5
   5035    pblendvb           xm14, xm2, xm7
   5036    pblendvb           xm15, xm2, xm6
   5037    pmovsxbw            m12, xm12
   5038    pmovsxbw            m13, xm13
   5039    pmovsxbw            m14, xm14
   5040    pmovsxbw            m15, xm15
   5041    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
   5042    mova         [rsp+0x80], m0
   5043    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
   5044    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
   5045    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
   5046    mova                 m0, [rsp+0x80]
   5047    vbroadcasti128       m7, [base+subpel_s_shuf8]
   5048    vpbroadcastd         m8, [rsp+0x50]
   5049    vpbroadcastd         m9, [rsp+0x54]
   5050    vpbroadcastd        m10, [rsp+0x58]
   5051    vpbroadcastd        m11, [rsp+0x5c]
   5052    pshufb               m0, m7     ; 01a 01b
   5053    pshufb               m1, m7     ; 23a 23b
   5054    pshufb               m2, m7     ; 45a 45b
   5055    pshufb               m3, m7     ; 67a 67b
   5056 .dy2_vloop:
   5057    pmaddwd              m4, m0, m8
   5058    pmaddwd              m5, m1, m9
   5059    pmaddwd              m6, m2, m10
   5060    pmaddwd              m7, m3, m11
   5061    paddd                m4, [rsp+0x20]
   5062    paddd                m6, m7
   5063    paddd                m4, m5
   5064    paddd                m4, m6
   5065 %if isput
   5066    psrad                m4, [rsp+0x48]
   5067    vextracti128        xm5, m4, 1
   5068    packusdw            xm4, xm5
   5069    pminsw              xm4, [rsp+0xc0]
   5070    mova             [dstq], xm4
   5071    add                dstq, dsm
   5072 %else
   5073    psrad                m4, 6
   5074    vextracti128        xm5, m4, 1
   5075    packssdw            xm4, xm5
   5076    mova             [tmpq], xm4
   5077    add                tmpq, tmp_stridem
   5078 %endif
   5079    dec                  hd
   5080    jz .dy2_hloop_prep
   5081    mova                 m0, m1
   5082    mova                 m1, m2
   5083    mova                 m2, m3
   5084    movu                xm3, [srcq+ r4*2]
   5085    movu                xm4, [srcq+ r6*2]
   5086    movu                xm5, [srcq+ r7*2]
   5087    movu                xm6, [srcq+ r9*2]
   5088    vinserti128          m3, [srcq+r10*2], 1
   5089    vinserti128          m4, [srcq+r11*2], 1
   5090    vinserti128          m5, [srcq+r13*2], 1
   5091    vinserti128          m6, [srcq+ rX*2], 1
   5092    add                srcq, ssq
   5093    pmaddwd              m3, m12
   5094    pmaddwd              m4, m13
   5095    pmaddwd              m5, m14
   5096    pmaddwd              m6, m15
   5097    phaddd               m3, m4
   5098    phaddd               m5, m6
   5099    phaddd               m3, m5
   5100    movu                xm4, [srcq+ r4*2]
   5101    movu                xm5, [srcq+ r6*2]
   5102    movu                xm6, [srcq+ r7*2]
   5103    movu                xm7, [srcq+ r9*2]
   5104    vinserti128          m4, [srcq+r10*2], 1
   5105    vinserti128          m5, [srcq+r11*2], 1
   5106    vinserti128          m6, [srcq+r13*2], 1
   5107    vinserti128          m7, [srcq+ rX*2], 1
   5108    add                srcq, ssq
   5109    pmaddwd              m4, m12
   5110    pmaddwd              m5, m13
   5111    pmaddwd              m6, m14
   5112    pmaddwd              m7, m15
   5113    phaddd               m4, m5
   5114    phaddd               m6, m7
   5115    mova                 m5, [rsp+0x00]
   5116    movd                xm7, [rsp+0x40]
   5117    phaddd               m4, m6
   5118    paddd                m3, m5
   5119    paddd                m4, m5
   5120    psrad                m3, xm7
   5121    psrad                m4, xm7
   5122    pslld                m4, 16
   5123    pblendw              m3, m4, 0xaa
   5124    jmp .dy2_vloop
   5125 .ret:
   5126    MC_8TAP_SCALED_RET 0
   5127 %undef isput
   5128 %undef isprep
   5129 %endmacro
   5130 
   5131 %macro BILIN_SCALED_FN 1
        ; Emit the %1_bilin_scaled_16bpc entry point (%1 = put/prep).
        ; Bilinear scaling is implemented by tail-jumping into the shared
        ; 8-tap scaled code with both filter-type selectors set to 5*15
        ; (presumably the bilinear entry of the subpel filter table; t0d
        ; packs one selector in the high 16 bits and one in the low 16 bits
        ; -- confirm against how the 8tap_scaled entry decodes t0/t1).
   5132 cglobal %1_bilin_scaled_16bpc
   5133    mov                 t0d, (5*15 << 16) | 5*15
   5134    mov                 t1d, t0d
        ; Tail-call (jmp, not call) into the 8-tap scaled implementation.
   5135    jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
   5136 %endmacro
   5137 
        ; Pick the temporary GPR indices for the put path; they differ per
        ; ABI because the available volatile registers differ.
   5138 %if WIN64
   5139 DECLARE_REG_TMP 6, 5
   5140 %else
   5141 DECLARE_REG_TMP 6, 8
   5142 %endif
   5143 
        ; Instantiate every put_8tap_scaled filter-combination entry point.
        ; Each FN stub records its (horizontal, vertical) filter types and
        ; jumps to the named shared entry; the final (REGULAR, REGULAR)
        ; variant has no jump target and falls straight through into the
        ; MC_8TAP_SCALED put body expanded immediately below it.
   5144 %define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
   5145 BILIN_SCALED_FN put
   5146 PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_16bpc
   5147 PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_16bpc
   5148 PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_16bpc
   5149 PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_16bpc
   5150 PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_16bpc
   5151 PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_16bpc
   5152 PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_16bpc
   5153 PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_16bpc
   5154 PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
        ; Expand the shared scaled-MC implementation in "put" mode.
   5155 MC_8TAP_SCALED put
   5156 
        ; Pick the temporary GPR indices for the prep path (ABI-dependent,
        ; and different from the put path's choices).
   5157 %if WIN64
   5158 DECLARE_REG_TMP 5, 4
   5159 %else
   5160 DECLARE_REG_TMP 6, 7
   5161 %endif
   5162 
        ; Instantiate every prep_8tap_scaled filter-combination entry point,
        ; mirroring the put-side block above. The final (REGULAR, REGULAR)
        ; variant has no jump target and falls through into the
        ; MC_8TAP_SCALED prep body expanded immediately below it.
   5163 %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
   5164 BILIN_SCALED_FN prep
   5165 PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_16bpc
   5166 PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_16bpc
   5167 PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_16bpc
   5168 PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_16bpc
   5169 PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_16bpc
   5170 PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_16bpc
   5171 PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_16bpc
   5172 PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_16bpc
   5173 PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
        ; Expand the shared scaled-MC implementation in "prep" mode.
   5174 MC_8TAP_SCALED prep
   5175 
   5176 %macro WARP_V 5 ; dst, 01, 23, 45, 67
        ; Vertical 8-tap filter step of the affine warp for one output row
        ; of 8 columns (a..h).
        ;   %1      = destination accumulator register index (32-bit sums)
        ;   %2..%5  = register pipeline of interleaved row pairs (rows 0/1,
        ;             2/3, 4/5, 6/7); rotated at the end so the caller's %2
        ;             becomes the old %3, etc., ready for the next row.
        ; myd advances by deltaq per column and by gammaq once per row
        ; (see "my += gamma" below). Filter coefficients are fetched per
        ; column from filterq, 8 bytes per entry, indexed by my >> 10.
   5177    lea               tmp1d, [myq+deltaq*4]
   5178    lea               tmp2d, [myq+deltaq*1]
   5179    shr                 myd, 10
   5180    shr               tmp1d, 10
        ; Columns are paired across ymm lanes: low xmm lane holds the left
        ; four columns (a..d), high lane the right four (e..h).
   5181    movq                xm8, [filterq+myq  *8]
   5182    vinserti128          m8, [filterq+tmp1q*8], 1 ; a e
   5183    lea               tmp1d, [tmp2q+deltaq*4]
   5184    lea                 myd, [tmp2q+deltaq*1]
   5185    shr               tmp2d, 10
   5186    shr               tmp1d, 10
   5187    movq                xm0, [filterq+tmp2q*8]
   5188    vinserti128          m0, [filterq+tmp1q*8], 1 ; b f
   5189    lea               tmp1d, [myq+deltaq*4]
   5190    lea               tmp2d, [myq+deltaq*1]
   5191    shr                 myd, 10
   5192    shr               tmp1d, 10
   5193    movq                xm9, [filterq+myq  *8]
   5194    vinserti128          m9, [filterq+tmp1q*8], 1 ; c g
   5195    lea               tmp1d, [tmp2q+deltaq*4]
   5196    lea                 myd, [tmp2q+gammaq]       ; my += gamma
   5197    punpcklwd            m8, m0
   5198    shr               tmp2d, 10
   5199    shr               tmp1d, 10
   5200    movq                xm0, [filterq+tmp2q*8]
   5201    vinserti128          m0, [filterq+tmp1q*8], 1 ; d h
        ; Transpose the four per-column coefficient vectors so each register
        ; holds one tap pair for all four (eight) columns.
   5202    punpcklwd            m0, m9, m0
   5203    punpckldq            m9, m8, m0
   5204    punpckhdq            m0, m8, m0
        ; m11 is zero (cleared in warp_affine .main); interleaving the s8
        ; coefficients behind a zero byte widens them to s16 scaled by 256
        ; ("<< 8"), which pmaddwd then multiplies against the pixel words.
   5205    punpcklbw            m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
   5206    punpckhbw            m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
   5207    pmaddwd             m%2, m8
   5208    pmaddwd              m9, m%3
   5209    punpcklbw            m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
   5210    punpckhbw            m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
   5211    pmaddwd              m8, m%4
   5212    pmaddwd              m0, m%5
        ; Sum the four tap-pair products and rotate the row pipeline.
   5213    paddd                m9, m%2
   5214    mova                m%2, m%3
   5215    paddd                m0, m8
   5216    mova                m%3, m%4
   5217    mova                m%4, m%5
   5218    paddd               m%1, m0, m9
   5219 %endmacro
   5220 
        ; warp_affine_8x8t_16bpc(tmp, ts, ...): affine warp of an 8x8 block
        ; into the 16-bit intermediate ("prep"/tmp) buffer. Shares all of
        ; the filtering with warp_affine_8x8_16bpc via its .main/.main2
        ; helpers; only the rounding/shift and the store path differ.
   5221 cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts
        ; r7m >> 11 selects a per-bitdepth entry (r7m is presumably
        ; pixel_max/bitdepth_max -- 0x3ff vs 0xfff; TODO confirm against the
        ; C prototype). m13 = per-bitdepth horizontal shift, m14 = rounding.
   5222    mov                 r6d, r7m
   5223    lea                  r9, [$$]
   5224    shr                 r6d, 11
   5225    vpbroadcastd        m13, [r9-$$+warp8x8_shift+r6*4]
   5226    vpbroadcastd        m14, [warp8x8t_rnd]
   5227    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main
   5228    jmp .start
   5229 .loop:
   5230    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2
   5231    lea                tmpq, [tmpq+tsq*4]
   5232 .start:
        ; .main/.main2 leave two filtered rows in m7/m0 (32-bit sums);
        ; round, shift down to 16 bits and store two rows per iteration.
   5233    paddd                m7, m14
   5234    paddd                m0, m14
   5235    psrad                m7, 15
   5236    psrad                m0, 15
   5237    packssdw             m7, m0
   5238    vpermq               m7, m7, q3120 ; undo the lane interleave from packing
   5239    mova         [tmpq+tsq*0], xm7
   5240    vextracti128 [tmpq+tsq*2], m7, 1
   5241    dec                 r4d    ; r4d = remaining row pairs (set to 4 in .main)
   5242    jg .loop
   5243 .end:
   5244    RET
   5245 
   5246 cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \
   5247                                          alpha, beta, filter, tmp1, delta, \
   5248                                          my, gamma
        ; Affine warp of an 8x8 block of 16-bit pixels into dst.
        ; abcd points at the packed s16 affine parameters: [alpha, beta,
        ; delta, gamma]. mx/my are the subpel start positions; the .h helper
        ; performs the horizontal 8-tap pass one row at a time and .main2
        ; runs the vertical pass (WARP_V) for two output rows per call.
   5249    mov                 r6d, r7m
   5250    lea             filterq, [$$]
   5251    shr                 r6d, 11
        ; r7m >> 11: per-bitdepth table index (r7m presumably holds
        ; pixel_max -- TODO confirm). m13 = horizontal shift amounts,
        ; m14 = final rounding multiplier, m15 = clamp to pixel_max.
   5252    vpbroadcastd        m13, [filterq-$$+warp8x8_shift+r6*4]
   5253    vpbroadcastd        m14, [filterq-$$+warp8x8_rnd  +r6*4]
   5254    vpbroadcastw        m15, r7m ; pixel_max
   5255    call .main
   5256    jmp .start
   5257 .loop:
   5258    call .main2
   5259    lea                dstq, [dstq+dsq*2]
   5260 .start:
        ; Two rows of 32-bit vertical sums in m7/m0: shift, pack with
        ; unsigned saturation, round via pmulhrsw and clamp to pixel_max.
   5261    psrad                m7, 16
   5262    psrad                m0, 16
   5263    packusdw             m7, m0
   5264    pmulhrsw             m7, m14
   5265    pminsw               m7, m15
   5266    vpermq               m7, m7, q3120 ; undo lane interleave from packusdw
   5267    mova         [dstq+dsq*0], xm7
   5268    vextracti128 [dstq+dsq*1], m7, 1
   5269    dec                 r4d
   5270    jg .loop
   5271 .end:
   5272    RET
   5273 ALIGN function_align
   5274 .main:
   5275    ; Stack args offset by one (r4m -> r5m etc.) due to call
   5276 %if WIN64
   5277    mov               abcdq, r5m
   5278    mov                 mxd, r6m
   5279 %endif
   5280    movsx            alphad, word [abcdq+2*0]
   5281    movsx             betad, word [abcdq+2*1]
   5282    vpbroadcastd        m12, [pd_32768]
   5283    pxor                m11, m11 ; zero; used by .h and WARP_V to widen s8 coeffs
   5284    add             filterq, mc_warp_filter-$$
   5285    lea               tmp1q, [ssq*3]
   5286    add                 mxd, 512+(64<<10)
   5287    lea               tmp2d, [alphaq*3]
   5288    sub                srcq, tmp1q    ; src -= src_stride*3
   5289    sub               betad, tmp2d    ; beta -= alpha*3
   5290    mov                 myd, r7m
        ; Prime the vertical-filter pipeline: run the horizontal pass on the
        ; first 7 rows and build the interleaved row pairs 01..56 in m1..m6.
        ; .h leaves the new row in the upper 16 bits of each dword of m0, so
        ; psrld+pblendw pairs the previous row (low words) with the new one.
   5291    call .h
   5292    psrld                m1, m0, 16
   5293    call .h
   5294    pblendw              m1, m0, 0xaa ; 01
   5295    psrld                m2, m0, 16
   5296    call .h
   5297    pblendw              m2, m0, 0xaa ; 12
   5298    psrld                m3, m0, 16
   5299    call .h
   5300    pblendw              m3, m0, 0xaa ; 23
   5301    psrld                m4, m0, 16
   5302    call .h
   5303    pblendw              m4, m0, 0xaa ; 34
   5304    psrld                m5, m0, 16
   5305    call .h
   5306    pblendw              m5, m0, 0xaa ; 45
   5307    psrld                m6, m0, 16
   5308    call .h
   5309    pblendw              m6, m0, 0xaa ; 56
   5310    movsx            deltad, word [abcdq+2*2]
   5311    movsx            gammad, word [abcdq+2*3]
   5312    add                 myd, 512+(64<<10)
   5313    mov                 r4d, 4 ; 4 iterations x 2 rows = 8 output rows
   5314    lea               tmp1d, [deltaq*3]
   5315    sub              gammad, tmp1d    ; gamma -= delta*3
   5316 .main2:
        ; Produce two output rows: horizontal-filter a new source row, form
        ; the next row pair, then vertical-filter with WARP_V (which also
        ; rotates the m1..m6 pipeline). Results left in m7 and m0.
   5317    call .h
   5318    psrld                m7, m6, 16
   5319    pblendw              m7, m0, 0xaa ; 67
   5320    WARP_V                7, 1, 3, 5, 7
   5321    call .h
   5322    psrld               m10, m5, 16
   5323    pblendw             m10, m0, 0xaa ; 78
   5324    WARP_V                0, 2, 4, 6, 10
   5325    ret
   5326 ALIGN function_align
   5327 .h:
        ; Horizontal 8-tap pass for one source row of 8 columns.
        ; mx advances by alpha per column and by beta per row; per-column
        ; filters are fetched from filterq indexed by mx >> 10. Source
        ; windows are loaded at staggered offsets so each phaddd pairs up
        ; two columns; the final result has the rounded 14-bit samples in
        ; the upper 16 bits of each dword of m0 (consumed by the pblendw
        ; pairing in .main/.main2).
   5328    lea               tmp1d, [mxq+alphaq*4]
   5329    lea               tmp2d, [mxq+alphaq*1]
   5330    movu               xm10, [srcq-6]
   5331    vinserti128         m10, [srcq+2], 1
   5332    shr                 mxd, 10 ; 0
   5333    shr               tmp1d, 10 ; 4
   5334    movq                xm0, [filterq+mxq  *8]
   5335    vinserti128          m0, [filterq+tmp1q*8], 1
   5336    lea               tmp1d, [tmp2q+alphaq*4]
   5337    lea                 mxd, [tmp2q+alphaq*1]
   5338    movu                xm8, [srcq-4]
   5339    vinserti128          m8, [srcq+4], 1
   5340    shr               tmp2d, 10 ; 1
   5341    shr               tmp1d, 10 ; 5
   5342    movq                xm9, [filterq+tmp2q*8]
   5343    vinserti128          m9, [filterq+tmp1q*8], 1
   5344    lea               tmp1d, [mxq+alphaq*4]
   5345    lea               tmp2d, [mxq+alphaq*1]
   5346    shr                 mxd, 10 ; 2
   5347    shr               tmp1d, 10 ; 6
        ; punpcklbw against zero m11 widens the s8 coefficients to s16<<8.
   5348    punpcklbw            m0, m11, m0
   5349    pmaddwd              m0, m10
   5350    movu               xm10, [srcq-2]
   5351    vinserti128         m10, [srcq+6], 1
   5352    punpcklbw            m9, m11, m9
   5353    pmaddwd              m9, m8
   5354    movq                xm8, [filterq+mxq  *8]
   5355    vinserti128          m8, [filterq+tmp1q*8], 1
   5356    lea               tmp1d, [tmp2q+alphaq*4]
   5357    lea                 mxd, [tmp2q+betaq] ; mx += beta
   5358    phaddd               m0, m9 ; 0 1   4 5
   5359    movu                xm9, [srcq+0]
   5360    vinserti128          m9, [srcq+8], 1
   5361    shr               tmp2d, 10 ; 3
   5362    shr               tmp1d, 10 ; 7
   5363    punpcklbw            m8, m11, m8
   5364    pmaddwd              m8, m10
   5365    movq               xm10, [filterq+tmp2q*8]
   5366    vinserti128         m10, [filterq+tmp1q*8], 1
   5367    punpcklbw           m10, m11, m10
   5368    pmaddwd              m9, m10
   5369    add                srcq, ssq
   5370    phaddd               m8, m9 ; 2 3   6 7
   5371    phaddd               m0, m8 ; 0 1 2 3   4 5 6 7
        ; Per-bitdepth variable left shift, then add 0x8000 so the rounded
        ; 14-bit result lands in the upper 16 bits of each dword.
   5372    vpsllvd              m0, m13
   5373    paddd                m0, m12 ; rounded 14-bit result in upper 16 bits of dword
   5374    ret
   5375 
   5376 %macro BIDIR_FN 0
; Shared store loop for the bidirectional compound functions (avg, w_avg,
; mask). Entry state set up by the caller: wq = per-width jump target,
; hd = row count, strideq = dst stride; each call of the caller-provided
; .main yields four 32-byte registers m0-m3 of finished output pixels
; (64 pixels total per call).
   5377    call .main
   5378    lea            stride3q, [strideq*3]
   5379    jmp                  wq
   5380 .w4:
; w == 4: each ymm holds four 4-pixel (8-byte) rows; heights 4/8/16 are
; fully unrolled with early exits after the first one/two registers.
   5381    movq   [dstq          ], xm0
   5382    movhps [dstq+strideq*1], xm0
   5383    vextracti128        xm0, m0, 1
   5384    movq   [dstq+strideq*2], xm0
   5385    movhps [dstq+stride3q ], xm0
   5386    cmp                  hd, 4
   5387    je .ret
   5388    lea                dstq, [dstq+strideq*4]
   5389    movq   [dstq          ], xm1
   5390    movhps [dstq+strideq*1], xm1
   5391    vextracti128        xm1, m1, 1
   5392    movq   [dstq+strideq*2], xm1
   5393    movhps [dstq+stride3q ], xm1
   5394    cmp                  hd, 8
   5395    je .ret
   5396    lea                dstq, [dstq+strideq*4]
   5397    movq   [dstq          ], xm2
   5398    movhps [dstq+strideq*1], xm2
   5399    vextracti128        xm2, m2, 1
   5400    movq   [dstq+strideq*2], xm2
   5401    movhps [dstq+stride3q ], xm2
   5402    lea                dstq, [dstq+strideq*4]
   5403    movq   [dstq          ], xm3
   5404    movhps [dstq+strideq*1], xm3
   5405    vextracti128        xm3, m3, 1
   5406    movq   [dstq+strideq*2], xm3
   5407    movhps [dstq+stride3q ], xm3
   5408 .ret:
   5409    RET
   5410 .w8:
; w == 8: each ymm holds two 16-byte rows; h == 4 needs only m0/m1, so it
; returns before entering the 8-row loop.
   5411    mova         [dstq+strideq*0], xm0
   5412    vextracti128 [dstq+strideq*1], m0, 1
   5413    mova         [dstq+strideq*2], xm1
   5414    vextracti128 [dstq+stride3q ], m1, 1
   5415    cmp                  hd, 4
   5416    jne .w8_loop_start
   5417    RET
   5418 .w8_loop:
   5419    call .main
   5420    lea                dstq, [dstq+strideq*4]
   5421    mova         [dstq+strideq*0], xm0
   5422    vextracti128 [dstq+strideq*1], m0, 1
   5423    mova         [dstq+strideq*2], xm1
   5424    vextracti128 [dstq+stride3q ], m1, 1
   5425 .w8_loop_start:
   5426    lea                dstq, [dstq+strideq*4]
   5427    mova         [dstq+strideq*0], xm2
   5428    vextracti128 [dstq+strideq*1], m2, 1
   5429    mova         [dstq+strideq*2], xm3
   5430    vextracti128 [dstq+stride3q ], m3, 1
   5431    sub                  hd, 8
   5432    jg .w8_loop
   5433    RET
   5434 .w16_loop:
   5435    call .main
   5436    lea                dstq, [dstq+strideq*4]
   5437 .w16:
; w == 16: one full ymm per row, four rows per .main call.
   5438    mova   [dstq+strideq*0], m0
   5439    mova   [dstq+strideq*1], m1
   5440    mova   [dstq+strideq*2], m2
   5441    mova   [dstq+stride3q ], m3
   5442    sub                  hd, 4
   5443    jg .w16_loop
   5444    RET
   5445 .w32_loop:
   5446    call .main
   5447    lea                dstq, [dstq+strideq*2]
   5448 .w32:
; w == 32: two ymm per row, two rows per .main call.
   5449    mova [dstq+strideq*0+32*0], m0
   5450    mova [dstq+strideq*0+32*1], m1
   5451    mova [dstq+strideq*1+32*0], m2
   5452    mova [dstq+strideq*1+32*1], m3
   5453    sub                  hd, 2
   5454    jg .w32_loop
   5455    RET
   5456 .w64_loop:
   5457    call .main
   5458    add                dstq, strideq
   5459 .w64:
; w == 64: one .main call covers exactly one row.
   5460    mova        [dstq+32*0], m0
   5461    mova        [dstq+32*1], m1
   5462    mova        [dstq+32*2], m2
   5463    mova        [dstq+32*3], m3
   5464    dec                  hd
   5465    jg .w64_loop
   5466    RET
   5467 .w128_loop:
   5468    call .main
   5469    add                dstq, strideq
   5470 .w128:
; w == 128: two .main calls per row (left and right 64-pixel halves).
   5471    mova        [dstq+32*0], m0
   5472    mova        [dstq+32*1], m1
   5473    mova        [dstq+32*2], m2
   5474    mova        [dstq+32*3], m3
   5475    call .main
   5476    mova        [dstq+32*4], m0
   5477    mova        [dstq+32*5], m1
   5478    mova        [dstq+32*6], m2
   5479    mova        [dstq+32*7], m3
   5480    dec                  hd
   5481    jg .w128_loop
   5482    RET
   5483 %endmacro
   5484 
   5485 %if WIN64
; t0 (x86inc DECLARE_REG_TMP) is a scratch GPR chosen per calling
; convention; presumably r5/r7 are the first registers that do not clash
; with the named cglobal arguments on each ABI -- confirm against
; x86inc.asm's register ordering.
   5486 DECLARE_REG_TMP 5
   5487 %else
   5488 DECLARE_REG_TMP 7
   5489 %endif
   5490 
   5491 cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
; Plain bidirectional average of two intermediate prediction buffers.
; t0d = pixel_max >> 11 indexes the per-bitdepth rounding/scale tables
; (0 for 10-bit pixel_max 1023, 1 for 12-bit pixel_max 4095):
; m4 = bidir_rnd entry, m5 = bidir_mul entry.
   5492 %define base r6-avg_avx2_table
   5493    lea                  r6, [avg_avx2_table]
   5494    tzcnt                wd, wm
   5495    mov                 t0d, r6m ; pixel_max
   5496    movsxd               wq, [r6+wq*4]
   5497    shr                 t0d, 11
   5498    vpbroadcastd         m4, [base+bidir_rnd+t0*4]
   5499    vpbroadcastd         m5, [base+bidir_mul+t0*4]
   5500    movifnidn            hd, hm
   5501    add                  wq, r6
   5502    BIDIR_FN
   5503 ALIGN function_align
   5504 .main:
; Produce 64 pixels (m0-m3) per call: saturating add of the two
; intermediate buffers, then the shared clip-and-rescale idiom used by all
; bidir functions here -- pmaxsw/psubsw against the bidir_rnd constant
; followed by a pmulhw descale via bidir_mul (table values not visible in
; this file; semantics per the table names).
   5505    mova                 m0, [tmp1q+32*0]
   5506    paddsw               m0, [tmp2q+32*0]
   5507    mova                 m1, [tmp1q+32*1]
   5508    paddsw               m1, [tmp2q+32*1]
   5509    mova                 m2, [tmp1q+32*2]
   5510    paddsw               m2, [tmp2q+32*2]
   5511    mova                 m3, [tmp1q+32*3]
   5512    paddsw               m3, [tmp2q+32*3]
   5513    add               tmp1q, 32*4
   5514    add               tmp2q, 32*4
   5515    pmaxsw               m0, m4
   5516    pmaxsw               m1, m4
   5517    pmaxsw               m2, m4
   5518    pmaxsw               m3, m4
   5519    psubsw               m0, m4
   5520    psubsw               m1, m4
   5521    psubsw               m2, m4
   5522    psubsw               m3, m4
   5523    pmulhw               m0, m5
   5524    pmulhw               m1, m5
   5525    pmulhw               m2, m5
   5526    pmulhw               m3, m5
   5527    ret
   5528 
   5529 cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3
; Weighted average: dst = clip((tmp1*weight + tmp2*(16-weight) + rnd) >> sh).
; m6 is built as the packed word pair (weight << 16) | (16-weight), so a
; pmaddwd on (tmp2,tmp1)-interleaved words yields
; tmp2*(16-weight) + tmp1*weight per dword. For 12-bit content both
; weights are pre-scaled by 4 (rorx by 30 == rotate left 2) so the same
; psrad-by-8 descale applies. m7 = rounding constant derived from
; pd_65538 + pixel_max, m8 = pixel_max clamp for the final pminsw.
   5530    lea                  r6, [w_avg_avx2_table]
   5531    tzcnt                wd, wm
   5532    mov                 t0d, r6m ; weight
   5533    vpbroadcastw         m8, r7m ; pixel_max
   5534    vpbroadcastd         m7, [r6-w_avg_avx2_table+pd_65538]
   5535    movsxd               wq, [r6+wq*4]
   5536    paddw                m7, m8
   5537    add                  wq, r6
   5538    lea                 r6d, [t0-16]
   5539    shl                 t0d, 16
   5540    sub                 t0d, r6d ; 16-weight, weight
   5541    pslld                m7, 7
   5542    rorx                r6d, t0d, 30 ; << 2
   5543    test          dword r7m, 0x800 ; bit 11 of pixel_max set => 12-bit
   5544    cmovz               r6d, t0d
   5545    movifnidn            hd, hm
   5546    movd                xm6, r6d
   5547    vpbroadcastd         m6, xm6
   5548    BIDIR_FN
   5549 ALIGN function_align
   5550 .main:
; Blend four 32-byte blocks; m5 serves as the high-half scratch for each
; interleave/madd/round/pack sequence, m4 as the tmp1 load register.
   5551    mova                 m4, [tmp1q+32*0]
   5552    mova                 m0, [tmp2q+32*0]
   5553    punpckhwd            m5, m0, m4
   5554    punpcklwd            m0, m4
   5555    mova                 m4, [tmp1q+32*1]
   5556    mova                 m1, [tmp2q+32*1]
   5557    pmaddwd              m5, m6
   5558    pmaddwd              m0, m6
   5559    paddd                m5, m7
   5560    paddd                m0, m7
   5561    psrad                m5, 8
   5562    psrad                m0, 8
   5563    packusdw             m0, m5
   5564    punpckhwd            m5, m1, m4
   5565    punpcklwd            m1, m4
   5566    mova                 m4, [tmp1q+32*2]
   5567    mova                 m2, [tmp2q+32*2]
   5568    pmaddwd              m5, m6
   5569    pmaddwd              m1, m6
   5570    paddd                m5, m7
   5571    paddd                m1, m7
   5572    psrad                m5, 8
   5573    psrad                m1, 8
   5574    packusdw             m1, m5
   5575    punpckhwd            m5, m2, m4
   5576    punpcklwd            m2, m4
   5577    mova                 m4, [tmp1q+32*3]
   5578    mova                 m3, [tmp2q+32*3]
   5579    add               tmp1q, 32*4
   5580    add               tmp2q, 32*4
   5581    pmaddwd              m5, m6
   5582    pmaddwd              m2, m6
   5583    paddd                m5, m7
   5584    paddd                m2, m7
   5585    psrad                m5, 8
   5586    psrad                m2, 8
   5587    packusdw             m2, m5
   5588    punpckhwd            m5, m3, m4
   5589    punpcklwd            m3, m4
   5590    pmaddwd              m5, m6
   5591    pmaddwd              m3, m6
   5592    paddd                m5, m7
   5593    paddd                m3, m7
   5594    psrad                m5, 8
   5595    psrad                m3, 8
   5596    packusdw             m3, m5
   5597    pminsw               m0, m8 ; clamp to pixel_max
   5598    pminsw               m1, m8
   5599    pminsw               m2, m8
   5600    pminsw               m3, m8
   5601    ret
   5602 
   5603 cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
; Per-pixel masked blend with an explicit 6-bit mask (0..64) read from
; maskq: dst = clip((tmp1*m + tmp2*(64-m)) >> 5, rescaled). r6d =
; pixel_max >> 11 selects the bitdepth entry in bidir_rnd/bidir_mul;
; m8 = pw_64 for computing 64-m.
   5604 %define base r7-mask_avx2_table
   5605    lea                  r7, [mask_avx2_table]
   5606    tzcnt                wd, wm
   5607    mov                 r6d, r7m ; pixel_max
   5608    movifnidn            hd, hm
   5609    shr                 r6d, 11
   5610    movsxd               wq, [r7+wq*4]
   5611    vpbroadcastd         m8, [base+pw_64]
   5612    vpbroadcastd         m9, [base+bidir_rnd+r6*4]
   5613    vpbroadcastd        m10, [base+bidir_mul+r6*4]
   5614    mov               maskq, maskmp
   5615    add                  wq, r7
   5616    BIDIR_FN
   5617 ALIGN function_align
   5618 .main:
   5619 %macro MASK 1
; One 32-byte block: zero-extend 16 mask bytes to words, interleave the
; (tmp1,tmp2) word pairs and pmaddwd against interleaved (m,64-m), shift
; down by 5, repack, then apply the shared clip-and-rescale idiom
; (pmaxsw/psubsw with bidir_rnd, pmulhw with bidir_mul).
   5620    pmovzxbw             m5, [maskq+16*%1]
   5621    mova                m%1, [tmp1q+32*%1]
   5622    mova                 m6, [tmp2q+32*%1]
   5623    punpckhwd            m4, m%1, m6
   5624    punpcklwd           m%1, m6
   5625    psubw                m7, m8, m5
   5626    punpckhwd            m6, m5, m7 ; m, 64-m
   5627    punpcklwd            m5, m7
   5628    pmaddwd              m4, m6     ; tmp1 * m + tmp2 * (64-m)
   5629    pmaddwd             m%1, m5
   5630    psrad                m4, 5
   5631    psrad               m%1, 5
   5632    packssdw            m%1, m4
   5633    pmaxsw              m%1, m9
   5634    psubsw              m%1, m9
   5635    pmulhw              m%1, m10
   5636 %endmacro
   5637    MASK                  0
   5638    MASK                  1
   5639    MASK                  2
   5640    MASK                  3
   5641    add               maskq, 16*4
   5642    add               tmp1q, 32*4
   5643    add               tmp2q, 32*4
   5644    ret
   5645 
   5646 cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
; Masked blend where the 6-bit mask is derived from |tmp1 - tmp2| (see
; W_MASK below) and is additionally written out downsampled 2x in both
; directions: each stored mask byte is
;   (m00 + m01 + m10 + m11 + 2 - sign) >> 2,
; with m14 broadcasting the (2 - sign) rounding term. m10..m13 are the
; W_MASK constants (pw_27615, pw_64, bidir_rnd, bidir_mul).
   5647 %define base r7-w_mask_420_avx2_table
   5648    lea                  r7, [w_mask_420_avx2_table]
   5649    tzcnt                wd, wm
   5650    mov                 r6d, r8m ; pixel_max
   5651    movd                xm0, r7m ; sign
   5652    movifnidn            hd, hm
   5653    shr                 r6d, 11
   5654    movsxd               wq, [r7+wq*4]
   5655    vpbroadcastd        m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
   5656    vpbroadcastd        m11, [base+pw_64]
   5657    vpbroadcastd        m12, [base+bidir_rnd+r6*4]
   5658    vpbroadcastd        m13, [base+bidir_mul+r6*4]
   5659    movd               xm14, [base+pw_2]
   5660    mov               maskq, maskmp
   5661    psubw              xm14, xm0
   5662    vpbroadcastw        m14, xm14 ; 2 - sign
   5663    add                  wq, r7
   5664    call .main
   5665    lea            stride3q, [strideq*3]
   5666    jmp                  wq
   5667 .w4:
; w == 4: m4/m5 hold the per-pixel masks of all 8 rows; phaddd sums the
; horizontal+vertical 2x2 groups in one step for this width.
   5668    phaddd               m4, m5
   5669    paddw                m4, m14
   5670    psrlw                m4, 2
   5671    packuswb             m4, m4
   5672    vextracti128        xm5, m4, 1
   5673    punpcklwd           xm4, xm5
   5674    movq   [dstq+strideq*0], xm0
   5675    movhps [dstq+strideq*1], xm0
   5676    vextracti128        xm0, m0, 1
   5677    movq   [dstq+strideq*2], xm0
   5678    movhps [dstq+stride3q ], xm0
   5679    mova            [maskq], xm4
   5680    cmp                  hd, 8
   5681    jl .w4_end
   5682    lea                dstq, [dstq+strideq*4]
   5683    movq   [dstq+strideq*0], xm1
   5684    movhps [dstq+strideq*1], xm1
   5685    vextracti128        xm1, m1, 1
   5686    movq   [dstq+strideq*2], xm1
   5687    movhps [dstq+stride3q ], xm1
   5688    je .w4_end
   5689    lea                dstq, [dstq+strideq*4]
   5690    movq   [dstq+strideq*0], xm2
   5691    movhps [dstq+strideq*1], xm2
   5692    vextracti128        xm2, m2, 1
   5693    movq   [dstq+strideq*2], xm2
   5694    movhps [dstq+stride3q ], xm2
   5695    lea                dstq, [dstq+strideq*4]
   5696    movq   [dstq+strideq*0], xm3
   5697    movhps [dstq+strideq*1], xm3
   5698    vextracti128        xm3, m3, 1
   5699    movq   [dstq+strideq*2], xm3
   5700    movhps [dstq+stride3q ], xm3
   5701 .w4_end:
   5702    RET
   5703 .w8_loop:
   5704    call .main
   5705    lea                dstq, [dstq+strideq*4]
   5706    add               maskq, 16
   5707 .w8:
; w == 8: combine vertically adjacent rows (lane swap + blend) before the
; +rnd / >>2 downsample of the mask.
   5708    vperm2i128           m6, m4, m5, 0x21
   5709    vpblendd             m4, m5, 0xf0
   5710    paddw                m4, m14
   5711    paddw                m4, m6
   5712    psrlw                m4, 2
   5713    vextracti128        xm5, m4, 1
   5714    packuswb            xm4, xm5
   5715    mova         [dstq+strideq*0], xm0
   5716    vextracti128 [dstq+strideq*1], m0, 1
   5717    mova         [dstq+strideq*2], xm1
   5718    vextracti128 [dstq+stride3q ], m1, 1
   5719    mova            [maskq], xm4
   5720    sub                  hd, 8
   5721    jl .w8_end
   5722    lea                dstq, [dstq+strideq*4]
   5723    mova         [dstq+strideq*0], xm2
   5724    vextracti128 [dstq+strideq*1], m2, 1
   5725    mova         [dstq+strideq*2], xm3
   5726    vextracti128 [dstq+stride3q ], m3, 1
   5727    jg .w8_loop
   5728 .w8_end:
   5729    RET
   5730 .w16_loop:
   5731    call .main
   5732    lea                dstq, [dstq+strideq*4]
   5733    add               maskq, 16
   5734 .w16:
; w == 16: vertically adjacent rows live in the low/high qwords of m4/m5.
   5735    punpcklqdq           m6, m4, m5
   5736    punpckhqdq           m4, m5
   5737    paddw                m6, m14
   5738    paddw                m4, m6
   5739    psrlw                m4, 2
   5740    vextracti128        xm5, m4, 1
   5741    packuswb            xm4, xm5
   5742    pshufd              xm4, xm4, q3120
   5743    mova   [dstq+strideq*0], m0
   5744    mova   [dstq+strideq*1], m1
   5745    mova   [dstq+strideq*2], m2
   5746    mova   [dstq+stride3q ], m3
   5747    mova            [maskq], xm4
   5748    sub                  hd, 4
   5749    jg .w16_loop
   5750    RET
   5751 .w32_loop:
   5752    call .main
   5753    lea                dstq, [dstq+strideq*4]
   5754    add               maskq, 32
   5755 .w32:
; w == 32: two .main calls per 4 rows; m15 carries the first half of the
; downsampled mask across the second call.
   5756    paddw                m4, m14
   5757    paddw                m4, m5
   5758    psrlw               m15, m4, 2
   5759    mova [dstq+strideq*0+32*0], m0
   5760    mova [dstq+strideq*0+32*1], m1
   5761    mova [dstq+strideq*1+32*0], m2
   5762    mova [dstq+strideq*1+32*1], m3
   5763    call .main
   5764    mova                 m6, [deint_shuf]
   5765    paddw                m4, m14
   5766    paddw                m4, m5
   5767    psrlw                m4, 2
   5768    packuswb            m15, m4
   5769    vpermd               m4, m6, m15 ; undo the pack interleave
   5770    mova [dstq+strideq*2+32*0], m0
   5771    mova [dstq+strideq*2+32*1], m1
   5772    mova [dstq+stride3q +32*0], m2
   5773    mova [dstq+stride3q +32*1], m3
   5774    mova            [maskq], m4
   5775    sub                  hd, 4
   5776    jg .w32_loop
   5777    RET
   5778 .w64_loop:
   5779    call .main
   5780    lea                dstq, [dstq+strideq*2]
   5781    add               maskq, 32
   5782 .w64:
   5783    paddw                m4, m14
   5784    paddw               m15, m14, m5
   5785    mova [dstq+strideq*0+32*0], m0
   5786    mova [dstq+strideq*0+32*1], m1
   5787    mova [dstq+strideq*0+32*2], m2
   5788    mova [dstq+strideq*0+32*3], m3
   5789    mova            [maskq], m4 ; no available registers
   5790    call .main
   5791    paddw                m4, [maskq] ; reload the spilled row sums
   5792    mova                 m6, [deint_shuf]
   5793    paddw                m5, m15
   5794    psrlw                m4, 2
   5795    psrlw                m5, 2
   5796    packuswb             m4, m5 ; 0 2 4 6   1 3 5 7
   5797    vpermd               m4, m6, m4
   5798    mova [dstq+strideq*1+32*0], m0
   5799    mova [dstq+strideq*1+32*1], m1
   5800    mova [dstq+strideq*1+32*2], m2
   5801    mova [dstq+strideq*1+32*3], m3
   5802    mova            [maskq], m4
   5803    sub                  hd, 2
   5804    jg .w64_loop
   5805    RET
   5806 .w128_loop:
   5807    call .main
   5808    lea                dstq, [dstq+strideq*2]
   5809    add               maskq, 64
   5810 .w128:
; w == 128: four .main calls per 2 rows; first-row mask sums are spilled
; to the mask buffer and to the not-yet-written second dst row.
   5811    paddw                m4, m14
   5812    paddw                m5, m14
   5813    mova [dstq+strideq*0+32*0], m0
   5814    mova [dstq+strideq*0+32*1], m1
   5815    mova [dstq+strideq*0+32*2], m2
   5816    mova [dstq+strideq*0+32*3], m3
   5817    mova       [maskq+32*0], m4
   5818    mova     [dstq+strideq], m5 ; spill: this dst row is overwritten below
   5819    call .main
   5820    paddw                m4, m14
   5821    paddw               m15, m14, m5
   5822    mova [dstq+strideq*0+32*4], m0
   5823    mova [dstq+strideq*0+32*5], m1
   5824    mova [dstq+strideq*0+32*6], m2
   5825    mova [dstq+strideq*0+32*7], m3
   5826    mova       [maskq+32*1], m4
   5827    call .main
   5828    paddw                m4, [maskq+32*0]
   5829    paddw                m5, [dstq+strideq] ; reload first spill
   5830    mova                 m6, [deint_shuf]
   5831    psrlw                m4, 2
   5832    psrlw                m5, 2
   5833    packuswb             m4, m5
   5834    vpermd               m4, m6, m4
   5835    mova [dstq+strideq*1+32*0], m0
   5836    mova [dstq+strideq*1+32*1], m1
   5837    mova [dstq+strideq*1+32*2], m2
   5838    mova [dstq+strideq*1+32*3], m3
   5839    mova       [maskq+32*0], m4
   5840    call .main
   5841    paddw                m4, [maskq+32*1]
   5842    mova                 m6, [deint_shuf]
   5843    paddw                m5, m15
   5844    psrlw                m4, 2
   5845    psrlw                m5, 2
   5846    packuswb             m4, m5
   5847    vpermd               m4, m6, m4
   5848    mova [dstq+strideq*1+32*4], m0
   5849    mova [dstq+strideq*1+32*5], m1
   5850    mova [dstq+strideq*1+32*6], m2
   5851    mova [dstq+strideq*1+32*7], m3
   5852    mova       [maskq+32*1], m4
   5853    sub                  hd, 2
   5854    jg .w128_loop
   5855    RET
   5856 ALIGN function_align
   5857 .main:
   5858 %macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul
; Derive the per-pixel mask from the prediction difference and blend:
;   64-m = satsub_u16(pw_27615, |tmp1-tmp2|) >> 10,  m = pw_64 - (64-m),
; then (tmp1*m + tmp2*(64-m)) >> 5 with the shared clip-and-rescale idiom.
; On return m%1 = blended pixels, m%2 = per-pixel m (0..64) for the
; caller's mask downsampling; m7/m8/m9 are clobbered as scratch.
   5859    mova                m%1, [tmp1q+32*%1]
   5860    mova                m%2, [tmp2q+32*%1]
   5861    punpcklwd            m8, m%2, m%1
   5862    punpckhwd            m9, m%2, m%1
   5863    psubsw              m%1, m%2
   5864    pabsw               m%1, m%1
   5865    psubusw              m7, m10, m%1
   5866    psrlw                m7, 10       ; 64-m
   5867    psubw               m%2, m%3, m7  ; m
   5868    punpcklwd           m%1, m7, m%2
   5869    punpckhwd            m7, m%2
   5870    pmaddwd             m%1, m8
   5871    pmaddwd              m7, m9
   5872    psrad               m%1, 5
   5873    psrad                m7, 5
   5874    packssdw            m%1, m7
   5875    pmaxsw              m%1, m%4
   5876    psubsw              m%1, m%4
   5877    pmulhw              m%1, m%5
   5878 %endmacro
   5879    W_MASK                0, 4
   5880    W_MASK                1, 5
   5881    phaddw               m4, m5 ; sum horizontally adjacent mask pairs
   5882    W_MASK                2, 5
   5883    W_MASK                3, 6
   5884    phaddw               m5, m6
   5885    add               tmp1q, 32*4
   5886    add               tmp2q, 32*4
   5887    ret
   5888 
   5889 cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
; Like w_mask_420 but the difference-derived mask is downsampled
; horizontally only: each stored byte is (m0 + m1 + 1 - sign) >> 1,
; implemented in .main as phaddw pair sums, packuswb, psubb by the sign
; byte (m14), then pavgb against zero. m15 = deint_shuf undoes the
; lane interleave introduced by packuswb.
   5890 %define base r7-w_mask_422_avx2_table
   5891    lea                  r7, [w_mask_422_avx2_table]
   5892    tzcnt                wd, wm
   5893    mov                 r6d, r8m ; pixel_max
   5894    vpbroadcastb        m14, r7m ; sign
   5895    movifnidn            hd, hm
   5896    shr                 r6d, 11
   5897    movsxd               wq, [r7+wq*4]
   5898    vpbroadcastd        m10, [base+pw_27615]
   5899    vpbroadcastd        m11, [base+pw_64]
   5900    vpbroadcastd        m12, [base+bidir_rnd+r6*4]
   5901    vpbroadcastd        m13, [base+bidir_mul+r6*4]
   5902    mova                m15, [base+deint_shuf]
   5903    mov               maskq, maskmp
   5904    add                  wq, r7
   5905    call .main
   5906    lea            stride3q, [strideq*3]
   5907    jmp                  wq
   5908 .w4:
; The store paths below only write pixels; the mask is emitted inside
; .main itself.
   5909    movq   [dstq+strideq*0], xm0
   5910    movhps [dstq+strideq*1], xm0
   5911    vextracti128        xm0, m0, 1
   5912    movq   [dstq+strideq*2], xm0
   5913    movhps [dstq+stride3q ], xm0
   5914    cmp                  hd, 8
   5915    jl .w4_end
   5916    lea                dstq, [dstq+strideq*4]
   5917    movq   [dstq+strideq*0], xm1
   5918    movhps [dstq+strideq*1], xm1
   5919    vextracti128        xm1, m1, 1
   5920    movq   [dstq+strideq*2], xm1
   5921    movhps [dstq+stride3q ], xm1
   5922    je .w4_end
   5923    lea                dstq, [dstq+strideq*4]
   5924    movq   [dstq+strideq*0], xm2
   5925    movhps [dstq+strideq*1], xm2
   5926    vextracti128        xm2, m2, 1
   5927    movq   [dstq+strideq*2], xm2
   5928    movhps [dstq+stride3q ], xm2
   5929    lea                dstq, [dstq+strideq*4]
   5930    movq   [dstq+strideq*0], xm3
   5931    movhps [dstq+strideq*1], xm3
   5932    vextracti128        xm3, m3, 1
   5933    movq   [dstq+strideq*2], xm3
   5934    movhps [dstq+stride3q ], xm3
   5935 .w4_end:
   5936    RET
   5937 .w8_loop:
   5938    call .main
   5939    lea                dstq, [dstq+strideq*4]
   5940 .w8:
   5941    mova         [dstq+strideq*0], xm0
   5942    vextracti128 [dstq+strideq*1], m0, 1
   5943    mova         [dstq+strideq*2], xm1
   5944    vextracti128 [dstq+stride3q ], m1, 1
   5945    sub                  hd, 8
   5946    jl .w8_end
   5947    lea                dstq, [dstq+strideq*4]
   5948    mova         [dstq+strideq*0], xm2
   5949    vextracti128 [dstq+strideq*1], m2, 1
   5950    mova         [dstq+strideq*2], xm3
   5951    vextracti128 [dstq+stride3q ], m3, 1
   5952    jg .w8_loop
   5953 .w8_end:
   5954    RET
   5955 .w16_loop:
   5956    call .main
   5957    lea                dstq, [dstq+strideq*4]
   5958 .w16:
   5959    mova   [dstq+strideq*0], m0
   5960    mova   [dstq+strideq*1], m1
   5961    mova   [dstq+strideq*2], m2
   5962    mova   [dstq+stride3q ], m3
   5963    sub                  hd, 4
   5964    jg .w16_loop
   5965    RET
   5966 .w32_loop:
   5967    call .main
   5968    lea                dstq, [dstq+strideq*2]
   5969 .w32:
   5970    mova [dstq+strideq*0+32*0], m0
   5971    mova [dstq+strideq*0+32*1], m1
   5972    mova [dstq+strideq*1+32*0], m2
   5973    mova [dstq+strideq*1+32*1], m3
   5974    sub                  hd, 2
   5975    jg .w32_loop
   5976    RET
   5977 .w64_loop:
   5978    call .main
   5979    add                dstq, strideq
   5980 .w64:
   5981    mova        [dstq+32*0], m0
   5982    mova        [dstq+32*1], m1
   5983    mova        [dstq+32*2], m2
   5984    mova        [dstq+32*3], m3
   5985    dec                  hd
   5986    jg .w64_loop
   5987    RET
   5988 .w128_loop:
   5989    call .main
   5990    add                dstq, strideq
   5991 .w128:
   5992    mova        [dstq+32*0], m0
   5993    mova        [dstq+32*1], m1
   5994    mova        [dstq+32*2], m2
   5995    mova        [dstq+32*3], m3
   5996    call .main
   5997    mova        [dstq+32*4], m0
   5998    mova        [dstq+32*5], m1
   5999    mova        [dstq+32*6], m2
   6000    mova        [dstq+32*7], m3
   6001    dec                  hd
   6002    jg .w128_loop
   6003    RET
   6004 ALIGN function_align
   6005 .main:
; Blend 64 pixels into m0-m3 (W_MASK as defined for w_mask_420) and store
; 32 horizontally-downsampled mask bytes directly to maskq.
   6006    W_MASK                0, 4
   6007    W_MASK                1, 5
   6008    phaddw               m4, m5 ; m0+m1 pair sums
   6009    W_MASK                2, 5
   6010    W_MASK                3, 6
   6011    phaddw               m5, m6
   6012    add               tmp1q, 32*4
   6013    add               tmp2q, 32*4
   6014    packuswb             m4, m5
   6015    pxor                 m5, m5
   6016    psubb                m4, m14 ; subtract sign byte
   6017    pavgb                m4, m5  ; (x + 1) >> 1
   6018    vpermd               m4, m15, m4
   6019    mova            [maskq], m4
   6020    add               maskq, 32
   6021    ret
   6022 
   6023 cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
; Full-resolution variant: the per-pixel 0..64 mask bytes are stored
; unchanged (packuswb + vpermq lane fixup). Only 11 vector registers are
; needed, so W_MASK is invoked with its optional pw_64/rnd/mul register
; arguments pointing at m4/m5/m6 instead of the m11/m12/m13 defaults.
   6024 %define base r7-w_mask_444_avx2_table
   6025    lea                  r7, [w_mask_444_avx2_table]
   6026    tzcnt                wd, wm
   6027    mov                 r6d, r8m ; pixel_max
   6028    movifnidn            hd, hm
   6029    shr                 r6d, 11
   6030    movsxd               wq, [r7+wq*4]
   6031    vpbroadcastd        m10, [base+pw_27615]
   6032    vpbroadcastd         m4, [base+pw_64]
   6033    vpbroadcastd         m5, [base+bidir_rnd+r6*4]
   6034    vpbroadcastd         m6, [base+bidir_mul+r6*4]
   6035    mov               maskq, maskmp
   6036    add                  wq, r7
   6037    call .main
   6038    lea            stride3q, [strideq*3]
   6039    jmp                  wq
   6040 .w4:
; .main only yields two pixel registers (m0/m1) here, so tall w4 blocks
; call it again mid-path.
   6041    movq   [dstq+strideq*0], xm0
   6042    movhps [dstq+strideq*1], xm0
   6043    vextracti128        xm0, m0, 1
   6044    movq   [dstq+strideq*2], xm0
   6045    movhps [dstq+stride3q ], xm0
   6046    cmp                  hd, 8
   6047    jl .w4_end
   6048    lea                dstq, [dstq+strideq*4]
   6049    movq   [dstq+strideq*0], xm1
   6050    movhps [dstq+strideq*1], xm1
   6051    vextracti128        xm1, m1, 1
   6052    movq   [dstq+strideq*2], xm1
   6053    movhps [dstq+stride3q ], xm1
   6054    je .w4_end
   6055    call .main
   6056    lea                dstq, [dstq+strideq*4]
   6057    movq   [dstq+strideq*0], xm0
   6058    movhps [dstq+strideq*1], xm0
   6059    vextracti128        xm0, m0, 1
   6060    movq   [dstq+strideq*2], xm0
   6061    movhps [dstq+stride3q ], xm0
   6062    lea                dstq, [dstq+strideq*4]
   6063    movq   [dstq+strideq*0], xm1
   6064    movhps [dstq+strideq*1], xm1
   6065    vextracti128        xm1, m1, 1
   6066    movq   [dstq+strideq*2], xm1
   6067    movhps [dstq+stride3q ], xm1
   6068 .w4_end:
   6069    RET
   6070 .w8_loop:
   6071    call .main
   6072    lea                dstq, [dstq+strideq*4]
   6073 .w8:
   6074    mova         [dstq+strideq*0], xm0
   6075    vextracti128 [dstq+strideq*1], m0, 1
   6076    mova         [dstq+strideq*2], xm1
   6077    vextracti128 [dstq+stride3q ], m1, 1
   6078    sub                  hd, 4
   6079    jg .w8_loop
   6080 .w8_end:
   6081    RET
   6082 .w16_loop:
   6083    call .main
   6084    lea                dstq, [dstq+strideq*2]
   6085 .w16:
   6086    mova   [dstq+strideq*0], m0
   6087    mova   [dstq+strideq*1], m1
   6088    sub                  hd, 2
   6089    jg .w16_loop
   6090    RET
   6091 .w32_loop:
   6092    call .main
   6093    add                dstq, strideq
   6094 .w32:
   6095    mova        [dstq+32*0], m0
   6096    mova        [dstq+32*1], m1
   6097    dec                  hd
   6098    jg .w32_loop
   6099    RET
   6100 .w64_loop:
   6101    call .main
   6102    add                dstq, strideq
   6103 .w64:
   6104    mova        [dstq+32*0], m0
   6105    mova        [dstq+32*1], m1
   6106    call .main
   6107    mova        [dstq+32*2], m0
   6108    mova        [dstq+32*3], m1
   6109    dec                  hd
   6110    jg .w64_loop
   6111    RET
   6112 .w128_loop:
   6113    call .main
   6114    add                dstq, strideq
   6115 .w128:
   6116    mova        [dstq+32*0], m0
   6117    mova        [dstq+32*1], m1
   6118    call .main
   6119    mova        [dstq+32*2], m0
   6120    mova        [dstq+32*3], m1
   6121    call .main
   6122    mova        [dstq+32*4], m0
   6123    mova        [dstq+32*5], m1
   6124    call .main
   6125    mova        [dstq+32*6], m0
   6126    mova        [dstq+32*7], m1
   6127    dec                  hd
   6128    jg .w128_loop
   6129    RET
   6130 ALIGN function_align
   6131 .main:
; Blend 32 pixels into m0/m1 and store 32 full-resolution mask bytes.
   6132    W_MASK                0, 2, 4, 5, 6
   6133    W_MASK                1, 3, 4, 5, 6
   6134    packuswb             m2, m3
   6135    vpermq               m2, m2, q3120 ; fix the cross-lane pack order
   6136    add               tmp1q, 32*2
   6137    add               tmp2q, 32*2
   6138    mova            [maskq], m2
   6139    add               maskq, 32
   6140    ret
   6141 
   6142 ; (a * (64 - m) + b * m + 32) >> 6
   6143 ; = (((b - a) * m + 32) >> 6) + a
   6144 ; = (((b - a) * (m << 9) + 16384) >> 15) + a
   6145 ;   except m << 9 overflows int16_t when m == 64 (which is possible),
   6146 ;   but if we negate m it works out (-64 << 9 == -32768).
   6147 ; = (((a - b) * (m * -512) + 16384) >> 15) + a
   6148 cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
; Masked blend of dst (a) with tmpq (b) using an 8-bit 0..64 mask:
; per the derivation above, pmullw by pw_m512 (m6) then pmulhrsw
; implements (((a - b) * (m * -512) + 16384) >> 15) + a.
   6149 %define base r6-blend_avx2_table
   6150    lea                  r6, [blend_avx2_table]
   6151    tzcnt                wd, wm
   6152    movifnidn            hd, hm
   6153    movsxd               wq, [r6+wq*4]
   6154    movifnidn         maskq, maskmp
   6155    vpbroadcastd         m6, [base+pw_m512]
   6156    add                  wq, r6
   6157    lea                  r6, [dsq*3] ; r6 = 3 * dst stride
   6158    jmp                  wq
   6159 .w4:
; Four 4-pixel rows per iteration, gathered into one ymm.
   6160    pmovzxbw             m3, [maskq]
   6161    movq                xm0, [dstq+dsq*0]
   6162    movhps              xm0, [dstq+dsq*1]
   6163    vpbroadcastq         m1, [dstq+dsq*2]
   6164    vpbroadcastq         m2, [dstq+r6   ]
   6165    vpblendd             m0, m1, 0x30
   6166    vpblendd             m0, m2, 0xc0
   6167    psubw                m1, m0, [tmpq] ; a - b
   6168    add               maskq, 16
   6169    add                tmpq, 32
   6170    pmullw               m3, m6 ; m * -512
   6171    pmulhrsw             m1, m3
   6172    paddw                m0, m1
   6173    vextracti128        xm1, m0, 1
   6174    movq       [dstq+dsq*0], xm0
   6175    movhps     [dstq+dsq*1], xm0
   6176    movq       [dstq+dsq*2], xm1
   6177    movhps     [dstq+r6   ], xm1
   6178    lea                dstq, [dstq+dsq*4]
   6179    sub                  hd, 4
   6180    jg .w4
   6181    RET
   6182 .w8:
; Four 8-pixel rows per iteration, two per ymm.
   6183    pmovzxbw             m4, [maskq+16*0]
   6184    pmovzxbw             m5, [maskq+16*1]
   6185    mova                xm0, [dstq+dsq*0]
   6186    vinserti128          m0, [dstq+dsq*1], 1
   6187    mova                xm1, [dstq+dsq*2]
   6188    vinserti128          m1, [dstq+r6   ], 1
   6189    psubw                m2, m0, [tmpq+32*0]
   6190    psubw                m3, m1, [tmpq+32*1]
   6191    add               maskq, 16*2
   6192    add                tmpq, 32*2
   6193    pmullw               m4, m6
   6194    pmullw               m5, m6
   6195    pmulhrsw             m2, m4
   6196    pmulhrsw             m3, m5
   6197    paddw                m0, m2
   6198    paddw                m1, m3
   6199    mova         [dstq+dsq*0], xm0
   6200    vextracti128 [dstq+dsq*1], m0, 1
   6201    mova         [dstq+dsq*2], xm1
   6202    vextracti128 [dstq+r6   ], m1, 1
   6203    lea                dstq, [dstq+dsq*4]
   6204    sub                  hd, 4
   6205    jg .w8
   6206    RET
   6207 .w16:
; Two 16-pixel rows per iteration.
   6208    pmovzxbw             m4, [maskq+16*0]
   6209    pmovzxbw             m5, [maskq+16*1]
   6210    mova                 m0,     [dstq+dsq*0]
   6211    psubw                m2, m0, [tmpq+ 32*0]
   6212    mova                 m1,     [dstq+dsq*1]
   6213    psubw                m3, m1, [tmpq+ 32*1]
   6214    add               maskq, 16*2
   6215    add                tmpq, 32*2
   6216    pmullw               m4, m6
   6217    pmullw               m5, m6
   6218    pmulhrsw             m2, m4
   6219    pmulhrsw             m3, m5
   6220    paddw                m0, m2
   6221    paddw                m1, m3
   6222    mova       [dstq+dsq*0], m0
   6223    mova       [dstq+dsq*1], m1
   6224    lea                dstq, [dstq+dsq*2]
   6225    sub                  hd, 2
   6226    jg .w16
   6227    RET
   6228 .w32:
; One 32-pixel row per iteration.
   6229    pmovzxbw             m4, [maskq+16*0]
   6230    pmovzxbw             m5, [maskq+16*1]
   6231    mova                 m0,     [dstq+32*0]
   6232    psubw                m2, m0, [tmpq+32*0]
   6233    mova                 m1,     [dstq+32*1]
   6234    psubw                m3, m1, [tmpq+32*1]
   6235    add               maskq, 16*2
   6236    add                tmpq, 32*2
   6237    pmullw               m4, m6
   6238    pmullw               m5, m6
   6239    pmulhrsw             m2, m4
   6240    pmulhrsw             m3, m5
   6241    paddw                m0, m2
   6242    paddw                m1, m3
   6243    mova        [dstq+32*0], m0
   6244    mova        [dstq+32*1], m1
   6245    add                dstq, dsq
   6246    dec                  hd
   6247    jg .w32
   6248    RET
   6249 
    6250 INIT_XMM avx2
    ;-----------------------------------------------------------------------
    ; void blend_v_16bpc(pixel *dst, ptrdiff_t dst_stride,
    ;                    const pixel *tmp, int w, int h)
    ; OBMC blend along a vertical edge: each column of dst is mixed with tmp
    ; using a fixed per-column weight from obmc_masks_avx2. The table stores
    ; mask * -512 (see top of file), so for diff = dst - tmp,
    ;   pmulhrsw(diff, -512*m) == -round(diff*m / 64)
    ; and paddw yields round((dst*(64-m) + tmp*m) / 64).
    ; Columns whose stored mask is 0 (the rightmost quarter of each width)
    ; pass through unchanged. Dispatch via jump table on tzcnt(w).
    ;-----------------------------------------------------------------------
    6251 cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h
    6252 %define base r5-blend_v_avx2_table
    6253    lea                  r5, [blend_v_avx2_table]
    6254    tzcnt                wd, wm
    6255    movifnidn            hd, hm
    6256    movsxd               wq, [r5+wq*4]
    6257    add                  wq, r5
    6258    jmp                  wq
    6259 .w2:
    ; w == 2: two rows per iteration in one xmm; mask word for column 1 is 0.
    6260    vpbroadcastd         m2, [base+obmc_masks_avx2+2*2]
    6261 .w2_loop:
    6262    movd                 m0, [dstq+dsq*0]
    6263    pinsrd               m0, [dstq+dsq*1], 1
    6264    movq                 m1, [tmpq]
    6265    add                tmpq, 4*2
    6266    psubw                m1, m0, m1          ; dst - tmp
    6267    pmulhrsw             m1, m2
    6268    paddw                m0, m1
    6269    movd       [dstq+dsq*0], m0
    6270    pextrd     [dstq+dsq*1], m0, 1
    6271    lea                dstq, [dstq+dsq*2]
    6272    sub                  hd, 2
    6273    jg .w2_loop
    6274    RET
    6275 .w4:
    ; w == 4: two rows per iteration (row 0 in low qword, row 1 in high).
    6276    vpbroadcastq         m2, [base+obmc_masks_avx2+4*2]
    6277 .w4_loop:
    6278    movq                 m0, [dstq+dsq*0]
    6279    movhps               m0, [dstq+dsq*1]
    6280    psubw                m1, m0, [tmpq]
    6281    add                tmpq, 8*2
    6282    pmulhrsw             m1, m2
    6283    paddw                m0, m1
    6284    movq       [dstq+dsq*0], m0
    6285    movhps     [dstq+dsq*1], m0
    6286    lea                dstq, [dstq+dsq*2]
    6287    sub                  hd, 2
    6288    jg .w4_loop
    6289    RET
    6290 INIT_YMM avx2
    6291 .w8:
    ; w == 8: two rows per iteration, one per 128-bit lane; the same
    ; 8-entry mask is broadcast to both lanes.
    6292    vbroadcasti128       m2, [base+obmc_masks_avx2+8*2]
    6293 .w8_loop:
    6294    mova                xm0, [dstq+dsq*0]
    6295    vinserti128          m0, [dstq+dsq*1], 1
    6296    psubw                m1, m0, [tmpq]
    6297    add                tmpq, 16*2
    6298    pmulhrsw             m1, m2
    6299    paddw                m0, m1
    6300    mova         [dstq+dsq*0], xm0
    6301    vextracti128 [dstq+dsq*1], m0, 1
    6302    lea                dstq, [dstq+dsq*2]
    6303    sub                  hd, 2
    6304    jg .w8_loop
    6305    RET
    6306 .w16:
    ; w == 16: two full-ymm rows per iteration sharing one 16-entry mask.
    6307    mova                 m4, [base+obmc_masks_avx2+16*2]
    6308 .w16_loop:
    6309    mova                 m0,     [dstq+dsq*0]
    6310    psubw                m2, m0, [tmpq+ 32*0]
    6311    mova                 m1,     [dstq+dsq*1]
    6312    psubw                m3, m1, [tmpq+ 32*1]
    6313    add                tmpq, 32*2
    6314    pmulhrsw             m2, m4
    6315    pmulhrsw             m3, m4
    6316    paddw                m0, m2
    6317    paddw                m1, m3
    6318    mova       [dstq+dsq*0], m0
    6319    mova       [dstq+dsq*1], m1
    6320    lea                dstq, [dstq+dsq*2]
    6321    sub                  hd, 2
    6322    jg .w16_loop
    6323    RET
    6324 .w32:
    ; w == 32: only columns 0-23 are blended (masks for 24-31 are zero, so
    ; those stores are skipped entirely). m6 = mask for columns 0-15,
    ; m7 = mask for columns 16-23 broadcast to both lanes so one ymm can
    ; hold the 16-23 halves of two rows. tmp is still strided at 32 px/row.
    6325 %if WIN64
    ; xmm6/xmm7 are callee-saved in the Microsoft x64 ABI; spill to the
    ; home/shadow space above the return address.
    6326    movaps         [rsp+ 8], xmm6
    6327    movaps         [rsp+24], xmm7
    6328 %endif
    6329    mova                 m6, [base+obmc_masks_avx2+32*2]
    6330    vbroadcasti128       m7, [base+obmc_masks_avx2+32*3]
    6331 .w32_loop:
    6332    mova                 m0,     [dstq+dsq*0+32*0]
    6333    psubw                m3, m0, [tmpq      +32*0]
    6334    mova                xm2,     [dstq+dsq*0+32*1]
    6335    mova                xm5,     [tmpq      +32*1]
    6336    mova                 m1,     [dstq+dsq*1+32*0]
    6337    psubw                m4, m1, [tmpq      +32*2]
    6338    vinserti128          m2,     [dstq+dsq*1+32*1], 1
    6339    vinserti128          m5,     [tmpq      +32*3], 1
    6340    add                tmpq, 32*4
    6341    psubw                m5, m2, m5
    6342    pmulhrsw             m3, m6
    6343    pmulhrsw             m4, m6
    6344    pmulhrsw             m5, m7
    6345    paddw                m0, m3
    6346    paddw                m1, m4
    6347    paddw                m2, m5
    6348    mova         [dstq+dsq*0+32*0], m0
    6349    mova         [dstq+dsq*1+32*0], m1
    6350    mova         [dstq+dsq*0+32*1], xm2
    6351    vextracti128 [dstq+dsq*1+32*1], m2, 1
    6352    lea                dstq, [dstq+dsq*2]
    6353    sub                  hd, 2
    6354    jg .w32_loop
    6355 %if WIN64
    6356    movaps             xmm6, [rsp+ 8]
    6357    movaps             xmm7, [rsp+24]
    6358 %endif
    6359    RET
   6360 
    6361 %macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
    ; Blend one 32-pixel (2 x ymm) group of the current row for blend_h:
    ;   %1 = dst offset, %2 = tmp offset (both in 32-byte units; tmp offsets
    ;        may be negative when tmpq was already advanced mid-row),
    ;   %3 = if non-zero, advance tmpq by 32*%3 bytes after the loads.
    ; Expects the broadcast row mask in m4; clobbers m0-m3.
    6362    mova                 m0,     [dstq+32*(%1+0)]
    6363    psubw                m2, m0, [tmpq+32*(%2+0)]
    6364    mova                 m1,     [dstq+32*(%1+1)]
    6365    psubw                m3, m1, [tmpq+32*(%2+1)]
    6366 %if %3
    6367    add                tmpq, 32*%3
    6368 %endif
    6369    pmulhrsw             m2, m4
    6370    pmulhrsw             m3, m4
    6371    paddw                m0, m2
    6372    paddw                m1, m3
    6373    mova   [dstq+32*(%1+0)], m0
    6374    mova   [dstq+32*(%1+1)], m1
    6375 %endmacro
   6376 
    6377 INIT_XMM avx2
    ;-----------------------------------------------------------------------
    ; void blend_h_16bpc(pixel *dst, ptrdiff_t dst_stride,
    ;                    const pixel *tmp, int w, int h)
    ; OBMC blend along a horizontal edge: one weight per ROW, taken from
    ; obmc_masks_avx2 (stored premultiplied by -512, so pmulhrsw on
    ; dst - tmp yields the rounded 6-bit blend; see blend_v above).
    ; Only the top h*3/4 rows are processed; hq counts from -(h*3/4) up to 0
    ; so [maskq+hq*2] walks the mask row weights for this h.
    ;-----------------------------------------------------------------------
    6378 cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
    6379 %define base r5-blend_h_avx2_table
    6380    lea                  r5, [blend_h_avx2_table]
    6381    tzcnt                wd, wm
    6382    mov                  hd, hm
    6383    movsxd               wq, [r5+wq*4]
    6384    add                  wq, r5
    6385    lea               maskq, [base+obmc_masks_avx2+hq*2]
    6386    lea                  hd, [hq*3]
    6387    shr                  hd, 2 ; h * 3/4
    6388    lea               maskq, [maskq+hq*2]   ; point past the last used mask
    6389    neg                  hq                 ; loop counter: -(h*3/4) .. 0
    6390    jmp                  wq
    6391 .w2:
    ; w == 2: two rows per iteration; the two row masks are duplicated
    ; across word lanes with punpcklwd.
    6392    movd                 m0, [dstq+dsq*0]
    6393    pinsrd               m0, [dstq+dsq*1], 1
    6394    movd                 m2, [maskq+hq*2]
    6395    movq                 m1, [tmpq]
    6396    add                tmpq, 4*2
    6397    punpcklwd            m2, m2
    6398    psubw                m1, m0, m1
    6399    pmulhrsw             m1, m2
    6400    paddw                m0, m1
    6401    movd       [dstq+dsq*0], m0
    6402    pextrd     [dstq+dsq*1], m0, 1
    6403    lea                dstq, [dstq+dsq*2]
    6404    add                  hq, 2
    6405    jl .w2
    6406    RET
    6407 .w4:
    ; w == 4: blend_shuf replicates the two row masks across the low/high
    ; halves of the xmm (row 0 mask in the low qword, row 1 in the high).
    6408    mova                 m3, [blend_shuf]
    6409 .w4_loop:
    6410    movq                 m0, [dstq+dsq*0]
    6411    movhps               m0, [dstq+dsq*1]
    6412    movd                 m2, [maskq+hq*2]
    6413    psubw                m1, m0, [tmpq]
    6414    add                tmpq, 8*2
    6415    pshufb               m2, m3
    6416    pmulhrsw             m1, m2
    6417    paddw                m0, m1
    6418    movq       [dstq+dsq*0], m0
    6419    movhps     [dstq+dsq*1], m0
    6420    lea                dstq, [dstq+dsq*2]
    6421    add                  hq, 2
    6422    jl .w4_loop
    6423    RET
    6424 INIT_YMM avx2
    6425 .w8:
    ; w == 8: two rows per iteration, one per 128-bit lane; shufpd builds a
    ; shuffle that broadcasts row 0's mask in the low lane and row 1's in
    ; the high lane from the dword loaded at [maskq+hq*2].
    6426    vbroadcasti128       m3, [blend_shuf]
    6427    shufpd               m3, m3, 0x0c
    6428 .w8_loop:
    6429    mova                xm0, [dstq+dsq*0]
    6430    vinserti128          m0, [dstq+dsq*1], 1
    6431    vpbroadcastd         m2, [maskq+hq*2]
    6432    psubw                m1, m0, [tmpq]
    6433    add                tmpq, 16*2
    6434    pshufb               m2, m3
    6435    pmulhrsw             m1, m2
    6436    paddw                m0, m1
    6437    mova         [dstq+dsq*0], xm0
    6438    vextracti128 [dstq+dsq*1], m0, 1
    6439    lea                dstq, [dstq+dsq*2]
    6440    add                  hq, 2
    6441    jl .w8_loop
    6442    RET
    6443 .w16:
    ; w == 16: two full-ymm rows per iteration, each with its own
    ; broadcast row mask (m4 = this row, m5 = next row).
    6444    vpbroadcastw         m4, [maskq+hq*2]
    6445    vpbroadcastw         m5, [maskq+hq*2+2]
    6446    mova                 m0,     [dstq+dsq*0]
    6447    psubw                m2, m0, [tmpq+ 32*0]
    6448    mova                 m1,     [dstq+dsq*1]
    6449    psubw                m3, m1, [tmpq+ 32*1]
    6450    add                tmpq, 32*2
    6451    pmulhrsw             m2, m4
    6452    pmulhrsw             m3, m5
    6453    paddw                m0, m2
    6454    paddw                m1, m3
    6455    mova       [dstq+dsq*0], m0
    6456    mova       [dstq+dsq*1], m1
    6457    lea                dstq, [dstq+dsq*2]
    6458    add                  hq, 2
    6459    jl .w16
    6460    RET
    6461 .w32:
    ; w >= 32: one row per iteration; the row mask is broadcast into m4 and
    ; BLEND_H_ROW handles each 32-pixel group (offsets in 32-byte units).
    6462    vpbroadcastw         m4, [maskq+hq*2]
    6463    BLEND_H_ROW           0, 0, 2
    6464    add                dstq, dsq
    6465    inc                  hq
    6466    jl .w32
    6467    RET
    6468 .w64:
    6469    vpbroadcastw         m4, [maskq+hq*2]
    6470    BLEND_H_ROW           0, 0
    6471    BLEND_H_ROW           2, 2, 4
    6472    add                dstq, dsq
    6473    inc                  hq
    6474    jl .w64
    6475    RET
    6476 .w128:
    ; tmpq is advanced by 32*8 after the second group, so the last two
    ; groups address it with negative offsets (-4, -2).
    6477    vpbroadcastw         m4, [maskq+hq*2]
    6478    BLEND_H_ROW           0,  0
    6479    BLEND_H_ROW           2,  2, 8
    6480    BLEND_H_ROW           4, -4
    6481    BLEND_H_ROW           6, -2
    6482    add                dstq, dsq
    6483    inc                  hq
    6484    jl .w128
    6485    RET
   6486 
    6487 cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
    6488                                   bottomext, rightext
    ; Emulated-edge motion compensation: copy a bw x bh block whose source
    ; window (x, y) may extend past the iw x ih image; out-of-image samples
    ; are produced by replicating the nearest edge pixel. The body is copied
    ; first (with optional left/right horizontal extension), then bottom and
    ; top rows are replicated vertically. Pixels are 2 bytes (16bpc).
    ; r12 stays 0 throughout the clamping section below (cmov source).
    6489    ; we assume that the buffer (stride) is larger than width, so we can
    6490    ; safely overwrite by a few bytes
    6491
    6492    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    6493    xor                r12d, r12d
    6494    lea                 r10, [ihq-1]
    6495    cmp                  yq, ihq
    6496    cmovs               r10, yq
    6497    test                 yq, yq
    6498    cmovs               r10, r12
    6499    imul                r10, sstrideq
    6500    add                srcq, r10
    6501
    6502    ; ref += iclip(x, 0, iw - 1)
    6503    lea                 r10, [iwq-1]
    6504    cmp                  xq, iwq
    6505    cmovs               r10, xq
    6506    test                 xq, xq
    6507    cmovs               r10, r12
    6508    lea                srcq, [srcq+r10*2]
    6509
    6510    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    6511    lea          bottomextq, [yq+bhq]
    6512    sub          bottomextq, ihq
    6513    lea                  r3, [bhq-1]
    6514    cmovs        bottomextq, r12
    6515
    6516    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
    6517                bottomext, rightext
    6518
    6519    ; top_ext = iclip(-y, 0, bh - 1)
    6520    neg             topextq
    6521    cmovs           topextq, r12
    6522    cmp          bottomextq, bhq
    6523    cmovns       bottomextq, r3
    6524    cmp             topextq, bhq
    6525    cmovg           topextq, r3
    6526
    6527    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    6528    lea           rightextq, [xq+bwq]
    6529    sub           rightextq, iwq
    6530    lea                  r2, [bwq-1]
    6531    cmovs         rightextq, r12
    6532
    6533    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
    6534                bottomext, rightext
    6535
    6536    ; left_ext = iclip(-x, 0, bw - 1)
    6537    neg            leftextq
    6538    cmovs          leftextq, r12
    6539    cmp           rightextq, bwq
    6540    cmovns        rightextq, r2
    6541    cmp            leftextq, bwq
    6542    cmovns         leftextq, r2
    6543
    6544    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
    6545                dst, dstride, src, sstride, bottomext, rightext
    6546
    6547    ; center_h = bh - top_ext - bottom_ext
    6548    lea                  r3, [bottomextq+topextq]
    6549    sub            centerhq, r3
    6550
    6551    ; blk += top_ext * PXSTRIDE(dst_stride)
    6552    mov                  r2, topextq
    6553    imul                 r2, dstrideq
    6554    add                dstq, r2
    ; Save the pointer to the first "body" row: it is re-read later as the
    ; source for top-edge replication (.top).
    6555    mov                 r9m, dstq
    6556
    6557    ; center_w = bw - left_ext - right_ext
    6558    mov            centerwq, bwq
    6559    lea                  r3, [rightextq+leftextq]
    6560    sub            centerwq, r3
    6561
    ; Copies centerh rows: optional left extension (broadcast of srcq[0]),
    ; the body itself, and optional right extension (broadcast of the last
    ; valid source pixel). Stores run in 16-pixel (32-byte) steps and may
    ; overwrite a few pixels past each region (see stride assumption above).
    6562 %macro v_loop 3 ; need_left_ext, need_right_ext, suffix
    6563 .v_loop_%3:
    6564 %if %1
    6565    ; left extension
    6566    xor                  r3, r3
    6567    vpbroadcastw         m0, [srcq]
    6568 .left_loop_%3:
    6569    mova        [dstq+r3*2], m0
    6570    add                  r3, 16
    6571    cmp                  r3, leftextq
    6572    jl .left_loop_%3
    6573
    6574    ; body
    6575    lea                 r12, [dstq+leftextq*2]
    6576 %endif
    6577    xor                  r3, r3
    6578 .body_loop_%3:
    6579    movu                 m0, [srcq+r3*2]
    6580 %if %1
    6581    movu         [r12+r3*2], m0
    6582 %else
    6583    movu        [dstq+r3*2], m0
    6584 %endif
    6585    add                  r3, 16
    6586    cmp                  r3, centerwq
    6587    jl .body_loop_%3
    6588
    6589 %if %2
    6590    ; right extension
    6591 %if %1
    6592    lea                 r12, [r12+centerwq*2]
    6593 %else
    6594    lea                 r12, [dstq+centerwq*2]
    6595 %endif
    6596    xor                  r3, r3
    6597    vpbroadcastw         m0, [srcq+centerwq*2-2]
    6598 .right_loop_%3:
    6599    movu         [r12+r3*2], m0
    6600    add                  r3, 16
    6601    cmp                  r3, rightextq
    6602    jl .right_loop_%3
    6603
    6604 %endif
    6605    add                dstq, dstrideq
    6606    add                srcq, sstrideq
    6607    dec            centerhq
    6608    jg .v_loop_%3
    6609 %endmacro
    6610
    ; Instantiate the four left/right-extension combinations so the hot
    ; per-row loop contains no extension branches.
    6611    test           leftextq, leftextq
    6612    jnz .need_left_ext
    6613    test          rightextq, rightextq
    6614    jnz .need_right_ext
    6615    v_loop                0, 0, 0
    6616    jmp .body_done
    6617
    6618 .need_left_ext:
    6619    test          rightextq, rightextq
    6620    jnz .need_left_right_ext
    6621    v_loop                1, 0, 1
    6622    jmp .body_done
    6623
    6624 .need_left_right_ext:
    6625    v_loop                1, 1, 2
    6626    jmp .body_done
    6627
    6628 .need_right_ext:
    6629    v_loop                0, 1, 3
    6630
    6631 .body_done:
    ; bottom edge extension: replicate the last written row downwards,
    ; column-strip by column-strip (16 pixels at a time).
    6632    ; bottom edge extension
    6633    test         bottomextq, bottomextq
    6634    jz .top
    6635    mov                srcq, dstq
    6636    sub                srcq, dstrideq
    6637    xor                  r1, r1
    6638 .bottom_x_loop:
    6639    mova                 m0, [srcq+r1*2]
    6640    lea                  r3, [dstq+r1*2]
    6641    mov                  r4, bottomextq
    6642 .bottom_y_loop:
    6643    mova               [r3], m0
    6644    add                  r3, dstrideq
    6645    dec                  r4
    6646    jg .bottom_y_loop
    6647    add                  r1, 16
    6648    cmp                  r1, bwq
    6649    jl .bottom_x_loop
    6650
    6651 .top:
    ; top edge extension: replicate the first body row (saved in r9m)
    ; upwards into the original dst (dstm).
    6652    ; top edge extension
    6653    test            topextq, topextq
    6654    jz .end
    6655    mov                srcq, r9m
    6656    mov                dstq, dstm
    6657    xor                  r1, r1
    6658 .top_x_loop:
    6659    mova                 m0, [srcq+r1*2]
    6660    lea                  r3, [dstq+r1*2]
    6661    mov                  r4, topextq
    6662 .top_y_loop:
    6663    mova               [r3], m0
    6664    add                  r3, dstrideq
    6665    dec                  r4
    6666    jg .top_y_loop
    6667    add                  r1, 16
    6668    cmp                  r1, bwq
    6669    jl .top_x_loop
    6670
    6671 .end:
    6672    RET
   6673 
    ;-----------------------------------------------------------------------
    ; Horizontal 8-tap resampling for 16bpc, 8 output pixels per iteration.
    ; For output pixel x: mx = mx0 + x*dx in 14-bit fixed point;
    ;   - filter phase   = (mx >> 8) & 63  (64 phases, 8 taps each)
    ;   - source offset  = mx >> 14, clamped to [0, src_w-8]
    ; When the clamp changed the offset (near the image edges), the loaded
    ; pixels are reshuffled via resize_shuf so edge pixels are replicated.
    ; mx0 is pre-biased by -4<<14 to center the 8-tap window. Results are
    ; rounded as (64 - acc) >> 7 — the coefficients appear to be stored
    ; negated in resize_filter (TODO confirm against the table) — then
    ; clamped to [0, pxmax].
    ;-----------------------------------------------------------------------
    6674 cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
    6675                                 dst_w, h, src_w, dx, mx0, pxmax
    6676    sub          dword mx0m, 4<<14           ; center the 8-tap window
    6677    sub        dword src_wm, 8               ; max loadable src offset
    6678    vpbroadcastd         m5, dxm
    6679    vpbroadcastd         m8, mx0m
    6680    vpbroadcastd         m6, src_wm
    6681 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
    6682    LEA                  r7, $$
    6683 %define base r7-$$
    6684    vpbroadcastd         m3, [base+pd_64]
    6685    vpbroadcastw        xm7, pxmaxm           ; per-pixel clamp limit
    6686    pmaddwd              m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
    6687    pslld                m5, 3                      ; dx*8
    6688    pslld                m6, 14                     ; (src_w-8)<<14 clamp bound
    6689    paddd                m8, m2                     ; mx+[0..7]*dx
    6690 .loop_y:
    6691    xor                  xd, xd
    6692    mova                 m4, m8             ; per-line working version of mx
    6693 .loop_x:
    6694    vpbroadcastd        m10, [base+pd_63]
    6695    pxor                 m2, m2
    6696    pmaxsd               m0, m4, m2
    6697    psrad                m9, m4, 8          ; filter offset (unmasked)
    6698    pminsd               m0, m6             ; iclip(mx, 0, src_w-8)
    6699    psubd                m1, m4, m0         ; pshufb offset
    6700    psrad                m0, 14             ; clipped src_x offset
    6701    psrad                m1, 14             ; pshufb edge_emu offset
    6702    pand                 m9, m10            ; filter offset (masked)
    6703    ; load source pixels
    ; Gather 8 pixels for each of the 8 lanes via scalar extracted offsets
    ; (4 low-lane offsets, then 4 from the extracted high lane).
    6704    movd                r8d, xm0
    6705    pextrd              r9d, xm0, 1
    6706    pextrd             r10d, xm0, 2
    6707    pextrd             r11d, xm0, 3
    6708    vextracti128        xm0, m0, 1
    6709    movu               xm10, [srcq+r8*2]
    6710    movu               xm11, [srcq+r9*2]
    6711    movu               xm12, [srcq+r10*2]
    6712    movu               xm13, [srcq+r11*2]
    6713    movd                r8d, xm0
    6714    pextrd              r9d, xm0, 1
    6715    pextrd             r10d, xm0, 2
    6716    pextrd             r11d, xm0, 3
    6717    vinserti128         m10, [srcq+r8*2], 1
    6718    vinserti128         m11, [srcq+r9*2], 1
    6719    vinserti128         m12, [srcq+r10*2], 1
    6720    vinserti128         m13, [srcq+r11*2], 1
    ; If any lane was clamped (m1 != 0), reshuffle the loaded pixels with
    ; resize_shuf to replicate the edge pixel into out-of-range taps.
    6721    ptest                m1, m1
    6722    jz .filter
    6723    movq                 r9, xm1
    6724    pextrq              r11, xm1, 1
    6725    movsxd               r8, r9d
    6726    sar                  r9, 32
    6727    movsxd              r10, r11d
    6728    sar                 r11, 32
    6729    vextracti128        xm1, m1, 1
    6730    movu               xm14, [base+resize_shuf+8+r8*2]
    6731    movu               xm15, [base+resize_shuf+8+r9*2]
    6732    movu                xm0, [base+resize_shuf+8+r10*2]
    6733    movu                xm2, [base+resize_shuf+8+r11*2]
    6734    movq                 r9, xm1
    6735    pextrq              r11, xm1, 1
    6736    movsxd               r8, r9d
    6737    sar                  r9, 32
    6738    movsxd              r10, r11d
    6739    sar                 r11, 32
    6740    vinserti128         m14, [base+resize_shuf+8+r8*2], 1
    6741    vinserti128         m15, [base+resize_shuf+8+r9*2], 1
    6742    vinserti128          m0, [base+resize_shuf+8+r10*2], 1
    6743    vinserti128          m2, [base+resize_shuf+8+r11*2], 1
    6744    pshufb              m10, m14
    6745    pshufb              m11, m15
    6746    pshufb              m12, m0
    6747    pshufb              m13, m2
    6748 .filter:
    ; Fetch the 8 signed-byte filter taps per lane (indexed by phase),
    ; widen to words, multiply-accumulate against the gathered pixels and
    ; horizontally reduce 4 -> 1 dword per output pixel.
    6749    movd                r8d, xm9
    6750    pextrd              r9d, xm9, 1
    6751    pextrd             r10d, xm9, 2
    6752    pextrd             r11d, xm9, 3
    6753    vextracti128        xm9, m9, 1
    6754    movq               xm14, [base+resize_filter+r8*8]
    6755    movq               xm15, [base+resize_filter+r9*8]
    6756    movq                xm0, [base+resize_filter+r10*8]
    6757    movq                xm2, [base+resize_filter+r11*8]
    6758    movd                r8d, xm9
    6759    pextrd              r9d, xm9, 1
    6760    pextrd             r10d, xm9, 2
    6761    pextrd             r11d, xm9, 3
    6762    movhps             xm14, [base+resize_filter+r8*8]
    6763    movhps             xm15, [base+resize_filter+r9*8]
    6764    movhps              xm0, [base+resize_filter+r10*8]
    6765    movhps              xm2, [base+resize_filter+r11*8]
    6766    pmovsxbw            m14, xm14
    6767    pmovsxbw            m15, xm15
    6768    pmovsxbw             m0, xm0
    6769    pmovsxbw             m2, xm2
    6770    pmaddwd             m10, m14
    6771    pmaddwd             m11, m15
    6772    pmaddwd             m12, m0
    6773    pmaddwd             m13, m2
    6774    phaddd              m10, m11
    6775    phaddd              m12, m13
    6776    phaddd              m10, m12
    6777    psubd               m10, m3, m10       ; (64 - acc): round + un-negate
    6778    psrad               m10, 7
    6779    vextracti128        xm0, m10, 1
    6780    packusdw           xm10, xm0           ; dwords -> unsigned words
    6781    pminsw             xm10, xm7           ; clamp to pxmax
    6782    mova        [dstq+xq*2], xm10
    6783    paddd                m4, m5             ; mx += 8*dx
    6784    add                  xd, 8
    6785    cmp                  xd, dst_wd
    6786    jl .loop_x
    6787    add                dstq, dst_strideq
    6788    add                srcq, src_strideq
    6789    dec                  hd
    6790    jg .loop_y
    6791    RET
   6792 
   6793 %endif ; ARCH_X86_64