tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mc16_sse.asm (327466B)


      1 ; Copyright © 2021, VideoLAN and dav1d authors
      2 ; Copyright © 2021, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
SECTION_RODATA

; dav1d_obmc_masks[] << 9
; Three size classes packed back to back (rows of 8 words); consumers index
; into this with a per-block-size offset (call sites not visible in this chunk).
obmc_masks:     dw     0,     0,  9728,     0, 12800,  7168,  2560,     0
                dw 14336, 11264,  8192,  5632,  3584,  1536,     0,     0
                dw 15360, 13824, 12288, 10752,  9216,  7680,  6144,  5120
                dw  4096,  3072,  2048,  1536,     0,     0,     0,     0
                dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
                dw  9728,  8704,  8192,  7168,  6656,  6144,  5632,  4608
                dw  4096,  3584,  3072,  2560,  2048,  2048,  1536,  1024

; pshufb control vectors (byte indices into the source xmm register).
; Used by the blend/8-tap/resize code later in this file (not all users are
; visible in this chunk).
blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
spel_h_shufA:   db 0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
spel_h_shufB:   db 4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
spel_h_shuf2:   db 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
spel_s_shuf2:   db 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
spel_s_shuf8:   db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
unpckw:         db 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
rescale_mul:    dd 0,  1,  2,  3
resize_shuf:    db 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
                db 8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
; Four 8-byte broadcast groups (0/4/8/12) — dword-lane selector bytes.
bdct_lb_q: times 8 db 0
          times 8 db 4
          times 8 db 8
          times 8 db 12

; Packed constants.  pw_* = packed 16-bit words, pd_* = packed 32-bit dwords,
; pq_* = packed 64-bit qwords.  Several entries deliberately define only half
; an xmm register and rely on the *following* symbol starting with the same
; value, so that a full 16-byte load still sees a uniform vector:
pw_2:             times 8 dw 2
pw_16:            times 4 dw 16     ; completed by prep_mul's first 4 words (also 16)
prep_mul:         times 4 dw 16
                 times 8 dw 4
pw_64:            times 8 dw 64
pw_256:           times 8 dw 256    ; bytes 00,01 repeated — also used as a
                                    ; pshufb mask that broadcasts word 0
pw_2048:          times 4 dw 2048   ; completed by bidir_mul's first 4 words
bidir_mul:        times 4 dw 2048
pw_8192:          times 8 dw 8192
pw_27615:         times 8 dw 27615
pw_32766:         times 8 dw 32766
pw_m512:          times 8 dw -512
pd_63:            times 4 dd 63
pd_64:            times 4 dd 64
pd_512:           times 4 dd 512
pd_2560:          times 2 dd 2560
pd_8704:          times 2 dd 8704
pd_m524256:       times 4 dd -524256 ; -8192 << 6 + 32
pd_0x3ff:         times 4 dd 0x3ff
pd_0x4000:        times 4 dd 0x4000
pq_0x400000:      times 2 dq 0x400000
pq_0x40000000:    times 2 dq 0x40000000
pd_65538:         times 2 dd 65538

; Two-entry bitdepth selectors: first half for 10-bit, second half for 12-bit.
; Loaded with movddup at [sym + (bitdepth_max >> 11) * 8] (0x3ff>>11 == 0,
; 0xfff>>11 == 1) — see e.g. the .h path of put_bilin below.
put_bilin_h_rnd:  times 4 dw 8
                 times 4 dw 10
s_8tap_h_rnd:     times 2 dd 2
                 times 2 dd 8
put_s_8tap_v_rnd: times 2 dd 512
                 times 2 dd 128
s_8tap_h_sh:      dd 2, 4
put_s_8tap_v_sh:  dd 10, 8
bidir_rnd:        times 4 dw -16400
                 times 4 dw -16388
put_8tap_h_rnd:   dd 34, 34, 40, 40
prep_8tap_1d_rnd: times 2 dd     8 - (8192 <<  4)
prep_8tap_2d_rnd: times 4 dd    32 - (8192 <<  5)

warp8x8_shift:    dd 11, 13
warp8x8_rnd1:     dd 1024, 1024, 4096, 4096
warp8x8_rnd2:     times 4 dw 4096
                 times 4 dw 16384
warp8x8t_rnd:     times 2 dd 16384 - (8192 << 15)
     98 
; BIDIR_JMP_TABLE func, isa, w0, w1, ...
; Emits a jump table of 32-bit offsets, one per listed width, pointing at the
; .w%3 labels of mangled function <func>_16bpc_<isa>.  Offsets are stored
; relative to the table itself (%%base).  The exported symbol
; <func>_<isa>_table is biased by -2*<first width> so the dispatch code can
; index it directly with its transformed width (lookup sites are elsewhere in
; this file — not visible in this chunk).
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2                         ; one entry per width argument
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro
    109 
; Dispatch tables for the bi-directional averaging / masking / blending
; functions.  blend_v and blend_h additionally support width 2.
BIDIR_JMP_TABLE avg,        ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      ssse3,    4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    ssse3, 2, 4, 8, 16, 32, 64, 128
    119 
; BASE_JMP_TABLE name, isa, w0, w1, ...
; Emits 16-bit offsets of the <name>_<isa>_w<N> labels relative to
; <name>_<isa> (which is %xdefine'd below to the .put/.prep entry of the
; bilin function, so token-pasting _w%3 resolves to its .put_wN/.prep_wN
; labels).  The exported <name>_<isa>_table symbol is biased by -<first width>
; so that indexing with tzcnt(w)*2 lands on entry 0 for the smallest width
; (see the .put dispatch in put_bilin_16bpc below).
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2                         ; one dw entry per width argument
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro
    129 
; Base symbols the tables (and the "base" %defines inside the functions) are
; measured against: the .put/.prep dispatch labels of the bilin functions.
%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)

; put supports width 2; prep starts at width 4.
BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128
    135 
; SCALED_JMP_TABLE func, isa, w0, w1, ...
; Emits three per-width 16-bit offset tables for <func>_16bpc_<isa>:
;   <func>_<isa>_table      -> .w<N>      labels (generic scaling)
;   <func>_<isa>_dy1_table  -> .dy1_w<N>  labels (vertical step == 1024)
;   <func>_<isa>_dy2_table  -> .dy2_w<N>  labels (vertical step == 2048)
; Each table symbol is biased by -<first width>, matching BASE_JMP_TABLE.
; The %rotate 2 after each %rep re-aligns the argument list: the loop rotated
; by %0-2, so two more rotations bring the width arguments back to the front.
%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro
    159 
SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3,   4, 8, 16, 32, 64, 128

; Shared C-side tables (defined in dav1d's C code).
cextern mc_subpel_filters
; Bias by -8 (one 8-tap filter row) so lookups can index with a 1-based
; filter id without an extra subtraction at the call sites.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter
    168 
SECTION .text

; Scratch register t0 for the bilin code below: r7 on the SysV AMD64 ABI,
; r5 elsewhere (x86-32 / Win64) — chosen so it does not alias any register
; that carries a named cglobal argument on that ABI.
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif
    176 
INIT_XMM ssse3
; void put_bilin_16bpc(pixel *dst, ptrdiff_t ds, const pixel *src, ptrdiff_t ss,
;                      int w, int h, int mx, int my, int bitdepth_max)
; Bilinear "put" for 16 bpc pixels (10- or 12-bit, selected via r8m).
; mx (r6m) / my (r7m) are the 4-bit subpel fractions; all-zero fractions fall
; through to a plain width-dispatched copy (.put), otherwise the .h / .v / .hv
; filter paths are taken.  Strides (ds, ss) are in bytes.
cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
%define base t0-put_ssse3              ; t0 anchors all table/constant loads
    mov                mxyd, r6m ; mx
    LEA                  t0, put_ssse3
    movifnidn            wd, wm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r7m ; my
    test               mxyd, mxyd
    jnz .v
.put:
    ; No filtering: dispatch on log2(w) through the BASE_JMP_TABLE
    ; (table biased by -2 so tzcnt(2)*2 hits entry 0).
    tzcnt                wd, wd
    movzx                wd, word [base+put_ssse3_table+wq*2]
    add                  wq, t0
    movifnidn            hd, hm
    jmp                  wq
.put_w2:
    ; 2 pixels (4 bytes) x 2 rows per iteration, via GPRs.
    mov                 r4d, [srcq+ssq*0]
    mov                 r6d, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r4d
    mov        [dstq+dsq*1], r6d
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w2
    RET
.put_w4:
    ; 4 pixels (8 bytes) x 2 rows per iteration.
    movq                 m0, [srcq+ssq*0]
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq       [dstq+dsq*0], m0
    movq       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w4
    RET
.put_w8:
    ; One full xmm (8 pixels) x 2 rows; unaligned loads, aligned stores.
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w8
    RET
.put_w16:
    ; 2 xmm per row x 2 rows per iteration.
    movu                 m0, [srcq+ssq*0+16*0]
    movu                 m1, [srcq+ssq*0+16*1]
    movu                 m2, [srcq+ssq*1+16*0]
    movu                 m3, [srcq+ssq*1+16*1]
    lea                srcq, [srcq+ssq*2]
    mova  [dstq+dsq*0+16*0], m0
    mova  [dstq+dsq*0+16*1], m1
    mova  [dstq+dsq*1+16*0], m2
    mova  [dstq+dsq*1+16*1], m3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w16
    RET
.put_w32:
    ; 4 xmm = one 32-pixel row per iteration.
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    add                srcq, ssq
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w32
    RET
.put_w64:
    ; 8 xmm = one 64-pixel row, in two 4-register batches.
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    add                srcq, ssq
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    mova        [dstq+16*6], m2
    mova        [dstq+16*7], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w64
    RET
.put_w128:
    ; 16 xmm = one 128-pixel row per iteration.  Pointers are pre-biased by
    ; +16*8 so all 16 offsets fit in signed 8-bit displacements (-128..+127).
    add                srcq, 16*8
    add                dstq, 16*8
.put_w128_loop:
    movu                 m0, [srcq-16*8]
    movu                 m1, [srcq-16*7]
    movu                 m2, [srcq-16*6]
    movu                 m3, [srcq-16*5]
    mova        [dstq-16*8], m0
    mova        [dstq-16*7], m1
    mova        [dstq-16*6], m2
    mova        [dstq-16*5], m3
    movu                 m0, [srcq-16*4]
    movu                 m1, [srcq-16*3]
    movu                 m2, [srcq-16*2]
    movu                 m3, [srcq-16*1]
    mova        [dstq-16*4], m0
    mova        [dstq-16*3], m1
    mova        [dstq-16*2], m2
    mova        [dstq-16*1], m3
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    add                srcq, ssq
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    mova        [dstq+16*6], m2
    mova        [dstq+16*7], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w128_loop
    RET
.h:
    ; Horizontal-only: dst = ((16-mx)*src[x] + mx*src[x+1] + rnd) >> 4
    ; m4 = 16-mx broadcast, m5 = mx broadcast (pw_256's 00,01 byte pattern
    ; doubles as a pshufb mask that broadcasts word 0).
    movd                 m5, mxyd
    mov                mxyd, r7m ; my
    mova                 m4, [base+pw_16]
    pshufb               m5, [base+pw_256]
    psubw                m4, m5
    test               mxyd, mxyd
    jnz .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    mov                 r6d, r8m ; bitdepth_max
    shr                 r6d, 11  ; 0 for 10-bit (0x3ff), 1 for 12-bit (0xfff)
    movddup              m3, [base+put_bilin_h_rnd+r6*8]
    movifnidn            hd, hm
    ; Width dispatch on wd-8: >0 -> w16+, ==0 -> w8, ==-4 -> w4, else w2.
    sub                  wd, 8
    jg .h_w16
    je .h_w8
    cmp                  wd, -4
    je .h_w4
.h_w2:
    ; Two rows packed into one xmm; psrlq by 16 aligns src[x+1] lanes.
    movq                 m1, [srcq+ssq*0]
    movhps               m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmullw               m0, m4, m1
    psrlq                m1, 16
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 4
    movd       [dstq+dsq*0], m0
    punpckhqdq           m0, m0
    movd       [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2
    RET
.h_w4:
    ; Two rows per xmm; the +2-byte loads provide the src[x+1] operand.
    movq                 m0, [srcq+ssq*0]
    movhps               m0, [srcq+ssq*1]
    movq                 m1, [srcq+ssq*0+2]
    movhps               m1, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 4
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4
    RET
.h_w8:
    ; One full xmm per row, two rows per iteration.
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    ; Wide rows: point past the row end and walk a negative offset (r6)
    ; upward, 16 pixels (two xmm) per inner iteration.  Note wd was
    ; pre-decremented by 8 above, so the last 8 columns of the row are
    ; intentionally excluded from the negative range and handled by the
    ; final iteration's second xmm.
    lea                srcq, [srcq+wq*2]
    lea                dstq, [dstq+wq*2]
    neg                  wq
.h_w16_loop0:
    mov                  r6, wq
.h_w16_loop:
    movu                 m0, [srcq+r6*2+ 0]
    movu                 m1, [srcq+r6*2+ 2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    movu                 m1, [srcq+r6*2+16]
    movu                 m2, [srcq+r6*2+18]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova   [dstq+r6*2+16*0], m0
    mova   [dstq+r6*2+16*1], m1
    add                  r6, 16
    jl .h_w16_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w16_loop0
    RET
.v:
    ; Vertical-only: my<<11 puts the 4-bit fraction in Q15 form so
    ; pmulhrsw(b-a, my<<11) computes round((b-a)*my/16); result = a + that.
    shl                mxyd, 11
    movd                 m5, mxyd
    pshufb               m5, [base+pw_256] ; broadcast weight to all words
    movifnidn            hd, hm
    cmp                  wd, 4
    jg .v_w8
    je .v_w4
.v_w2:
    ; m0 carries the previous row across iterations; two output rows/iter.
    movd                 m0, [srcq+ssq*0]
.v_w2_loop:
    movd                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklqdq           m2, m0, m1       ; m2 = rows 0|1
    movd                 m0, [srcq+ssq*0]
    punpcklqdq           m1, m0           ; m1 = rows 1|2
    psubw                m1, m2
    pmulhrsw             m1, m5
    paddw                m1, m2
    movd       [dstq+dsq*0], m1
    punpckhqdq           m1, m1
    movd       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    ; Same scheme as .v_w2 with 8-byte rows.
    movq                 m0, [srcq+ssq*0]
.v_w4_loop:
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklqdq           m2, m0, m1
    movq                 m0, [srcq+ssq*0]
    punpcklqdq           m1, m0
    psubw                m1, m2
    pmulhrsw             m1, m5
    paddw                m1, m2
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    ; Process 8-pixel columns, full height per column.
%if ARCH_X86_64
%if WIN64
    push                 r7               ; r7 is callee-saved on Win64
%endif
    ; Pack the outer-loop state into r6d: high bits = (w/8)<<8 column
    ; countdown (via wd<<5 and the 1<<8 decrement), low byte = h.
    shl                  wd, 5
    mov                  r7, srcq         ; column base pointers
    lea                 r6d, [wq+hq-256]
    mov                  r4, dstq
%else
    mov                  r6, srcq         ; x86-32: only src base fits in a reg;
%endif                                    ; dst goes through dstmp (stack)
.v_w8_loop0:
    movu                 m0, [srcq+ssq*0]
.v_w8_loop:
    movu                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    psubw                m1, m3, m0
    pmulhrsw             m1, m5
    paddw                m1, m0           ; row n   -> output row 0
    movu                 m0, [srcq+ssq*0]
    psubw                m2, m0, m3
    pmulhrsw             m2, m5
    paddw                m2, m3           ; row n+1 -> output row 1
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
%if ARCH_X86_64
    ; Advance to the next 8-pixel column; restore h from r6's low byte.
    add                  r7, 16
    add                  r4, 16
    movzx                hd, r6b
    mov                srcq, r7
    mov                dstq, r4
    sub                 r6d, 1<<8
%else
    mov                dstq, dstmp
    add                  r6, 16
    mov                  hd, hm
    add                dstq, 16
    mov                srcq, r6
    mov               dstmp, dstq
    sub                  wd, 8
%endif
    jg .v_w8_loop0
%if WIN64
    pop                 r7
%endif
    RET
.hv:
    ; 2-D: horizontal pass first, (h-filter + 2) >> 2 intermediate, then a
    ; vertical pmulhw lerp and a final pmulhrsw rounding shift.
    ; m3 = pw_2 (h rounding), m6 = my<<11 weight, m7 = final rounding mul:
    ; pw_8192 for 12-bit; for 10-bit the h weights are scaled up by 4 and
    ; pw_2048 is used instead (one less bit of double-rounding error).
    WIN64_SPILL_XMM       8
    shl                mxyd, 11
    mova                 m3, [base+pw_2]
    movd                 m6, mxyd
    mova                 m7, [base+pw_8192]
    pshufb               m6, [base+pw_256]
    test          dword r8m, 0x800        ; bit set iff bitdepth_max == 0xfff
    jnz .hv_12bpc
    psllw                m4, 2
    psllw                m5, 2
    mova                 m7, [base+pw_2048]
.hv_12bpc:
    movifnidn            hd, hm
    cmp                  wd, 4
    jg .hv_w8
    je .hv_w4
.hv_w2:
    ; Prime m0 with the h-filtered row 0 (pshufhw q0321 supplies src[x+1]).
    movddup              m0, [srcq+ssq*0]
    pshufhw              m1, m0, q0321
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 2
.hv_w2_loop:
    movq                 m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps               m2, [srcq+ssq*0]
    pmullw               m1, m4, m2
    psrlq                m2, 16
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m1, 2            ; 1 _ 2 _
    shufpd               m2, m0, m1, 0x01 ; 0 _ 1 _
    mova                 m0, m1           ; carry newest row to next iter
    psubw                m1, m2
    paddw                m1, m1           ; x2 so pmulhw keeps a precision bit
    pmulhw               m1, m6
    paddw                m1, m2
    pmulhrsw             m1, m7           ; final round + shift via multiply
    movd       [dstq+dsq*0], m1
    punpckhqdq           m1, m1
    movd       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    ; Same as .hv_w2 but src[x+1] comes from +2-byte loads.
    movddup              m0, [srcq+ssq*0]
    movddup              m1, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 2
.hv_w4_loop:
    movq                 m1, [srcq+ssq*1]
    movq                 m2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    movhps               m1, [srcq+ssq*0]
    movhps               m2, [srcq+ssq*0+2]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m1, 2            ; 1 2
    shufpd               m2, m0, m1, 0x01 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    paddw                m1, m1
    pmulhw               m1, m6
    paddw                m1, m2
    pmulhrsw             m1, m7
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    ; 8-pixel columns, full height each; same packed column/height counter
    ; trick as .v_w8 (r6d high bits = column countdown, low byte = h).
%if ARCH_X86_64
%if WIN64
    push                 r7
%endif
    shl                  wd, 5
    lea                 r6d, [wq+hq-256]
    mov                  r4, srcq
    mov                  r7, dstq
%else
    mov                  r6, srcq
%endif
.hv_w8_loop0:
    ; h-filter the first row of the column into m0.
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 2
.hv_w8_loop:
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m1, 2            ; h-filtered row n+1
    psubw                m2, m1, m0
    paddw                m2, m2
    pmulhw               m2, m6
    paddw                m2, m0
    pmulhrsw             m2, m7
    mova       [dstq+dsq*0], m2
    movu                 m0, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m2, m5
    paddw                m0, m3
    paddw                m0, m2
    psrlw                m0, 2            ; h-filtered row n+2 (kept in m0)
    psubw                m2, m0, m1
    paddw                m2, m2
    pmulhw               m2, m6
    paddw                m2, m1
    pmulhrsw             m2, m7
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
%if ARCH_X86_64
    add                  r4, 16
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8
%else
    mov                dstq, dstmp
    add                  r6, 16
    mov                  hd, hm
    add                dstq, 16
    mov                srcq, r6
    mov               dstmp, dstq
    sub                  wd, 8
%endif
    jg .hv_w8_loop0
%if WIN64
    pop                  r7
%endif
    RET
    665 
    666 cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
    667 %define base r6-prep_ssse3
    668    movifnidn          mxyd, r5m ; mx
    669    LEA                  r6, prep_ssse3
    670    movifnidn            hd, hm
    671    test               mxyd, mxyd
    672    jnz .h
    673    mov                mxyd, r6m ; my
    674    test               mxyd, mxyd
    675    jnz .v
    676 .prep:
    677    tzcnt                wd, wd
    678    movzx                wd, word [base+prep_ssse3_table+wq*2]
    679    mov                 r5d, r7m ; bitdepth_max
    680    mova                 m5, [base+pw_8192]
    681    add                  wq, r6
    682    shr                 r5d, 11
    683    movddup              m4, [base+prep_mul+r5*8]
    684    lea            stride3q, [strideq*3]
    685    jmp                  wq
    686 .prep_w4:
    687    movq                 m0, [srcq+strideq*0]
    688    movhps               m0, [srcq+strideq*1]
    689    movq                 m1, [srcq+strideq*2]
    690    movhps               m1, [srcq+stride3q ]
    691    lea                srcq, [srcq+strideq*4]
    692    pmullw               m0, m4
    693    pmullw               m1, m4
    694    psubw                m0, m5
    695    psubw                m1, m5
    696    mova        [tmpq+16*0], m0
    697    mova        [tmpq+16*1], m1
    698    add                tmpq, 16*2
    699    sub                  hd, 4
    700    jg .prep_w4
    701    RET
    702 .prep_w8:
    703    movu                 m0, [srcq+strideq*0]
    704    movu                 m1, [srcq+strideq*1]
    705    movu                 m2, [srcq+strideq*2]
    706    movu                 m3, [srcq+stride3q ]
    707    lea                srcq, [srcq+strideq*4]
    708    REPX     {pmullw x, m4}, m0, m1, m2, m3
    709    REPX     {psubw  x, m5}, m0, m1, m2, m3
    710    mova        [tmpq+16*0], m0
    711    mova        [tmpq+16*1], m1
    712    mova        [tmpq+16*2], m2
    713    mova        [tmpq+16*3], m3
    714    add                tmpq, 16*4
    715    sub                  hd, 4
    716    jg .prep_w8
    717    RET
    718 .prep_w16:
    719    movu                 m0, [srcq+strideq*0+16*0]
    720    movu                 m1, [srcq+strideq*0+16*1]
    721    movu                 m2, [srcq+strideq*1+16*0]
    722    movu                 m3, [srcq+strideq*1+16*1]
    723    lea                srcq, [srcq+strideq*2]
    724    REPX     {pmullw x, m4}, m0, m1, m2, m3
    725    REPX     {psubw  x, m5}, m0, m1, m2, m3
    726    mova        [tmpq+16*0], m0
    727    mova        [tmpq+16*1], m1
    728    mova        [tmpq+16*2], m2
    729    mova        [tmpq+16*3], m3
    730    add                tmpq, 16*4
    731    sub                  hd, 2
    732    jg .prep_w16
    733    RET
    734 .prep_w32:
    735    movu                 m0, [srcq+16*0]
    736    movu                 m1, [srcq+16*1]
    737    movu                 m2, [srcq+16*2]
    738    movu                 m3, [srcq+16*3]
    739    add                srcq, strideq
    740    REPX     {pmullw x, m4}, m0, m1, m2, m3
    741    REPX     {psubw  x, m5}, m0, m1, m2, m3
    742    mova        [tmpq+16*0], m0
    743    mova        [tmpq+16*1], m1
    744    mova        [tmpq+16*2], m2
    745    mova        [tmpq+16*3], m3
    746    add                tmpq, 16*4
    747    dec                  hd
    748    jg .prep_w32
    749    RET
    750 .prep_w64:
    751    movu                 m0, [srcq+16*0]
    752    movu                 m1, [srcq+16*1]
    753    movu                 m2, [srcq+16*2]
    754    movu                 m3, [srcq+16*3]
; NOTE(review): this span begins inside the w64 loop of a bilin "prep"
; routine whose prologue is above this chunk; m4/m5 are loaded there
; (presumably the bilin scale factor and the output bias constant --
; confirm against the function prologue upstream).
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    add                srcq, strideq
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*4], m0
    mova        [tmpq+16*5], m1
    mova        [tmpq+16*6], m2
    mova        [tmpq+16*7], m3
    add                tmpq, 16*8       ; 64 output words per row
    dec                  hd
    jg .prep_w64
    RET
.prep_w128:
    ; one 128-pixel row per iteration, handled as four 4-register groups;
    ; tmpq is advanced by the whole row (16*16 bytes) mid-loop, so the
    ; last two groups store at negative offsets from the updated tmpq
    movu                 m0, [srcq+16* 0]
    movu                 m1, [srcq+16* 1]
    movu                 m2, [srcq+16* 2]
    movu                 m3, [srcq+16* 3]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    movu                 m0, [srcq+16* 4]
    movu                 m1, [srcq+16* 5]
    movu                 m2, [srcq+16* 6]
    movu                 m3, [srcq+16* 7]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*4], m0
    mova        [tmpq+16*5], m1
    mova        [tmpq+16*6], m2
    mova        [tmpq+16*7], m3
    movu                 m0, [srcq+16* 8]
    movu                 m1, [srcq+16* 9]
    movu                 m2, [srcq+16*10]
    movu                 m3, [srcq+16*11]
    add                tmpq, 16*16      ; advance a full output row up front
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq-16*8], m0
    mova        [tmpq-16*7], m1
    mova        [tmpq-16*6], m2
    mova        [tmpq-16*5], m3
    movu                 m0, [srcq+16*12]
    movu                 m1, [srcq+16*13]
    movu                 m2, [srcq+16*14]
    movu                 m3, [srcq+16*15]
    add                srcq, strideq
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq-16*4], m0
    mova        [tmpq-16*3], m1
    mova        [tmpq-16*2], m2
    mova        [tmpq-16*1], m3
    dec                  hd
    jg .prep_w128
    RET
.h:
    ; horizontal-only bilin prep:
    ;   out = ((16-mx)*p[x] + mx*p[x+1] - bias) >> 2
    movd                 m4, mxyd
    mov                mxyd, r6m ; my
    mova                 m3, [base+pw_16]
    pshufb               m4, [base+pw_256]    ; broadcast mx to all 8 words
    mova                 m5, [base+pw_32766]  ; bias constant
    psubw                m3, m4               ; m3 = 16-mx, m4 = mx
    test          dword r7m, 0x800            ; 12bpc flag in bitdepth_max
    jnz .h_12bpc
    psllw                m3, 2                ; pre-scale taps for 10-bit so
    psllw                m4, 2                ; the same >>2 applies below
.h_12bpc:
    test               mxyd, mxyd
    jnz .hv                                   ; my also nonzero -> 2-D path
    sub                  wd, 8
    je .h_w8
    jg .h_w16
.h_w4:
    ; two 4-pixel rows per iteration, packed into one register
    movq                 m0, [srcq+strideq*0]
    movhps               m0, [srcq+strideq*1]
    movq                 m1, [srcq+strideq*0+2]   ; same rows, +1 pixel
    movhps               m1, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5   ; subtract bias (pw_32766)
    paddw                m0, m1
    psraw                m0, 2
    mova             [tmpq], m0
    add                tmpq, 16
    sub                  hd, 2
    jg .h_w4
    RET
.h_w8:
    ; two full 8-pixel rows per iteration
    movu                 m0, [srcq+strideq*0]
    movu                 m1, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 16*2
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    ; w >= 16: iterate 16 pixels at a time using a negative offset counter
    ; (r6 runs from -w up to 0), one source row per outer iteration
    lea                srcq, [srcq+wq*2]
    neg                  wq
.h_w16_loop0:
    mov                  r6, wq
.h_w16_loop:
    movu                 m0, [srcq+r6*2+ 0]
    movu                 m1, [srcq+r6*2+ 2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    movu                 m1, [srcq+r6*2+16]
    movu                 m2, [srcq+r6*2+18]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 16*2
    add                  r6, 16
    jl .h_w16_loop
    add                srcq, strideq
    dec                  hd
    jg .h_w16_loop0
    RET
.v:
    ; vertical-only bilin prep:
    ;   out = ((16-my)*row[y] + my*row[y+1] - bias) >> 2
    movd                 m4, mxyd
    mova                 m3, [base+pw_16]
    pshufb               m4, [base+pw_256]    ; broadcast my
    mova                 m5, [base+pw_32766]  ; bias constant
    psubw                m3, m4               ; m3 = 16-my, m4 = my
    test          dword r7m, 0x800            ; 12bpc flag
    jnz .v_12bpc
    psllw                m3, 2                ; pre-scale taps for 10-bit
    psllw                m4, 2
.v_12bpc:
    cmp                  wd, 8
    je .v_w8
    jg .v_w16
.v_w4:
    movq                 m0, [srcq+strideq*0]
.v_w4_loop:
    ; m0 always carries the last row loaded, so each iteration only needs
    ; two new row loads for two output rows
    movq                 m2, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    punpcklqdq           m1, m0, m2 ; 0 1
    movq                 m0, [srcq+strideq*0]
    punpcklqdq           m2, m0     ; 1 2
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m1, 2
    mova             [tmpq], m1
    add                tmpq, 16
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movu                 m0, [srcq+strideq*0]
.v_w8_loop:
    movu                 m2, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    pmullw               m0, m3
    pmullw               m1, m4, m2   ; m2 is preserved as the next row0
    psubw                m0, m5
    paddw                m1, m0
    movu                 m0, [srcq+strideq*0]
    psraw                m1, 2
    pmullw               m2, m3
    mova        [tmpq+16*0], m1
    pmullw               m1, m4, m0
    psubw                m2, m5
    paddw                m1, m2
    psraw                m1, 2
    mova        [tmpq+16*1], m1
    add                tmpq, 16*2
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w16:
    ; w >= 16: processed in 16-pixel-wide column strips; on x86-64 the
    ; remaining strip count and the row count are packed into r6d
    ; (h in the low byte, strips above bit 8)
%if WIN64
    push                 r7
%endif
    mov                  r5, srcq
%if ARCH_X86_64
    lea                 r6d, [wq*4-32]
    mov                  wd, wd
    lea                 r6d, [hq+r6*8]  ; r6d = (w*4-32)*8 + h
    mov                  r7, tmpq
%else
    mov                 r6d, wd
%endif
.v_w16_loop0:
    movu                 m0, [srcq+strideq*0]
.v_w16_loop:
    movu                 m2, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    pmullw               m0, m3
    pmullw               m1, m4, m2
    psubw                m0, m5
    paddw                m1, m0
    movu                 m0, [srcq+strideq*0]
    psraw                m1, 2
    pmullw               m2, m3
    mova        [tmpq+wq*0], m1      ; output rows are w words apart
    pmullw               m1, m4, m0
    psubw                m2, m5
    paddw                m1, m2
    psraw                m1, 2
    mova        [tmpq+wq*2], m1
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .v_w16_loop
%if ARCH_X86_64
    add                  r5, 16          ; next 16-pixel column strip
    add                  r7, 16
    movzx                hd, r6b         ; reload h from packed counter
    mov                srcq, r5
    mov                tmpq, r7
    sub                 r6d, 1<<8        ; one fewer strip remaining
%else
    ; x86-32: tmp pointer is round-tripped through its stack slot
    mov                tmpq, tmpmp
    add                  r5, 16
    mov                  hd, hm
    add                tmpq, 16
    mov                srcq, r5
    mov               tmpmp, tmpq
    sub                 r6d, 8
%endif
    jg .v_w16_loop0
%if WIN64
    pop                  r7
%endif
    RET
.hv:
    ; 2-D bilin prep: horizontal pass identical to .h (result at 2
    ; fractional bits), then vertical blend between consecutive filtered
    ; rows via pmulhrsw with my << 11, i.e.
    ;   out = a + ((b - a) * my + 8 >> 4)
    ; (pmulhrsw computes (x*w*2 + 0x8000) >> 16, so w = my<<11 gives
    ;  (x*my + 8) >> 4)
    WIN64_SPILL_XMM       7
    shl                mxyd, 11
    movd                 m6, mxyd
    pshufb               m6, [base+pw_256]    ; broadcast my<<11
    cmp                  wd, 8
    je .hv_w8
    jg .hv_w16
.hv_w4:
    ; prime with the first (top) horizontally-filtered row in both halves
    movddup              m0, [srcq+strideq*0]
    movddup              m1, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    psraw                m0, 2
.hv_w4_loop:
    movq                 m1, [srcq+strideq*1]
    movq                 m2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    movhps               m1, [srcq+strideq*0]
    movhps               m2, [srcq+strideq*0+2]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m1, 2            ; 1 2
    shufpd               m2, m0, m1, 0x01 ; 0 1
    mova                 m0, m1           ; keep rows 1 2 for next iter
    psubw                m1, m2           ; d = cur - prev
    pmulhrsw             m1, m6
    paddw                m1, m2
    mova             [tmpq], m1
    add                tmpq, 16
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    ; prime with the first filtered row
    movu                 m0, [srcq+strideq*0]
    movu                 m1, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    psraw                m0, 2
.hv_w8_loop:
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m1, 2
    psubw                m2, m1, m0  ; d = cur - prev
    pmulhrsw             m2, m6
    paddw                m2, m0
    mova        [tmpq+16*0], m2
    movu                 m0, [srcq+strideq*0]
    movu                 m2, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m2, m4
    psubw                m0, m5
    paddw                m0, m2
    psraw                m0, 2
    psubw                m2, m0, m1
    pmulhrsw             m2, m6
    paddw                m2, m1
    mova        [tmpq+16*1], m2
    add                tmpq, 16*2
    sub                  hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
    ; column-strip tiling, same packed w/h counter scheme as .v_w16
%if WIN64
    push                 r7
%endif
    mov                  r5, srcq
%if ARCH_X86_64
    lea                 r6d, [wq*4-32]
    mov                  wd, wd
    lea                 r6d, [hq+r6*8]
    mov                  r7, tmpq
%else
    mov                 r6d, wd
%endif
.hv_w16_loop0:
    ; prime each strip with its top filtered row
    movu                 m0, [srcq+strideq*0]
    movu                 m1, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    psraw                m0, 2
.hv_w16_loop:
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m1, 2
    psubw                m2, m1, m0
    pmulhrsw             m2, m6
    paddw                m2, m0
    mova        [tmpq+wq*0], m2
    movu                 m0, [srcq+strideq*0]
    movu                 m2, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m2, m4
    psubw                m0, m5
    paddw                m0, m2
    psraw                m0, 2
    psubw                m2, m0, m1
    pmulhrsw             m2, m6
    paddw                m2, m1
    mova        [tmpq+wq*2], m2
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .hv_w16_loop
%if ARCH_X86_64
    add                  r5, 16
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r5
    mov                tmpq, r7
    sub                 r6d, 1<<8
%else
    mov                tmpq, tmpmp
    add                  r5, 16
    mov                  hd, hm
    add                tmpq, 16
    mov                srcq, r5
    mov               tmpmp, tmpq
    sub                 r6d, 8
%endif
    jg .hv_w16_loop0
%if WIN64
    pop                  r7
%endif
    RET
   1157 
; int8_t subpel_filters[5][15][8]
; The FILTER_* constants pack two byte offsets into the subpel_filters
; table (one per 16-bit half); after the imul-by-0x010101 trick in the
; function entry, the byte fields of mxd/myd select the filter-set row
; for the different tap-count paths -- see the "6tap_h, mx, 4tap_h"
; annotations below.
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

; FN prefix, type, type_h, type_v[, jmp_to]
; Emits a thin public entry point that loads the h/v filter-set
; selectors into t0d/t1d and tail-jumps into the shared implementation.
; With only 4 args (the last variant emitted) no jump is generated and
; the entry falls through into the cglobal that follows the macro use.
%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d
%else
    mov                 t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

; temp-register assignment differs per ABI
%if ARCH_X86_32
DECLARE_REG_TMP 1, 2, 6
%elif WIN64
DECLARE_REG_TMP 4, 5, 8
%else
DECLARE_REG_TMP 7, 8, 8
%endif

; smooth/regular combinations all use the 6-tap kernel below; the
; 4-argument "regular" variant falls through into put_6tap_16bpc
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR
; put_6tap_16bpc(dst, dst_stride, src, src_stride, w, h, mx, my,
;                bitdepth_max in r8m)
; Subpel motion compensation, 6-tap filters, 10/12-bit pixels.
cglobal put_6tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my
    %define            base  t2-put_ssse3
%if ARCH_X86_32
    %define             mxb  r0b
    %define             mxd  r0
    %define             mxq  r0
    %define             myb  r1b
    %define             myd  r1
    %define             myq  r1
%endif
    ; replicate the subpel position into three byte lanes, then add the
    ; packed filter-set selector from FN: the separate byte/word fields
    ; of mxd/myd now each index a different filter-table variant
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 6tap_v, my, 4tap_v
    LEA                  t2, put_ssse3
    movifnidn            wd, wm
    movifnidn          srcq, srcmp
    movifnidn           ssq, ssmp
    movifnidn            hd, hm
    test                mxd, 0xf00      ; any horizontal subpel fraction?
    jnz .h
    test                myd, 0xf00      ; any vertical subpel fraction?
    jnz .v
.put:
    ; no fractional offset in either direction: dispatch through the
    ; width-indexed jump table to plain copy code (defined elsewhere)
    tzcnt                wd, wd
    movzx                wd, word [base+put_ssse3_table+wq*2]
    movifnidn          dstq, dstmp
    movifnidn           dsq, dsmp
    add                  wq, t2
%if WIN64
    pop                  r8              ; NOTE(review): presumably undoes the
    pop                  r7              ; cglobal pushes before entering the
%endif                                   ; shared put_ssse3 copy code -- confirm
    jmp                  wq
.h_w2:
    ; entered from .h_w4 with the 4-tap filter already loaded in m3
    mova                 m2, [base+spel_h_shuf2]
    pshufd               m3, m3, q2121
.h_w2_loop:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m0, m2
    pshufb               m1, m2
    pmaddwd              m0, m3
    pmaddwd              m1, m3
    phaddd               m0, m1
    paddd                m0, m4
    psrad                m0, 6
    packssdw             m0, m0
    pxor                 m1, m1
    pminsw               m0, m5          ; clamp to pixel max (from r8m)
    pmaxsw               m0, m1          ; clamp to 0
    movd       [dstq+dsq*0], m0
    pshuflw              m0, m0, q3232
    movd       [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    movzx               mxd, mxb         ; low byte = 4-tap filter index
    lea                srcq, [srcq-2]
    movq                 m3, [base+subpel_filters+mxq*8]
    movifnidn          dstq, dstmp
    punpcklbw            m3, m3
    psraw                m3, 8 ; sign-extend
    ; flags still come from "sub wd, 4" in .h (only flag-neutral
    ; mov/lea/SIMD instructions in between), so w < 4 means w == 2
    jl .h_w2
    WIN64_SPILL_XMM       9
    mova                 m7, [base+spel_h_shufA]
%if ARCH_X86_32
    %define              m8  [base+spel_h_shufB]
%else
    mova                 m8, [base+spel_h_shufB]
%endif
    pshufd               m2, m3, q1111   ; taps 2-3
    pshufd               m3, m3, q2222   ; taps 4-5
.h_w4_loop:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m6, m0, m7 ; 0 1 1 2 2 3 3 4
    pmaddwd              m6, m2
    pshufb               m0, m8     ; 2 3 3 4 4 5 5 6
    pmaddwd              m0, m3
    paddd                m0, m6
    pshufb               m6, m1, m7
    pmaddwd              m6, m2
    pshufb               m1, m8
    pmaddwd              m1, m3
    paddd                m0, m4     ; + rounding constant
    paddd                m6, m4
    paddd                m1, m6
    psrad                m0, 6
    psrad                m1, 6
    packssdw             m0, m1
    pxor                 m1, m1
    pminsw               m0, m5
    pmaxsw               m0, m1
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4_loop
    RET
.h:
    ; horizontal-only 6-tap path (4-tap for w <= 4, via .h_w4/.h_w2)
    RESET_STACK_STATE
    test                myd, 0xf00
    jnz .hv
    mov                 myd, r8m        ; r8m = bitdepth_max; myd reused
    movd                 m5, r8m
    shr                 myd, 11         ; 0 for 10-bit, 1 for 12-bit
    movddup              m4, [base+put_8tap_h_rnd+myq*8] ; rounding constant
    movifnidn           dsq, dsmp
    pshufb               m5, [base+pw_256]  ; broadcast pixel max for clamping
    sub                  wd, 4
    jle .h_w4                           ; w <= 4: 4-tap path above
    WIN64_SPILL_XMM      11
    shr                 mxd, 16
    movq                 m2, [base+subpel_filters+1+mxq*8] ; 6 middle taps
    movifnidn          dstq, dstmp
    mova                 m6, [base+spel_h_shufA]
    mova                 m7, [base+spel_h_shufB]
    lea                srcq, [srcq+wq*2]
    punpcklbw            m2, m2
    lea                dstq, [dstq+wq*2]
    psraw                m2, 8          ; sign-extend int8 taps to words
    neg                  wq             ; iterate with negative offset
%if ARCH_X86_32
    ; x86-32 has no xmm8+: spill the three tap-pair vectors to the stack
    ALLOC_STACK       -16*3
    %define              m8  [rsp+16*0]
    %define              m9  [rsp+16*1]
    %define             m10  [rsp+16*2]
    pshufd               m0, m2, q0000
    pshufd               m1, m2, q1111
    pshufd               m2, m2, q2222
    mova                 m8, m0         ; taps 0-1
    mova                 m9, m1         ; taps 2-3
    mova                m10, m2         ; taps 4-5
%else
    pshufd               m8, m2, q0000  ; taps 0-1
    pshufd               m9, m2, q1111  ; taps 2-3
    pshufd              m10, m2, q2222  ; taps 4-5
%endif
.h_w8_loop0:
    mov                  r6, wq
.h_w8_loop:
    ; 8 output pixels per iteration; two overlapping unaligned loads
    ; cover the 6-tap source footprint, middle words shared via shufpd
    movu                 m3, [srcq+r6*2-4]
    movu                 m2, [srcq+r6*2+8]
    pshufb               m0, m3, m6   ; 01 12 23 34
    pmaddwd              m0, m8       ; abcd0
    pshufb               m3, m7       ; 23 34 45 56
    pmaddwd              m1, m9, m3   ; abcd1
    paddd                m0, m1
    pshufb               m1, m2, m6   ; 67 78 89 9a
    shufpd               m3, m1, 0x01 ; 45 56 67 78
    pmaddwd              m1, m9       ; efgh1
    pshufb               m2, m7       ; 89 9a ab bc
    pmaddwd              m2, m10      ; efgh2
    paddd                m1, m2
    pmaddwd              m2, m10, m3  ; abcd2
    pmaddwd              m3, m8       ; efgh0
    paddd                m0, m4       ; + rounding
    paddd                m1, m4
    paddd                m0, m2
    paddd                m1, m3
    psrad                m0, 6
    psrad                m1, 6
    packssdw             m0, m1
    pxor                 m1, m1
    pminsw               m0, m5       ; clamp to [0, pixel_max]
    pmaxsw               m0, m1
    mova        [dstq+r6*2], m0
    add                  r6, 8
    jl .h_w8_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w8_loop0
    RET
.v:
    ; vertical-only 6-tap path
    movzx               mxd, myb        ; low byte = alternate filter index
    shr                 myd, 16
    cmp                  hd, 6
    cmovb               myd, mxd        ; short blocks use the alternate set
    movq                 m2, [base+subpel_filters+1+myq*8] ; 6 middle taps
    WIN64_SPILL_XMM      11, 16
    movd                 m5, r8m
    movifnidn          dstq, dstmp
    movifnidn           dsq, dsmp
    punpcklbw            m2, m2
    pshufb               m5, [base+pw_256]  ; broadcast pixel max
    psraw                m2, 8 ; sign-extend
%if ARCH_X86_32
    ALLOC_STACK       -16*4
    pshufd               m0, m2, q0000
    mov                  r6, ssq
    pshufd               m1, m2, q1111
    neg                  r6             ; r6 = -ssq, addresses rows above srcq
    pshufd               m2, m2, q2222
    mova                 m8, m0         ; taps 0-1
    mova                 m9, m1         ; taps 2-3
    mova                m10, m2         ; taps 4-5
    cmp                  wd, 2
    jne .v_w4
%else
    mov                  r6, ssq
    pshufd               m8, m2, q0000
    neg                  r6             ; r6 = -ssq
    cmp                  wd, 4
    jg .v_w8
    pshufd               m9, m2, q1111
    pshufd              m10, m2, q2222
    je .v_w4
%endif
.v_w2:
    ; load the 5 context rows (-2..+2) and interleave into word pairs
    movd                 m1, [srcq+r6 *2]
    movd                 m3, [srcq+r6 *1]
    movd                 m2, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m0, [srcq+ssq*0]
    punpckldq            m1, m3      ; 0 1
    punpckldq            m3, m2      ; 1 2
    punpckldq            m2, m4      ; 2 3
    punpckldq            m4, m0      ; 3 4
    punpcklwd            m1, m3      ; 01 12
    punpcklwd            m2, m4      ; 23 34
    pxor                 m6, m6
.v_w2_loop:
    movd                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddwd              m4, m8, m1  ; a0 b0
    mova                 m1, m2
    pmaddwd              m2, m9      ; a1 b1
    paddd                m4, m2
    punpckldq            m2, m0, m3  ; 4 5
    movd                 m0, [srcq+ssq*0]
    punpckldq            m3, m0      ; 5 6
    punpcklwd            m2, m3      ; 45 56
    pmaddwd              m3, m10, m2 ; a2 b2
    paddd                m4, m3
    psrad                m4, 5
    packssdw             m4, m4
    pmaxsw               m4, m6
    pavgw                m4, m6      ; avg with 0 = (x+1)>>1, completes rounded >>6
    pminsw               m4, m5
    movd       [dstq+dsq*0], m4
    pshuflw              m4, m4, q3232
    movd       [dstq+dsq*1], m4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
%if ARCH_X86_32
    ; pack the column counter (upper bits) and h (low 16) into wd
    shl                  wd, 14
    lea                srcq, [srcq+r6*2]
    lea                  wd, [wq+hq-(1<<16)]
%if STACK_ALIGNMENT < 16
    %define           dstmp  [esp+16*3]
%endif
.v_w4_loop0:
    mov               dstmp, dstq
    movq                 m1, [srcq+ssq*0]
    movq                 m2, [srcq+ssq*1]
    lea                  r6, [srcq+ssq*2]
    movq                 m3, [r6  +ssq*0]
    movq                 m4, [r6  +ssq*1]
    lea                  r6, [r6  +ssq*2]
%else
    movq                 m1, [srcq+r6 *2]
    movq                 m2, [srcq+r6 *1]
    lea                  r6, [srcq+ssq*2]
    movq                 m3, [srcq+ssq*0]
    movq                 m4, [srcq+ssq*1]
%endif
    movq                 m0, [r6  +ssq*0]
    punpcklwd            m1, m2      ; 01
    punpcklwd            m2, m3      ; 12
    punpcklwd            m3, m4      ; 23
    punpcklwd            m4, m0      ; 34
.v_w4_loop:
    pmaddwd              m6, m8, m1  ; a0
    pmaddwd              m7, m8, m2  ; b0
    mova                 m1, m3
    pmaddwd              m3, m9      ; a1
    mova                 m2, m4
    pmaddwd              m4, m9      ; b1
    paddd                m6, m3
    movq                 m3, [r6+ssq*0]
    paddd                m7, m4
    movq                 m4, [r6+ssq*1]
    lea                  r6, [r6+ssq*2]
    movq                 m0, [r6+ssq*0]
    punpcklwd            m3, m4      ; 45
    punpcklwd            m4, m0      ; 56
    pmaddwd              m0, m10, m3 ; a2
    paddd                m6, m0
    pmaddwd              m0, m10, m4 ; b2
    paddd                m7, m0
    psrad                m6, 5
    psrad                m7, 5
    packssdw             m6, m7
    pxor                 m7, m7
    pmaxsw               m6, m7
    pavgw                m6, m7      ; final rounding bit of the >>6
    pminsw               m6, m5
    movq       [dstq+dsq*0], m6
    movhps     [dstq+dsq*1], m6
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    ; advance to the next 4-pixel column on x86-32
    mov                dstq, dstmp
    add                srcq, 8
    movzx                hd, ww
    add                dstq, 8
    sub                  wd, 1<<16
    jg .v_w4_loop0
    RET
%else
    RET
.v_w8:
    ; x86-64 only: 8-pixel-wide column strips, full 16-register pipeline;
    ; taps 0-1 are parked in the r6 home slot since m8-m15 hold row data
    mova                r6m, m8
    shl                  wd, 5
    pshufd               m6, m2, q1111  ; taps 2-3
    lea                  wd, [wq+hq-(1<<8)] ; pack strip count | h
    pshufd               m7, m2, q2222  ; taps 4-5
    WIN64_PUSH_XMM       16
.v_w8_loop0:
    ; load the 5 context rows and build low/high interleaved pairs
    movu                 m9, [srcq+ r6*2]
    movu                m11, [srcq+ r6*1]
    lea                  r7, [srcq+ssq*2]
    movu                m13, [srcq+ssq*0]
    movu                m15, [srcq+ssq*1]
    mov                  r8, dstq
    movu                 m4, [r7  +ssq*0]
    punpcklwd            m8, m9, m11  ; 01
    punpckhwd            m9, m11
    punpcklwd           m10, m11, m13 ; 12
    punpckhwd           m11, m13
    punpcklwd           m12, m13, m15 ; 23
    punpckhwd           m13, m15
    punpcklwd           m14, m15, m4  ; 34
    punpckhwd           m15, m4
.v_w8_loop:
    mova                 m3, r6m      ; taps 0-1
    pmaddwd              m0, m8, m3   ; a0
    pmaddwd              m2, m9, m3   ; a0'
    pmaddwd              m1, m10, m3  ; b0
    pmaddwd              m3, m11      ; b0'
    mova                 m8, m12
    pmaddwd             m12, m6       ; a1
    mova                 m9, m13
    pmaddwd             m13, m6       ; a1'
    mova                m10, m14
    pmaddwd             m14, m6       ; b1
    mova                m11, m15
    pmaddwd             m15, m6       ; b1'
    paddd                m0, m12
    paddd                m2, m13
    movu                m13, [r7+ssq*0]
    paddd                m1, m14
    paddd                m3, m15
    movu                m15, [r7+ssq*1]
    lea                  r7, [r7+ssq*2]
    movu                 m4, [r7+ssq*0]
    punpcklwd           m12, m13, m15 ; 45
    punpckhwd           m13, m15
    punpcklwd           m14, m15, m4  ; 56
    punpckhwd           m15, m4
    pmaddwd              m4, m7, m12  ; a2
    paddd                m0, m4
    pmaddwd              m4, m7, m13  ; a2'
    paddd                m2, m4
    pmaddwd              m4, m7, m14  ; b2
    paddd                m1, m4
    pmaddwd              m4, m7, m15  ; b2'
    paddd                m3, m4
    REPX       {psrad x, 5}, m0, m2, m1, m3
    packssdw             m0, m2
    packssdw             m1, m3
    pxor                 m2, m2
    pmaxsw               m0, m2
    pmaxsw               m1, m2
    pavgw                m0, m2       ; final rounding bit of the >>6
    pavgw                m1, m2
    pminsw               m0, m5
    pminsw               m1, m5
    mova         [r8+dsq*0], m0
    mova         [r8+dsq*1], m1
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    add                srcq, 16       ; next 8-pixel column strip
    add                dstq, 16
    movzx                hd, wb       ; reload h from packed counter
    sub                  wd, 1<<8
    jg .v_w8_loop0
    RET
%endif
; .hv: combined horizontal (4-tap) + vertical (6-tap) filtering, w <= 4.
; r8m is the 9th argument (pixel clamp value / bitdepth_max); it is
; broadcast to all words via the pw_256 shuffle and used with pminsw.
    1591 .hv:
    1592    cmp                  wd, 4
    1593    jg .hv_w8
    1594    WIN64_SPILL_XMM      12, 16
    1595 %if ARCH_X86_32
    1596    movd                 m3, r8m
    1597    pshufb               m3, [base+pw_256] ; broadcast pixel max
    1598 %else
    1599    movd                m11, r8m
    1600    pshufb              m11, [base+pw_256] ; broadcast pixel max
    1601 %endif
    1602    movzx               mxd, mxb ; low byte = 4-tap h filter index (w <= 4)
    1603    movq                 m0, [base+subpel_filters+mxq*8]
    1604    movzx               mxd, myb
    1605    shr                 myd, 16
    1606    cmp                  hd, 6
    1607    cmovb               myd, mxd ; h < 6: use the narrower filter variant
    1608    movq                 m2, [base+subpel_filters+1+myq*8] ; +1: middle 6 of 8 taps
    1609    movddup              m7, [base+pd_8704] ; rounding constant (10bpc)
    1610    sub                srcq, 2 ; back up one pixel for the h filter taps
    1611    pshuflw              m0, m0, q2121 ; keep the 4 middle h taps (as word pairs)
    1612    pxor                 m6, m6
    1613    punpcklbw            m6, m0 ; h coeff bytes -> words scaled by 256
    1614    punpcklbw            m2, m2
    1615    psraw                m2, 8 ; sign-extend
    1616    test          dword r8m, 0x800 ; bit 11 of bitdepth_max: set for 12bpc
    1617    jz .hv_w2_10bpc
; 12bpc: smaller rounding constant, and rescale h/v coefficient precision
    1618    movddup              m7, [base+pd_2560]
    1619    psraw                m6, 2
    1620    psllw                m2, 2
    1621 .hv_w2_10bpc:
    1622 %if ARCH_X86_32
    1623 %assign regs_used 2
    1624    ALLOC_STACK       -16*7
    1625 %assign regs_used 7
    1626    mov                dstq, r0mp
    1627    mov                 dsq, r1mp
    1628    %define             m11  [esp+16*4] ; clamp lives in a stack slot
    1629    pshufd               m0, m2, q0000
    1630    pshufd               m1, m2, q1111
    1631    pshufd               m2, m2, q2222
    1632    mova                 m8, m0  ; v coeff pair 0
    1633    mova                 m9, m1  ; v coeff pair 1
    1634    mova                m10, m2  ; v coeff pair 2
    1635    mova                m11, m3
; temporarily negate the stride to address the two rows above srcq
    1636    neg                 ssq
    1637    movu                 m3, [srcq+ssq*2] ; row -2
    1638    movu                 m4, [srcq+ssq*1] ; row -1
    1639    neg                 ssq
    1640 %else
    1641    pshufd               m8, m2, q0000    ; v coeff pair 0
    1642    mov                  r6, ssq
    1643    pshufd               m9, m2, q1111    ; v coeff pair 1
    1644    neg                  r6
    1645    pshufd              m10, m2, q2222    ; v coeff pair 2
    1646    movu                 m3, [srcq+r6 *2] ; row -2
    1647    movu                 m4, [srcq+r6 *1] ; row -1
    1648 %endif
    1649    movu                 m1, [srcq+ssq*0] ; row 0
    1650    movu                 m0, [srcq+ssq*1] ; row 1
    1651    lea                srcq, [srcq+ssq*2]
    1652    movu                 m2, [srcq+ssq*0] ; row 2
    1653    cmp                  wd, 4
    1654    je .hv_w4
; w == 2: horizontally filter both pixels of a row in one pmaddwd/phaddd,
; then keep a rotating set of interleaved row pairs for the vertical pass.
    1655    mova                 m5, [base+spel_h_shuf2]
    1656    REPX    {pshufb  x, m5}, m3, m4, m0, m1, m2
    1657    REPX    {pmaddwd x, m6}, m3, m0, m4, m1, m2
    1658    phaddd               m3, m0        ; 0 3
    1659    phaddd               m4, m1        ; 1 2
    1660    phaddd               m0, m2        ; 3 4
    1661    REPX    {paddd   x, m7}, m3, m4, m0
    1662    REPX    {psrad   x, 10}, m3, m4, m0
    1663    packssdw             m3, m4        ; 0 3 1 2
    1664    packssdw             m4, m0        ; 1 2 3 4
    1665    pshufd               m2, m3, q1320 ; 0 1 2 3
    1666    punpcklwd            m1, m2, m4    ; 01 12
    1667    punpckhwd            m2, m4        ; 23 34
; Per iteration: h-filter two new rows (5 6), vertically combine three
; row-pair sets with the three v coeff pairs, clamp, store two rows.
    1668 .hv_w2_loop:
    1669    movu                 m3, [srcq+ssq*1]
    1670    lea                srcq, [srcq+ssq*2]
    1671    movu                 m4, [srcq+ssq*0]
    1672    pshufb               m3, m5
    1673    pshufb               m4, m5
    1674    pmaddwd              m3, m6
    1675    pmaddwd              m4, m6
    1676    phaddd               m3, m4
    1677    pmaddwd              m4, m8, m1    ; a0 b0
    1678    mova                 m1, m2
    1679    pmaddwd              m2, m9        ; a1 b1
    1680    paddd                m4, m2
    1681    paddd                m3, m7
    1682    psrad                m3, 10        ; 5 6
    1683    packssdw             m0, m3
    1684    pshufd               m2, m0, q2103
    1685    punpckhwd            m2, m0        ; 45 56
    1686    mova                 m0, m3
    1687    pmaddwd              m3, m10, m2   ; a2 b2
    1688    paddd                m4, m3
    1689    psrad                m4, 10
    1690    packssdw             m4, m4
    1691    pxor                 m3, m3
    1692    pminsw               m4, m11       ; clamp to [0, bitdepth_max]
    1693    pmaxsw               m4, m3
    1694    movd       [dstq+dsq*0], m4
    1695    pshuflw              m4, m4, q1032
    1696    movd       [dstq+dsq*1], m4
    1697    lea                dstq, [dstq+dsq*2]
    1698    sub                  hd, 2
    1699    jg .hv_w2_loop
    1700    RET
; .hv_w4: 4-tap horizontal + 6-tap vertical for w == 4.
; On x86-32, m12-m15 are redirected to stack slots / rodata shuffles.
    1701 .hv_w4:
    1702 %if ARCH_X86_32
    1703    %define             m12  [esp+16*5]
    1704    %define             m13  [esp+16*6]
    1705    %define             m14  [base+spel_h_shufA]
    1706    %define             m15  [base+spel_h_shufB]
    1707    pshufd               m5, m6, q0000
    1708    pshufd               m6, m6, q1111
    1709    mova                m12, m5  ; h coeff pair 0
    1710    mova                m13, m6  ; h coeff pair 1
    1711 %else
    1712    WIN64_PUSH_XMM       16
    1713    mova                m14, [base+spel_h_shufA]
    1714    mova                m15, [base+spel_h_shufB]
    1715    pshufd              m12, m6, q0000 ; h coeff pair 0
    1716    pshufd              m13, m6, q1111 ; h coeff pair 1
    1717 %endif
; Horizontal 4-tap pass for one 4-pixel row:
;   %1 = dst, %2 = src row (clobbered), %3 = scratch, %4 = shufB (def. m15)
; Implicit inputs: m14/m15 = h shuffle masks, m12/m13 = h coeff pairs,
; m7 = rounding constant.  Output is the rounded, un-shifted 32-bit sum;
; the caller performs the >> 10.
    1718 %macro HV_H_W4_6TAP 3-4 m15 ; dst, src, tmp, shufB
    1719    pshufb               %3, %2, m14
    1720    pmaddwd              %3, m12
    1721    pshufb               %2, %4
    1722    pmaddwd              %2, m13
    1723    paddd                %3, m7
    1724    paddd                %1, %2, %3
    1725 %endmacro
; Prime the vertical filter with rows -2..2 (labelled 0..4 below).
    1726    HV_H_W4_6TAP         m3, m3, m5
    1727    HV_H_W4_6TAP         m4, m4, m5
    1728    HV_H_W4_6TAP         m5, m1, m5
    1729    HV_H_W4_6TAP         m0, m0, m1
    1730    HV_H_W4_6TAP         m2, m2, m1
    1731    REPX      {psrad x, 10}, m3, m5, m4, m0, m2
    1732    packssdw             m3, m5      ; 0 2
    1733    packssdw             m4, m0      ; 1 3
    1734    packssdw             m5, m2      ; 2 4
    1735    punpcklwd            m1, m3, m4  ; 01
    1736    punpckhwd            m3, m4      ; 23
    1737    punpcklwd            m2, m4, m5  ; 12
    1738    punpckhwd            m4, m5      ; 34
; Two output rows per iteration (accumulators a = even row, b = odd row).
    1739 .hv_w4_loop:
    1740    movu                 m0, [srcq+ssq*1]
    1741    pmaddwd              m5, m8, m1  ; a0
    1742    lea                srcq, [srcq+ssq*2]
    1743    pmaddwd              m6, m8, m2  ; b0
    1744    mova                 m1, m3
    1745    pmaddwd              m3, m9      ; a1
    1746    mova                 m2, m4
    1747    pmaddwd              m4, m9      ; b1
    1748    paddd                m5, m3
    1749    movu                 m3, [srcq+ssq*0]
    1750    paddd                m6, m4
    1751    HV_H_W4_6TAP         m0, m0, m4
    1752    HV_H_W4_6TAP         m3, m3, m4
    1753    psrad                m4, m2, 16  ; row 4 words, sign-extended to dwords
    1754    psrad                m0, 10
    1755    psrad                m3, 10
    1756    packssdw             m4, m0      ; 4 5
    1757    packssdw             m0, m3      ; 5 6
    1758    punpcklwd            m3, m4, m0  ; 45
    1759    punpckhwd            m4, m0      ; 56
    1760    pmaddwd              m0, m10, m3 ; a2
    1761    paddd                m5, m0
    1762    pmaddwd              m0, m10, m4 ; b2
    1763    paddd                m6, m0
    1764    psrad                m5, 10
    1765    psrad                m6, 10
    1766    packssdw             m5, m6
    1767    pxor                 m6, m6
    1768    pminsw               m5, m11     ; clamp to [0, bitdepth_max]
    1769    pmaxsw               m5, m6
    1770    movq       [dstq+dsq*0], m5
    1771    movhps     [dstq+dsq*1], m5
    1772    lea                dstq, [dstq+dsq*2]
    1773    sub                  hd, 2
    1774    jg .hv_w4_loop
    1775    RET
; .hv_w8: 6-tap horizontal + 6-tap vertical for w >= 8.
; Both filters come from subpel_filters+1 (the middle 6 of the 8 taps).
; All coefficient pairs, the clamp and (on x86-32) the rounding constant
; are spilled to the stack because every XMM register is needed.
    1776 .hv_w8:
    1777    RESET_STACK_STATE
    1778    shr                 mxd, 16 ; upper half = wide (8-tap table) h index
    1779    movq                 m2, [base+subpel_filters+1+mxq*8]
    1780    movzx               mxd, myb
    1781    shr                 myd, 16
    1782    cmp                  hd, 6
    1783    cmovb               myd, mxd ; h < 6: use the narrower filter variant
    1784    movq                 m1, [base+subpel_filters+1+myq*8]
    1785    movd                 m3, r8m  ; pixel clamp (bitdepth_max)
    1786    movddup              m4, [base+pd_8704] ; rounding constant (10bpc)
    1787    pshufb               m3, [base+pw_256]
    1788    pxor                 m0, m0
    1789    punpcklbw            m0, m2   ; h coeff bytes -> words scaled by 256
    1790    punpcklbw            m1, m1
    1791    sub                srcq, 4    ; back up two pixels for the h filter taps
    1792    psraw                m1, 8 ; sign-extend
    1793    test          dword r8m, 0x800 ; bit 11 of bitdepth_max: set for 12bpc
    1794    jz .hv_w8_10bpc
; 12bpc: smaller rounding constant, rescaled coefficient precision
    1795    movddup              m4, [base+pd_2560]
    1796    psraw                m0, 2
    1797    psllw                m1, 2
    1798 .hv_w8_10bpc:
    1799 %if ARCH_X86_32
    1800 %assign regs_used 2
    1801    ALLOC_STACK       -16*9
    1802 %assign regs_used 7
    1803    mov                dstq, r0mp
    1804    mov                 dsq, r1mp
    1805    mova         [rsp+16*7], m4 ; rounding constant
    1806 %else
    1807    ALLOC_STACK        16*7, 16
    1808 %endif
    1809    mova         [rsp+16*6], m3 ; pixel clamp
    1810    pshufd               m2, m0, q0000
    1811    mova         [rsp+16*0], m2 ; h coeff pair 0
    1812    pshufd               m2, m0, q1111
    1813    mova         [rsp+16*1], m2 ; h coeff pair 1
    1814    pshufd               m0, m0, q2222
    1815    mova         [rsp+16*2], m0 ; h coeff pair 2
    1816    pshufd               m2, m1, q0000
    1817    mova         [rsp+16*3], m2 ; v coeff pair 0
    1818    pshufd               m2, m1, q1111
    1819    mova         [rsp+16*4], m2 ; v coeff pair 1
    1820    pshufd               m1, m1, q2222
    1821    mova         [rsp+16*5], m1 ; v coeff pair 2
    1822    mov                  r6, ssq
    1823    neg                  r6  ; r6 = -stride, to address the rows above srcq
    1824 %if ARCH_X86_32
; Pack the column counter (w, high half) and row count (h, low half) in r4d.
    1825    shl                  wd, 14
    1826    lea                 r4d, [wq+hq-(1<<16)]
    1827 %if STACK_ALIGNMENT < 16
    1828    %define           srcmp  [esp+16*8+4*0]
    1829    %define           dstmp  [esp+16*8+4*1]
    1830 %endif
; Horizontal 6-tap pass (x86-32 variant) over one 4-wide column slice:
;   %1 = dst, %2 = src+0 load, %3 = src+2 load (both clobbered),
;   %4-%6 = coeff pairs (default: the spilled pairs at rsp+16*0..2).
; Result is the unrounded 32-bit sum; caller adds rnd and shifts >> 10.
    1831 %macro HV_H_6TAP 3-6 [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-2], mul[1-3]
    1832    punpcklwd            %1, %2, %3   ; 01 12 23 34
    1833    punpckhwd            %2, %3       ; 45 56 67 78
    1834    pmaddwd              %3, %4, %1   ; a0
    1835    shufpd               %1, %2, 0x01 ; 23 34 45 56
    1836    pmaddwd              %2, %6       ; a2
    1837    pmaddwd              %1, %5       ; a1
    1838    paddd                %2, %3
    1839    paddd                %1, %2
    1840 %endmacro
; x86-32 outer loop: one 4-pixel-wide column strip per iteration.
; srcmp/dstmp remember the strip base so the epilogue can advance by 8
; bytes (4 pixels) and restore h from the low half of the packed counter.
    1841 .hv_w8_loop0:
    1842    mov               srcmp, srcq
    1843    mov               dstmp, dstq
; Prime the vertical filter: h-filter rows -2..2 (labelled 0..4 below).
    1844    movu                 m5, [srcq+r6*2+0]
    1845    movu                 m6, [srcq+r6*2+2]
    1846    mova                 m7, [rsp+16*0] ; h coeff pair 0
    1847    mova                 m1, [rsp+16*1] ; h coeff pair 1
    1848    mova                 m0, [rsp+16*2] ; h coeff pair 2
    1849    HV_H_6TAP            m2, m5, m6, m7, m1, m0
    1850    movu                 m5, [srcq+r6*1+0]
    1851    movu                 m6, [srcq+r6*1+2]
    1852    HV_H_6TAP            m3, m5, m6, m7, m1, m0
    1853    movu                 m5, [srcq+ssq*0+0]
    1854    movu                 m6, [srcq+ssq*0+2]
    1855    HV_H_6TAP            m4, m5, m6, m7, m1, m0
    1856    movu                 m5, [srcq+ssq*1+0]
    1857    movu                 m6, [srcq+ssq*1+2]
    1858    lea                srcq, [srcq+ssq*2]
    1859    HV_H_6TAP            m0, m5, m6, m7, m1
    1860    movu                 m5, [srcq+ssq*0+0]
    1861    movu                 m6, [srcq+ssq*0+2]
    1862    HV_H_6TAP            m1, m5, m6, m7
    1863    mova                 m5, [rsp+16*7] ; rounding constant
    1864    REPX      {paddd x, m5}, m2, m3, m4, m0, m1
    1865    REPX      {psrad x, 10}, m2, m4, m3, m0, m1
    1866    packssdw             m2, m4     ; 0 2
    1867    packssdw             m3, m0     ; 1 3
    1868    packssdw             m4, m1     ; 2 4
    1869    punpcklwd            m0, m2, m3 ; 01
    1870    punpckhwd            m2, m3     ; 23
    1871    punpcklwd            m1, m3, m4 ; 12
    1872    punpckhwd            m3, m4     ; 34
; Two output rows per iteration (a = even row, b = odd row).
    1873 .hv_w8_loop:
    1874    mova                 m5, [rsp+16*3] ; v coeff pair 0
    1875    mova                 m6, [rsp+16*4] ; v coeff pair 1
    1876    pmaddwd              m4, m0, m5 ; a0
    1877    pmaddwd              m5, m1     ; b0
    1878    mova                 m0, m2
    1879    pmaddwd              m2, m6     ; a1
    1880    mova                 m1, m3
    1881    pmaddwd              m3, m6     ; b1
    1882    paddd                m4, m2
    1883    movu                 m2, [srcq+ssq*1+0]
    1884    paddd                m5, m3
    1885    movu                 m3, [srcq+ssq*1+2]
    1886    lea                srcq, [srcq+ssq*2]
    1887    HV_H_6TAP            m6, m2, m3
    1888    movu                 m2, [srcq+ssq*0+0]
    1889    movu                 m3, [srcq+ssq*0+2]
    1890    HV_H_6TAP            m7, m2, m3
    1891    mova                 m2, [rsp+16*7] ; rounding constant
    1892    psrad                m3, m1, 16 ; row 4 words, sign-extended to dwords
    1893    paddd                m6, m2
    1894    paddd                m7, m2
    1895    psrad                m6, 10
    1896    psrad                m7, 10
    1897    packssdw             m3, m6     ; 4 5
    1898    packssdw             m6, m7     ; 5 6
    1899    mova                 m7, [rsp+16*5] ; v coeff pair 2
    1900    punpcklwd            m2, m3, m6 ; 45
    1901    punpckhwd            m3, m6     ; 56
    1902    pmaddwd              m6, m2, m7 ; a2
    1903    pmaddwd              m7, m3     ; b2
    1904    paddd                m4, m6
    1905    paddd                m5, m7
    1906    psrad                m4, 10
    1907    psrad                m5, 10
    1908    packssdw             m4, m5
    1909    pxor                 m5, m5
    1910    pminsw               m4, [rsp+16*6] ; clamp to [0, bitdepth_max]
    1911    pmaxsw               m4, m5
    1912    movq       [dstq+dsq*0], m4
    1913    movhps     [dstq+dsq*1], m4
    1914    lea                dstq, [dstq+dsq*2]
    1915    sub                  hd, 2
    1916    jg .hv_w8_loop
; Advance to the next 4-pixel column strip; r4w's low byte is not used
; here, the whole low 16 bits of r4d hold h.
    1917    mov                srcq, srcmp
    1918    mov                dstq, dstmp
    1919    movzx                hd, r4w
    1920    add                srcq, 8
    1921    add                dstq, 8
    1922    sub                 r4d, 1<<16
    1923 %else
; x86-64 path: 8-pixel-wide column strips.  w (high bits) and h (low byte)
; are packed into r8d; r4/r7 walk src/dst within a strip.
    1924    shl                  wd, 5
    1925    lea                 r8d, [wq+hq-256]
; Horizontal 6-tap pass (x86-64 variant) producing one packed 8-pixel row:
;   %1 = dst, %2-%4 = loads from src+0/+8/+16 (clobbered), %5 = shift,
;   %6 = shuffle (reg or mem; default [spel_h_shufA]), %7-%9 = coeff pairs.
; Unlike the 32-bit variant this adds the rounding constant (m4) and
; shifts, so %1 holds the finished packed words.
    1926 %macro HV_H_6TAP 5-9 [spel_h_shufA], [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-3], shift, shuf, mul[1-3]
    1927 %ifid %6
    1928    REPX     {pshufb x, %6}, %2, %3, %4
    1929 %else
    1930    mova                 %1, %6
    1931    pshufb               %2, %1       ; 01 12 23 34
    1932    pshufb               %3, %1       ; 45 56 67 78
    1933    pshufb               %4, %1       ; 89 9a ab bc
    1934 %endif
    1935    pmaddwd              %1, %7, %2
    1936    shufpd               %2, %3, 0x01 ; 23 34 45 56
    1937    pmaddwd              %2, %8
    1938    paddd                %1, %2
    1939    pmaddwd              %2, %9, %3
    1940    paddd                %1, %2
    1941    pmaddwd              %2, %7, %3
    1942    shufpd               %3, %4, 0x01 ; 67 78 89 9a
    1943    pmaddwd              %4, %9
    1944    pmaddwd              %3, %8
    1945    paddd                %1, m4       ; m4 = rounding constant
    1946    paddd                %2, m4
    1947    paddd                %3, %4
    1948    paddd                %2, %3
    1949    psrad                %1, %5
    1950    psrad                %2, %5
    1951    packssdw             %1, %2
    1952 %endmacro
; Prime the vertical filter with rows -2..2 of the current strip.
    1953 .hv_w8_loop0:
    1954    mova                 m5, [spel_h_shufA]
    1955    movu                 m0, [srcq+r6*2+ 0]
    1956    mova                 m6, [rsp+16*0] ; h coeff pair 0
    1957    movu                 m1, [srcq+r6*2+ 8]
    1958    mova                 m7, [rsp+16*1] ; h coeff pair 1
    1959    movu                 m2, [srcq+r6*2+16]
    1960    mova                 m8, [rsp+16*2] ; h coeff pair 2
    1961    HV_H_6TAP            m9, m0, m1, m2, 10, m5, m6, m7, m8
    1962    movu                 m0, [srcq+r6*1+ 0]
    1963    movu                 m1, [srcq+r6*1+ 8]
    1964    movu                 m2, [srcq+r6*1+16]
    1965    lea                  r4, [srcq+ssq*2]
    1966    HV_H_6TAP           m11, m0, m1, m2, 10, m5, m6, m7, m8
    1967    movu                 m0, [srcq+ssq*0+ 0]
    1968    movu                 m1, [srcq+ssq*0+ 8]
    1969    movu                 m2, [srcq+ssq*0+16]
    1970    mov                  r7, dstq
    1971    HV_H_6TAP           m13, m0, m1, m2, 10, m5, m6, m7, m8
    1972    movu                 m0, [srcq+ssq*1+ 0]
    1973    movu                 m1, [srcq+ssq*1+ 8]
    1974    movu                 m2, [srcq+ssq*1+16]
    1975    HV_H_6TAP           m15, m0, m1, m2, 10, m5, m6, m7, m8
    1976    movu                 m0, [r4+ssq*0+ 0]
    1977    movu                 m1, [r4+ssq*0+ 8]
    1978    movu                 m2, [r4+ssq*0+16]
    1979    HV_H_6TAP            m5, m0, m1, m2, 10, m5, m6, m7, m8
; Build the interleaved row pairs; even regs = low half, odd = high half.
    1980    punpcklwd            m8, m9, m11  ; 01
    1981    punpckhwd            m9, m11
    1982    punpcklwd           m10, m11, m13 ; 12
    1983    punpckhwd           m11, m13
    1984    punpcklwd           m12, m13, m15 ; 23
    1985    punpckhwd           m13, m15
    1986    punpcklwd           m14, m15, m5  ; 34
    1987    punpckhwd           m15, m5
; Two full 8-pixel rows per iteration; accumulators:
; a/a' = even row low/high half, b/b' = odd row low/high half.
    1988 .hv_w8_loop:
    1989    mova                 m3, [rsp+16*3] ; v coeff pair 0
    1990    mova                 m7, [rsp+16*4] ; v coeff pair 1
    1991    pmaddwd              m0, m8, m3   ; a0
    1992    mova                 m8, m12
    1993    pmaddwd              m2, m9, m3   ; a0'
    1994    mova                 m9, m13
    1995    pmaddwd              m1, m10, m3  ; b0
    1996    mova                m10, m14
    1997    pmaddwd              m3, m11      ; b0'
    1998    mova                m11, m15
    1999    REPX    {pmaddwd x, m7}, m12, m13, m14, m15
    2000    movu                 m6, [r4+ssq*1+ 0]
    2001    paddd                m0, m12
    2002    movu                 m7, [r4+ssq*1+ 8]
    2003    paddd                m2, m13
    2004    movu                m12, [r4+ssq*1+16]
    2005    paddd                m1, m14
    2006    lea                  r4, [r4+ssq*2]
    2007    paddd                m3, m15
    2008    HV_H_6TAP           m15, m6, m7, m12, 10
    2009    movu                 m6, [r4+ssq*0+ 0]
    2010    movu                 m7, [r4+ssq*0+ 8]
    2011    movu                m14, [r4+ssq*0+16]
    2012    punpcklwd           m12, m5, m15 ; 45
    2013    punpckhwd           m13, m5, m15
    2014    HV_H_6TAP            m5, m6, m7, m14, 10
    2015    mova                 m7, [rsp+16*5] ; v coeff pair 2
    2016    punpcklwd           m14, m15, m5  ; 56
    2017    punpckhwd           m15, m5
    2018    pmaddwd              m6, m12, m7  ; a2
    2019    paddd                m0, m6
    2020    pmaddwd              m6, m13, m7  ; a2'
    2021    paddd                m2, m6
    2022    pmaddwd              m6, m14, m7  ; b2
    2023    pmaddwd              m7, m15      ; b2'
    2024    paddd                m1, m6
    2025    mova                 m6, [rsp+16*6] ; pixel clamp
    2026    paddd                m3, m7
    2027    REPX      {psrad x, 10}, m0, m2, m1, m3
    2028    packssdw             m0, m2
    2029    packssdw             m1, m3
    2030    pxor                 m2, m2
    2031    pminsw               m0, m6       ; clamp to [0, bitdepth_max]
    2032    pminsw               m1, m6
    2033    pmaxsw               m0, m2
    2034    pmaxsw               m1, m2
    2035    mova         [r7+dsq*0], m0
    2036    mova         [r7+dsq*1], m1
    2037    lea                  r7, [r7+dsq*2]
    2038    sub                  hd, 2
    2039    jg .hv_w8_loop
; Next 8-pixel column strip; low byte of r8d restores h.
    2040    add                srcq, 16
    2041    add                dstq, 16
    2042    movzx                hd, r8b
    2043    sub                 r8d, 1<<8
    2044 %endif
    2045    jg .hv_w8_loop0
    2046    RET
   2047 
; Entry points for the (h,v) filter-type combinations routed through the
; 8-tap kernel.  PUT_8TAP_FN is defined earlier in this file (outside
; this excerpt); the invocations with a 4th argument direct into that
; base function, while the last one omits it and falls through into the
; put_8tap_16bpc cglobal immediately below.
    2048 PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
    2049 PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
    2050 PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
    2051 PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
    2052 PUT_8TAP_FN sharp,          SHARP,   SHARP
   2053 
; put_8tap_16bpc(dst, dst_stride, src, src_stride, w, h, mx, my, ...):
; 8-tap sub-pixel put, 16 bpc, SSSE3.  r8m (the 9th stack argument) is
; the pixel clamp value (bitdepth_max).
    2054 cglobal put_8tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my
    2055 %if ARCH_X86_32
; x86-32: too few GPRs/XMM regs -- mx/my alias r0/r1 and m8-m15 are
; redirected to stack slots.
    2056    %define             mxb  r0b
    2057    %define             mxd  r0
    2058    %define             mxq  r0
    2059    %define             myb  r1b
    2060    %define             myd  r1
    2061    %define             myq  r1
    2062    %define              m8  [esp+16*0]
    2063    %define              m9  [esp+16*1]
    2064    %define             m10  [esp+16*2]
    2065    %define             m11  [esp+16*3]
    2066    %define             m12  [esp+16*4]
    2067    %define             m13  [esp+16*5]
    2068    %define             m14  [esp+16*6]
    2069    %define             m15  [esp+16*7]
    2070 %endif
; Pack the filter-type offsets (t0d/t1d, set by PUT_8TAP_FN) together
; with the subpel position into byte lanes of mxd/myd.
    2071    imul                mxd, mxm, 0x010101
    2072    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    2073    imul                myd, mym, 0x010101
    2074    add                 myd, t1d ; 8tap_v, my, 4tap_v
    2075    LEA                  t2, put_ssse3 ; base pointer for [base+...] addressing
    2076    movifnidn            wd, wm
    2077    movifnidn          srcq, srcmp
    2078    movifnidn           ssq, ssmp
    2079    movifnidn            hd, hm
    2080    test                mxd, 0xf00
    2081    jnz .h
    2082    test                myd, 0xf00
; No subpel in either direction: share the 6-tap function's plain copy.
    2083    jz mangle(private_prefix %+ _put_6tap_16bpc_ssse3).put
; Vertical-only 8-tap filtering.
    2084 .v:
    2085    movzx               mxd, myb
    2086    shr                 myd, 16
    2087    cmp                  hd, 6
    2088    cmovb               myd, mxd ; h < 6: use the narrower filter variant
    2089    movq                 m3, [base+subpel_filters+myq*8]
    2090    WIN64_SPILL_XMM      15
    2091    movd                 m7, r8m  ; pixel clamp (bitdepth_max)
    2092    movifnidn          dstq, dstmp
    2093    movifnidn           dsq, dsmp
    2094    punpcklbw            m3, m3
    2095    pshufb               m7, [base+pw_256] ; broadcast clamp to all words
    2096    psraw                m3, 8 ; sign-extend
; Split the 8 coefficients into 4 word pairs in m8-m11.
    2097 %if ARCH_X86_32
    2098    ALLOC_STACK       -16*7
    2099    pshufd               m0, m3, q0000
    2100    pshufd               m1, m3, q1111
    2101    pshufd               m2, m3, q2222
    2102    pshufd               m3, m3, q3333
    2103    mova                 m8, m0
    2104    mova                 m9, m1
    2105    mova                m10, m2
    2106    mova                m11, m3
    2107 %else
    2108    pshufd               m8, m3, q0000
    2109    pshufd               m9, m3, q1111
    2110    pshufd              m10, m3, q2222
    2111    pshufd              m11, m3, q3333
    2112 %endif
    2113    lea                  r6, [ssq*3]
    2114    sub                srcq, r6 ; start 3 rows above for the 8-tap history
    2115    cmp                  wd, 2
    2116    jne .v_w4
; w == 2: two pixels per row, two output rows per loop iteration.
; Rows 0..6 are staged as interleaved pairs 01/12/../56 for pmaddwd.
    2117 .v_w2:
    2118    movd                 m1, [srcq+ssq*0]
    2119    movd                 m4, [srcq+ssq*1]
    2120    movd                 m2, [srcq+ssq*2]
    2121    add                srcq, r6
    2122    movd                 m5, [srcq+ssq*0]
    2123    movd                 m3, [srcq+ssq*1]
    2124    movd                 m6, [srcq+ssq*2]
    2125    add                srcq, r6
    2126    movd                 m0, [srcq+ssq*0]
    2127    punpckldq            m1, m4      ; 0 1
    2128    punpckldq            m4, m2      ; 1 2
    2129    punpckldq            m2, m5      ; 2 3
    2130    punpckldq            m5, m3      ; 3 4
    2131    punpckldq            m3, m6      ; 4 5
    2132    punpckldq            m6, m0      ; 5 6
    2133    punpcklwd            m1, m4      ; 01 12
    2134    punpcklwd            m2, m5      ; 23 34
    2135    punpcklwd            m3, m6      ; 45 56
    2136    pxor                 m6, m6
    2137 .v_w2_loop:
    2138    movd                 m4, [srcq+ssq*1]
    2139    lea                srcq, [srcq+ssq*2]
    2140    pmaddwd              m5, m8, m1  ; a0 b0
    2141    mova                 m1, m2
    2142    pmaddwd              m2, m9      ; a1 b1
    2143    paddd                m5, m2
    2144    mova                 m2, m3
    2145    pmaddwd              m3, m10     ; a2 b2
    2146    paddd                m5, m3
    2147    punpckldq            m3, m0, m4  ; 6 7
    2148    movd                 m0, [srcq+ssq*0]
    2149    punpckldq            m4, m0      ; 7 8
    2150    punpcklwd            m3, m4      ; 67 78
    2151    pmaddwd              m4, m11, m3 ; a3 b3
    2152    paddd                m5, m4
    2153    psrad                m5, 5
    2154    packssdw             m5, m5
    2155    pmaxsw               m5, m6      ; clamp negatives to 0 first
    2156    pavgw                m5, m6      ; (x+1)>>1: rounding half of the >>6
    2157    pminsw               m5, m7      ; clamp to bitdepth_max
    2158    movd       [dstq+dsq*0], m5
    2159    pshuflw              m5, m5, q3232
    2160    movd       [dstq+dsq*1], m5
    2161    lea                dstq, [dstq+dsq*2]
    2162    sub                  hd, 2
    2163    jg .v_w2_loop
    2164    RET
; w >= 4: vertical 8-tap over 4-pixel column strips.  The column counter
; (w) and row count (h) are packed into one register so the outer loop
; can restore h from the low bits after each strip.
    2165 .v_w4:
    2166 %if ARCH_X86_32
    2167    shl                  wd, 14
    2168 %if STACK_ALIGNMENT < 16
    2169    mov          [esp+4*29], srcq
    2170    mov          [esp+4*30], dstq
    2171 %else
    2172    mov               srcmp, srcq
    2173 %endif
    2174    lea                  wd, [wq+hq-(1<<16)]
    2175 %else
    2176    shl                  wd, 6
    2177    mov                  r7, srcq
    2178    mov                  r8, dstq
    2179    lea                  wd, [wq+hq-(1<<8)]
    2180 %endif
; Prime the strip with rows 0..6 as interleaved pairs 01..56.
    2181 .v_w4_loop0:
    2182    movq                 m1, [srcq+ssq*0]
    2183    movq                 m2, [srcq+ssq*1]
    2184    movq                 m3, [srcq+ssq*2]
    2185    add                srcq, r6
    2186    movq                 m4, [srcq+ssq*0]
    2187    movq                 m5, [srcq+ssq*1]
    2188    movq                 m6, [srcq+ssq*2]
    2189    add                srcq, r6
    2190    movq                 m0, [srcq+ssq*0]
    2191    punpcklwd            m1, m2      ; 01
    2192    punpcklwd            m2, m3      ; 12
    2193    punpcklwd            m3, m4      ; 23
    2194    punpcklwd            m4, m5      ; 34
    2195    punpcklwd            m5, m6      ; 45
    2196    punpcklwd            m6, m0      ; 56
    2197 %if ARCH_X86_32
; x86-32: the 01/12/23 history pairs rotate through the m12-m14 stack
; slots; the first iteration enters mid-loop with them still in m1-m3.
    2198    jmp .v_w4_loop_start
    2199 .v_w4_loop:
    2200    mova                 m1, m12
    2201    mova                 m2, m13
    2202    mova                 m3, m14
    2203 .v_w4_loop_start:
    2204    pmaddwd              m1, m8      ; a0
    2205    pmaddwd              m2, m8      ; b0
    2206    mova                m12, m3
    2207    mova                m13, m4
    2208    pmaddwd              m3, m9      ; a1
    2209    pmaddwd              m4, m9      ; b1
    2210    paddd                m1, m3
    2211    paddd                m2, m4
    2212    mova                m14, m5
    2213    mova                 m4, m6
    2214    pmaddwd              m5, m10     ; a2
    2215    pmaddwd              m6, m10     ; b2
    2216    paddd                m1, m5
    2217    paddd                m2, m6
    2218    movq                 m6, [srcq+ssq*1]
    2219    lea                srcq, [srcq+ssq*2]
    2220    punpcklwd            m5, m0, m6  ; 67
    2221    movq                 m0, [srcq+ssq*0]
    2222    pmaddwd              m3, m11, m5 ; a3
    2223    punpcklwd            m6, m0      ; 78
    2224    paddd                m1, m3
    2225    pmaddwd              m3, m11, m6 ; b3
    2226    paddd                m2, m3
    2227    psrad                m1, 5
    2228    psrad                m2, 5
    2229    packssdw             m1, m2
    2230    pxor                 m2, m2
    2231    pmaxsw               m1, m2      ; clamp negatives to 0 first
    2232    pavgw                m1, m2      ; (x+1)>>1: rounding half of the >>6
    2233    pminsw               m1, m7      ; clamp to bitdepth_max
    2234    movq       [dstq+dsq*0], m1
    2235    movhps     [dstq+dsq*1], m1
    2236    lea                dstq, [dstq+dsq*2]
    2237    sub                  hd, 2
    2238    jg .v_w4_loop
; Next 4-pixel strip: reload the strip base, advance by 8 bytes, and
; restore h from the low 16 bits of the packed counter.
    2239 %if STACK_ALIGNMENT < 16
    2240    mov                srcq, [esp+4*29]
    2241    mov                dstq, [esp+4*30]
    2242    movzx                hd, ww
    2243    add                srcq, 8
    2244    add                dstq, 8
    2245    mov          [esp+4*29], srcq
    2246    mov          [esp+4*30], dstq
    2247 %else
    2248    mov                srcq, srcmp
    2249    mov                dstq, dstmp
    2250    movzx                hd, ww
    2251    add                srcq, 8
    2252    add                dstq, 8
    2253    mov               srcmp, srcq
    2254    mov               dstmp, dstq
    2255 %endif
    2256    sub                  wd, 1<<16
    2257 %else
; x86-64: history pairs stay in registers; a/b accumulate in m12/m13.
    2258 .v_w4_loop:
    2259    pmaddwd             m12, m8, m1  ; a0
    2260    pmaddwd             m13, m8, m2  ; b0
    2261    mova                 m1, m3
    2262    mova                 m2, m4
    2263    pmaddwd              m3, m9      ; a1
    2264    pmaddwd              m4, m9      ; b1
    2265    paddd               m12, m3
    2266    paddd               m13, m4
    2267    mova                 m3, m5
    2268    mova                 m4, m6
    2269    pmaddwd              m5, m10     ; a2
    2270    pmaddwd              m6, m10     ; b2
    2271    paddd               m12, m5
    2272    paddd               m13, m6
    2273    movq                 m6, [srcq+ssq*1]
    2274    lea                srcq, [srcq+ssq*2]
    2275    punpcklwd            m5, m0, m6  ; 67
    2276    movq                 m0, [srcq+ssq*0]
    2277    pmaddwd             m14, m11, m5 ; a3
    2278    punpcklwd            m6, m0      ; 78
    2279    paddd               m12, m14
    2280    pmaddwd             m14, m11, m6 ; b3
    2281    paddd               m13, m14
    2282    psrad               m12, 5
    2283    psrad               m13, 5
    2284    packssdw            m12, m13
    2285    pxor                m13, m13
    2286    pmaxsw              m12, m13     ; clamp negatives to 0 first
    2287    pavgw               m12, m13     ; (x+1)>>1: rounding half of the >>6
    2288    pminsw              m12, m7      ; clamp to bitdepth_max
    2289    movq       [dstq+dsq*0], m12
    2290    movhps     [dstq+dsq*1], m12
    2291    lea                dstq, [dstq+dsq*2]
    2292    sub                  hd, 2
    2293    jg .v_w4_loop
; Next strip: r7/r8 track the strip bases, wb's low byte restores h.
    2294    add                  r7, 8
    2295    add                  r8, 8
    2296    movzx                hd, wb
    2297    mov                srcq, r7
    2298    mov                dstq, r8
    2299    sub                  wd, 1<<8
    2300 %endif
    2301    jg .v_w4_loop0
    2302    RET
; Horizontal-only 8-tap filtering.
    2303 .h:
    2304    RESET_STACK_STATE
    2305    test                myd, 0xf00
    2306    jnz .hv
; r8m >> 11 selects the per-bitdepth rounding constant from the
; put_8tap_h_rnd table (index 0 for a 10-bit max, 1 for a 12-bit max).
    2307    mov                 myd, r8m
    2308    movd                 m5, r8m
    2309    shr                 myd, 11
    2310    movddup              m4, [base+put_8tap_h_rnd+myq*8]
    2311    movifnidn           dsq, dsmp
    2312    pshufb               m5, [base+pw_256] ; broadcast pixel clamp
    2313    cmp                  wd, 4
; Narrow widths share the 6-tap function's horizontal w<=4 code.
    2314    jle mangle(private_prefix %+ _put_6tap_16bpc_ssse3).h_w4
    2315    WIN64_SPILL_XMM      12
    2316    shr                 mxd, 16 ; upper half = wide (8-tap table) h index
    2317    movq                 m3, [base+subpel_filters+mxq*8]
    2318    movifnidn          dstq, dstmp
    2319    mova                 m6, [base+spel_h_shufA]
    2320    mova                 m7, [base+spel_h_shufB]
    2321 %if UNIX64
    2322    mov                  wd, wd ; zero-extend w so wq is safe as an index
    2323 %endif
; Point src/dst at the row end and count r6 up from -w to 0.
    2324    lea                srcq, [srcq+wq*2]
    2325    punpcklbw            m3, m3
    2326    lea                dstq, [dstq+wq*2]
    2327    psraw                m3, 8 ; sign-extend coeff bytes to words
    2328    neg                  wq
; Split the 8 coefficients into 4 word pairs in m8-m11.
    2329 %if ARCH_X86_32
    2330    ALLOC_STACK       -16*4
    2331    pshufd               m0, m3, q0000
    2332    pshufd               m1, m3, q1111
    2333    pshufd               m2, m3, q2222
    2334    pshufd               m3, m3, q3333
    2335    mova                 m8, m0
    2336    mova                 m9, m1
    2337    mova                m10, m2
    2338    mova                m11, m3
    2339 %else
    2340    pshufd               m8, m3, q0000
    2341    pshufd               m9, m3, q1111
    2342    pshufd              m10, m3, q2222
    2343    pshufd              m11, m3, q3333
    2344 %endif
; Outer loop = rows; inner loop = 8 output pixels (a..h) per iteration.
    2345 .h_w8_loop0:
    2346    mov                  r6, wq
    2347 .h_w8_loop:
    2348    movu                 m0, [srcq+r6*2- 6]
    2349    movu                 m1, [srcq+r6*2+ 2]
    2350    pshufb               m2, m0, m6   ; 0 1 1 2 2 3 3 4
    2351    pshufb               m0, m7       ; 2 3 3 4 4 5 5 6
    2352    pmaddwd              m2, m8       ; abcd0
    2353    pmaddwd              m0, m9       ; abcd1
    2354    pshufb               m3, m1, m6   ; 4 5 5 6 6 7 7 8
    2355    pshufb               m1, m7       ; 6 7 7 8 8 9 9 a
    2356    paddd                m2, m4       ; + rounding
    2357    paddd                m0, m2
    2358    pmaddwd              m2, m10, m3  ; abcd2
    2359    pmaddwd              m3, m8       ; efgh0
    2360    paddd                m0, m2
    2361    pmaddwd              m2, m11, m1  ; abcd3
    2362    pmaddwd              m1, m9       ; efgh1
    2363    paddd                m0, m2
    2364    movu                 m2, [srcq+r6*2+10]
    2365    paddd                m3, m4       ; + rounding
    2366    paddd                m1, m3
    2367    pshufb               m3, m2, m6   ; 8 9 9 a a b b c
    2368    pshufb               m2, m7       ; a b b c c d d e
    2369    pmaddwd              m3, m10      ; efgh2
    2370    pmaddwd              m2, m11      ; efgh3
    2371    paddd                m1, m3
    2372    paddd                m1, m2
    2373    psrad                m0, 6
    2374    psrad                m1, 6
    2375    packssdw             m0, m1
    2376    pxor                 m1, m1
    2377    pminsw               m0, m5       ; clamp to [0, bitdepth_max]
    2378    pmaxsw               m0, m1
    2379    mova        [dstq+r6*2], m0
    2380    add                  r6, 8
    2381    jl .h_w8_loop
    2382    add                srcq, ssq
    2383    add                dstq, dsq
    2384    dec                  hd
    2385    jg .h_w8_loop0
    2386    RET
   2387 .hv:
   2388    RESET_STACK_STATE
   2389 %if ARCH_X86_32
   2390    movd                 m4, r8m
   2391    pshufb               m4, [base+pw_256]
   2392 %else
   2393 %if WIN64
   2394    ALLOC_STACK        16*6, 16
   2395 %endif
   2396    movd                m15, r8m
   2397    pshufb              m15, [base+pw_256]
   2398 %endif
   2399    cmp                  wd, 4
   2400    jg .hv_w8
   2401    movzx               mxd, mxb
   2402    je .hv_w4
   2403    movq                 m0, [base+subpel_filters+mxq*8]
   2404    movzx               mxd, myb
   2405    shr                 myd, 16
   2406    cmp                  hd, 6
   2407    cmovb               myd, mxd
   2408    movq                 m3, [base+subpel_filters+myq*8]
   2409    movddup              m6, [base+pd_8704]
   2410    pshuflw              m0, m0, q2121
   2411    pxor                 m7, m7
   2412    punpcklbw            m7, m0
   2413    punpcklbw            m3, m3
   2414    psraw                m3, 8 ; sign-extend
   2415    test          dword r8m, 0x800
   2416    jz .hv_w2_10bpc
   2417    movddup              m6, [base+pd_2560]
   2418    psraw                m7, 2
   2419    psllw                m3, 2
   2420 .hv_w2_10bpc:
   2421 %if ARCH_X86_32
   2422    mov                dstq, dstmp
   2423    mov                 dsq, dsmp
   2424    mova                 m5, [base+spel_h_shuf2]
   2425    ALLOC_STACK       -16*8
   2426    pshufd               m0, m3, q0000
   2427    pshufd               m1, m3, q1111
   2428    pshufd               m2, m3, q2222
   2429    pshufd               m3, m3, q3333
   2430    mova                 m9, m5
   2431    mova                m11, m0
   2432    mova                m12, m1
   2433    mova                m13, m2
   2434    mova                m14, m3
   2435    mova                m15, m4
   2436 %else
   2437    mova                 m9, [base+spel_h_shuf2]
   2438    pshufd              m11, m3, q0000
   2439    pshufd              m12, m3, q1111
   2440    pshufd              m13, m3, q2222
   2441    pshufd              m14, m3, q3333
   2442 %endif
   2443    lea                  r6, [ssq*3]
   2444    sub                srcq, 2
   2445    sub                srcq, r6
   2446    movu                 m2, [srcq+ssq*0]
   2447    movu                 m3, [srcq+ssq*1]
   2448    movu                 m1, [srcq+ssq*2]
   2449    add                srcq, r6
   2450    movu                 m4, [srcq+ssq*0]
   2451 %if ARCH_X86_32
   2452    REPX    {pshufb  x, m5}, m2, m3, m1, m4
   2453 %else
   2454    REPX    {pshufb  x, m9}, m2, m3, m1, m4
   2455 %endif
   2456    REPX    {pmaddwd x, m7}, m2, m3, m1, m4
   2457    phaddd               m2, m3        ; 0 1
   2458    phaddd               m1, m4        ; 2 3
   2459    movu                 m3, [srcq+ssq*1]
   2460    movu                 m4, [srcq+ssq*2]
   2461    add                srcq, r6
   2462    movu                 m0, [srcq+ssq*0]
   2463 %if ARCH_X86_32
   2464    REPX    {pshufb  x, m5}, m3, m4, m0
   2465 %else
   2466    REPX    {pshufb  x, m9}, m3, m4, m0
   2467 %endif
   2468    REPX    {pmaddwd x, m7}, m3, m4, m0
   2469    phaddd               m3, m4        ; 4 5
   2470    phaddd               m0, m0        ; 6 6
   2471    REPX    {paddd   x, m6}, m2, m1, m3, m0
   2472    REPX    {psrad   x, 10}, m2, m1, m3, m0
   2473    packssdw             m2, m1        ; 0 1 2 3
   2474    packssdw             m3, m0        ; 4 5 6 _
   2475    palignr              m4, m3, m2, 4 ; 1 2 3 4
   2476    pshufd               m5, m3, q0321 ; 5 6 _ _
   2477    punpcklwd            m1, m2, m4    ; 01 12
   2478    punpckhwd            m2, m4        ; 23 34
   2479    punpcklwd            m3, m5        ; 45 56
   2480 .hv_w2_loop:
   2481    movu                 m4, [srcq+ssq*1]
   2482    lea                srcq, [srcq+ssq*2]
   2483    movu                 m5, [srcq+ssq*0]
   2484    pshufb               m4, m9
   2485    pshufb               m5, m9
   2486    pmaddwd              m4, m7
   2487    pmaddwd              m5, m7
   2488    phaddd               m4, m5
   2489    pmaddwd              m5, m11, m1   ; a0 b0
   2490    mova                 m1, m2
   2491    pmaddwd              m2, m12       ; a1 b1
   2492    paddd                m5, m2
   2493    mova                 m2, m3
   2494    pmaddwd              m3, m13       ; a2 b2
   2495    paddd                m5, m3
   2496    paddd                m4, m6
   2497    psrad                m4, 10        ; 7 8
   2498    packssdw             m0, m4
   2499    pshufd               m3, m0, q2103
   2500    punpckhwd            m3, m0        ; 67 78
   2501    mova                 m0, m4
   2502    pmaddwd              m4, m14, m3   ; a3 b3
   2503    paddd                m5, m4
   2504    psrad                m5, 10
   2505    packssdw             m5, m5
   2506    pxor                 m4, m4
   2507    pminsw               m5, m15
   2508    pmaxsw               m5, m4
   2509    movd       [dstq+dsq*0], m5
   2510    pshuflw              m5, m5, q3232
   2511    movd       [dstq+dsq*1], m5
   2512    lea                dstq, [dstq+dsq*2]
   2513    sub                  hd, 2
   2514    jg .hv_w2_loop
   2515    RET
   2516 .hv_w8:
   2517    shr                 mxd, 16
   2518 .hv_w4:
   2519    movq                 m2, [base+subpel_filters+mxq*8]
   2520    movzx               mxd, myb
   2521    shr                 myd, 16
   2522    cmp                  hd, 6
   2523    cmovb               myd, mxd
   2524    movq                 m3, [base+subpel_filters+myq*8]
   2525 %if ARCH_X86_32
   2526    RESET_STACK_STATE
   2527    mov                dstq, dstmp
   2528    mov                 dsq, dsmp
   2529    mova                 m0, [base+spel_h_shufA]
   2530    mova                 m1, [base+spel_h_shufB]
   2531    mova                 m6, [base+pd_512]
   2532    ALLOC_STACK      -16*15
   2533    mova                 m8, m0
   2534    mova                 m9, m1
   2535    mova                m14, m6
   2536 %else
   2537    mova                 m8, [base+spel_h_shufA]
   2538    mova                 m9, [base+spel_h_shufB]
   2539 %endif
   2540    pxor                 m0, m0
   2541    punpcklbw            m0, m2
   2542    punpcklbw            m3, m3
   2543    psraw                m3, 8
   2544    test          dword r8m, 0x800
   2545    jz .hv_w4_10bpc
   2546    psraw                m0, 2
   2547    psllw                m3, 2
   2548 .hv_w4_10bpc:
   2549    lea                  r6, [ssq*3]
   2550    sub                srcq, 6
   2551    sub                srcq, r6
   2552 %if ARCH_X86_32
   2553    %define tmp esp+16*8
   2554    shl                  wd, 14
   2555 %if STACK_ALIGNMENT < 16
   2556    mov          [esp+4*61], srcq
   2557    mov          [esp+4*62], dstq
   2558 %else
   2559    mov               srcmp, srcq
   2560 %endif
   2561    mova         [tmp+16*5], m4
   2562    lea                  wd, [wq+hq-(1<<16)]
   2563    pshufd               m1, m0, q0000
   2564    pshufd               m2, m0, q1111
   2565    pshufd               m5, m0, q2222
   2566    pshufd               m0, m0, q3333
   2567    mova                m10, m1
   2568    mova                m11, m2
   2569    mova                m12, m5
   2570    mova                m13, m0
   2571 %else
   2572 %if WIN64
   2573    %define tmp rsp
   2574 %else
   2575    %define tmp rsp-104 ; red zone
   2576 %endif
   2577    shl                  wd, 6
   2578    mov                  r7, srcq
   2579    mov                  r8, dstq
   2580    lea                  wd, [wq+hq-(1<<8)]
   2581    pshufd              m10, m0, q0000
   2582    pshufd              m11, m0, q1111
   2583    pshufd              m12, m0, q2222
   2584    pshufd              m13, m0, q3333
   2585    mova         [tmp+16*5], m15
   2586 %endif
   2587    pshufd               m0, m3, q0000
   2588    pshufd               m1, m3, q1111
   2589    pshufd               m2, m3, q2222
   2590    pshufd               m3, m3, q3333
   2591    mova         [tmp+16*1], m0
   2592    mova         [tmp+16*2], m1
   2593    mova         [tmp+16*3], m2
   2594    mova         [tmp+16*4], m3
   2595 %macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
   2596    pshufb              m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
   2597    pshufb              m%1, m9      ; 2 3 3 4 4 5 5 6
   2598    pmaddwd             m%3, m10
   2599    pmaddwd             m%1, m11
   2600    paddd               m%3, %5
   2601    paddd               m%1, m%3
   2602    pshufb              m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
   2603    pshufb              m%2, m9      ; 6 7 7 8 8 9 9 a
   2604    pmaddwd             m%3, m12
   2605    pmaddwd             m%2, m13
   2606    paddd               m%1, m%3
   2607    paddd               m%1, m%2
   2608    psrad               m%1, %4
   2609 %endmacro
   2610 .hv_w4_loop0:
   2611 %if ARCH_X86_64
   2612    mova                m14, [pd_512]
   2613 %endif
   2614    movu                 m4, [srcq+ssq*0+0]
   2615    movu                 m1, [srcq+ssq*0+8]
   2616    movu                 m5, [srcq+ssq*1+0]
   2617    movu                 m2, [srcq+ssq*1+8]
   2618    movu                 m6, [srcq+ssq*2+0]
   2619    movu                 m3, [srcq+ssq*2+8]
   2620    add                srcq, r6
   2621    PUT_8TAP_HV_H         4, 1, 0, 10
   2622    PUT_8TAP_HV_H         5, 2, 0, 10
   2623    PUT_8TAP_HV_H         6, 3, 0, 10
   2624    movu                 m7, [srcq+ssq*0+0]
   2625    movu                 m2, [srcq+ssq*0+8]
   2626    movu                 m1, [srcq+ssq*1+0]
   2627    movu                 m3, [srcq+ssq*1+8]
   2628    PUT_8TAP_HV_H         7, 2, 0, 10
   2629    PUT_8TAP_HV_H         1, 3, 0, 10
   2630    movu                 m2, [srcq+ssq*2+0]
   2631    movu                 m3, [srcq+ssq*2+8]
   2632    add                srcq, r6
   2633    PUT_8TAP_HV_H         2, 3, 0, 10
   2634    packssdw             m4, m7      ; 0 3
   2635    packssdw             m5, m1      ; 1 4
   2636    movu                 m0, [srcq+ssq*0+0]
   2637    movu                 m1, [srcq+ssq*0+8]
   2638    PUT_8TAP_HV_H         0, 1, 3, 10
   2639    packssdw             m6, m2      ; 2 5
   2640    packssdw             m7, m0      ; 3 6
   2641    punpcklwd            m1, m4, m5  ; 01
   2642    punpckhwd            m4, m5      ; 34
   2643    punpcklwd            m2, m5, m6  ; 12
   2644    punpckhwd            m5, m6      ; 45
   2645    punpcklwd            m3, m6, m7  ; 23
   2646    punpckhwd            m6, m7      ; 56
   2647 %if ARCH_X86_32
   2648    jmp .hv_w4_loop_start
   2649 .hv_w4_loop:
   2650    mova                 m1, [tmp+16*6]
   2651    mova                 m2, m15
   2652 .hv_w4_loop_start:
   2653    mova                 m7, [tmp+16*1]
   2654    pmaddwd              m1, m7      ; a0
   2655    pmaddwd              m2, m7      ; b0
   2656    mova                 m7, [tmp+16*2]
   2657    mova         [tmp+16*6], m3
   2658    pmaddwd              m3, m7      ; a1
   2659    mova                m15, m4
   2660    pmaddwd              m4, m7      ; b1
   2661    mova                 m7, [tmp+16*3]
   2662    paddd                m1, m3
   2663    paddd                m2, m4
   2664    mova                 m3, m5
   2665    pmaddwd              m5, m7      ; a2
   2666    mova                 m4, m6
   2667    pmaddwd              m6, m7      ; b2
   2668    paddd                m1, m5
   2669    paddd                m2, m6
   2670    movu                 m7, [srcq+ssq*1+0]
   2671    movu                 m5, [srcq+ssq*1+8]
   2672    lea                srcq, [srcq+ssq*2]
   2673    PUT_8TAP_HV_H         7, 5, 6, 10
   2674    packssdw             m0, m7      ; 6 7
   2675    mova         [tmp+16*0], m0
   2676    movu                 m0, [srcq+ssq*0+0]
   2677    movu                 m5, [srcq+ssq*0+8]
   2678    PUT_8TAP_HV_H         0, 5, 6, 10
   2679    mova                 m6, [tmp+16*0]
   2680    packssdw             m7, m0      ; 7 8
   2681    punpcklwd            m5, m6, m7  ; 67
   2682    punpckhwd            m6, m7      ; 78
   2683    pmaddwd              m7, m5, [tmp+16*4]
   2684    paddd                m1, m7      ; a3
   2685    pmaddwd              m7, m6, [tmp+16*4]
   2686    paddd                m2, m7      ; b3
   2687    psrad                m1, 9
   2688    psrad                m2, 9
   2689    packssdw             m1, m2
   2690    pxor                 m7, m7
   2691    pmaxsw               m1, m7
   2692    pavgw                m7, m1
   2693    pminsw               m7, [tmp+16*5]
   2694    movq       [dstq+dsq*0], m7
   2695    movhps     [dstq+dsq*1], m7
   2696    lea                dstq, [dstq+dsq*2]
   2697    sub                  hd, 2
   2698    jg .hv_w4_loop
   2699 %if STACK_ALIGNMENT < 16
   2700    mov                srcq, [esp+4*61]
   2701    mov                dstq, [esp+4*62]
   2702    add                srcq, 8
   2703    add                dstq, 8
   2704    mov          [esp+4*61], srcq
   2705    mov          [esp+4*62], dstq
   2706 %else
   2707    mov                srcq, srcmp
   2708    mov                dstq, dstmp
   2709    add                srcq, 8
   2710    add                dstq, 8
   2711    mov               srcmp, srcq
   2712    mov               dstmp, dstq
   2713 %endif
   2714    movzx                hd, ww
   2715    sub                  wd, 1<<16
   2716 %else
   2717 .hv_w4_loop:
   2718    mova                m15, [tmp+16*1]
   2719    pmaddwd             m14, m15, m1 ; a0
   2720    pmaddwd             m15, m2      ; b0
   2721    mova                 m7, [tmp+16*2]
   2722    mova                 m1, m3
   2723    pmaddwd              m3, m7      ; a1
   2724    mova                 m2, m4
   2725    pmaddwd              m4, m7      ; b1
   2726    mova                 m7, [tmp+16*3]
   2727    paddd               m14, m3
   2728    paddd               m15, m4
   2729    mova                 m3, m5
   2730    pmaddwd              m5, m7      ; a2
   2731    mova                 m4, m6
   2732    pmaddwd              m6, m7      ; b2
   2733    paddd               m14, m5
   2734    paddd               m15, m6
   2735    movu                 m7, [srcq+ssq*1+0]
   2736    movu                 m5, [srcq+ssq*1+8]
   2737    lea                srcq, [srcq+ssq*2]
   2738    PUT_8TAP_HV_H         7, 5, 6, 10, [pd_512]
   2739    packssdw             m0, m7      ; 6 7
   2740    mova         [tmp+16*0], m0
   2741    movu                 m0, [srcq+ssq*0+0]
   2742    movu                 m5, [srcq+ssq*0+8]
   2743    PUT_8TAP_HV_H         0, 5, 6, 10, [pd_512]
   2744    mova                 m6, [tmp+16*0]
   2745    packssdw             m7, m0      ; 7 8
   2746    punpcklwd            m5, m6, m7  ; 67
   2747    punpckhwd            m6, m7      ; 78
   2748    pmaddwd              m7, m5, [tmp+16*4]
   2749    paddd               m14, m7      ; a3
   2750    pmaddwd              m7, m6, [tmp+16*4]
   2751    paddd               m15, m7      ; b3
   2752    psrad               m14, 9
   2753    psrad               m15, 9
   2754    packssdw            m14, m15
   2755    pxor                 m7, m7
   2756    pmaxsw              m14, m7
   2757    pavgw                m7, m14
   2758    pminsw               m7, [tmp+16*5]
   2759    movq       [dstq+dsq*0], m7
   2760    movhps     [dstq+dsq*1], m7
   2761    lea                dstq, [dstq+dsq*2]
   2762    sub                  hd, 2
   2763    jg .hv_w4_loop
   2764    add                  r7, 8
   2765    add                  r8, 8
   2766    movzx                hd, wb
   2767    mov                srcq, r7
   2768    mov                dstq, r8
   2769    sub                  wd, 1<<8
   2770 %endif
   2771    jg .hv_w4_loop0
   2772    RET
   2773 %undef tmp
   2774 
   2775 %if ARCH_X86_32
   2776 DECLARE_REG_TMP 2, 1, 6, 4
   2777 %elif WIN64
   2778 DECLARE_REG_TMP 6, 4, 7, 4
   2779 %else
   2780 DECLARE_REG_TMP 6, 7, 7, 8
   2781 %endif
   2782 
   2783 %define PREP_8TAP_FN FN prep_8tap,
   2784 PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
   2785 PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
   2786 PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
   2787 PREP_8TAP_FN regular,        REGULAR, REGULAR
   2788 
   2789 cglobal prep_6tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my
   2790    %define            base  t2-prep_ssse3
   2791 %if ARCH_X86_32
   2792    %define             mxb  r0b
   2793    %define             mxd  r0
   2794    %define             mxq  r0
   2795    %define             myb  r2b
   2796    %define             myd  r2
   2797    %define             myq  r2
   2798 %endif
   2799    imul                mxd, mxm, 0x010101
   2800    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
   2801    imul                myd, mym, 0x010101
   2802    add                 myd, t1d ; 6tap_v, my, 4tap_v
   2803    LEA                  t2, prep_ssse3
   2804    movifnidn            wd, wm
   2805    movifnidn            hd, hm
   2806    movifnidn          srcq, srcmp
   2807    test                mxd, 0xf00
   2808    jnz .h
   2809    test                myd, 0xf00
   2810    jnz .v
   2811 .prep:
   2812    tzcnt                wd, wd
   2813    mov                 myd, r7m ; bitdepth_max
   2814    movzx                wd, word [base+prep_ssse3_table+wq*2]
   2815    mova                 m5, [base+pw_8192]
   2816    shr                 myd, 11
   2817    add                  wq, t2
   2818    movddup              m4, [base+prep_mul+myq*8]
   2819    movifnidn           ssq, ssmp
   2820    movifnidn          tmpq, tmpmp
   2821    lea                  r6, [ssq*3]
   2822 %if WIN64
   2823    pop                  r7
   2824 %endif
   2825    jmp                  wq
   2826 .h:
   2827    RESET_STACK_STATE
   2828    test                myd, 0xf00
   2829    jnz .hv
   2830    movifnidn           ssq, r2mp
   2831    movddup              m5, [base+prep_8tap_1d_rnd]
   2832    cmp                  wd, 4
   2833    je mangle(private_prefix %+ _prep_8tap_16bpc_ssse3).h_w4
   2834    WIN64_SPILL_XMM      10
   2835    shr                 mxd, 16
   2836    movq                 m2, [base+subpel_filters+1+mxq*8]
   2837    movifnidn          tmpq, r0mp
   2838    mova                 m4, [base+spel_h_shufA]
   2839    add                  wd, wd
   2840    mova                 m6, [base+spel_h_shufB]
   2841    add                srcq, wq
   2842    punpcklbw            m2, m2
   2843    add                tmpq, wq
   2844    psraw                m2, 8
   2845    neg                  wq
   2846    test          dword r7m, 0x800
   2847    jnz .h_w8_12bpc
   2848    psllw                m2, 2
   2849 .h_w8_12bpc:
   2850    pshufd               m7, m2, q0000
   2851 %if ARCH_X86_32
   2852    ALLOC_STACK       -16*2
   2853    %define              m8  [rsp+16*0]
   2854    %define              m9  [rsp+16*1]
   2855    pshufd               m0, m2, q1111
   2856    pshufd               m1, m2, q2222
   2857    mova                 m8, m0
   2858    mova                 m9, m1
   2859 %else
   2860    pshufd               m8, m2, q1111
   2861    pshufd               m9, m2, q2222
   2862 %endif
   2863 .h_w8_loop0:
   2864    mov                  r6, wq
   2865 .h_w8_loop:
   2866    movu                 m3, [srcq+r6-4]
   2867    movu                 m2, [srcq+r6+8]
   2868    pshufb               m0, m3, m4  ; 01 12 23 34
   2869    pmaddwd              m0, m7      ; abcd0
   2870    pshufb               m3, m6      ; 23 34 45 56
   2871    pmaddwd              m1, m8, m3  ; abcd1
   2872    paddd                m0, m1
   2873    pshufb               m1, m2, m4  ; 67 78 89 9a
   2874    shufpd               m3, m1, 0x01; 45 56 67 78
   2875    pmaddwd              m1, m8      ; efgh1
   2876    pshufb               m2, m6      ; 89 9a ab bc
   2877    pmaddwd              m2, m9      ; efgh2
   2878    paddd                m1, m2
   2879    pmaddwd              m2, m9 , m3 ; abcd2
   2880    pmaddwd              m3, m7      ; efgh0
   2881    paddd                m0, m5
   2882    paddd                m1, m5
   2883    paddd                m0, m2
   2884    paddd                m1, m3
   2885    psrad                m0, 4
   2886    psrad                m1, 4
   2887    packssdw             m0, m1
   2888    mova          [tmpq+r6], m0
   2889    add                  r6, 16
   2890    jl .h_w8_loop
   2891    add                srcq, ssq
   2892    sub                tmpq, wq
   2893    dec                  hd
   2894    jg .h_w8_loop0
   2895    RET
   2896 .v:
   2897    movzx               mxd, myb
   2898    shr                 myd, 16
   2899    cmp                  hd, 6
   2900    cmovb               myd, mxd
   2901    movddup              m5, [base+prep_8tap_1d_rnd]
   2902    movq                 m2, [base+subpel_filters+1+myq*8]
   2903    WIN64_SPILL_XMM      11, 16
   2904    movifnidn           ssq, r2mp
   2905    movifnidn          tmpq, r0mp
   2906    punpcklbw            m2, m2
   2907    sub                srcq, ssq
   2908    psraw                m2, 8 ; sign-extend
   2909    test          dword r7m, 0x800
   2910    jnz .v_12bpc
   2911    psllw                m2, 2
   2912 .v_12bpc:
   2913    sub                srcq, ssq
   2914 %if ARCH_X86_32
   2915    ALLOC_STACK       -16*4
   2916    pshufd               m0, m2, q0000
   2917    mov                 r6d, wd
   2918    pshufd               m1, m2, q1111
   2919    shl                 r6d, 14
   2920    pshufd               m2, m2, q2222
   2921    lea                 r6d, [r6+hq-(1<<16)]
   2922    mova                 m8, m0
   2923    mova                 m9, m1
   2924    mova                m10, m2
   2925 %if STACK_ALIGNMENT < 16
   2926    %define           srcmp  [esp+16*3+4*0]
   2927    %define           tmpmp  [esp+16*3+4*1]
   2928 %endif
   2929 .v_w4_loop0:
   2930    mov               srcmp, srcq
   2931    mov               tmpmp, tmpq
   2932 %else
   2933    pshufd               m8, m2, q0000
   2934    and                  wd, -8
   2935    jnz .v_w8
   2936    pshufd               m9, m2, q1111
   2937    pshufd              m10, m2, q2222
   2938 %endif
   2939    movq                 m1, [srcq+ssq*0]
   2940    movq                 m2, [srcq+ssq*1]
   2941    lea                srcq, [srcq+ssq*2]
   2942    movq                 m3, [srcq+ssq*0]
   2943    movq                 m4, [srcq+ssq*1]
   2944    lea                srcq, [srcq+ssq*2]
   2945    movq                 m0, [srcq+ssq*0]
   2946    punpcklwd            m1, m2      ; 01
   2947    punpcklwd            m2, m3      ; 12
   2948    punpcklwd            m3, m4      ; 23
   2949    punpcklwd            m4, m0      ; 34
   2950 .v_w4_loop:
   2951    pmaddwd              m6, m8, m1  ; a0
   2952    pmaddwd              m7, m8, m2  ; b0
   2953    mova                 m1, m3
   2954    pmaddwd              m3, m9      ; a1
   2955    mova                 m2, m4
   2956    pmaddwd              m4, m9      ; b1
   2957    paddd                m6, m3
   2958    movq                 m3, [srcq+ssq*0]
   2959    paddd                m7, m4
   2960    movq                 m4, [srcq+ssq*1]
   2961    lea                srcq, [srcq+ssq*2]
   2962    movq                 m0, [srcq+ssq*0]
   2963    punpcklwd            m3, m4      ; 45
   2964    punpcklwd            m4, m0      ; 56
   2965    pmaddwd              m0, m10, m3 ; a2
   2966    paddd                m6, m5
   2967    paddd                m6, m0
   2968    pmaddwd              m0, m10, m4 ; b2
   2969    paddd                m7, m5
   2970    paddd                m7, m0
   2971    psrad                m6, 4
   2972    psrad                m7, 4
   2973    packssdw             m6, m7
   2974 %if ARCH_X86_32
   2975    movq        [tmpq+wq*0], m6
   2976    movhps      [tmpq+wq*2], m6
   2977    lea                tmpq, [tmpq+wq*4]
   2978    sub                  hd, 2
   2979    jg .v_w4_loop
   2980    mov                srcq, srcmp
   2981    mov                tmpq, tmpmp
   2982    movzx                hd, r6w
   2983    add                srcq, 8
   2984    add                tmpq, 8
   2985    sub                 r6d, 1<<16
   2986    jg .v_w4_loop0
   2987    RET
   2988 %else
   2989    mova             [tmpq], m6
   2990    add                tmpq, 16
   2991    sub                  hd, 2
   2992    jg .v_w4_loop
   2993    RET
   2994 .v_w8:
   2995    mova                r6m, m8
   2996    lea                 r6d, [wq*4-(1<<5)]
   2997    pshufd               m6, m2, q1111
   2998    lea                 r6d, [hq+r6*8]
   2999    pshufd               m7, m2, q2222
   3000    WIN64_PUSH_XMM       16
   3001 .v_w8_loop0:
   3002    movu                 m9, [srcq+ssq*0]
   3003    lea                  r5, [srcq+ssq*2]
   3004    movu                m11, [srcq+ssq*1]
   3005    mov                  r7, tmpq
   3006    movu                m13, [r5+ssq*0]
   3007    movu                m15, [r5+ssq*1]
   3008    lea                  r5, [r5+ssq*2]
   3009    movu                 m4, [r5+ssq*0]
   3010    punpcklwd            m8, m9, m11  ; 01
   3011    punpckhwd            m9, m11
   3012    punpcklwd           m10, m11, m13 ; 12
   3013    punpckhwd           m11, m13
   3014    punpcklwd           m12, m13, m15 ; 23
   3015    punpckhwd           m13, m15
   3016    punpcklwd           m14, m15, m4  ; 34
   3017    punpckhwd           m15, m4
   3018 .v_w8_loop:
   3019    mova                 m3, r6m
   3020    pmaddwd              m0, m8, m3   ; a0
   3021    pmaddwd              m2, m9, m3   ; a0'
   3022    pmaddwd              m1, m10, m3  ; b0
   3023    pmaddwd              m3, m11      ; b0'
   3024    mova                 m8, m12
   3025    pmaddwd             m12, m6       ; a1
   3026    mova                 m9, m13
   3027    pmaddwd             m13, m6       ; a1'
   3028    mova                m10, m14
   3029    pmaddwd             m14, m6       ; b1
   3030    mova                m11, m15
   3031    pmaddwd             m15, m6       ; b1'
   3032    paddd                m0, m12
   3033    paddd                m2, m13
   3034    movu                m13, [r5+ssq*0]
   3035    paddd                m1, m14
   3036    paddd                m3, m15
   3037    movu                m15, [r5+ssq*1]
   3038    lea                  r5, [r5+ssq*2]
   3039    movu                 m4, [r5+ssq*0]
   3040    REPX      {paddd x, m5}, m0, m2, m1, m3
   3041    punpcklwd           m12, m13, m15 ; 45
   3042    punpckhwd           m13, m15
   3043    punpcklwd           m14, m15, m4  ; 56
   3044    punpckhwd           m15, m4
   3045    pmaddwd              m4, m7, m12  ; a2
   3046    paddd                m0, m4
   3047    pmaddwd              m4, m7, m13  ; a2'
   3048    paddd                m2, m4
   3049    pmaddwd              m4, m7, m14  ; b2
   3050    paddd                m1, m4
   3051    pmaddwd              m4, m7, m15  ; b2'
   3052    paddd                m3, m4
   3053    REPX       {psrad x, 4}, m0, m2, m1, m3
   3054    packssdw             m0, m2
   3055    packssdw             m1, m3
   3056    mova          [r7+wq*0], m0
   3057    mova          [r7+wq*2], m1
   3058    lea                  r7, [r7+wq*4]
   3059    sub                  hd, 2
   3060    jg .v_w8_loop
   3061    add                srcq, 16
   3062    add                tmpq, 16
   3063    movzx                hd, r6b
   3064    sub                 r6d, 1<<8
   3065    jg .v_w8_loop0
   3066    RET
   3067 %endif
   3068 .hv:
   3069    and                  wd, -8
   3070    jnz .hv_w8
   3071    movzx               mxd, mxb
   3072    movq                 m0, [base+subpel_filters+mxq*8]
   3073    movzx               mxd, myb
   3074    shr                 myd, 16
   3075    cmp                  hd, 6
   3076    cmovb               myd, mxd
   3077    movq                 m2, [base+subpel_filters+1+myq*8]
   3078    WIN64_SPILL_XMM      15
   3079    movifnidn           ssq, r2mp
   3080    movifnidn          tmpq, r0mp
   3081    mova                 m7, [base+prep_8tap_2d_rnd]
   3082    sub                srcq, 2
   3083    pshuflw              m0, m0, q2121
   3084    pxor                 m6, m6
   3085    punpcklbw            m6, m0
   3086    punpcklbw            m2, m2
   3087    psraw                m6, 4
   3088    psraw                m2, 8
   3089    test          dword r7m, 0x800
   3090    jz .hv_w4_10bpc
   3091    psraw                m6, 2
   3092 .hv_w4_10bpc:
   3093 %if ARCH_X86_32
   3094 %assign regs_used 4
   3095    ALLOC_STACK       -16*7
   3096 %assign regs_used 7
   3097    %define             m10  [esp+16*3]
   3098    %define             m12  [esp+16*5]
   3099    %define             m13  [esp+16*6]
   3100    %define             m14  [base+spel_h_shufA]
   3101    %define             m11  [base+spel_h_shufB]
   3102    pshufd               m0, m2, q0000
   3103    pshufd               m1, m2, q1111
   3104    pshufd               m2, m2, q2222
   3105    pshufd               m5, m6, q0000
   3106    pshufd               m6, m6, q1111
   3107    mova                 m8, m0
   3108    mova                 m9, m1
   3109    mova                m10, m2
   3110    mova                m12, m5
   3111    mova                m13, m6
   3112    neg                 ssq
   3113    movu                 m3, [srcq+ssq*2]
   3114    movu                 m4, [srcq+ssq*1]
   3115    neg                 ssq
   3116 %else
   3117    mov                  r6, ssq
   3118    pshufd               m8, m2, q0000
   3119    neg                  r6
   3120    pshufd               m9, m2, q1111
   3121    movu                 m3, [srcq+r6 *2]
   3122    pshufd              m10, m2, q2222
   3123    movu                 m4, [srcq+r6 *1]
   3124    pshufd              m12, m6, q0000
   3125    mova                m14, [base+spel_h_shufA]
   3126    pshufd              m13, m6, q1111
   3127    mova                m11, [base+spel_h_shufB]
   3128 %endif
   3129    movu                 m1, [srcq+ssq*0]
   3130    movu                 m0, [srcq+ssq*1]
   3131    lea                srcq, [srcq+ssq*2]
   3132    movu                 m2, [srcq+ssq*0]
   3133    HV_H_W4_6TAP         m3, m3, m5, m11
   3134    HV_H_W4_6TAP         m4, m4, m5, m11
   3135    HV_H_W4_6TAP         m5, m1, m5, m11
   3136    HV_H_W4_6TAP         m0, m0, m1, m11
   3137    HV_H_W4_6TAP         m2, m2, m1, m11
   3138    REPX       {psrad x, 6}, m3, m5, m4, m0, m2
   3139    packssdw             m3, m5      ; 0 2
   3140    packssdw             m4, m0      ; 1 3
   3141    packssdw             m5, m2      ; 2 4
   3142    punpcklwd            m1, m3, m4  ; 01
   3143    punpckhwd            m3, m4      ; 23
   3144    punpcklwd            m2, m4, m5  ; 12
   3145    punpckhwd            m4, m5      ; 34
   3146 .hv_w4_loop:
   3147    movu                 m0, [srcq+ssq*1]
   3148    pmaddwd              m5, m8, m1  ; a0
   3149    lea                srcq, [srcq+ssq*2]
   3150    pmaddwd              m6, m8, m2  ; b0
   3151    mova                 m1, m3
   3152    pmaddwd              m3, m9      ; a1
   3153    mova                 m2, m4
   3154    pmaddwd              m4, m9      ; b1
   3155    paddd                m5, m3
   3156    movu                 m3, [srcq+ssq*0]
   3157    paddd                m6, m4
   3158    HV_H_W4_6TAP         m0, m0, m4, m11
   3159    HV_H_W4_6TAP         m3, m3, m4, m11
   3160    psrad                m4, m2, 16
   3161    psrad                m0, 6
   3162    psrad                m3, 6
   3163    packssdw             m4, m0      ; 4 5
   3164    packssdw             m0, m3      ; 5 6
   3165    punpcklwd            m3, m4, m0  ; 45
   3166    punpckhwd            m4, m0      ; 56
   3167    pmaddwd              m0, m10, m3 ; a2
   3168    paddd                m5, m7
   3169    paddd                m5, m0
   3170    pmaddwd              m0, m10, m4 ; b2
   3171    paddd                m6, m7
   3172    paddd                m6, m0
   3173    psrad                m5, 6
   3174    psrad                m6, 6
   3175    packssdw             m5, m6
   3176    mova             [tmpq], m5
   3177    add                tmpq, 16
   3178    sub                  hd, 2
   3179    jg .hv_w4_loop
   3180    RET
   3181 .hv_w8:
   3182    RESET_STACK_STATE
   3183    shr                 mxd, 16
   3184    movq                 m2, [base+subpel_filters+1+mxq*8]
   3185    movzx               mxd, myb
   3186    shr                 myd, 16
   3187    cmp                  hd, 6
   3188    cmovb               myd, mxd
   3189    movq                 m1, [base+subpel_filters+1+myq*8]
   3190    movifnidn           ssq, r2mp
   3191    mova                 m4, [base+prep_8tap_2d_rnd]
   3192    pxor                 m0, m0
   3193    punpcklbw            m0, m2
   3194    punpcklbw            m1, m1
   3195    sub                srcq, 4
   3196    psraw                m0, 4
   3197    psraw                m1, 8
   3198    test          dword r7m, 0x800
   3199    jz .hv_w8_10bpc
   3200    psraw                m0, 2
   3201 .hv_w8_10bpc:
   3202 %if ARCH_X86_32
   3203 %assign regs_used 1
   3204    ALLOC_STACK       -16*9
   3205 %assign regs_used 7
   3206    mov                tmpq, r0mp
   3207    mova         [rsp+16*7], m4
   3208 %else
   3209 %if WIN64
   3210    PUSH                 r8
   3211 %assign regs_used 9
   3212 %endif
   3213    ALLOC_STACK        16*6, 16
   3214 %endif
   3215    pshufd               m2, m0, q0000
   3216    mova         [rsp+16*0], m2
   3217    pshufd               m2, m0, q1111
   3218    mova         [rsp+16*1], m2
   3219    pshufd               m0, m0, q2222
   3220    mova         [rsp+16*2], m0
   3221    pshufd               m2, m1, q0000
   3222    mova         [rsp+16*3], m2
   3223    pshufd               m2, m1, q1111
   3224    mova         [rsp+16*4], m2
   3225    pshufd               m1, m1, q2222
   3226    mova         [rsp+16*5], m1
   3227    mov                  r6, ssq
   3228    neg                  r6
   3229 %if ARCH_X86_32
   3230    mov                 r5d, wd
   3231    shl                 r5d, 14
   3232    lea                 r5d, [r5+hq-(1<<16)]
   3233 %if STACK_ALIGNMENT < 16
   3234    %define           srcmp  [esp+16*8+4*0]
   3235    %define           tmpmp  [esp+16*8+4*1]
   3236 %endif
   3237 .hv_w8_loop0:
   3238    mov               srcmp, srcq
   3239    mov               tmpmp, tmpq
   3240    movu                 m5, [srcq+r6*2+0]
   3241    movu                 m6, [srcq+r6*2+2]
   3242    mova                 m7, [rsp+16*0]
   3243    mova                 m1, [rsp+16*1]
   3244    mova                 m0, [rsp+16*2]
   3245    HV_H_6TAP            m2, m5, m6, m7, m1, m0
   3246    movu                 m5, [srcq+r6*1+0]
   3247    movu                 m6, [srcq+r6*1+2]
   3248    HV_H_6TAP            m3, m5, m6, m7, m1, m0
   3249    movu                 m5, [srcq+ssq*0+0]
   3250    movu                 m6, [srcq+ssq*0+2]
   3251    HV_H_6TAP            m4, m5, m6, m7, m1, m0
   3252    movu                 m5, [srcq+ssq*1+0]
   3253    movu                 m6, [srcq+ssq*1+2]
   3254    lea                srcq, [srcq+ssq*2]
   3255    HV_H_6TAP            m0, m5, m6, m7, m1
   3256    movu                 m5, [srcq+ssq*0+0]
   3257    movu                 m6, [srcq+ssq*0+2]
   3258    HV_H_6TAP            m1, m5, m6, m7
   3259    mova                 m5, [rsp+16*7]
   3260    REPX      {paddd x, m5}, m2, m3, m4, m0, m1
   3261    REPX      {psrad x, 6 }, m2, m4, m3, m0, m1
   3262    packssdw             m2, m4     ; 0 2
   3263    packssdw             m3, m0     ; 1 3
   3264    packssdw             m4, m1     ; 2 4
   3265    punpcklwd            m0, m2, m3 ; 01
   3266    punpckhwd            m2, m3     ; 23
   3267    punpcklwd            m1, m3, m4 ; 12
   3268    punpckhwd            m3, m4     ; 34
   3269 .hv_w8_loop:
   3270    mova                 m5, [rsp+16*3]
   3271    mova                 m6, [rsp+16*4]
   3272    pmaddwd              m4, m0, m5 ; a0
   3273    pmaddwd              m5, m1     ; b0
   3274    mova                 m0, m2
   3275    pmaddwd              m2, m6     ; a1
   3276    mova                 m1, m3
   3277    pmaddwd              m3, m6     ; b1
   3278    paddd                m4, m2
   3279    movu                 m2, [srcq+ssq*1+0]
   3280    paddd                m5, m3
   3281    movu                 m3, [srcq+ssq*1+2]
   3282    lea                srcq, [srcq+ssq*2]
   3283    HV_H_6TAP            m6, m2, m3
   3284    movu                 m2, [srcq+ssq*0+0]
   3285    movu                 m3, [srcq+ssq*0+2]
   3286    HV_H_6TAP            m7, m2, m3
   3287    mova                 m2, [rsp+16*7]
   3288    psrad                m3, m1, 16
   3289    REPX      {paddd x, m2}, m6, m7, m4, m5
   3290    psrad                m6, 6
   3291    psrad                m7, 6
   3292    packssdw             m3, m6     ; 4 5
   3293    packssdw             m6, m7     ; 5 6
   3294    mova                 m7, [rsp+16*5]
   3295    punpcklwd            m2, m3, m6 ; 45
   3296    punpckhwd            m3, m6     ; 56
   3297    pmaddwd              m6, m2, m7 ; a2
   3298    pmaddwd              m7, m3     ; b2
   3299    paddd                m4, m6
   3300    paddd                m5, m7
   3301    psrad                m4, 6
   3302    psrad                m5, 6
   3303    packssdw             m4, m5
   3304    movq        [tmpq+wq*0], m4
   3305    movhps      [tmpq+wq*2], m4
   3306    lea                tmpq, [tmpq+wq*4]
   3307    sub                  hd, 2
   3308    jg .hv_w8_loop
   3309    mov                srcq, srcmp
   3310    mov                tmpq, tmpmp
   3311    movzx                hd, r5w
   3312    add                srcq, 8
   3313    add                tmpq, 8
   3314    sub                 r5d, 1<<16
   3315 %else
   3316    lea                 r8d, [wq*4-(1<<5)]
   3317    lea                 r8d, [hq+r8*8]
   3318 .hv_w8_loop0:
   3319    mova                 m5, [spel_h_shufA]
   3320    movu                 m0, [srcq+r6*2+ 0]
   3321    mova                 m6, [rsp+16*0]
   3322    movu                 m1, [srcq+r6*2+ 8]
   3323    mova                 m7, [rsp+16*1]
   3324    movu                 m2, [srcq+r6*2+16]
   3325    mova                 m8, [rsp+16*2]
   3326    HV_H_6TAP            m9, m0, m1, m2, 6, m5, m6, m7, m8
   3327    movu                 m0, [srcq+r6*1+ 0]
   3328    movu                 m1, [srcq+r6*1+ 8]
   3329    movu                 m2, [srcq+r6*1+16]
   3330    lea                  r5, [srcq+ssq*2]
   3331    HV_H_6TAP           m11, m0, m1, m2, 6, m5, m6, m7, m8
   3332    movu                 m0, [srcq+ssq*0+ 0]
   3333    movu                 m1, [srcq+ssq*0+ 8]
   3334    movu                 m2, [srcq+ssq*0+16]
   3335    mov                  r7, tmpq
   3336    HV_H_6TAP           m13, m0, m1, m2, 6, m5, m6, m7, m8
   3337    movu                 m0, [srcq+ssq*1+ 0]
   3338    movu                 m1, [srcq+ssq*1+ 8]
   3339    movu                 m2, [srcq+ssq*1+16]
   3340    HV_H_6TAP           m15, m0, m1, m2, 6, m5, m6, m7, m8
   3341    movu                 m0, [r5+ssq*0+ 0]
   3342    movu                 m1, [r5+ssq*0+ 8]
   3343    movu                 m2, [r5+ssq*0+16]
   3344    HV_H_6TAP            m5, m0, m1, m2, 6, m5, m6, m7, m8
   3345    punpcklwd            m8, m9, m11  ; 01
   3346    punpckhwd            m9, m11
   3347    punpcklwd           m10, m11, m13 ; 12
   3348    punpckhwd           m11, m13
   3349    punpcklwd           m12, m13, m15 ; 23
   3350    punpckhwd           m13, m15
   3351    punpcklwd           m14, m15, m5  ; 34
   3352    punpckhwd           m15, m5
   3353 .hv_w8_loop:
   3354    mova                 m3, [rsp+16*3]
   3355    mova                 m7, [rsp+16*4]
   3356    pmaddwd              m0, m8, m3   ; a0
   3357    mova                 m8, m12
   3358    pmaddwd              m2, m9, m3   ; a0'
   3359    mova                 m9, m13
   3360    pmaddwd              m1, m10, m3  ; b0
   3361    mova                m10, m14
   3362    pmaddwd              m3, m11      ; b0'
   3363    mova                m11, m15
   3364    REPX    {pmaddwd x, m7}, m12, m13, m14, m15
   3365    movu                 m6, [r5+ssq*1+ 0]
   3366    paddd                m0, m12
   3367    movu                 m7, [r5+ssq*1+ 8]
   3368    paddd                m2, m13
   3369    movu                m12, [r5+ssq*1+16]
   3370    paddd                m1, m14
   3371    lea                  r5, [r5+ssq*2]
   3372    paddd                m3, m15
   3373    HV_H_6TAP           m15, m6, m7, m12, 6
   3374    movu                 m6, [r5+ssq*0+ 0]
   3375    movu                 m7, [r5+ssq*0+ 8]
   3376    movu                m14, [r5+ssq*0+16]
   3377    punpcklwd           m12, m5, m15 ; 45
   3378    punpckhwd           m13, m5, m15
   3379    HV_H_6TAP            m5, m6, m7, m14, 6
   3380    mova                 m7, [rsp+16*5]
   3381    REPX      {paddd x, m4}, m0, m2, m1, m3
   3382    punpcklwd           m14, m15, m5  ; 56
   3383    punpckhwd           m15, m5
   3384    pmaddwd              m6, m12, m7  ; a2
   3385    paddd                m0, m6
   3386    pmaddwd              m6, m13, m7  ; a2'
   3387    paddd                m2, m6
   3388    pmaddwd              m6, m14, m7  ; b2
   3389    pmaddwd              m7, m15      ; b2'
   3390    paddd                m1, m6
   3391    paddd                m3, m7
   3392    REPX       {psrad x, 6}, m0, m2, m1, m3
   3393    packssdw             m0, m2
   3394    packssdw             m1, m3
   3395    mova          [r7+wq*0], m0
   3396    mova          [r7+wq*2], m1
   3397    lea                  r7, [r7+wq*4]
   3398    sub                  hd, 2
   3399    jg .hv_w8_loop
   3400    add                srcq, 16
   3401    add                tmpq, 16
   3402    movzx                hd, r8b
   3403    sub                 r8d, 1<<8
   3404 %endif
   3405    jg .hv_w8_loop0
   3406    RET
   3407 
   ; Instantiate the named prep entry points for each horizontal/vertical
   ; filter-type combination.  The first four redirect into the shared
   ; prep_8tap_16bpc implementation below; the final one (sharp) omits the
   ; redirect target because it falls through directly into the cglobal
   ; that follows.
   3408 PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
   3409 PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
   3410 PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
   3411 PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
   3412 PREP_8TAP_FN sharp,          SHARP,   SHARP
   3413 
   3414 cglobal prep_8tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my
; prep_8tap_16bpc(tmp, src, stride, w, h, mx, my): SSSE3 8-tap subpel
; "prep" filter for 16-bit pixels.  mx/my are merged with the per-variant
; filter-type bits in t0d/t1d to index subpel_filters.  Dispatch below:
; horizontal subpel component present -> .h, vertical only -> .v, and the
; no-filter case tail-jumps to the shared 6-tap copy loop (.prep).
; x86_32 has only xmm0-7, so m8-m15 alias 16-byte stack slots:
   3415 %if ARCH_X86_32
   3416    %define             mxb  r0b
   3417    %define             mxd  r0
   3418    %define             mxq  r0
   3419    %define             myb  r2b
   3420    %define             myd  r2
   3421    %define             myq  r2
   3422    %define              m8  [esp+16*0]
   3423    %define              m9  [esp+16*1]
   3424    %define             m10  [esp+16*2]
   3425    %define             m11  [esp+16*3]
   3426    %define             m12  [esp+16*4]
   3427    %define             m13  [esp+16*5]
   3428    %define             m14  [esp+16*6]
   3429    %define             m15  [esp+16*7]
   3430 %endif
   3431    imul                mxd, mxm, 0x010101
   3432    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
   3433    imul                myd, mym, 0x010101
   3434    add                 myd, t1d ; 8tap_v, my, 4tap_v
   3435    LEA                  t2, prep_ssse3
   3436    movifnidn            wd, wm
   3437    movifnidn          srcq, srcmp
   3438    test                mxd, 0xf00
   3439    jnz .h
   3440    movifnidn            hd, hm
   3441    test                myd, 0xf00
   3442    jz mangle(private_prefix %+ _prep_6tap_16bpc_ssse3).prep
   3443 .v:
; Vertical-only 8-tap path: processes columns 4 pixels wide, two output
; rows per inner-loop iteration.  h<=4 selects the 4-tap half of the
; filter entry (cmove below).
   3444    movzx               mxd, myb
   3445    shr                 myd, 16
   3446    cmp                  hd, 4
   3447    cmove               myd, mxd
   3448    movq                 m3, [base+subpel_filters+myq*8]
   3449    WIN64_SPILL_XMM      15
   3450    movddup              m7, [base+prep_8tap_1d_rnd] ; rounding bias added before >>4
   3451    movifnidn           ssq, r2mp
   3452    movifnidn          tmpq, r0mp
   3453    punpcklbw            m3, m3
   3454    psraw                m3, 8 ; sign-extend
   3455    test          dword r7m, 0x800 ; r7m presumably bitdepth_max; bit 0x800 set for 12bpc (confirm against caller)
   3456    jnz .v_12bpc
   3457    psllw                m3, 2 ; 10bpc: pre-scale coefficients by 4
   3458 .v_12bpc:
   3459 %if ARCH_X86_32
   3460    ALLOC_STACK       -16*7 ; spill area backing the m8-m14 aliases
   3461    pshufd               m0, m3, q0000
   3462    pshufd               m1, m3, q1111
   3463    pshufd               m2, m3, q2222
   3464    pshufd               m3, m3, q3333
   3465    mova                 m8, m0
   3466    mova                 m9, m1
   3467    mova                m10, m2
   3468    mova                m11, m3
   3469 %else
   3470    pshufd               m8, m3, q0000
   3471    pshufd               m9, m3, q1111
   3472    pshufd              m10, m3, q2222
   3473    pshufd              m11, m3, q3333
   3474 %endif
   3475    lea                  r6, [ssq*3]
   3476    sub                srcq, r6
   3477    mov                 r6d, wd ; r6 = w (tmp row pitch is 2*w bytes)
   3478    shl                  wd, 6
   3479    mov                  r5, srcq
   3480 %if ARCH_X86_64
   3481    mov                  r7, tmpq
   3482 %elif STACK_ALIGNMENT < 16
   3483    mov          [esp+4*29], tmpq
   3484 %endif
   3485    lea                  wd, [wq+hq-(1<<8)] ; pack columns-remaining above bit 8, h in the low byte
   3486 .v_loop0:
; Per-4-column setup: load 7 source rows and pairwise-interleave them
; into the word pairs 01..56 consumed by pmaddwd.
   3487    movq                 m1, [srcq+ssq*0]
   3488    movq                 m2, [srcq+ssq*1]
   3489    lea                srcq, [srcq+ssq*2]
   3490    movq                 m3, [srcq+ssq*0]
   3491    movq                 m4, [srcq+ssq*1]
   3492    lea                srcq, [srcq+ssq*2]
   3493    movq                 m5, [srcq+ssq*0]
   3494    movq                 m6, [srcq+ssq*1]
   3495    lea                srcq, [srcq+ssq*2]
   3496    movq                 m0, [srcq+ssq*0]
   3497    punpcklwd            m1, m2      ; 01
   3498    punpcklwd            m2, m3      ; 12
   3499    punpcklwd            m3, m4      ; 23
   3500    punpcklwd            m4, m5      ; 34
   3501    punpcklwd            m5, m6      ; 45
   3502    punpcklwd            m6, m0      ; 56
   3503 %if ARCH_X86_32
   3504    jmp .v_loop_start
   3505 .v_loop:
   3506    mova                 m1, m12
   3507    mova                 m2, m13
   3508    mova                 m3, m14
   3509 .v_loop_start:
; x86_32: m12-m14 (stack slots) carry the 23/34/45 pairs between
; iterations; a* = even output row, b* = odd output row.
   3510    pmaddwd              m1, m8      ; a0
   3511    pmaddwd              m2, m8      ; b0
   3512    mova                m12, m3
   3513    mova                m13, m4
   3514    pmaddwd              m3, m9      ; a1
   3515    pmaddwd              m4, m9      ; b1
   3516    paddd                m1, m3
   3517    paddd                m2, m4
   3518    mova                m14, m5
   3519    mova                 m4, m6
   3520    pmaddwd              m5, m10     ; a2
   3521    pmaddwd              m6, m10     ; b2
   3522    paddd                m1, m5
   3523    paddd                m2, m6
   3524    movq                 m6, [srcq+ssq*1]
   3525    lea                srcq, [srcq+ssq*2]
   3526    punpcklwd            m5, m0, m6  ; 67
   3527    movq                 m0, [srcq+ssq*0]
   3528    pmaddwd              m3, m11, m5 ; a3
   3529    punpcklwd            m6, m0      ; 78
   3530    paddd                m1, m7
   3531    paddd                m1, m3
   3532    pmaddwd              m3, m11, m6 ; b3
   3533    paddd                m2, m7
   3534    paddd                m2, m3
   3535    psrad                m1, 4
   3536    psrad                m2, 4
   3537    packssdw             m1, m2
   3538    movq        [tmpq+r6*0], m1
   3539    movhps      [tmpq+r6*2], m1
   3540    lea                tmpq, [tmpq+r6*4]
   3541    sub                  hd, 2
   3542    jg .v_loop
   3543 %if STACK_ALIGNMENT < 16
   3544    mov                tmpq, [esp+4*29]
   3545    add                  r5, 8
   3546    add                tmpq, 8
   3547    mov                srcq, r5
   3548    mov          [esp+4*29], tmpq
   3549 %else
   3550    mov                tmpq, tmpmp
   3551    add                  r5, 8
   3552    add                tmpq, 8
   3553    mov                srcq, r5
   3554    mov               tmpmp, tmpq
   3555 %endif
   3556 %else
   3557 .v_loop:
; x86_64: two output rows per iteration; each row is the sum of four
; pmaddwd partials (a0..a3 / b0..b3) over the interleaved row pairs.
   3558    pmaddwd             m12, m8, m1  ; a0
   3559    pmaddwd             m13, m8, m2  ; b0
   3560    mova                 m1, m3
   3561    mova                 m2, m4
   3562    pmaddwd              m3, m9      ; a1
   3563    pmaddwd              m4, m9      ; b1
   3564    paddd               m12, m3
   3565    paddd               m13, m4
   3566    mova                 m3, m5
   3567    mova                 m4, m6
   3568    pmaddwd              m5, m10     ; a2
   3569    pmaddwd              m6, m10     ; b2
   3570    paddd               m12, m5
   3571    paddd               m13, m6
   3572    movq                 m6, [srcq+ssq*1]
   3573    lea                srcq, [srcq+ssq*2]
   3574    punpcklwd            m5, m0, m6  ; 67
   3575    movq                 m0, [srcq+ssq*0]
   3576    pmaddwd             m14, m11, m5 ; a3
   3577    punpcklwd            m6, m0      ; 78
   3578    paddd               m12, m7
   3579    paddd               m12, m14
   3580    pmaddwd             m14, m11, m6 ; b3
   3581    paddd               m13, m7
   3582    paddd               m13, m14
   3583    psrad               m12, 4
   3584    psrad               m13, 4
   3585    packssdw            m12, m13
   3586    movq        [tmpq+r6*0], m12
   3587    movhps      [tmpq+r6*2], m12
   3588    lea                tmpq, [tmpq+r6*4]
   3589    sub                  hd, 2
   3590    jg .v_loop
   3591    add                  r5, 8
   3592    add                  r7, 8
   3593    mov                srcq, r5
   3594    mov                tmpq, r7
   3595 %endif
   3596    movzx                hd, wb ; restore h from the packed counter
   3597    sub                  wd, 1<<8
   3598    jg .v_loop0
   3599    RET
   3600 .h:
; Horizontal-only 8-tap path.
   3601    RESET_STACK_STATE
   3602    test                myd, 0xf00
   3603    jnz .hv
   3604    movifnidn           ssq, r2mp
   3605    movifnidn            hd, r4m
   3606    movddup              m5, [base+prep_8tap_1d_rnd]
   3607    cmp                  wd, 4
   3608    jne .h_w8
   3609 .h_w4:
; w==4: two rows per iteration; spel_h_shufA/B build the two sliding
; 4-tap windows so only the middle coefficient pairs (q1111/q2222) are
; needed.
   3610    movzx               mxd, mxb
   3611    movq                 m0, [base+subpel_filters+mxq*8]
   3612    mova                 m3, [base+spel_h_shufA]
   3613    mova                 m4, [base+spel_h_shufB]
   3614    movifnidn          tmpq, tmpmp
   3615    sub                srcq, 2
   3616    WIN64_SPILL_XMM       8
   3617    punpcklbw            m0, m0
   3618    psraw                m0, 8
   3619    test          dword r7m, 0x800
   3620    jnz .h_w4_12bpc
   3621    psllw                m0, 2 ; 10bpc: pre-scale coefficients by 4
   3622 .h_w4_12bpc:
   3623    pshufd               m6, m0, q1111
   3624    pshufd               m7, m0, q2222
   3625 .h_w4_loop:
   3626    movu                 m1, [srcq+ssq*0]
   3627    movu                 m2, [srcq+ssq*1]
   3628    lea                srcq, [srcq+ssq*2]
   3629    pshufb               m0, m1, m3 ; 0 1 1 2 2 3 3 4
   3630    pshufb               m1, m4     ; 2 3 3 4 4 5 5 6
   3631    pmaddwd              m0, m6
   3632    pmaddwd              m1, m7
   3633    paddd                m0, m5
   3634    paddd                m0, m1
   3635    pshufb               m1, m2, m3
   3636    pshufb               m2, m4
   3637    pmaddwd              m1, m6
   3638    pmaddwd              m2, m7
   3639    paddd                m1, m5
   3640    paddd                m1, m2
   3641    psrad                m0, 4
   3642    psrad                m1, 4
   3643    packssdw             m0, m1
   3644    mova             [tmpq], m0
   3645    add                tmpq, 16
   3646    sub                  hd, 2
   3647    jg .h_w4_loop
   3648    RET
   3649 .h_w8:
; w>=8: row-major loop; r6 walks the (negated, byte-scaled) width so the
; inner loop produces 8 output pixels (16 bytes of int16) per iteration.
   3650    WIN64_SPILL_XMM      11
   3651    shr                 mxd, 16
   3652    movq                 m2, [base+subpel_filters+mxq*8]
   3653    mova                 m4, [base+spel_h_shufA]
   3654    mova                 m6, [base+spel_h_shufB]
   3655    movifnidn          tmpq, r0mp
   3656    add                  wd, wd
   3657    punpcklbw            m2, m2
   3658    add                srcq, wq
   3659    psraw                m2, 8
   3660    add                tmpq, wq
   3661    neg                  wq
   3662    test          dword r7m, 0x800
   3663    jnz .h_w8_12bpc
   3664    psllw                m2, 2 ; 10bpc: pre-scale coefficients by 4
   3665 .h_w8_12bpc:
   3666    pshufd               m7, m2, q0000
   3667 %if ARCH_X86_32
   3668    ALLOC_STACK       -16*3 ; spill area backing the m8-m10 aliases
   3669    pshufd               m0, m2, q1111
   3670    pshufd               m1, m2, q2222
   3671    pshufd               m2, m2, q3333
   3672    mova                 m8, m0
   3673    mova                 m9, m1
   3674    mova                m10, m2
   3675 %else
   3676    pshufd               m8, m2, q1111
   3677    pshufd               m9, m2, q2222
   3678    pshufd              m10, m2, q3333
   3679 %endif
   3680 .h_w8_loop0:
   3681    mov                  r6, wq
   3682 .h_w8_loop:
   3683    movu                 m0, [srcq+r6- 6]
   3684    movu                 m1, [srcq+r6+ 2]
   3685    pshufb               m2, m0, m4  ; 0 1 1 2 2 3 3 4
   3686    pshufb               m0, m6      ; 2 3 3 4 4 5 5 6
   3687    pmaddwd              m2, m7      ; abcd0
   3688    pmaddwd              m0, m8      ; abcd1
   3689    pshufb               m3, m1, m4  ; 4 5 5 6 6 7 7 8
   3690    pshufb               m1, m6      ; 6 7 7 8 8 9 9 a
   3691    paddd                m2, m5
   3692    paddd                m0, m2
   3693    pmaddwd              m2, m9, m3  ; abcd2
   3694    pmaddwd              m3, m7      ; efgh0
   3695    paddd                m0, m2
   3696    pmaddwd              m2, m10, m1 ; abcd3
   3697    pmaddwd              m1, m8      ; efgh1
   3698    paddd                m0, m2
   3699    movu                 m2, [srcq+r6+10]
   3700    paddd                m3, m5
   3701    paddd                m1, m3
   3702    pshufb               m3, m2, m4  ; a b b c c d d e
   3703    pshufb               m2, m6      ; 8 9 9 a a b b c
   3704    pmaddwd              m3, m9      ; efgh2
   3705    pmaddwd              m2, m10     ; efgh3
   3706    paddd                m1, m3
   3707    paddd                m1, m2
   3708    psrad                m0, 4
   3709    psrad                m1, 4
   3710    packssdw             m0, m1
   3711    mova          [tmpq+r6], m0
   3712    add                  r6, 16
   3713    jl .h_w8_loop
   3714    add                srcq, ssq
   3715    sub                tmpq, wq
   3716    dec                  hd
   3717    jg .h_w8_loop0
   3718    RET
   3719 .hv:
; 2-D path: horizontal 8-tap first (intermediates >>6), then vertical
; 8-tap over the packed intermediates.  Vertical coefficients live in
; [tmp+16*1..4]; horizontal ones in m10-m13 (x86_64) or their stack
; aliases (x86_32).
   3720    RESET_STACK_STATE
   3721    movzx               t3d, mxb
   3722    shr                 mxd, 16
   3723    cmp                  wd, 4
   3724    cmove               mxd, t3d
   3725    movifnidn            hd, r4m
   3726    movq                 m2, [base+subpel_filters+mxq*8]
   3727    movzx               mxd, myb
   3728    shr                 myd, 16
   3729    cmp                  hd, 4
   3730    cmove               myd, mxd
   3731    movq                 m3, [base+subpel_filters+myq*8]
   3732 %if ARCH_X86_32
   3733    mov                 ssq, r2mp
   3734    mov                tmpq, r0mp
   3735    mova                 m0, [base+spel_h_shufA]
   3736    mova                 m1, [base+spel_h_shufB]
   3737    mova                 m4, [base+prep_8tap_2d_rnd]
   3738    ALLOC_STACK      -16*14
   3739    mova                 m8, m0
   3740    mova                 m9, m1
   3741    mova                m14, m4
   3742 %else
   3743 %if WIN64
   3744    ALLOC_STACK        16*6, 16
   3745 %endif
   3746    mova                 m8, [base+spel_h_shufA]
   3747    mova                 m9, [base+spel_h_shufB]
   3748 %endif
   3749    pxor                 m0, m0
   3750    punpcklbw            m0, m2
   3751    punpcklbw            m3, m3
   3752    psraw                m0, 4
   3753    psraw                m3, 8
   3754    test          dword r7m, 0x800
   3755    jz .hv_10bpc
   3756    psraw                m0, 2 ; 12bpc: reduce horizontal coefficient scale
   3757 .hv_10bpc:
   3758    lea                  r6, [ssq*3]
   3759    sub                srcq, 6
   3760    sub                srcq, r6
   3761    mov                 r6d, wd
   3762    shl                  wd, 6
   3763    mov                  r5, srcq
   3764 %if ARCH_X86_32
   3765    %define             tmp  esp+16*8
   3766 %if STACK_ALIGNMENT < 16
   3767    mov          [esp+4*61], tmpq
   3768 %endif
   3769    pshufd               m1, m0, q0000
   3770    pshufd               m2, m0, q1111
   3771    pshufd               m5, m0, q2222
   3772    pshufd               m0, m0, q3333
   3773    mova                m10, m1
   3774    mova                m11, m2
   3775    mova                m12, m5
   3776    mova                m13, m0
   3777 %else
   3778 %if WIN64
   3779    %define             tmp  rsp
   3780 %else
   3781    %define             tmp  rsp-88 ; red zone
   3782 %endif
   3783    mov                  r7, tmpq
   3784    pshufd              m10, m0, q0000
   3785    pshufd              m11, m0, q1111
   3786    pshufd              m12, m0, q2222
   3787    pshufd              m13, m0, q3333
   3788 %endif
   3789    lea                  wd, [wq+hq-(1<<8)] ; pack columns-remaining above bit 8, h in the low byte
   3790    pshufd               m0, m3, q0000
   3791    pshufd               m1, m3, q1111
   3792    pshufd               m2, m3, q2222
   3793    pshufd               m3, m3, q3333
   3794    mova         [tmp+16*1], m0
   3795    mova         [tmp+16*2], m1
   3796    mova         [tmp+16*3], m2
   3797    mova         [tmp+16*4], m3
   3798 .hv_loop0:
; Per-4-column setup: horizontally filter 7 input rows via
; PUT_8TAP_HV_H (defined earlier in the file), pack the >>6
; intermediates and interleave them into the 01..56 word pairs.
   3799 %if ARCH_X86_64
   3800    mova                m14, [prep_8tap_2d_rnd]
   3801 %endif
   3802    movu                 m4, [srcq+ssq*0+0]
   3803    movu                 m1, [srcq+ssq*0+8]
   3804    movu                 m5, [srcq+ssq*1+0]
   3805    movu                 m2, [srcq+ssq*1+8]
   3806    lea                srcq, [srcq+ssq*2]
   3807    movu                 m6, [srcq+ssq*0+0]
   3808    movu                 m3, [srcq+ssq*0+8]
   3809    PUT_8TAP_HV_H         4, 1, 0, 6
   3810    PUT_8TAP_HV_H         5, 2, 0, 6
   3811    PUT_8TAP_HV_H         6, 3, 0, 6
   3812    movu                 m7, [srcq+ssq*1+0]
   3813    movu                 m2, [srcq+ssq*1+8]
   3814    lea                srcq, [srcq+ssq*2]
   3815    movu                 m1, [srcq+ssq*0+0]
   3816    movu                 m3, [srcq+ssq*0+8]
   3817    PUT_8TAP_HV_H         7, 2, 0, 6
   3818    PUT_8TAP_HV_H         1, 3, 0, 6
   3819    movu                 m2, [srcq+ssq*1+0]
   3820    movu                 m3, [srcq+ssq*1+8]
   3821    lea                srcq, [srcq+ssq*2]
   3822    PUT_8TAP_HV_H         2, 3, 0, 6
   3823    packssdw             m4, m7      ; 0 3
   3824    packssdw             m5, m1      ; 1 4
   3825    movu                 m0, [srcq+ssq*0+0]
   3826    movu                 m1, [srcq+ssq*0+8]
   3827    PUT_8TAP_HV_H         0, 1, 3, 6
   3828    packssdw             m6, m2      ; 2 5
   3829    packssdw             m7, m0      ; 3 6
   3830    punpcklwd            m1, m4, m5  ; 01
   3831    punpckhwd            m4, m5      ; 34
   3832    punpcklwd            m2, m5, m6  ; 12
   3833    punpckhwd            m5, m6      ; 45
   3834    punpcklwd            m3, m6, m7  ; 23
   3835    punpckhwd            m6, m7      ; 56
   3836 %if ARCH_X86_32
   3837    jmp .hv_loop_start
   3838 .hv_loop:
   3839    mova                 m1, [tmp+16*5]
   3840    mova                 m2, m15
   3841 .hv_loop_start:
; x86_32: [tmp+16*5]/m15 carry the 23/34 pairs across iterations.
   3842    mova                 m7, [tmp+16*1]
   3843    pmaddwd              m1, m7      ; a0
   3844    pmaddwd              m2, m7      ; b0
   3845    mova                 m7, [tmp+16*2]
   3846    mova         [tmp+16*5], m3
   3847    pmaddwd              m3, m7      ; a1
   3848    mova                m15, m4
   3849    pmaddwd              m4, m7      ; b1
   3850    mova                 m7, [tmp+16*3]
   3851    paddd                m1, m14
   3852    paddd                m2, m14
   3853    paddd                m1, m3
   3854    paddd                m2, m4
   3855    mova                 m3, m5
   3856    pmaddwd              m5, m7      ; a2
   3857    mova                 m4, m6
   3858    pmaddwd              m6, m7      ; b2
   3859    paddd                m1, m5
   3860    paddd                m2, m6
   3861    movu                 m7, [srcq+ssq*1+0]
   3862    movu                 m5, [srcq+ssq*1+8]
   3863    lea                srcq, [srcq+ssq*2]
   3864    PUT_8TAP_HV_H         7, 5, 6, 6
   3865    packssdw             m0, m7      ; 6 7
   3866    mova         [tmp+16*0], m0
   3867    movu                 m0, [srcq+ssq*0+0]
   3868    movu                 m5, [srcq+ssq*0+8]
   3869    PUT_8TAP_HV_H         0, 5, 6, 6
   3870    mova                 m6, [tmp+16*0]
   3871    packssdw             m7, m0      ; 7 8
   3872    punpcklwd            m5, m6, m7  ; 67
   3873    punpckhwd            m6, m7      ; 78
   3874    pmaddwd              m7, m5, [tmp+16*4]
   3875    paddd                m1, m7      ; a3
   3876    pmaddwd              m7, m6, [tmp+16*4]
   3877    paddd                m2, m7      ; b3
   3878    psrad                m1, 6
   3879    psrad                m2, 6
   3880    packssdw             m1, m2
   3881    movq        [tmpq+r6*0], m1
   3882    movhps      [tmpq+r6*2], m1
   3883    lea                tmpq, [tmpq+r6*4]
   3884    sub                  hd, 2
   3885    jg .hv_loop
   3886 %if STACK_ALIGNMENT < 16
   3887    mov                tmpq, [esp+4*61]
   3888    add                  r5, 8
   3889    add                tmpq, 8
   3890    mov                srcq, r5
   3891    mov          [esp+4*61], tmpq
   3892 %else
   3893    mov                tmpq, tmpmp
   3894    add                  r5, 8
   3895    add                tmpq, 8
   3896    mov                srcq, r5
   3897    mov               tmpmp, tmpq
   3898 %endif
   3899 %else
   3900 .hv_loop:
; x86_64: two output rows per iteration; a*/b* partials accumulate the
; four vertical taps over the interleaved intermediates.
   3901    mova                m15, [tmp+16*1]
   3902    mova                 m7, [prep_8tap_2d_rnd]
   3903    pmaddwd             m14, m15, m1 ; a0
   3904    pmaddwd             m15, m2      ; b0
   3905    paddd               m14, m7
   3906    paddd               m15, m7
   3907    mova                 m7, [tmp+16*2]
   3908    mova                 m1, m3
   3909    pmaddwd              m3, m7      ; a1
   3910    mova                 m2, m4
   3911    pmaddwd              m4, m7      ; b1
   3912    mova                 m7, [tmp+16*3]
   3913    paddd               m14, m3
   3914    paddd               m15, m4
   3915    mova                 m3, m5
   3916    pmaddwd              m5, m7      ; a2
   3917    mova                 m4, m6
   3918    pmaddwd              m6, m7      ; b2
   3919    paddd               m14, m5
   3920    paddd               m15, m6
   3921    movu                 m7, [srcq+ssq*1+0]
   3922    movu                 m5, [srcq+ssq*1+8]
   3923    lea                srcq, [srcq+ssq*2]
   3924    PUT_8TAP_HV_H         7, 5, 6, 6, [prep_8tap_2d_rnd]
   3925    packssdw             m0, m7      ; 6 7
   3926    mova         [tmp+16*0], m0
   3927    movu                 m0, [srcq+ssq*0+0]
   3928    movu                 m5, [srcq+ssq*0+8]
   3929    PUT_8TAP_HV_H         0, 5, 6, 6, [prep_8tap_2d_rnd]
   3930    mova                 m6, [tmp+16*0]
   3931    packssdw             m7, m0      ; 7 8
   3932    punpcklwd            m5, m6, m7  ; 67
   3933    punpckhwd            m6, m7      ; 78
   3934    pmaddwd              m7, m5, [tmp+16*4]
   3935    paddd               m14, m7      ; a3
   3936    pmaddwd              m7, m6, [tmp+16*4]
   3937    paddd               m15, m7      ; b3
   3938    psrad               m14, 6
   3939    psrad               m15, 6
   3940    packssdw            m14, m15
   3941    movq        [tmpq+r6*0], m14
   3942    movhps      [tmpq+r6*2], m14
   3943    lea                tmpq, [tmpq+r6*4]
   3944    sub                  hd, 2
   3945    jg .hv_loop
   3946    add                  r5, 8
   3947    add                  r7, 8
   3948    mov                srcq, r5
   3949    mov                tmpq, r7
   3950 %endif
   3951    movzx                hd, wb ; restore h from the packed counter
   3952    sub                  wd, 1<<8
   3953    jg .hv_loop0
   3954    RET
   3955 %undef tmp
   3956 
; Emit "mov %1, %2" only when assembling the prep variant of a shared
; put/prep template (isprep is %assign'd to 1 by MC_8TAP_SCALED for
; prep); expands to nothing for put. Lets shared code perform
; prep-only register loads without an %if at every use site.
%macro movifprep 2
%if isprep
    mov %1, %2
%endif
%endmacro
   3962 
; SAVE_REG n: snapshot the current x86inc name bindings of register n
; (r<n>, r<n>q, r<n>d, and on x86_32 its stack-argument slot r<n>m)
; into *_save aliases, so they can be restored with LOAD_REG after
; REMAP_REG has rebound the r<n> names.
%macro SAVE_REG 1
%xdefine r%1_save  r%1
%xdefine r%1q_save r%1q
%xdefine r%1d_save r%1d
%if ARCH_X86_32
 ; On 32-bit the n-th argument lives on the stack; rebuild its
 ; address from stack_offset rather than aliasing r%1m, so the saved
 ; form stays valid independently of later m-slot redefinitions.
 %define r%1m_save [rstk+stack_offset+(%1+1)*4]
%endif
%endmacro
   3971 
; LOAD_REG n: inverse of SAVE_REG — restore r<n>, r<n>q, r<n>d (and
; r<n>m on x86_32) from the *_save aliases, then %undef the saves so
; a stale snapshot can never be restored twice by accident.
%macro LOAD_REG 1
%xdefine r%1  r%1_save
%xdefine r%1q r%1q_save
%xdefine r%1d r%1d_save
%if ARCH_X86_32
 %define r%1m r%1m_save
%endif
%undef r%1d_save
%undef r%1q_save
%undef r%1_save
%endmacro
   3983 
; REMAP_REG dst, src[, use_stack_slot]: rebind the r<dst> names to
; whatever r<src> currently expands to. %xdefine expands src at
; definition time, which is what makes chained remaps (see the
; MCT_8TAP_SCALED_REMAP_* macros) resolve to the intended register
; rather than to a later rebinding of r<src>.
; On x86_32 the optional third argument chooses how the r<dst>m
; argument slot is handled: 0 aliases r<src>m, anything else rebinds
; r<dst>m to dst's own original stack-argument address. The third
; argument is only evaluated under ARCH_X86_32; 64-bit callers pass
; two arguments.
%macro REMAP_REG 2-3
%xdefine r%1  r%2
%xdefine r%1q r%2q
%xdefine r%1d r%2d
%if ARCH_X86_32
 %if %3 == 0
  %xdefine r%1m r%2m
 %else
  %define r%1m [rstk+stack_offset+(%1+1)*4]
 %endif
%endif
%endmacro
   3996 
; Shift the whole register-name mapping down by one for the prep
; variant (no-op for put): after this, r<i> refers to what r<i-1>
; referred to before. This lets put/prep share code even though
; prep_8tap_scaled has one fewer leading argument than
; put_8tap_scaled. The highest register (r14 on x86_64, r5 on
; x86_32) is snapshotted with SAVE_REG first so
; MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT can restore it.
; Iterating from the top downwards is required: each REMAP_REG reads
; the still-unmodified binding of the next-lower register.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
%if isprep
 %if ARCH_X86_64
  SAVE_REG 14
  %assign %%i 14
  %rep 14
   %assign %%j %%i-1
   REMAP_REG %%i, %%j
   %assign %%i %%i-1
  %endrep
 %else
  SAVE_REG 5
  %assign %%i 5
  %rep 5
   %assign %%j %%i-1
   ; mode 0: r<i>m aliases r<i-1>m (shifted stack slots too)
   REMAP_REG %%i, %%j, 0
   %assign %%i %%i-1
  %endrep
 %endif
%endif
%endmacro
   4018 
; Undo MCT_8TAP_SCALED_REMAP_REGS_TO_PREV (no-op unless isprep):
; rebind r<i> to the current r<i+1>, walking upwards from r1, then
; restore the topmost register (r14 / r5) from its SAVE_REG snapshot.
; Ascending order is essential: because REMAP_REG uses %xdefine
; (expand-now), r1 picks up r2's shifted binding (= original r1)
; before r2 itself is rewritten, and so on up the chain.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
%if isprep
 %assign %%i 1
 %if ARCH_X86_64
  %rep 13
   %assign %%j %%i+1
   REMAP_REG %%i, %%j
   %assign %%i %%i+1
  %endrep
  LOAD_REG 14
 %else
  %rep 4
   %assign %%j %%i+1
   ; mode 1: rebind r<i>m to its own original stack-argument slot
   REMAP_REG %%i, %%j, 1
   %assign %%i %%i+1
  %endrep
  LOAD_REG 5
 %endif
%endif
%endmacro
   4039 
; Return from an 8tap_scaled function body while the prep register
; remapping may be active: restore the default mapping so the x86inc
; RET macro saves/restores the correct physical registers, emit RET,
; then — unless the caller passes 0 — re-apply the shifted mapping so
; assembly that follows this macro invocation (further labels of the
; same function) continues to see the remapped names.
%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
%if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
%endif
%endmacro
   4047 
%if ARCH_X86_32
; x86_32 only: horizontally filter two rows of 4 output pixels for
; the scaled 4-tap path and spill the packed result to [stk+%1].
; Two rows are read via srcq and two via r4 (r4 is set up by the
; caller as srcq plus a per-column byte offset — see the `add r4,
; srcq` at the .w4 setup); both pointers advance by two rows.
; Register contract (established by the caller, not reloaded here):
;   m12/m14 = pshufb gather masks, m13/m15 = 8-bit filter coeffs,
;   [esp+0x00] = dword rounding constant, [esp+0x10] = shift count.
%macro MC_4TAP_SCALED_H 1 ; dst_mem
    movu                 m7, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*1]
    movu                 m5, [r4  +ssq*0]
    movu                 m6, [r4  +ssq*1]
    lea                srcq, [srcq+ssq*2]
    lea                  r4, [r4  +ssq*2]
    REPX    {pshufb x, m12}, m7, m2
    REPX   {pmaddwd x, m13}, m7, m2
    REPX    {pshufb x, m14}, m5, m6
    REPX   {pmaddwd x, m15}, m5, m6
    ; merge the srcq half and the r4 half of each row, then round,
    ; shift and saturate down to 16-bit intermediates
    phaddd               m7, m5
    phaddd               m2, m6
    mova                 m5, [esp+0x00]
    movd                 m6, [esp+0x10]
    paddd                m7, m5
    paddd                m2, m5
    psrad                m7, m6
    psrad                m2, m6
    packssdw             m7, m2
    mova           [stk+%1], m7
%endmacro
%endif
   4072 
%if ARCH_X86_64
; Horizontally filter one row of 8 output pixels for the scaled
; 8-tap path. Each output pixel comes from its own source column
; offset (r4/r6/r7/r9/r10/r11/r13/rX, in 16-bit pixel units, hence
; the *2 scaling) with its own 8-tap coefficient vector spilled by
; the caller at [stk+0x10..0x80]. The phaddd tree reduces the eight
; products to one dword per pixel; hround/m12 are the caller-provided
; rounding constant and shift. Result: 8 packed words in m%1; srcq
; advances by one row. Temps m%2..m%8 are clobbered.
%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
    movu                m%1, [srcq+ r4*2]
    movu                m%2, [srcq+ r6*2]
    movu                m%3, [srcq+ r7*2]
    movu                m%4, [srcq+ r9*2]
    movu                m%5, [srcq+r10*2]
    movu                m%6, [srcq+r11*2]
    movu                m%7, [srcq+r13*2]
    movu                m%8, [srcq+ rX*2]
    add                srcq, ssq
    pmaddwd             m%1, [stk+0x10]
    pmaddwd             m%2, [stk+0x20]
    pmaddwd             m%3, [stk+0x30]
    pmaddwd             m%4, [stk+0x40]
    pmaddwd             m%5, [stk+0x50]
    pmaddwd             m%6, [stk+0x60]
    pmaddwd             m%7, [stk+0x70]
    pmaddwd             m%8, [stk+0x80]
    phaddd              m%1, m%2
    phaddd              m%3, m%4
    phaddd              m%5, m%6
    phaddd              m%7, m%8
    phaddd              m%1, m%3 ; pixels 0-3
    phaddd              m%5, m%7 ; pixels 4-7
    paddd               m%1, hround
    paddd               m%5, hround
    psrad               m%1, m12
    psrad               m%5, m12
    packssdw            m%1, m%5
%endmacro
%else
; x86_32 variant: same computation with only 8 xmm registers, so the
; eight column offsets are kept in stack slots [stk+0..28] and
; processed in two batches of four (coefficients at [stk+%1+0x00..
; 0x30] and [stk+%1+0xa0..0xd0]). The first four offsets are loaded
; from [stk+0..12] only when %3 == 1, letting back-to-back calls
; reuse the values already in r0/rX/r4/r5. Result lands in m0 and is
; optionally spilled to [stk+%2] (skipped when %2 == 0). Clobbers
; m1-m7 and r0/rX/r4/r5.
%macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets
 %if %3 == 1
    mov                  r0, [stk+ 0]
    mov                  rX, [stk+ 4]
    mov                  r4, [stk+ 8]
    mov                  r5, [stk+12]
 %endif
    movu                 m0, [srcq+r0*2]
    movu                 m1, [srcq+rX*2]
    movu                 m2, [srcq+r4*2]
    movu                 m3, [srcq+r5*2]
    ; fetch the second batch of offsets while the first batch filters
    mov                  r0, [stk+16]
    mov                  rX, [stk+20]
    mov                  r4, [stk+24]
    mov                  r5, [stk+28]
    pmaddwd              m0, [stk+%1+0x00]
    pmaddwd              m1, [stk+%1+0x10]
    pmaddwd              m2, [stk+%1+0x20]
    pmaddwd              m3, [stk+%1+0x30]
    phaddd               m0, m1
    phaddd               m2, m3
    movu                 m4, [srcq+r0*2]
    movu                 m5, [srcq+rX*2]
    movu                 m6, [srcq+r4*2]
    movu                 m7, [srcq+r5*2]
    add                srcq, ssq
    pmaddwd              m4, [stk+%1+0xa0]
    pmaddwd              m5, [stk+%1+0xb0]
    pmaddwd              m6, [stk+%1+0xc0]
    pmaddwd              m7, [stk+%1+0xd0]
    phaddd               m4, m5
    phaddd               m6, m7
    phaddd               m0, m2 ; pixels 0-3
    phaddd               m4, m6 ; pixels 4-7
    paddd                m0, hround
    paddd                m4, hround
    psrad                m0, m12
    psrad                m4, m12
    packssdw             m0, m4
 %if %2 != 0
    mova           [stk+%2], m0
 %endif
%endmacro
%endif
   4149 
   4150 %macro MC_8TAP_SCALED 1
   4151 %ifidn %1, put
   4152 %assign isput  1
   4153 %assign isprep 0
   4154 %if ARCH_X86_64
   4155  %if required_stack_alignment <= STACK_ALIGNMENT
   4156 cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
   4157  %else
   4158 cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
   4159  %endif
   4160 %else ; ARCH_X86_32
   4161  %if required_stack_alignment <= STACK_ALIGNMENT
   4162 cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
   4163  %else
   4164 cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
   4165  %endif
   4166 %endif
   4167 %xdefine base_reg r12
   4168 %else ; prep
   4169 %assign isput  0
   4170 %assign isprep 1
   4171 %if ARCH_X86_64
   4172  %if required_stack_alignment <= STACK_ALIGNMENT
   4173 cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
   4174   %xdefine tmp_stridem r14q
   4175  %else
   4176 cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
   4177   %define tmp_stridem qword [stk+0x138]
   4178  %endif
   4179  %xdefine base_reg r11
   4180 %else ; ARCH_X86_32
   4181  %if required_stack_alignment <= STACK_ALIGNMENT
   4182 cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
   4183  %else
   4184 cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
   4185  %endif
   4186  %define tmp_stridem dword [stk+0x138]
   4187 %endif
   4188 %endif
   4189 %if ARCH_X86_32
   4190    mov         [esp+0x1f0], t0d
   4191    mov         [esp+0x1f4], t1d
   4192 %if isput && required_stack_alignment > STACK_ALIGNMENT
   4193    mov                dstd, dstm
   4194    mov                 dsd, dsm
   4195    mov                srcd, srcm
   4196    mov                 ssd, ssm
   4197    mov                  hd, hm
   4198    mov                  r4, mxm
   4199  %define r0m  [esp+0x200]
   4200  %define dsm  [esp+0x204]
   4201  %define dsmp dsm
   4202  %define r1m  dsm
   4203  %define r2m  [esp+0x208]
   4204  %define ssm  [esp+0x20c]
   4205  %define r3m  ssm
   4206  %define hm   [esp+0x210]
   4207  %define mxm  [esp+0x214]
   4208    mov                 r0m, dstd
   4209    mov                 dsm, dsd
   4210    mov                 r2m, srcd
   4211    mov                 ssm, ssd
   4212    mov                  hm, hd
   4213    mov                  r0, mym
   4214    mov                  r1, dxm
   4215    mov                  r2, dym
   4216  %define mym    [esp+0x218]
   4217  %define dxm    [esp+0x21c]
   4218  %define dym    [esp+0x220]
   4219    mov                 mxm, r4
   4220    mov                 mym, r0
   4221    mov                 dxm, r1
   4222    mov                 dym, r2
   4223    tzcnt                wd, wm
   4224 %endif
   4225 %if isput
   4226    mov                  r3, pxmaxm
   4227  %define pxmaxm r3
   4228 %else
   4229    mov                  r2, pxmaxm
   4230 %endif
   4231 %if isprep && required_stack_alignment > STACK_ALIGNMENT
   4232  %xdefine base_reg r5
   4233 %else
   4234  %xdefine base_reg r6
   4235 %endif
   4236 %endif
   4237    LEA            base_reg, %1_8tap_scaled_16bpc_ssse3
   4238 %xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3
   4239 %if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
   4240    tzcnt                wd, wm
   4241 %endif
   4242 %if ARCH_X86_64
   4243 %if isput
   4244    mov                 r7d, pxmaxm
   4245 %endif
   4246 %else
   4247 %define m8  m0
   4248 %define m9  m1
   4249 %define m14 m4
   4250 %define m15 m3
   4251 %endif
   4252    movd                 m8, dxm
   4253    movd                m14, mxm
   4254 %if isput
   4255    movd                m15, pxmaxm
   4256 %endif
   4257    pshufd               m8, m8, q0000
   4258    pshufd              m14, m14, q0000
   4259 %if isput
   4260    pshuflw             m15, m15, q0000
   4261    punpcklqdq          m15, m15
   4262 %endif
   4263 %if isprep
   4264 %if UNIX64
   4265    mov                 r5d, t0d
   4266  DECLARE_REG_TMP 5, 7
   4267 %endif
   4268 %if ARCH_X86_64
   4269    mov                 r6d, pxmaxm
   4270 %endif
   4271 %endif
   4272 %if ARCH_X86_64
   4273    mov                 dyd, dym
   4274 %endif
   4275 %if isput
   4276 %if WIN64
   4277    mov                 r8d, hm
   4278  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
   4279  %define hm r5m
   4280  %define dxm r8m
   4281 %elif ARCH_X86_64
   4282  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
   4283  %define hm r6m
   4284 %else
   4285 %endif
   4286 %if ARCH_X86_64
   4287  %if required_stack_alignment > STACK_ALIGNMENT
   4288   %define dsm [rsp+0x138]
   4289   %define rX r1
   4290   %define rXd r1d
   4291  %else
   4292   %define dsm dsq
   4293   %define rX r14
   4294   %define rXd r14d
   4295  %endif
   4296 %else
   4297  %define rX r1
   4298 %endif
   4299 %else ; prep
   4300 %if WIN64
   4301    mov                 r7d, hm
   4302  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
   4303  %define hm r4m
   4304  %define dxm r7m
   4305 %elif ARCH_X86_64
   4306  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
   4307  %xdefine hm r7m
   4308 %endif
   4309 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
   4310 %if ARCH_X86_64
   4311  %define rX r14
   4312  %define rXd r14d
   4313 %else
   4314  %define rX r3
   4315 %endif
   4316 %endif
   4317 %if ARCH_X86_64
   4318    shr                 r7d, 11
   4319    mova                m10, [base+pd_0x3ff]
   4320    movddup             m11, [base+s_8tap_h_rnd+r7*8]
   4321    movd                m12, [base+s_8tap_h_sh+r7*4]
   4322 %if isput
   4323    movddup             m13, [base+put_s_8tap_v_rnd+r7*8]
   4324    movd                 m7, [base+put_s_8tap_v_sh+r7*4]
   4325  %define pxmaxm [rsp]
   4326    mova             pxmaxm, m15
   4327    punpcklqdq          m12, m7
   4328 %endif
   4329    lea                ss3q, [ssq*3]
   4330    movzx               r7d, t1b
   4331    shr                 t1d, 16
   4332    cmp                  hd, 6
   4333    cmovs               t1d, r7d
   4334    sub                srcq, ss3q
   4335 %else
   4336 %define m10    [base+pd_0x3ff]
   4337 %define m11    [esp+0x00]
   4338 %define m12    [esp+0x10]
   4339    shr                  r3, 11
   4340    movddup              m1, [base+s_8tap_h_rnd+r3*8]
   4341    movd                 m2, [base+s_8tap_h_sh+r3*4]
   4342 %if isput
   4343  %define m13    [esp+0x20]
   4344  %define pxmaxm [esp+0x30]
   4345  %define stk esp+0x40
   4346    movddup              m5, [base+put_s_8tap_v_rnd+r3*8]
   4347    movd                 m6, [base+put_s_8tap_v_sh+r3*4]
   4348    mova             pxmaxm, m15
   4349    punpcklqdq           m2, m6
   4350    mova                m13, m5
   4351 %else
   4352  %define m13 [base+pd_m524256]
   4353 %endif
   4354    mov                 ssd, ssm
   4355    mova                m11, m1
   4356    mova                m12, m2
   4357 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
   4358    mov                  r1, [esp+0x1f4]
   4359    lea                  r0, [ssd*3]
   4360    movzx                r2, r1b
   4361    shr                  r1, 16
   4362    cmp            dword hm, 6
   4363    cmovs                r1, r2
   4364    mov         [esp+0x1f4], r1
   4365 %if isprep
   4366    mov                  r1, r1m
   4367 %endif
   4368    mov                  r2, r2m
   4369    sub                srcq, r0
   4370 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
   4371 %define ss3q r0
   4372 %define myd r4
   4373 %define dyd dword dym
   4374 %define hd  dword hm
   4375 %endif
   4376    cmp                 dyd, 1024
   4377    je .dy1
   4378    cmp                 dyd, 2048
   4379    je .dy2
   4380    movzx                wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
   4381    add                  wq, base_reg
   4382    jmp                  wq
   4383 %if isput
   4384 .w2:
   4385 %if ARCH_X86_64
   4386    mov                 myd, mym
   4387    movzx               t0d, t0b
   4388    sub                srcq, 2
   4389    movd                m15, t0d
   4390 %else
   4391    movzx                r4, byte [esp+0x1f0]
   4392    sub                srcq, 2
   4393    movd                m15, r4
   4394 %endif
   4395    pxor                 m9, m9
   4396    punpckldq            m9, m8
   4397    paddd               m14, m9 ; mx+dx*[0-1]
   4398 %if ARCH_X86_64
   4399    mova                 m9, [base+pd_0x4000]
   4400 %endif
   4401    pshufd              m15, m15, q0000
   4402    pand                 m8, m14, m10
   4403    psrld                m8, 6
   4404    paddd               m15, m8
   4405    movd                r4d, m15
   4406    pshufd              m15, m15, q0321
   4407 %if ARCH_X86_64
   4408    movd                r6d, m15
   4409 %else
   4410    movd                r3d, m15
   4411 %endif
   4412    mova                 m5, [base+bdct_lb_q]
   4413    mova                 m6, [base+spel_s_shuf2]
   4414    movd                m15, [base+subpel_filters+r4*8+2]
   4415 %if ARCH_X86_64
   4416    movd                 m7, [base+subpel_filters+r6*8+2]
   4417 %else
   4418    movd                 m7, [base+subpel_filters+r3*8+2]
   4419 %endif
   4420    pxor                 m2, m2
   4421    pcmpeqd              m8, m2
   4422    psrld               m14, 10
   4423    paddd               m14, m14
   4424 %if ARCH_X86_32
   4425    mov                  r3, r3m
   4426    pshufb              m14, m5
   4427    paddb               m14, m6
   4428    mova              [stk], m14
   4429    SWAP                 m5, m0
   4430    SWAP                 m6, m3
   4431  %define m15 m6
   4432 %endif
   4433    movu                 m0, [srcq+ssq*0]
   4434    movu                 m1, [srcq+ssq*1]
   4435    movu                 m2, [srcq+ssq*2]
   4436    movu                 m3, [srcq+ss3q ]
   4437    lea                srcq, [srcq+ssq*4]
   4438    punpckldq           m15, m7
   4439 %if ARCH_X86_64
   4440    pshufb              m14, m5
   4441    paddb               m14, m6
   4442    pand                 m9, m8
   4443    pandn                m8, m15
   4444    SWAP                m15, m8
   4445    por                 m15, m9
   4446    movu                 m4, [srcq+ssq*0]
   4447    movu                 m5, [srcq+ssq*1]
   4448    movu                 m6, [srcq+ssq*2]
   4449    movu                 m7, [srcq+ss3q ]
   4450    lea                srcq, [srcq+ssq*4]
   4451 %else
   4452    pand                 m7, m5, [base+pd_0x4000]
   4453    pandn                m5, m15
   4454    por                  m5, m7
   4455  %define m15 m5
   4456 %endif
   4457    punpcklbw           m15, m15
   4458    psraw               m15, 8
   4459    REPX    {pshufb x, m14}, m0, m1, m2, m3
   4460    REPX   {pmaddwd x, m15}, m0, m1, m2, m3
   4461 %if ARCH_X86_64
   4462    REPX    {pshufb x, m14}, m4, m5, m6, m7
   4463    REPX   {pmaddwd x, m15}, m4, m5, m6, m7
   4464    phaddd               m0, m1
   4465    phaddd               m2, m3
   4466    phaddd               m4, m5
   4467    phaddd               m6, m7
   4468    REPX     {paddd x, m11}, m0, m2, m4, m6
   4469    REPX     {psrad x, m12}, m0, m2, m4, m6
   4470    packssdw             m0, m2 ; 0 1 2 3
   4471    packssdw             m4, m6 ; 4 5 6 7
   4472    SWAP                 m1, m4
   4473 %else
   4474    mova         [stk+0x10], m15
   4475    phaddd               m0, m1
   4476    phaddd               m2, m3
   4477    movu                 m1, [srcq+ssq*0]
   4478    movu                 m7, [srcq+ssq*1]
   4479    movu                 m6, [srcq+ssq*2]
   4480    movu                 m3, [srcq+ss3q ]
   4481    lea                srcq, [srcq+ssq*4]
   4482    REPX    {pshufb x, m14}, m1, m7, m6, m3
   4483    REPX   {pmaddwd x, m15}, m1, m7, m6, m3
   4484    phaddd               m1, m7
   4485    phaddd               m6, m3
   4486    REPX     {paddd x, m11}, m0, m2, m1, m6
   4487    REPX     {psrad x, m12}, m0, m2, m1, m6
   4488    packssdw             m0, m2
   4489    packssdw             m1, m6
   4490  %define m14 [stk+0x00]
   4491  %define m15 [stk+0x10]
   4492 %endif
   4493    palignr              m2, m1, m0, 4 ; 1 2 3 4
   4494    punpcklwd            m3, m0, m2    ; 01 12
   4495    punpckhwd            m0, m2        ; 23 34
   4496    pshufd               m5, m1, q0321 ; 5 6 7 _
   4497    punpcklwd            m2, m1, m5    ; 45 56
   4498    punpckhwd            m4, m1, m5    ; 67 __
   4499 %if ARCH_X86_32
   4500    mov                 myd, mym
   4501    mov                  r0, r0m
   4502    mova         [stk+0x20], m3
   4503    mova         [stk+0x30], m0
   4504    mova         [stk+0x40], m2
   4505    mova         [stk+0x50], m4
   4506 %endif
   4507 .w2_loop:
   4508    and                 myd, 0x3ff
   4509 %if ARCH_X86_64
   4510    mov                 r6d, 64 << 24
   4511    mov                 r4d, myd
   4512    shr                 r4d, 6
   4513    lea                 r4d, [t1+r4]
   4514    cmovnz              r6q, [base+subpel_filters+r4*8]
   4515    movq                m10, r6q
   4516    punpcklbw           m10, m10
   4517    psraw               m10, 8
   4518    pshufd               m7, m10, q0000
   4519    pshufd               m8, m10, q1111
   4520    pmaddwd              m5, m3, m7
   4521    pmaddwd              m6, m0, m8
   4522    pshufd               m9, m10, q2222
   4523    pshufd              m10, m10, q3333
   4524    pmaddwd              m7, m2, m9
   4525    pmaddwd              m8, m4, m10
   4526    paddd                m5, m6
   4527    paddd                m7, m8
   4528 %else
   4529    mov                  r1, [esp+0x1f4]
   4530    xor                  r3, r3
   4531    mov                  r5, myd
   4532    shr                  r5, 6
   4533    lea                  r1, [r1+r5]
   4534    mov                  r5, 64 << 24
   4535    cmovnz               r3, [base+subpel_filters+r1*8+4]
   4536    cmovnz               r5, [base+subpel_filters+r1*8+0]
   4537    movd                 m6, r3
   4538    movd                 m7, r5
   4539    punpckldq            m7, m6
   4540    punpcklbw            m7, m7
   4541    psraw                m7, 8
   4542    pshufd               m5, m7, q0000
   4543    pshufd               m6, m7, q1111
   4544    pmaddwd              m3, m5
   4545    pmaddwd              m0, m6
   4546    pshufd               m5, m7, q2222
   4547    pshufd               m7, m7, q3333
   4548    pmaddwd              m2, m5
   4549    pmaddwd              m4, m7
   4550    paddd                m3, m0
   4551    paddd                m2, m4
   4552    SWAP                 m5, m3
   4553    SWAP                 m7, m2
   4554  %define m8 m3
   4555 %endif
   4556    paddd                m5, m13
   4557    pshufd               m6, m12, q1032
   4558    pxor                 m8, m8
   4559    paddd                m5, m7
   4560    psrad                m5, m6
   4561    packssdw             m5, m5
   4562    pmaxsw               m5, m8
   4563    pminsw               m5, pxmaxm
   4564    movd             [dstq], m5
   4565    add                dstq, dsmp
   4566    dec                  hd
   4567    jz .ret
   4568 %if ARCH_X86_64
   4569    add                 myd, dyd
   4570 %else
   4571    add                 myd, dym
   4572 %endif
   4573    test                myd, ~0x3ff
   4574 %if ARCH_X86_32
   4575    SWAP                 m3, m5
   4576    SWAP                 m2, m7
   4577    mova                 m3, [stk+0x20]
   4578    mova                 m0, [stk+0x30]
   4579    mova                 m2, [stk+0x40]
   4580    mova                 m4, [stk+0x50]
   4581 %endif
   4582    jz .w2_loop
   4583 %if ARCH_X86_32
   4584    mov                  r3, r3m
   4585 %endif
   4586    movu                 m5, [srcq]
   4587    test                myd, 0x400
   4588    jz .w2_skip_line
   4589    add                srcq, ssq
   4590    shufps               m3, m0, q1032      ; 01 12
   4591    shufps               m0, m2, q1032      ; 23 34
   4592    shufps               m2, m4, q1032      ; 45 56
   4593    pshufb               m5, m14
   4594    pmaddwd              m5, m15
   4595    phaddd               m5, m5
   4596    paddd                m5, m11
   4597    psrad                m5, m12
   4598    packssdw             m5, m5
   4599    palignr              m4, m5, m1, 12
   4600    punpcklqdq           m1, m4, m4         ; 6 7 6 7
   4601    punpcklwd            m4, m1, m5         ; 67 __
   4602 %if ARCH_X86_32
   4603    mova         [stk+0x20], m3
   4604    mova         [stk+0x30], m0
   4605    mova         [stk+0x40], m2
   4606    mova         [stk+0x50], m4
   4607 %endif
   4608    jmp .w2_loop
   4609 .w2_skip_line:
   4610    movu                 m6, [srcq+ssq*1]
   4611    lea                srcq, [srcq+ssq*2]
   4612    mova                 m3, m0             ; 01 12
   4613    mova                 m0, m2             ; 23 34
   4614    pshufb               m5, m14
   4615    pshufb               m6, m14
   4616    pmaddwd              m5, m15
   4617    pmaddwd              m6, m15
   4618    phaddd               m5, m6
   4619    paddd                m5, m11
   4620    psrad                m5, m12
   4621    packssdw             m5, m5             ; 6 7 6 7
   4622    punpckhqdq           m1, m5             ; 4 5 6 7
   4623    pshufd               m5, m1, q0321      ; 5 6 7 _
   4624    punpcklwd            m2, m1, m5         ; 45 56
   4625    punpckhwd            m4, m1, m5         ; 67 __
   4626 %if ARCH_X86_32
   4627    mova         [stk+0x20], m3
   4628    mova         [stk+0x30], m0
   4629    mova         [stk+0x40], m2
   4630    mova         [stk+0x50], m4
   4631 %endif
   4632    jmp .w2_loop
   4633 %endif
   4634 INIT_XMM ssse3
   4635 .w4:
   4636 %if ARCH_X86_64
   4637    mov                 myd, mym
   4638    mova         [rsp+0x10], m11
   4639    mova         [rsp+0x20], m12
   4640 %if isput
   4641    mova         [rsp+0x30], m13
   4642 %endif
   4643    movzx               t0d, t0b
   4644    sub                srcq, 2
   4645    movd                m15, t0d
   4646 %else
   4647 %define m8  m0
   4648 %xdefine m14 m4
   4649 %define m15 m3
   4650    movzx                r4, byte [esp+0x1f0]
   4651    sub                srcq, 2
   4652    movd                m15, r4
   4653 %endif
   4654    pmaddwd              m8, [base+rescale_mul]
   4655 %if ARCH_X86_64
   4656    mova                 m9, [base+pd_0x4000]
   4657 %else
   4658 %define m9 [base+pd_0x4000]
   4659 %endif
   4660    pshufd              m15, m15, q0000
   4661    paddd               m14, m8 ; mx+dx*[0-3]
   4662    pand                 m0, m14, m10
   4663    psrld                m0, 6
   4664    paddd               m15, m0
   4665    pshufd               m7, m15, q1032
   4666 %if ARCH_X86_64
   4667    movd                r4d, m15
   4668    movd               r11d, m7
   4669    pshufd              m15, m15, q0321
   4670    pshufd               m7, m7, q0321
   4671    movd                r6d, m15
   4672    movd               r13d, m7
   4673    mova                m10, [base+bdct_lb_q+ 0]
   4674    mova                m11, [base+bdct_lb_q+16]
   4675    movd                m13, [base+subpel_filters+ r4*8+2]
   4676    movd                 m2, [base+subpel_filters+ r6*8+2]
   4677    movd                m15, [base+subpel_filters+r11*8+2]
   4678    movd                 m4, [base+subpel_filters+r13*8+2]
   4679 %else
   4680    movd                 r0, m15
   4681    movd                 r4, m7
   4682    pshufd              m15, m15, q0321
   4683    pshufd               m7, m7, q0321
   4684    movd                 rX, m15
   4685    movd                 r5, m7
   4686    mova                 m5, [base+bdct_lb_q+ 0]
   4687    mova                 m6, [base+bdct_lb_q+16]
   4688    movd                 m1, [base+subpel_filters+r0*8+2]
   4689    movd                 m2, [base+subpel_filters+rX*8+2]
   4690    movd                 m3, [base+subpel_filters+r4*8+2]
   4691    movd                 m7, [base+subpel_filters+r5*8+2]
   4692    movifprep            r3, r3m
   4693    SWAP                 m4, m7
   4694 %define m10 m5
   4695 %define m11 m6
   4696 %define m12 m1
   4697 %define m13 m1
   4698 %endif
   4699    psrld               m14, 10
   4700    paddd               m14, m14
   4701    punpckldq           m13, m2
   4702    punpckldq           m15, m4
   4703    punpcklqdq          m13, m15
   4704    pxor                 m2, m2
   4705    pcmpeqd              m0, m2
   4706 %if ARCH_X86_64
   4707    pand                 m9, m0
   4708 %else
   4709    pand                 m2, m9, m0
   4710 %define m9 m2
   4711    SWAP                 m7, m4
   4712 %endif
   4713    pandn                m0, m13
   4714 %if ARCH_X86_64
   4715    SWAP                m13, m0
   4716 %else
   4717 %define m13 m0
   4718 %endif
   4719    por                 m13, m9
   4720    punpckhbw           m15, m13, m13
   4721    punpcklbw           m13, m13
   4722    psraw               m15, 8
   4723    psraw               m13, 8
   4724    pshufb              m12, m14, m10
   4725    pshufb              m14, m11
   4726    mova                m10, [base+spel_s_shuf2]
   4727    movd                r4d, m14
   4728    shr                 r4d, 24
   4729 %if ARCH_X86_32
   4730    mova         [stk+0x20], m13
   4731    mova         [stk+0x30], m15
   4732    pxor                 m2, m2
   4733 %endif
   4734    pshufb               m7, m14, m2
   4735    psubb               m14, m7
   4736    paddb               m12, m10
   4737    paddb               m14, m10
   4738 %if ARCH_X86_64
   4739    lea                  r6, [r4+ssq*1]
   4740    lea                 r11, [r4+ssq*2]
   4741    lea                 r13, [r4+ss3q ]
   4742    movu                 m7, [srcq+ssq*0]
   4743    movu                 m9, [srcq+ssq*1]
   4744    movu                 m8, [srcq+ssq*2]
   4745    movu                m10, [srcq+ss3q ]
   4746    movu                 m1, [srcq+r4   ]
   4747    movu                 m3, [srcq+r6   ]
   4748    movu                 m2, [srcq+r11  ]
   4749    movu                 m4, [srcq+r13  ]
   4750    lea                srcq, [srcq+ssq*4]
   4751    REPX    {pshufb x, m12}, m7, m9, m8, m10
   4752    REPX   {pmaddwd x, m13}, m7, m9, m8, m10
   ; NOTE(review): interior of the scaled-MC macro (put/prep variant chosen by
   ; the macro parameter / isput elsewhere in this file).  This section is the
   ; w4 horizontal prologue: it finishes filtering the first 8 source rows
   ; (pshufb/pmaddwd/phaddd pairs), rounds ([rsp+0x10]) and shifts ([rsp+0x20])
   ; them, packs to words, and pre-interleaves adjacent row pairs
   ; 01/12/23/34/45/56/67 for the vertical filter.
   4753    REPX    {pshufb x, m14}, m1, m2, m3, m4
   4754    REPX   {pmaddwd x, m15}, m1, m2, m3, m4
   4755    mova                 m5, [rsp+0x10]
   4756    movd                xm6, [rsp+0x20]
   4757    phaddd               m7, m1
   4758    phaddd               m9, m3
   4759    phaddd               m8, m2
   4760    phaddd              m10, m4
   ; load the next four source rows (rows 4-7) while rows 0-3 are rounded
   4761    movu                 m1, [srcq+ssq*0]
   4762    movu                 m2, [srcq+ssq*1]
   4763    movu                 m3, [srcq+ssq*2]
   4764    movu                 m4, [srcq+ss3q ]
   4765    REPX      {paddd x, m5}, m7, m9, m8, m10
   4766    REPX     {psrad x, xm6}, m7, m9, m8, m10
   4767    packssdw             m7, m9  ; 0 1
   4768    packssdw             m8, m10 ; 2 3
   4769    movu                 m0, [srcq+r4   ]
   4770    movu                 m9, [srcq+r6   ]
   4771    movu                m10, [srcq+r11  ]
   4772    movu                m11, [srcq+r13  ]
   4773    lea                srcq, [srcq+ssq*4]
   4774    REPX    {pshufb x, m12}, m1, m2, m3, m4
   4775    REPX   {pmaddwd x, m13}, m1, m2, m3, m4
   4776    REPX    {pshufb x, m14}, m0, m9, m10, m11
   4777    REPX   {pmaddwd x, m15}, m0, m9, m10, m11
   4778    phaddd               m1, m0
   4779    phaddd               m2, m9
   4780    phaddd               m3, m10
   4781    phaddd               m4, m11
   4782    REPX      {paddd x, m5}, m1, m2, m3, m4
   4783    REPX     {psrad x, xm6}, m1, m2, m3, m4
   4784    packssdw             m1, m2 ; 4 5
   4785    packssdw             m3, m4 ; 6 7
   4786    SWAP                 m9, m1
   ; build shifted-by-one-row copies so punpck{l,h}wd below can form the
   ; interleaved row-pair operands consumed by the vertical 8-tap filter
   4787    shufps               m4, m7, m8, q1032  ; 1 2
   4788    shufps               m5, m8, m9, q1032  ; 3 4
   4789    shufps               m6, m9, m3, q1032  ; 5 6
   4790    pshufd              m10, m3, q1032      ; 7 _
   4791    punpcklwd            m0, m7, m4 ; 01
   4792    punpckhwd            m7, m4     ; 12
   4793    punpcklwd            m1, m8, m5 ; 23
   4794    punpckhwd            m8, m5     ; 34
   4795    punpcklwd            m2, m9, m6 ; 45
   4796    punpckhwd            m9, m6     ; 56
   4797    punpcklwd            m3, m10    ; 67
   ; 64-bit: odd pairs (12/34/56) spill to the stack, 01/23/45/67 stay in m0-m3
   4798    mova         [rsp+0x40], m7
   4799    mova         [rsp+0x50], m8
   4800    mova         [rsp+0x60], m9
   4801 %else
   ; x86-32 path: too few registers, so the horizontal pass goes through the
   ; MC_4TAP_SCALED_H helper and every row pair is kept in the stk scratch area
   4802    mova         [stk+0x00], m12
   4803    mova         [stk+0x10], m14
   4804    add                  r4, srcq
   4805    MC_4TAP_SCALED_H   0x40 ; 0 1
   4806    MC_4TAP_SCALED_H   0x50 ; 2 3
   4807    MC_4TAP_SCALED_H   0x60 ; 4 5
   4808    MC_4TAP_SCALED_H   0x70 ; 6 7
   4809    mova                 m4, [stk+0x40]
   4810    mova                 m5, [stk+0x50]
   4811    mova                 m6, [stk+0x60]
   4812    mova                 m7, [stk+0x70]
   4813    mov          [stk+0xc0], r4 ; second source pointer, reloaded by .w4_next_line
   4814    shufps               m1, m4, m5, q1032 ; 1 2
   4815    shufps               m2, m5, m6, q1032 ; 3 4
   4816    shufps               m3, m6, m7, q1032 ; 5 6
   4817    pshufd               m0, m7, q1032     ; 7 _
   4818    mova         [stk+0xb0], m0
   4819    punpcklwd            m0, m4, m1         ; 01
   4820    punpckhwd            m4, m1             ; 12
   4821    punpcklwd            m1, m5, m2         ; 23
   4822    punpckhwd            m5, m2             ; 34
   4823    punpcklwd            m2, m6, m3         ; 45
   4824    punpckhwd            m6, m3             ; 56
   4825    punpcklwd            m3, m7, [stk+0xb0] ; 67
   4826    mov                 myd, mym
   4827    mov                  r0, r0m
   4828    mova         [stk+0x40], m0 ; 01
   4829    mova         [stk+0x50], m1 ; 23
   4830    mova         [stk+0x60], m2 ; 45
   4831    mova         [stk+0x70], m3 ; 67
   4832    mova         [stk+0x80], m4 ; 12
   4833    mova         [stk+0x90], m5 ; 34
   4834    mova         [stk+0xa0], m6 ; 56
   ; alias the spilled horizontal-filter constants back to their register names
   ; so the shared loop body below reads identically on both arches
   4835 %define m12 [stk+0x00]
   4836 %define m14 [stk+0x10]
   4837 %define m13 [stk+0x20]
   4838 %define m15 [stk+0x30]
   4839 %define hrnd_mem [esp+0x00]
   4840 %define hsh_mem  [esp+0x10]
   4841 %if isput
   4842  %define vrnd_mem [esp+0x20]
   4843 %else
   4844  %define vrnd_mem [base+pd_m524256]
   4845 %endif
   4846 %endif
   ; .w4_loop: produce one 4-pixel output row per iteration.
   ; myd = vertical scaled position; low 10 bits are the subpel fraction.
   ; The filter index is (myd & 0x3ff) >> 6; when the masked value is zero the
   ; default 64<<24 is kept (cmovnz not taken), which after the byte-unpack /
   ; psraw 8 below yields a single 64-weight tap, i.e. an effectively neutral
   ; vertical filter.
   4847 .w4_loop:
   4848    and                 myd, 0x3ff
   4849 %if ARCH_X86_64
   4850    mov                r11d, 64 << 24
   4851    mov                r13d, myd
   4852    shr                r13d, 6
   4853    lea                r13d, [t1+r13]
   4854    cmovnz             r11q, [base+subpel_filters+r13*8]
   ; expand the 8 signed byte coefficients to words, broadcast tap pairs
   4855    movq                 m9, r11q
   4856    punpcklbw            m9, m9
   4857    psraw                m9, 8
   4858    pshufd               m7, m9, q0000
   4859    pshufd               m8, m9, q1111
   4860    pmaddwd              m4, m0, m7
   4861    pmaddwd              m5, m1, m8
   4862    pshufd               m7, m9, q2222
   4863    pshufd               m9, m9, q3333
   4864    pmaddwd              m6, m2, m7
   4865    pmaddwd              m8, m3, m9
   4866 %if isput
   4867    movd                 m9, [rsp+0x28] ; vertical shift amount (put)
   4868  %define vrnd_mem [rsp+0x30]
   4869 %else
   4870  %define vrnd_mem [base+pd_m524256]
   4871 %endif
   4872    paddd                m4, m5
   4873    paddd                m6, m8
   4874    paddd                m4, m6
   4875    paddd                m4, vrnd_mem
   4876 %else
   ; x86-32: filter coefficients are fetched as two 32-bit halves (r4:r3)
   4877    mov                 mym, myd
   4878    mov                  r5, [esp+0x1f4]
   4879    xor                  r3, r3
   4880    shr                  r4, 6
   4881    lea                  r5, [r5+r4]
   4882    mov                  r4, 64 << 24
   4883    cmovnz               r4, [base+subpel_filters+r5*8+0]
   4884    cmovnz               r3, [base+subpel_filters+r5*8+4]
   4885    movd                 m7, r4
   4886    movd                 m6, r3
   4887    punpckldq            m7, m6
   4888    punpcklbw            m7, m7
   4889    psraw                m7, 8
   4890    pshufd               m4, m7, q0000
   4891    pshufd               m5, m7, q1111
   4892    pshufd               m6, m7, q2222
   4893    pshufd               m7, m7, q3333
   4894    pmaddwd              m0, m4
   4895    pmaddwd              m1, m5
   4896    pmaddwd              m2, m6
   4897    pmaddwd              m3, m7
   4898 %if isput
   4899    movd                 m4, [esp+0x18] ; vertical shift amount (put)
   4900 %endif
   4901    paddd                m0, m1
   4902    paddd                m2, m3
   4903    paddd                m0, vrnd_mem
   4904    paddd                m0, m2
   4905    SWAP                 m4, m0
   4906 %define m9 m0
   4907 %endif
   ; store one row: put clamps to [0, pixel_max] and writes 4 pixels to dstq;
   ; prep shifts by 6 and writes 8 bytes of intermediates to tmpq
   4908 %if isput
   4909    pxor                 m5, m5
   4910    psrad                m4, m9
   4911    packssdw             m4, m4
   4912    pmaxsw               m4, m5
   4913    pminsw               m4, pxmaxm
   4914    movq             [dstq], m4
   4915    add                dstq, dsmp
   4916 %else
   4917    psrad                m4, 6
   4918    packssdw             m4, m4
   4919    movq             [tmpq], m4
   4920    add                tmpq, 8
   4921 %endif
   4922    dec                  hd
   4923    jz .ret
   ; advance the vertical position; if it stays inside the current 8-row
   ; window (test ~0x3ff) no new source rows are needed.  Otherwise bit 0x400
   ; selects between a one-row advance (below) and a two-row advance
   ; (.w4_skip_line), shifting the 01/12/.../67 row-pair queue accordingly.
   4924 %if ARCH_X86_64
   4925    add                 myd, dyd
   4926    test                myd, ~0x3ff
   4927    jz .w4_loop
   4928    mova                 m8, [rsp+0x10]
   4929    movd                 m9, [rsp+0x20]
   4930    movu                 m4, [srcq]
   4931    movu                 m5, [srcq+r4]
   4932    test                myd, 0x400
   4933    jz .w4_skip_line
   ; one new row: rotate queue by one, filter the new row into slot 67
   4934    mova                 m0, [rsp+0x40]
   4935    mova         [rsp+0x40], m1
   4936    mova                 m1, [rsp+0x50]
   4937    mova         [rsp+0x50], m2
   4938    mova                 m2, [rsp+0x60]
   4939    mova         [rsp+0x60], m3
   4940    pshufb               m4, m12
   4941    pshufb               m5, m14
   4942    pmaddwd              m4, m13
   4943    pmaddwd              m5, m15
   4944    phaddd               m4, m5
   4945    paddd                m4, m8
   4946    psrad                m4, m9
   4947    packssdw             m4, m4
   4948    punpcklwd            m3, m10, m4
   4949    mova                m10, m4
   4950    add                srcq, ssq
   4951    jmp .w4_loop
   4952 .w4_skip_line:
   ; two new rows: rotate queue by two, filter both rows at once
   4953    movu                 m6, [srcq+ssq*1]
   4954    movu                 m7, [srcq+r6]
   4955    mova                 m0, [rsp+0x50]
   4956    mova                m11, [rsp+0x60]
   4957    pshufb               m4, m12
   4958    pshufb               m6, m12
   4959    pshufb               m5, m14
   4960    pshufb               m7, m14
   4961    pmaddwd              m4, m13
   4962    pmaddwd              m6, m13
   4963    pmaddwd              m5, m15
   4964    pmaddwd              m7, m15
   4965    mova         [rsp+0x40], m0
   4966    mova         [rsp+0x50], m11
   4967    phaddd               m4, m5
   4968    phaddd               m6, m7
   4969    paddd                m4, m8
   4970    paddd                m6, m8
   4971    psrad                m4, m9
   4972    psrad                m6, m9
   4973    packssdw             m4, m6
   4974    punpcklwd            m9, m10, m4
   4975    mova         [rsp+0x60], m9
   4976    pshufd              m10, m4, q1032
   4977    mova                 m0, m1
   4978    mova                 m1, m2
   4979    mova                 m2, m3
   4980    punpcklwd            m3, m4, m10
   4981    lea                srcq, [srcq+ssq*2]
   4982    jmp .w4_loop
   4983 %else
   ; x86-32 mirror of the advance logic, with the row-pair queue held in stk
   4984    SWAP                 m0, m4
   4985    mov                 myd, mym
   4986    mov                  r3, r3m
   4987    add                 myd, dym
   4988    test                myd, ~0x3ff
   4989    jnz .w4_next_line
   4990    mova                 m0, [stk+0x40]
   4991    mova                 m1, [stk+0x50]
   4992    mova                 m2, [stk+0x60]
   4993    mova                 m3, [stk+0x70]
   4994    jmp .w4_loop
   4995 .w4_next_line:
   4996    mov                  r5, [stk+0xc0]
   4997    movu                 m4, [srcq]
   4998    movu                 m5, [r5]
   4999    test                myd, 0x400
   5000    jz .w4_skip_line
   ; one new row: rotate the six queue slots through stk and append slot 67
   5001    add          [stk+0xc0], ssq
   5002    mova                 m0, [stk+0x80]
   5003    mova                 m3, [stk+0x50]
   5004    mova         [stk+0x40], m0
   5005    mova         [stk+0x80], m3
   5006    mova                 m1, [stk+0x90]
   5007    mova                 m6, [stk+0x60]
   5008    mova         [stk+0x50], m1
   5009    mova         [stk+0x90], m6
   5010    mova                 m2, [stk+0xa0]
   5011    mova                 m7, [stk+0x70]
   5012    mova         [stk+0x60], m2
   5013    mova         [stk+0xa0], m7
   5014    pshufb               m4, m12
   5015    pshufb               m5, m14
   5016    pmaddwd              m4, m13
   5017    pmaddwd              m5, m15
   5018    phaddd               m4, m5
   5019    paddd                m4, hrnd_mem
   5020    psrad                m4, hsh_mem
   5021    packssdw             m4, m4
   5022    punpcklwd            m3, [stk+0xb0], m4
   5023    mova         [stk+0xb0], m4
   5024    mova         [stk+0x70], m3
   5025    add                srcq, ssq
   5026    jmp .w4_loop
   5027 .w4_skip_line:
   ; two new rows: filter both, rotate queue by two
   5028    movu                 m6, [srcq+ssq*1]
   5029    movu                 m7, [r5  +ssq*1]
   5030    lea                  r5, [r5  +ssq*2]
   5031    mov          [stk+0xc0], r5
   5032    mova                 m0, [stk+0x50]
   5033    mova                 m1, [stk+0x60]
   5034    mova                 m2, [stk+0x70]
   5035    mova                 m3, [stk+0x90]
   5036    pshufb               m4, m12
   5037    pshufb               m6, m12
   5038    pshufb               m5, m14
   5039    pshufb               m7, m14
   5040    pmaddwd              m4, m13
   5041    pmaddwd              m6, m13
   5042    pmaddwd              m5, m15
   5043    pmaddwd              m7, m15
   5044    mova         [stk+0x40], m0
   5045    mova         [stk+0x50], m1
   5046    mova         [stk+0x60], m2
   5047    mova         [stk+0x80], m3
   5048    phaddd               m4, m5
   5049    phaddd               m6, m7
   5050    mova                 m5, [stk+0xa0]
   5051    mova                 m7, [stk+0xb0]
   5052    paddd                m4, hrnd_mem
   5053    paddd                m6, hrnd_mem
   5054    psrad                m4, hsh_mem
   5055    psrad                m6, hsh_mem
   5056    packssdw             m4, m6
   5057    punpcklwd            m7, m4
   5058    pshufd               m6, m4, q1032
   5059    mova         [stk+0x90], m5
   5060    mova         [stk+0xa0], m7
   5061    mova         [stk+0xb0], m6
   5062    punpcklwd            m3, m4, m6
   5063    mova         [stk+0x70], m3
   5064    lea                srcq, [srcq+ssq*2]
   5065    jmp .w4_loop
   5066 %endif
   ; Width dispatch for w >= 8: every width is processed as a sequence of
   ; 8-pixel-wide column tiles.  [stk+0xf0] holds the tile count consumed by
   ; .hloop_prep (dec'd once per tile); for the prep variant tmp_stridem is
   ; the intermediate-buffer row stride in bytes (w * 2).
   5067 INIT_XMM ssse3
   5068 %if ARCH_X86_64
   5069 %define stk rsp+0x20
   5070 %endif
   5071 .w8:
   5072    mov    dword [stk+0xf0], 1
   5073    movifprep   tmp_stridem, 16
   5074    jmp .w_start
   5075 .w16:
   5076    mov    dword [stk+0xf0], 2
   5077    movifprep   tmp_stridem, 32
   5078    jmp .w_start
   5079 .w32:
   5080    mov    dword [stk+0xf0], 4
   5081    movifprep   tmp_stridem, 64
   5082    jmp .w_start
   5083 .w64:
   5084    mov    dword [stk+0xf0], 8
   5085    movifprep   tmp_stridem, 128
   5086    jmp .w_start
   5087 .w128:
   5088    mov    dword [stk+0xf0], 16
   5089    movifprep   tmp_stridem, 256
   5090 .w_start:
   ; common per-call setup: extract the filter-type selector (high 16 bits of
   ; t0d / [esp+0x1f0]) into m15, then save the loop-invariant tile state
   5091 %if ARCH_X86_64
   5092 %ifidn %1, put
   5093    movifnidn           dsm, dsq
   5094 %endif
   5095    mova         [rsp+0x10], m11
   5096 %define hround m11
   5097    shr                 t0d, 16
   5098    movd                m15, t0d
   5099 %if isprep
   5100    mova                m13, [base+pd_m524256]
   5101 %endif
   5102 %else
   ; x86-32: remap spilled/shared registers for this section
   5103 %define hround [esp+0x00]
   5104 %define m12    [esp+0x10]
   5105 %define m10    [base+pd_0x3ff]
   5106 %define m8  m0
   5107 %xdefine m14 m4
   5108 %define m15 m3
   5109 %if isprep
   5110  %define ssq ssm
   5111 %endif
   5112    mov                  r4, [esp+0x1f0]
   5113    shr                  r4, 16
   5114    movd                m15, r4
   5115    mov                  r0, r0m
   5116    mov                 myd, mym
   5117 %endif
   5118    sub                srcq, 6
   5119    pslld                m7, m8, 2 ; dx*4
   5120    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
   5121    pshufd              m15, m15, q0000
   5122    paddd               m14, m8 ; mx+dx*[0-3]
   ; persist per-tile state: dx*4 step, filter selector, src base, dst/tmp ptr
   5123    mova        [stk+0x100], m7
   5124    mova        [stk+0x120], m15
   5125    mov         [stk+0x0f8], srcq
   5126    mov         [stk+0x130], r0q ; dstq / tmpq
   5127 %if ARCH_X86_64 && UNIX64
   5128    mov                  hm, hd
   5129 %elif ARCH_X86_32
   5130    mov                  r5, hm
   5131    mov         [stk+0x0f4], myd
   5132    mov         [stk+0x134], r5
   5133 %endif
   5134    jmp .hloop
   ; .hloop_prep: advance to the next 8-pixel column tile.  The output pointer
   ; moves 16 bytes right (8 output words); h, myd, the mx vector (m14), the
   ; dx*4 step (m7) and the source base are all restored from the saved state.
   5135 .hloop_prep:
   5136    dec   dword [stk+0x0f0]
   5137    jz .ret
   5138 %if ARCH_X86_64
   5139    add   qword [stk+0x130], 16
   5140    mov                  hd, hm
   5141 %else
   5142    add   dword [stk+0x130], 16
   5143    mov                 myd, [stk+0x0f4]
   5144    mov                  r5, [stk+0x134]
   5145    mov                  r0, [stk+0x130]
   5146 %endif
   5147    mova                 m7, [stk+0x100]
   5148    mova                m14, [stk+0x110]
   5149 %if ARCH_X86_64
   5150    mova                m10, [base+pd_0x3ff]
   5151    mova                m11, [rsp+0x10]
   5152 %endif
   5153    mova                m15, [stk+0x120]
   5154    mov                srcq, [stk+0x0f8]
   5155 %if ARCH_X86_64
   5156    mov                 r0q, [stk+0x130] ; dstq / tmpq
   5157 %else
   5158    mov                 mym, myd
   5159    mov                  hm, r5
   5160    mov                 r0m, r0
   5161    mov                  r3, r3m
   5162 %endif
   5163    paddd               m14, m7
   ; .hloop: per-column horizontal filter setup for one 8-wide tile.
   ; m14 = mx+dx*[0-3] (then advanced to [4-7]); the integer part (>>10) gives
   ; the per-column source offsets saved at [stk], and bits 6..9 of the
   ; fraction (pand pd_0x3ff, psrld 6) index subpel_filters.  The pcmpeqd
   ; masks flag lanes whose fraction is zero; the pand/pandn blend with
   ; pq_0x40000000 substitutes that constant (an effectively neutral filter
   ; after the byte expansion below) for those lanes.
   5164 .hloop:
   5165 %if ARCH_X86_64
   5166    mova                 m9, [base+pq_0x40000000]
   5167 %else
   5168 %define m9 [base+pq_0x40000000]
   5169 %endif
   5170    pxor                 m1, m1
   5171    psrld                m2, m14, 10
   5172    mova              [stk], m2
   5173    pand                 m6, m14, m10
   5174    psrld                m6, 6
   5175    paddd                m5, m15, m6
   5176    pcmpeqd              m6, m1
   5177    pshufd               m2, m5, q1032
   ; gather the four 8-byte filters for columns 0-3
   5178 %if ARCH_X86_64
   5179    movd                r4d, m5
   5180    movd                r6d, m2
   5181    pshufd               m5, m5, q0321
   5182    pshufd               m2, m2, q0321
   5183    movd                r7d, m5
   5184    movd                r9d, m2
   5185    movq                 m0, [base+subpel_filters+r4*8]
   5186    movq                 m1, [base+subpel_filters+r6*8]
   5187    movhps               m0, [base+subpel_filters+r7*8]
   5188    movhps               m1, [base+subpel_filters+r9*8]
   5189 %else
   5190    movd                 r0, m5
   5191    movd                 rX, m2
   5192    pshufd               m5, m5, q0321
   5193    pshufd               m2, m2, q0321
   5194    movd                 r4, m5
   5195    movd                 r5, m2
   5196    movq                 m0, [base+subpel_filters+r0*8]
   5197    movq                 m1, [base+subpel_filters+rX*8]
   5198    movhps               m0, [base+subpel_filters+r4*8]
   5199    movhps               m1, [base+subpel_filters+r5*8]
   5200 %endif
   ; same for columns 4-7
   5201    paddd               m14, m7 ; mx+dx*[4-7]
   5202    pand                 m5, m14, m10
   5203    psrld                m5, 6
   5204    paddd               m15, m5
   5205    pxor                 m2, m2
   5206    pcmpeqd              m5, m2
   5207    mova        [stk+0x110], m14
   5208    pshufd               m4, m15, q1032
   5209 %if ARCH_X86_64
   5210    movd               r10d, m15
   5211    movd               r11d, m4
   5212    pshufd              m15, m15, q0321
   5213    pshufd               m4, m4, q0321
   5214    movd               r13d, m15
   5215    movd                rXd, m4
   5216    movq                 m2, [base+subpel_filters+r10*8]
   5217    movq                 m3, [base+subpel_filters+r11*8]
   5218    movhps               m2, [base+subpel_filters+r13*8]
   5219    movhps               m3, [base+subpel_filters+ rX*8]
   ; extract the 8 per-column source offsets into r4/r6/r7/r9/r10/r11/r13/rX
   5220    psrld               m14, 10
   5221    movq                r11, m14
   5222    punpckhqdq          m14, m14
   5223    movq                 rX, m14
   5224    mov                r10d, r11d
   5225    shr                 r11, 32
   5226    mov                r13d, rXd
   5227    shr                  rX, 32
   5228    mov                 r4d, [stk+ 0]
   5229    mov                 r6d, [stk+ 4]
   5230    mov                 r7d, [stk+ 8]
   5231    mov                 r9d, [stk+12]
   ; blend neutral filter into zero-fraction lanes, then sign-extend the
   ; byte coefficients to words and spill all 8 column filters to stk
   5232    pshufd               m4, m6, q1100
   5233    pshufd               m6, m6, q3322
   5234    pshufd              m14, m5, q1100
   5235    pshufd               m5, m5, q3322
   5236    pand                 m7, m9, m4
   5237    pand                 m8, m9, m6
   5238    pand                m15, m9, m14
   5239    pand                 m9, m9, m5
   5240    pandn                m4, m0
   5241    pandn                m6, m1
   5242    pandn               m14, m2
   5243    pandn                m5, m3
   5244    por                  m7, m4
   5245    por                  m8, m6
   5246    por                 m15, m14
   5247    por                  m9, m5
   5248    punpcklbw            m0, m7, m7
   5249    punpckhbw            m7, m7
   5250    punpcklbw            m1, m8, m8
   5251    punpckhbw            m8, m8
   5252    psraw                m0, 8
   5253    psraw                m7, 8
   5254    psraw                m1, 8
   5255    psraw                m8, 8
   5256    punpcklbw            m2, m15, m15
   5257    punpckhbw           m15, m15
   5258    punpcklbw            m3, m9, m9
   5259    punpckhbw            m9, m9
   5260    psraw                m2, 8
   5261    psraw               m15, 8
   5262    psraw                m3, 8
   5263    psraw                m9, 8
   5264    mova         [stk+0x10], m0
   5265    mova         [stk+0x20], m7
   5266    mova         [stk+0x30], m1
   5267    mova         [stk+0x40], m8
   5268    mova         [stk+0x50], m2
   5269    mova         [stk+0x60], m15
   5270    mova         [stk+0x70], m3
   5271    mova         [stk+0x80], m9
   ; horizontally filter the first 8 source rows for this tile
   5272    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
   5273    mova         [stk+0x90], m1
   5274    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
   5275    mova         [stk+0xa0], m2
   5276    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
   5277    mova         [stk+0xb0], m3
   5278    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
   5279    mova         [stk+0xc0], m4
   5280    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
   5281    mova         [stk+0xd0], m5
   5282    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
   5283    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
   5284    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
   ; interleave adjacent rows into a/b halves for the vertical pass:
   ; 01/23 pairs stay in m0-m3, 45/67 pairs go to [stk+0x90..0xc0]
   5285    mova                 m5, [stk+0xd0]
   5286    mova                 m1, [stk+0x90]
   5287    mova                 m2, [stk+0xa0]
   5288    mova                 m3, [stk+0xb0]
   5289    mova                 m9, [stk+0xc0]
   5290    mov                 myd, mym
   5291    mov                 dyd, dym
   5292    punpcklwd            m4, m5, m6 ; 45a
   5293    punpckhwd            m5, m6     ; 45b
   5294    punpcklwd            m6, m7, m8 ; 67a
   5295    punpckhwd            m7, m8     ; 67b
   5296    punpcklwd            m0, m1, m2 ; 01a
   5297    punpckhwd            m1, m2     ; 01b
   5298    punpcklwd            m2, m3, m9 ; 23a
   5299    punpckhwd            m3, m9     ; 23b
   5300    mova         [stk+0x90], m4
   5301    mova         [stk+0xa0], m5
   5302    mova         [stk+0xb0], m6
   5303    mova         [stk+0xc0], m7
   5304 %define hround [rsp+0x10]
   ; .vloop (x86-64 branch): one 8-wide output row per iteration.  The
   ; vertical filter is fetched exactly like in .w4_loop (64<<24 default,
   ; cmovnz from subpel_filters), its byte taps expanded to words, then the
   ; four tap-pairs are applied to the 01/23 pairs in m0-m3 and the 45/67
   ; pairs spilled at [stk+0x90..0xc0].  m13 holds the vertical rounding
   ; constant here (pd_m524256 for prep, see .w_start; put path presumably
   ; loads it elsewhere in the macro — outside this view).
   5305 .vloop:
   5306    and                 myd, 0x3ff
   5307    mov                 r6d, 64 << 24
   5308    mov                 r4d, myd
   5309    shr                 r4d, 6
   5310    lea                 r4d, [t1+r4]
   5311    cmovnz              r6q, [base+subpel_filters+r4*8]
   5312    movq                m11, r6q
   5313    punpcklbw           m11, m11
   5314    psraw               m11, 8
   5315    pshufd               m5, m11, q0000
   5316    pshufd               m7, m11, q1111
   5317    pshufd              m10, m11, q2222
   5318    pshufd              m11, m11, q3333
   5319    pmaddwd              m4, m5, m0
   5320    pmaddwd              m5, m5, m1
   5321    pmaddwd              m6, m7, m2
   5322    pmaddwd              m7, m7, m3
   5323    paddd                m4, m13
   5324    paddd                m5, m13
   5325    paddd                m4, m6
   5326    paddd                m5, m7
   5327    pmaddwd              m6, [stk+0x90], m10
   5328    pmaddwd              m7, [stk+0xa0], m10
   5329    pmaddwd              m8, [stk+0xb0], m11
   5330    pmaddwd              m9, [stk+0xc0], m11
   5331    paddd                m4, m6
   5332    paddd                m5, m7
   5333 %if isput
   5334    pshufd               m6, m12, q1032 ; vertical shift amount (put)
   5335 %endif
   5336    paddd                m4, m8
   5337    paddd                m5, m9
   5338 %else
   ; x86-32 branch of the tile setup: finish the .hloop filter gather with the
   ; few registers available, spill every column filter, run the horizontal
   ; helper over rows 0-7, and interleave the row pairs into stk slots
   5339    movd                 r0, m15
   5340    movd                 rX, m4
   5341    pshufd              m15, m15, q0321
   5342    pshufd               m4, m4, q0321
   5343    movd                 r4, m15
   5344    movd                 r5, m4
   5345    mova                m14, [stk+0x110]
   5346    movq                 m2, [base+subpel_filters+r0*8]
   5347    movq                 m3, [base+subpel_filters+rX*8]
   5348    movhps               m2, [base+subpel_filters+r4*8]
   5349    movhps               m3, [base+subpel_filters+r5*8]
   5350    psrld               m14, 10
   5351    mova           [stk+16], m14
   5352    mov                  r0, [stk+ 0]
   5353    mov                  rX, [stk+ 4]
   5354    mov                  r4, [stk+ 8]
   5355    mov                  r5, [stk+12]
   5356    mova         [stk+0x20], m0
   5357    mova         [stk+0x30], m1
   5358    mova         [stk+0x40], m2
   5359    mova         [stk+0x50], m3
   ; blend neutral filter into zero-fraction lanes (same pattern as 64-bit)
   5360    pshufd               m4, m6, q1100
   5361    pshufd               m6, m6, q3322
   5362    pshufd               m7, m5, q1100
   5363    pshufd               m5, m5, q3322
   5364    pand                 m0, m9, m4
   5365    pand                 m1, m9, m6
   5366    pand                 m2, m9, m7
   5367    pand                 m3, m9, m5
   5368    pandn                m4, [stk+0x20]
   5369    pandn                m6, [stk+0x30]
   5370    pandn                m7, [stk+0x40]
   5371    pandn                m5, [stk+0x50]
   5372    por                  m0, m4
   5373    por                  m1, m6
   5374    por                  m2, m7
   5375    por                  m3, m5
   5376    punpcklbw            m4, m0, m0
   5377    punpckhbw            m0, m0
   5378    punpcklbw            m5, m1, m1
   5379    punpckhbw            m1, m1
   5380    psraw                m4, 8
   5381    psraw                m0, 8
   5382    psraw                m5, 8
   5383    psraw                m1, 8
   5384    punpcklbw            m6, m2, m2
   5385    punpckhbw            m2, m2
   5386    punpcklbw            m7, m3, m3
   5387    punpckhbw            m3, m3
   5388    psraw                m6, 8
   5389    psraw                m2, 8
   5390    psraw                m7, 8
   5391    psraw                m3, 8
   5392    mova        [stk+0x0a0], m4
   5393    mova        [stk+0x0b0], m0
   5394    mova        [stk+0x0c0], m5
   5395    mova        [stk+0x0d0], m1
   5396    mova        [stk+0x140], m6
   5397    mova        [stk+0x150], m2
   5398    mova        [stk+0x160], m7
   5399    mova        [stk+0x170], m3
   5400    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
   5401    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
   5402    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
   5403    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
   5404    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
   5405    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
   5406    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
   5407    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
   5408    mova                 m5, [stk+0x60]
   5409    mova                 m6, [stk+0x70]
   5410    mova                 m7, [stk+0x80]
   5411    mova                 m0, [stk+0x90]
   5412    mov                 myd, mym
   5413    punpcklwd            m4, m5, m6      ; 45a
   5414    punpckhwd            m5, m6          ; 45b
   5415    punpcklwd            m6, m7, m0      ; 67a
   5416    punpckhwd            m7, m0          ; 67b
   5417    mova         [stk+0x60], m4
   5418    mova         [stk+0x70], m5
   5419    mova         [stk+0x80], m6
   5420    mova         [stk+0x90], m7
   5421    mova                 m1, [stk+0x20]
   5422    mova                 m2, [stk+0x30]
   5423    mova                 m3, [stk+0x40]
   5424    mova                 m4, [stk+0x50]
   5425    punpcklwd            m0, m1, m2      ; 01a
   5426    punpckhwd            m1, m2          ; 01b
   5427    punpcklwd            m2, m3, m4      ; 23a
   5428    punpckhwd            m3, m4          ; 23b
   5429    mova         [stk+0x20], m0
   5430    mova         [stk+0x30], m1
   5431    mova         [stk+0x40], m2
   5432    mova         [stk+0x50], m3
   ; NOTE(review): this second .vloop label is the x86-32 counterpart of the
   ; one above; only one of the two survives preprocessing (they live in the
   ; %if ARCH_X86_64 / %else branches), so the label is not duplicated.
   5433 .vloop:
   5434    mov                  r0, r0m
   5435    mov                  r5, [esp+0x1f4]
   5436    and                 myd, 0x3ff
   5437    mov                 mym, myd
   5438    xor                  r3, r3
   5439    shr                  r4, 6
   5440    lea                  r5, [r5+r4]
   5441    mov                  r4, 64 << 24
   5442    cmovnz               r4, [base+subpel_filters+r5*8+0]
   5443    cmovnz               r3, [base+subpel_filters+r5*8+4]
   5444    movd                 m7, r4
   5445    movd                 m6, r3
   5446    punpckldq            m7, m6
   5447    punpcklbw            m7, m7
   5448    psraw                m7, 8
   5449    pshufd               m4, m7, q0000
   5450    pshufd               m5, m7, q1111
   5451    pmaddwd              m0, m4
   5452    pmaddwd              m1, m4
   5453    pmaddwd              m2, m5
   5454    pmaddwd              m3, m5
   5455    pshufd               m6, m7, q2222
   5456    pshufd               m7, m7, q3333
   5457    paddd                m0, m2
   5458    paddd                m1, m3
   5459    pmaddwd              m2, [stk+0x60], m6
   5460    pmaddwd              m3, [stk+0x70], m6
   5461    pmaddwd              m4, [stk+0x80], m7
   5462    pmaddwd              m5, [stk+0x90], m7
   5463 %if isput
   5464    movd                 m6, [esp+0x18] ; vertical shift amount (put)
   5465 %endif
   5466    paddd                m0, m2
   5467    paddd                m1, m3
   5468    paddd                m0, vrnd_mem
   5469    paddd                m1, vrnd_mem
   5470    paddd                m4, m0
   5471    paddd                m5, m1
   5472 %endif
   ; shared store epilogue: put clamps to [0, pixel_max] and writes 8 pixels;
   ; prep shifts by 6 and writes 16 bytes of intermediates
   5473 %ifidn %1, put
   5474    psrad                m4, m6
   5475    psrad                m5, m6
   5476    packssdw             m4, m5
   5477    pxor                 m7, m7
   5478    pmaxsw               m4, m7
   5479    pminsw               m4, pxmaxm
   5480    mova             [dstq], m4
   5481    add                dstq, dsm
   5482 %else
   5483    psrad                m4, 6
   5484    psrad                m5, 6
   5485    packssdw             m4, m5
   5486    mova             [tmpq], m4
   5487    add                tmpq, tmp_stridem
   5488 %endif
   5489    dec                  hd
   5490    jz .hloop_prep
   ; Row advance between output rows for the 8-wide tile loop.  As in the w4
   ; case: if myd stays within the current source window (test ~0x3ff) loop
   ; straight back; otherwise bit 0x400 of myd selects a one-row advance
   ; (re-blend the row-pair queue with the [base+unpckw] shuffle and filter
   ; one new row inline) or a two-row advance (.skip_line: two fresh
   ; MC_8TAP_SCALED_H rows, queue shifted by a whole pair).
   5491 %if ARCH_X86_64
   5492    add                 myd, dyd
   5493    test                myd, ~0x3ff
   5494    jz .vloop
   5495    test                myd, 0x400
   5496    mov         [stk+0x140], myd
   5497    mov                 r4d, [stk+ 0]
   5498    mov                 r6d, [stk+ 4]
   5499    mov                 r7d, [stk+ 8]
   5500    mov                 r9d, [stk+12]
   5501    jz .skip_line
   ; one-row advance: horizontally filter the new row in-line (8 columns,
   ; per-column offsets r4..rX, filters at [stk+0x10..0x80])
   5502    mova                m14, [base+unpckw]
   5503    movu                 m8, [srcq+r10*2]
   5504    movu                 m9, [srcq+r11*2]
   5505    movu                m10, [srcq+r13*2]
   5506    movu                m11, [srcq+ rX*2]
   5507    movu                 m4, [srcq+ r4*2]
   5508    movu                 m5, [srcq+ r6*2]
   5509    movu                 m6, [srcq+ r7*2]
   5510    movu                 m7, [srcq+ r9*2]
   5511    add                srcq, ssq
   5512    mov                 myd, [stk+0x140]
   5513    mov                 dyd, dym
   5514    pshufd              m15, m14, q1032
   5515    pshufb               m0, m14                ; 0a 1a
   5516    pshufb               m1, m14                ; 0b 1b
   5517    pshufb               m2, m15                ; 3a 2a
   5518    pshufb               m3, m15                ; 3b 2b
   5519    pmaddwd              m8, [stk+0x50]
   5520    pmaddwd              m9, [stk+0x60]
   5521    pmaddwd             m10, [stk+0x70]
   5522    pmaddwd             m11, [stk+0x80]
   5523    pmaddwd              m4, [stk+0x10]
   5524    pmaddwd              m5, [stk+0x20]
   5525    pmaddwd              m6, [stk+0x30]
   5526    pmaddwd              m7, [stk+0x40]
   5527    phaddd               m8, m9
   5528    phaddd              m10, m11
   5529    mova                m11, hround
   5530    phaddd               m4, m5
   5531    phaddd               m6, m7
   5532    phaddd               m8, m10
   5533    phaddd               m4, m6
   5534    paddd                m4, m11
   5535    paddd                m8, m11
   5536    psrad                m4, m12
   5537    psrad                m8, m12
   5538    packssdw             m4, m8
   ; rebuild the 12/34/56/78 interleavings from the shuffled old pairs + new row
   5539    pshufb               m5, [stk+0x90], m14    ; 4a 5a
   5540    pshufb               m6, [stk+0xa0], m14    ; 4b 5b
   5541    pshufb               m7, [stk+0xb0], m15    ; 7a 6a
   5542    pshufb               m8, [stk+0xc0], m15    ; 7b 6b
   5543    punpckhwd            m0, m2 ; 12a
   5544    punpckhwd            m1, m3 ; 12b
   5545    punpcklwd            m2, m5 ; 34a
   5546    punpcklwd            m3, m6 ; 34b
   5547    punpckhwd            m5, m7 ; 56a
   5548    punpckhwd            m6, m8 ; 56b
   5549    punpcklwd            m7, m4 ; 78a
   5550    punpckhqdq           m4, m4
   5551    punpcklwd            m8, m4 ; 78b
   5552    mova         [stk+0x90], m5
   5553    mova         [stk+0xa0], m6
   5554    mova         [stk+0xb0], m7
   5555    mova         [stk+0xc0], m8
   5556    jmp .vloop
   5557 .skip_line:
   ; two-row advance: filter two fresh rows, shift the queue by one whole pair
   5558    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11
   5559    MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11
   5560    mov                 myd, [stk+0x140]
   5561    mov                 dyd, dym
   5562    mova                 m0, m2         ; 01a
   5563    mova                 m1, m3         ; 01b
   5564    mova                 m2, [stk+0x90] ; 23a
   5565    mova                 m3, [stk+0xa0] ; 23b
   5566    mova                 m5, [stk+0xb0] ; 45a
   5567    mova                 m6, [stk+0xc0] ; 45b
   5568    punpcklwd            m7, m4, m8     ; 67a
   5569    punpckhwd            m4, m8         ; 67b
   5570    mova         [stk+0x90], m5
   5571    mova         [stk+0xa0], m6
   5572    mova         [stk+0xb0], m7
   5573    mova         [stk+0xc0], m4
   5574 %else
   ; x86-32 mirror of the advance logic, queue entirely in stk slots
   5575    mov                 r0m, r0
   5576    mov                 myd, mym
   5577    mov                  r3, r3m
   5578    add                 myd, dym
   5579    test                myd, ~0x3ff
   5580    mov                 mym, myd
   5581    jnz .next_line
   5582    mova                 m0, [stk+0x20]
   5583    mova                 m1, [stk+0x30]
   5584    mova                 m2, [stk+0x40]
   5585    mova                 m3, [stk+0x50]
   5586    jmp .vloop
   5587 .next_line:
   5588    test                myd, 0x400
   5589    mov                  r0, [stk+ 0]
   5590    mov                  rX, [stk+ 4]
   5591    mov                  r4, [stk+ 8]
   5592    mov                  r5, [stk+12]
   5593    jz .skip_line
   ; one-row advance: filter row 8 into [stk+0xe0], re-blend the queue
   5594    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
   5595    mova                 m7, [base+unpckw]
   5596    pshufd               m4, m7, q1032
   5597    pshufb               m0, [stk+0x20], m7 ; 0a 1a
   5598    pshufb               m1, [stk+0x30], m7 ; 0b 1b
   5599    pshufb               m2, [stk+0x40], m4 ; 3a 2a
   5600    pshufb               m3, [stk+0x50], m4 ; 3b 2b
   5601    pshufb               m5, [stk+0x60], m7 ; 4a 5a
   5602    pshufb               m6, [stk+0x70], m7 ; 4b 5b
   5603    pshufb               m7, [stk+0x80], m4 ; 7a 6a
   5604    punpckhwd            m0, m2 ; 12a
   5605    punpckhwd            m1, m3 ; 12b
   5606    punpcklwd            m2, m5 ; 34a
   5607    punpcklwd            m3, m6 ; 34b
   5608    mova         [stk+0x20], m0
   5609    mova         [stk+0x30], m1
   5610    mova         [stk+0x40], m2
   5611    mova         [stk+0x50], m3
   5612    punpckhwd            m5, m7 ; 56a
   5613    mova         [stk+0x60], m5
   5614    pshufb               m5, [stk+0x90], m4 ; 7b 6b
   5615    punpcklwd            m7, [stk+0xe0] ; 78a
   5616    punpckhwd            m6, m5 ; 56b
   5617    mova         [stk+0x70], m6
   5618    movq                 m6, [stk+0xe8]
   5619    mova         [stk+0x80], m7
   5620    punpcklwd            m5, m6
   5621    mov                 myd, mym
   5622    mova         [stk+0x90], m5
   5623    jmp .vloop
   5624 .skip_line:
   ; two-row advance: filter rows 8 and 9, shift queue by a whole pair
   ; (continues past this point in the file)
   5625    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
   5626    MC_8TAP_SCALED_H 0xa0, 0       ; 9
   5627    mova                 m7, [stk+0xe0]
   5628    mova                 m2, [stk+0x60] ; 23a
   5629    mova                 m3, [stk+0x70] ; 23b
   5630    mova                 m4, [stk+0x80] ; 45a
   5631    mova                 m5, [stk+0x90] ; 45b
   5632    punpcklwd            m6, m7, m0     ; 67a
   5633    punpckhwd            m7, m0         ; 67b
   5634    mova                 m0, [stk+0x40] ; 01a
   5635    mova                 m1, [stk+0x50] ; 01b
   5636    mov                 myd, mym
   5637    mova         [stk+0x40], m2
   5638    mova         [stk+0x50], m3
   5639    mova         [stk+0x60], m4
   5640    mova         [stk+0x70], m5
   5641    mova         [stk+0x80], m6
   5642    mova         [stk+0x90], m7
   5643    mova         [stk+0x20], m0
   5644    mova         [stk+0x30], m1
   5645 %endif
   5646    jmp .vloop
   5647 INIT_XMM ssse3
   5648 .dy1:
        ; dy == 1 fast path: the vertical step advances exactly one source row
        ; per output row, so the vertical filter phase stays constant for the
        ; whole block (my is resolved once, before the row loop, in each width
        ; handler below).  Dispatch to the per-width implementation through the
        ; dy1 jump table.  NOTE(review): %1 is the enclosing macro's put/prep
        ; argument -- this entire region is the interior of a larger macro.
   5649    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
   5650    add                  wq, base_reg
   5651    jmp                  wq
   5652 %if isput
   5653 .dy1_w2:
        ; 2-pixel-wide put path (prep has no w2 case, hence the isput guard).
        ; Setup: derive the two per-column horizontal filters from
        ; subpel_filters using mx+dx*[0-1], build the pshufb gather mask (m14),
        ; horizontally filter the first 7 rows, then resolve the 8-tap vertical
        ; filter once (dy == 1 -> constant phase) and split it into four
        ; 2-tap word-pair coefficients m7/m8/m9/m10.
   5654 %if ARCH_X86_64
   5655    mov                 myd, mym
   5656    movzx               t0d, t0b
   5657    sub                srcq, 2
   5658    movd                m15, t0d
   5659 %else
        ; x86-32: too few XMM regs -- remap several "virtual" registers onto
        ; stack slots / other regs via %define for the duration of this path.
   5660  %define m8  m0
   5661  %define m9  m1
   5662  %define m14 m4
   5663  %define m15 m3
   5664  %define m11 [esp+0x00]
   5665  %define m12 [esp+0x10]
   5666  %define m13 [esp+0x20]
   5667    movzx                r5, byte [esp+0x1f0]
   5668    sub                srcq, 2
   5669    movd                m15, r5
   5670    mov                  r1, r1m
   5671 %endif
   5672    pxor                 m9, m9
   5673    punpckldq            m9, m8
   5674    paddd               m14, m9 ; mx+dx*[0-1]
   5675 %if ARCH_X86_64
   5676    mova                 m9, [base+pd_0x4000]
   5677 %endif
   5678    pshufd              m15, m15, q0000
   5679    pand                 m8, m14, m10
   5680    psrld                m8, 6
   5681    paddd               m15, m8
        ; r4d / r6d (r3d on x86-32) = per-column filter indices into
        ; subpel_filters.
   5682    movd                r4d, m15
   5683    pshufd              m15, m15, q0321
   5684 %if ARCH_X86_64
   5685    movd                r6d, m15
   5686 %else
   5687    movd                r3d, m15
   5688 %endif
   5689    mova                 m5, [base+bdct_lb_q]
   5690    mova                 m6, [base+spel_s_shuf2]
   5691    movd                m15, [base+subpel_filters+r4*8+2]
   5692 %if ARCH_X86_64
   5693    movd                 m7, [base+subpel_filters+r6*8+2]
   5694 %else
   5695    movd                 m7, [base+subpel_filters+r3*8+2]
   5696 %endif
   5697    pxor                 m2, m2
   5698    pcmpeqd              m8, m2
   5699    psrld               m14, 10
   5700    paddd               m14, m14
   5701 %if ARCH_X86_32
   5702    mov                  r3, r3m
   5703    pshufb              m14, m5
   5704    paddb               m14, m6
   5705    mova              [stk], m14
   5706    SWAP                 m5, m0
   5707    SWAP                 m6, m3
   5708  %define m15 m6
   5709 %endif
        ; Load the first 4 source rows (rows 0-3).
   5710    movu                 m0, [srcq+ssq*0]
   5711    movu                 m1, [srcq+ssq*1]
   5712    movu                 m2, [srcq+ssq*2]
   5713    movu                 m3, [srcq+ss3q ]
   5714    lea                srcq, [srcq+ssq*4]
   5715    punpckldq           m15, m7
   5716 %if ARCH_X86_64
   5717    pshufb              m14, m5
   5718    paddb               m14, m6
        ; Blend the edge-case constant (pd_0x4000) into the filter where the
        ; pcmpeqd mask selected it, giving the final horizontal coefficients.
   5719    pand                 m9, m8
   5720    pandn                m8, m15
   5721    SWAP                m15, m8
   5722    por                 m15, m9
   5723    movu                 m4, [srcq+ssq*0]
   5724    movu                 m5, [srcq+ssq*1]
   5725    movu                 m6, [srcq+ssq*2]
   5726    add                srcq, ss3q
        ; Resolve the vertical 8-tap filter once: 64<<24 is the identity
        ; filter used when the sub-pel offset is 0, else load from the table.
   5727    shr                 myd, 6
   5728    mov                 r4d, 64 << 24
   5729    lea                 myd, [t1+myq]
   5730    cmovnz              r4q, [base+subpel_filters+myq*8]
   5731 %else
   5732    pand                 m7, m5, [base+pd_0x4000]
   5733    pandn                m5, m15
   5734    por                  m5, m7
   5735  %define m15 m5
   5736    mov                 myd, mym
   5737    mov                  r5, [esp+0x1f4]
   5738    xor                  r3, r3
   5739    shr                 myd, 6
   5740    lea                  r5, [r5+myd]
   5741    mov                  r4, 64 << 24
   5742    cmovnz               r4, [base+subpel_filters+r5*8+0]
   5743    cmovnz               r3, [base+subpel_filters+r5*8+4]
   5744    mov          [stk+0x20], r3
   5745    mov                  r3, r3m
   5746 %endif
   5747    punpcklbw           m15, m15
   5748    psraw               m15, 8
        ; Horizontal filtering: gather taps with pshufb, multiply-accumulate,
        ; then pairwise-add, round (m11) and shift (m12) down to words.
   5749    REPX    {pshufb x, m14}, m0, m1, m2, m3
   5750    REPX   {pmaddwd x, m15}, m0, m1, m2, m3
   5751 %if ARCH_X86_64
   5752    REPX    {pshufb x, m14}, m4, m5, m6
   5753    REPX   {pmaddwd x, m15}, m4, m5, m6
   5754    phaddd               m0, m1
   5755    phaddd               m2, m3
   5756    phaddd               m4, m5
   5757    phaddd               m6, m6
   5758    REPX     {paddd x, m11}, m0, m2, m4, m6
   5759    REPX     {psrad x, m12}, m0, m2, m4, m6
   5760    packssdw             m0, m2 ; 0 1 2 3
   5761    packssdw             m4, m6 ; 4 5 6
   5762    SWAP                 m1, m4
   5763    movq                m10, r4
   5764 %else
   5765    mova         [stk+0x10], m15
   5766    phaddd               m0, m1
   5767    phaddd               m2, m3
   5768    movu                 m1, [srcq+ssq*0]
   5769    movu                 m7, [srcq+ssq*1]
   5770    movu                 m6, [srcq+ssq*2]
   5771    add                srcq, ss3q
   5772    REPX    {pshufb x, m14}, m1, m7, m6
   5773    REPX   {pmaddwd x, m15}, m1, m7, m6
   5774  %define m14 [stk+0x00]
   5775  %define m15 [stk+0x10]
   5776    phaddd               m1, m7
   5777    phaddd               m6, m6
   5778    REPX     {paddd x, m11}, m0, m2, m1, m6
   5779    REPX     {psrad x, m12}, m0, m2, m1, m6
   5780    packssdw             m0, m2
   5781    packssdw             m1, m6
   5782  %define m8  m6
   5783  %define m9  m4
   5784  %define m10 m5
   5785    movd                m10, r4
   5786    movd                 m9, [stk+0x20]
   5787    punpckldq           m10, m9
   5788 %endif
        ; Expand the 8 signed-byte vertical taps to words and broadcast them
        ; as four 2-tap pairs: m7 = taps 0/1, m8 = 2/3, m9 = 4/5, m10 = 6/7.
   5789    punpcklbw           m10, m10
   5790    psraw               m10, 8
   5791    pshufd               m7, m10, q0000
   5792    pshufd               m8, m10, q1111
   5793    pshufd               m9, m10, q2222
   5794    pshufd              m10, m10, q3333
   5795 %if ARCH_X86_32
   5796    mova         [stk+0x50], m7
   5797    mova         [stk+0x60], m8
   5798    mova         [stk+0x70], m9
   5799    mova         [stk+0x80], m10
   5800  %define m7  [stk+0x50]
   5801  %define m8  [stk+0x60]
   5802  %define m9  [stk+0x70]
   5803  %define m10 [stk+0x80]
   5804 %endif
        ; Interleave consecutive rows into word pairs for pmaddwd:
        ; m3 = rows 01|12, m0 = 23|34, m2 = 45|56 (m4 keeps rows 5/6 for the
        ; 67|78 pair built inside the loop).
   5805    palignr              m2, m1, m0, 4 ; 1 2 3 4
   5806    punpcklwd            m3, m0, m2    ; 01 12
   5807    punpckhwd            m0, m2        ; 23 34
   5808    pshufd               m4, m1, q2121 ; 5 6 5 6
   5809    punpcklwd            m2, m1, m4    ; 45 56
   5810 %if ARCH_X86_32
   5811    mov                  r0, r0m
   5812 %endif
   5813 .dy1_w2_loop:
        ; Per 2 output rows: horizontally filter 2 new source rows, rotate the
        ; row-pair pipeline (m3 <- m0 <- m2 <- new 67|78), vertically filter
        ; with the four constant tap pairs, round, shift, clamp to
        ; [0, pixel_max] and store two 2-pixel rows.
   5814    movu                 m1, [srcq+ssq*0]
   5815    movu                 m6, [srcq+ssq*1]
   5816    lea                srcq, [srcq+ssq*2]
   5817    pmaddwd              m5, m3, m7
   5818    mova                 m3, m0
   5819    pmaddwd              m0, m8
   5820    pshufb               m1, m14
   5821    pshufb               m6, m14
   5822    pmaddwd              m1, m15
   5823    pmaddwd              m6, m15
   5824    phaddd               m1, m6
   5825    paddd                m1, m11
   5826    psrad                m1, m12
   5827    packssdw             m1, m1
   5828    paddd                m5, m0
   5829    mova                 m0, m2
   5830    pmaddwd              m2, m9
   5831    paddd                m5, m2
   5832    palignr              m2, m1, m4, 12
   5833    punpcklwd            m2, m1        ; 67 78
   5834    pmaddwd              m4, m2, m10
   5835    paddd                m5, m13
   5836    paddd                m5, m4
   5837    pxor                 m6, m6
   5838    mova                 m4, m1
        ; Vertical shift amount lives in the high half of m12 (q1032 swap).
   5839    pshufd               m1, m12, q1032
   5840    psrad                m5, m1
   5841    packssdw             m5, m5
   5842    pmaxsw               m5, m6
   5843    pminsw               m5, pxmaxm
   5844    movd       [dstq+dsq*0], m5
   5845    pshuflw              m5, m5, q1032
   5846    movd       [dstq+dsq*1], m5
   5847    lea                dstq, [dstq+dsq*2]
   5848    sub                  hd, 2
   5849    jg .dy1_w2_loop
   5850    RET
   5851 %endif
   5852 INIT_XMM ssse3
   5853 .dy1_w4:
        ; 4-pixel-wide dy==1 path (both put and prep).  Setup mirrors .dy1_w2
        ; but with four per-column horizontal filters (mx+dx*[0-3]) split
        ; across two pshufb masks (m12 low half / m14 high half) and two
        ; coefficient registers (m13 / m15).  Rows 0-6 are filtered up front,
        ; the constant vertical 8-tap filter is resolved once, and the loop
        ; then produces 2 output rows per iteration.
   5854 %if ARCH_X86_64
   5855    mov                 myd, mym
        ; Spill the horizontal rounding/shift constants; 'stk' scratch area
        ; starts after them (and after the vertical rounder for put).
   5856    mova         [rsp+0x10], m11
   5857    mova         [rsp+0x20], m12
   5858 %if isput
   5859    mova         [rsp+0x30], m13
   5860  %define vrnd_mem [rsp+0x30]
   5861  %define stk rsp+0x40
   5862 %else
   5863  %define vrnd_mem [base+pd_m524256]
   5864  %define stk rsp+0x30
   5865 %endif
   5866    movzx               t0d, t0b
   5867    sub                srcq, 2
   5868    movd                m15, t0d
   5869 %else
        ; x86-32 register-pressure remapping (see .dy1_w2).
   5870 %define m10 [base+pd_0x3ff]
   5871 %define m9  [base+pd_0x4000]
   5872 %define m8  m0
   5873 %xdefine m14 m4
   5874 %define m15 m3
   5875 %if isprep
   5876  %define ssq r3
   5877 %endif
   5878    movzx                r5, byte [esp+0x1f0]
   5879    sub                srcq, 2
   5880    movd                m15, r5
   5881 %endif
   5882    pmaddwd              m8, [base+rescale_mul]
   5883 %if ARCH_X86_64
   5884    mova                 m9, [base+pd_0x4000]
   5885 %endif
   5886    pshufd              m15, m15, q0000
   5887    paddd               m14, m8 ; mx+dx*[0-3]
   5888    pand                 m0, m14, m10
   5889    psrld                m0, 6
   5890    paddd               m15, m0
   5891    pshufd               m7, m15, q1032
        ; Extract the four per-column filter-table indices into GPRs and fetch
        ; the corresponding 4-tap horizontal filters.
   5892 %if ARCH_X86_64
   5893    movd                r4d, m15
   5894    movd               r11d, m7
   5895    pshufd              m15, m15, q0321
   5896    pshufd               m7, m7, q0321
   5897    movd                r6d, m15
   5898    movd               r13d, m7
   5899    mova                m10, [base+bdct_lb_q+ 0]
   5900    mova                m11, [base+bdct_lb_q+16]
   5901    movd                m13, [base+subpel_filters+ r4*8+2]
   5902    movd                 m2, [base+subpel_filters+ r6*8+2]
   5903    movd                m15, [base+subpel_filters+r11*8+2]
   5904    movd                 m4, [base+subpel_filters+r13*8+2]
   5905 %else
   5906    movd                 r0, m15
   5907    movd                 r4, m7
   5908    pshufd              m15, m15, q0321
   5909    pshufd               m7, m7, q0321
   5910    movd                 rX, m15
   5911    movd                 r5, m7
   5912    mova                 m5, [base+bdct_lb_q+ 0]
   5913    mova                 m6, [base+bdct_lb_q+16]
   5914    movd                 m1, [base+subpel_filters+r0*8+2]
   5915    movd                 m2, [base+subpel_filters+rX*8+2]
   5916    movd                 m3, [base+subpel_filters+r4*8+2]
   5917    movd                 m7, [base+subpel_filters+r5*8+2]
   5918    SWAP                 m4, m7
   5919 %if isprep
   5920    mov                  r3, r3m
   5921 %endif
   5922 %define m10 m5
   5923 %define m11 m6
   5924 %define m12 m1
   5925 %define m13 m1
   5926 %endif
   5927    psrld               m14, 10
   5928    paddd               m14, m14
        ; Pack the four column filters into m13 (expanded to m13/m15 word
        ; coefficients below), blending in pd_0x4000 for masked-off columns.
   5929    punpckldq           m13, m2
   5930    punpckldq           m15, m4
   5931    punpcklqdq          m13, m15
   5932    pxor                 m2, m2
   5933    pcmpeqd              m0, m2
   5934 %if ARCH_X86_64
   5935    pand                 m9, m0
   5936 %else
   5937    pand                 m2, m9, m0
   5938 %define m9 m2
   5939    SWAP                 m7, m4
   5940 %endif
   5941    pandn                m0, m13
   5942 %if ARCH_X86_64
   5943    SWAP                m13, m0
   5944 %else
   5945 %define m13 m0
   5946 %endif
   5947    por                 m13, m9
   5948    punpckhbw           m15, m13, m13
   5949    punpcklbw           m13, m13
   5950    psraw               m15, 8
   5951    psraw               m13, 8
        ; Build the two pshufb gather masks; r4 receives the byte offset of
        ; the upper source half (read from the top byte of m14).
   5952    pshufb              m12, m14, m10
   5953    pshufb              m14, m11
   5954    mova                m10, [base+spel_s_shuf2]
   5955    movd                r4d, m14
   5956    shr                 r4d, 24
   5957 %if ARCH_X86_32
   5958    mova         [stk+0x40], m13
   5959    mova         [stk+0x50], m15
   5960    pxor                 m2, m2
   5961 %endif
   5962    pshufb               m7, m14, m2
   5963    psubb               m14, m7
   5964    paddb               m12, m10
   5965    paddb               m14, m10
   5966 %if ARCH_X86_64
        ; Horizontally filter rows 0-6 (each row = low half via m12/m13 plus
        ; offset upper half via m14/m15), then round/shift and pack into
        ; word rows 0..6.
   5967    lea                  r6, [r4+ssq*1]
   5968    lea                 r11, [r4+ssq*2]
   5969    lea                 r13, [r4+ss3q ]
   5970    movu                 m7, [srcq+ssq*0]
   5971    movu                 m9, [srcq+ssq*1]
   5972    movu                 m8, [srcq+ssq*2]
   5973    movu                m10, [srcq+ss3q ]
   5974    movu                 m1, [srcq+r4   ]
   5975    movu                 m3, [srcq+r6   ]
   5976    movu                 m2, [srcq+r11  ]
   5977    movu                 m4, [srcq+r13  ]
   5978    lea                srcq, [srcq+ssq*4]
   5979    REPX    {pshufb x, m12}, m7, m9, m8, m10
   5980    REPX   {pmaddwd x, m13}, m7, m9, m8, m10
   5981    REPX    {pshufb x, m14}, m1, m3, m2, m4
   5982    REPX   {pmaddwd x, m15}, m1, m3, m2, m4
   5983    mova                 m5, [rsp+0x10]
   5984    movd                xm6, [rsp+0x20]
   5985    phaddd               m7, m1
   5986    phaddd               m9, m3
   5987    phaddd               m8, m2
   5988    phaddd              m10, m4
   5989    movu                 m1, [srcq+ssq*0]
   5990    movu                 m2, [srcq+ssq*1]
   5991    movu                 m3, [srcq+ssq*2]
   5992    REPX      {paddd x, m5}, m7, m9, m8, m10
   5993    REPX     {psrad x, xm6}, m7, m9, m8, m10
   5994    packssdw             m7, m9  ; 0 1
   5995    packssdw             m8, m10 ; 2 3
   5996    movu                 m0, [srcq+r4   ]
   5997    movu                 m9, [srcq+r6   ]
   5998    movu                m10, [srcq+r11  ]
   5999    add                srcq, ss3q
   6000    REPX    {pshufb x, m12}, m1, m2, m3
   6001    REPX   {pmaddwd x, m13}, m1, m2, m3
   6002    REPX    {pshufb x, m14}, m0, m9, m10
   6003    REPX   {pmaddwd x, m15}, m0, m9, m10
   6004    phaddd               m1, m0
   6005    phaddd               m2, m9
   6006    phaddd               m3, m10
        ; Resolve the constant vertical filter (identity 64<<24 when the
        ; sub-pel offset is zero).
   6007    shr                 myd, 6
   6008    mov                r13d, 64 << 24
   6009    lea                 myd, [t1+myq]
   6010    cmovnz             r13q, [base+subpel_filters+myq*8]
   6011    REPX      {paddd x, m5}, m1, m2, m3
   6012    REPX     {psrad x, xm6}, m1, m2, m3
   6013    packssdw             m1, m2 ; 4 5
   6014    packssdw             m3, m3 ; 6 6
   6015    SWAP                 m9, m1
        ; Interleave consecutive rows into pmaddwd word pairs 01..56 and spill
        ; part of the pipeline to the stack for the loop.
   6016    shufps               m4, m7, m8, q1032  ; 1 2
   6017    shufps               m5, m8, m9, q1032  ; 3 4
   6018    shufps               m6, m9, m3, q1032  ; 5 6
   6019    punpcklwd            m0, m7, m4 ; 01
   6020    punpckhwd            m7, m4     ; 12
   6021    punpcklwd            m1, m8, m5 ; 23
   6022    punpckhwd            m8, m5     ; 34
   6023    punpcklwd            m2, m9, m6 ; 45
   6024    punpckhwd            m9, m6     ; 56
   6025    movq                m10, r13
   6026    mova         [stk+0x00], m1
   6027    mova         [stk+0x10], m8
   6028    mova         [stk+0x20], m2
   6029    mova         [stk+0x30], m9
   6030    mova         [stk+0x40], m3
   6031 %define hrnd_mem [rsp+0x10]
   6032 %define hsh_mem  [rsp+0x20]
   6033 %define vsh_mem  [rsp+0x28]
   6034 %if isput
   6035  %define vrnd_mem [rsp+0x30]
   6036 %else
   6037  %define vrnd_mem [base+pd_m524256]
   6038 %endif
   6039 %else
        ; x86-32 variant of the same setup, using the MC_4TAP_SCALED_H helper
        ; macro for rows 0-5 and stack slots instead of spare XMM registers.
   6040    mova         [stk+0x20], m12
   6041    mova         [stk+0x30], m14
   6042    add                  r4, srcq
   6043    MC_4TAP_SCALED_H   0x60 ; 0 1
   6044    MC_4TAP_SCALED_H   0x70 ; 2 3
   6045    MC_4TAP_SCALED_H   0x80 ; 4 5
   6046    movu                 m7, [srcq]
   6047    movu                 m2, [r4]
   6048    add                srcq, ssq
   6049    add                  r4, ssq
   6050    mov          [stk+0xb0], r4
   6051    pshufb               m7, m12
   6052    pshufb               m2, m14
   6053    pmaddwd              m7, m13
   6054    pmaddwd              m2, m15
   6055    phaddd               m7, m2
   6056    paddd                m7, [esp+0x00]
   6057    psrad                m7, [esp+0x10]
   6058    packssdw             m7, m7 ; 6 6
   6059    mova                 m4, [stk+0x60]
   6060    mova                 m5, [stk+0x70]
   6061    mova                 m6, [stk+0x80]
   6062    mov                 myd, mym
   6063    mov                  rX, [esp+0x1f4]
   6064    xor                  r5, r5
   6065    shr                 myd, 6
   6066    lea                  rX, [rX+myd]
   6067    mov                  r4, 64 << 24
   6068    cmovnz               r4, [base+subpel_filters+rX*8+0]
   6069    cmovnz               r5, [base+subpel_filters+rX*8+4]
   6070    mov                  r3, r3m
   6071    shufps               m1, m4, m5, q1032 ; 1 2
   6072    shufps               m2, m5, m6, q1032 ; 3 4
   6073    shufps               m3, m6, m7, q1032 ; 5 6
   6074    mova         [stk+0xa0], m7
   6075    punpcklwd            m0, m4, m1         ; 01
   6076    punpckhwd            m4, m1             ; 12
   6077    punpcklwd            m1, m5, m2         ; 23
   6078    punpckhwd            m5, m2             ; 34
   6079    punpcklwd            m2, m6, m3         ; 45
   6080    punpckhwd            m6, m3             ; 56
   6081    movd                 m7, r4
   6082    movd                 m3, r5
   6083    mov                  r0, r0m
   6084 %if isput
   6085    mov                  r1, r1m
   6086 %endif
   6087    mov                  r4, [stk+0xb0]
   6088    mova         [stk+0xc0], m4 ; 12
   6089    mova         [stk+0x60], m1 ; 23
   6090    mova         [stk+0x70], m2 ; 45
   6091    mova         [stk+0x80], m5 ; 34
   6092    mova         [stk+0x90], m6 ; 56
   6093 %define m12 [stk+0x20]
   6094 %define m14 [stk+0x30]
   6095 %define m13 [stk+0x40]
   6096 %define m15 [stk+0x50]
   6097 %define hrnd_mem [esp+0x00]
   6098 %define hsh_mem  [esp+0x10]
   6099 %define vsh_mem  [esp+0x18]
   6100 %if isput
   6101  %define vrnd_mem [esp+0x20]
   6102 %else
   6103  %define vrnd_mem [base+pd_m524256]
   6104 %endif
   6105 %define m10 m7
   6106    punpckldq           m10, m3
   6107 %endif
        ; Expand vertical taps to words: m3 = taps 0/1, m4 = 2/3, m5 = 4/5,
        ; m10 = 6/7 (x86-32 spills them to [stk+0x100..0x130]).
   6108    punpcklbw           m10, m10
   6109    psraw               m10, 8
   6110    pshufd               m3, m10, q0000
   6111    pshufd               m4, m10, q1111
   6112    pshufd               m5, m10, q2222
   6113    pshufd              m10, m10, q3333
   6114 %if ARCH_X86_32
   6115 %xdefine m8  m3
   6116 %xdefine m9  m6
   6117 %xdefine m11 m5
   6118 %xdefine m6  m4
   6119    mova         [stk+0x100], m3
   6120    mova         [stk+0x110], m4
   6121    mova         [stk+0x120], m5
   6122    mova         [stk+0x130], m10
   6123 %define m3  [stk+0x100]
   6124 %define m4  [stk+0x110]
   6125 %define m5  [stk+0x120]
   6126 %define m10 [stk+0x130]
   6127    mova                 m7, [stk+0xc0]
   6128    mova                 m8, [stk+0x80]
   6129 %endif
   6130 .dy1_w4_loop:
        ; Per 2 output rows: vertically accumulate the stored row pairs with
        ; the constant taps while horizontally filtering 2 new source rows
        ; (rows "7 8"), then rotate the 01..78 pair pipeline through the
        ; stack slots for the next iteration.
   6131    movu                m11, [srcq+ssq*0]
   6132    movu                 m6, [srcq+ssq*1]
   6133    pmaddwd              m0, m3
   6134    pmaddwd              m7, m3
   6135    pmaddwd              m1, m4
   6136    pmaddwd              m8, m4
   6137    pmaddwd              m2, m5
   6138    pmaddwd              m9, m5
   6139    paddd                m1, m0
   6140    paddd                m8, m7
   6141 %if ARCH_X86_64
   6142    movu                 m0, [srcq+r4]
   6143    movu                 m7, [srcq+r6]
   6144 %else
   6145    movu                 m0, [r4+ssq*0]
   6146    movu                 m7, [r4+ssq*1]
   6147    lea                  r4, [r4+ssq*2]
   6148 %endif
   6149    lea                srcq, [srcq+ssq*2]
   6150    paddd                m1, m2
   6151    paddd                m8, m9
   6152    pshufb              m11, m12
   6153    pshufb               m6, m12
   6154    pmaddwd             m11, m13
   6155    pmaddwd              m6, m13
   6156    pshufb               m0, m14
   6157    pshufb               m7, m14
   6158    pmaddwd              m0, m15
   6159    pmaddwd              m7, m15
   6160    phaddd              m11, m0
   6161    phaddd               m6, m7
   6162    paddd               m11, hrnd_mem
   6163    paddd                m6, hrnd_mem
   6164    psrad               m11, hsh_mem
   6165    psrad                m6, hsh_mem
   6166    packssdw            m11, m6                     ; 7 8
   6167 %if ARCH_X86_64
   6168    shufps               m9, [stk+0x40], m11, q1032 ; 6 7
   6169    mova                 m0, [stk+0x00]
   6170    mova         [stk+0x40], m11
   6171 %else
   6172    shufps               m9, [stk+0xa0], m11, q1032 ; 6 7
   6173    mova                 m0, [stk+0x60]
   6174    mova         [stk+0xa0], m11
   6175 %endif
   6176    punpcklwd            m2, m9, m11 ; 67
   6177    punpckhwd            m9, m11     ; 78
   6178    pmaddwd              m6, m2, m10
   6179    pmaddwd              m7, m9, m10
   6180 %if isput
   6181    movd                m11, vsh_mem
   6182 %endif
   6183    paddd                m1, vrnd_mem
   6184    paddd                m8, vrnd_mem
   6185    paddd                m1, m6
   6186    paddd                m8, m7
   6187 %if ARCH_X86_64
   6188    mova                 m7, [stk+0x10]
   6189 %else
   6190    mova                 m7, [stk+0x80]
   6191 %endif
        ; put: variable vertical shift + clamp to [0, pixel_max];
        ; prep: fixed shift by 6, signed intermediate output.
   6192 %if isput
   6193    psrad                m1, m11
   6194    psrad                m8, m11
   6195 %else
   6196    psrad                m1, 6
   6197    psrad                m8, 6
   6198 %endif
   6199    packssdw             m1, m8
   6200 %if ARCH_X86_64
   6201    mova                 m8, [stk+0x30]
   6202 %else
   6203    mova                 m8, [stk+0x90]
   6204 %endif
   6205 %if isput
   6206    pxor                 m6, m6
   6207    pmaxsw               m1, m6
   6208    pminsw               m1, pxmaxm
   6209    movq       [dstq+dsq*0], m1
   6210    movhps     [dstq+dsq*1], m1
   6211    lea                dstq, [dstq+dsq*2]
   6212 %else
   6213    mova             [tmpq], m1
   6214    add                tmpq, 16
   6215 %endif
   6216 %if ARCH_X86_64
   6217    mova                 m1, [stk+0x20]
   6218    mova         [stk+0x10], m8
   6219    mova         [stk+0x00], m1
   6220    mova         [stk+0x20], m2
   6221    mova         [stk+0x30], m9
   6222 %else
   6223    mova                 m1, [stk+0x70]
   6224    mova         [stk+0x80], m8
   6225    mova         [stk+0x60], m1
   6226    mova         [stk+0x70], m2
   6227    mova         [stk+0x90], m9
   6228 %endif
   6229    sub                  hd, 2
   6230    jg .dy1_w4_loop
   6231    MC_8TAP_SCALED_RET ; why not jz .ret?
   6232 INIT_XMM ssse3
        ; Widths >= 8 are processed as horizontal strips of 8 columns:
        ; [stk+0xf0] holds the strip count (width/8) consumed by
        ; .dy1_hloop_prep, and tmp_stridem (prep only, via movifprep) holds
        ; the intermediate-buffer stride -- width*2 bytes per row for the
        ; 16-bit intermediates.  All five entry points share .dy1_w_start.
   6233 .dy1_w8:
   6234    mov    dword [stk+0xf0], 1
   6235    movifprep   tmp_stridem, 16
   6236    jmp .dy1_w_start
   6237 .dy1_w16:
   6238    mov    dword [stk+0xf0], 2
   6239    movifprep   tmp_stridem, 32
   6240    jmp .dy1_w_start
   6241 .dy1_w32:
   6242    mov    dword [stk+0xf0], 4
   6243    movifprep   tmp_stridem, 64
   6244    jmp .dy1_w_start
   6245 .dy1_w64:
   6246    mov    dword [stk+0xf0], 8
   6247    movifprep   tmp_stridem, 128
   6248    jmp .dy1_w_start
   6249 .dy1_w128:
   6250    mov    dword [stk+0xf0], 16
   6251    movifprep   tmp_stridem, 256
   6252 .dy1_w_start:
        ; Common setup for the strip loop: resolve the constant vertical
        ; filter once (dy == 1), broadcast its taps into four word-pair
        ; registers, and save srcq / dstq(tmpq) / dx*4 / mx state to the
        ; stack so .dy1_hloop_prep can rewind per strip.
   6253    mov                 myd, mym
   6254 %if ARCH_X86_64
   6255 %ifidn %1, put
   6256    movifnidn           dsm, dsq
   6257 %endif
   6258    mova         [rsp+0x10], m11
   6259    mova         [rsp+0x20], m12
   6260 %define hround m11
   6261 %if isput
   6262    mova         [rsp+0x30], m13
   6263 %else
   6264    mova                m13, [base+pd_m524256]
   6265 %endif
   6266    shr                 t0d, 16
   6267    shr                 myd, 6
        ; 64<<24 = identity filter for a zero sub-pel offset.
   6268    mov                 r4d, 64 << 24
   6269    lea                 myd, [t1+myq]
   6270    cmovnz              r4q, [base+subpel_filters+myq*8]
   6271    movd                m15, t0d
   6272 %else
   6273 %define hround [esp+0x00]
   6274 %define m12    [esp+0x10]
   6275 %define m10    [base+pd_0x3ff]
   6276 %define m8  m0
   6277 %xdefine m14 m4
   6278 %xdefine m15 m3
   6279 %if isprep
   6280  %define ssq ssm
   6281 %endif
   6282    mov                  r5, [esp+0x1f0]
   6283    mov                  r3, [esp+0x1f4]
   6284    shr                  r5, 16
   6285    movd                m15, r5
   6286    xor                  r5, r5
   6287    shr                 myd, 6
   6288    lea                  r3, [r3+myd]
   6289    mov                  r4, 64 << 24
   6290    cmovnz               r4, [base+subpel_filters+r3*8+0]
   6291    cmovnz               r5, [base+subpel_filters+r3*8+4]
   6292    mov                  r0, r0m
   6293    mov                  r3, r3m
   6294 %endif
        ; srcq -= 6: back up by 3 pixels (16-bit) for the 8-tap support.
   6295    sub                srcq, 6
   6296    pslld                m7, m8, 2 ; dx*4
   6297    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
   6298    pshufd              m15, m15, q0000
   6299    paddd               m14, m8 ; mx+dx*[0-3]
   6300 %if ARCH_X86_64
   6301    movq                 m3, r4q
   6302 %else
   6303    movd                 m5, r4
   6304    movd                 m6, r5
   6305    punpckldq            m5, m6
   6306    SWAP                 m3, m5
   6307 %endif
   6308    punpcklbw            m3, m3
   6309    psraw                m3, 8
   6310    mova        [stk+0x100], m7
   6311    mova        [stk+0x120], m15
   6312    mov         [stk+0x0f8], srcq
   6313    mov         [stk+0x130], r0q ; dstq / tmpq
        ; Broadcast vertical taps: m0 = taps 0/1 .. m3 = taps 6/7, then spill
        ; (slots differ between the 64-bit and 32-bit layouts).
   6314    pshufd               m0, m3, q0000
   6315    pshufd               m1, m3, q1111
   6316    pshufd               m2, m3, q2222
   6317    pshufd               m3, m3, q3333
   6318 %if ARCH_X86_64
   6319    mova        [stk+0x140], m0
   6320    mova        [stk+0x150], m1
   6321    mova        [stk+0x160], m2
   6322    mova        [stk+0x170], m3
   6323 %if UNIX64
   6324    mov                  hm, hd
   6325 %endif
   6326 %else
   6327    mova        [stk+0x180], m0
   6328    mova        [stk+0x190], m1
   6329    mova        [stk+0x1a0], m2
   6330    mova        [stk+0x1b0], m3
   6331    SWAP                 m5, m3
   6332    mov                  r5, hm
   6333    mov         [stk+0x134], r5
   6334 %endif
   6335    jmp .dy1_hloop
   6336 .dy1_hloop_prep:
   6337    dec   dword [stk+0x0f0]
   6338    jz .ret
   6339 %if ARCH_X86_64
   6340    add   qword [stk+0x130], 16
   6341    mov                  hd, hm
   6342 %else
   6343    add   dword [stk+0x130], 16
   6344    mov                  r5, [stk+0x134]
   6345    mov                  r0, [stk+0x130]
   6346 %endif
   6347    mova                 m7, [stk+0x100]
   6348    mova                m14, [stk+0x110]
   6349 %if ARCH_X86_64
   6350    mova                m10, [base+pd_0x3ff]
   6351    mova                m11, [rsp+0x10]
   6352 %endif
   6353    mova                m15, [stk+0x120]
   6354    mov                srcq, [stk+0x0f8]
   6355 %if ARCH_X86_64
   6356    mov                 r0q, [stk+0x130] ; dstq / tmpq
   6357 %else
   6358    mov                  hm, r5
   6359    mov                 r0m, r0
   6360    mov                  r3, r3m
   6361 %endif
   6362    paddd               m14, m7
   6363 .dy1_hloop:
   6364 %if ARCH_X86_64
   6365    mova                 m9, [base+pq_0x40000000]
   6366 %else
   6367 %define m9 [base+pq_0x40000000]
   6368 %endif
   6369    pxor                 m1, m1
   6370    psrld                m2, m14, 10
   6371    mova              [stk], m2
   6372    pand                 m6, m14, m10
   6373    psrld                m6, 6
   6374    paddd                m5, m15, m6
   6375    pcmpeqd              m6, m1
   6376    pshufd               m2, m5, q1032
   6377 %if ARCH_X86_64
   6378    movd                r4d, m5
   6379    movd                r6d, m2
   6380    pshufd               m5, m5, q0321
   6381    pshufd               m2, m2, q0321
   6382    movd                r7d, m5
   6383    movd                r9d, m2
   6384    movq                 m0, [base+subpel_filters+r4*8]
   6385    movq                 m1, [base+subpel_filters+r6*8]
   6386    movhps               m0, [base+subpel_filters+r7*8]
   6387    movhps               m1, [base+subpel_filters+r9*8]
   6388 %else
   6389    movd                 r0, m5
   6390    movd                 rX, m2
   6391    pshufd               m5, m5, q0321
   6392    pshufd               m2, m2, q0321
   6393    movd                 r4, m5
   6394    movd                 r5, m2
   6395    movq                 m0, [base+subpel_filters+r0*8]
   6396    movq                 m1, [base+subpel_filters+rX*8]
   6397    movhps               m0, [base+subpel_filters+r4*8]
   6398    movhps               m1, [base+subpel_filters+r5*8]
   6399 %endif
   6400    paddd               m14, m7 ; mx+dx*[4-7]
   6401    pand                 m5, m14, m10
   6402    psrld                m5, 6
   6403    paddd               m15, m5
   6404    pxor                 m2, m2
   6405    pcmpeqd              m5, m2
   6406    mova        [stk+0x110], m14
   6407    pshufd               m4, m15, q1032
   6408 %if ARCH_X86_64
           ; x86-64 arm: gather filters for pixels 4-7, then keep all state in
           ; registers/stack slots sized for 16 XMM regs.
   6409    movd               r10d, m15
   6410    movd               r11d, m4
   6411    pshufd              m15, m15, q0321
   6412    pshufd               m4, m4, q0321
   6413    movd               r13d, m15
   6414    movd                rXd, m4
   6415    movq                 m2, [base+subpel_filters+r10*8]
   6416    movq                 m3, [base+subpel_filters+r11*8]
   6417    movhps               m2, [base+subpel_filters+r13*8]
   6418    movhps               m3, [base+subpel_filters+ rX*8]
           ; unpack the integer x offsets of all 8 pixels into GPRs
           ; (r10/r11/r13/rX = pixels 4-7, r4/r6/r7/r9 = pixels 0-3 from [stk])
   6419    psrld               m14, 10
   6420    movq                r11, m14
   6421    punpckhqdq          m14, m14
   6422    movq                 rX, m14
   6423    mov                r10d, r11d
   6424    shr                 r11, 32
   6425    mov                r13d, rXd
   6426    shr                  rX, 32
   6427    mov                 r4d, [stk+ 0]
   6428    mov                 r6d, [stk+ 4]
   6429    mov                 r7d, [stk+ 8]
   6430    mov                 r9d, [stk+12]
           ; duplicate zero-index masks per filter pair, then blend: lanes with
           ; filter index 0 take the 0x40000000 constant (m9), others the
           ; gathered table coefficients (m0-m3)
   6431    pshufd               m4, m6, q1100
   6432    pshufd               m6, m6, q3322
   6433    pshufd              m14, m5, q1100
   6434    pshufd               m5, m5, q3322
   6435    pand                 m7, m9, m4
   6436    pand                 m8, m9, m6
   6437    pand                m15, m9, m14
   6438    pand                 m9, m9, m5
   6439    pandn                m4, m0
   6440    pandn                m6, m1
   6441    pandn               m14, m2
   6442    pandn                m5, m3
   6443    por                  m7, m4
   6444    por                  m8, m6
   6445    por                 m15, m14
   6446    por                  m9, m5
           ; widen packed int8 taps to int16 (punpck self + arithmetic >>8
           ; is a sign extension)
   6447    punpcklbw            m0, m7, m7
   6448    punpckhbw            m7, m7
   6449    punpcklbw            m1, m8, m8
   6450    punpckhbw            m8, m8
   6451    psraw                m0, 8
   6452    psraw                m7, 8
   6453    psraw                m1, 8
   6454    psraw                m8, 8
   6455    punpcklbw            m2, m15, m15
   6456    punpckhbw           m15, m15
   6457    punpcklbw            m3, m9, m9
   6458    punpckhbw            m9, m9
   6459    psraw                m2, 8
   6460    psraw               m15, 8
   6461    psraw                m3, 8
   6462    psraw                m9, 8
           ; stash the eight int16 H-filter vectors for reuse on each new row
   6463    mova         [stk+0x10], m0
   6464    mova         [stk+0x20], m7
   6465    mova         [stk+0x30], m1
   6466    mova         [stk+0x40], m8
   6467    mova         [stk+0x50], m2
   6468    mova         [stk+0x60], m15
   6469    mova         [stk+0x70], m3
   6470    mova         [stk+0x80], m9
           ; horizontally filter the first 8 source rows
           ; (MC_8TAP_SCALED_H is defined earlier in this file)
   6471    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
   6472    mova         [stk+0x90], m1
   6473    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
   6474    mova         [stk+0xa0], m2
   6475    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
   6476    mova         [stk+0xb0], m3
   6477    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
   6478    mova         [stk+0xc0], m4
   6479    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
   6480    mova         [stk+0xd0], m5
   6481    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
   6482    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
   6483    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
           ; interleave adjacent filtered rows into the 01/23/45/67 word pairs
           ; that pmaddwd consumes in the vertical loop below
   6484    mova                 m5, [stk+0xd0]
   6485    mova                 m1, [stk+0x90]
   6486    mova                 m2, [stk+0xa0]
   6487    mova                 m3, [stk+0xb0]
   6488    mova                 m9, [stk+0xc0]
   6489    punpcklwd            m4, m5, m6 ; 45a
   6490    punpckhwd            m5, m6     ; 45b
   6491    punpcklwd            m6, m7, m8 ; 67a
   6492    punpckhwd            m7, m8     ; 67b
   6493    punpcklwd            m0, m1, m2 ; 01a
   6494    punpckhwd            m1, m2     ; 01b
   6495    punpcklwd            m2, m3, m9 ; 23a
   6496    punpckhwd            m3, m9     ; 23b
           ; m10/m11/m14/m15 = vertical tap pairs (stored earlier in the macro)
   6497    mova                m10, [stk+0x140]
   6498    mova                m11, [stk+0x150]
   6499    mova                m14, [stk+0x160]
   6500    mova                m15, [stk+0x170]
   6501    mova         [stk+0x90], m4
   6502    mova         [stk+0xa0], m5
   6503    mova         [stk+0xb0], m6
   6504    mova         [stk+0xc0], m7
   6505 %define hround [rsp+0x10]
   6506 %define shift  [rsp+0x20]
   6507 %if isput
   6508  %define vround [rsp+0x30]
   6509 %else
   6510  %define vround [base+pd_m524256]
   6511 %endif
           ; vertical 8-tap: pairs 01/23 live in m0-m3, 45/67 on the stack;
           ; m13 holds the vertical rounding constant
   6512 .dy1_vloop:
   6513    pmaddwd              m4, m0, m10
   6514    pmaddwd              m5, m1, m10
   6515    pmaddwd              m6, m2, m11
   6516    pmaddwd              m7, m3, m11
   6517    paddd                m4, m13
   6518    paddd                m5, m13
   6519    paddd                m4, m6
   6520    paddd                m5, m7
   6521    pmaddwd              m6, [stk+0x90], m14
   6522    pmaddwd              m7, [stk+0xa0], m14
   6523    pmaddwd              m8, [stk+0xb0], m15
   6524    pmaddwd              m9, [stk+0xc0], m15
   6525    paddd                m4, m6
   6526    paddd                m5, m7
   6527 %if isput
           ; m6 = vertical shift amount (upper half of m12) for the put path
   6528    pshufd               m6, m12, q1032
   6529 %endif
   6530    paddd                m4, m8
   6531    paddd                m5, m9
   6532 %else
           ; x86-32 arm of the same strip setup: only 8 XMM registers, so
           ; filter coefficients and interleaved row pairs are kept on the
           ; stack ([stk+...]) instead of in registers.
   6533    movd                 r0, m15
   6534    movd                 rX, m4
   6535    pshufd              m15, m15, q0321
   6536    pshufd               m4, m4, q0321
   6537    movd                 r4, m15
   6538    movd                 r5, m4
   6539    mova                m14, [stk+0x110]
   6540    movq                 m2, [base+subpel_filters+r0*8]
   6541    movq                 m3, [base+subpel_filters+rX*8]
   6542    movhps               m2, [base+subpel_filters+r4*8]
   6543    movhps               m3, [base+subpel_filters+r5*8]
           ; integer x offsets of pixels 4-7 -> [stk+16]; pixels 0-3 -> GPRs
   6544    psrld               m14, 10
   6545    mova           [stk+16], m14
   6546    mov                  r0, [stk+ 0]
   6547    mov                  rX, [stk+ 4]
   6548    mov                  r4, [stk+ 8]
   6549    mov                  r5, [stk+12]
   6550    mova         [stk+0x20], m0
   6551    mova         [stk+0x30], m1
   6552    mova         [stk+0x40], m2
   6553    mova         [stk+0x50], m3
           ; blend 0x40000000 (m9) into lanes whose filter index was 0
   6554    pshufd               m4, m6, q1100
   6555    pshufd               m6, m6, q3322
   6556    pshufd               m7, m5, q1100
   6557    pshufd               m5, m5, q3322
   6558    pand                 m0, m9, m4
   6559    pand                 m1, m9, m6
   6560    pand                 m2, m9, m7
   6561    pand                 m3, m9, m5
   6562    pandn                m4, [stk+0x20]
   6563    pandn                m6, [stk+0x30]
   6564    pandn                m7, [stk+0x40]
   6565    pandn                m5, [stk+0x50]
   6566    por                  m0, m4
   6567    por                  m1, m6
   6568    por                  m2, m7
   6569    por                  m3, m5
           ; sign-extend int8 taps to int16 (punpck self + psraw 8)
   6570    punpcklbw            m4, m0, m0
   6571    punpckhbw            m0, m0
   6572    punpcklbw            m5, m1, m1
   6573    punpckhbw            m1, m1
   6574    psraw                m4, 8
   6575    psraw                m0, 8
   6576    psraw                m5, 8
   6577    psraw                m1, 8
   6578    punpcklbw            m6, m2, m2
   6579    punpckhbw            m2, m2
   6580    punpcklbw            m7, m3, m3
   6581    punpckhbw            m3, m3
   6582    psraw                m6, 8
   6583    psraw                m2, 8
   6584    psraw                m7, 8
   6585    psraw                m3, 8
           ; spill the int16 H-filter vectors for reuse on each new row
   6586    mova        [stk+0x0a0], m4
   6587    mova        [stk+0x0b0], m0
   6588    mova        [stk+0x0c0], m5
   6589    mova        [stk+0x0d0], m1
   6590    mova        [stk+0x140], m6
   6591    mova        [stk+0x150], m2
   6592    mova        [stk+0x160], m7
   6593    mova        [stk+0x170], m3
           ; horizontally filter the first 8 source rows (stack-slot variant
           ; of MC_8TAP_SCALED_H, defined earlier in this file)
   6594    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
   6595    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
   6596    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
   6597    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
   6598    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
   6599    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
   6600    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
   6601    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
           ; interleave rows into 45/67 pairs, back to the stack
   6602    mova                 m5, [stk+0x60]
   6603    mova                 m6, [stk+0x70]
   6604    mova                 m7, [stk+0x80]
   6605    mova                 m0, [stk+0x90]
   6606    mov                  r0, r0m
   6607    punpcklwd            m4, m5, m6      ; 45a
   6608    punpckhwd            m5, m6          ; 45b
   6609    punpcklwd            m6, m7, m0      ; 67a
   6610    punpckhwd            m7, m0          ; 67b
   6611    mova         [stk+0x60], m4
   6612    mova         [stk+0x70], m5
   6613    mova         [stk+0x80], m6
   6614    mova         [stk+0x90], m7
           ; and the 01/23 pairs, kept in m0-m3 with copies on the stack
   6615    mova                 m1, [stk+0x20]
   6616    mova                 m2, [stk+0x30]
   6617    mova                 m3, [stk+0x40]
   6618    mova                 m4, [stk+0x50]
   6619    punpcklwd            m0, m1, m2      ; 01a
   6620    punpckhwd            m1, m2          ; 01b
   6621    punpcklwd            m2, m3, m4      ; 23a
   6622    punpckhwd            m3, m4          ; 23b
           ; m4-m7 = vertical tap pairs (stored earlier in the macro)
   6623    mova                 m4, [stk+0x180]
   6624    mova                 m5, [stk+0x190]
   6625    mova                 m6, [stk+0x1a0]
   6626    mova                 m7, [stk+0x1b0]
   6627    mova         [stk+0x20], m0
   6628    mova         [stk+0x30], m1
   6629    mova         [stk+0x40], m2
   6630    mova         [stk+0x50], m3
           ; vertical 8-tap (x86-32): 01/23 pairs in m0-m3, 45/67 on stack
   6631 .dy1_vloop:
   6632    pmaddwd              m0, m4
   6633    pmaddwd              m1, m4
   6634    pmaddwd              m2, m5
   6635    pmaddwd              m3, m5
   6636    paddd                m0, m2
   6637    paddd                m1, m3
   6638    pmaddwd              m2, [stk+0x60], m6
   6639    pmaddwd              m3, [stk+0x70], m6
   6640    pmaddwd              m4, [stk+0x80], m7
   6641    pmaddwd              m5, [stk+0x90], m7
   6642 %if isput
           ; m6 = vertical shift amount for the put path
   6643    movd                 m6, [esp+0x18]
   6644 %endif
   6645    paddd                m0, m2
   6646    paddd                m1, m3
   6647    paddd                m0, vrnd_mem
   6648    paddd                m1, vrnd_mem
   6649    paddd                m4, m0
   6650    paddd                m5, m1
   6651 %endif
           ; store one output row:
           ;   put  - shift by the dynamic amount in m6, pack to int16 and
           ;          clamp to [0, pixel_max] (pxmaxm)
           ;   prep - fixed >>6 into the int16 intermediate buffer
   6652 %ifidn %1, put
   6653    psrad                m4, m6
   6654    psrad                m5, m6
   6655    packssdw             m4, m5
   6656    pxor                 m7, m7
   6657    pmaxsw               m4, m7
   6658    pminsw               m4, pxmaxm
   6659    mova             [dstq], m4
   6660    add                dstq, dsm
   6661 %else
   6662    psrad                m4, 6
   6663    psrad                m5, 6
   6664    packssdw             m4, m5
   6665    mova             [tmpq], m4
   6666    add                tmpq, tmp_stridem
   6667 %endif
   6668    dec                  hd
   6669    jz .dy1_hloop_prep   ; all rows done -> set up the next 8-wide strip
           ; dy=1 advance: fetch one new source row (per-pixel offsets from the
           ; GPRs set up in the hloop), H-filter it with the saved coefficients,
           ; then rotate the row pairs one row down (01/23/45/67 -> 12/34/56/78)
           ; using the [base+unpckw] shuffle, and loop back to the vertical tap.
   6670 %if ARCH_X86_64
   6671    movu                 m8, [srcq+r10*2]
   6672    movu                 m9, [srcq+r11*2]
   6673    movu                m12, [srcq+r13*2]
   6674    movu                m13, [srcq+ rX*2]
   6675    movu                 m4, [srcq+ r4*2]
   6676    movu                 m5, [srcq+ r6*2]
   6677    movu                 m6, [srcq+ r7*2]
   6678    movu                 m7, [srcq+ r9*2]
   6679    add                srcq, ssq
           ; apply the stored per-pixel H-filters and reduce with phaddd
   6680    pmaddwd              m8, [stk+0x50]
   6681    pmaddwd              m9, [stk+0x60]
   6682    pmaddwd             m12, [stk+0x70]
   6683    pmaddwd             m13, [stk+0x80]
   6684    pmaddwd              m4, [stk+0x10]
   6685    pmaddwd              m5, [stk+0x20]
   6686    pmaddwd              m6, [stk+0x30]
   6687    pmaddwd              m7, [stk+0x40]
   6688    phaddd               m8, m9
   6689    phaddd              m12, m13
   6690    mova                 m9, [base+unpckw]
   6691    mova                m13, hround
   6692    phaddd               m4, m5
   6693    phaddd               m6, m7
   6694    phaddd               m8, m12
   6695    phaddd               m4, m6
   6696    pshufd               m5, m9, q1032
   6697    pshufb               m0, m9             ; 0a 1a
   6698    pshufb               m1, m9             ; 0b 1b
   6699    pshufb               m2, m5             ; 3a 2a
   6700    pshufb               m3, m5             ; 3b 2b
           ; horizontal rounding + shift of the new row, pack to int16
   6701    mova                m12, shift
   6702    paddd                m4, m13
   6703    paddd                m8, m13
   6704    psrad                m4, m12
   6705    psrad                m8, m12
   6706    packssdw             m4, m8
   6707    pshufb               m6, [stk+0x90], m9 ; 4a 5a
   6708    pshufb               m7, [stk+0xa0], m9 ; 4b 5b
   6709    pshufb               m8, [stk+0xb0], m5 ; 7a 6a
   6710    pshufb              m13, [stk+0xc0], m5 ; 7b 6b
           ; re-pair: drop row 0, append the new row 8
   6711    punpckhwd            m0, m2  ; 12a
   6712    punpckhwd            m1, m3  ; 12b
   6713    punpcklwd            m2, m6  ; 34a
   6714    punpcklwd            m3, m7  ; 34b
   6715    punpckhwd            m6, m8  ; 56a
   6716    punpckhwd            m7, m13 ; 56b
   6717    punpcklwd            m8, m4  ; 78a
   6718    punpckhqdq           m4, m4
   6719    punpcklwd           m13, m4  ; 78b
   6720    mova         [stk+0x90], m6
   6721    mova         [stk+0xa0], m7
   6722    mova         [stk+0xb0], m8
   6723    mova         [stk+0xc0], m13
   6724    mova                m13, vround
   6725 %else
           ; x86-32: H-filter the new row via the macro, then rotate the pairs
           ; held on the stack the same way
   6726    mov                 r0m, r0
   6727    mov                  r3, r3m
   6728    mov                  r0, [stk+ 0]
   6729    mov                  rX, [stk+ 4]
   6730    mov                  r4, [stk+ 8]
   6731    mov                  r5, [stk+12]
   6732    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
   6733    mova                 m7, [base+unpckw]
   6734    pshufd               m4, m7, q1032
   6735    pshufb               m0, [stk+0x20], m7 ; 0a 1a
   6736    pshufb               m1, [stk+0x30], m7 ; 0b 1b
   6737    pshufb               m2, [stk+0x40], m4 ; 3a 2a
   6738    pshufb               m3, [stk+0x50], m4 ; 3b 2b
   6739    pshufb               m5, [stk+0x60], m7 ; 4a 5a
   6740    pshufb               m6, [stk+0x70], m7 ; 4b 5b
   6741    pshufb               m7, [stk+0x80], m4 ; 7a 6a
   6742    punpckhwd            m0, m2 ; 12a
   6743    punpckhwd            m1, m3 ; 12b
   6744    punpcklwd            m2, m5 ; 34a
   6745    punpcklwd            m3, m6 ; 34b
   6746    mova         [stk+0x20], m0
   6747    mova         [stk+0x30], m1
   6748    mova         [stk+0x40], m2
   6749    mova         [stk+0x50], m3
   6750    punpckhwd            m5, m7 ; 56a
   6751    mova         [stk+0x60], m5
   6752    pshufb               m5, [stk+0x90], m4 ; 7b 6b
   6753    punpcklwd            m7, [stk+0xe0] ; 78a
   6754    mova                 m4, [stk+0x180]
   6755    punpckhwd            m6, m5 ; 56b
   6756    mova         [stk+0x70], m6
   6757    movq                 m6, [stk+0xe8]
   6758    mova         [stk+0x80], m7
   6759    mova                 m7, [stk+0x1b0]
   6760    punpcklwd            m5, m6 ; 78b
   6761    mova                 m6, [stk+0x1a0]
   6762    mova         [stk+0x90], m5
   6763    mova                 m5, [stk+0x190]
   6764    mov                  r0, r0m
   6765 %endif
   6766    jmp .dy1_vloop
   6767 INIT_XMM ssse3
   6768 %if ARCH_X86_64
   6769 %define stk rsp+0x20
   6770 %endif
           ; dy=2 entry point: dispatch on block width via the per-width
           ; dy2 jump table (w held in wq on entry)
   6771 .dy2:
   6772    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
   6773    add                  wq, base_reg
   6774    jmp                  wq
           ; width-2, dy=2 path (put only): each output row consumes two
           ; source rows, so rows are loaded with a stride step of 2*ssq.
   6775 %if isput
   6776 .dy2_w2:
   6777 %if ARCH_X86_64
   6778    mov                 myd, mym
   6779    mova         [rsp+0x10], m13
   6780  %define vrnd_mem [rsp+0x10]
   6781    movzx               t0d, t0b
   6782    sub                srcq, 2
   6783    movd                m15, t0d
   6784 %else
           ; x86-32 register aliasing: emulate the high XMM regs with low
           ; ones / stack slots
   6785  %define m8  m0
   6786  %define m9  m1
   6787  %define m14 m4
   6788  %define m15 m3
   6789  %define m11 [esp+0x00]
   6790  %define m12 [esp+0x10]
   6791  %define vrnd_mem [esp+0x20]
   6792    mov                  r1, r1m
   6793    movzx                r5, byte [esp+0x1f0]
   6794    sub                srcq, 2
   6795    movd                m15, r5
   6796 %endif
           ; build mx+dx*[0-1] and split into filter index + integer offset
   6797    pxor                 m9, m9
   6798    punpckldq            m9, m8
   6799    paddd               m14, m9 ; mx+dx*[0-1]
   6800 %if ARCH_X86_64
   6801    mova                 m9, [base+pd_0x4000]
   6802 %endif
   6803    pshufd              m15, m15, q0000
   6804    pand                 m8, m14, m10
   6805    psrld                m8, 6
   6806    paddd               m15, m8
   6807    movd                r4d, m15
   6808    pshufd              m15, m15, q0321
   6809 %if ARCH_X86_64
   6810    movd                r6d, m15
   6811 %else
   6812    movd                r3d, m15
   6813 %endif
           ; fetch the two 4-tap H filters (middle 4 bytes of each 8-tap entry)
   6814    mova                 m5, [base+bdct_lb_q]
   6815    mova                 m6, [base+spel_s_shuf2]
   6816    movd                m15, [base+subpel_filters+r4*8+2]
   6817 %if ARCH_X86_64
   6818    movd                 m7, [base+subpel_filters+r6*8+2]
   6819 %else
   6820    movd                 m7, [base+subpel_filters+r3*8+2]
   6821 %endif
   6822    pxor                 m2, m2
   6823    pcmpeqd              m8, m2  ; mask: filter index == 0
   6824    psrld               m14, 10
   6825    paddd               m14, m14
   6826 %if ARCH_X86_32
   6827    mov                  r3, r3m
   6828    pshufb              m14, m5
   6829    paddb               m14, m6
   6830    mova              [stk], m14
   6831    SWAP                 m5, m0
   6832    SWAP                 m6, m3
   6833  %define m15 m6
   6834 %endif
           ; load even rows 0/2/4; odd rows follow per-arch below
   6835    movu                 m0, [srcq+ssq*0]
   6836    movu                 m1, [srcq+ssq*2]
   6837    movu                 m2, [srcq+ssq*4]
   6838    punpckldq           m15, m7
   6839 %if ARCH_X86_64
   6840    pshufb              m14, m5
   6841    paddb               m14, m6
           ; blend pd_0x4000 into zero-index lanes of the H filter
   6842    pand                 m9, m8
   6843    pandn                m8, m15
   6844    SWAP                m15, m8
   6845    por                 m15, m9
   6846    movu                 m4, [srcq+ssq*1]
   6847    movu                 m5, [srcq+ss3q ]
   6848    lea                srcq, [srcq+ssq*4]
   6849    movu                 m6, [srcq+ssq*1]
   6850    lea                srcq, [srcq+ssq*2]
           ; select the vertical 8-tap filter from my; shr sets ZF, so a
           ; zero index keeps the 64<<24 default via cmovnz
   6851    shr                 myd, 6
   6852    mov                 r4d, 64 << 24
   6853    lea                 myd, [t1+myq]
   6854    cmovnz              r4q, [base+subpel_filters+myq*8]
   6855 %else
   6856    pand                 m7, m5, [base+pd_0x4000]
   6857    pandn                m5, m15
   6858    por                  m5, m7
   6859  %define m15 m5
   6860    mov                 myd, mym
   6861    mov                  r5, [esp+0x1f4]
   6862    xor                  r3, r3
   6863    shr                 myd, 6
   6864    lea                  r5, [r5+myd]
   6865    mov                  r4, 64 << 24
   6866    cmovnz               r4, [base+subpel_filters+r5*8+0]
   6867    cmovnz               r3, [base+subpel_filters+r5*8+4]
   6868    mov          [stk+0x20], r3
   6869    mov                  r3, r3m
   6870 %endif
           ; sign-extend the int8 H taps to int16 and filter the loaded rows
   6871    punpcklbw           m15, m15
   6872    psraw               m15, 8
   6873    REPX    {pshufb x, m14}, m0, m1, m2
   6874    REPX   {pmaddwd x, m15}, m0, m1, m2
   6875 %if ARCH_X86_64
   6876    REPX    {pshufb x, m14}, m4, m5, m6
   6877    REPX   {pmaddwd x, m15}, m4, m5, m6
   6878    phaddd               m0, m1
   6879    phaddd               m1, m2
   6880    phaddd               m4, m5
   6881    phaddd               m5, m6
   6882    REPX     {paddd x, m11}, m0, m1, m4, m5
   6883    REPX     {psrad x, m12}, m0, m1, m4, m5
   6884    packssdw             m0, m1 ; 0 2 2 4
   6885    packssdw             m4, m5 ; 1 3 3 5
   6886    SWAP                 m2, m4
   6887    movq                m10, r4
   6888 %else
   6889    mova         [stk+0x10], m15
   6890    phaddd               m0, m1
   6891    phaddd               m1, m2
   6892    movu                 m2, [srcq+ssq*1]
   6893    movu                 m7, [srcq+ss3q ]
   6894    lea                srcq, [srcq+ssq*4]
   6895    movu                 m6, [srcq+ssq*1]
   6896    lea                srcq, [srcq+ssq*2]
   6897    REPX    {pshufb x, m14}, m2, m7, m6
   6898    REPX   {pmaddwd x, m15}, m2, m7, m6
   6899  %define m14 [stk+0x00]
   6900  %define m15 [stk+0x10]
   6901    phaddd               m2, m7
   6902    phaddd               m7, m6
   6903    REPX     {paddd x, m11}, m0, m1, m2, m7
   6904    REPX     {psrad x, m12}, m0, m1, m2, m7
   6905    packssdw             m0, m1
   6906    packssdw             m2, m7
   6907  %define m8  m6
   6908  %define m9  m4
   6909  %define m10 m5
   6910    movd                m10, r4
   6911    movd                 m9, [stk+0x20]
   6912    punpckldq           m10, m9
   6913 %endif
           ; widen the vertical taps to int16 and broadcast the four tap pairs
   6914    punpcklbw           m10, m10
   6915    psraw               m10, 8
   6916    pshufd               m7, m10, q0000
   6917    pshufd               m8, m10, q1111
   6918    pshufd               m9, m10, q2222
   6919    pshufd              m10, m10, q3333
   6920 %if ARCH_X86_32
   6921    mova         [stk+0x50], m7
   6922    mova         [stk+0x60], m8
   6923    mova         [stk+0x70], m9
   6924    mova         [stk+0x80], m10
   6925  %xdefine m13 m7
   6926  %define m7  [stk+0x50]
   6927  %define m8  [stk+0x60]
   6928  %define m9  [stk+0x70]
   6929  %define m10 [stk+0x80]
   6930 %endif
   6931    punpcklwd            m1, m0, m2    ; 01 23
   6932    punpckhwd            m3, m0, m2    ; 23 45
   6933 %if ARCH_X86_32
   6934    mov                  r4, r0m
   6935  %define dstq r4
   6936    mova         [stk+0x20], m3
   6937    mova         [stk+0x30], m0
   6938 %endif
           ; steady state: 4 new source rows in, 2 output rows out per pass
   6939 .dy2_w2_loop:
   6940    movu                 m4, [srcq+ssq*0]
   6941    movu                 m5, [srcq+ssq*1]
   6942    movu                 m6, [srcq+ssq*2]
   6943    movu                m13, [srcq+ss3q ]
   6944    lea                srcq, [srcq+ssq*4]
   6945    pmaddwd              m3, m8
   6946    REPX    {pshufb x, m14}, m4, m5, m6, m13
   6947    REPX   {pmaddwd x, m15}, m4, m5, m6, m13
   6948    phaddd               m4, m5
   6949    phaddd               m6, m13
   6950    pmaddwd              m5, m1, m7
   6951    paddd                m4, m11
   6952    paddd                m6, m11
   6953    psrad                m4, m12
   6954    psrad                m6, m12
   6955    packssdw             m4, m6 ; 6 7 8 9
   6956    paddd                m5, m3
           ; shift the row history down by two rows (dy=2)
   6957    pshufd               m3, m4, q2200
   6958    pshufd               m4, m4, q3311
   6959    palignr              m3, m0, 12 ; 4 6 6 8
   6960    palignr              m4, m2, 12 ; 5 7 7 9
   6961    mova                 m0, m3
   6962    mova                 m2, m4
   6963    punpcklwd            m1, m3, m4
   6964    punpckhwd            m3, m4
   6965    pmaddwd              m6, m1, m9
   6966    pmaddwd              m4, m3, m10
   6967    paddd                m5, vrnd_mem
   6968    paddd                m6, m4
   6969    paddd                m5, m6
           ; round/shift, pack, clamp to [0, pixel_max], store 2 rows
   6970    pshufd               m4, m12, q1032
   6971    pxor                 m6, m6
   6972    psrad                m5, m4
   6973    packssdw             m5, m5
   6974    pmaxsw               m5, m6
   6975    pminsw               m5, pxmaxm
   6976    movd       [dstq+dsq*0], m5
   6977    pshuflw              m5, m5, q1032
   6978    movd       [dstq+dsq*1], m5
   6979    lea                dstq, [dstq+dsq*2]
   6980    sub                  hd, 2
   6981    jg .dy2_w2_loop
   6982    RET
   6983 %endif
   6984 INIT_XMM ssse3
           ; width-4, dy=2 path (put and prep): per-pixel 8-tap H filter split
           ; into two 4-tap halves (low taps via m12/m13, high taps via
           ; m14/m15), vertical 8-tap applied two source rows per output row.
   6985 .dy2_w4:
   6986 %if ARCH_X86_64
   6987    mov                 myd, mym
   6988    mova         [rsp+0x10], m11
   6989    mova         [rsp+0x20], m12
   6990 %if isput
   6991    mova         [rsp+0x30], m13
   6992  %define vrnd_mem [rsp+0x30]
   6993  %define stk rsp+0x40
   6994 %else
   6995  %define vrnd_mem [base+pd_m524256]
   6996  %define stk rsp+0x30
   6997 %endif
   6998    movzx               t0d, t0b
   6999    sub                srcq, 2
   7000    movd                m15, t0d
   7001 %else
           ; x86-32 aliasing of the high XMM registers
   7002 %define m10 [base+pd_0x3ff]
   7003 %define m9  [base+pd_0x4000]
   7004 %define m8  m0
   7005 %xdefine m14 m4
   7006 %define m15 m3
   7007 %if isprep
   7008  %define ssq r3
   7009 %endif
   7010    movzx                r5, byte [esp+0x1f0]
   7011    sub                srcq, 2
   7012    movd                m15, r5
   7013 %endif
           ; mx+dx*[0-3]: integer offsets from >>10, filter index from
           ; (x & 0x3ff) >> 6
   7014    pmaddwd              m8, [base+rescale_mul]
   7015 %if ARCH_X86_64
   7016    mova                 m9, [base+pd_0x4000]
   7017 %endif
   7018    pshufd              m15, m15, q0000
   7019    paddd               m14, m8 ; mx+dx*[0-3]
   7020    pand                 m0, m14, m10
   7021    psrld                m0, 6
   7022    paddd               m15, m0
   7023    pshufd               m7, m15, q1032
           ; extract the four filter indices and fetch the middle 4 taps of
           ; each 8-tap entry (offset +2 into the table row)
   7024 %if ARCH_X86_64
   7025    movd                r4d, m15
   7026    movd               r11d, m7
   7027    pshufd              m15, m15, q0321
   7028    pshufd               m7, m7, q0321
   7029    movd                r6d, m15
   7030    movd               r13d, m7
   7031    mova                m10, [base+bdct_lb_q+ 0]
   7032    mova                m11, [base+bdct_lb_q+16]
   7033    movd                m13, [base+subpel_filters+ r4*8+2]
   7034    movd                 m2, [base+subpel_filters+ r6*8+2]
   7035    movd                m15, [base+subpel_filters+r11*8+2]
   7036    movd                 m4, [base+subpel_filters+r13*8+2]
   7037 %else
   7038    movd                 r1, m15
   7039    movd                 r4, m7
   7040    pshufd              m15, m15, q0321
   7041    pshufd               m7, m7, q0321
   7042    movd                 r3, m15
   7043    movd                 r5, m7
   7044    mova                 m5, [base+bdct_lb_q+ 0]
   7045    mova                 m6, [base+bdct_lb_q+16]
   7046    movd                 m1, [base+subpel_filters+r1*8+2]
   7047    movd                 m2, [base+subpel_filters+r3*8+2]
   7048    movd                 m3, [base+subpel_filters+r4*8+2]
   7049    movd                 m7, [base+subpel_filters+r5*8+2]
   7050    SWAP                 m4, m7
   7051    mov                  r3, r3m
   7052 %if isprep
   7053    lea                ss3q, [ssq*3]
   7054 %endif
   7055 %define m10 m5
   7056 %define m11 m6
   7057 %define m12 m1
   7058 %define m13 m1
   7059 %endif
   7060    psrld               m14, 10
   7061    paddd               m14, m14
   7062    punpckldq           m13, m2
   7063    punpckldq           m15, m4
   7064    punpcklqdq          m13, m15
   7065    pxor                 m2, m2
   7066    pcmpeqd              m0, m2  ; mask: filter index == 0
   7067 %if ARCH_X86_64
   7068    pand                 m9, m0
   7069 %else
   7070    pand                 m2, m9, m0
   7071 %define m9 m2
   7072    SWAP                 m7, m4
   7073 %endif
           ; blend pd_0x4000 into zero-index lanes, then sign-extend the
           ; int8 taps to int16 (m13 = low half, m15 = high half)
   7074    pandn                m0, m13
   7075 %if ARCH_X86_64
   7076    SWAP                m13, m0
   7077 %else
   7078 %define m13 m0
   7079 %endif
   7080    por                 m13, m9
   7081    punpckhbw           m15, m13, m13
   7082    punpcklbw           m13, m13
   7083    psraw               m15, 8
   7084    psraw               m13, 8
           ; build the two pshufb source-gather patterns; r4 = byte 3 of the
           ; shuffled offsets, used below as an extra source column offset
           ; (presumably the start of the upper filter half - see the loads)
   7085    pshufb              m12, m14, m10
   7086    pshufb              m14, m11
   7087    mova                m10, [base+spel_s_shuf2]
   7088    movd                r4d, m14
   7089    shr                 r4d, 24
   7090 %if ARCH_X86_32
   7091    mova         [stk+0x40], m13
   7092    mova         [stk+0x50], m15
   7093    pxor                 m2, m2
   7094 %endif
   7095    pshufb               m7, m14, m2
   7096    psubb               m14, m7
   7097    paddb               m12, m10
   7098    paddb               m14, m10
   7099 %if ARCH_X86_64
           ; r4/r6/r11/r13 = secondary column offset for rows 0..3
   7100    lea                  r6, [r4+ssq*1]
   7101    lea                 r11, [r4+ssq*2]
   7102    lea                 r13, [r4+ss3q ]
           ; load and H-filter the first 6 rows (both 4-tap halves each)
   7103    movu                 m1, [srcq+ssq*0]
   7104    movu                 m8, [srcq+ssq*2]
   7105    movu                 m9, [srcq+ssq*1]
   7106    movu                m10, [srcq+ss3q ]
   7107    movu                 m7, [srcq+r4   ]
   7108    movu                 m2, [srcq+r11  ]
   7109    movu                 m3, [srcq+r6   ]
   7110    movu                 m4, [srcq+r13  ]
   7111    lea                srcq, [srcq+ssq*4]
   7112    REPX    {pshufb x, m12}, m1, m9, m8, m10
   7113    REPX   {pmaddwd x, m13}, m1, m9, m8, m10
   7114    REPX    {pshufb x, m14}, m7, m3, m2, m4
   7115    REPX   {pmaddwd x, m15}, m7, m3, m2, m4
   7116    mova                 m5, [rsp+0x10]   ; horizontal rounding constant
   7117    movd                xm6, [rsp+0x20]   ; horizontal shift amount
   7118    phaddd               m1, m7
   7119    phaddd               m8, m2
   7120    phaddd               m9, m3
   7121    phaddd              m10, m4
   7122    movu                 m2, [srcq+ssq*0]
   7123    movu                 m3, [srcq+ssq*1]
   7124    REPX      {paddd x, m5}, m1, m9, m8, m10
   7125    REPX     {psrad x, xm6}, m1, m9, m8, m10
   7126    packssdw             m1, m8     ; 0 2
   7127    packssdw             m9, m10    ; 1 3
   7128    movu                 m0, [srcq+r4   ]
   7129    movu                 m8, [srcq+r6   ]
   7130    lea                srcq, [srcq+ssq*2]
   7131    REPX    {pshufb x, m12}, m2, m3
   7132    REPX   {pmaddwd x, m13}, m2, m3
   7133    REPX    {pshufb x, m14}, m0, m8
   7134    REPX   {pmaddwd x, m15}, m0, m8
   7135    phaddd               m2, m0
   7136    phaddd               m3, m8
           ; select the vertical 8-tap filter from my; zero index keeps the
           ; 64<<24 default (cmovnz tests ZF produced by the shr)
   7137    shr                 myd, 6
   7138    mov                 r9d, 64 << 24
   7139    lea                 myd, [t1+myq]
   7140    cmovnz              r9q, [base+subpel_filters+myq*8]
   7141    REPX      {paddd x, m5}, m2, m3
   7142    REPX     {psrad x, xm6}, m2, m3
   7143    packssdw             m2, m3        ; 4 5
   7144    pshufd               m3, m2, q1032 ; 5 _
   7145    punpcklwd            m0, m1, m9    ; 01
   7146    punpckhwd            m1, m9        ; 23
   7147    punpcklwd            m2, m3        ; 45
   7148    movq                m10, r9
   7149 %define hrnd_mem [rsp+0x10]
   7150 %define hsh_mem  [rsp+0x20]
   7151 %define vsh_mem  [rsp+0x28]
   7152 %if isput
   7153  %define vrnd_mem [rsp+0x30]
   7154 %else
   7155  %define vrnd_mem [base+pd_m524256]
   7156 %endif
   7157 %else
           ; x86-32: H-filter rows 0-5 via MC_4TAP_SCALED_H into stack slots
   7158    mova         [stk+0x20], m12
   7159    mova         [stk+0x30], m14
   7160    add                  r4, srcq
   7161    MC_4TAP_SCALED_H   0x60 ; 0 1
   7162    MC_4TAP_SCALED_H   0x70 ; 2 3
   7163    MC_4TAP_SCALED_H   0x80 ; 4 5
   7164    mov          [stk+0xe0], r4
   7165    mova                 m3, [base+spel_s_shuf8]
   7166    mova                 m0, [stk+0x60]
   7167    mova                 m1, [stk+0x70]
   7168    mova                 m2, [stk+0x80]
           ; vertical filter selection, same 64<<24 fallback as above
   7169    mov                 myd, mym
   7170    mov                  rX, [esp+0x1f4]
   7171    xor                  r5, r5
   7172    shr                 myd, 6
   7173    lea                  rX, [rX+myd]
   7174    mov                  r4, 64 << 24
   7175    cmovnz               r4, [base+subpel_filters+rX*8+0]
   7176    cmovnz               r5, [base+subpel_filters+rX*8+4]
   7177    mov                  r3, r3m
   7178    pshufb               m0, m3 ; 01
   7179    pshufb               m1, m3 ; 23
   7180    pshufb               m2, m3 ; 45
   7181    movd                 m7, r4
   7182    movd                 m4, r5
   7183    mov                  r5, r0m
   7184 %if isput
   7185    mov                  r1, r1m
   7186 %endif
   7187    mov                  r4, [stk+0xe0]
   7188 %define dstq r5
   7189 %define tmpq r5
   7190 %define m12 [stk+0x20]
   7191 %define m14 [stk+0x30]
   7192 %define m13 [stk+0x40]
   7193 %define m15 [stk+0x50]
   7194 %define hrnd_mem [esp+0x00]
   7195 %define hsh_mem  [esp+0x10]
   7196 %define vsh_mem  [esp+0x18]
   7197 %if isput
   7198  %define vrnd_mem [esp+0x20]
   7199 %else
   7200  %define vrnd_mem [base+pd_m524256]
   7201 %endif
   7202 %define m10 m7
   7203    punpckldq           m10, m4
   7204 %endif
           ; widen the vertical taps to int16 and broadcast the four pairs
           ; (m3/m4/m5/m10 = taps 01/23/45/67)
   7205    punpcklbw           m10, m10
   7206    psraw               m10, 8
   7207    pshufd               m3, m10, q0000
   7208    pshufd               m4, m10, q1111
   7209    pshufd               m5, m10, q2222
   7210    pshufd              m10, m10, q3333
   7211 %if ARCH_X86_32
   7212 %xdefine m8  m3
   7213 %xdefine m9  m6
   7214 %xdefine m11 m5
   7215 %xdefine m6  m4
   7216    mova         [stk+0x100], m3
   7217    mova         [stk+0x110], m4
   7218    mova         [stk+0x120], m5
   7219    mova         [stk+0x130], m10
   7220 %define m3  [stk+0x100]
   7221 %define m4  [stk+0x110]
   7222 %define m5  [stk+0x120]
   7223 %define m10 [stk+0x130]
   7224 %endif
   7225 .dy2_w4_loop:
   7226    pmaddwd              m8, m0, m3
   7227    pmaddwd              m9, m1, m3
   7228    mova                 m0, m2
   7229    pmaddwd              m1, m4
   7230    pmaddwd             m11, m2, m4
   7231    paddd                m8, vrnd_mem
   7232    paddd                m9, vrnd_mem
   7233    pmaddwd              m2, m5
   7234    paddd                m8, m1
   7235    paddd                m9, m11
   7236    paddd                m8, m2
   7237    movu                 m6, [srcq+ssq*0]
   7238    movu                 m1, [srcq+ssq*2]
   7239 %if ARCH_X86_64
   7240    movu                m11, [srcq+r4 ]
   7241    movu                 m2, [srcq+r11]
   7242 %else
   7243    movu                m11, [r4+ssq*0]
   7244    movu                 m2, [r4+ssq*2]
   7245 %endif
   7246    pshufb               m6, m12
   7247    pshufb               m1, m12
   7248    pmaddwd              m6, m13
   7249    pmaddwd              m1, m13
   7250    pshufb              m11, m14
   7251    pshufb               m2, m14
   7252    pmaddwd             m11, m15
   7253    pmaddwd              m2, m15
   7254    phaddd               m6, m11
   7255    phaddd               m1, m2
   7256    paddd                m6, hrnd_mem
   7257    paddd                m1, hrnd_mem
   7258    psrad                m6, hsh_mem
   7259    psrad                m1, hsh_mem
   7260    movu                 m7, [srcq+ssq*1]
   7261    movu                m11, [srcq+ss3q ]
   7262    packssdw             m6, m1 ; 6 8
   7263 %if ARCH_X86_64
   7264    movu                 m2, [srcq+r6 ]
   7265    movu                 m1, [srcq+r13]
   7266 %else
   7267    movu                 m2, [r4+ssq*1]
   7268    movu                 m1, [r4+ss3q ]
   7269 %endif
   7270    pshufb               m7, m12
   7271    pshufb              m11, m12
   7272    pmaddwd              m7, m13
   7273    pmaddwd             m11, m13
   7274    pshufb               m2, m14
   7275    pshufb               m1, m14
   7276    pmaddwd              m2, m15
   7277    pmaddwd              m1, m15
   7278    phaddd               m7, m2
   7279    phaddd              m11, m1
   7280    paddd                m7, hrnd_mem
   7281    paddd               m11, hrnd_mem
   7282    psrad                m7, hsh_mem
   7283    psrad               m11, hsh_mem
   7284    packssdw             m7, m11 ; 7 9
   7285 %if ARCH_X86_32
   7286    lea                  r4, [r4+ssq*4]
   7287 %endif
   7288    lea                srcq, [srcq+ssq*4]
   7289    punpcklwd            m1, m6, m7 ; 67
   7290    punpckhwd            m6, m7     ; 89
   7291    mova                 m2, m6
   7292    pmaddwd             m11, m1, m5
   7293    pmaddwd              m7, m1, m10
   7294    pmaddwd              m6, m10
   7295    paddd                m9, m11
   7296 %if isput
   7297    movd                m11, vsh_mem
   7298 %endif
   7299    paddd                m8, m7
   7300    paddd                m9, m6
   7301 %if isput
   7302    psrad                m8, m11
   7303    psrad                m9, m11
   7304    packssdw             m8, m9
   7305    pxor                 m7, m7
   7306    pmaxsw               m8, m7
   7307    pminsw               m8, pxmaxm
   7308    movq       [dstq+dsq*0], m8
   7309    movhps     [dstq+dsq*1], m8
   7310    lea                dstq, [dstq+dsq*2]
   7311 %else
   7312    psrad                m8, 6
   7313    psrad                m9, 6
   7314    packssdw             m8, m9
   7315    mova             [tmpq], m8
   7316    add                tmpq, 16
   7317 %endif
   7318    sub                  hd, 2
   7319    jg .dy2_w4_loop
   7320    MC_8TAP_SCALED_RET ; why not jz .ret?
   7321 INIT_XMM ssse3
   7322 .dy2_w8:
   7323    mov    dword [stk+0xf0], 1
   7324    movifprep   tmp_stridem, 16
   7325    jmp .dy2_w_start
   7326 .dy2_w16:
   7327    mov    dword [stk+0xf0], 2
   7328    movifprep   tmp_stridem, 32
   7329    jmp .dy2_w_start
   7330 .dy2_w32:
   7331    mov    dword [stk+0xf0], 4
   7332    movifprep   tmp_stridem, 64
   7333    jmp .dy2_w_start
   7334 .dy2_w64:
   7335    mov    dword [stk+0xf0], 8
   7336    movifprep   tmp_stridem, 128
   7337    jmp .dy2_w_start
   7338 .dy2_w128:
   7339    mov    dword [stk+0xf0], 16
   7340    movifprep   tmp_stridem, 256
   7341 .dy2_w_start:
   7342    mov                 myd, mym
   7343 %if ARCH_X86_64
   7344 %ifidn %1, put
   7345    movifnidn           dsm, dsq
   7346 %endif
   7347    mova         [rsp+0x10], m11
   7348    mova         [rsp+0x20], m12
   7349 %define hround m11
   7350 %if isput
   7351    mova         [rsp+0x30], m13
   7352 %else
   7353    mova                m13, [base+pd_m524256]
   7354 %endif
   7355    shr                 t0d, 16
   7356    shr                 myd, 6
   7357    mov                 r4d, 64 << 24
   7358    lea                 myd, [t1+myq]
   7359    cmovnz              r4q, [base+subpel_filters+myq*8]
   7360    movd                m15, t0d
   7361 %else
   7362 %define hround [esp+0x00]
   7363 %define m12    [esp+0x10]
   7364 %define m10    [base+pd_0x3ff]
   7365 %define m8  m0
   7366 %xdefine m14 m4
   7367 %xdefine m15 m3
   7368 %if isput
   7369  %define dstq r0
   7370 %else
   7371  %define tmpq r0
   7372  %define ssq ssm
   7373 %endif
   7374    mov                  r5, [esp+0x1f0]
   7375    mov                  r3, [esp+0x1f4]
   7376    shr                  r5, 16
   7377    movd                m15, r5
   7378    xor                  r5, r5
   7379    shr                 myd, 6
   7380    lea                  r3, [r3+myd]
   7381    mov                  r4, 64 << 24
   7382    cmovnz               r4, [base+subpel_filters+r3*8+0]
   7383    cmovnz               r5, [base+subpel_filters+r3*8+4]
   7384    mov                  r0, r0m
   7385    mov                  r3, r3m
   7386 %endif
   7387    sub                srcq, 6
   7388    pslld                m7, m8, 2 ; dx*4
   7389    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
   7390    pshufd              m15, m15, q0000
   7391    paddd               m14, m8 ; mx+dx*[0-3]
   7392 %if ARCH_X86_64
   7393    movq                 m3, r4q
   7394 %else
   7395    movd                 m5, r4
   7396    movd                 m6, r5
   7397    punpckldq            m5, m6
   7398    SWAP                 m3, m5
   7399 %endif
   7400    punpcklbw            m3, m3
   7401    psraw                m3, 8
   7402    mova        [stk+0x100], m7
   7403    mova        [stk+0x120], m15
   7404    mov         [stk+0x0f8], srcq
   7405    mov         [stk+0x130], r0q ; dstq / tmpq
   7406    pshufd               m0, m3, q0000
   7407    pshufd               m1, m3, q1111
   7408    pshufd               m2, m3, q2222
   7409    pshufd               m3, m3, q3333
   7410 %if ARCH_X86_64
   7411    mova        [stk+0x140], m0
   7412    mova        [stk+0x150], m1
   7413    mova        [stk+0x160], m2
   7414    mova        [stk+0x170], m3
   7415 %if UNIX64
   7416    mov                  hm, hd
   7417 %endif
   7418 %else
   7419    mova        [stk+0x180], m0
   7420    mova        [stk+0x190], m1
   7421    mova        [stk+0x1a0], m2
   7422    mova        [stk+0x1b0], m3
   7423    SWAP                 m5, m3
   7424    mov                  r5, hm
   7425    mov         [stk+0x134], r5
   7426 %endif
   7427    jmp .dy2_hloop
   7428 .dy2_hloop_prep:
   7429    dec   dword [stk+0x0f0]
   7430    jz .ret
   7431 %if ARCH_X86_64
   7432    add   qword [stk+0x130], 16
   7433    mov                  hd, hm
   7434 %else
   7435    add   dword [stk+0x130], 16
   7436    mov                  r5, [stk+0x134]
   7437    mov                  r0, [stk+0x130]
   7438 %endif
   7439    mova                 m7, [stk+0x100]
   7440    mova                m14, [stk+0x110]
   7441 %if ARCH_X86_64
   7442    mova                m10, [base+pd_0x3ff]
   7443    mova                m11, [rsp+0x10]
   7444 %endif
   7445    mova                m15, [stk+0x120]
   7446    mov                srcq, [stk+0x0f8]
   7447 %if ARCH_X86_64
   7448    mov                 r0q, [stk+0x130] ; dstq / tmpq
   7449 %else
   7450    mov                  hm, r5
   7451    mov                 r0m, r0
   7452    mov                  r3, r3m
   7453 %endif
   7454    paddd               m14, m7
   7455 .dy2_hloop:
   7456 %if ARCH_X86_64
   7457    mova                 m9, [base+pq_0x40000000]
   7458 %else
   7459 %define m9 [base+pq_0x40000000]
   7460 %endif
   7461    pxor                 m1, m1
   7462    psrld                m2, m14, 10
   7463    mova              [stk], m2
   7464    pand                 m6, m14, m10
   7465    psrld                m6, 6
   7466    paddd                m5, m15, m6
   7467    pcmpeqd              m6, m1
   7468    pshufd               m2, m5, q1032
   7469 %if ARCH_X86_64
   7470    movd                r4d, m5
   7471    movd                r6d, m2
   7472    pshufd               m5, m5, q0321
   7473    pshufd               m2, m2, q0321
   7474    movd                r7d, m5
   7475    movd                r9d, m2
   7476    movq                 m0, [base+subpel_filters+r4*8]
   7477    movq                 m1, [base+subpel_filters+r6*8]
   7478    movhps               m0, [base+subpel_filters+r7*8]
   7479    movhps               m1, [base+subpel_filters+r9*8]
   7480 %else
   7481    movd                 r0, m5
   7482    movd                 rX, m2
   7483    pshufd               m5, m5, q0321
   7484    pshufd               m2, m2, q0321
   7485    movd                 r4, m5
   7486    movd                 r5, m2
   7487    movq                 m0, [base+subpel_filters+r0*8]
   7488    movq                 m1, [base+subpel_filters+rX*8]
   7489    movhps               m0, [base+subpel_filters+r4*8]
   7490    movhps               m1, [base+subpel_filters+r5*8]
   7491 %endif
   7492    paddd               m14, m7 ; mx+dx*[4-7]
   7493    pand                 m5, m14, m10
   7494    psrld                m5, 6
   7495    paddd               m15, m5
   7496    pxor                 m2, m2
   7497    pcmpeqd              m5, m2
   7498    mova        [stk+0x110], m14
   7499    pshufd               m4, m15, q1032
   7500 %if ARCH_X86_64
   7501    movd               r10d, m15
   7502    movd               r11d, m4
   7503    pshufd              m15, m15, q0321
   7504    pshufd               m4, m4, q0321
   7505    movd               r13d, m15
   7506    movd                rXd, m4
   7507    movq                 m2, [base+subpel_filters+r10*8]
   7508    movq                 m3, [base+subpel_filters+r11*8]
   7509    movhps               m2, [base+subpel_filters+r13*8]
   7510    movhps               m3, [base+subpel_filters+ rX*8]
   7511    psrld               m14, 10
   7512    movq                r11, m14
   7513    punpckhqdq          m14, m14
   7514    movq                 rX, m14
   7515    mov                r10d, r11d
   7516    shr                 r11, 32
   7517    mov                r13d, rXd
   7518    shr                  rX, 32
   7519    mov                 r4d, [stk+ 0]
   7520    mov                 r6d, [stk+ 4]
   7521    mov                 r7d, [stk+ 8]
   7522    mov                 r9d, [stk+12]
   7523    pshufd               m4, m6, q1100
   7524    pshufd               m6, m6, q3322
   7525    pshufd              m14, m5, q1100
   7526    pshufd               m5, m5, q3322
   7527    pand                 m7, m9, m4
   7528    pand                 m8, m9, m6
   7529    pand                m15, m9, m14
   7530    pand                 m9, m9, m5
   7531    pandn                m4, m0
   7532    pandn                m6, m1
   7533    pandn               m14, m2
   7534    pandn                m5, m3
   7535    por                  m7, m4
   7536    por                  m8, m6
   7537    por                 m15, m14
   7538    por                  m9, m5
   7539    punpcklbw            m0, m7, m7
   7540    punpckhbw            m7, m7
   7541    punpcklbw            m1, m8, m8
   7542    punpckhbw            m8, m8
   7543    psraw                m0, 8
   7544    psraw                m7, 8
   7545    psraw                m1, 8
   7546    psraw                m8, 8
   7547    punpcklbw            m2, m15, m15
   7548    punpckhbw           m15, m15
   7549    punpcklbw            m3, m9, m9
   7550    punpckhbw            m9, m9
   7551    psraw                m2, 8
   7552    psraw               m15, 8
   7553    psraw                m3, 8
   7554    psraw                m9, 8
   7555    mova         [stk+0x10], m0
   7556    mova         [stk+0x20], m7
   7557    mova         [stk+0x30], m1
   7558    mova         [stk+0x40], m8
   7559    mova         [stk+0x50], m2
   7560    mova         [stk+0x60], m15
   7561    mova         [stk+0x70], m3
   7562    mova         [stk+0x80], m9
   7563    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
   7564    mova         [stk+0x90], m1
   7565    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
   7566    mova         [stk+0xa0], m2
   7567    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
   7568    mova         [stk+0xb0], m3
   7569    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
   7570    mova         [stk+0xc0], m4
   7571    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
   7572    mova         [stk+0xd0], m5
   7573    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
   7574    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
   7575    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
   7576    mova                 m5, [stk+0xd0]
   7577    mova                 m1, [stk+0x90]
   7578    mova                 m2, [stk+0xa0]
   7579    mova                 m3, [stk+0xb0]
   7580    mova                 m9, [stk+0xc0]
   7581    punpcklwd            m4, m5, m6 ; 45a
   7582    punpckhwd            m5, m6     ; 45b
   7583    punpcklwd            m6, m7, m8 ; 67a
   7584    punpckhwd            m7, m8     ; 67b
   7585    punpcklwd            m0, m1, m2 ; 01a
   7586    punpckhwd            m1, m2     ; 01b
   7587    punpcklwd            m2, m3, m9 ; 23a
   7588    punpckhwd            m3, m9     ; 23b
   7589    mova                m10, [stk+0x140]
   7590    mova                m11, [stk+0x150]
   7591    mova                m14, [stk+0x160]
   7592    mova                m15, [stk+0x170]
   7593    mova         [stk+0x90], m4
   7594    mova         [stk+0xa0], m5
   7595    mova         [stk+0xb0], m6
   7596    mova         [stk+0xc0], m7
   7597 %define hround [rsp+0x10]
   7598 %define shift  [rsp+0x20]
   7599 %if isput
   7600  %define vround [rsp+0x30]
   7601 %else
   7602  %define vround [base+pd_m524256]
   7603 %endif
   7604 .dy2_vloop:
   7605    pmaddwd              m4, m0, m10
   7606    pmaddwd              m5, m1, m10
   7607    pmaddwd              m6, m2, m11
   7608    pmaddwd              m7, m3, m11
   7609    paddd                m4, m13
   7610    paddd                m5, m13
   7611    paddd                m4, m6
   7612    paddd                m5, m7
   7613    pmaddwd              m6, [stk+0x90], m14
   7614    pmaddwd              m7, [stk+0xa0], m14
   7615    pmaddwd              m8, [stk+0xb0], m15
   7616    pmaddwd              m9, [stk+0xc0], m15
   7617    paddd                m4, m6
   7618    paddd                m5, m7
   7619 %if isput
   7620    pshufd               m6, m12, q1032
   7621 %endif
   7622    paddd                m4, m8
   7623    paddd                m5, m9
   7624 %else
   7625    movd                 r0, m15
   7626    movd                 rX, m4
   7627    pshufd              m15, m15, q0321
   7628    pshufd               m4, m4, q0321
   7629    movd                 r4, m15
   7630    movd                 r5, m4
   7631    mova                m14, [stk+0x110]
   7632    movq                 m2, [base+subpel_filters+r0*8]
   7633    movq                 m3, [base+subpel_filters+rX*8]
   7634    movhps               m2, [base+subpel_filters+r4*8]
   7635    movhps               m3, [base+subpel_filters+r5*8]
   7636    psrld               m14, 10
   7637    mova           [stk+16], m14
   7638    mov                  r0, [stk+ 0]
   7639    mov                  rX, [stk+ 4]
   7640    mov                  r4, [stk+ 8]
   7641    mov                  r5, [stk+12]
   7642    mova         [stk+0x20], m0
   7643    mova         [stk+0x30], m1
   7644    mova         [stk+0x40], m2
   7645    mova         [stk+0x50], m3
   7646    pshufd               m4, m6, q1100
   7647    pshufd               m6, m6, q3322
   7648    pshufd               m7, m5, q1100
   7649    pshufd               m5, m5, q3322
   7650    pand                 m0, m9, m4
   7651    pand                 m1, m9, m6
   7652    pand                 m2, m9, m7
   7653    pand                 m3, m9, m5
   7654    pandn                m4, [stk+0x20]
   7655    pandn                m6, [stk+0x30]
   7656    pandn                m7, [stk+0x40]
   7657    pandn                m5, [stk+0x50]
   7658    por                  m0, m4
   7659    por                  m1, m6
   7660    por                  m2, m7
   7661    por                  m3, m5
   7662    punpcklbw            m4, m0, m0
   7663    punpckhbw            m0, m0
   7664    punpcklbw            m5, m1, m1
   7665    punpckhbw            m1, m1
   7666    psraw                m4, 8
   7667    psraw                m0, 8
   7668    psraw                m5, 8
   7669    psraw                m1, 8
   7670    punpcklbw            m6, m2, m2
   7671    punpckhbw            m2, m2
   7672    punpcklbw            m7, m3, m3
   7673    punpckhbw            m3, m3
   7674    psraw                m6, 8
   7675    psraw                m2, 8
   7676    psraw                m7, 8
   7677    psraw                m3, 8
   7678    mova        [stk+0x0a0], m4
   7679    mova        [stk+0x0b0], m0
   7680    mova        [stk+0x0c0], m5
   7681    mova        [stk+0x0d0], m1
   7682    mova        [stk+0x140], m6
   7683    mova        [stk+0x150], m2
   7684    mova        [stk+0x160], m7
   7685    mova        [stk+0x170], m3
   7686    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
   7687    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
   7688    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
   7689    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
   7690    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
   7691    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
   7692    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
   7693    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
   7694    mova                 m5, [stk+0x60]
   7695    mova                 m6, [stk+0x70]
   7696    mova                 m7, [stk+0x80]
   7697    mova                 m0, [stk+0x90]
   7698    mov                  r0, r0m
   7699    punpcklwd            m4, m5, m6      ; 45a
   7700    punpckhwd            m5, m6          ; 45b
   7701    punpcklwd            m6, m7, m0      ; 67a
   7702    punpckhwd            m7, m0          ; 67b
   7703    mova         [stk+0x60], m4
   7704    mova         [stk+0x70], m5
   7705    mova         [stk+0x80], m6
   7706    mova         [stk+0x90], m7
   7707    mova                 m1, [stk+0x20]
   7708    mova                 m2, [stk+0x30]
   7709    mova                 m3, [stk+0x40]
   7710    mova                 m4, [stk+0x50]
   7711    punpcklwd            m0, m1, m2      ; 01a
   7712    punpckhwd            m1, m2          ; 01b
   7713    punpcklwd            m2, m3, m4      ; 23a
   7714    punpckhwd            m3, m4          ; 23b
   7715    mova                 m4, [stk+0x180]
   7716    mova                 m5, [stk+0x190]
   7717    mova                 m6, [stk+0x1a0]
   7718    mova                 m7, [stk+0x1b0]
   7719    mova         [stk+0x40], m2
   7720    mova         [stk+0x50], m3
   7721 .dy2_vloop:
   7722    pmaddwd              m0, m4
   7723    pmaddwd              m1, m4
   7724    pmaddwd              m2, m5
   7725    pmaddwd              m3, m5
   7726    paddd                m0, m2
   7727    paddd                m1, m3
   7728    pmaddwd              m2, [stk+0x60], m6
   7729    pmaddwd              m3, [stk+0x70], m6
   7730    pmaddwd              m4, [stk+0x80], m7
   7731    pmaddwd              m5, [stk+0x90], m7
   7732 %if isput
   7733    movd                 m6, [esp+0x18]
   7734 %endif
   7735    paddd                m0, m2
   7736    paddd                m1, m3
   7737    paddd                m0, vrnd_mem
   7738    paddd                m1, vrnd_mem
   7739    paddd                m4, m0
   7740    paddd                m5, m1
   7741 %endif
   7742 %ifidn %1, put
   7743    psrad                m4, m6
   7744    psrad                m5, m6
   7745    packssdw             m4, m5
   7746    pxor                 m7, m7
   7747    pmaxsw               m4, m7
   7748    pminsw               m4, pxmaxm
   7749    mova             [dstq], m4
   7750    add                dstq, dsm
   7751 %else
   7752    psrad                m4, 6
   7753    psrad                m5, 6
   7754    packssdw             m4, m5
   7755    mova             [tmpq], m4
   7756    add                tmpq, tmp_stridem
   7757 %endif
   7758    dec                  hd
   7759    jz .dy2_hloop_prep
   7760 %if ARCH_X86_64
   7761    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1
   7762    mova         [stk+0xd0], m4
   7763    MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1
   7764    mova                 m4, [stk+0xd0]
   7765    mova                 m0, m2         ; 01a
   7766    mova                 m1, m3         ; 01b
   7767    mova                 m2, [stk+0x90] ; 23a
   7768    mova                 m3, [stk+0xa0] ; 23b
   7769    mova                 m5, [stk+0xb0] ; 45a
   7770    mova                 m6, [stk+0xc0] ; 45b
   7771    punpcklwd            m7, m4, m8     ; 67a
   7772    punpckhwd            m4, m8         ; 67b
   7773    mova         [stk+0x90], m5
   7774    mova         [stk+0xa0], m6
   7775    mova         [stk+0xb0], m7
   7776    mova         [stk+0xc0], m4
   7777 %else
   7778    mov                 r0m, r0
   7779    mov                  r3, r3m
   7780    MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8
   7781    MC_8TAP_SCALED_H 0xa0, 0    ; 9
   7782    mova                 m7, [stk+0xe0]
   7783    mova                 m2, [stk+0x60] ; 23a
   7784    mova                 m3, [stk+0x70] ; 23b
   7785    mova                 m4, [stk+0x80] ; 45a
   7786    mova                 m5, [stk+0x90] ; 45b
   7787    punpcklwd            m6, m7, m0     ; 67a
   7788    punpckhwd            m7, m0         ; 67b
   7789    mova                 m0, [stk+0x40] ; 01a
   7790    mova                 m1, [stk+0x50] ; 01b
   7791    mova         [stk+0x40], m2
   7792    mova         [stk+0x50], m3
   7793    mova         [stk+0x60], m4
   7794    mova         [stk+0x70], m5
   7795    mova                 m4, [stk+0x180]
   7796    mova                 m5, [stk+0x190]
   7797    mova         [stk+0x80], m6
   7798    mova         [stk+0x90], m7
   7799    mova                 m6, [stk+0x1a0]
   7800    mova                 m7, [stk+0x1b0]
   7801    mov                  r0, r0m
   7802 %endif
   7803    jmp .dy2_vloop
   7804 INIT_XMM ssse3
   7805 .ret:
   7806    MC_8TAP_SCALED_RET 0
   7807 %if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
   7808 %define r0m [rstk+stack_offset+ 4]
   7809 %define r1m [rstk+stack_offset+ 8]
   7810 %define r2m [rstk+stack_offset+12]
   7811 %define r3m [rstk+stack_offset+16]
   7812 %endif
   7813 %undef isput
   7814 %undef isprep
   7815 %endmacro
   7816 
   7817 %macro BILIN_SCALED_FN 1 ; %1 = put or prep
        ; Scaled bilinear MC has no dedicated implementation: load t0d/t1d
        ; (temp regs declared via DECLARE_REG_TMP) with a packed selector
        ; (same value in the high and low 16 bits; presumably it makes the
        ; shared scaled-MC code pick bilinear coefficients from the subpel
        ; filter table -- TODO confirm against the 8tap_scaled entry point),
        ; then tail-jump into the matching %1_8tap_scaled_16bpc function.
   7818 cglobal %1_bilin_scaled_16bpc
   7819    mov                 t0d, (5*15 << 16) | 5*15 ; horizontal selector
   7820    mov                 t1d, (5*15 << 16) | 5*15 ; vertical selector
   7821    jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
   7822 %endmacro
   7823 
   7824 %if WIN64
   7825 DECLARE_REG_TMP 6, 5 ; t0/t1 scratch regs for the put scaled entry points
   7826 %elif ARCH_X86_64
   7827 DECLARE_REG_TMP 6, 8
   7828 %else
   7829 DECLARE_REG_TMP 1, 2
   7830 %endif
   7831 
        ; Generate one thin entry point per filter-type combination.  Each
        ; FN-generated stub records its (horizontal, vertical) filter pair and
        ; jumps to the shared put_8tap_scaled_16bpc implementation named as the
        ; last argument; the final "regular" variant deliberately has no jump
        ; target because it falls straight through into the MC_8TAP_SCALED put
        ; body expanded immediately below.
   7832 %define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
   7833 BILIN_SCALED_FN put
   7834 PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_16bpc
   7835 PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_16bpc
   7836 PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_16bpc
   7837 PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_16bpc
   7838 PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_16bpc
   7839 PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_16bpc
   7840 PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_16bpc
   7841 PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_16bpc
   7842 PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
   7843 MC_8TAP_SCALED put ; expand the full put implementation (fall-through target)
   7844 
   7845 %if WIN64
   7846 DECLARE_REG_TMP 5, 4 ; t0/t1 scratch regs for the prep scaled entry points
   7847 %elif ARCH_X86_64
   7848 DECLARE_REG_TMP 6, 7
   7849 %else
   7850 DECLARE_REG_TMP 1, 2
   7851 %endif
   7852 
        ; Same scheme as the put variants above: one stub per filter-type
        ; combination, all jumping to the shared prep_8tap_scaled_16bpc code;
        ; the final "regular" stub has no jump target and falls through into
        ; the MC_8TAP_SCALED prep body expanded immediately below.
   7853 %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
   7854 BILIN_SCALED_FN prep
   7855 PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_16bpc
   7856 PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_16bpc
   7857 PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_16bpc
   7858 PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_16bpc
   7859 PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_16bpc
   7860 PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_16bpc
   7861 PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_16bpc
   7862 PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_16bpc
   7863 PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
   7864 MC_8TAP_SCALED prep ; expand the full prep implementation (fall-through target)
   7865 
   7866 %if ARCH_X86_64
   7867 DECLARE_REG_TMP 6 ; t0: scratch register used by the warp functions below
   7868 %else
   7869 DECLARE_REG_TMP 2
   7870 %endif
   7871 
   7872 %if ARCH_X86_64
        ;---------------------------------------------------------------------
        ; warp_affine_8x8t_16bpc: warped 8x8 block written as intermediate
        ; int16 values (prep-style output, no pixel clamping).  Shares its row
        ; computation with warp_affine_8x8_16bpc by calling that function's
        ; .main/.main2/.main3 labels, each of which leaves two rows' worth of
        ; 32-bit sums in m1/m2; here they are rounded (warp8x8t_rnd), shifted
        ; right by 15 and packed to words before storing.
        ;---------------------------------------------------------------------
   7873 ; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that
   7874 ; by allocating 16 bytes more stack space so that stack offsets match up.
   7875 %if WIN64 && STACK_ALIGNMENT == 16
   7876 %assign stksz 16*14
   7877 %else
   7878 %assign stksz 16*13
   7879 %endif
   7880 cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
   7881                                                 mx, tmp, alpha, beta, \
   7882                                                 filter, my, gamma, cnt
        ; Record the padded stack size so warp_affine_8x8 can ASSERT that both
        ; functions use identical stack layouts (their .main code is shared).
   7883 %assign stack_size_padded_8x8t stack_size_padded
   7884 %else
        ; x86-32: too few registers, so m8/m9 and several scalars live on the
        ; stack, and dstq aliases tmpq.
   7885 cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
   7886                                                 filter, mx, my
   7887 %define m8   [esp+16*13]
   7888 %define m9   [esp+16*14]
   7889 %define cntd dword [esp+4*63]
   7890 %define dstq tmpq
   7891 %define dsq  0
   7892 %if STACK_ALIGNMENT < 16
   7893 %define dstm [esp+4*65]
   7894 %define dsm  [esp+4*66]
   7895 %else
   7896 %define dstm r0m
   7897 %define dsm  r1m
   7898 %endif
   7899 %endif
   7900 %define base filterq-$$ ; PC-relative base for RODATA accesses
        ; t0d = r7m >> 11; consumed by the shared .main to index the
        ; bitdepth-dependent shift/rounding tables (see warp8x8_shift use there).
   7901    mov                 t0d, r7m
   7902    LEA             filterq, $$
   7903    shr                 t0d, 11
   7904 %if ARCH_X86_64
   7905    movddup              m8, [base+warp8x8t_rnd] ; rounding bias
   7906 %else
        ; m8 is stack memory on x86-32: load into a real register first, then
        ; spill.  The tmp stride is also pre-doubled here (int16 elements).
   7907    movddup              m1, [base+warp8x8t_rnd]
   7908    mov                  r1, r1m
   7909    add                  r1, r1
   7910    mova                 m8, m1
   7911    mov                 r1m, r1 ; ds *= 2
   7912 %endif
        ; .main performs full setup (filters, deltas, cnt) and the first rows.
   7913    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
   7914    jmp .start
   7915 .loop:
   7916 %if ARCH_X86_64
   7917    lea                dstq, [dstq+dsq*4] ; advance two stored rows per iteration
   7918 %else
   7919    add                dstq, dsm
   7920    mov                dstm, dstq
   7921 %endif
   7922    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
   7923 .start:
   7924 %if ARCH_X86_32
   7925    mov                dstq, dstm ; dstq aliases tmpq, so reload it
   7926 %endif
        ; First output row: round, >>15, pack dwords to words, store.
   7927    paddd                m1, m8
   7928    paddd                m2, m8
   7929    psrad                m1, 15
   7930    psrad                m2, 15
   7931    packssdw             m1, m2
   7932    mova       [dstq+dsq*0], m1
   7933    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
   7934 %if ARCH_X86_32
   7935    mov                dstq, dstm
   7936    add                dstq, dsm
   7937 %endif
        ; Second output row, same post-processing.
   7938    paddd                m1, m8
   7939    paddd                m2, m8
   7940    psrad                m1, 15
   7941    psrad                m2, 15
   7942    packssdw             m1, m2
   7943    mova       [dstq+dsq*2], m1
   7944    dec                cntd ; loop counter initialized inside the shared .main
   7945    jg .loop
   7946    RET
   7947 
   7948 %if ARCH_X86_64
; warp_affine_8x8_16bpc: warped motion compensation of an 8x8 block,
; 16 bpc output. r7m = bitdepth_max; t0d = bitdepth_max >> 11 (0 for
; 10-bit, 1 for 12-bit) indexes the per-bitdepth rounding tables.
; NOTE(review): the matching %if (64-bit branch) opens before this chunk.
cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
                                                 mx, tmp, alpha, beta, \
                                                 filter, my, gamma, cnt
ASSERT stack_size_padded == stack_size_padded_8x8t
%else
cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
                                                filter, mx, my
%endif
    mov                 t0d, r7m
    LEA             filterq, $$
    shr                 t0d, 11
%if ARCH_X86_64
    ; m8 = final rounding multiplier, m9 = pixel_max broadcast for clamping
    movddup              m8, [base+warp8x8_rnd2+t0*8]
    movd                 m9, r7m ; pixel_max
    pshufb               m9, [base+pw_256]
%else
    ; 32-bit: materialize in m1/m2 first, then move into the m8/m9 aliases
    movddup              m1, [base+warp8x8_rnd2+t0*8]
    movd                 m2, r7m ; pixel_max
    pshufb               m2, [base+pw_256]
    mova                 m8, m1
    mova                 m9, m2
%endif
    call .main
    jmp .start
.loop:
%if ARCH_X86_64
    lea                dstq, [dstq+dsq*2]
%else
    ; dst pointer is kept in memory on 32-bit; advance it by one row here
    ; (the second row advance happens at the .main3 store below)
    add                dstq, dsm
    mov                dstm, dstq
%endif
    call .main2
.start:
%if ARCH_X86_32
    mov                dstq, dstm
%endif
    ; m1/m2 = 8 dword accumulators from the vertical pass: shift down,
    ; pack to words, clamp low end (m6 is zeroed in .main / WARP_V),
    ; apply the final per-bitdepth rounding multiply, clamp to pixel_max
    psrad                m1, 16
    psrad                m2, 16
    packssdw             m1, m2
    pmaxsw               m1, m6
    pmulhrsw             m1, m8
    pminsw               m1, m9
    mova       [dstq+dsq*0], m1
    call .main3
%if ARCH_X86_32
    mov                dstq, dstm
    add                dstq, dsm
%endif
    psrad                m1, 16
    psrad                m2, 16
    packssdw             m1, m2
    pmaxsw               m1, m6
    pmulhrsw             m1, m8
    pminsw               m1, m9
    mova       [dstq+dsq*1], m1
    dec                cntd                        ; cnt = 4 row pairs
    jg .loop
    RET
ALIGN function_align
.main:
    ; Setup: load the affine parameters (alpha..delta), bias mx/my, point
    ; src at row -3, then horizontally filter the first 7 rows and store
    ; them as interleaved word pairs in the stack ring buffer
    ; ([rsp+gprsize+16*1 .. 16*12]; low halves in 1-3/7-9, high in 4-6/10-12).
    ; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
    mov              deltaq, r5m
    mov                 mxd, r6m
%endif
    movd                 m0, [base+warp8x8_shift+t0*4]  ; horizontal shift amount
    movddup              m7, [base+warp8x8_rnd1+t0*8]   ; horizontal rounding bias
    add             filterq, mc_warp_filter-$$
%if ARCH_X86_64
    movsx            alphad, word [deltaq+2*0]
    movsx             betad, word [deltaq+2*1]
    movsx            gammad, word [deltaq+2*2]
    movsx            deltad, word [deltaq+2*3]
    lea                tmpq, [ssq*3]
    add                 mxd, 512+(64<<10)
    sub                srcq, tmpq             ; src -= ss*3
    imul               tmpd, alphad, -7
    mov                 myd, r7m
    add               betad, tmpd             ; beta -= alpha*7
    imul               tmpd, gammad, -7
    add                 myd, 512+(64<<10)
    mov                cntd, 4
    add              deltad, tmpd             ; delta -= gamma*7
%else
%if STACK_ALIGNMENT < 16
    %assign stack_offset stack_offset - gprsize
%endif
    mov                 r3d, r5m              ; abcd
%if STACK_ALIGNMENT < 16
    mov                  r0, r1m              ; dst
    mov                  r1, r2m              ; ds
    mov  [esp+gprsize+4*65], r0
    mov  [esp+gprsize+4*66], r1
%endif
    ; 32-bit: beta/delta/ss/cnt don't fit in registers; spill them to
    ; dedicated stack slots (4*60 .. 4*63)
    movsx            alphad, word [r3+2*0]
    movsx               r2d, word [r3+2*1]
    movsx            gammad, word [r3+2*2]
    movsx               r3d, word [r3+2*3]
    imul                r5d, alphad, -7
    add                 r2d, r5d              ; beta -= alpha*7
    imul                r5d, gammad, -7
    mov  [esp+gprsize+4*60], r2d
    add                 r3d, r5d              ; delta -= gamma*7
    mov  [esp+gprsize+4*61], r3d
    mov                 r3d, r4m              ; ss
    mov                srcq, r3m
    mov                 mxd, r6m
    mov                 myd, r7m
    mov dword [esp+gprsize+4*63], 4           ; cnt
    mov  [esp+gprsize+4*62], r3
    lea                  r3, [r3*3]
    add                 mxd, 512+(64<<10)
    add                 myd, 512+(64<<10)
    sub                srcq, r3               ; src -= ss*3
%if STACK_ALIGNMENT < 16
    %assign stack_offset stack_offset + gprsize
%endif
%endif
    mova      [rsp+gprsize], m0               ; save h-shift for .h
    pxor                 m6, m6               ; m6 = 0 (zero for unpacks/clamp)
    ; Filter rows 0..6; each pair of consecutive rows is interleaved with
    ; punpckl/hwd so the vertical pass can use pmaddwd directly.
    call .h
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 01
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 1], m1
    mova [rsp+gprsize+16* 4], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 12
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 7], m1
    mova [rsp+gprsize+16*10], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 23
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 2], m1
    mova [rsp+gprsize+16* 5], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 34
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 8], m1
    mova [rsp+gprsize+16*11], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 45
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 3], m1
    mova [rsp+gprsize+16* 6], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 56
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 9], m1
    mova [rsp+gprsize+16*12], m5
    mova                 m5, m0
.main2:
    call .h
; WARP_V: vertical 8-tap pass over the interleaved row pairs in the stack
; ring buffer. Each of the 8 columns gets its own filter phase (my >> 10
; indexes mc_warp_filter; my steps by gamma per column and by delta once
; per row). The buffer slots are rotated in place (%1<-%2, %2<-%3, %3<-67,
; and likewise for the high halves) so alternating .main2/.main3 calls
; reuse them. Outputs: m1 = dword accumulators for columns 0-3,
; m2 = columns 4-7. m6 is re-zeroed before returning.
%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10
    movq                 m4, [filterq+myq*8]  ; a
    lea                 myd, [tmpq+gammaq]
    shr                tmpd, 10
    movq                 m2, [filterq+tmpq*8] ; b
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10
    movq                 m3, [filterq+myq*8]  ; c
    lea                 myd, [tmpq+gammaq]
    shr                tmpd, 10
    movq                 m1, [filterq+tmpq*8] ; d
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10
    ; Transpose the four 8-tap coefficient rows a-d into tap-pair order
    punpcklwd            m4, m2
    punpcklwd            m3, m1
    punpckldq            m2, m4, m3
    punpckhdq            m4, m3
    punpcklbw            m1, m6, m2           ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
    pmaddwd              m1, [rsp+gprsize+16*%1]
    punpckhbw            m3, m6, m2           ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
    mova                 m2, [rsp+gprsize+16*%2]
    pmaddwd              m3, m2
    mova [rsp+gprsize+16*%1], m2              ; rotate ring buffer slot
    paddd                m1, m3
    punpcklbw            m3, m6, m4           ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
    mova                 m2, [rsp+gprsize+16*%3]
    pmaddwd              m3, m2
    mova [rsp+gprsize+16*%2], m2
    paddd                m1, m3
    punpcklwd            m3, m5, m0           ; 67
    punpckhbw            m2, m6, m4           ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
    pmaddwd              m2, m3
    mova [rsp+gprsize+16*%3], m3
    paddd                m1, m2
    ; Second half: filter phases e-h for columns 4-7
    movq                 m4, [filterq+myq*8]  ; e
    lea                 myd, [tmpq+gammaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8] ; f
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10
    movq                 m2, [filterq+myq*8]  ; g
%if ARCH_X86_64
    lea                 myd, [tmpq+deltaq]    ; my += delta
%else
    mov                 myd, [esp+gprsize+4*61]
    add                 myd, tmpd
%endif
    shr                tmpd, 10
    punpcklwd            m4, m3
    movq                 m3, [filterq+tmpq*8] ; h
    punpcklwd            m2, m3
    punpckldq            m3, m4, m2
    punpckhdq            m4, m2
    punpcklbw            m2, m6, m3           ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
    pmaddwd              m2, [rsp+gprsize+16*%4]
    punpckhbw            m6, m3               ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
    mova                 m3, [rsp+gprsize+16*%5]
    pmaddwd              m6, m3
    mova [rsp+gprsize+16*%4], m3
    pxor                 m3, m3               ; m6 was clobbered; use m3 as zero
    paddd                m2, m6
    punpcklbw            m3, m4               ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
    mova                 m6, [rsp+gprsize+16*%6]
    pmaddwd              m3, m6
    mova [rsp+gprsize+16*%5], m6
    punpckhwd            m5, m0               ; 67 high half
    pxor                 m6, m6               ; restore m6 = 0 invariant
    paddd                m2, m3
    punpckhbw            m3, m6, m4           ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
    pmaddwd              m3, m5
    mova [rsp+gprsize+16*%6], m5
    mova                 m5, m0               ; keep newest row for next pair
    paddd                m2, m3
%endmacro
    WARP_V                1,  2,  3,  4,  5,  6
    ret
.main3:
    call .h
    WARP_V                7,  8,  9, 10, 11, 12
    ret
ALIGN function_align
.h:
    ; Horizontal 8-tap pass for one source row. Each of the 8 output
    ; pixels has its own filter phase: mx >> 10 indexes mc_warp_filter,
    ; mx steps by alpha per pixel and by beta once per row. Pairs of
    ; partial sums are merged with phaddd; the result is rounded with the
    ; per-bitdepth bias (m7) / shift ([rsp+gprsize*2], saved by .main) and
    ; packed into m0 as 8 words. srcq is advanced by one stride.
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    punpcklbw            m0, m6, m3           ; coeffs << 8 (m6 = 0)
    movu                 m3, [srcq-6]
    pmaddwd              m0, m3               ; 0
    lea                 mxd, [tmpq+alphaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq-4]
    pmaddwd              m2, m3               ; 1
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    phaddd               m0, m2               ; 0 1
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq-2]
    pmaddwd              m2, m3               ; 2
    lea                 mxd, [tmpq+alphaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    punpcklbw            m1, m6, m3
    movu                 m3, [srcq+0]
    pmaddwd              m1, m3               ; 3
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    phaddd               m2, m1               ; 2 3
    punpcklbw            m1, m6, m3
    movu                 m3, [srcq+2]
    pmaddwd              m1, m3               ; 4
    lea                 mxd, [tmpq+alphaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    phaddd               m0, m2               ; 0 1 2 3
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq+4]
    pmaddwd              m2, m3               ; 5
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    phaddd               m1, m2               ; 4 5
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq+6]
    pmaddwd              m2, m3               ; 6
%if ARCH_X86_64
    lea                 mxd, [tmpq+betaq]     ; mx += beta
%else
    mov                 mxd, [esp+gprsize*2+4*60]
    add                 mxd, tmpd
%endif
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    punpcklbw            m4, m6, m3
    movu                 m3, [srcq+8]
%if ARCH_X86_64
    add                srcq, ssq
%else
    add                srcq, [esp+gprsize*2+4*62]
%endif
    pmaddwd              m3, m4               ; 7
    phaddd               m2, m3               ; 6 7
    phaddd               m1, m2               ; 4 5 6 7
    paddd                m0, m7               ; round
    paddd                m1, m7
    psrad                m0, [rsp+gprsize*2]
    psrad                m1, [rsp+gprsize*2]
    packssdw             m0, m1
    ret
   8263 
; BIDIR_FN: shared width-dispatched store loop for the bi-directional
; compositing functions (avg / w_avg / mask). Preconditions set up by the
; caller: wq = jump-table target for the block width, hd = height, and a
; local .main that produces two 16-byte rows in m0/m1 per call while
; advancing the tmp buffer pointers. Widths <= 8 store multiple rows per
; call; widths >= 16 call .main once per 32 pixels of a row.
%macro BIDIR_FN 0
    call .main
    jmp                  wq
.w4_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w4:
    ; 4 rows of 4 pixels per .main call (movq/movhps = 8 bytes each)
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub                  hd, 4
    jg .w4_loop
.ret:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w8:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jne .w8_loop
    RET
.w16_loop:
    call .main
    add                dstq, strideq
.w16:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    dec                  hd
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add                dstq, strideq
.w32:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    call .main
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    call .main
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    mova       [dstq+16* 0], m0
    mova       [dstq+16* 1], m1
    call .main
    mova       [dstq+16* 2], m0
    mova       [dstq+16* 3], m1
    call .main
    mova       [dstq+16* 4], m0
    mova       [dstq+16* 5], m1
    call .main
    mova       [dstq+16* 6], m0
    mova       [dstq+16* 7], m1
    call .main
    mova       [dstq+16* 8], m0
    mova       [dstq+16* 9], m1
    call .main
    mova       [dstq+16*10], m0
    mova       [dstq+16*11], m1
    call .main
    mova       [dstq+16*12], m0
    mova       [dstq+16*13], m1
    call .main
    mova       [dstq+16*14], m0
    mova       [dstq+16*15], m1
    dec                  hd
    jg .w128_loop
    RET
%endmacro
   8359 
; Scratch GPR alias (t0) used for scalar stack arguments (pixel_max,
; weight) in the bidir functions below; the register number differs per ABI.
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif
   8365 
; avg: average two intermediate (prep) buffers into dst, with rounding and
; clamping controlled by the per-bitdepth bidir_rnd/bidir_mul constants
; (selected by r6m = pixel_max >> 11).
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
%define base r6-avg_ssse3_table
    LEA                  r6, avg_ssse3_table
    tzcnt                wd, wm
    mov                 t0d, r6m ; pixel_max
    movsxd               wq, [r6+wq*4]
    shr                 t0d, 11
    movddup              m2, [base+bidir_rnd+t0*8]
    movddup              m3, [base+bidir_mul+t0*8]
    movifnidn            hd, hm
    add                  wq, r6               ; wq = width-specific entry
    BIDIR_FN
ALIGN function_align
.main:
    ; Produce 16 pixels (two 8-word rows in m0/m1): saturating add of the
    ; two tmp buffers, then bias (pmaxsw/psubsw with bidir_rnd) and final
    ; scale/shift via pmulhw with bidir_mul.
    mova                 m0, [tmp1q+16*0]
    paddsw               m0, [tmp2q+16*0]
    mova                 m1, [tmp1q+16*1]
    paddsw               m1, [tmp2q+16*1]
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    pmaxsw               m0, m2
    pmaxsw               m1, m2
    psubsw               m0, m2
    psubsw               m1, m2
    pmulhw               m0, m3
    pmulhw               m1, m3
    ret
   8393 
; w_avg: weighted average of two intermediate buffers,
; dst = (tmp2*(16-weight) + tmp1*weight) with rounding, clamped to
; [0, pixel_max]. m4 holds the packed (16-weight, weight) word pair for
; pmaddwd; for 12 bpc (bit 11 of pixel_max set) the weights are scaled by
; 4 to compensate for the different intermediate precision.
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
%define base r6-w_avg_ssse3_table
    LEA                  r6, w_avg_ssse3_table
    tzcnt                wd, wm
    mov                 t0d, r6m ; weight
    movd                 m6, r7m ; pixel_max
    movddup              m5, [base+pd_65538]
    movsxd               wq, [r6+wq*4]
    pshufb               m6, [base+pw_256]    ; broadcast pixel_max to words
    add                  wq, r6
    lea                 r6d, [t0-16]
    shl                 t0d, 16
    sub                 t0d, r6d ; 16-weight, weight
    paddw                m5, m6               ; fold pixel_max into the bias
    mov                 r6d, t0d
    shl                 t0d, 2
    test          dword r7m, 0x800            ; 12 bpc?
    cmovnz              r6d, t0d              ; then use weights * 4
    movifnidn            hd, hm
    movd                 m4, r6d
    pslld                m5, 7                ; m5 = dword rounding offset
    pxor                 m7, m7               ; m7 = 0 (low clamp)
    pshufd               m4, m4, q0000        ; broadcast weight pair
    BIDIR_FN
ALIGN function_align
.main:
    ; Two 8-pixel rows: interleave tmp2/tmp1 words, weighted dot-product
    ; via pmaddwd, add bias, >> 8, repack, clamp to [0, pixel_max].
    mova                 m2, [tmp1q+16*0]
    mova                 m0, [tmp2q+16*0]
    punpckhwd            m3, m0, m2
    punpcklwd            m0, m2
    mova                 m2, [tmp1q+16*1]
    mova                 m1, [tmp2q+16*1]
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    pmaddwd              m3, m4
    pmaddwd              m0, m4
    paddd                m3, m5
    paddd                m0, m5
    psrad                m3, 8
    psrad                m0, 8
    packssdw             m0, m3
    punpckhwd            m3, m1, m2
    punpcklwd            m1, m2
    pmaddwd              m3, m4
    pmaddwd              m1, m4
    paddd                m3, m5
    paddd                m1, m5
    psrad                m3, 8
    psrad                m1, 8
    packssdw             m1, m3
    pminsw               m0, m6
    pminsw               m1, m6
    pmaxsw               m0, m7
    pmaxsw               m1, m7
    ret
   8449 
; mask: per-pixel blend of two intermediate buffers using an explicit
; 6-bit mask buffer: dst = (tmp1*m + tmp2*(64-m)) >> 5, then the same
; bias/scale finish as avg (bidir_rnd/bidir_mul). On 32-bit, h and the
; pw_64 constant live in memory (no spare GPR / xmm reg).
%if ARCH_X86_64
cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
%else
cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
%define hd dword r5m
%define m8 [base+pw_64]
%endif
%define base r6-mask_ssse3_table
    LEA                  r6, mask_ssse3_table
    tzcnt                wd, wm
    mov                 t0d, r7m ; pixel_max
    shr                 t0d, 11
    movsxd               wq, [r6+wq*4]
    movddup              m6, [base+bidir_rnd+t0*8]
    movddup              m7, [base+bidir_mul+t0*8]
%if ARCH_X86_64
    mova                 m8, [base+pw_64]
    movifnidn            hd, hm
%endif
    add                  wq, r6
    mov               maskq, r6mp
    BIDIR_FN
ALIGN function_align
.main:
    ; Two 8-pixel rows per call; 16 mask bytes are widened to words and
    ; paired with their 64-m complements for a single pmaddwd blend.
    movq                 m3, [maskq+8*0]
    mova                 m0, [tmp1q+16*0]
    mova                 m4, [tmp2q+16*0]
    pxor                 m5, m5
    punpcklbw            m3, m5
    punpckhwd            m2, m0, m4
    punpcklwd            m0, m4
    psubw                m1, m8, m3
    punpckhwd            m4, m3, m1 ; m, 64-m
    punpcklwd            m3, m1
    pmaddwd              m2, m4     ; tmp1 * m + tmp2 * (64-m)
    pmaddwd              m0, m3
    movq                 m3, [maskq+8*1]
    mova                 m1, [tmp1q+16*1]
    mova                 m4, [tmp2q+16*1]
    add               maskq, 8*2
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    psrad                m2, 5
    psrad                m0, 5
    packssdw             m0, m2
    punpcklbw            m3, m5
    punpckhwd            m2, m1, m4
    punpcklwd            m1, m4
    psubw                m5, m8, m3
    punpckhwd            m4, m3, m5 ; m, 64-m
    punpcklwd            m3, m5
    pmaddwd              m2, m4     ; tmp1 * m + tmp2 * (64-m)
    pmaddwd              m1, m3
    psrad                m2, 5
    psrad                m1, 5
    packssdw             m1, m2
    ; Final bias + scale, same as avg
    pmaxsw               m0, m6
    pmaxsw               m1, m6
    psubsw               m0, m6
    psubsw               m1, m6
    pmulhw               m0, m7
    pmulhw               m1, m7
    ret
   8513 
; w_mask_420: like mask, but the blend mask is derived from |tmp1 - tmp2|
; per pixel (see W_MASK further down) and also written out, downsampled
; 2x2 (4:2:0) to 6-bit values: out = (sum of 4 weights + 2 - sign) >> 2.
; m7 = broadcast (2 - sign) rounding term for that downsample.
cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_420_ssse3_table
    LEA                  t0, w_mask_420_ssse3_table
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    movd                 m0, r7m ; sign
    shr                 r6d, 11
    movsxd               wq, [t0+wq*4]
%if ARCH_X86_64
    mova                 m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    mova                 m9, [base+pw_64]
    movddup             m10, [base+bidir_rnd+r6*8]
    movddup             m11, [base+bidir_mul+r6*8]
%else
    ; 32-bit: only 8 xmm regs, so m8-m11 are stack-slot aliases
    mova                 m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    mova                 m2, [base+pw_64]
    movddup              m3, [base+bidir_rnd+r6*8]
    movddup              m4, [base+bidir_mul+r6*8]
    ALLOC_STACK       -16*4
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    mova         [rsp+16*2], m3
    mova         [rsp+16*3], m4
    %define              m8  [rsp+gprsize+16*0]
    %define              m9  [rsp+gprsize+16*1]
    %define             m10  [rsp+gprsize+16*2]
    %define             m11  [rsp+gprsize+16*3]
%endif
    movd                 m7, [base+pw_2]
    psubw                m7, m0
    pshufb               m7, [base+pw_256]
    add                  wq, t0
    movifnidn            hd, r5m
    mov               maskq, r6mp
    call .main
    jmp                  wq
; Width-specific store paths. Each .main call yields two blended rows in
; m0/m1 and their per-pixel weight rows in m2/m3; the weights of each 2x2
; quad are summed, rounded with m7 and >> 2 to form the 4:2:0 mask bytes.
.w4_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 4
.w4:
    ; 4 rows of pixels -> one 4-byte mask row
    movq   [dstq+strideq*0], m0
    phaddw               m2, m3               ; horizontal pair sums
    movhps [dstq+strideq*1], m0
    phaddd               m2, m2               ; + vertical neighbor
    lea                dstq, [dstq+strideq*2]
    paddw                m2, m7
    movq   [dstq+strideq*0], m1
    psrlw                m2, 2
    movhps [dstq+strideq*1], m1
    packuswb             m2, m2
    movd            [maskq], m2
    sub                  hd, 4
    jg .w4_loop
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 4
.w8:
    ; 2 rows of 8 pixels -> one 4-byte mask row
    mova   [dstq+strideq*0], m0
    paddw                m2, m3               ; vertical pair sums
    phaddw               m2, m2               ; + horizontal neighbor
    mova   [dstq+strideq*1], m1
    paddw                m2, m7
    psrlw                m2, 2
    packuswb             m2, m2
    movd            [maskq], m2
    sub                  hd, 2
    jg .w8_loop
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 8
.w16:
    ; First .main = top row; stash its weights in the (not yet written)
    ; second dst row, then combine with the bottom row's weights
    mova [dstq+strideq*1+16*0], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*1], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    paddw                m2, [dstq+strideq*1+16*0]
    paddw                m3, [dstq+strideq*1+16*1]
    mova [dstq+strideq*1+16*0], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*1], m1
    paddw                m2, m7
    psrlw                m2, 2
    packuswb             m2, m2
    movq            [maskq], m2
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 16
.w32:
    ; Top-row weights are staged in the still-unwritten bottom dst row,
    ; then folded with the bottom row's weights before the bottom pixels
    ; overwrite those slots
    mova [dstq+strideq*1+16*0], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*1], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*0+16*2], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*3], m2
    mova [dstq+strideq*0+16*3], m1
    call .main
    paddw                m2, [dstq+strideq*1+16*0]
    paddw                m3, [dstq+strideq*1+16*1]
    mova [dstq+strideq*1+16*0], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*2], m2
    mova [dstq+strideq*1+16*1], m1
    call .main
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16*2]
    paddw                m2, [dstq+strideq*1+16*3]
    mova [dstq+strideq*1+16*2], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16*3], m1
    packuswb             m3, m2
    mova            [maskq], m3
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 16*2
.w64:
    ; Same staging scheme as .w32, extended across four .main calls per
    ; row; note the weight slots are offset by one (16*1..) to avoid the
    ; pixel stores that land at 16*0
    mova [dstq+strideq*1+16*1], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*2], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*3], m2
    mova [dstq+strideq*0+16*2], m0
    mova [dstq+strideq*1+16*4], m3
    mova [dstq+strideq*0+16*3], m1
    call .main
    mova [dstq+strideq*1+16*5], m2
    mova [dstq+strideq*0+16*4], m0
    mova [dstq+strideq*1+16*6], m3
    mova [dstq+strideq*0+16*5], m1
    call .main
    mova [dstq+strideq*0+16*6], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*7], m2
    mova [dstq+strideq*0+16*7], m1
    call .main
    paddw                m2, [dstq+strideq*1+16*1]
    paddw                m3, [dstq+strideq*1+16*2]
    mova [dstq+strideq*1+16*0], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*2], m2
    mova [dstq+strideq*1+16*1], m1
    call .main
    paddw                m2, [dstq+strideq*1+16*3]
    paddw                m3, [dstq+strideq*1+16*4]
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16*2]
    mova [dstq+strideq*1+16*2], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16*3], m1
    packuswb             m3, m2
    mova       [maskq+16*0], m3
    call .main
    paddw                m2, [dstq+strideq*1+16*5]
    paddw                m3, [dstq+strideq*1+16*6]
    mova [dstq+strideq*1+16*4], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*6], m2
    mova [dstq+strideq*1+16*5], m1
    call .main
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16*6]
    paddw                m2, [dstq+strideq*1+16*7]
    mova [dstq+strideq*1+16*6], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16*7], m1
    packuswb             m3, m2
    mova       [maskq+16*1], m3
    sub                  hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 16*4
.w128:
    ; Same staging scheme as .w32/.w64: the top row's weight vectors are
    ; parked in the not-yet-written bottom dst row (offset by one slot),
    ; then eight more .main calls process the bottom row, each folding in
    ; the stashed weights and emitting 16 mask bytes per pair
    mova [dstq+strideq*1+16* 1], m2
    mova [dstq+strideq*0+16* 0], m0
    mova [dstq+strideq*1+16* 2], m3
    mova [dstq+strideq*0+16* 1], m1
    call .main
    mova [dstq+strideq*1+16* 3], m2
    mova [dstq+strideq*0+16* 2], m0
    mova [dstq+strideq*1+16* 4], m3
    mova [dstq+strideq*0+16* 3], m1
    call .main
    mova [dstq+strideq*1+16* 5], m2
    mova [dstq+strideq*0+16* 4], m0
    mova [dstq+strideq*1+16* 6], m3
    mova [dstq+strideq*0+16* 5], m1
    call .main
    mova [dstq+strideq*1+16* 7], m2
    mova [dstq+strideq*0+16* 6], m0
    mova [dstq+strideq*1+16* 8], m3
    mova [dstq+strideq*0+16* 7], m1
    call .main
    mova [dstq+strideq*1+16* 9], m2
    mova [dstq+strideq*0+16* 8], m0
    mova [dstq+strideq*1+16*10], m3
    mova [dstq+strideq*0+16* 9], m1
    call .main
    mova [dstq+strideq*1+16*11], m2
    mova [dstq+strideq*0+16*10], m0
    mova [dstq+strideq*1+16*12], m3
    mova [dstq+strideq*0+16*11], m1
    call .main
    mova [dstq+strideq*1+16*13], m2
    mova [dstq+strideq*0+16*12], m0
    mova [dstq+strideq*1+16*14], m3
    mova [dstq+strideq*0+16*13], m1
    call .main
    mova [dstq+strideq*0+16*14], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*15], m2
    mova [dstq+strideq*0+16*15], m1
    call .main
    paddw                m2, [dstq+strideq*1+16* 1]
    paddw                m3, [dstq+strideq*1+16* 2]
    mova [dstq+strideq*1+16* 0], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16* 2], m2
    mova [dstq+strideq*1+16* 1], m1
    call .main
    paddw                m2, [dstq+strideq*1+16* 3]
    paddw                m3, [dstq+strideq*1+16* 4]
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16* 2]
    mova [dstq+strideq*1+16* 2], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16* 3], m1
    packuswb             m3, m2
    mova       [maskq+16*0], m3
    call .main
    paddw                m2, [dstq+strideq*1+16* 5]
    paddw                m3, [dstq+strideq*1+16* 6]
    mova [dstq+strideq*1+16* 4], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16* 6], m2
    mova [dstq+strideq*1+16* 5], m1
    call .main
    paddw                m2, [dstq+strideq*1+16* 7]
    paddw                m3, [dstq+strideq*1+16* 8]
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16* 6]
    mova [dstq+strideq*1+16* 6], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16* 7], m1
    packuswb             m3, m2
    mova       [maskq+16*1], m3
    call .main
    paddw                m2, [dstq+strideq*1+16* 9]
    paddw                m3, [dstq+strideq*1+16*10]
    mova [dstq+strideq*1+16* 8], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*10], m2
    mova [dstq+strideq*1+16* 9], m1
    call .main
    paddw                m2, [dstq+strideq*1+16*11]
    paddw                m3, [dstq+strideq*1+16*12]
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16*10]
    mova [dstq+strideq*1+16*10], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16*11], m1
    packuswb             m3, m2
    mova       [maskq+16*2], m3
    call .main
    paddw                m2, [dstq+strideq*1+16*13]
    paddw                m3, [dstq+strideq*1+16*14]
    mova [dstq+strideq*1+16*12], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*14], m2
    mova [dstq+strideq*1+16*13], m1
    call .main
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16*14]
    paddw                m2, [dstq+strideq*1+16*15]
    mova [dstq+strideq*1+16*14], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16*15], m1
    packuswb             m3, m2
    mova       [maskq+16*3], m3
    sub                  hd, 2
    jg .w128_loop
    RET
   8828 ALIGN function_align
   8829 .main:
; W_MASK: blend one 8-word (8-pixel) vector pair from tmp1/tmp2 and derive
; the per-pixel blend weight "m" from the magnitude of their difference.
; %1 = load offset within tmp1q/tmp2q, also the output register number
;      (m%1 = blended, range-converted pixels)
; %2 = register that receives the per-pixel weight "m" as words
;      (initially used to hold the tmp2 load)
; Expects m8 = pw_27615, m9 = pw_64, m10 = bidir_rnd, m11 = bidir_mul
; (on x86-32 these names are %defined to stack slots/registers by the
; calling function). Clobbers m4, m5, m6.
%macro W_MASK 2 ; dst/tmp_offset, mask
    mova                m%1, [tmp1q+16*%1]
    mova                m%2, [tmp2q+16*%1]
    punpcklwd            m4, m%2, m%1 ; interleave tmp2/tmp1 words for pmaddwd
    punpckhwd            m5, m%2, m%1
    psubsw              m%1, m%2
    pabsw               m%1, m%1      ; |tmp1 - tmp2|
    psubusw              m6, m8, m%1  ; saturating, so large diffs clamp at 0
    psrlw                m6, 10      ; 64-m
    psubw               m%2, m9, m6  ; m
    punpcklwd           m%1, m6, m%2 ; interleave (64-m, m) weight pairs
    punpckhwd            m6, m%2
    pmaddwd             m%1, m4      ; tmp2*(64-m) + tmp1*m per dword
    pmaddwd              m6, m5
    psrad               m%1, 5
    psrad                m6, 5
    packssdw            m%1, m6
    pmaxsw              m%1, m10     ; clamp, then convert the intermediate
    psubsw              m%1, m10     ; precision down to the pixel range
    pmulhw              m%1, m11     ; via bidir_rnd/bidir_mul
%endmacro
   8851    W_MASK                0, 2
   8852    W_MASK                1, 3
   8853    add               tmp1q, 16*2
   8854    add               tmp2q, 16*2
   8855    ret
   8856 
; w_mask_422: blend tmp1/tmp2 into dst with a mask derived from their
; per-pixel difference, and store that mask horizontally subsampled 2:1
; (8 mask bytes per 16 pixels), with rounding adjusted by the sign arg.
; Args: dst, stride, tmp1, tmp2, w, h, mask; r7m = sign, r8m = pixel_max.
cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_422_ssse3_table
    LEA                  t0, w_mask_422_ssse3_table
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    movd                 m7, r7m ; sign
    shr                 r6d, 11 ; 0 for 10bpc (1023), 1 for 12bpc (4095)
    movsxd               wq, [t0+wq*4]
%if ARCH_X86_64
    mova                 m8, [base+pw_27615]
    mova                 m9, [base+pw_64]
    movddup             m10, [base+bidir_rnd+r6*8]
    movddup             m11, [base+bidir_mul+r6*8]
%else
    ; x86-32 only has m0-m7, so spill the four W_MASK constants to the stack
    mova                 m1, [base+pw_27615]
    mova                 m2, [base+pw_64]
    movddup              m3, [base+bidir_rnd+r6*8]
    movddup              m4, [base+bidir_mul+r6*8]
    ALLOC_STACK       -16*4
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    mova         [rsp+16*2], m3
    mova         [rsp+16*3], m4
%endif
    pxor                 m0, m0
    add                  wq, t0
    pshufb               m7, m0 ; broadcast the sign byte to all lanes
    movifnidn            hd, r5m
    mov               maskq, r6mp
    call .main
    jmp                  wq
; each .main call yields 16 output pixels in m0/m1; the per-width loops
; below only differ in how those two registers are scattered to dst
.w4_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w4: ; 4 pixels x 4 rows per iteration
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub                  hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w8: ; 8 pixels x 2 rows per iteration
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w16: ; 16 pixels x 2 rows per iteration
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add                dstq, strideq
.w32: ; one 32-pixel row per iteration
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64: ; one 64-pixel row per iteration
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    call .main
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    call .main
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128: ; one 128-pixel row per iteration
    mova       [dstq+16* 0], m0
    mova       [dstq+16* 1], m1
    call .main
    mova       [dstq+16* 2], m0
    mova       [dstq+16* 3], m1
    call .main
    mova       [dstq+16* 4], m0
    mova       [dstq+16* 5], m1
    call .main
    mova       [dstq+16* 6], m0
    mova       [dstq+16* 7], m1
    call .main
    mova       [dstq+16* 8], m0
    mova       [dstq+16* 9], m1
    call .main
    mova       [dstq+16*10], m0
    mova       [dstq+16*11], m1
    call .main
    mova       [dstq+16*12], m0
    mova       [dstq+16*13], m1
    call .main
    mova       [dstq+16*14], m0
    mova       [dstq+16*15], m1
    dec                  hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; blend 16 pixels into m0/m1 and emit 8 horizontally-paired mask bytes
    W_MASK                0, 2
    W_MASK                1, 3
    phaddw               m2, m3 ; sum adjacent mask pairs (4:2:2 subsampling)
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    packuswb             m2, m2
    pxor                 m3, m3
    psubb                m2, m7 ; apply sign adjustment
    pavgb                m2, m3 ; (m0 + m1 - sign + 1) >> 1
    movq            [maskq], m2
    add               maskq, 8
    ret
   8997 
; w_mask_444: same blend as w_mask_422, but the mask is stored at full
; resolution (one byte per pixel, 16 bytes per .main call) and there is
; no sign adjustment. Args: dst, stride, tmp1, tmp2, w, h, mask;
; r8m = pixel_max.
cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_444_ssse3_table
    LEA                  t0, w_mask_444_ssse3_table
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    shr                 r6d, 11 ; 0 for 10bpc (1023), 1 for 12bpc (4095)
    movsxd               wq, [t0+wq*4]
%if ARCH_X86_64
    mova                 m8, [base+pw_27615]
    mova                 m9, [base+pw_64]
    movddup             m10, [base+bidir_rnd+r6*8]
    movddup             m11, [base+bidir_mul+r6*8]
%else
    ; x86-32: keep bidir_mul resident in m7, spill the other constants
    mova                 m1, [base+pw_27615]
    mova                 m2, [base+pw_64]
    movddup              m3, [base+bidir_rnd+r6*8]
    movddup              m7, [base+bidir_mul+r6*8]
    ALLOC_STACK       -16*3
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    mova         [rsp+16*2], m3
    %define             m11  m7
%endif
    add                  wq, t0
    movifnidn            hd, r5m
    mov               maskq, r6mp
    call .main
    jmp                  wq
; each .main call yields 16 output pixels in m0/m1; the per-width loops
; below only differ in how those two registers are scattered to dst
.w4_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w4: ; 4 pixels x 4 rows per iteration
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub                  hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w8: ; 8 pixels x 2 rows per iteration
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w16: ; 16 pixels x 2 rows per iteration
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add                dstq, strideq
.w32: ; one 32-pixel row per iteration
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64: ; one 64-pixel row per iteration
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    call .main
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    call .main
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128: ; one 128-pixel row per iteration
    mova       [dstq+16* 0], m0
    mova       [dstq+16* 1], m1
    call .main
    mova       [dstq+16* 2], m0
    mova       [dstq+16* 3], m1
    call .main
    mova       [dstq+16* 4], m0
    mova       [dstq+16* 5], m1
    call .main
    mova       [dstq+16* 6], m0
    mova       [dstq+16* 7], m1
    call .main
    mova       [dstq+16* 8], m0
    mova       [dstq+16* 9], m1
    call .main
    mova       [dstq+16*10], m0
    mova       [dstq+16*11], m1
    call .main
    mova       [dstq+16*12], m0
    mova       [dstq+16*13], m1
    call .main
    mova       [dstq+16*14], m0
    mova       [dstq+16*15], m1
    dec                  hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; blend 16 pixels into m0/m1 and emit 16 full-resolution mask bytes
    W_MASK                0, 2
    W_MASK                1, 3
    packuswb             m2, m3
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    mova            [maskq], m2
    add               maskq, 16
    ret
   9131 
   9132 ; (a * (64 - m) + b * m + 32) >> 6
   9133 ; = (((b - a) * m + 32) >> 6) + a
   9134 ; = (((b - a) * (m << 9) + 16384) >> 15) + a
   9135 ;   except m << 9 overflows int16_t when m == 64 (which is possible),
   9136 ;   but if we negate m it works out (-64 << 9 == -32768).
   9137 ; = (((a - b) * (m * -512) + 16384) >> 15) + a
; blend: masked blend of tmp into dst, one mask byte (0..64) per pixel,
; using the pmulhrsw formulation derived in the comment above:
; dst += ((dst - tmp) * (m * -512) + 16384) >> 15
cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
%define base r6-blend_ssse3_table
    LEA                  r6, blend_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r6+wq*4]
    movifnidn         maskq, maskmp
    mova                 m7, [base+pw_m512]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    pxor                 m6, m6 ; zero, for byte->word mask expansion
    jmp                  wq
.w4: ; 4 pixels x 4 rows per iteration (16 mask bytes, 2 tmp vectors)
    mova                 m5, [maskq]
    movq                 m0, [dstq+strideq*0]
    movhps               m0, [dstq+strideq*1]
    movq                 m1, [dstq+strideq*2]
    movhps               m1, [dstq+stride3q ]
    psubw                m2, m0, [tmpq+16*0] ; dst - tmp
    psubw                m3, m1, [tmpq+16*1]
    add               maskq, 16
    add                tmpq, 32
    punpcklbw            m4, m5, m6 ; expand mask bytes to words
    punpckhbw            m5, m6
    pmullw               m4, m7     ; m * -512
    pmullw               m5, m7
    pmulhrsw             m2, m4     ; ((dst-tmp) * (m*-512) + 16384) >> 15
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq   [dstq+strideq*2], m1
    movhps [dstq+stride3q ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8: ; same blend, 8 pixels x 2 rows per iteration
    mova                 m5, [maskq]
    mova                 m0, [dstq+strideq*0]
    mova                 m1, [dstq+strideq*1]
    psubw                m2, m0, [tmpq+16*0]
    psubw                m3, m1, [tmpq+16*1]
    add               maskq, 16
    add                tmpq, 32
    punpcklbw            m4, m5, m6
    punpckhbw            m5, m6
    pmullw               m4, m7
    pmullw               m5, m7
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8
    RET
.w16: ; same blend, one 16-pixel row per iteration
    mova                 m5, [maskq]
    mova                 m0, [dstq+16*0]
    mova                 m1, [dstq+16*1]
    psubw                m2, m0, [tmpq+16*0]
    psubw                m3, m1, [tmpq+16*1]
    add               maskq, 16
    add                tmpq, 32
    punpcklbw            m4, m5, m6
    punpckhbw            m5, m6
    pmullw               m4, m7
    pmullw               m5, m7
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w16
    RET
.w32: ; same blend, one 32-pixel row (two 16-pixel halves) per iteration
    mova                 m5, [maskq+16*0]
    mova                 m0, [dstq+16*0]
    mova                 m1, [dstq+16*1]
    psubw                m2, m0, [tmpq+16*0]
    psubw                m3, m1, [tmpq+16*1]
    punpcklbw            m4, m5, m6
    punpckhbw            m5, m6
    pmullw               m4, m7
    pmullw               m5, m7
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova                 m5, [maskq+16*1]
    mova                 m0, [dstq+16*2]
    mova                 m1, [dstq+16*3]
    psubw                m2, m0, [tmpq+16*2]
    psubw                m3, m1, [tmpq+16*3]
    add               maskq, 32
    add                tmpq, 64
    punpcklbw            m4, m5, m6
    punpckhbw            m5, m6
    pmullw               m4, m7
    pmullw               m5, m7
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    add                dstq, strideq
    dec                  hd
    jg .w32
    RET
   9257 
; blend_v: blend tmp into dst along a vertical block edge. The mask
; (from the pre-shifted obmc_masks table, values << 9 for pmulhrsw)
; depends only on the column, so it is loaded once per width and reused
; for every row: dst += ((tmp - dst) * mask + 16384) >> 15.
cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
%define base r5-blend_v_ssse3_table
    LEA                  r5, blend_v_ssse3_table
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  wq
.w2:
    movd                 m4, [base+obmc_masks+2*2] ; w=2 column masks
.w2_loop: ; 2 pixels x 2 rows per iteration
    movd                 m0, [dstq+strideq*0]
    movd                 m2, [tmpq+4*0]
    movd                 m1, [dstq+strideq*1]
    movd                 m3, [tmpq+4*1]
    add                tmpq, 4*2
    psubw                m2, m0 ; tmp - dst
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    movd   [dstq+strideq*0], m0
    movd   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w2_loop
    RET
.w4:
    movddup              m2, [base+obmc_masks+4*2] ; w=4 column masks
.w4_loop: ; 4 pixels x 2 rows per iteration
    movq                 m0, [dstq+strideq*0]
    movhps               m0, [dstq+strideq*1]
    mova                 m1, [tmpq]
    add                tmpq, 8*2
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w4_loop
    RET
.w8:
    mova                 m4, [base+obmc_masks+8*2] ; w=8 column masks
.w8_loop: ; 8 pixels x 2 rows per iteration
    mova                 m0, [dstq+strideq*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+strideq*1]
    mova                 m3, [tmpq+16*1]
    add                tmpq, 16*2
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
.w16:
    mova                 m4, [base+obmc_masks+16*2]
    ; movq zeroes the upper 4 mask words, so the rightmost pixels of the
    ; 16-wide row get a zero mask and are left unchanged
    movq                 m5, [base+obmc_masks+16*3]
.w16_loop: ; one 16-pixel row per iteration
    mova                 m0, [dstq+16*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+16*1]
    mova                 m3, [tmpq+16*1]
    add                tmpq, 16*2
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w16_loop
    RET
.w32:
%if WIN64
    ; xmm6 is callee-saved on Win64 and the prologue only declared 6
    ; xmm registers, so save/restore it manually
    movaps          [rsp+8], m6
%endif
    mova                 m4, [base+obmc_masks+16*4]
    mova                 m5, [base+obmc_masks+16*5]
    mova                 m6, [base+obmc_masks+16*6]
.w32_loop: ; only the leftmost 24 of 32 columns are stored/blended
    mova                 m0, [dstq+16*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+16*1]
    mova                 m3, [tmpq+16*1]
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    mova                 m2, [dstq+16*2]
    paddw                m1, m3
    mova                 m3, [tmpq+16*2]
    add                tmpq, 16*4
    psubw                m3, m2
    pmulhrsw             m3, m6
    paddw                m2, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    add                dstq, strideq
    dec                  hd
    jg .w32_loop
%if WIN64
    movaps               m6, [rsp+8]
%endif
    RET
   9377 
; BLEND_H_ROW: blend one 16-pixel chunk of a row for blend_h:
; dst += ((tmp - dst) * mask + 16384) >> 15, with the row's broadcast
; mask in m5.
; %1 = dst offset (16-byte units), %2 = tmp offset (may be negative,
; relative to an already-advanced tmpq), %3 = if nonzero, advance tmpq
; by 16*%3 after loading. Clobbers m0-m3.
%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
    mova                 m0, [dstq+16*(%1+0)]
    mova                 m2, [tmpq+16*(%2+0)]
    mova                 m1, [dstq+16*(%1+1)]
    mova                 m3, [tmpq+16*(%2+1)]
%if %3
    add                tmpq, 16*%3
%endif
    psubw                m2, m0 ; tmp - dst
    psubw                m3, m1
    pmulhrsw             m2, m5
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova   [dstq+16*(%1+0)], m0
    mova   [dstq+16*(%1+1)], m1
%endmacro
   9395 
; blend_h: blend tmp into dst along a horizontal block edge. The mask
; (from obmc_masks, pre-shifted << 9) depends only on the row; only the
; first 3/4 of the rows are processed. h is negated and used as an
; upward loop counter, with maskq pre-offset so [maskq+hq*2] walks the
; per-row mask table.
cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
%define base r6-blend_h_ssse3_table
    LEA                  r6, blend_h_ssse3_table
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    movddup              m4, [base+blend_shuf]
    lea               maskq, [base+obmc_masks+hq*2]
    lea                  hd, [hq*3]
    add                  wq, r6
    shr                  hd, 2 ; h * 3/4
    lea               maskq, [maskq+hq*2]
    neg                  hq
    jmp                  wq
.w2: ; 2 pixels x 2 rows per iteration
    movd                 m0, [dstq+dsq*0]
    movd                 m2, [dstq+dsq*1]
    movd                 m3, [maskq+hq*2] ; two row masks
    movq                 m1, [tmpq]
    add                tmpq, 4*2
    punpckldq            m0, m2
    punpcklwd            m3, m3 ; duplicate each row mask across its 2 pixels
    psubw                m1, m0
    pmulhrsw             m1, m3
    paddw                m0, m1
    movd       [dstq+dsq*0], m0
    psrlq                m0, 32
    movd       [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w2
    RET
.w4:
    mova                 m3, [base+blend_shuf]
.w4_loop: ; 4 pixels x 2 rows per iteration
    movq                 m0, [dstq+dsq*0]
    movhps               m0, [dstq+dsq*1]
    movd                 m2, [maskq+hq*2]
    mova                 m1, [tmpq]
    add                tmpq, 8*2
    psubw                m1, m0
    pshufb               m2, m3 ; broadcast each row mask across its 4 pixels
    pmulhrsw             m1, m2
    paddw                m0, m1
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w4_loop
    RET
.w8:
    movddup              m5, [base+blend_shuf+8]
%if WIN64
    ; xmm6/xmm7 are callee-saved on Win64 and not covered by the 6-reg
    ; prologue, so save/restore them manually
    movaps         [rsp+ 8], m6
    movaps         [rsp+24], m7
%endif
.w8_loop: ; 8 pixels x 2 rows per iteration
    movd                 m7, [maskq+hq*2]
    mova                 m0, [dstq+dsq*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+dsq*1]
    mova                 m3, [tmpq+16*1]
    add                tmpq, 16*2
    pshufb               m6, m7, m4 ; broadcast row 0 mask
    psubw                m2, m0
    pshufb               m7, m5     ; broadcast row 1 mask
    psubw                m3, m1
    pmulhrsw             m2, m6
    pmulhrsw             m3, m7
    paddw                m0, m2
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w8_loop
%if WIN64
    movaps               m6, [rsp+ 8]
    movaps               m7, [rsp+24]
%endif
    RET
; for w >= 16 the row mask is broadcast into m5 once per row and
; BLEND_H_ROW handles each 16-pixel chunk
.w16:
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4
    BLEND_H_ROW           0, 0, 2
    add                dstq, dsq
    inc                  hq
    jl .w16
    RET
.w32:
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4
    BLEND_H_ROW           0, 0
    BLEND_H_ROW           2, 2, 4
    add                dstq, dsq
    inc                  hq
    jl .w32
    RET
.w64:
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4
    BLEND_H_ROW           0, 0
    BLEND_H_ROW           2, 2
    BLEND_H_ROW           4, 4
    BLEND_H_ROW           6, 6, 8
    add                dstq, dsq
    inc                  hq
    jl .w64
    RET
.w128:
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4
    ; tmpq is advanced by a full row (16*16) mid-way, so the second half
    ; of the row uses negative tmp offsets relative to the new tmpq
    BLEND_H_ROW           0,  0
    BLEND_H_ROW           2,  2
    BLEND_H_ROW           4,  4
    BLEND_H_ROW           6,  6, 16
    BLEND_H_ROW           8, -8
    BLEND_H_ROW          10, -6
    BLEND_H_ROW          12, -4
    BLEND_H_ROW          14, -2
    add                dstq, dsq
    inc                  hq
    jl .w128
    RET
   9520 
   9521 ; emu_edge args:
   9522 ; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
   9523 ; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
   9524 ; const pixel *ref, const ptrdiff_t ref_stride
   9525 ;
; bw, bh:  total size of the block to fill
; iw, ih:  size of the copied source block -> the remainder is filled
;          along the bottom and right edges
; x, y:    offset of the source block within bw/bh -> the remainder is
;          filled along the top and left edges
   9529 cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
   9530                             y, dst, dstride, src, sstride, \
   9531                             bottomext, rightext, blk
   9532    ; we assume that the buffer (stride) is larger than width, so we can
   9533    ; safely overwrite by a few bytes
   9534 
   9535 %if ARCH_X86_64
   9536 %define reg_zero       r12q
   9537 %define reg_tmp        r10
   9538 %define reg_src        srcq
   9539 %define reg_bottomext  bottomextq
   9540 %define reg_rightext   rightextq
   9541 %define reg_blkm       r9m
   9542 %else
   9543 %define reg_zero       r6
   9544 %define reg_tmp        r0
   9545 %define reg_src        r1
   9546 %define reg_bottomext  r0
   9547 %define reg_rightext   r1
   9548 %define reg_blkm       r2m
   9549 %endif
   9550    ;
   9551    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
   9552    xor            reg_zero, reg_zero
   9553    lea             reg_tmp, [ihq-1]
   9554    cmp                  yq, ihq
   9555    cmovs           reg_tmp, yq
   9556    test                 yq, yq
   9557    cmovs           reg_tmp, reg_zero
   9558 %if ARCH_X86_64
   9559    imul            reg_tmp, sstrideq
   9560    add                srcq, reg_tmp
   9561 %else
   9562    imul            reg_tmp, sstridem
   9563    mov             reg_src, srcm
   9564    add             reg_src, reg_tmp
   9565 %endif
   9566    ;
   9567    ; ref += iclip(x, 0, iw - 1)
   9568    lea             reg_tmp, [iwq-1]
   9569    cmp                  xq, iwq
   9570    cmovs           reg_tmp, xq
   9571    test                 xq, xq
   9572    cmovs           reg_tmp, reg_zero
   9573    lea             reg_src, [reg_src+reg_tmp*2]
   9574 %if ARCH_X86_32
   9575    mov                srcm, reg_src
   9576 %endif
   9577    ;
   9578    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
   9579 %if ARCH_X86_32
   9580    mov                  r1, r1m ; restore bh
   9581 %endif
   9582    lea       reg_bottomext, [yq+bhq]
   9583    sub       reg_bottomext, ihq
   9584    lea                  r3, [bhq-1]
   9585    cmovs     reg_bottomext, reg_zero
   9586    ;
   9587 
   9588    DEFINE_ARGS bw, bh, iw, ih, x, \
   9589                topext, dst, dstride, src, sstride, \
   9590                bottomext, rightext, blk
   9591 
   9592    ; top_ext = iclip(-y, 0, bh - 1)
   9593    neg             topextq
   9594    cmovs           topextq, reg_zero
   9595    cmp       reg_bottomext, bhq
   9596    cmovns    reg_bottomext, r3
   9597    cmp             topextq, bhq
   9598    cmovg           topextq, r3
   9599 %if ARCH_X86_32
   9600    mov                 r4m, reg_bottomext
   9601    ;
   9602    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
   9603    mov                  r0, r0m ; restore bw
   9604 %endif
   9605    lea        reg_rightext, [xq+bwq]
   9606    sub        reg_rightext, iwq
   9607    lea                  r2, [bwq-1]
   9608    cmovs      reg_rightext, reg_zero
   9609 
   9610    DEFINE_ARGS bw, bh, iw, ih, leftext, \
   9611                topext, dst, dstride, src, sstride, \
   9612                bottomext, rightext, blk
   9613 
   9614    ; left_ext = iclip(-x, 0, bw - 1)
   9615    neg            leftextq
   9616    cmovs          leftextq, reg_zero
   9617    cmp        reg_rightext, bwq
   9618    cmovns     reg_rightext, r2
   9619 %if ARCH_X86_32
   9620    mov                 r3m, r1
   9621 %endif
   9622    cmp            leftextq, bwq
   9623    cmovns         leftextq, r2
   9624 
   9625 %undef reg_zero
   9626 %undef reg_tmp
   9627 %undef reg_src
   9628 %undef reg_bottomext
   9629 %undef reg_rightext
   9630 
   9631    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
   9632                topext, dst, dstride, src, sstride, \
   9633                bottomext, rightext, blk
   9634 
   9635    ; center_h = bh - top_ext - bottom_ext
   9636 %if ARCH_X86_64
   9637    lea                  r3, [bottomextq+topextq]
   9638    sub            centerhq, r3
   9639 %else
   9640    mov                   r1, centerhm ; restore r1
   9641    sub             centerhq, topextq
   9642    sub             centerhq, r4m
   9643    mov                  r1m, centerhq
   9644 %endif
   9645    ;
   9646    ; blk += top_ext * PXSTRIDE(dst_stride)
   9647    mov                  r2, topextq
   9648 %if ARCH_X86_64
   9649    imul                 r2, dstrideq
   9650 %else
   9651    mov                  r6, r6m ; restore dstq
   9652    imul                 r2, dstridem
   9653 %endif
   9654    add                dstq, r2
   9655    mov            reg_blkm, dstq ; save pointer for ext
   9656    ;
   9657    ; center_w = bw - left_ext - right_ext
   9658    mov            centerwq, bwq
   9659 %if ARCH_X86_64
   9660    lea                  r3, [rightextq+leftextq]
   9661    sub            centerwq, r3
   9662 %else
   9663    sub            centerwq, r3m
   9664    sub            centerwq, leftextq
   9665 %endif
   9666 
   9667 ; vloop Macro
; vloop Macro
; Emit one variant of the per-row horizontal edge-extension loop.
;   %1 = need_left_ext:  replicate the leftmost pixel into the left margin
;   %2 = need_right_ext: replicate the rightmost pixel into the right margin
;   %3 = label suffix to keep each instantiation's local labels unique
; Pixels are 16-bit, so all address offsets are scaled by 2 and each store
; covers mmsize/2 pixels. Iterates centerh rows, stepping dst/src by their
; strides after each row. On x86-32 several values (src, strides, counters)
; live in stack slots (r*m) instead of registers and are reloaded per row.
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
 %if ARCH_X86_64
   %define reg_tmp        r12
 %else
   %define reg_tmp        r0  ; 32-bit is register-starved; r0 reloaded below
 %endif
.v_loop_%3:
 %if ARCH_X86_32
   mov                  r0, r0m ; restore spilled registers for this row
   mov                  r1, r1m
 %endif
%if %1
   ; left extension: broadcast the first source pixel across the register
 %if ARCH_X86_64
   movd                 m0, [srcq]
 %else
   mov                  r3, srcm
   movd                 m0, [r3]
 %endif
   pshuflw              m0, m0, q0000   ; replicate word 0 in low half
   punpcklqdq           m0, m0          ; duplicate low half -> full vector
   xor                  r3, r3
.left_loop_%3:
   mova        [dstq+r3*2], m0
   add                  r3, mmsize/2
   cmp                  r3, leftextq
   jl .left_loop_%3
   ; body
   lea             reg_tmp, [dstq+leftextq*2] ; body output starts after left margin
%endif
   xor                  r3, r3
.body_loop_%3:
 %if ARCH_X86_64
   movu                 m0, [srcq+r3*2]
 %else
   mov                  r1, srcm
   movu                 m0, [r1+r3*2]
 %endif
%if %1
   movu     [reg_tmp+r3*2], m0
%else
   movu        [dstq+r3*2], m0
%endif
   add                  r3, mmsize/2
   cmp                  r3, centerwq
   jl .body_loop_%3
%if %2
   ; right extension: broadcast the last body pixel src[centerw-1]
%if %1
   lea             reg_tmp, [reg_tmp+centerwq*2]
%else
   lea             reg_tmp, [dstq+centerwq*2]
%endif
 %if ARCH_X86_64
   movd                 m0, [srcq+centerwq*2-2]
 %else
   mov                  r3, srcm
   movd                 m0, [r3+centerwq*2-2]
 %endif
   pshuflw              m0, m0, q0000
   punpcklqdq           m0, m0
   xor                  r3, r3
.right_loop_%3:
   movu     [reg_tmp+r3*2], m0
   add                  r3, mmsize/2
 %if ARCH_X86_64
   cmp                  r3, rightextq
 %else
   cmp                  r3, r3m ; right-extension width was spilled to r3m
 %endif
   jl .right_loop_%3
%endif
 %if ARCH_X86_64
   ; advance to the next row and loop over centerh rows
   add                dstq, dstrideq
   add                srcq, sstrideq
   dec            centerhq
   jg .v_loop_%3
 %else
   add                dstq, dstridem
   mov                  r0, sstridem
   add                srcm, r0        ; src pointer is kept in memory on 32-bit
   sub       dword centerhm, 1
   jg .v_loop_%3
   mov                  r0, r0m ; restore r0
 %endif
%endmacro ; vloop MACRO
   9754 
   9755    test           leftextq, leftextq
   9756    jnz .need_left_ext
   9757 %if ARCH_X86_64
   9758    test          rightextq, rightextq
   9759    jnz .need_right_ext
   9760 %else
   9761    cmp            leftextq, r3m ; leftextq == 0
   9762    jne .need_right_ext
   9763 %endif
   9764    v_loop                0, 0, 0
   9765    jmp .body_done
   9766 
   9767    ;left right extensions
   9768 .need_left_ext:
   9769 %if ARCH_X86_64
   9770    test          rightextq, rightextq
   9771 %else
   9772    mov                  r3, r3m
   9773    test                 r3, r3
   9774 %endif
   9775    jnz .need_left_right_ext
   9776    v_loop                1, 0, 1
   9777    jmp .body_done
   9778 
   9779 .need_left_right_ext:
   9780    v_loop                1, 1, 2
   9781    jmp .body_done
   9782 
   9783 .need_right_ext:
   9784    v_loop                0, 1, 3
   9785 
   9786 .body_done:
   9787 ; r0 ; bw
   9788 ; r1 ;; x loop
   9789 ; r4 ;; y loop
   9790 ; r5 ; topextq
   9791 ; r6 ;dstq
   9792 ; r7 ;dstrideq
   9793 ; r8 ; srcq
   9794 %if ARCH_X86_64
   9795 %define reg_dstride    dstrideq
   9796 %else
   9797 %define reg_dstride    r2
   9798 %endif
   9799    ;
   9800    ; bottom edge extension
   9801 %if ARCH_X86_64
   9802    test         bottomextq, bottomextq
   9803    jz .top
   9804 %else
   9805    xor                  r1, r1
   9806    cmp                  r1, r4m
   9807    je .top
   9808 %endif
   9809    ;
   9810 %if ARCH_X86_64
   9811    mov                srcq, dstq
   9812    sub                srcq, dstrideq
   9813    xor                  r1, r1
   9814 %else
   9815    mov                  r3, dstq
   9816    mov         reg_dstride, dstridem
   9817    sub                  r3, reg_dstride
   9818    mov                srcm, r3
   9819 %endif
   9820    ;
   9821 .bottom_x_loop:
   9822 %if ARCH_X86_64
   9823    mova                 m0, [srcq+r1*2]
   9824    lea                  r3, [dstq+r1*2]
   9825    mov                  r4, bottomextq
   9826 %else
   9827    mov                  r3, srcm
   9828    mova                 m0, [r3+r1*2]
   9829    lea                  r3, [dstq+r1*2]
   9830    mov                  r4, r4m
   9831 %endif
   9832    ;
   9833 .bottom_y_loop:
   9834    mova               [r3], m0
   9835    add                  r3, reg_dstride
   9836    dec                  r4
   9837    jg .bottom_y_loop
   9838    add                  r1, mmsize/2
   9839    cmp                  r1, bwq
   9840    jl .bottom_x_loop
   9841 
   9842 .top:
   9843    ; top edge extension
   9844    test            topextq, topextq
   9845    jz .end
   9846 %if ARCH_X86_64
   9847    mov                srcq, reg_blkm
   9848 %else
   9849    mov                  r3, reg_blkm
   9850    mov         reg_dstride, dstridem
   9851 %endif
   9852    mov                dstq, dstm
   9853    xor                  r1, r1
   9854    ;
   9855 .top_x_loop:
   9856 %if ARCH_X86_64
   9857    mova                 m0, [srcq+r1*2]
   9858 %else
   9859    mov                  r3, reg_blkm
   9860    mova                 m0, [r3+r1*2]
   9861 %endif
   9862    lea                  r3, [dstq+r1*2]
   9863    mov                  r4, topextq
   9864    ;
   9865 .top_y_loop:
   9866    mova               [r3], m0
   9867    add                  r3, reg_dstride
   9868    dec                  r4
   9869    jg .top_y_loop
   9870    add                  r1, mmsize/2
   9871    cmp                  r1, bwq
   9872    jl .top_x_loop
   9873 
   9874 .end:
   9875    RET
   9876 
   9877 %undef reg_dstride
   9878 %undef reg_blkm
   9879 %undef reg_tmp
   9880 
; Free up an xmm register while keeping its value addressable as m%2.
;   %1 = current register number, %2 = alias number, %3 = stack slot index
; On x86-32 only 8 xmm registers exist, so the value is spilled to the
; stack and m%2 is redefined as that memory operand; on x86-64 the
; registers are simply renumbered via SWAP and no store is needed.
%macro SCRATCH 3
%if ARCH_X86_32
   mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
   SWAP             %1, %2
%endif
%endmacro
   9889 
; resize_16bpc: horizontal resampling of 16-bit pixels.
; Args: dst, dst_stride, src, src_stride, dst_w, h, src_w,
;       dx  = per-output-pixel source step (14-bit fixed point),
;       mx0 = starting source position (14-bit fixed point),
;       pxmax = bitdepth clamp value.
; For each output pixel an 8-tap filter is picked from resize_filter by the
; fractional position (64 phases, see pd_63 mask); source x is clamped to
; [0, src_w-8] and out-of-range loads are fixed up via resize_shuf.
; Four output pixels are produced per .loop_x iteration.
%if ARCH_X86_64
cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
                                      dst_w, h, src_w, dx, mx0, pxmax
%elif STACK_ALIGNMENT >= 16
cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
                                    dst_w, h, src_w, dx, mx0, pxmax
%else
cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
                                    dst_w, h, src_w, dx, mx0, pxmax
%endif
   movifnidn         dstq, dstmp
   movifnidn         srcq, srcmp
%if STACK_ALIGNMENT >= 16
   movifnidn       dst_wd, dst_wm
%endif
%if ARCH_X86_64
   movifnidn           hd, hm
%endif
   ; pre-bias the fixed-point start position (NOTE(review): presumably
   ; centers the 8-tap window — confirm against the C reference)
   sub         dword mx0m, 4<<14
   sub       dword src_wm, 8          ; src_w-8 = rightmost valid window start
   movd                m4, pxmaxm
   movd                m7, dxm
   movd                m6, mx0m
   movd                m5, src_wm
   punpcklwd           m4, m4          ; broadcast pxmax to all 8 words
   pshufd              m4, m4, q0000
   pshufd              m7, m7, q0000   ; broadcast dx
   pshufd              m6, m6, q0000   ; broadcast mx0
   pshufd              m5, m5, q0000   ; broadcast src_w-8
   mova [rsp+16*3*ARCH_X86_32], m4     ; save pxmax vector (slot 3 on x86-32)
%if ARCH_X86_64
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
   LEA                 r7, $$
%define base r7-$$
%else
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
%define hd dword r5m
%if STACK_ALIGNMENT >= 16
   LEA                 r6, $$
 %define base r6-$$
%else
   LEA                 r4, $$
 %define base r4-$$
%endif
%endif
%if ARCH_X86_64
   mova               m12, [base+pd_64]
   mova               m11, [base+pd_63]
%else
%define m12 [base+pd_64]
%define m11 [base+pd_63]
%endif
   pmaddwd             m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
   pslld               m7, 2                      ; dx*4
   pslld               m5, 14                     ; (src_w-8)<<14: max position
   paddd               m6, m4                     ; mx+[0..3]*dx
   SCRATCH              7, 15, 0                  ; m15 = dx*4 (x step)
   SCRATCH              6, 14, 1                  ; m14 = initial mx vector
   SCRATCH              5, 13, 2                  ; m13 = position clamp limit
   pxor                m1, m1                     ; m1 = 0, needed by pcmpgtd below
.loop_y:
   xor                 xd, xd
   mova                m0, m14            ; per-line working version of mx
.loop_x:
   pcmpgtd             m1, m0             ; m1 = (0 > mx) mask
   pandn               m1, m0             ; clamp mx to >= 0
   psrad               m2, m0, 8          ; filter offset (unmasked)
   pcmpgtd             m3, m13, m1        ; clamp mx to <= (src_w-8)<<14 ...
   pand                m1, m3
   pandn               m3, m13
   por                 m1, m3             ; ... via mask/merge
   psubd               m3, m0, m1         ; pshufb offset
   psrad               m1, 14             ; clipped src_x offset
   psrad               m3, 14             ; pshufb edge_emu offset
   pand                m2, m11            ; filter offset (masked)
   ; load source pixels
%if ARCH_X86_64
   movd               r8d, m1             ; extract the 4 clipped src_x values
   pshuflw             m1, m1, q3232
   movd               r9d, m1
   punpckhqdq          m1, m1
   movd              r10d, m1
   psrlq               m1, 32
   movd              r11d, m1
   movu                m4, [srcq+r8*2]    ; 8 source pixels per output pixel
   movu                m5, [srcq+r9*2]
   movu                m6, [srcq+r10*2]
   movu                m7, [srcq+r11*2]
   ; if no emulation is required, we don't need to shuffle or emulate edges
   packssdw            m3, m3
   movq               r11, m3
   test               r11, r11
   jz .filter
   movsx               r8, r11w           ; per-lane signed edge offsets
   sar                r11, 16
   movsx               r9, r11w
   sar                r11, 16
   movsx              r10, r11w
   sar                r11, 16
   movu                m1, [base+resize_shuf+8+r8*2]
   movu                m3, [base+resize_shuf+8+r9*2]
   movu                m8, [base+resize_shuf+8+r10*2]
   movu                m9, [base+resize_shuf+8+r11*2]
   pshufb              m4, m1             ; replicate edge pixels into the window
   pshufb              m5, m3
   pshufb              m6, m8
   pshufb              m7, m9
.filter:
   movd               r8d, m2             ; extract the 4 filter-phase indices
   pshuflw             m2, m2, q3232
   movd               r9d, m2
   punpckhqdq          m2, m2
   movd              r10d, m2
   psrlq               m2, 32
   movd              r11d, m2
   movq                m8, [base+resize_filter+r8*8] ; 8 signed 8-bit taps
   movq                m2, [base+resize_filter+r9*8]
   pxor                m9, m9
   punpcklbw           m1, m9, m8         ; bytes into high halves of words ...
   punpcklbw           m3, m9, m2
   psraw               m1, 8              ; ... then arithmetic shift: sign-extend
   psraw               m3, 8
   movq               m10, [base+resize_filter+r10*8]
   movq                m2, [base+resize_filter+r11*8]
   punpcklbw           m8, m9, m10
   punpcklbw           m9, m2
   psraw               m8, 8
   psraw               m9, 8
   pmaddwd             m4, m1             ; pixel*tap products, pairwise summed
   pmaddwd             m5, m3
   pmaddwd             m6, m8
   pmaddwd             m7, m9
   phaddd              m4, m5
%else
   ; x86-32 path: same algorithm, two lanes at a time, with m6/m7 spilled
   ; to the stack while the filter coefficients are prepared
   movd                r3, m1
   pshuflw             m1, m1, q3232
   movd                r1, m1
   punpckhqdq          m1, m1
   movu                m4, [srcq+r3*2]
   movu                m5, [srcq+r1*2]
   movd                r3, m1
   psrlq               m1, 32
   movd                r1, m1
   movu                m6, [srcq+r3*2]
   movu                m7, [srcq+r1*2]
   ; if no emulation is required, we don't need to shuffle or emulate edges
   pxor                m1, m1
   pcmpeqb             m1, m3
   pmovmskb           r3d, m1
   cmp                r3d, 0xffff        ; all edge offsets zero?
   je .filter
   movd                r3, m3
   movu                m1, [base+resize_shuf+8+r3*2]
   pshuflw             m3, m3, q3232
   movd                r1, m3
   pshufb              m4, m1
   movu                m1, [base+resize_shuf+8+r1*2]
   punpckhqdq          m3, m3
   movd                r3, m3
   pshufb              m5, m1
   movu                m1, [base+resize_shuf+8+r3*2]
   psrlq               m3, 32
   movd                r1, m3
   pshufb              m6, m1
   movu                m1, [base+resize_shuf+8+r1*2]
   pshufb              m7, m1
.filter:
   mova        [esp+4*16], m6            ; spill lanes 2/3 pixels
   mova        [esp+5*16], m7
   movd                r3, m2
   pshuflw             m2, m2, q3232
   movd                r1, m2
   movq                m6, [base+resize_filter+r3*8]
   movq                m7, [base+resize_filter+r1*8]
   pxor                m3, m3
   punpcklbw           m1, m3, m6         ; sign-extend 8-bit taps to words
   punpcklbw           m3, m7
   psraw               m1, 8
   psraw               m3, 8
   pmaddwd             m4, m1
   pmaddwd             m5, m3
   punpckhqdq          m2, m2
   movd                r3, m2
   psrlq               m2, 32
   movd                r1, m2
   phaddd              m4, m5
   movq                m2, [base+resize_filter+r3*8]
   movq                m5, [base+resize_filter+r1*8]
   mova                m6, [esp+4*16]     ; reload spilled pixels
   mova                m7, [esp+5*16]
   pxor                m3, m3
   punpcklbw           m1, m3, m2
   punpcklbw           m3, m5
   psraw               m1, 8
   psraw               m3, 8
   pmaddwd             m6, m1
   pmaddwd             m7, m3
%endif
   phaddd              m6, m7
   phaddd              m4, m6             ; one 32-bit sum per output pixel
   pxor                m1, m1             ; re-zero m1 for the next iteration
   psubd               m2, m12, m4        ; (64 - sum) ...
   psrad               m2, 7              ; ... >> 7: descale with rounding bias
   packssdw            m2, m2
   pmaxsw              m2, m1             ; clamp to [0, pxmax]
   pminsw              m2, [rsp+16*3*ARCH_X86_32]
   movq       [dstq+xq*2], m2             ; store 4 output pixels
   paddd               m0, m15            ; mx += dx*4
   add                 xd, 4
%if STACK_ALIGNMENT >= 16
   cmp                 xd, dst_wd
%else
   cmp                 xd, dst_wm
%endif
   jl .loop_x
   add               dstq, dst_stridemp
   add               srcq, src_stridemp
   dec                 hd
   jg .loop_y
   RET