tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mc_sse.asm (333824B)


      1 ; Copyright © 2018, VideoLAN and dav1d authors
      2 ; Copyright © 2018, Two Orioles, LLC
      3 ; Copyright © 2018, VideoLabs
      4 ; All rights reserved.
      5 ;
      6 ; Redistribution and use in source and binary forms, with or without
      7 ; modification, are permitted provided that the following conditions are met:
      8 ;
      9 ; 1. Redistributions of source code must retain the above copyright notice, this
     10 ;    list of conditions and the following disclaimer.
     11 ;
     12 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     13 ;    this list of conditions and the following disclaimer in the documentation
     14 ;    and/or other materials provided with the distribution.
     15 ;
     16 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 
     27 %include "config.asm"
     28 %include "ext/x86/x86inc.asm"
     29 
     30 SECTION_RODATA 16
     31 
     32 ; dav1d_obmc_masks[] with 64-x interleaved
        ; Each stored byte pair sums to 64 (a mask value interleaved with its
        ; 64-complement), so a single pmaddubsw can blend two source rows.
        ; The "; N @OFF" comments give the block size N and the byte offset
        ; OFF at which that size's mask pairs begin.
     33 obmc_masks: db  0,  0,  0,  0
     34            ; 2 @4  (size-2 mask pairs start at byte offset 4)
     35            db 45, 19, 64,  0
     36            ; 4 @8
     37            db 39, 25, 50, 14, 59,  5, 64,  0
     38            ; 8 @16
     39            db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
     40            ; 16 @32
     41            db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
     42            db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
     43            ; 32 @64
     44            db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
     45            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
     46            db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
     47 
     48 warp_8x8_shufA: db 0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
     49 warp_8x8_shufB: db 4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
     50 warp_8x8_shufC: db 2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
     51 warp_8x8_shufD: db 6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
        ; pshufb control vectors: each gathers overlapping source-byte
        ; windows so that pmaddubsw can apply adjacent filter taps in
        ; parallel.  NOTE(review): consumers are the bilin/8tap kernels in
        ; this file — names indicate intended use; verify against each user.
     52 blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
     53 subpel_h_shuf4: db 0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
     54                db 2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
     55 subpel_h_shufA: db 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
     56 subpel_h_shufB: db 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
     57 subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
     58 subpel_h_shufD: db 0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
     59 subpel_h_shufE: db 2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
     60 subpel_h_shufF: db 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12
     61 subpel_s_shuf2: db 0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
     62 subpel_s_shuf8: db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
     63 bilin_h_shuf4:  db 0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
     64 unpckw:         db 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
        ; rescale_mul: dword lane indices {0,1,2,3}; resize_shuf clamps
        ; out-of-range taps to the edge pixels (0 and 7 replicated).
     65 rescale_mul:    dd 0,  1,  2,  3
     66 resize_shuf:    db 0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7
     67 
     68 wm_420_sign:    times 4 dw 258 ; NOTE(review): per name, sign constants for w_mask_420 — verify in w_mask code
     69                times 4 dw 257
     70 wm_422_sign:    times 8 db 128 ; NOTE(review): per name, sign constants for w_mask_422
     71                times 8 db 127
     72 
     73 pb_8x0_8x8: times 8 db 0 ; 8 bytes of 0 followed by 8 bytes of 8
     74            times 8 db 8
     75 bdct_lb_dw: times 4 db 0 ; per-dword base byte offsets {0,4,8,12}, 4x each
     76            times 4 db 4
     77            times 4 db 8
     78            times 4 db 12
     79 
     80 pb_64:    times 16 db 64
     81 pw_m256:  times 8 dw -256
     82 pw_1:     times 8 dw 1
     83 pw_2:     times 8 dw 2
     84 pw_8:     times 8 dw 8
     85 pw_15:    times 8 dw 15
     86 pw_26:    times 8 dw 26
     87 pw_34:    times 8 dw 34
     88 pw_512:   times 8 dw 512
     89 pw_1024:  times 8 dw 1024
     90 pw_2048:  times 8 dw 2048
     91 pw_6903:  times 8 dw 6903
     92 pw_8192:  times 8 dw 8192
     93 pd_32:    times 4 dd 32
     94 pd_63:    times 4 dd 63
     95 pd_512:   times 4 dd 512
        ; FIX: label pd_16384 must hold 16384 (1<<14); the value previously
        ; read 16484 — a typo that would silently corrupt every rounding /
        ; offset computation loading this constant.
     96 pd_16384: times 4 dd 16384
     97 pd_32768: times 4 dd 32768
     98 pd_262144:times 4 dd 262144
     99 pd_0x3ff: times 4 dd 0x3ff
    100 pd_0x4000:times 4 dd 0x4000
    101 pq_0x40000000: times 2 dq 0x40000000
    102 
    103 const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage
        ; Three sections of 64 phases each (two 8-coefficient entries per db
        ; line), covering filter fractions in [-1,0), [0,1) and [1,2).
        ; Coefficients are signed bytes arranged so adjacent taps form the
        ; byte pairs that pmaddubsw multiplies and sums.
    104    ; [-1, 0)
    105    db 0, 127,   0, 0,   0,   1, 0, 0, 0, 127,   0, 0,  -1,   2, 0, 0
    106    db 1, 127,  -1, 0,  -3,   4, 0, 0, 1, 126,  -2, 0,  -4,   6, 1, 0
    107    db 1, 126,  -3, 0,  -5,   8, 1, 0, 1, 125,  -4, 0,  -6,  11, 1, 0
    108    db 1, 124,  -4, 0,  -7,  13, 1, 0, 2, 123,  -5, 0,  -8,  15, 1, 0
    109    db 2, 122,  -6, 0,  -9,  18, 1, 0, 2, 121,  -6, 0, -10,  20, 1, 0
    110    db 2, 120,  -7, 0, -11,  22, 2, 0, 2, 119,  -8, 0, -12,  25, 2, 0
    111    db 3, 117,  -8, 0, -13,  27, 2, 0, 3, 116,  -9, 0, -13,  29, 2, 0
    112    db 3, 114, -10, 0, -14,  32, 3, 0, 3, 113, -10, 0, -15,  35, 2, 0
    113    db 3, 111, -11, 0, -15,  37, 3, 0, 3, 109, -11, 0, -16,  40, 3, 0
    114    db 3, 108, -12, 0, -16,  42, 3, 0, 4, 106, -13, 0, -17,  45, 3, 0
    115    db 4, 104, -13, 0, -17,  47, 3, 0, 4, 102, -14, 0, -17,  50, 3, 0
    116    db 4, 100, -14, 0, -17,  52, 3, 0, 4,  98, -15, 0, -18,  55, 4, 0
    117    db 4,  96, -15, 0, -18,  58, 3, 0, 4,  94, -16, 0, -18,  60, 4, 0
    118    db 4,  91, -16, 0, -18,  63, 4, 0, 4,  89, -16, 0, -18,  65, 4, 0
    119    db 4,  87, -17, 0, -18,  68, 4, 0, 4,  85, -17, 0, -18,  70, 4, 0
    120    db 4,  82, -17, 0, -18,  73, 4, 0, 4,  80, -17, 0, -18,  75, 4, 0
    121    db 4,  78, -18, 0, -18,  78, 4, 0, 4,  75, -18, 0, -17,  80, 4, 0
    122    db 4,  73, -18, 0, -17,  82, 4, 0, 4,  70, -18, 0, -17,  85, 4, 0
    123    db 4,  68, -18, 0, -17,  87, 4, 0, 4,  65, -18, 0, -16,  89, 4, 0
    124    db 4,  63, -18, 0, -16,  91, 4, 0, 4,  60, -18, 0, -16,  94, 4, 0
    125    db 3,  58, -18, 0, -15,  96, 4, 0, 4,  55, -18, 0, -15,  98, 4, 0
    126    db 3,  52, -17, 0, -14, 100, 4, 0, 3,  50, -17, 0, -14, 102, 4, 0
    127    db 3,  47, -17, 0, -13, 104, 4, 0, 3,  45, -17, 0, -13, 106, 4, 0
    128    db 3,  42, -16, 0, -12, 108, 3, 0, 3,  40, -16, 0, -11, 109, 3, 0
    129    db 3,  37, -15, 0, -11, 111, 3, 0, 2,  35, -15, 0, -10, 113, 3, 0
    130    db 3,  32, -14, 0, -10, 114, 3, 0, 2,  29, -13, 0,  -9, 116, 3, 0
    131    db 2,  27, -13, 0,  -8, 117, 3, 0, 2,  25, -12, 0,  -8, 119, 2, 0
    132    db 2,  22, -11, 0,  -7, 120, 2, 0, 1,  20, -10, 0,  -6, 121, 2, 0
    133    db 1,  18,  -9, 0,  -6, 122, 2, 0, 1,  15,  -8, 0,  -5, 123, 2, 0
    134    db 1,  13,  -7, 0,  -4, 124, 1, 0, 1,  11,  -6, 0,  -4, 125, 1, 0
    135    db 1,   8,  -5, 0,  -3, 126, 1, 0, 1,   6,  -4, 0,  -2, 126, 1, 0
    136    db 0,   4,  -3, 0,  -1, 127, 1, 0, 0,   2,  -1, 0,   0, 127, 0, 0
    137    ; [0, 1)
    138    db  0,   0,   1, 0, 0, 127,   0,  0,  0,  -1,   2, 0, 0, 127,   0,  0
    139    db  0,  -3,   4, 1, 1, 127,  -2,  0,  0,  -5,   6, 1, 1, 127,  -2,  0
    140    db  0,  -6,   8, 1, 2, 126,  -3,  0, -1,  -7,  11, 2, 2, 126,  -4, -1
    141    db -1,  -8,  13, 2, 3, 125,  -5, -1, -1, -10,  16, 3, 3, 124,  -6, -1
    142    db -1, -11,  18, 3, 4, 123,  -7, -1, -1, -12,  20, 3, 4, 122,  -7, -1
    143    db -1, -13,  23, 3, 4, 121,  -8, -1, -2, -14,  25, 4, 5, 120,  -9, -1
    144    db -1, -15,  27, 4, 5, 119, -10, -1, -1, -16,  30, 4, 5, 118, -11, -1
    145    db -2, -17,  33, 5, 6, 116, -12, -1, -2, -17,  35, 5, 6, 114, -12, -1
    146    db -2, -18,  38, 5, 6, 113, -13, -1, -2, -19,  41, 6, 7, 111, -14, -2
    147    db -2, -19,  43, 6, 7, 110, -15, -2, -2, -20,  46, 6, 7, 108, -15, -2
    148    db -2, -20,  49, 6, 7, 106, -16, -2, -2, -21,  51, 7, 7, 104, -16, -2
    149    db -2, -21,  54, 7, 7, 102, -17, -2, -2, -21,  56, 7, 8, 100, -18, -2
    150    db -2, -22,  59, 7, 8,  98, -18, -2, -2, -22,  62, 7, 8,  96, -19, -2
    151    db -2, -22,  64, 7, 8,  94, -19, -2, -2, -22,  67, 8, 8,  91, -20, -2
    152    db -2, -22,  69, 8, 8,  89, -20, -2, -2, -22,  72, 8, 8,  87, -21, -2
    153    db -2, -21,  74, 8, 8,  84, -21, -2, -2, -22,  77, 8, 8,  82, -21, -2
    154    db -2, -21,  79, 8, 8,  79, -21, -2, -2, -21,  82, 8, 8,  77, -22, -2
    155    db -2, -21,  84, 8, 8,  74, -21, -2, -2, -21,  87, 8, 8,  72, -22, -2
    156    db -2, -20,  89, 8, 8,  69, -22, -2, -2, -20,  91, 8, 8,  67, -22, -2
    157    db -2, -19,  94, 8, 7,  64, -22, -2, -2, -19,  96, 8, 7,  62, -22, -2
    158    db -2, -18,  98, 8, 7,  59, -22, -2, -2, -18, 100, 8, 7,  56, -21, -2
    159    db -2, -17, 102, 7, 7,  54, -21, -2, -2, -16, 104, 7, 7,  51, -21, -2
    160    db -2, -16, 106, 7, 6,  49, -20, -2, -2, -15, 108, 7, 6,  46, -20, -2
    161    db -2, -15, 110, 7, 6,  43, -19, -2, -2, -14, 111, 7, 6,  41, -19, -2
    162    db -1, -13, 113, 6, 5,  38, -18, -2, -1, -12, 114, 6, 5,  35, -17, -2
    163    db -1, -12, 116, 6, 5,  33, -17, -2, -1, -11, 118, 5, 4,  30, -16, -1
    164    db -1, -10, 119, 5, 4,  27, -15, -1, -1,  -9, 120, 5, 4,  25, -14, -2
    165    db -1,  -8, 121, 4, 3,  23, -13, -1, -1,  -7, 122, 4, 3,  20, -12, -1
    166    db -1,  -7, 123, 4, 3,  18, -11, -1, -1,  -6, 124, 3, 3,  16, -10, -1
    167    db -1,  -5, 125, 3, 2,  13,  -8, -1, -1,  -4, 126, 2, 2,  11,  -7, -1
    168    db  0,  -3, 126, 2, 1,   8,  -6,  0,  0,  -2, 127, 1, 1,   6,  -5,  0
    169    db  0,  -2, 127, 1, 1,   4,  -3,  0,  0,   0, 127, 0, 0,   2,  -1,  0
    170    ; [1, 2)
    171    db 0, 0, 127,   0, 0,   1,   0, 0, 0, 0, 127,   0, 0,  -1,   2, 0
    172    db 0, 1, 127,  -1, 0,  -3,   4, 0, 0, 1, 126,  -2, 0,  -4,   6, 1
    173    db 0, 1, 126,  -3, 0,  -5,   8, 1, 0, 1, 125,  -4, 0,  -6,  11, 1
    174    db 0, 1, 124,  -4, 0,  -7,  13, 1, 0, 2, 123,  -5, 0,  -8,  15, 1
    175    db 0, 2, 122,  -6, 0,  -9,  18, 1, 0, 2, 121,  -6, 0, -10,  20, 1
    176    db 0, 2, 120,  -7, 0, -11,  22, 2, 0, 2, 119,  -8, 0, -12,  25, 2
    177    db 0, 3, 117,  -8, 0, -13,  27, 2, 0, 3, 116,  -9, 0, -13,  29, 2
    178    db 0, 3, 114, -10, 0, -14,  32, 3, 0, 3, 113, -10, 0, -15,  35, 2
    179    db 0, 3, 111, -11, 0, -15,  37, 3, 0, 3, 109, -11, 0, -16,  40, 3
    180    db 0, 3, 108, -12, 0, -16,  42, 3, 0, 4, 106, -13, 0, -17,  45, 3
    181    db 0, 4, 104, -13, 0, -17,  47, 3, 0, 4, 102, -14, 0, -17,  50, 3
    182    db 0, 4, 100, -14, 0, -17,  52, 3, 0, 4,  98, -15, 0, -18,  55, 4
    183    db 0, 4,  96, -15, 0, -18,  58, 3, 0, 4,  94, -16, 0, -18,  60, 4
    184    db 0, 4,  91, -16, 0, -18,  63, 4, 0, 4,  89, -16, 0, -18,  65, 4
    185    db 0, 4,  87, -17, 0, -18,  68, 4, 0, 4,  85, -17, 0, -18,  70, 4
    186    db 0, 4,  82, -17, 0, -18,  73, 4, 0, 4,  80, -17, 0, -18,  75, 4
    187    db 0, 4,  78, -18, 0, -18,  78, 4, 0, 4,  75, -18, 0, -17,  80, 4
    188    db 0, 4,  73, -18, 0, -17,  82, 4, 0, 4,  70, -18, 0, -17,  85, 4
    189    db 0, 4,  68, -18, 0, -17,  87, 4, 0, 4,  65, -18, 0, -16,  89, 4
    190    db 0, 4,  63, -18, 0, -16,  91, 4, 0, 4,  60, -18, 0, -16,  94, 4
    191    db 0, 3,  58, -18, 0, -15,  96, 4, 0, 4,  55, -18, 0, -15,  98, 4
    192    db 0, 3,  52, -17, 0, -14, 100, 4, 0, 3,  50, -17, 0, -14, 102, 4
    193    db 0, 3,  47, -17, 0, -13, 104, 4, 0, 3,  45, -17, 0, -13, 106, 4
    194    db 0, 3,  42, -16, 0, -12, 108, 3, 0, 3,  40, -16, 0, -11, 109, 3
    195    db 0, 3,  37, -15, 0, -11, 111, 3, 0, 2,  35, -15, 0, -10, 113, 3
    196    db 0, 3,  32, -14, 0, -10, 114, 3, 0, 2,  29, -13, 0,  -9, 116, 3
    197    db 0, 2,  27, -13, 0,  -8, 117, 3, 0, 2,  25, -12, 0,  -8, 119, 2
    198    db 0, 2,  22, -11, 0,  -7, 120, 2, 0, 1,  20, -10, 0,  -6, 121, 2
    199    db 0, 1,  18,  -9, 0,  -6, 122, 2, 0, 1,  15,  -8, 0,  -5, 123, 2
    200    db 0, 1,  13,  -7, 0,  -4, 124, 1, 0, 1,  11,  -6, 0,  -4, 125, 1
    201    db 0, 1,   8,  -5, 0,  -3, 126, 1, 0, 1,   6,  -4, 0,  -2, 126, 1
    202    db 0, 0,   4,  -3, 0,  -1, 127, 1, 0, 0,   2,  -1, 0,   0, 127, 0
    203    db 0, 0,   2,  -1, 0,   0, 127, 0
    204 
    205 pw_258:  times 2 dw 258
    206 
    207 cextern mc_subpel_filters
        ; -8 bias: callers index this with a 1-based filter-phase value
        ; scaled by 8 bytes per entry, so the bias folds the "-1" into the
        ; base address.  NOTE(review): confirm against the 8tap dispatch code.
    208 %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
    209 
    210 %macro BIDIR_JMP_TABLE 2-*
        ; Emit a jump table of dword offsets for a bidirectional mc function.
        ;   %1 = function name (avg, mask, ...), %2 = ISA suffix (ssse3),
        ;   %3.. = supported block widths, one .w<width> entry each.
        ; Exposes %1_%2_table biased by 2*<smallest width> so the dispatcher's
        ; index arithmetic lands on the right dd entry without subtracting
        ; the minimum width first.
    211    ; %3 here is the FIRST width argument — this %xdefine is expanded
        ; at definition time, before the %rotate loop below shifts the args
    212    %xdefine %1_%2_table (%%table - 2*%3)
    213    %xdefine %%base %1_%2_table
    214    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    215    ; dynamically generated (macro-local) label marking the first entry
    216    %%table:
    217    %rep %0 - 2 ; one entry per width argument
    218        dd %%prefix %+ .w%3 - %%base
    219        %rotate 1
    220    %endrep
    221 %endmacro
    222 
    223 BIDIR_JMP_TABLE avg, ssse3,        4, 8, 16, 32, 64, 128
        ; blend_v / blend_h additionally support width 2; blend_h reuses the
        ; .w16 code path for widths 32/64/128 (hence the repeated 16 entries)
    224 BIDIR_JMP_TABLE w_avg, ssse3,      4, 8, 16, 32, 64, 128
    225 BIDIR_JMP_TABLE mask, ssse3,       4, 8, 16, 32, 64, 128
    226 BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
    227 BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
    228 BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
    229 BIDIR_JMP_TABLE blend, ssse3,      4, 8, 16, 32
    230 BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
    231 BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 16, 16, 16
    232 
    233 %macro BASE_JMP_TABLE 3-*
        ; Emit a word jump table for a base (unfiltered copy) function.
        ;   %1 = put/prep, %2 = ISA suffix, %3.. = widths.
        ; %%base expands to the %xdefine'd symbol %1_%2 (e.g. put_ssse3,
        ; which is itself defined as mangle(...).put), so the concatenation
        ; %%base %+ _w%3 below names the inner label .put_w<width>.
        ; %1_%2_table is biased by the smallest width for cheap indexing.
    234    %xdefine %1_%2_table (%%table - %3)
    235    %xdefine %%base %1_%2
    236    %%table:
    237    %rep %0 - 2
    238        dw %%base %+ _w%3 - %%base
    239        %rotate 1
    240    %endrep
    241 %endmacro
    242 
        ; put_ssse3/prep_ssse3 point at the .put/.prep inner labels of the
        ; bilin functions; BASE_JMP_TABLE concatenates _w<n> onto them to
        ; form the per-width branch targets, and table entries are stored
        ; relative to these same symbols.
    243 %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put)
    244 %xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep)
    245 
    246 BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
    247 BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128
    248 
    249 %macro HV_JMP_TABLE 5-*
        ; Emit horizontal/vertical/hv word jump tables for a filter function.
        ;   %1 = put/prep, %2 = filter (bilin/8tap), %3 = ISA suffix,
        ;   %4 = bitmask of tables to emit (1 = h, 2 = v, 4 = hv),
        ;   %5.. = widths.  Entries are .h_w/.v_w/.hv_w label offsets
        ; relative to %%base (the %xdefine'd %1_%3 entry-point symbol).
    250    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    251    %xdefine %%base %1_%3
    252    %assign %%types %4
    253    %if %%types & 1
    254        %xdefine %1_%2_h_%3_table  (%%h  - %5)
    255        %%h:
    256        %rep %0 - 4
    257            dw %%prefix %+ .h_w%5 - %%base
    258            %rotate 1
    259        %endrep
        ; the loop rotated %0-4 times; 4 more completes a full cycle so the
        ; width list is realigned for the next table
    260        %rotate 4
    261    %endif
    262    %if %%types & 2
    263        %xdefine %1_%2_v_%3_table  (%%v  - %5)
    264        %%v:
    265        %rep %0 - 4
    266            dw %%prefix %+ .v_w%5 - %%base
    267            %rotate 1
    268        %endrep
    269        %rotate 4
    270    %endif
    271    %if %%types & 4
    272        %xdefine %1_%2_hv_%3_table (%%hv - %5)
    273        %%hv:
    274        %rep %0 - 4
    275            dw %%prefix %+ .hv_w%5 - %%base
    276            %rotate 1
    277        %endrep
    278    %endif
    279 %endmacro
    280 
    281 HV_JMP_TABLE prep,  8tap, ssse3, 1,    4, 8, 16, 32, 64, 128
        ; types=1: 8tap prep only needs the horizontal table;
        ; types=7: bilin emits h, v and hv tables
    282 HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
    283 HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128
    284 
    285 %macro SCALED_JMP_TABLE 2-*
        ; Emit three word jump tables for a scaled mc function:
        ;   %1_%2_table      — generic .w<width> entries,
        ;   %1_%2_dy1_table  — .dy1_w<width> entries (vertical step 1024),
        ;   %1_%2_dy2_table  — .dy2_w<width> entries (vertical step 2048).
        ;   %1 = function name, %2 = ISA suffix, %3.. = widths.
        ; Each table pointer is biased by the smallest width.
    286    %xdefine %1_%2_table (%%table - %3)
    287    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
    288 %%table:
    289    %rep %0 - 2
    290        dw %%base %+ .w%3 - %%base
    291        %rotate 1
    292    %endrep
        ; loop rotated %0-2 times; 2 more completes the cycle, restoring the
        ; original argument order before the next table is emitted
    293    %rotate 2
    294 %%dy_1024:
    295    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    296    %rep %0 - 2
    297        dw %%base %+ .dy1_w%3 - %%base
    298        %rotate 1
    299    %endrep
    300    %rotate 2
    301 %%dy_2048:
    302    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    303    %rep %0 - 2
    304        dw %%base %+ .dy2_w%3 - %%base
    305        %rotate 1
    306    %endrep
    307 %endmacro
    308 
    309 SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
    310 SCALED_JMP_TABLE prep_8tap_scaled, ssse3,   4, 8, 16, 32, 64, 128
    311 
        ; table_offset(put, _bilin_h) -> put_bilin_h_ssse3_table - put_ssse3:
        ; the distance from a function's entry point to one of its jump
        ; tables, usable as a constant displacement off the register that
        ; holds the entry-point address (t0 below).
    312 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
    313 
    314 SECTION .text
    315 
    316 INIT_XMM ssse3
    317 
    318 %if ARCH_X86_32
        ; x86-32: registers are scarce, so t0 (r1) holds the put_ssse3
        ; entry address and rodata is addressed relative to it via "base"
    319 DECLARE_REG_TMP 1
    320 %define base t0-put_ssse3
    321 %else
        ; x86-64: rodata is addressed directly, so "base" is zero
    322 DECLARE_REG_TMP 7
    323 %define base 0
    324 %endif
    325 
    326 %macro RESTORE_DSQ_32 1
        ; On x86-32 there are not enough registers to keep the destination
        ; stride (ds) resident, so reload it from its stack slot (dsm) into
        ; the register named by %1.  Expands to nothing on x86-64.
    327 %if ARCH_X86_32
    328   mov                  %1, dsm ; restore dsq
    329 %endif
    330 %endmacro
    331 
    332 cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy
    333    movifnidn          mxyd, r6m ; mx
    334    LEA                  t0, put_ssse3
    335    movifnidn          srcq, srcmp
    336    movifnidn           ssq, ssmp
    337    tzcnt                wd, wm
    338    mov                  hd, hm
    339    test               mxyd, mxyd
    340    jnz .h
    341    mov                mxyd, r7m ; my
    342    test               mxyd, mxyd
    343    jnz .v
    344 .put:
    345    movzx                wd, word [t0+wq*2+table_offset(put,)]
    346    add                  wq, t0
    347    RESTORE_DSQ_32       t0
    348    jmp                  wq
    349 .put_w2:
    350    movzx               r4d, word [srcq+ssq*0]
    351    movzx               r6d, word [srcq+ssq*1]
    352    lea                srcq, [srcq+ssq*2]
    353    mov        [dstq+dsq*0], r4w
    354    mov        [dstq+dsq*1], r6w
    355    lea                dstq, [dstq+dsq*2]
    356    sub                  hd, 2
    357    jg .put_w2
    358    RET
    359 .put_w4:
    360    mov                 r4d, [srcq+ssq*0]
    361    mov                 r6d, [srcq+ssq*1]
    362    lea                srcq, [srcq+ssq*2]
    363    mov        [dstq+dsq*0], r4d
    364    mov        [dstq+dsq*1], r6d
    365    lea                dstq, [dstq+dsq*2]
    366    sub                  hd, 2
    367    jg .put_w4
    368    RET
    369 .put_w8:
    370    movq                 m0, [srcq+ssq*0]
    371    movq                 m1, [srcq+ssq*1]
    372    lea                srcq, [srcq+ssq*2]
    373    movq       [dstq+dsq*0], m0
    374    movq       [dstq+dsq*1], m1
    375    lea                dstq, [dstq+dsq*2]
    376    sub                  hd, 2
    377    jg .put_w8
    378    RET
    379 .put_w16:
    380    movu                 m0, [srcq+ssq*0]
    381    movu                 m1, [srcq+ssq*1]
    382    lea                srcq, [srcq+ssq*2]
    383    mova       [dstq+dsq*0], m0
    384    mova       [dstq+dsq*1], m1
    385    lea                dstq, [dstq+dsq*2]
    386    sub                  hd, 2
    387    jg .put_w16
    388    RET
    389 .put_w32:
    390    movu                 m0, [srcq+ssq*0+16*0]
    391    movu                 m1, [srcq+ssq*0+16*1]
    392    movu                 m2, [srcq+ssq*1+16*0]
    393    movu                 m3, [srcq+ssq*1+16*1]
    394    lea                srcq, [srcq+ssq*2]
    395    mova  [dstq+dsq*0+16*0], m0
    396    mova  [dstq+dsq*0+16*1], m1
    397    mova  [dstq+dsq*1+16*0], m2
    398    mova  [dstq+dsq*1+16*1], m3
    399    lea                dstq, [dstq+dsq*2]
    400    sub                  hd, 2
    401    jg .put_w32
    402    RET
    403 .put_w64:
    404    movu                 m0, [srcq+16*0]
    405    movu                 m1, [srcq+16*1]
    406    movu                 m2, [srcq+16*2]
    407    movu                 m3, [srcq+16*3]
    408    add                srcq, ssq
    409    mova        [dstq+16*0], m0
    410    mova        [dstq+16*1], m1
    411    mova        [dstq+16*2], m2
    412    mova        [dstq+16*3], m3
    413    add                dstq, dsq
    414    dec                  hd
    415    jg .put_w64
    416    RET
    417 .put_w128:
    418    movu                 m0, [srcq+16*0]
    419    movu                 m1, [srcq+16*1]
    420    movu                 m2, [srcq+16*2]
    421    movu                 m3, [srcq+16*3]
    422    mova        [dstq+16*0], m0
    423    mova        [dstq+16*1], m1
    424    mova        [dstq+16*2], m2
    425    mova        [dstq+16*3], m3
    426    movu                 m0, [srcq+16*4]
    427    movu                 m1, [srcq+16*5]
    428    movu                 m2, [srcq+16*6]
    429    movu                 m3, [srcq+16*7]
    430    mova        [dstq+16*4], m0
    431    mova        [dstq+16*5], m1
    432    mova        [dstq+16*6], m2
    433    mova        [dstq+16*7], m3
    434    add                srcq, ssq
    435    add                dstq, dsq
    436    dec                  hd
    437    jg .put_w128
    438    RET
    439 .h:
    440    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
    441    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
    442    imul               mxyd, 0x00ff00ff
    443    mova                 m4, [base+subpel_h_shufD]
    444    mova                 m0, [base+bilin_h_shuf4]
    445    add                mxyd, 0x00100010
    446    movd                 m5, mxyd
    447    mov                mxyd, r7m ; my
    448    pshufd               m5, m5, q0000
    449    test               mxyd, mxyd
    450    jnz .hv
    451    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_h)]
    452    mova                 m3, [base+pw_2048]
    453    add                  wq, t0
    454    movifnidn           dsq, dsmp
    455    jmp                  wq
    456 .h_w2:
    457    pshufd               m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
    458 .h_w2_loop:
    459    movd                 m0, [srcq+ssq*0]
    460    movd                 m1, [srcq+ssq*1]
    461    lea                srcq, [srcq+ssq*2]
    462    punpckldq            m0, m1
    463    pshufb               m0, m4
    464    pmaddubsw            m0, m5
    465    pmulhrsw             m0, m3
    466    packuswb             m0, m0
    467    movd                r6d, m0
    468    mov        [dstq+dsq*0], r6w
    469    shr                 r6d, 16
    470    mov        [dstq+dsq*1], r6w
    471    lea                dstq, [dstq+dsq*2]
    472    sub                  hd, 2
    473    jg .h_w2_loop
    474    RET
    475 .h_w4:
    476    movq                 m4, [srcq+ssq*0]
    477    movhps               m4, [srcq+ssq*1]
    478    lea                srcq, [srcq+ssq*2]
    479    pshufb               m4, m0
    480    pmaddubsw            m4, m5
    481    pmulhrsw             m4, m3
    482    packuswb             m4, m4
    483    movd       [dstq+dsq*0], m4
    484    psrlq                m4, 32
    485    movd       [dstq+dsq*1], m4
    486    lea                dstq, [dstq+dsq*2]
    487    sub                  hd, 2
    488    jg .h_w4
    489    RET
    490 .h_w8:
    491    movu                 m0, [srcq+ssq*0]
    492    movu                 m1, [srcq+ssq*1]
    493    lea                srcq, [srcq+ssq*2]
    494    pshufb               m0, m4
    495    pshufb               m1, m4
    496    pmaddubsw            m0, m5
    497    pmaddubsw            m1, m5
    498    pmulhrsw             m0, m3
    499    pmulhrsw             m1, m3
    500    packuswb             m0, m1
    501    movq       [dstq+dsq*0], m0
    502    movhps     [dstq+dsq*1], m0
    503    lea                dstq, [dstq+dsq*2]
    504    sub                  hd, 2
    505    jg .h_w8
    506    RET
    507 .h_w16:
    508    movu                 m0, [srcq+8*0]
    509    movu                 m1, [srcq+8*1]
    510    add                srcq, ssq
    511    pshufb               m0, m4
    512    pshufb               m1, m4
    513    pmaddubsw            m0, m5
    514    pmaddubsw            m1, m5
    515    pmulhrsw             m0, m3
    516    pmulhrsw             m1, m3
    517    packuswb             m0, m1
    518    mova             [dstq], m0
    519    add                dstq, dsq
    520    dec                  hd
    521    jg .h_w16
    522    RET
    523 .h_w32:
    524    movu                 m0, [srcq+mmsize*0+8*0]
    525    movu                 m1, [srcq+mmsize*0+8*1]
    526    pshufb               m0, m4
    527    pshufb               m1, m4
    528    pmaddubsw            m0, m5
    529    pmaddubsw            m1, m5
    530    pmulhrsw             m0, m3
    531    pmulhrsw             m1, m3
    532    packuswb             m0, m1
    533    movu                 m1, [srcq+mmsize*1+8*0]
    534    movu                 m2, [srcq+mmsize*1+8*1]
    535    add                srcq, ssq
    536    pshufb               m1, m4
    537    pshufb               m2, m4
    538    pmaddubsw            m1, m5
    539    pmaddubsw            m2, m5
    540    pmulhrsw             m1, m3
    541    pmulhrsw             m2, m3
    542    packuswb             m1, m2
    543    mova        [dstq+16*0], m0
    544    mova        [dstq+16*1], m1
    545    add                dstq, dsq
    546    dec                  hd
    547    jg .h_w32
    548    RET
    549 .h_w64:
    550    mov                  r6, -16*3
    551 .h_w64_loop:
    552    movu                 m0, [srcq+r6+16*3+8*0]
    553    movu                 m1, [srcq+r6+16*3+8*1]
    554    pshufb               m0, m4
    555    pshufb               m1, m4
    556    pmaddubsw            m0, m5
    557    pmaddubsw            m1, m5
    558    pmulhrsw             m0, m3
    559    pmulhrsw             m1, m3
    560    packuswb             m0, m1
    561    mova     [dstq+r6+16*3], m0
    562    add                  r6, 16
    563    jle .h_w64_loop
    564    add                srcq, ssq
    565    add                dstq, dsq
    566    dec                  hd
    567    jg .h_w64
    568    RET
    569 .h_w128:
    570    mov                  r6, -16*7
    571 .h_w128_loop:
    572    movu                 m0, [srcq+r6+16*7+8*0]
    573    movu                 m1, [srcq+r6+16*7+8*1]
    574    pshufb               m0, m4
    575    pshufb               m1, m4
    576    pmaddubsw            m0, m5
    577    pmaddubsw            m1, m5
    578    pmulhrsw             m0, m3
    579    pmulhrsw             m1, m3
    580    packuswb             m0, m1
    581    mova     [dstq+r6+16*7], m0
    582    add                  r6, 16
    583    jle .h_w128_loop
    584    add                srcq, ssq
    585    add                dstq, dsq
    586    dec                  hd
    587    jg .h_w128
    588    RET
    589 .v:
    590    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_v)]
    591    imul               mxyd, 0x00ff00ff
    592    mova                 m5, [base+pw_2048]
    593    add                mxyd, 0x00100010
    594    add                  wq, t0
    595    movd                 m4, mxyd
    596    pshufd               m4, m4, q0000
    597    movifnidn           dsq, dsmp
    598    jmp                  wq
    599 .v_w2:
    600    movd                 m0, [srcq+ssq*0]
    601 .v_w2_loop:
    602    pinsrw               m0, [srcq+ssq*1], 1 ; 0 1
    603    lea                srcq, [srcq+ssq*2]
    604    pshuflw              m1, m0, q2301
    605    pinsrw               m0, [srcq+ssq*0], 0 ; 2 1
    606    punpcklbw            m1, m0
    607    pmaddubsw            m1, m4
    608    pmulhrsw             m1, m5
    609    packuswb             m1, m1
    610    movd                r6d, m1
    611    mov        [dstq+dsq*1], r6w
    612    shr                 r6d, 16
    613    mov        [dstq+dsq*0], r6w
    614    lea                dstq, [dstq+dsq*2]
    615    sub                  hd, 2
    616    jg .v_w2_loop
    617    RET
    618 .v_w4:
    619    movd                 m0, [srcq+ssq*0]
    620 .v_w4_loop:
    621    movd                 m2, [srcq+ssq*1]
    622    lea                srcq, [srcq+ssq*2]
    623    mova                 m1, m0
    624    movd                 m0, [srcq+ssq*0]
    625    punpckldq            m1, m2 ; 0 1
    626    punpckldq            m2, m0 ; 1 2
    627    punpcklbw            m1, m2
    628    pmaddubsw            m1, m4
    629    pmulhrsw             m1, m5
    630    packuswb             m1, m1
    631    movd       [dstq+dsq*0], m1
    632    psrlq                m1, 32
    633    movd       [dstq+dsq*1], m1
    634    ;
    635    lea                dstq, [dstq+dsq*2]
    636    sub                  hd, 2
    637    jg .v_w4_loop
    638    RET
    639 .v_w8:
    640    movq                 m0, [srcq+ssq*0]
    641 .v_w8_loop:
    642    movq                 m2, [srcq+ssq*1]
    643    lea                srcq, [srcq+ssq*2]
    644    mova                 m1, m0
    645    movq                 m0, [srcq+ssq*0]
    646    punpcklbw            m1, m2
    647    punpcklbw            m2, m0
    648    pmaddubsw            m1, m4
    649    pmaddubsw            m2, m4
    650    pmulhrsw             m1, m5
    651    pmulhrsw             m2, m5
    652    packuswb             m1, m2
    653    movq       [dstq+dsq*0], m1
    654    movhps     [dstq+dsq*1], m1
    655    lea                dstq, [dstq+dsq*2]
    656    sub                  hd, 2
    657    jg .v_w8_loop
    658    RET
    659 %macro PUT_BILIN_V_W16 0
    660    movu                 m0, [srcq+ssq*0]
    661 %%loop:
    662    movu                 m3, [srcq+ssq*1]
    663    lea                srcq, [srcq+ssq*2]
    664    mova                 m1, m0
    665    mova                 m2, m0
    666    movu                 m0, [srcq+ssq*0]
    667    punpcklbw            m1, m3
    668    punpckhbw            m2, m3
    669    pmaddubsw            m1, m4
    670    pmaddubsw            m2, m4
    671    pmulhrsw             m1, m5
    672    pmulhrsw             m2, m5
    673    packuswb             m1, m2
    674    punpcklbw            m2, m3, m0
    675    punpckhbw            m3, m0
    676    pmaddubsw            m2, m4
    677    pmaddubsw            m3, m4
    678    pmulhrsw             m2, m5
    679    pmulhrsw             m3, m5
    680    packuswb             m2, m3
    681    mova       [dstq+dsq*0], m1
    682    mova       [dstq+dsq*1], m2
    683    lea                dstq, [dstq+dsq*2]
    684    sub                  hd, 2
    685    jg %%loop
    686 %endmacro
    687 .v_w16:
    688    PUT_BILIN_V_W16
    689    RET
    690 .v_w128:
    691    lea                 r6d, [hq+(7<<16)]
    692    jmp .v_w16gt
    693 .v_w64:
    694    lea                 r6d, [hq+(3<<16)]
    695    jmp .v_w16gt
    696 .v_w32:
    697    lea                 r6d, [hq+(1<<16)]
    698 .v_w16gt:
    699    mov                  r4, srcq
    700 %if ARCH_X86_64
    701    mov                  r7, dstq
    702 %endif
    703 .v_w16gt_loop:
    704    PUT_BILIN_V_W16
    705 %if ARCH_X86_64
    706    add                  r4, 16
    707    add                  r7, 16
    708    movzx                hd, r6b
    709    mov                srcq, r4
    710    mov                dstq, r7
    711 %else
    712    mov                dstq, dstmp
    713    add                  r4, 16
    714    movzx                hd, r6w
    715    add                dstq, 16
    716    mov                srcq, r4
    717    mov               dstmp, dstq
    718 %endif
    719    sub                 r6d, 1<<16
    720    jg .v_w16gt
    721    RET
    722 .hv:
    723    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
    724    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
    725    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
    726    WIN64_SPILL_XMM       8
    727    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
    728    mova                 m7, [base+pw_15]
    729    movd                 m6, mxyd
    730    add                  wq, t0
    731    pshuflw              m6, m6, q0000
    732    paddb                m5, m5
    733    punpcklqdq           m6, m6
    734    jmp                  wq
    735 .hv_w2:
    736    RESTORE_DSQ_32       t0
    737    movd                 m0, [srcq+ssq*0]
    738    punpckldq            m0, m0
    739    pshufb               m0, m4
    740    pmaddubsw            m0, m5
    741 .hv_w2_loop:
    742    movd                 m1, [srcq+ssq*1]
    743    lea                srcq, [srcq+ssq*2]
    744    movd                 m2, [srcq+ssq*0]
    745    punpckldq            m1, m2
    746    pshufb               m1, m4
    747    pmaddubsw            m1, m5             ; 1 _ 2 _
    748    shufps               m2, m0, m1, q1032  ; 0 _ 1 _
    749    mova                 m0, m1
    750    psubw                m1, m2   ; 2 * (src[x + src_stride] - src[x])
    751    pmulhw               m1, m6   ; (my * (src[x + src_stride] - src[x]) >> 4
    752    pavgw                m2, m7   ; src[x] + 8
    753    paddw                m1, m2   ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
    754    psrlw                m1, 4
    755    packuswb             m1, m1
    756 %if ARCH_X86_64
    757    movq                 r6, m1
    758 %else
    759    pshuflw              m1, m1, q2020
    760    movd                r6d, m1
    761 %endif
    762    mov        [dstq+dsq*0], r6w
    763    shr                  r6, gprsize*4
    764    mov        [dstq+dsq*1], r6w
    765    lea                dstq, [dstq+dsq*2]
    766    sub                  hd, 2
    767    jg .hv_w2_loop
    768    RET
    769 .hv_w4:
    770    mova                 m4, [base+bilin_h_shuf4]
    771    movddup              m0, [srcq+ssq*0]
    772    movifnidn           dsq, dsmp
    773    pshufb               m0, m4
    774    pmaddubsw            m0, m5
    775 .hv_w4_loop:
    776    movq                 m1, [srcq+ssq*1]
    777    lea                srcq, [srcq+ssq*2]
    778    movhps               m1, [srcq+ssq*0]
    779    pshufb               m1, m4
    780    pmaddubsw            m1, m5            ; 1 2
    781    shufps               m2, m0, m1, q1032 ; 0 1
    782    mova                 m0, m1
    783    psubw                m1, m2
    784    pmulhw               m1, m6
    785    pavgw                m2, m7
    786    paddw                m1, m2
    787    psrlw                m1, 4
    788    packuswb             m1, m1
    789    movd       [dstq+dsq*0], m1
    790    psrlq                m1, 32
    791    movd       [dstq+dsq*1], m1
    792    lea                dstq, [dstq+dsq*2]
    793    sub                  hd, 2
    794    jg .hv_w4_loop
    795    RET
    796 .hv_w8:
    797    movu                 m0, [srcq+ssq*0]
    798    movifnidn           dsq, dsmp
    799    pshufb               m0, m4
    800    pmaddubsw            m0, m5
    801 .hv_w8_loop:
    802    movu                 m2, [srcq+ssq*1]
    803    lea                srcq, [srcq+ssq*2]
    804    pshufb               m2, m4
    805    pmaddubsw            m2, m5
    806    psubw                m1, m2, m0
    807    pmulhw               m1, m6
    808    pavgw                m0, m7
    809    paddw                m1, m0
    810    movu                 m0, [srcq+ssq*0]
    811    pshufb               m0, m4
    812    pmaddubsw            m0, m5
    813    psubw                m3, m0, m2
    814    pmulhw               m3, m6
    815    pavgw                m2, m7
    816    paddw                m3, m2
    817    psrlw                m1, 4
    818    psrlw                m3, 4
    819    packuswb             m1, m3
    820    movq       [dstq+dsq*0], m1
    821    movhps     [dstq+dsq*1], m1
    822    lea                dstq, [dstq+dsq*2]
    823    sub                  hd, 2
    824    jg .hv_w8_loop
    825    RET
    826 .hv_w128:
    827    lea                 r6d, [hq+(7<<16)]
    828    jmp .hv_w16_start
    829 .hv_w64:
    830    lea                 r6d, [hq+(3<<16)]
    831    jmp .hv_w16_start
    832 .hv_w32:
    833    lea                 r6d, [hq+(1<<16)]
    834 .hv_w16_start:
    835    mov                  r4, srcq
    836 %if ARCH_X86_32
    837    %define m8 [dstq]
    838 %else
    839    mov                  r7, dstq
    840 %endif
    841 .hv_w16:
    842    movifnidn           dsq, dsmp
    843 %if WIN64
    844    movaps              r4m, m8
    845 %endif
    846 .hv_w16_loop0:
    847    movu                 m0, [srcq+8*0]
    848    movu                 m1, [srcq+8*1]
    849    pshufb               m0, m4
    850    pshufb               m1, m4
    851    pmaddubsw            m0, m5
    852    pmaddubsw            m1, m5
    853 .hv_w16_loop:
    854    add                srcq, ssq
    855    movu                 m2, [srcq+8*0]
    856    movu                 m3, [srcq+8*1]
    857    pshufb               m2, m4
    858    pshufb               m3, m4
    859    pmaddubsw            m2, m5
    860    pmaddubsw            m3, m5
    861    mova                 m8, m2
    862    psubw                m2, m0
    863    pmulhw               m2, m6
    864    pavgw                m0, m7
    865    paddw                m2, m0
    866    mova                 m0, m3
    867    psubw                m3, m1
    868    pmulhw               m3, m6
    869    pavgw                m1, m7
    870    paddw                m3, m1
    871    mova                 m1, m0
    872    mova                 m0, m8
    873    psrlw                m2, 4
    874    psrlw                m3, 4
    875    packuswb             m2, m3
    876    mova             [dstq], m2
    877    add                dstq, dsmp
    878    dec                  hd
    879    jg .hv_w16_loop
    880 %if ARCH_X86_32
    881    mov                dstq, dstm
    882    add                  r4, 16
    883    movzx                hd, r6w
    884    add                dstq, 16
    885    mov                srcq, r4
    886    mov                dstm, dstq
    887 %else
    888    add                  r4, 16
    889    add                  r7, 16
    890    movzx                hd, r6b
    891    mov                srcq, r4
    892    mov                dstq, r7
    893 %endif
    894    sub                 r6d, 1<<16
    895    jg .hv_w16_loop0
    896 %if WIN64
    897    movaps               m8, r4m
    898 %endif
    899    RET
    900 
        ; Addressing helper for the prep code below: on x86-32 static data is
        ; reached relative to the prep_ssse3 label held in r6 (see the
        ; "LEA r6, prep_ssse3" in prep_bilin_8bpc); on x86-64 no bias is needed.
    901 %if ARCH_X86_32
    902    %define base r6-prep%+SUFFIX
    903 %else
    904    %define base 0
    905 %endif
    906 
;------------------------------------------------------------------------------
; prep_bilin_8bpc(tmp, src, stride, w, h, mxy(mx=r5m, my=r6m))
; Packs an 8-bit source block into the 16-bit intermediate buffer at tmpq,
; optionally applying bilinear horizontal (mx) and/or vertical (my)
; interpolation. Dispatches per width through the table_offset(prep, ...)
; jump tables. With mx == my == 0 the .prep path just zero-extends pixels
; to words scaled by << 4 to match the intermediate precision.
; NOTE(review): SSSE3 code path (pshufb/pmaddubsw/pmulhrsw), consistent
; with the prep_ssse3 base label — confirm against the dispatch table.
;------------------------------------------------------------------------------
    907 cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    908    movifnidn          mxyd, r5m ; mx
    909    LEA                  r6, prep_ssse3
    910    tzcnt                wd, wm
    911    movifnidn            hd, hm
    912    test               mxyd, mxyd
    913    jnz .h
    914    mov                mxyd, r6m ; my
    915    test               mxyd, mxyd
    916    jnz .v
    917 .prep:
        ; pure copy: zero-extend bytes to words (m4 = 0) and scale by << 4
    918    movzx                wd, word [r6+wq*2+table_offset(prep,)]
    919    pxor                 m4, m4
    920    add                  wq, r6
    921    lea            stride3q, [strideq*3]
    922    jmp                  wq
    923 .prep_w4:
    924    movd                 m0, [srcq+strideq*0]
    925    movd                 m1, [srcq+strideq*1]
    926    movd                 m2, [srcq+strideq*2]
    927    movd                 m3, [srcq+stride3q ]
    928    lea                srcq, [srcq+strideq*4]
    929    punpckldq            m0, m1
    930    punpckldq            m2, m3
    931    punpcklbw            m0, m4
    932    punpcklbw            m2, m4
    933    psllw                m0, 4
    934    psllw                m2, 4
    935    mova        [tmpq+16*0], m0
    936    mova        [tmpq+16*1], m2
    937    add                tmpq, 16*2
    938    sub                  hd, 4
    939    jg .prep_w4
    940    RET
    941 .prep_w8:
    942    movq                 m0, [srcq+strideq*0]
    943    movq                 m1, [srcq+strideq*1]
    944    movq                 m2, [srcq+strideq*2]
    945    movq                 m3, [srcq+stride3q ]
    946    lea                srcq, [srcq+strideq*4]
    947    punpcklbw            m0, m4
    948    punpcklbw            m1, m4
    949    punpcklbw            m2, m4
    950    punpcklbw            m3, m4
    951    psllw                m0, 4
    952    psllw                m1, 4
    953    psllw                m2, 4
    954    psllw                m3, 4
    955    mova        [tmpq+16*0], m0
    956    mova        [tmpq+16*1], m1
    957    mova        [tmpq+16*2], m2
    958    mova        [tmpq+16*3], m3
    959    add                tmpq, 16*4
    960    sub                  hd, 4
    961    jg .prep_w8
    962    RET
    963 .prep_w16:
    964    movu                 m1, [srcq+strideq*0]
    965    movu                 m3, [srcq+strideq*1]
    966    lea                srcq, [srcq+strideq*2]
    967    punpcklbw            m0, m1, m4
    968    punpckhbw            m1, m4
    969    punpcklbw            m2, m3, m4
    970    punpckhbw            m3, m4
    971    psllw                m0, 4
    972    psllw                m1, 4
    973    psllw                m2, 4
    974    psllw                m3, 4
    975    mova        [tmpq+16*0], m0
    976    mova        [tmpq+16*1], m1
    977    mova        [tmpq+16*2], m2
    978    mova        [tmpq+16*3], m3
    979    add                tmpq, 16*4
    980    sub                  hd, 2
    981    jg .prep_w16
    982    RET
    983 .prep_w128:
    984    mov                  r3, -128
    985    jmp .prep_w32_start
    986 .prep_w64:
    987    mov                  r3, -64
    988    jmp .prep_w32_start
    989 .prep_w32:
    990    mov                  r3, -32
    991 .prep_w32_start:
        ; r3 = -width; srcq is biased forward so r6 counts [-width, 0)
        ; across the row, 32 pixels per inner iteration
    992    sub                srcq, r3
    993 .prep_w32_vloop:
    994    mov                  r6, r3
    995 .prep_w32_hloop:
    996    movu                 m1, [srcq+r6+16*0]
    997    movu                 m3, [srcq+r6+16*1]
    998    punpcklbw            m0, m1, m4
    999    punpckhbw            m1, m4
   1000    punpcklbw            m2, m3, m4
   1001    punpckhbw            m3, m4
   1002    psllw                m0, 4
   1003    psllw                m1, 4
   1004    psllw                m2, 4
   1005    psllw                m3, 4
   1006    mova        [tmpq+16*0], m0
   1007    mova        [tmpq+16*1], m1
   1008    mova        [tmpq+16*2], m2
   1009    mova        [tmpq+16*3], m3
   1010    add                tmpq, 16*4
   1011    add                  r6, 32
   1012    jl .prep_w32_hloop
   1013    add                srcq, strideq
   1014    dec                  hd
   1015    jg .prep_w32_vloop
   1016    RET
   1017 .h:
   1018    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
   1019    ; = (16 - mx) * src[x] + mx * src[x + 1]
        ; mx*0x00ff00ff + 0x00100010 yields byte pairs (16-mx, mx) in every
        ; word lane of m5, the pmaddubsw coefficients for the formula above
   1020    imul               mxyd, 0x00ff00ff
   1021    mova                 m4, [base+subpel_h_shufD]
   1022    add                mxyd, 0x00100010
   1023    movd                 m5, mxyd
   1024    mov                mxyd, r6m ; my
   1025    pshufd               m5, m5, q0000
   1026    test               mxyd, mxyd
   1027    jnz .hv
   1028    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
   1029    add                  wq, r6
   1030    jmp                  wq
   1031 .h_w4:
   1032    mova                 m4, [base+bilin_h_shuf4]
   1033    lea            stride3q, [strideq*3]
   1034 .h_w4_loop:
   1035    movq                 m0, [srcq+strideq*0]
   1036    movhps               m0, [srcq+strideq*1]
   1037    movq                 m1, [srcq+strideq*2]
   1038    movhps               m1, [srcq+stride3q ]
   1039    lea                srcq, [srcq+strideq*4]
   1040    pshufb               m0, m4
   1041    pshufb               m1, m4
   1042    pmaddubsw            m0, m5
   1043    pmaddubsw            m1, m5
   1044    mova          [tmpq+0 ], m0
   1045    mova          [tmpq+16], m1
   1046    add                tmpq, 32
   1047    sub                  hd, 4
   1048    jg .h_w4_loop
   1049    RET
   1050 .h_w8:
   1051    lea            stride3q, [strideq*3]
   1052 .h_w8_loop:
   1053    movu                 m0, [srcq+strideq*0]
   1054    movu                 m1, [srcq+strideq*1]
   1055    movu                 m2, [srcq+strideq*2]
   1056    movu                 m3, [srcq+stride3q ]
   1057    lea                srcq, [srcq+strideq*4]
   1058    REPX  {pshufb    x, m4}, m0, m1, m2, m3
   1059    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
   1060    mova        [tmpq+16*0], m0
   1061    mova        [tmpq+16*1], m1
   1062    mova        [tmpq+16*2], m2
   1063    mova        [tmpq+16*3], m3
   1064    add                tmpq, 16*4
   1065    sub                  hd, 4
   1066    jg .h_w8_loop
   1067    RET
   1068 .h_w16:
   1069    movu                 m0, [srcq+strideq*0+8*0]
   1070    movu                 m1, [srcq+strideq*0+8*1]
   1071    movu                 m2, [srcq+strideq*1+8*0]
   1072    movu                 m3, [srcq+strideq*1+8*1]
   1073    lea                srcq, [srcq+strideq*2]
   1074    REPX  {pshufb    x, m4}, m0, m1, m2, m3
   1075    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
   1076    mova        [tmpq+16*0], m0
   1077    mova        [tmpq+16*1], m1
   1078    mova        [tmpq+16*2], m2
   1079    mova        [tmpq+16*3], m3
   1080    add                tmpq, 16*4
   1081    sub                  hd, 2
   1082    jg .h_w16
   1083    RET
   1084 .h_w128:
   1085    mov                  r3, -128
   1086    jmp .h_w32_start
   1087 .h_w64:
   1088    mov                  r3, -64
   1089    jmp .h_w32_start
   1090 .h_w32:
   1091    mov                  r3, -32
   1092 .h_w32_start:
        ; same negative-offset row walk as .prep_w32_start
   1093    sub                srcq, r3
   1094 .h_w32_vloop:
   1095    mov                  r6, r3
   1096 .h_w32_hloop:
   1097    movu                 m0, [srcq+r6+8*0]
   1098    movu                 m1, [srcq+r6+8*1]
   1099    movu                 m2, [srcq+r6+8*2]
   1100    movu                 m3, [srcq+r6+8*3]
   1101    REPX  {pshufb    x, m4}, m0, m1, m2, m3
   1102    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
   1103    mova        [tmpq+16*0], m0
   1104    mova        [tmpq+16*1], m1
   1105    mova        [tmpq+16*2], m2
   1106    mova        [tmpq+16*3], m3
   1107    add                tmpq, 16*4
   1108    add                  r6, 32
   1109    jl .h_w32_hloop
   1110    add                srcq, strideq
   1111    dec                  hd
   1112    jg .h_w32_vloop
   1113    RET
   1114 .v:
        ; vertical weights: byte pairs (16-my, my) in every word of m5,
        ; applied with pmaddubsw to interleaved row pairs
   1115    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
   1116    imul               mxyd, 0x00ff00ff
   1117    add                mxyd, 0x00100010
   1118    add                  wq, r6
   1119    lea            stride3q, [strideq*3]
   1120    movd                 m5, mxyd
   1121    pshufd               m5, m5, q0000
   1122    jmp                  wq
   1123 .v_w4:
   1124    movd                 m0, [srcq+strideq*0]
   1125 .v_w4_loop:
   1126    movd                 m1, [srcq+strideq*1]
   1127    movd                 m2, [srcq+strideq*2]
   1128    movd                 m3, [srcq+stride3q ]
   1129    lea                srcq, [srcq+strideq*4]
   1130    punpckldq            m0, m1
   1131    punpckldq            m1, m2
   1132    punpcklbw            m0, m1 ; 01 12
   1133    pmaddubsw            m0, m5
   1134    mova        [tmpq+16*0], m0
   1135    movd                 m0, [srcq+strideq*0]
   1136    punpckldq            m2, m3
   1137    punpckldq            m3, m0
   1138    punpcklbw            m2, m3 ; 23 34
   1139    pmaddubsw            m2, m5
   1140    mova        [tmpq+16*1], m2
   1141    add                tmpq, 16*2
   1142    sub                  hd, 4
   1143    jg .v_w4_loop
   1144    RET
   1145 .v_w8:
   1146    movq                 m0, [srcq+strideq*0]
   1147 .v_w8_loop:
   1148    movq                 m1, [srcq+strideq*1]
   1149    movq                 m2, [srcq+strideq*2]
   1150    movq                 m3, [srcq+stride3q ]
   1151    lea                srcq, [srcq+strideq*4]
   1152    punpcklbw            m0, m1 ; 01
   1153    punpcklbw            m1, m2 ; 12
   1154    pmaddubsw            m0, m5
   1155    pmaddubsw            m1, m5
   1156    mova        [tmpq+16*0], m0
   1157    movq                 m0, [srcq+strideq*0]
   1158    punpcklbw            m2, m3 ; 23
   1159    punpcklbw            m3, m0 ; 34
   1160    pmaddubsw            m2, m5
   1161    mova        [tmpq+16*1], m1
   1162    pmaddubsw            m3, m5
   1163    mova        [tmpq+16*2], m2
   1164    mova        [tmpq+16*3], m3
   1165    add                tmpq, 16*4
   1166    sub                  hd, 4
   1167    jg .v_w8_loop
   1168    RET
   1169 .v_w16:
   1170    movu                 m0, [srcq+strideq*0]
   1171 .v_w16_loop:
   1172    movu                 m1, [srcq+strideq*1]
   1173    movu                 m2, [srcq+strideq*2]
   1174    movu                 m3, [srcq+stride3q ]
   1175    lea                srcq, [srcq+strideq*4]
   1176    punpcklbw            m4, m0, m1
   1177    punpckhbw            m0, m1
   1178    pmaddubsw            m4, m5
   1179    pmaddubsw            m0, m5
   1180    mova        [tmpq+16*0], m4
   1181    punpcklbw            m4, m1, m2
   1182    punpckhbw            m1, m2
   1183    pmaddubsw            m4, m5
   1184    mova        [tmpq+16*1], m0
   1185    movu                 m0, [srcq+strideq*0]
   1186    pmaddubsw            m1, m5
   1187    mova        [tmpq+16*2], m4
   1188    punpcklbw            m4, m2, m3
   1189    punpckhbw            m2, m3
   1190    pmaddubsw            m4, m5
   1191    mova        [tmpq+16*3], m1
   1192    pmaddubsw            m2, m5
   1193    mova        [tmpq+16*4], m4
   1194    punpcklbw            m4, m3, m0
   1195    punpckhbw            m3, m0
   1196    pmaddubsw            m4, m5
   1197    mova        [tmpq+16*5], m2
   1198    pmaddubsw            m3, m5
   1199    mova        [tmpq+16*6], m4
   1200    mova        [tmpq+16*7], m3
   1201    add                tmpq, 16*8
   1202    sub                  hd, 4
   1203    jg .v_w16_loop
   1204    RET
   1205 .v_w128:
   1206    lea                 r3d, [hq+(3<<8)]
   1207    mov                 r6d, 256
   1208    jmp .v_w32_start
   1209 .v_w64:
   1210    lea                 r3d, [hq+(1<<8)]
   1211    mov                 r6d, 128
   1212    jmp .v_w32_start
   1213 .v_w32:
   1214    xor                 r3d, r3d
   1215    mov                 r6d, 64
   1216 .v_w32_start:
        ; column tiling: r3d low byte = h (reloaded per 32-pixel column via
        ; movzx hd, r3b), upper bits = remaining columns - 1 (sub 1<<8);
        ; r6d = output row pitch in bytes (2 * width)
   1217 %if ARCH_X86_64
   1218 %if WIN64
   1219    PUSH                 r7
   1220 %endif
   1221    mov                  r7, tmpq
   1222 %endif
   1223    mov                  r5, srcq
   1224 .v_w32_hloop:
   1225    movu                 m0, [srcq+strideq*0+16*0]
   1226    movu                 m1, [srcq+strideq*0+16*1]
   1227 .v_w32_vloop:
   1228    movu                 m2, [srcq+strideq*1+16*0]
   1229    movu                 m3, [srcq+strideq*1+16*1]
   1230    lea                srcq, [srcq+strideq*2]
   1231    punpcklbw            m4, m0, m2
   1232    punpckhbw            m0, m2
   1233    pmaddubsw            m4, m5
   1234    pmaddubsw            m0, m5
   1235    mova        [tmpq+16*0], m4
   1236    mova        [tmpq+16*1], m0
   1237    movu                 m0, [srcq+strideq*0+16*0]
   1238    punpcklbw            m4, m1, m3
   1239    punpckhbw            m1, m3
   1240    pmaddubsw            m4, m5
   1241    pmaddubsw            m1, m5
   1242    mova        [tmpq+16*2], m4
   1243    mova        [tmpq+16*3], m1
   1244    movu                 m1, [srcq+strideq*0+16*1]
   1245    add                tmpq, r6
   1246    punpcklbw            m4, m2, m0
   1247    punpckhbw            m2, m0
   1248    pmaddubsw            m4, m5
   1249    pmaddubsw            m2, m5
   1250    mova        [tmpq+16*0], m4
   1251    mova        [tmpq+16*1], m2
   1252    punpcklbw            m4, m3, m1
   1253    punpckhbw            m3, m1
   1254    pmaddubsw            m4, m5
   1255    pmaddubsw            m3, m5
   1256    mova        [tmpq+16*2], m4
   1257    mova        [tmpq+16*3], m3
   1258    add                tmpq, r6
   1259    sub                  hd, 2
   1260    jg .v_w32_vloop
   1261    add                  r5, 32
   1262    movzx                hd, r3b
   1263    mov                srcq, r5
   1264 %if ARCH_X86_64
   1265    add                  r7, 16*4
   1266    mov                tmpq, r7
   1267 %else
   1268    mov                tmpq, tmpmp
   1269    add                tmpq, 16*4
   1270    mov               tmpmp, tmpq
   1271 %endif
   1272    sub                 r3d, 1<<8
   1273    jg .v_w32_hloop
   1274 %if WIN64
   1275    POP                  r7
   1276 %endif
   1277    RET
   1278 .hv:
   1279    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
   1280    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
        ; m6 = my << 11 in every word; pmulhrsw then computes
        ; (diff * (my << 11) + 0x4000) >> 15, i.e. the rounded my-weighted term
   1281    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
   1282    imul               mxyd, 0x08000800
   1283    WIN64_SPILL_XMM 8
   1284    movd                 m6, mxyd
   1285    add                  wq, r6
   1286    pshufd               m6, m6, q0000
   1287    jmp                  wq
   1288 .hv_w4:
   1289    mova                 m4, [base+bilin_h_shuf4]
   1290    movddup              m0, [srcq+strideq*0]
   1291    lea                  r3, [strideq*3]
   1292    pshufb               m0, m4
   1293    pmaddubsw            m0, m5            ; _ 0
   1294 .hv_w4_loop:
   1295    movq                 m1, [srcq+strideq*1]
   1296    movhps               m1, [srcq+strideq*2]
   1297    movq                 m2, [srcq+r3       ]
   1298    lea                srcq, [srcq+strideq*4]
   1299    movhps               m2, [srcq+strideq*0]
   1300    pshufb               m1, m4
   1301    pshufb               m2, m4
   1302    pmaddubsw            m1, m5            ; 1 2
   1303    pmaddubsw            m2, m5            ; 3 4
   1304    shufpd               m0, m1, 0x01      ; 0 1
   1305    shufpd               m3, m1, m2, 0x01  ; 2 3
   1306    psubw                m1, m0
   1307    pmulhrsw             m1, m6
   1308    paddw                m1, m0
   1309    mova                 m0, m2
   1310    psubw                m2, m3
   1311    pmulhrsw             m2, m6
   1312    paddw                m2, m3
   1313    mova        [tmpq+16*0], m1
   1314    mova        [tmpq+16*1], m2
   1315    add                tmpq, 32
   1316    sub                  hd, 4
   1317    jg .hv_w4_loop
   1318    RET
   1319 .hv_w8:
   1320    movu                 m0, [srcq+strideq*0]
   1321    pshufb               m0, m4
   1322    pmaddubsw            m0, m5 ; 0
   1323 .hv_w8_loop:
   1324    movu                 m1, [srcq+strideq*1]
   1325    lea                srcq, [srcq+strideq*2]
   1326    movu                 m2, [srcq+strideq*0]
   1327    pshufb               m1, m4
   1328    pshufb               m2, m4
   1329    pmaddubsw            m1, m5 ; 1
   1330    pmaddubsw            m2, m5 ; 2
   1331    psubw                m3, m1, m0
   1332    pmulhrsw             m3, m6
   1333    paddw                m3, m0
   1334    mova                 m0, m2
   1335    psubw                m2, m1
   1336    pmulhrsw             m2, m6
   1337    paddw                m2, m1
   1338    mova        [tmpq+16*0], m3
   1339    mova        [tmpq+16*1], m2
   1340    add                tmpq, 16*2
   1341    sub                  hd, 2
   1342    jg .hv_w8_loop
   1343    RET
   1344 .hv_w128:
   1345    lea                 r3d, [hq+(7<<8)]
   1346    mov                 r5d, 256
   1347    jmp .hv_w16_start
   1348 .hv_w64:
   1349    lea                 r3d, [hq+(3<<8)]
   1350    mov                 r5d, 128
   1351    jmp .hv_w16_start
   1352 .hv_w32:
   1353    lea                 r3d, [hq+(1<<8)]
   1354    mov                 r5d, 64
   1355    jmp .hv_w16_start
   1356 .hv_w16:
   1357    xor                 r3d, r3d
   1358    mov                 r5d, 32
   1359 .hv_w16_start:
        ; 16-pixel column tiling; r3d packs h (low byte) with remaining
        ; columns - 1 (bits 8+), r5d = output row pitch in bytes (2 * width)
   1360    mov                  r6, srcq
   1361 %if ARCH_X86_64
   1362 %if WIN64
   1363    PUSH                 r7
   1364 %endif
   1365    mov                  r7, tmpq
   1366 %endif
   1367 .hv_w16_hloop:
   1368    movu                 m0, [srcq+strideq*0+8*0]
   1369    movu                 m1, [srcq+strideq*0+8*1]
   1370    pshufb               m0, m4
   1371    pshufb               m1, m4
   1372    pmaddubsw            m0, m5 ; 0a
   1373    pmaddubsw            m1, m5 ; 0b
   1374 .hv_w16_vloop:
   1375    movu                 m2, [srcq+strideq*1+8*0]
   1376    pshufb               m2, m4
   1377    pmaddubsw            m2, m5 ; 1a
   1378    psubw                m3, m2, m0
   1379    pmulhrsw             m3, m6
   1380    paddw                m3, m0
   1381    mova        [tmpq+16*0], m3
   1382    movu                 m3, [srcq+strideq*1+8*1]
   1383    lea                srcq, [srcq+strideq*2]
   1384    pshufb               m3, m4
   1385    pmaddubsw            m3, m5 ; 1b
   1386    psubw                m0, m3, m1
   1387    pmulhrsw             m0, m6
   1388    paddw                m0, m1
   1389    mova        [tmpq+16*1], m0
   1390    add                tmpq, r5
   1391    movu                 m0, [srcq+strideq*0+8*0]
   1392    pshufb               m0, m4
   1393    pmaddubsw            m0, m5 ; 2a
   1394    psubw                m1, m0, m2
   1395    pmulhrsw             m1, m6
   1396    paddw                m1, m2
   1397    mova        [tmpq+16*0], m1
   1398    movu                 m1, [srcq+strideq*0+8*1]
   1399    pshufb               m1, m4
   1400    pmaddubsw            m1, m5 ; 2b
   1401    psubw                m2, m1, m3
   1402    pmulhrsw             m2, m6
   1403    paddw                m2, m3
   1404    mova        [tmpq+16*1], m2
   1405    add                tmpq, r5
   1406    sub                  hd, 2
   1407    jg .hv_w16_vloop
   1408    movzx                hd, r3b
   1409 %if ARCH_X86_64
   1410    add                  r6, 16
   1411    add                  r7, 2*16
   1412    mov                srcq, r6
   1413    mov                tmpq, r7
   1414 %else
   1415    mov                tmpq, tmpm
   1416    add                  r6, 16
   1417    add                tmpq, 2*16
   1418    mov                srcq, r6
   1419    mov                tmpm, tmpq
   1420 %endif
   1421    sub                 r3d, 1<<8
   1422    jg .hv_w16_hloop
   1423 %if WIN64
   1424    POP                  r7
   1425 %endif
   1426    RET
   1427 
   1428 ; int8_t subpel_filters[5][15][8]
        ; Packed byte offsets into subpel_filters: each constant holds two
        ; table offsets (n*15 entries apart). NOTE(review): the consumers
        ; below split these with "shr ..., 16" / low-byte reads and select
        ; the low half when h < 6 (cmovs) — presumably high 16 bits = full
        ; 8-tap table, low bits = reduced-tap table for small sizes; confirm
        ; against the subpel_filters layout.
   1429 %assign FILTER_REGULAR (0*15 << 16) | 3*15
   1430 %assign FILTER_SMOOTH  (1*15 << 16) | 4*15
   1431 %assign FILTER_SHARP   (2*15 << 16) | 3*15
   1432 
   1433 %macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
        ; Emits the entry stub %1_%2_8bpc: loads the horizontal (t0d) and
        ; vertical (t1d) FILTER_* selectors, then tail-jumps into the shared
        ; implementation named by arg 5. With only 4 args no jump is emitted,
        ; so the stub falls through into whatever cglobal follows it.
   1434 cglobal %1_%2_8bpc
   1435    mov                 t0d, FILTER_%3
   1436 %ifidn %3, %4
        ; same filter type both directions: reuse t0d instead of reloading
   1437    mov                 t1d, t0d
   1438 %else
   1439    mov                 t1d, FILTER_%4
   1440 %endif
   1441 %if %0 == 5 ; skip the jump in the last filter
   1442    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
   1443 %endif
   1444 %endmacro
   1445 
        ; Select which GPRs back the t0/t1 scratch registers used by the FN
        ; stubs above, per target ABI (x86-32 / Win64 / SysV x86-64).
   1446 %if ARCH_X86_32
   1447 DECLARE_REG_TMP 1, 2
   1448 %elif WIN64
   1449 DECLARE_REG_TMP 4, 5
   1450 %else
   1451 DECLARE_REG_TMP 7, 8
   1452 %endif
   1453 
        ; Addressing helpers for the put code: on x86-32, static data is
        ; reached relative to the put_ssse3 label kept in r1 (base_reg);
        ; on x86-64, base_reg is r8 and no displacement bias is needed.
   1454 %if ARCH_X86_32
   1455 %define base_reg r1
   1456 %define base base_reg-put_ssse3
   1457 %else
   1458 %define base_reg r8
   1459 %define base 0
   1460 %endif
   1461 
        ; Instantiate the put_8tap_<type>_8bpc entry stubs via FN. The first
        ; three tail-jump into put_6tap_8bpc; the last (regular) omits the
        ; 5th argument, so it falls through into the cglobal that follows.
   1462 %define PUT_8TAP_FN FN put_8tap,
   1463 PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_8bpc
   1464 PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
   1465 PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_8bpc
   1466 PUT_8TAP_FN regular,        REGULAR, REGULAR
   1467 
   1468 cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
   1469    imul                mxd, mxm, 0x010101
   1470    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
   1471 %if ARCH_X86_64
   1472    imul                myd, mym, 0x010101
   1473    add                 myd, t1d ; 8tap_v, my, 4tap_v
   1474 %else
   1475    imul                ssd, mym, 0x010101
   1476    add                 ssd, t1d ; 8tap_v, my, 4tap_v
   1477    mov                srcq, srcm
   1478 %endif
   1479    mov                  wd, wm
   1480    movifnidn            hd, hm
   1481    LEA            base_reg, put_ssse3
   1482    test                mxd, 0xf00
   1483    jnz .h
   1484 %if ARCH_X86_32
   1485    test                ssd, 0xf00
   1486 %else
   1487    test                myd, 0xf00
   1488 %endif
   1489    jnz .v
   1490 .put:
   1491    tzcnt                wd, wd
   1492    movzx                wd, word [base_reg+wq*2+table_offset(put,)]
   1493    movifnidn           ssq, ssmp
   1494    add                  wq, base_reg
   1495    movifnidn           dsq, dsmp
   1496 %if WIN64
   1497    pop                  r8
   1498 %endif
   1499    lea                  r6, [ssq*3]
   1500    jmp                  wq
   1501 .h:
   1502 %if ARCH_X86_32
   1503    test                ssd, 0xf00
   1504 %else
   1505    test                myd, 0xf00
   1506 %endif
   1507    jnz .hv
   1508    movifnidn           ssq, ssmp
   1509    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
   1510    cmp                  wd, 4
   1511    jle mangle(private_prefix %+ _put_8tap_8bpc %+ SUFFIX).h_w4
   1512    WIN64_SPILL_XMM      11
   1513 %if ARCH_X86_64
   1514    mova                 m8, [base+subpel_h_shufD]
   1515    mova                 m9, [base+subpel_h_shufE]
   1516    mova                m10, [base+subpel_h_shufF]
   1517 %endif
   1518    shr                 mxd, 16
   1519    sub                srcq, 2
   1520    movq                 m7, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
   1521    punpcklwd            m7, m7
   1522    pshufd               m4, m7, q0000
   1523    pshufd               m6, m7, q1111
   1524    pshufd               m7, m7, q2222
   1525    sub                  wd, 16
   1526    jge .h_w16
   1527 %macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
        ; 6-tap horizontal filter for one register of pixels: three pshufb
        ; patterns pair up source bytes for the three coefficient pairs in
        ; m4/m6/m7, the partial products are summed with the rounding bias
        ; in m5 (pw_34 at the .h call sites), and the result is >> 6.
        ; On x86-32 the shuffle masks are loaded from memory because there
        ; are not enough XMM registers to cache them in m8-m10.
   1528 %if ARCH_X86_32
   1529    pshufb               %2, %1, [base+subpel_h_shufD]
   1530    pshufb               %3, %1, [base+subpel_h_shufE]
   1531    pshufb               %1, [base+subpel_h_shufF]
   1532 %else
   1533    pshufb               %2, %1, m8
   1534    pshufb               %3, %1, m9
   1535    pshufb               %1, m10
   1536 %endif
   1537    pmaddubsw            %2, m4
   1538    pmaddubsw            %3, m6
   1539    pmaddubsw            %1, m7
   1540    paddw                %2, m5
   1541    paddw                %2, %3
   1542    paddw                %1, %2
   1543    psraw                %1, 6
   1544 %endmacro
   1545 %if ARCH_X86_32
   1546    mov                  r4, dsm
   1547 %endif
        ; 8 pixels wide: filter two rows per iteration, pack to bytes and
        ; store as two qword halves (x86-32 keeps the dst stride in r4).
   1548 .h_w8:
   1549    movu                 m0, [srcq+ssq*0]
   1550    movu                 m1, [srcq+ssq*1]
   1551    lea                srcq, [srcq+ssq*2]
   1552    PUT_6TAP_H           m0, m2, m3
   1553    PUT_6TAP_H           m1, m2, m3
   1554    packuswb             m0, m1
   1555 %if ARCH_X86_32
   1556    movq        [dstq+r4*0], m0
   1557    movhps      [dstq+r4*1], m0
   1558    lea                dstq, [dstq+r4*2]
   1559 %else
   1560    movq       [dstq+dsq*0], m0
   1561    movhps     [dstq+dsq*1], m0
   1562    lea                dstq, [dstq+dsq*2]
   1563 %endif
   1564    sub                  hd, 2
   1565    jg .h_w8
   1566    RET
        ; w >= 16: wq was biased by -16 above, so advance src/dst to the row
        ; end and walk each row via the negative offset r6, 16 pixels per
        ; inner step, one row per outer step.
   1567 .h_w16:
   1568    add                srcq, wq
   1569    add                dstq, wq
   1570    neg                  wq
   1571 .h_w16_loop_v:
   1572    mov                  r6, wq
   1573 .h_w16_loop_h:
   1574    movu                 m0, [srcq+r6+8*0]
   1575    movu                 m1, [srcq+r6+8*1]
   1576    PUT_6TAP_H           m0, m2, m3
   1577    PUT_6TAP_H           m1, m2, m3
   1578    packuswb             m0, m1
   1579    mova          [dstq+r6], m0
   1580    add                  r6, 16
   1581    jle .h_w16_loop_h
   1582    add                srcq, ssq
   1583    add                dstq, dsmp
   1584    dec                  hd
   1585    jg .h_w16_loop_v
   1586    RET
        ; Vertical-only 6-tap path: build the three word-pair coefficient
        ; vectors (m5/m6/m7) and a negated stride (r6 on x86-32, nsq on
        ; x86-64) used to address the two rows above the current position.
   1587 .v:
   1588 %if ARCH_X86_32
   1589    %define             dsq  r4
   1590    %define              m8  [base+pw_512]
   1591    movzx               mxd, ssb
   1592    shr                 ssd, 16
   1593    cmp                  hd, 6
        ; h < 6: use the alternate filter index from the low byte
   1594    cmovs               ssd, mxd
   1595    movq                 m7, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
   1596    mov                 ssq, ssm
   1597    punpcklwd            m7, m7
   1598    pshufd               m5, m7, q0000
   1599    mov                  r6, ssq
   1600    pshufd               m6, m7, q1111
   1601    neg                  r6
   1602    pshufd               m7, m7, q2222
   1603    cmp                  wd, 4
   1604    jge .v_w4
   1605 %else
   1606    WIN64_SPILL_XMM       9, 12
   1607    movzx               mxd, myb
   1608    shr                 myd, 16
   1609    cmp                  hd, 6
   1610    cmovs               myd, mxd
   1611    movq                 m7, [base_reg-put_ssse3+subpel_filters+1+myq*8]
   1612    mova                 m8, [base+pw_512]
   1613    punpcklwd            m7, m7
   1614    pshufd               m5, m7, q0000
   1615    mov                 nsq, ssq
   1616    pshufd               m6, m7, q1111
   1617    neg                 nsq
   1618    pshufd               m7, m7, q2222
   1619    cmp                  wd, 4
   1620    je .v_w4
   1621    jg .v_w8
   1622 %endif
        ; 2 pixels wide: rows are interleaved pairwise (words, then bytes)
        ; so each pmaddubsw computes two output rows at once (a and b).
   1623 .v_w2:
   1624 %if ARCH_X86_32
   1625    mov                 dsq, dsm
   1626    movd                 m1, [srcq+r6 *2]
   1627    movd                 m3, [srcq+r6 *1]
   1628 %else
   1629    movd                 m1, [srcq+nsq*2]
   1630    movd                 m3, [srcq+nsq*1]
   1631 %endif
   1632    movd                 m2, [srcq+ssq*0]
   1633    movd                 m4, [srcq+ssq*1]
   1634    lea                srcq, [srcq+ssq*2]
   1635    movd                 m0, [srcq+ssq*0]
   1636    punpcklwd            m1, m3     ; 0 1
   1637    punpcklwd            m3, m2     ; 1 2
   1638    punpcklwd            m2, m4     ; 2 3
   1639    punpcklwd            m4, m0     ; 3 4
   1640    punpcklbw            m1, m3     ; 01 12
   1641    punpcklbw            m2, m4     ; 23 34
   1642 .v_w2_loop:
   1643    movd                 m3, [srcq+ssq*1]
   1644    lea                srcq, [srcq+ssq*2]
   1645    pmaddubsw            m4, m1, m5 ; a0 b0
   1646    mova                 m1, m2
   1647    pmaddubsw            m2, m6     ; a1 b1
   1648    paddw                m4, m2
   1649    punpcklwd            m2, m0, m3 ; 4 5
   1650    movd                 m0, [srcq+ssq*0]
   1651    punpcklwd            m3, m0     ; 5 6
   1652    punpcklbw            m2, m3     ; 45 56
   1653    pmaddubsw            m3, m2, m7 ; a2 b2
   1654    paddw                m4, m3
   1655    pmulhrsw             m4, m8     ; round by pw_512 (>> 6 with rounding)
   1656    packuswb             m4, m4
   1657    movd                r6d, m4
   1658    mov        [dstq+dsq*0], r6w
   1659    shr                 r6d, 16
   1660    mov        [dstq+dsq*1], r6w
   1661    lea                dstq, [dstq+dsq*2]
   1662    sub                  hd, 2
   1663    jg .v_w2_loop
   1664    RET
        ; 4 pixels wide (on x86-32 this also serves w > 4 by looping over
        ; 4-pixel column strips; r6 packs width<<16 | height).
   1665 .v_w4:
   1666 %if ARCH_X86_32
   1667    shl                  wd, 14
   1668    lea                srcq, [srcq+r6*2]
   1669    lea                 r6d, [hq+wq-(1<<16)]
   1670    mov                srcm, srcq
   1671    mov                 dsq, dsm
   1672 .v_w4_loop0:
   1673    movd                 m1, [srcq+ssq*0]
   1674    movd                 m3, [srcq+ssq*1]
   1675    lea                srcq, [srcq+ssq*2]
   1676 %else
   1677    movd                 m1, [srcq+nsq*2]
   1678    movd                 m3, [srcq+nsq*1]
   1679 %endif
   1680    movd                 m2, [srcq+ssq*0]
   1681    movd                 m4, [srcq+ssq*1]
   1682    lea                srcq, [srcq+ssq*2]
   1683    movd                 m0, [srcq+ssq*0]
   1684    punpckldq            m1, m3     ; 0 1
   1685    punpckldq            m3, m2     ; 1 2
   1686    punpckldq            m2, m4     ; 2 3
   1687    punpckldq            m4, m0     ; 3 4
   1688    punpcklbw            m1, m3     ; 01 12
   1689    punpcklbw            m2, m4     ; 23 34
   1690 .v_w4_loop:
   1691    movd                 m3, [srcq+ssq*1]
   1692    lea                srcq, [srcq+ssq*2]
   1693    pmaddubsw            m4, m1, m5 ; a0 b0
   1694    mova                 m1, m2
   1695    pmaddubsw            m2, m6     ; a1 b1
   1696    paddw                m4, m2
   1697    punpckldq            m2, m0, m3 ; 4 5
   1698    movd                 m0, [srcq+ssq*0]
   1699    punpckldq            m3, m0     ; 5 6
   1700    punpcklbw            m2, m3     ; 45 56
   1701    pmaddubsw            m3, m2, m7 ; a2 b2
   1702    paddw                m4, m3
   1703    pmulhrsw             m4, m8
   1704    packuswb             m4, m4
   1705    movd       [dstq+dsq*0], m4
   1706    psrlq                m4, 32
   1707    movd       [dstq+dsq*1], m4
   1708    lea                dstq, [dstq+dsq*2]
   1709    sub                  hd, 2
   1710    jg .v_w4_loop
   1711 %if ARCH_X86_32
        ; next 4-pixel column strip; hd restored from the packed counter
   1712    mov                srcq, srcm
   1713    mov                dstq, dstm
   1714    movzx                hd, r6w
   1715    add                srcq, 4
   1716    add                dstq, 4
   1717    mov                srcm, srcq
   1718    mov                dstm, dstq
   1719    sub                 r6d, 1<<16
   1720    jg .v_w4_loop0
   1721 %endif
   1722    RET
   1723 %if ARCH_X86_64
        ; w >= 8 (64-bit only): loop over 8-pixel column strips; r6 packs
        ; width<<5 | height, r4/r7 walk src/dst within a strip.
   1724 .v_w8:
   1725    WIN64_PUSH_XMM       12
   1726    shl                  wd, 5
   1727    lea                 r6d, [hq+wq-256]
   1728 .v_w8_loop0:
   1729    movq                 m1, [srcq+nsq*2]
   1730    movq                 m2, [srcq+nsq*1]
   1731    lea                  r4, [srcq+ssq*2]
   1732    movq                 m3, [srcq+ssq*0]
   1733    movq                 m4, [srcq+ssq*1]
   1734    mov                  r7, dstq
   1735    movq                 m0, [r4  +ssq*0]
   1736    punpcklbw            m1, m2     ; 01
   1737    punpcklbw            m2, m3     ; 12
   1738    punpcklbw            m3, m4     ; 23
   1739    punpcklbw            m4, m0     ; 34
   1740 .v_w8_loop:
   1741    pmaddubsw           m10, m1, m5 ; a0
   1742    mova                 m1, m3
   1743    pmaddubsw           m11, m2, m5 ; b0
   1744    mova                 m2, m4
   1745    pmaddubsw            m3, m6     ; a1
   1746    pmaddubsw            m4, m6     ; b1
   1747    paddw               m10, m3
   1748    paddw               m11, m4
   1749    movq                 m4, [r4+ssq*1]
   1750    lea                  r4, [r4+ssq*2]
   1751    punpcklbw            m3, m0, m4 ; 45
   1752    movq                 m0, [r4+ssq*0]
   1753    punpcklbw            m4, m0     ; 56
   1754    pmaddubsw            m9, m3, m7 ; a2
   1755    paddw               m10, m9
   1756    pmaddubsw            m9, m4, m7 ; b2
   1757    paddw               m11, m9
   1758    pmulhrsw            m10, m8
   1759    pmulhrsw            m11, m8
   1760    packuswb            m10, m11
   1761    movq         [r7+dsq*0], m10
   1762    movhps       [r7+dsq*1], m10
   1763    lea                  r7, [r7+dsq*2]
   1764    sub                  hd, 2
   1765    jg .v_w8_loop
   1766    add                srcq, 8
   1767    add                dstq, 8
   1768    movzx                hd, r6b
   1769    sub                 r6d, 1<<8
   1770    jg .v_w8_loop0
   1771    RET
   1772 %endif ;ARCH_X86_64
        ; Combined H+V path, w <= 4: 4-tap horizontal filter (loaded into
        ; m1 at +2 into the table) plus 6-tap vertical filter sign-extended
        ; to words in m8-m10 (x86-32 spills those to the stack).
   1773 .hv:
   1774    RESET_STACK_STATE
   1775    cmp                  wd, 4
   1776    jg .hv_w8
   1777 %if ARCH_X86_32
   1778    and                 mxd, 0x7f
   1779 %else
   1780    movzx               mxd, mxb
   1781 %endif
   1782    dec                srcq
   1783    movd                 m1, [base_reg-put_ssse3+subpel_filters+2+mxq*8]
   1784 %if ARCH_X86_32
   1785    movzx               mxd, ssb
   1786    shr                 ssd, 16
   1787    cmp                  hd, 6
   1788    cmovs               ssd, mxd
   1789    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
   1790    mov                 ssq, ssmp
   1791    ALLOC_STACK   -mmsize*4
   1792    %define              m8  [rsp+mmsize*0]
   1793    %define              m9  [rsp+mmsize*1]
   1794    %define             m10  [rsp+mmsize*2]
   1795    punpcklbw            m0, m0
   1796    sub                srcq, ssq
   1797    psraw                m0, 8 ; sign-extend
   1798    sub                srcq, ssq
   1799    pshufd               m2, m0, q0000
   1800    mova                 m8, m2
   1801    pshufd               m2, m0, q1111
   1802    mova                 m9, m2
   1803    pshufd               m2, m0, q2222
   1804    mova                m10, m2
   1805 %else
   1806    movzx               mxd, myb
   1807    shr                 myd, 16
   1808    cmp                  hd, 6
   1809    cmovs               myd, mxd
   1810    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+myq*8]
   1811    WIN64_SPILL_XMM      11, 14
   1812    mov                 nsq, ssq
   1813    punpcklbw            m0, m0
   1814    neg                 nsq
   1815    psraw                m0, 8 ; sign-extend
   1816    pshufd               m8, m0, q0000
   1817    pshufd               m9, m0, q1111
   1818    pshufd              m10, m0, q2222
   1819 %endif
   1820    cmp                  wd, 4
   1821    je .hv_w4
        ; 2 pixels wide H+V: 4-tap horizontal pass (m7) keeps 2 fractional
        ; bits (psraw 2); the 6-tap vertical pass accumulates in dwords
        ; (m8-m10) and shifts by 10 to reach pixel range.
   1822 .hv_w2:
   1823    mova                 m5, [base+subpel_h_shuf4]
   1824    mova                 m6, [base+pw_34]
   1825    pshufd               m7, m1, q0000
   1826 %if ARCH_X86_32
   1827    movq                 m2, [srcq+ssq*0]
   1828    movhps               m2, [srcq+ssq*1]
   1829    lea                srcq, [srcq+ssq*2]
   1830    mov                 dsq, [rstk+stack_offset+gprsize*2]
   1831 %else
   1832    movq                 m2, [srcq+nsq*2]
   1833    movhps               m2, [srcq+nsq*1] ; 0 1
   1834 %endif
   1835    movq                 m1, [srcq+ssq*0]
   1836    movhps               m1, [srcq+ssq*1] ; 2 3
   1837    lea                srcq, [srcq+ssq*2]
   1838    movq                 m0, [srcq+ssq*0] ; 4
   1839    REPX  {pshufb    x, m5}, m2, m1, m0
   1840    REPX  {pmaddubsw x, m7}, m2, m1, m0
   1841    phaddw               m2, m1
   1842    phaddw               m0, m0
   1843    paddw                m2, m6
   1844    paddw                m0, m6
   1845    psraw                m2, 2            ; 0 1 2 3
   1846    psraw                m0, 2
   1847    palignr              m0, m2, 4        ; 1 2 3 4
   1848    punpcklwd            m1, m2, m0       ; 01 12
   1849    punpckhwd            m2, m0           ; 23 34
   1850 .hv_w2_loop:
   1851    movq                 m3, [srcq+ssq*1]
   1852    lea                srcq, [srcq+ssq*2]
   1853    movhps               m3, [srcq+ssq*0] ; 5 6
   1854    pshufb               m3, m5
   1855    pmaddubsw            m3, m7
   1856    pmaddwd              m4, m8, m1       ; a0 b0
   1857    mova                 m1, m2
   1858    pmaddwd              m2, m9           ; a1 b1
   1859    phaddw               m3, m3
   1860    paddw                m3, m6
   1861    psraw                m3, 2
   1862    paddd                m4, m2
   1863    palignr              m2, m3, m0, 12   ; 4 5
   1864    mova                 m0, m3
   1865    punpcklwd            m2, m3           ; 45 56
   1866    pmaddwd              m3, m10, m2      ; a2 b2
   1867    paddd                m4, m3
   1868    psrad                m4, 10
   1869    packssdw             m4, m5           ; high half is don't-care
   1870    packuswb             m4, m4
   1871    movd                r6d, m4
   1872    mov        [dstq+dsq*0], r6w
   1873    shr                 r6d, 16
   1874    mov        [dstq+dsq*1], r6w
   1875    lea                dstq, [dstq+dsq*2]
   1876    sub                  hd, 2
   1877    jg .hv_w2_loop
   1878    RET
        ; 4 pixels wide H+V: horizontal 4-tap results for two rows are
        ; phaddw-merged per register; the sliding window of row pairs lives
        ; in m1/m2/m3/m4, vertical 6-tap accumulation in dwords.
   1879 .hv_w4:
   1880 %if ARCH_X86_32
   1881    movq                 m3, [srcq+ssq*0]
   1882    movq                 m4, [srcq+ssq*1]
   1883    lea                srcq, [srcq+ssq*2]
   1884    mov                 dsq, [rstk+stack_offset+gprsize*2]
   1885    %define             m11  [base+pw_34]
   1886    %define             m12  [base+subpel_h_shufA]
   1887    %define             m13  [rsp+mmsize*3]
   1888    pshufd               m1, m1, q0000
   1889    mova                m13, m1
   1890 %else
   1891    WIN64_PUSH_XMM       14
   1892    movq                 m3, [srcq+nsq*2]
   1893    movq                 m4, [srcq+nsq*1]
   1894    pshufd              m13, m1, q0000
   1895    mova                m12, [base+subpel_h_shufA]
   1896    mova                m11, [base+pw_34]
   1897 %endif
   1898    movq                 m0, [srcq+ssq*0]
   1899    movq                 m1, [srcq+ssq*1]
   1900    lea                srcq, [srcq+ssq*2]
   1901    movq                 m2, [srcq+ssq*0]
   1902 %if ARCH_X86_32
   1903    mova                 m5, m12
   1904    mova                 m6, m13
   1905    REPX {pshufb    x, m5 }, m3, m4, m0, m1, m2
   1906    mova                 m5, m11
   1907    REPX {pmaddubsw x, m6 }, m3, m4, m0, m1, m2
   1908 %else
   1909    REPX {pshufb    x, m12}, m3, m4, m0, m1, m2
   1910    REPX {pmaddubsw x, m13}, m3, m4, m0, m1, m2
   1911 %endif
   1912    phaddw               m3, m0      ; 0 2
   1913    phaddw               m4, m1      ; 1 3
   1914    phaddw               m0, m2      ; 2 4
   1915 %if ARCH_X86_32
   1916    REPX     {paddw x, m5 }, m3, m4, m0
   1917 %else
   1918    REPX     {paddw x, m11}, m3, m4, m0
   1919 %endif
   1920    REPX     {psraw x, 2  }, m3, m4, m0
   1921    punpcklwd            m1, m3, m4  ; 01
   1922    punpckhwd            m3, m4      ; 23
   1923    punpcklwd            m2, m4, m0  ; 12
   1924    punpckhwd            m4, m0      ; 34
   1925 .hv_w4_loop:
   1926    movq                 m7, [srcq+ssq*1]
   1927    lea                srcq, [srcq+ssq*2]
   1928    movq                 m6, [srcq+ssq*0]
   1929    pshufb               m7, m12
   1930    pshufb               m6, m12
   1931    pmaddubsw            m7, m13
   1932    pmaddubsw            m6, m13
   1933    pmaddwd              m5, m8, m1  ; a0
   1934    mova                 m1, m3
   1935    phaddw               m7, m6      ; 5 6
   1936    pmaddwd              m6, m8, m2  ; b0
   1937    mova                 m2, m4
   1938    pmaddwd              m3, m9      ; a1
   1939    pmaddwd              m4, m9      ; b1
   1940    paddw                m7, m11
   1941    psraw                m7, 2
   1942    paddd                m5, m3
   1943    paddd                m6, m4
   1944    shufpd               m4, m0, m7, 0x01 ; 4 5
   1945    mova                 m0, m7
   1946    punpcklwd            m3, m4, m7  ; 45
   1947    punpckhwd            m4, m7      ; 56
   1948    pmaddwd              m7, m10, m3 ; a2
   1949    paddd                m5, m7
   1950    pmaddwd              m7, m10, m4 ; b2
   1951    paddd                m6, m7
   1952    psrad                m5, 10
   1953    psrad                m6, 10
   1954    packssdw             m5, m6
   1955    packuswb             m5, m5
   1956    movd       [dstq+dsq*0], m5
   1957    psrlq                m5, 32
   1958    movd       [dstq+dsq*1], m5
   1959    lea                dstq, [dstq+dsq*2]
   1960    sub                  hd, 2
   1961    jg .hv_w4_loop
   1962    RET
        ; w >= 8 H+V: both passes are 6-tap. Horizontal byte-pair coeffs go
        ; to [rsp+16*0..2], vertical word coeffs (sign-extended) to
        ; [rsp+16*3..5]; r6 packs (w<<13) | h as column/row counters.
   1963 .hv_w8:
   1964    RESET_STACK_STATE
   1965    shr                 mxd, 16
   1966    sub                srcq, 2
   1967 %if ARCH_X86_32
   1968    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
   1969    movzx               mxd, ssb
   1970    shr                 ssd, 16
   1971    cmp                  hd, 6
   1972    cmovs               ssd, mxd
   1973    movq                 m1, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
   1974    shl                  wd, 13
   1975    mov                 ssq, ssm
   1976    lea                 r6d, [hq+wq-(1<<16)]
   1977 %assign regs_used 5
   1978    ALLOC_STACK  -mmsize*16
   1979 %assign regs_used 7
   1980    mov                 dsq, [rstk+stack_offset+gprsize*2]
   1981    sub                srcq, ssq
   1982    sub                srcq, ssq
   1983 %if STACK_ALIGNMENT < 16
   1984    %define            srcm  [esp+mmsize*15+gprsize*0]
   1985    %define            dstm  [esp+mmsize*15+gprsize*1]
   1986    mov                dstm, dstq
   1987 %endif
   1988    mov                srcm, srcq
   1989 %else
   1990    ALLOC_STACK        16*6, 16
   1991    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
   1992    movzx               mxd, myb
   1993    shr                 myd, 16
   1994    cmp                  hd, 6
   1995    cmovs               myd, mxd
   1996    movq                 m1, [base_reg-put_ssse3+subpel_filters+1+myq*8]
   1997    mov                 nsq, ssq
   1998    shl                  wd, 13
   1999    neg                 nsq
   2000    lea                 r6d, [hq+wq-(1<<16)]
   2001 %endif
   2002    mova                 m7, [base+pw_34]
   2003    punpcklwd            m0, m0
   2004    punpcklbw            m1, m1
   2005    psraw                m1, 8 ; sign-extend
        ; horizontal coeff pairs -> [rsp+16*0..2]
   2006    pshufd               m2, m0, q0000
   2007    mova         [rsp+16*0], m2
   2008    pshufd               m2, m0, q1111
   2009    mova         [rsp+16*1], m2
   2010    pshufd               m0, m0, q2222
   2011    mova         [rsp+16*2], m0
        ; vertical coeff pairs -> [rsp+16*3..5]
   2012    pshufd               m2, m1, q0000
   2013    mova         [rsp+16*3], m2
   2014    pshufd               m2, m1, q1111
   2015    mova         [rsp+16*4], m2
   2016    pshufd               m1, m1, q2222
   2017    mova         [rsp+16*5], m1
        ; Horizontal 6-tap pass for the H+V path: two pshufb layouts plus a
        ; shufps build the three pixel-pair sets, which are multiplied by
        ; mul[1-3], rounded with m7 (pw_34) and kept at 2 fractional bits
        ; for the subsequent word-precision vertical pass.
   2018 %macro HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \
   2019                     [rsp+16*0], [rsp+16*1], [rsp+16*2] ; src/dst, tmp[1-2], shuf[1-2], mul[1-3]
   2020    pshufb               %2, %1, %4
   2021    pshufb               %1, %5
   2022    pmaddubsw            %3, %2, %6
   2023    shufps               %2, %1, q2121
   2024    pmaddubsw            %1, %8
   2025    pmaddubsw            %2, %7
   2026    paddw                %3, m7
   2027    paddw                %1, %3
   2028    paddw                %1, %2
   2029    psraw                %1, 2
   2030 %endmacro
        ; Outer loop: one 8-pixel-wide column strip per pass. The x86-32
        ; path keeps the sliding window of filtered rows (low/high word
        ; halves of pairs 01..34 plus the newest row) in [rsp+16*6..14].
   2031 .hv_w8_loop0:
   2032    mova                 m2, [base+subpel_h_shufD]
   2033    mova                 m3, [base+subpel_h_shufF]
   2034    mova                 m4, [rsp+16*0]
   2035 %if ARCH_X86_32
   2036    movu                 m0, [srcq+ssq*0]
   2037    movu                 m1, [srcq+ssq*1]
   2038    lea                srcq, [srcq+ssq*2]
   2039    HV_H_6TAP            m0, m5, m6, m2, m3, m4
   2040    HV_H_6TAP            m1, m5, m6, m2, m3, m4
   2041    movu                 m5, [srcq+ssq*0]
   2042    punpcklwd            m6, m0, m1   ; 01
   2043    punpckhwd            m0, m1
   2044    mova        [rsp+16* 6], m6
   2045    mova        [rsp+16* 7], m0
   2046    HV_H_6TAP            m5, m0, m6, m2, m3, m4
   2047    movu                 m0, [srcq+ssq*1]
   2048    lea                srcq, [srcq+ssq*2]
   2049    punpcklwd            m6, m1, m5   ; 12
   2050    punpckhwd            m1, m5
   2051    mova        [rsp+16* 8], m6
   2052    mova        [rsp+16* 9], m1
   2053    HV_H_6TAP            m0, m1, m6, m2, m3, m4
   2054    movu                 m1, [srcq+ssq*0]
   2055    punpcklwd            m6, m5, m0   ; 23
   2056    punpckhwd            m5, m0
   2057    mova        [rsp+16*10], m6
   2058    mova        [rsp+16*11], m5
   2059    HV_H_6TAP            m1, m5, m6, m2, m3, m4
   2060    mova        [rsp+16*14], m1
   2061    punpcklwd            m6, m0, m1   ; 34
   2062    punpckhwd            m0, m1
   2063    mova        [rsp+16*12], m6
   2064    mova        [rsp+16*13], m0
        ; Vertical 6-tap in dwords, two output rows (a/b) per iteration;
        ; primed/unprimed = low/high word halves of the 8-wide row.
   2065 .hv_w8_loop:
   2066    mova                 m3, [rsp+16* 3]
   2067    pmaddwd              m0, m3, [rsp+16* 6] ; a0
   2068    pmaddwd              m2, m3, [rsp+16* 7] ; a0'
   2069    pmaddwd              m1, m3, [rsp+16* 8] ; b0
   2070    pmaddwd              m3, [rsp+16* 9]     ; b0'
   2071    mova                 m6, [rsp+16* 4]
   2072    mova                 m4, [rsp+16*10]
   2073    mova                 m5, [rsp+16*11]
   2074    mova        [rsp+16* 6], m4
   2075    pmaddwd              m4, m6       ; a1
   2076    mova        [rsp+16* 7], m5
   2077    pmaddwd              m5, m6       ; a1'
   2078    paddd                m0, m4
   2079    mova                 m4, [rsp+16*12]
   2080    paddd                m2, m5
   2081    mova                 m5, [rsp+16*13]
   2082    mova        [rsp+16* 8], m4
   2083    pmaddwd              m4, m6       ; b1
   2084    mova        [rsp+16* 9], m5
   2085    pmaddwd              m5, m6       ; b1'
   2086    movu                 m6, [srcq+ssq*1]
   2087    lea                srcq, [srcq+ssq*2]
   2088    paddd                m1, m4
   2089    paddd                m3, m5
   2090    HV_H_6TAP            m6, m4, m5
   2091    mova                 m5, [rsp+16*14]
   2092    punpcklwd            m4, m5, m6   ; 45
   2093    punpckhwd            m5, m6
   2094    mova        [rsp+16*10], m4
   2095    mova        [rsp+16*11], m5
   2096    pmaddwd              m4, [rsp+16*5] ; a2
   2097    pmaddwd              m5, [rsp+16*5] ; a2'
   2098    paddd                m0, m4
   2099    movu                 m4, [srcq+ssq*0]
   2100    paddd                m2, m5
   2101    psrad                m0, 10
   2102    psrad                m2, 10
   2103    packssdw             m0, m2
   2104    HV_H_6TAP            m4, m2, m5
   2105    mova                 m2, [rsp+16*5]
   2106    punpcklwd            m5, m6, m4   ; 56
   2107    mova        [rsp+16*14], m4
   2108    punpckhwd            m6, m4
   2109    mova        [rsp+16*12], m5
   2110    pmaddwd              m5, m2       ; b2
   2111    mova        [rsp+16*13], m6
   2112    pmaddwd              m6, m2       ; b2'
   2113    paddd                m1, m5
   2114    paddd                m3, m6
   2115    psrad                m1, 10
   2116    psrad                m3, 10
   2117    packssdw             m1, m3
   2118    packuswb             m0, m1
   2119    movq       [dstq+dsq*0], m0
   2120    movhps     [dstq+dsq*1], m0
   2121    lea                dstq, [dstq+dsq*2]
   2122    sub                  hd, 2
   2123    jg .hv_w8_loop
        ; advance to the next 8-pixel column strip
   2124    mov                srcq, srcm
   2125    mov                dstq, dstm
   2126    movzx                hd, r6w
   2127    add                srcq, 8
   2128    add                dstq, 8
   2129    mov                srcm, srcq
   2130    mov                dstm, dstq
   2131 %else
        ; x86-64: enough registers to hold the whole sliding window of
        ; filtered row pairs (m8-m15); r4/r7 walk src/dst within a strip.
   2132    movu                 m9, [srcq+nsq*2]
   2133    movu                m11, [srcq+nsq*1]
   2134    lea                  r4, [srcq+ssq*2]
   2135    movu                m13, [srcq+ssq*0]
   2136    movu                m15, [srcq+ssq*1]
   2137    mov                  r7, dstq
   2138    movu                 m6, [r4  +ssq*0]
   2139    mova                 m5, [rsp+16*1]
   2140    mova                 m8, [rsp+16*2]
   2141    HV_H_6TAP            m9, m0, m1, m2, m3, m4, m5, m8
   2142    HV_H_6TAP           m11, m0, m1, m2, m3, m4, m5, m8
   2143    HV_H_6TAP           m13, m0, m1, m2, m3, m4, m5, m8
   2144    HV_H_6TAP           m15, m0, m1, m2, m3, m4, m5, m8
   2145    HV_H_6TAP            m6, m0, m1, m2, m3, m4, m5, m8
   2146    punpcklwd            m8, m9, m11  ; 01
   2147    punpckhwd            m9, m11
   2148    punpcklwd           m10, m11, m13 ; 12
   2149    punpckhwd           m11, m13
   2150    punpcklwd           m12, m13, m15 ; 23
   2151    punpckhwd           m13, m15
   2152    punpcklwd           m14, m15, m6  ; 34
   2153    punpckhwd           m15, m6
   2154 .hv_w8_loop:
   2155    mova                 m3, [rsp+16*3]
   2156    mova                 m4, [rsp+16*4]
   2157    pmaddwd              m0, m8, m3  ; a0
   2158    mova                 m8, m12
   2159    pmaddwd              m2, m9, m3  ; a0'
   2160    mova                 m9, m13
   2161    pmaddwd              m1, m10, m3 ; b0
   2162    mova                m10, m14
   2163    pmaddwd              m3, m11     ; b0'
   2164    mova                m11, m15
   2165    REPX    {pmaddwd x, m4}, m12, m13, m14, m15
   2166    paddd                m0, m12
   2167    paddd                m2, m13
   2168    paddd                m1, m14
   2169    paddd                m3, m15
   2170    movu                m15, [r4+ssq*1]
   2171    lea                  r4, [r4+ssq*2]
   2172    HV_H_6TAP           m15, m4, m5
   2173    punpcklwd           m12, m6, m15
   2174    punpckhwd           m13, m6, m15
   2175    movu                 m6, [r4+ssq*0]
   2176    HV_H_6TAP            m6, m4, m5
   2177    mova                 m4, [rsp+16*5]
   2178    punpcklwd           m14, m15, m6
   2179    punpckhwd           m15, m6
   2180    pmaddwd              m5, m12, m4  ; a2
   2181    paddd                m0, m5
   2182    pmaddwd              m5, m13, m4  ; a2'
   2183    paddd                m2, m5
   2184    pmaddwd              m5, m14, m4  ; b2
   2185    paddd                m1, m5
   2186    pmaddwd              m4, m15      ; b2'
   2187    paddd                m3, m4
   2188    REPX      {psrad x, 10}, m0, m2, m1, m3
   2189    packssdw             m0, m2
   2190    packssdw             m1, m3
   2191    packuswb             m0, m1
   2192    movq         [r7+dsq*0], m0
   2193    movhps       [r7+dsq*1], m0
   2194    lea                  r7, [r7+dsq*2]
   2195    sub                  hd, 2
   2196    jg .hv_w8_loop
   2197    add                srcq, 8
   2198    add                dstq, 8
   2199    movzx                hd, r6b
   2200 %endif
   2201    sub                 r6d, 1<<16
   2202    jg .hv_w8_loop0
   2203    RET
   2204 
        ; Entry points for the filter combinations that need the full 8-tap
        ; code path (the macro is defined earlier in the file; the 4th
        ; argument is presumably the tail-jump target — the last entry
        ; omits it and falls through into the cglobal below; verify against
        ; the PUT_8TAP_FN definition).
   2205 PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_8bpc
   2206 PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_8bpc
   2207 PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_8bpc
   2208 PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_8bpc
   2209 PUT_8TAP_FN sharp,          SHARP,   SHARP
   2210 
        ; put_8tap_8bpc entry: combine the subpel offsets with the filter
        ; type in t0/t1, then dispatch: h-only, v-only, h+v, or (no subpel
        ; in either direction) a width-indexed jump into the plain-copy
        ; table. On x86-32, my/ssd share a register.
   2211 cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
   2212    imul                mxd, mxm, 0x010101
   2213    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
   2214 %if ARCH_X86_64
   2215    imul                myd, mym, 0x010101
   2216    add                 myd, t1d ; 8tap_v, my, 4tap_v
   2217 %else
   2218    imul                ssd, mym, 0x010101
   2219    add                 ssd, t1d ; 8tap_v, my, 4tap_v
   2220    mov                srcq, srcm
   2221 %endif
   2222    mov                  wd, wm
   2223    movifnidn            hd, hm
   2224    LEA            base_reg, put_ssse3
   2225    test                mxd, 0xf00
   2226    jnz .h
   2227 %if ARCH_X86_32
   2228    test                ssd, 0xf00
   2229 %else
   2230    test                myd, 0xf00
   2231 %endif
   2232    jnz .v
   2233    tzcnt                wd, wd
   2234    movzx                wd, word [base_reg+wq*2+table_offset(put,)]
   2235    movifnidn           ssq, ssmp
   2236    add                  wq, base_reg
   2237    movifnidn           dsq, dsmp
   2238 %if WIN64
   2239    pop                  r8
   2240 %endif
   2241    lea                  r6, [ssq*3]
   2242    jmp                  wq
        ; 2 pixels wide, 4-tap horizontal (coeffs in m4, set up by .h_w4):
        ; two rows packed into one register per iteration.
   2243 .h_w2:
   2244    mova                 m3, [base+subpel_h_shuf4]
   2245    movifnidn           dsq, dsmp
   2246 .h_w2_loop:
   2247    movq                 m0, [srcq+ssq*0]
   2248    movhps               m0, [srcq+ssq*1]
   2249    lea                srcq, [srcq+ssq*2]
   2250    pshufb               m0, m3
   2251    pmaddubsw            m0, m4
   2252    phaddw               m0, m0
   2253    paddw                m0, m5 ; pw34
   2254    psraw                m0, 6
   2255    packuswb             m0, m0
   2256    movd                r6d, m0
   2257    mov        [dstq+dsq*0], r6w
   2258    shr                 r6d, 16
   2259    mov        [dstq+dsq*1], r6w
   2260    lea                dstq, [dstq+dsq*2]
   2261    sub                  hd, 2
   2262    jg .h_w2_loop
   2263    RET
        ; w <= 4 horizontal: load the 4-tap filter (+2 into the 8-tap
        ; entry), then branch to .h_w2 for the narrowest case.
   2264 .h_w4:
   2265 %if ARCH_X86_32
   2266    and                 mxd, 0x7f
   2267 %else
   2268    movzx               mxd, mxb
   2269 %endif
   2270    movd                 m4, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
   2271    dec                srcq
   2272    pshufd               m4, m4, q0000
   2273    cmp                  wd, 4
   2274    jl .h_w2
   2275    mova                 m3, [base+subpel_h_shufA]
   2276    movifnidn           dsq, dsmp
   2277 .h_w4_loop:
   2278    movq                 m0, [srcq+ssq*0] ; 1
   2279    movq                 m1, [srcq+ssq*1] ; 2
   2280    lea                srcq, [srcq+ssq*2]
   2281    pshufb               m0, m3 ; subpel_h_shufA
   2282    pshufb               m1, m3 ; subpel_h_shufA
   2283    pmaddubsw            m0, m4 ; subpel_filters
   2284    pmaddubsw            m1, m4 ; subpel_filters
   2285    phaddw               m0, m1
   2286    paddw                m0, m5 ; pw34
   2287    psraw                m0, 6
   2288    packuswb             m0, m0
   2289    movd       [dstq+dsq*0], m0
   2290    psrlq                m0, 32
   2291    movd       [dstq+dsq*1], m0
   2292    lea                dstq, [dstq+dsq*2]
   2293    sub                  hd, 2
   2294    jg .h_w4_loop
   2295    RET
        ; Horizontal-only 8-tap path: split the 8 filter taps into two
        ; byte-pair coefficient vectors (m6 = taps 0-3 pairs, m7 = taps
        ; 4-7 pairs) for PUT_8TAP_H below.
   2296 .h:
   2297 %if ARCH_X86_32
   2298    test                ssd, 0xf00
   2299 %else
   2300    test                myd, 0xf00
   2301 %endif
   2302    jnz .hv
   2303    movifnidn           ssq, ssmp
   2304    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
   2305    cmp                  wd, 4
   2306    jle .h_w4
   2307    WIN64_SPILL_XMM      12
   2308 %if ARCH_X86_64
   2309    mova                m10, [base+subpel_h_shufA]
   2310    mova                m11, [base+subpel_h_shufB]
   2311    mova                 m9, [base+subpel_h_shufC]
   2312 %endif
   2313    shr                 mxd, 16
   2314    sub                srcq, 3
   2315    movq                 m7, [base_reg+mxq*8+subpel_filters-put_ssse3]
   2316    pshufd               m6, m7, q0000
   2317    pshufd               m7, m7, q1111
   2318    sub                  wd, 16
   2319    jge .h_w16
        ; Filter one row of 8 pixels with the 8-tap horizontal filter:
        ; three pshufb layouts (shufA/B/C), cross-summed so phaddw yields
        ; the full 8-tap result, rounded with m5 (pw_34) and shifted to
        ; pixel range.
   2320 %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
   2321 %if ARCH_X86_32
   2322    pshufb              %2, %1, [base+subpel_h_shufB]
   2323    pshufb              %3, %1, [base+subpel_h_shufC]
   2324    pshufb              %1,     [base+subpel_h_shufA]
   2325 %else
   2326    pshufb              %2, %1, m11; subpel_h_shufB
   2327    pshufb              %3, %1, m9 ; subpel_h_shufC
   2328    pshufb              %1, m10    ; subpel_h_shufA
   2329 %endif
   2330    pmaddubsw           %4, %2, m6 ; subpel +0 B0
   2331    pmaddubsw           %2, m7     ; subpel +4 B4
   2332    pmaddubsw           %3, m7     ; C4
   2333    pmaddubsw           %1, m6     ; A0
   2334    paddw               %3, %4     ; C4+B0
   2335    paddw               %1, %2     ; A0+B4
   2336    phaddw              %1, %3
   2337    paddw               %1, m5     ; pw34
   2338    psraw               %1, 6
   2339 %endmacro
; Horizontal filter, width 8: two rows per iteration, each filtered with
; PUT_8TAP_H, then packed to bytes and stored as a qword per row.
.h_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_H           m0, m2, m3, m4
    PUT_8TAP_H           m1, m2, m3, m4
    packuswb             m0, m1 ; row0 in low half, row1 in high half
%if ARCH_X86_32
    ; x86-32 has no spare reg for ds, so advance dstq via the stack slot
    movq             [dstq], m0
    add                dstq, dsm
    movhps           [dstq], m0
    add                dstq, dsm
%else
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
%endif
    sub                  hd, 2
    jg .h_w8
    RET
; Horizontal filter, width >= 16: one row per outer iteration; the inner
; loop walks the row 16 pixels at a time using a negative offset r6 that
; counts up to 0 (wq was biased by "sub wd, 16" at .h, so r6 = -(w-16)..0).
.h_w16:
    add                srcq, wq ; point src/dst at the last 16-wide chunk
    add                dstq, wq
    neg                  wq   ; wq = -(w-16): inner-loop start offset
.h_w16_loop_v:
    mov                  r6, wq
.h_w16_loop_h:
    movu                 m0, [srcq+r6+8*0]
    movu                 m1, [srcq+r6+8*1]
    PUT_8TAP_H           m0, m2, m3, m4
    PUT_8TAP_H           m1, m2, m3, m4
    packuswb             m0, m1
    mova          [dstq+r6], m0
    add                  r6, 16
    jle .h_w16_loop_h ; offset 0 processes the final (rightmost) chunk
    add                srcq, ssq
    add                dstq, dsmp
    dec                  hd
    jg .h_w16_loop_v
    RET
; Vertical-only 8-tap path: load the vertical filter, broadcast its four
; coefficient pairs into subpel0-3, rewind src by 3 rows (the taps above
; the output row), then dispatch on width. pw_512 + pmulhrsw later gives
; the rounding right-shift by 6: (x*512*2 + 0x8000) >> 16 == (x+32) >> 6.
.v:
%if ARCH_X86_32
    ; vertical selector is packed in ss on x86-32: low byte = alternate
    ; filter for small blocks, high word = regular filter index
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd ; h < 6: use the alternate filter set
    movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
    WIN64_SPILL_XMM      16
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd ; h < 6: use the alternate filter set
    movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3]
%endif
    punpcklwd            m0, m0 ; duplicate each coef pair for pshufd broadcast
    mova                 m7, [base+pw_512]
%if ARCH_X86_32
    ; not enough xmm regs to hold the 4 coef vectors; spill them to stack
%define            subpel0  [rsp+mmsize*0]
%define            subpel1  [rsp+mmsize*1]
%define            subpel2  [rsp+mmsize*2]
%define            subpel3  [rsp+mmsize*3]
%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
    ALLOC_STACK       -16*4
%assign regs_used 7
    pshufd               m1, m0, q0000
    mova            subpel0, m1
    pshufd               m1, m0, q1111
    mova            subpel1, m1
    pshufd               m1, m0, q2222
    mova            subpel2, m1
    pshufd               m1, m0, q3333
    mova            subpel3, m1
    ; reload ss from the caller's stack args (it was consumed as the
    ; filter selector above), rewind src by 3 rows, reload ds
    mov                 ssq, [rstk+stack_offset+gprsize*4]
    lea                 ssq, [ssq*3]
    sub                srcq, ssq
    mov                 ssq, [rstk+stack_offset+gprsize*4]
    mov                 dsq, [rstk+stack_offset+gprsize*2]
    cmp                  wd, 2
    jne .v_w4
%else
%define            subpel0  m8
%define            subpel1  m9
%define            subpel2  m10
%define            subpel3  m11
    lea                ss3q, [ssq*3]
    pshufd               m8, m0, q0000
    sub                srcq, ss3q ; start 3 rows above the first output row
    pshufd               m9, m0, q1111
    pshufd              m10, m0, q2222
    pshufd              m11, m0, q3333
    cmp                  wd, 4
    je .v_w4
    jg .v_w8
%endif
; Vertical filter, width 2: prime rows 0-6, then produce two output rows
; per iteration. Rows are interleaved as byte pairs (01 12, 23 34, 45 56)
; so each pmaddubsw applies one coefficient pair to two output rows at once.
.v_w2:
    movd                 m1, [srcq+ssq*0]
    movd                 m0, [srcq+ssq*1]
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movd                 m2, [srcq+ssq*0]
    movd                 m5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m3, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
%else
    movd                 m2, [srcq+ssq*2]
    add                srcq, ss3q
    movd                 m5, [srcq+ssq*0]
    movd                 m3, [srcq+ssq*1]
    movd                 m4, [srcq+ssq*2]
    add                srcq, ss3q
%endif
    punpcklwd            m1, m0           ; 0 1
    punpcklwd            m0, m2           ; 1 2
    punpcklbw            m1, m0           ; 01 12
    movd                 m0, [srcq+ssq*0]
    punpcklwd            m2, m5           ; 2 3
    punpcklwd            m5, m3           ; 3 4
    punpcklwd            m3, m4           ; 4 5
    punpcklwd            m4, m0           ; 5 6
    punpcklbw            m2, m5           ; 23 34
    punpcklbw            m3, m4           ; 45 56
.v_w2_loop:
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m5, m1, subpel0     ; a0 b0
    mova                 m1, m2              ; shift the row window down by 2
    pmaddubsw            m2, subpel1         ; a1 b1
    paddw                m5, m2
    mova                 m2, m3
    pmaddubsw            m3, subpel2         ; a2 b2
    paddw                m5, m3
    punpcklwd            m3, m0, m4          ; 6 7
    movd                 m0, [srcq+ssq*0]
    punpcklwd            m4, m0              ; 7 8
    punpcklbw            m3, m4              ; 67 78
    pmaddubsw            m4, m3, subpel3     ; a3 b3
    paddw                m5, m4
    pmulhrsw             m5, m7              ; (x+32)>>6 via pw_512
    packuswb             m5, m5
    movd                r6d, m5
    mov        [dstq+dsq*0], r6w ; 2-pixel rows stored as words via GPR
    shr                 r6d, 16
    mov        [dstq+dsq*1], r6w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
; Vertical filter, width 4 (also handles w > 4 on x86-32 by looping over
; 4-pixel columns: r6d packs the column counter in its high word and the
; reload value of h in its low word). Same two-rows-per-iteration scheme
; as .v_w2 but with dword-interleaved rows.
.v_w4:
%if ARCH_X86_32
    shl                  wd, 14 ; w*(1<<14): column count ends up in bits 16+
%if STACK_ALIGNMENT < 16
%define               dstm [rsp+mmsize*4+gprsize]
    mov                dstm, dstq
%endif
    lea                 r6d, [hq+wq-(1<<16)] ; high word = columns-1, low word = h
    mov                  r4, srcq            ; r4 = column base of src
.v_w4_loop0:
%endif
    movd                 m1, [srcq+ssq*0]
    movd                 m0, [srcq+ssq*1]
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movd                 m2, [srcq+ssq*0]
    movd                 m5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m3, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
%else
    movd                 m2, [srcq+ssq*2]
    add                srcq, ss3q
    movd                 m5, [srcq+ssq*0]
    movd                 m3, [srcq+ssq*1]
    movd                 m4, [srcq+ssq*2]
    add                srcq, ss3q
%endif
    punpckldq            m1, m0           ; 0 1
    punpckldq            m0, m2           ; 1 2
    punpcklbw            m1, m0           ; 01 12
    movd                 m0, [srcq+ssq*0]
    punpckldq            m2, m5           ; 2 3
    punpckldq            m5, m3           ; 3 4
    punpckldq            m3, m4           ; 4 5
    punpckldq            m4, m0           ; 5 6
    punpcklbw            m2, m5           ; 23 34
    punpcklbw            m3, m4           ; 45 56
.v_w4_loop:
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m5, m1, subpel0  ; a0 b0
    mova                 m1, m2           ; slide the row window down by 2
    pmaddubsw            m2, subpel1      ; a1 b1
    paddw                m5, m2
    mova                 m2, m3
    pmaddubsw            m3, subpel2      ; a2 b2
    paddw                m5, m3
    punpckldq            m3, m0, m4       ; 6 7 _ _
    movd                 m0, [srcq+ssq*0]
    punpckldq            m4, m0           ; 7 8 _ _
    punpcklbw            m3, m4           ; 67 78
    pmaddubsw            m4, m3, subpel3  ; a3 b3
    paddw                m5, m4
    pmulhrsw             m5, m7           ; (x+32)>>6 via pw_512
    packuswb             m5, m5
    movd       [dstq+dsq*0], m5
    psrlq                m5, 32
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    ; advance to the next 4-pixel column and restore h from r6's low word
    mov                dstq, dstm
    add                  r4, 4
    movzx                hd, r6w
    add                dstq, 4
    mov                srcq, r4
    mov                dstm, dstq
    sub                 r6d, 1<<16
    jg .v_w4_loop0
%endif
    RET
%if ARCH_X86_64
; Vertical filter, width >= 8 (x86-64 only; x86-32 covers these widths via
; the .v_w4 column loop). Processes one 8-pixel column strip at a time:
; r6d packs (columns-1) in bits 8+ and the reload value of h in its low
; byte. Two output rows per inner iteration, a/b accumulated in m14/m15.
.v_w8:
    shl                  wd, 5
    lea                 r6d, [hq+wq-256] ; bits 8+: strips-1, low byte: h
.v_w8_loop0:
    movq                 m1, [srcq+ssq*0]
    movq                 m2, [srcq+ssq*1]
    lea                  r4, [srcq+ss3q] ; r4 walks the rows within a strip
    movq                 m3, [srcq+ssq*2]
    movq                 m4, [r4  +ssq*0]
    mov                  r7, dstq        ; r7 walks dst within a strip
    movq                 m5, [r4  +ssq*1]
    movq                 m6, [r4  +ssq*2]
    add                  r4, ss3q
    movq                 m0, [r4  +ssq*0]
    punpcklbw            m1, m2 ; 01
    punpcklbw            m2, m3 ; 12
    punpcklbw            m3, m4 ; 23
    punpcklbw            m4, m5 ; 34
    punpcklbw            m5, m6 ; 45
    punpcklbw            m6, m0 ; 56
.v_w8_loop:
    movq                m13, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
    pmaddubsw           m14, m1, subpel0 ; a0
    mova                 m1, m3          ; slide the row-pair window down by 2
    pmaddubsw           m15, m2, subpel0 ; b0
    mova                 m2, m4
    pmaddubsw            m3, subpel1 ; a1
    mova                m12, m0
    pmaddubsw            m4, subpel1 ; b1
    movq                 m0, [r4+ssq*0]
    paddw               m14, m3
    paddw               m15, m4
    mova                 m3, m5
    pmaddubsw            m5, subpel2 ; a2
    mova                 m4, m6
    pmaddubsw            m6, subpel2 ; b2
    punpcklbw           m12, m13     ; 67
    punpcklbw           m13, m0      ; 78
    paddw               m14, m5
    mova                 m5, m12
    pmaddubsw           m12, subpel3 ; a3
    paddw               m15, m6
    mova                 m6, m13
    pmaddubsw           m13, subpel3 ; b3
    paddw               m14, m12
    paddw               m15, m13
    pmulhrsw            m14, m7 ; (x+32)>>6 via pw_512
    pmulhrsw            m15, m7
    packuswb            m14, m15
    movq         [r7+dsq*0], m14
    movhps       [r7+dsq*1], m14
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    ; next 8-pixel strip; restore h from r6's low byte
    add                srcq, 8
    add                dstq, 8
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
%undef subpel0
%undef subpel1
%undef subpel2
%undef subpel3
; Combined horizontal+vertical path, w <= 4 setup. Loads the 4-tap
; horizontal filter (offset +2 skips the two outer taps of the 8-tap
; entry) and the 8-tap vertical filter, sign-extends the vertical coefs
; to words (the V pass works on word intermediates via pmaddwd), and
; broadcasts everything into subpelv0-3 / m7.
.hv:
    RESET_STACK_STATE
    cmp                  wd, 4
    jg .hv_w8
%if ARCH_X86_32
    and                 mxd, 0x7f
%else
    movzx               mxd, mxb
%endif
    dec                srcq ; step back 1 for the left tap of the 4-tap H window
    movd                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
%if ARCH_X86_32
    ; vertical selector packed in ss (see .v); h < 6 picks the alternate set
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd
    movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov                 ssq, ssmp
    lea                  r6, [ssq*3]
    sub                srcq, r6 ; start 3 rows above the first output row
%define           base_reg  r6
    mov                  r6, r1; use as new base
%assign regs_used 2
    ALLOC_STACK  -mmsize*14
%assign regs_used 7
    mov                 dsq, [rstk+stack_offset+gprsize*2]
    ; vertical word-coef vectors live on the stack (not enough xmm regs)
%define           subpelv0  [rsp+mmsize*0]
%define           subpelv1  [rsp+mmsize*1]
%define           subpelv2  [rsp+mmsize*2]
%define           subpelv3  [rsp+mmsize*3]
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    pshufd               m6, m0, q0000
    mova           subpelv0, m6
    pshufd               m6, m0, q1111
    mova           subpelv1, m6
    pshufd               m6, m0, q2222
    mova           subpelv2, m6
    pshufd               m6, m0, q3333
    mova           subpelv3, m6
%else
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd ; h < 6: alternate filter set
    movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3]
    ALLOC_STACK   mmsize*14, 14
    lea                ss3q, [ssq*3]
    sub                srcq, ss3q ; start 3 rows above the first output row
%define           subpelv0  m10
%define           subpelv1  m11
%define           subpelv2  m12
%define           subpelv3  m13
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    mova                 m8, [base+pw_8192] ; H-pass scale (pmulhrsw)
    mova                 m9, [base+pd_512]  ; V-pass rounding before >>10
    pshufd              m10, m0, q0000
    pshufd              m11, m0, q1111
    pshufd              m12, m0, q2222
    pshufd              m13, m0, q3333
%endif
    pshufd               m7, m1, q0000 ; 4-tap H coefs broadcast
    cmp                  wd, 4
    je .hv_w4
; H+V filter, width 2. H pass: subpel_h_shuf4 + pmaddubsw/phaddw filters
; two rows per load, scaled by pmulhrsw(pw_8192) = >>2 with rounding so the
; word intermediates keep headroom. V pass: pmaddwd over interleaved word
; rows, rounded with pd_512 and shifted >>10 back to pixel scale.
.hv_w2:
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m2, [srcq+ssq*0]     ; 0
    movhps               m2, [srcq+ssq*1]     ; 0 _ 1
%if ARCH_X86_32
%define           w8192reg  [base+pw_8192]
%define            d512reg  [base+pd_512]
    lea                srcq, [srcq+ssq*2]
    movq                 m0, [srcq+ssq*0]     ; 2
    movhps               m0, [srcq+ssq*1]     ; 2 _ 3
    lea                srcq, [srcq+ssq*2]
%else
%define           w8192reg  m8
%define            d512reg  m9
    movq                 m0, [srcq+ssq*2]     ; 2
    add                srcq, ss3q
    movhps               m0, [srcq+ssq*0]     ; 2 _ 3
%endif
    pshufb               m2, m6 ; 0 ~ 1 ~
    pshufb               m0, m6 ; 2 ~ 3 ~
    pmaddubsw            m2, m7 ; subpel_filters
    pmaddubsw            m0, m7 ; subpel_filters
    phaddw               m2, m0 ; 0 1 2 3
    pmulhrsw             m2, w8192reg
%if ARCH_X86_32
    movq                 m3, [srcq+ssq*0]     ; 4
    movhps               m3, [srcq+ssq*1]     ; 4 _ 5
    lea                srcq, [srcq+ssq*2]
%else
    movq                 m3, [srcq+ssq*1]     ; 4
    movhps               m3, [srcq+ssq*2]     ; 4 _ 5
    add                srcq, ss3q
%endif
    movq                 m0, [srcq+ssq*0]     ; 6
    pshufb               m3, m6 ; 4 ~ 5 ~
    pshufb               m0, m6 ; 6 ~
    pmaddubsw            m3, m7 ; subpel_filters
    pmaddubsw            m0, m7 ; subpel_filters
    phaddw               m3, m0 ; 4 5 6 _
    pmulhrsw             m3, w8192reg
    ; build the interleaved vertical row pairs from the H results
    palignr              m4, m3, m2, 4; V        1 2 3 4
    punpcklwd            m1, m2, m4   ; V 01 12    0 1 1 2
    punpckhwd            m2, m4       ; V 23 34    2 3 3 4
    pshufd               m0, m3, q2121; V          5 6 5 6
    punpcklwd            m3, m0       ; V 45 56    4 5 5 6
.hv_w2_loop:
    movq                 m4, [srcq+ssq*1] ; V 7
    lea                srcq, [srcq+ssq*2] ; V
    movhps               m4, [srcq+ssq*0] ; V 7 8
    pshufb               m4, m6
    pmaddubsw            m4, m7
    pmaddwd              m5, m1, subpelv0; V a0 b0
    mova                 m1, m2       ; V slide row window down by 2
    pmaddwd              m2, subpelv1 ; V a1 b1
    paddd                m5, m2       ; V
    mova                 m2, m3       ; V
    pmaddwd              m3, subpelv2 ; a2 b2
    phaddw               m4, m4
    pmulhrsw             m4, w8192reg
    paddd                m5, m3       ; V
    palignr              m3, m4, m0, 12
    mova                 m0, m4
    punpcklwd            m3, m0           ; V 67 78
    pmaddwd              m4, m3, subpelv3 ; V a3 b3
    paddd                m5, d512reg
    paddd                m5, m4
    psrad                m5, 10 ; (acc+512)>>10 -> pixel scale
    packssdw             m5, m5
    packuswb             m5, m5
    movd                r4d, m5
    mov        [dstq+dsq*0], r4w
    shr                 r4d, 16
    mov        [dstq+dsq*1], r4w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
%undef w8192reg
%undef d512reg
; H+V filter, width 4. The 4 output pixels per row are split into a "low"
; half (H shuffle covering source columns 0-4) and a "high" half (columns
; 2-6, subpel_h_shuf4+16); each half runs its own vertical pipeline, with
; the inactive half's state spilled to stack slots 4-13 via SAVELINE_W4 /
; RESTORELINE_W4 (args: reg, state-index, half 0=low/1=high).
.hv_w4:
%define hv4_line_0_0 4
%define hv4_line_0_1 5
%define hv4_line_0_2 6
%define hv4_line_0_3 7
%define hv4_line_0_4 8
%define hv4_line_0_5 9
%define hv4_line_1_0 10
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
; spill a vertical-pipeline register to its per-half stack slot
%macro SAVELINE_W4 3
    mova     [rsp+mmsize*hv4_line_%3_%2], %1
%endmacro
; reload a vertical-pipeline register from its per-half stack slot
%macro RESTORELINE_W4 3
    mova     %1, [rsp+mmsize*hv4_line_%3_%2]
%endmacro
%if ARCH_X86_32
%define           w8192reg  [base+pw_8192]
%define            d512reg  [base+pd_512]
%else
%define           w8192reg  m8
%define            d512reg  m9
%endif
    ; lower shuffle 0 1 2 3 4
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m5, [srcq+ssq*0]   ; 0 _ _ _
    movhps               m5, [srcq+ssq*1]   ; 0 _ 1 _
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movq                 m4, [srcq+ssq*0]   ; 2 _ _ _
    movhps               m4, [srcq+ssq*1]   ; 2 _ 3 _
    lea                srcq, [srcq+ssq*2]
%else
    movq                 m4, [srcq+ssq*2]   ; 2 _ _ _
    movhps               m4, [srcq+ss3q ]   ; 2 _ 3 _
    lea                srcq, [srcq+ssq*4]
%endif
    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw            m2, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m2, m0 ;H 0 1 2 3
    pmulhrsw             m2, w8192reg ;H pw_8192
    SAVELINE_W4          m2, 2, 0
    ; upper shuffle 2 3 4 5 6
    mova                 m6, [base+subpel_h_shuf4+16]
    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw            m2, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m2, m0 ;H 0 1 2 3
    pmulhrsw             m2, w8192reg ;H pw_8192
    ;
    ; lower shuffle
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m5, [srcq+ssq*0]   ; 4 _ _ _
    movhps               m5, [srcq+ssq*1]   ; 4 _ 5 _
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
    movq                 m4, [srcq+ssq*0]   ; 6 _ _ _
    add                srcq, ssq
%else
    movq                 m4, [srcq+ssq*2]   ; 6 _ _ _
    add                srcq, ss3q
%endif
    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw            m3, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m3, m0 ;H 4 5 6 7
    pmulhrsw             m3, w8192reg ;H pw_8192
    SAVELINE_W4          m3, 3, 0
    ; upper shuffle
    mova                 m6, [base+subpel_h_shuf4+16]
    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw            m3, m7 ;H subpel_filters
    pmaddubsw            m0, m7 ;H subpel_filters
    phaddw               m3, m0 ;H 4 5 6 7
    pmulhrsw             m3, w8192reg ;H pw_8192
    ;process high
    palignr              m4, m3, m2, 4;V 1 2 3 4
    punpcklwd            m1, m2, m4  ; V 01 12
    punpckhwd            m2, m4      ; V 23 34
    pshufd               m0, m3, q2121;V 5 6 5 6
    punpcklwd            m3, m0      ; V 45 56
    SAVELINE_W4          m0, 0, 1
    SAVELINE_W4          m1, 1, 1
    SAVELINE_W4          m2, 2, 1
    SAVELINE_W4          m3, 3, 1
    ;process low
    RESTORELINE_W4       m2, 2, 0
    RESTORELINE_W4       m3, 3, 0
    palignr              m4, m3, m2, 4;V 1 2 3 4
    punpcklwd            m1, m2, m4  ; V 01 12
    punpckhwd            m2, m4      ; V 23 34
    pshufd               m0, m3, q2121;V 5 6 5 6
    punpcklwd            m3, m0      ; V 45 56
.hv_w4_loop:
    ;process low
    pmaddwd              m5, m1, subpelv0 ; V a0 b0
    mova                 m1, m2
    pmaddwd              m2, subpelv1; V a1 b1
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, subpelv2; V a2 b2
    paddd                m5, m3
    mova                 m6, [base+subpel_h_shuf4]
    movq                 m4, [srcq+ssq*0] ; 7
    movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw            m4, m7 ;H subpel_filters
    phaddw               m4, m4 ;H                7 8 7 8
    pmulhrsw             m4, w8192reg ;H pw_8192
    palignr              m3, m4, m0, 12         ; 6 7 8 7
    mova                 m0, m4
    punpcklwd            m3, m4      ; 67 78
    pmaddwd              m4, m3, subpelv3; a3 b3
    paddd                m5, d512reg ; pd_512
    paddd                m5, m4
    psrad                m5, 10
    ; stash low-half state + result, switch to the high half
    SAVELINE_W4          m0, 0, 0
    SAVELINE_W4          m1, 1, 0
    SAVELINE_W4          m2, 2, 0
    SAVELINE_W4          m3, 3, 0
    SAVELINE_W4          m5, 5, 0
    ;process high
    RESTORELINE_W4       m0, 0, 1
    RESTORELINE_W4       m1, 1, 1
    RESTORELINE_W4       m2, 2, 1
    RESTORELINE_W4       m3, 3, 1
    pmaddwd              m5, m1, subpelv0; V a0 b0
    mova                 m1, m2
    pmaddwd              m2, subpelv1; V a1 b1
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, subpelv2; V a2 b2
    paddd                m5, m3
    mova                 m6, [base+subpel_h_shuf4+16]
    movq                 m4, [srcq+ssq*0] ; 7
    movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
    lea                srcq, [srcq+ssq*2]
    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw            m4, m7 ;H subpel_filters
    phaddw               m4, m4 ;H                7 8 7 8
    pmulhrsw             m4, w8192reg ;H pw_8192
    palignr              m3, m4, m0, 12         ; 6 7 8 7
    mova                 m0, m4
    punpcklwd            m3, m4      ; 67 78
    pmaddwd              m4, m3, subpelv3; a3 b3
    paddd                m5, d512reg ; pd_512
    paddd                m5, m4
    psrad                m4, m5, 10
    RESTORELINE_W4       m5, 5, 0
    ; merge low+high halves and re-order into pixel order before storing
    packssdw             m5, m4 ; d -> w
    packuswb             m5, m5 ; w -> b
    pshuflw              m5, m5, q3120
    movd       [dstq+dsq*0], m5
    psrlq                m5, 32
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    ; swap halves back for the next iteration (flags from sub survive)
    SAVELINE_W4          m0, 0, 1
    SAVELINE_W4          m1, 1, 1
    SAVELINE_W4          m2, 2, 1
    SAVELINE_W4          m3, 3, 1
    RESTORELINE_W4       m0, 0, 0
    RESTORELINE_W4       m1, 1, 0
    RESTORELINE_W4       m2, 2, 0
    RESTORELINE_W4       m3, 3, 0
    jg .hv_w4_loop
    RET
%undef subpelv0
%undef subpelv1
%undef subpelv2
%undef subpelv3
; H+V filter, width >= 8: full 8-tap in both directions, processed in
; 8-pixel (4 per register half after phaddw) column strips. r6d packs
; (strips-1) in bits 16+ and the reload value of h in its low word.
; Vertical row pairs that don't fit in registers are spilled to stack
; slots 0-4 via SAVELINE_W8 / RESTORELINE_W8.
.hv_w8:
    RESET_STACK_STATE
%define hv8_line_1 0
%define hv8_line_2 1
%define hv8_line_3 2
%define hv8_line_4 3
%define hv8_line_6 4
; spill/reload an interleaved row pair to/from its stack slot
%macro SAVELINE_W8 2
    mova     [rsp+hv8_line_%1*mmsize], %2
%endmacro
%macro RESTORELINE_W8 2
    mova     %2, [rsp+hv8_line_%1*mmsize]
%endmacro
    shr                 mxd, 16 ; high word of mx = horizontal filter index
    sub                srcq, 3  ; 3 left taps of the 8-tap H window
%if ARCH_X86_32
%define           base_reg  r1
    ; all filter vectors + V accumulators live on the stack on x86-32
%define           subpelh0  [rsp+mmsize*5]
%define           subpelh1  [rsp+mmsize*6]
%define           subpelv0  [rsp+mmsize*7]
%define           subpelv1  [rsp+mmsize*8]
%define           subpelv2  [rsp+mmsize*9]
%define           subpelv3  [rsp+mmsize*10]
%define             accuv0  [rsp+mmsize*11]
%define             accuv1  [rsp+mmsize*12]
    movq                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx               mxd, ssb
    shr                 ssd, 16
    cmp                  hd, 6
    cmovs               ssd, mxd ; h < 6: alternate filter set
    movq                 m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov                 ssq, ssmp
    ALLOC_STACK  -mmsize*13
%if STACK_ALIGNMENT < 16
%define               dstm  [rsp+mmsize*13+gprsize*1]
%define                dsm  [rsp+mmsize*13+gprsize*2]
    mov                  r6, [rstk+stack_offset+gprsize*2]
    mov                 dsm, r6
%endif
    pshufd               m0, m1, q0000 ; H coefs 0-3
    pshufd               m1, m1, q1111 ; H coefs 4-7
    punpcklbw            m5, m5
    psraw                m5, 8 ; sign-extend
    pshufd               m2, m5, q0000
    pshufd               m3, m5, q1111
    pshufd               m4, m5, q2222
    pshufd               m5, m5, q3333
    mova           subpelh0, m0
    mova           subpelh1, m1
    mova           subpelv0, m2
    mova           subpelv1, m3
    mova           subpelv2, m4
    mova           subpelv3, m5
    lea                  r6, [ssq*3]
    mov                dstm, dstq
    sub                srcq, r6 ; start 3 rows above the first output row
%else
    ALLOC_STACK        16*5, 16
%define           subpelh0  m10
%define           subpelh1  m11
%define           subpelv0  m12
%define           subpelv1  m13
%define           subpelv2  m14
%define           subpelv3  m15
%define             accuv0  m8
%define             accuv1  m9
    movq                 m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd ; h < 6: alternate filter set
    movq                 m1, [base_reg+myq*8+subpel_filters-put_ssse3]
    pshufd         subpelh0, m0, q0000
    pshufd         subpelh1, m0, q1111
    punpcklbw            m1, m1
    psraw                m1, 8 ; sign-extend
    pshufd         subpelv0, m1, q0000
    pshufd         subpelv1, m1, q1111
    pshufd         subpelv2, m1, q2222
    pshufd         subpelv3, m1, q3333
    lea                ss3q, [ssq*3]
    mov                  r7, dstq
    sub                srcq, ss3q ; start 3 rows above the first output row
%endif
    shl                  wd, 14
    lea                 r6d, [hq+wq-(1<<16)] ; high word: strips-1, low word: h
    mov                  r4, srcq
.hv_w8_loop0:
    ; prime the vertical pipeline: H-filter rows 0-6 of this strip
    movu                 m4, [srcq+ssq*0] ; 0 = _ _
    movu                 m5, [srcq+ssq*1] ; 1 = _ _
%if ARCH_X86_32
    lea                srcq, [srcq+ssq*2]
%endif
; 8-tap H filter of one row: three overlapping shuffles feed pmaddubsw so
; phaddw can collapse the partial sums into one word per output pixel.
; Needs subpelh0/subpelh1 set up; shuf args used on x86-64 only.
%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
%if ARCH_X86_32
    pshufb               %3, %1, [base+subpel_h_shufB]
    pshufb               %4, %1, [base+subpel_h_shufC]
    pshufb               %1,     [base+subpel_h_shufA]
%else
    pshufb               %3, %1, %6  ; subpel_h_shufB
    pshufb               %4, %1, %7  ; subpel_h_shufC
    pshufb               %1, %5      ; subpel_h_shufA
%endif
    pmaddubsw            %2, %3, subpelh0 ; subpel +0 C0
    pmaddubsw            %4, subpelh1; subpel +4 B4
    pmaddubsw            %3, subpelh1; C4
    pmaddubsw            %1, subpelh0; A0
    paddw                %2, %4      ; C0+B4
    paddw                %1, %3      ; A0+C4
    phaddw               %1, %2
%endmacro
%if ARCH_X86_64
    mova                 m7, [base+subpel_h_shufA]
    mova                 m8, [base+subpel_h_shufB]
    mova                 m9, [base+subpel_h_shufC]
%endif
    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
%if ARCH_X86_32
    movu                 m6, [srcq+ssq*0] ; 2 = _ _
    movu                 m0, [srcq+ssq*1] ; 3 = _ _
    lea                srcq, [srcq+ssq*2]
%else
    movu                 m6, [srcq+ssq*2] ; 2 = _ _
    add                srcq, ss3q
    movu                 m0, [srcq+ssq*0] ; 3 = _ _
%endif
    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
    HV_H_W8              m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
    mova                 m7, [base+pw_8192]
    pmulhrsw             m4, m7 ; H pw_8192
    pmulhrsw             m5, m7 ; H pw_8192
    pmulhrsw             m6, m7 ; H pw_8192
    pmulhrsw             m0, m7 ; H pw_8192
    punpcklwd            m1, m4, m5  ; 0 1 ~
    punpcklwd            m2, m5, m6  ; 1 2 ~
    punpcklwd            m3, m6, m0  ; 2 3 ~
    SAVELINE_W8           1, m1
    SAVELINE_W8           2, m2
    SAVELINE_W8           3, m3
    mova                 m7, [base+subpel_h_shufA] ; m7 reused as shufA below
%if ARCH_X86_32
    movu                 m4, [srcq+ssq*0]       ; 4 = _ _
    movu                 m5, [srcq+ssq*1]       ; 5 = _ _
    lea                srcq, [srcq+ssq*2]
%else
    movu                 m4, [srcq+ssq*1]       ; 4 = _ _
    movu                 m5, [srcq+ssq*2]       ; 5 = _ _
    add                srcq, ss3q
%endif
    movu                 m6, [srcq+ssq*0]       ; 6 = _ _
    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
    mova                 m7, [base+pw_8192]
    pmulhrsw             m1, m4, m7 ; H pw_8192 4 ~
    pmulhrsw             m2, m5, m7 ; H pw_8192 5 ~
    pmulhrsw             m3, m6, m7 ; H pw_8192 6 ~
    punpcklwd            m4, m0, m1  ; 3 4 ~
    punpcklwd            m5, m1, m2  ; 4 5 ~
    punpcklwd            m6, m2, m3  ; 5 6 ~
    SAVELINE_W8           6, m3
    RESTORELINE_W8        1, m1
    RESTORELINE_W8        2, m2
    RESTORELINE_W8        3, m3
   3117 .hv_w8_loop:
   3118    ; m8 accu for V a
   3119    ; m9 accu for V b
   3120    SAVELINE_W8           1, m3
   3121    SAVELINE_W8           2, m4
   3122    SAVELINE_W8           3, m5
   3123    SAVELINE_W8           4, m6
   3124 %if ARCH_X86_32
   3125    pmaddwd              m0, m1, subpelv0 ; a0
   3126    pmaddwd              m7, m2, subpelv0 ; b0
   3127    pmaddwd              m3, subpelv1     ; a1
   3128    pmaddwd              m4, subpelv1     ; b1
   3129    paddd                m0, m3
   3130    paddd                m7, m4
   3131    pmaddwd              m5, subpelv2     ; a2
   3132    pmaddwd              m6, subpelv2     ; b2
   3133    paddd                m0, m5
   3134    paddd                m7, m6
   3135    mova                 m5, [base+pd_512]
   3136    paddd                m0, m5 ;   pd_512
   3137    paddd                m7, m5 ;   pd_512
   3138    mova             accuv0, m0
   3139    mova             accuv1, m7
   3140 %else
   3141    pmaddwd              m8, m1, subpelv0 ; a0
   3142    pmaddwd              m9, m2, subpelv0 ; b0
   3143    pmaddwd              m3, subpelv1     ; a1
   3144    pmaddwd              m4, subpelv1     ; b1
   3145    paddd                m8, m3
   3146    paddd                m9, m4
   3147    pmaddwd              m5, subpelv2     ; a2
   3148    pmaddwd              m6, subpelv2     ; b2
   3149    paddd                m8, m5
   3150    paddd                m9, m6
   3151    mova                 m7, [base+pd_512]
   3152    paddd                m8, m7 ;   pd_512
   3153    paddd                m9, m7 ;   pd_512
   3154    mova                 m7, [base+subpel_h_shufB]
   3155    mova                 m6, [base+subpel_h_shufC]
   3156    mova                 m5, [base+subpel_h_shufA]
   3157 %endif
   3158    movu                 m0, [srcq+ssq*1] ; 7
   3159    movu                 m4, [srcq+ssq*2] ; 8
   3160    lea                srcq, [srcq+ssq*2]
   3161    HV_H_W8              m0, m1, m2, m3, m5, m7, m6
   3162    HV_H_W8              m4, m1, m2, m3, m5, m7, m6
   3163    mova                 m5, [base+pw_8192]
   3164    pmulhrsw             m0, m5 ; H pw_8192
   3165    pmulhrsw             m4, m5 ; H pw_8192
   3166    RESTORELINE_W8        6, m6
   3167    punpcklwd            m5, m6, m0  ; 6 7  ~
   3168    punpcklwd            m6, m0, m4  ; 7 8 ~
   3169    pmaddwd              m1, m5, subpelv3 ; a3
   3170    paddd                m2, m1, accuv0
   3171    pmaddwd              m1, m6, subpelv3 ; b3
   3172    paddd                m1, m1, accuv1 ; H + V
   3173    psrad                m2, 10
   3174    psrad                m1, 10
   3175    packssdw             m2, m1  ; d -> w
   3176    packuswb             m2, m1 ; w -> b
   3177    movd       [dstq+dsq*0], m2
   3178    psrlq                m2, 32
   3179 %if ARCH_X86_32
   3180    add                dstq, dsm
   3181    movd       [dstq+dsq*0], m2
   3182    add                dstq, dsm
   3183 %else
   3184    movd       [dstq+dsq*1], m2
   3185    lea                dstq, [dstq+dsq*2]
   3186 %endif
   3187    sub                  hd, 2
   3188    jle .hv_w8_outer
   3189    SAVELINE_W8           6, m4
   3190    RESTORELINE_W8        1, m1
   3191    RESTORELINE_W8        2, m2
   3192    RESTORELINE_W8        3, m3
   3193    RESTORELINE_W8        4, m4
   3194    jmp .hv_w8_loop
   3195 .hv_w8_outer:
   3196 %if ARCH_X86_32
   3197    mov                dstq, dstm
   3198    add                  r4, 4
   3199    movzx                hd, r6w
   3200    add                dstq, 4
   3201    mov                srcq, r4
   3202    mov                dstm, dstq
   3203 %else
   3204    add                  r4, 4
   3205    add                  r7, 4
   3206    movzx                hd, r6b
   3207    mov                srcq, r4
   3208    mov                dstq, r7
   3209 %endif
   3210    sub                 r6d, 1<<16
   3211    jg .hv_w8_loop0
   3212    RET
   3213 
; Temporary-register assignment for the prep entry code: t0d/t1d carry the
; horizontal/vertical filter-type selectors that the FN-generated entry stubs
; provide and that get folded into mx/my below (see the "8tap_h, mx, 4tap_h"
; adds in prep_6tap_8bpc).
%if ARCH_X86_32
DECLARE_REG_TMP 1, 2
%elif WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

; On x86-32, RODATA is addressed relative to base_reg (loaded with the address
; of prep_ssse3 via LEA), so "base" rebases symbols against it.  On x86-64 the
; same LEA still loads base_reg for table dispatch, but plain symbol
; addressing works, so "base" is 0.
%if ARCH_X86_32
%define base_reg r2
%define base base_reg-prep_ssse3
%else
%define base_reg r7
%define base 0
%endif
   3229 
; Generate the per-filter-combination entry points that feed prep_6tap_8bpc.
; Variants with a 4th argument set up t0d/t1d and fall through to the named
; base implementation; the last one (regular/regular) falls into the
; cglobal prep_6tap_8bpc body directly below.
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_8bpc
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_8bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_8bpc
PREP_8TAP_FN regular,        REGULAR, REGULAR
   3235 
; void prep_6tap_8bpc(int16_t *tmp, const uint8_t *src, ptrdiff_t ss,
;                     int w, int h, int mx, int my)
; Intermediate ("prep") 6-tap subpel filter, 8 bpc.  Dispatches on whether a
; fractional offset is present horizontally (mx) and/or vertically (my).
cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, mx, my, ns
    ; Fold the filter-type selector (t0d/t1d, set by the FN entry stubs) into
    ; the subpel fraction: yields a combined index into subpel_filters.
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    mov                  wd, wm
    movifnidn          srcd, srcm
    movifnidn            hd, hm
    LEA            base_reg, prep_ssse3
    test                mxd, 0xf00  ; any horizontal subpel fraction?
    jnz .h
    test                myd, 0xf00  ; any vertical subpel fraction?
    jnz .v
.prep:
    ; No filtering needed: jump through the width-indexed copy table.
    tzcnt                wd, wd     ; wd = log2(w)
    movzx                wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
    pxor                 m4, m4
    add                  wq, base_reg
    movifnidn           ssq, ssmp
    lea                  r6, [ssq*3]
%if WIN64
    pop                  r8         ; undo the prologue pushes before the
    pop                  r7         ; tail-jump into the copy code
%endif
    jmp                  wq
   3261 .h:
   3262    test                myd, 0xf00
   3263    jnz .hv
   3264    test                myd, 0xf00
   3265    jnz .hv
   3266 %if ARCH_X86_32
   3267 %define ssq r6
   3268    mov                 ssq, ssmp
   3269 %endif
   3270    cmp                  wd, 4
   3271    jle mangle(private_prefix %+ _prep_8tap_8bpc %+ SUFFIX).h_w4
   3272    WIN64_SPILL_XMM      11
   3273    mova                 m5, [base+pw_8192]
   3274 %if ARCH_X86_64
   3275    mova                 m8, [base+subpel_h_shufD]
   3276    mova                 m9, [base+subpel_h_shufE]
   3277    mova                m10, [base+subpel_h_shufF]
   3278 %endif
   3279    shr                 mxd, 16
   3280    sub                srcq, 2
   3281    movq                 m7, [base_reg-prep_ssse3+subpel_filters+1+mxq*8]
   3282    punpcklwd            m7, m7
   3283    pshufd               m4, m7, q0000
   3284    pshufd               m6, m7, q1111
   3285    pshufd               m7, m7, q2222
   3286    sub                  wd, 16
   3287    jge .h_w16
   3288 %macro PREP_6TAP_H 3 ; dst/src, tmp[1-2]
   3289 %if ARCH_X86_32
   3290    pshufb               %2, %1, [base+subpel_h_shufD]
   3291    pshufb               %3, %1, [base+subpel_h_shufE]
   3292    pshufb               %1, [base+subpel_h_shufF]
   3293 %else
   3294    pshufb               %2, %1, m8
   3295    pshufb               %3, %1, m9
   3296    pshufb               %1, m10
   3297 %endif
   3298    pmaddubsw            %2, m4
   3299    pmaddubsw            %3, m6
   3300    pmaddubsw            %1, m7
   3301    paddw                %2, %3
   3302    paddw                %1, %2
   3303    pmulhrsw             %1, m5
   3304 %endmacro
   3305 .h_w8:
   3306    movu                 m0, [srcq+ssq*0]
   3307    movu                 m1, [srcq+ssq*1]
   3308    lea                srcq, [srcq+ssq*2]
   3309    PREP_6TAP_H          m0, m2, m3
   3310    PREP_6TAP_H          m1, m2, m3
   3311    mova        [tmpq+16*0], m0
   3312    mova        [tmpq+16*1], m1
   3313    add                tmpq, 32
   3314    sub                  hd, 2
   3315    jg .h_w8
   3316    RET
   3317 .h_w16:
   3318    add                srcq, wq
   3319    neg                  wq
   3320 .h_w16_loop_v:
   3321    mov                  r5, wq
   3322 .h_w16_loop_h:
   3323    movu                 m0, [srcq+r5+8*0]
   3324    movu                 m1, [srcq+r5+8*1]
   3325    PREP_6TAP_H          m0, m2, m3
   3326    PREP_6TAP_H          m1, m2, m3
   3327    mova        [tmpq+16*0], m0
   3328    mova        [tmpq+16*1], m1
   3329    add                tmpq, 32
   3330    add                  r5, 16
   3331    jle .h_w16_loop_h
   3332    add                srcq, ssq
   3333    dec                  hd
   3334    jg .h_w16_loop_v
   3335    RET
.v:
    ; Vertical-only 6-tap path.
%if ARCH_X86_32
    mov                 mxd, myd
    and                 mxd, 0x7f
%else
    WIN64_SPILL_XMM       9, 12
    movzx               mxd, myb
%endif
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd    ; h < 6: use the 4-tap variant index
    ; +1 skips tap 0 of the stored 8-tap kernel; taps 1-6 form the 6-tap one
    movq                 m7, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
    punpcklwd            m7, m7
    pshufd               m5, m7, q0000 ; coeff pair 0/1
    pshufd               m6, m7, q1111 ; coeff pair 2/3
    pshufd               m7, m7, q2222 ; coeff pair 4/5
%if ARCH_X86_32
    %define              m8  [base+pw_8192]
    mov                 ssq, ssm
    sub                srcq, ssq   ; start 2 rows above the output row
    sub                srcq, ssq
%else
    mova                 m8, [base+pw_8192]
    mov                 nsq, ssq   ; nsq = -stride, for the rows above srcq
    neg                 nsq
    cmp                  wd, 4
    jg .v_w8
%endif
.v_w4:
%if ARCH_X86_32
    ; Pack column count (high 16 bits) and row count (low bits) into r5d
    ; for the outer 4-pixel-column loop.
    lea                 r5d, [wq-4]
    shl                 r5d, 14
    add                 r5d, hd
    mov                srcm, srcq
.v_w4_loop0:
    movd                 m1, [srcq+ssq*0]
    movd                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
%else
    movd                 m1, [srcq+nsq*2]
    movd                 m3, [srcq+nsq*1]
%endif
    movd                 m2, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movd                 m0, [srcq+ssq*0]
    punpckldq            m1, m3     ; 0 1
    punpckldq            m3, m2     ; 1 2
    punpckldq            m2, m4     ; 2 3
    punpckldq            m4, m0     ; 3 4
    punpcklbw            m1, m3     ; 01 12
    punpcklbw            m2, m4     ; 23 34
.v_w4_loop: ; two output rows per iteration; row labels are for the first pass
    movd                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m4, m1, m5 ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, m6     ; a1 b1
    paddw                m4, m2
    punpckldq            m2, m0, m3 ; 4 5
    movd                 m0, [srcq+ssq*0]
    punpckldq            m3, m0     ; 5 6
    punpcklbw            m2, m3     ; 45 56 (advances by 2 rows each pass)
    pmaddubsw            m3, m2, m7 ; a2 b2
    paddw                m4, m3
    pmulhrsw             m4, m8
%if ARCH_X86_32
    movq        [tmpq+wq*0], m4
    movhps      [tmpq+wq*2], m4
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .v_w4_loop
    ; advance to the next 4-pixel column
    mov                srcq, srcm
    mov                tmpq, tmpm
    movzx                hd, r5w
    add                srcq, 4
    add                tmpq, 8
    mov                srcm, srcq
    mov                tmpm, tmpq
    sub                 r5d, 1<<16
    jg .v_w4_loop0
%else
    mova             [tmpq], m4
    add                tmpq, 16
    sub                  hd, 2
    jg .v_w4_loop
%endif
    RET
%if ARCH_X86_64
.v_w8: ; w >= 8, processed in 8-pixel columns
    WIN64_PUSH_XMM       12
    ; r6d = (w-8)*32 + h: column counter in bits 8+, row count in the low byte
    lea                 r6d, [wq*4-32]
    lea                 r6d, [r6*8+hq]
.v_w8_loop0:
    movq                 m1, [srcq+nsq*2]
    movq                 m2, [srcq+nsq*1]
    lea                  r5, [srcq+ssq*2]
    movq                 m3, [srcq+ssq*0]
    movq                 m4, [srcq+ssq*1]
    mov                  r8, tmpq
    movq                 m0, [r5  +ssq*0]
    punpcklbw            m1, m2     ; 01
    punpcklbw            m2, m3     ; 12
    punpcklbw            m3, m4     ; 23
    punpcklbw            m4, m0     ; 34
.v_w8_loop: ; two output rows per iteration; row labels are for the first pass
    pmaddubsw           m10, m1, m5 ; a0
    mova                 m1, m3
    pmaddubsw           m11, m2, m5 ; b0
    mova                 m2, m4
    pmaddubsw            m3, m6     ; a1
    pmaddubsw            m4, m6     ; b1
    paddw               m10, m3
    paddw               m11, m4
    movq                 m4, [r5+ssq*1]
    lea                  r5, [r5+ssq*2]
    punpcklbw            m3, m0, m4 ; 45 (first iter; +2 rows each pass)
    movq                 m0, [r5+ssq*0]
    punpcklbw            m4, m0     ; 56
    pmaddubsw            m9, m3, m7 ; a2
    paddw               m10, m9
    pmaddubsw            m9, m4, m7 ; b2
    paddw               m11, m9
    pmulhrsw            m10, m8
    pmulhrsw            m11, m8
    mova          [r8+wq*0], m10
    mova          [r8+wq*2], m11
    lea                  r8, [r8+wq*4]
    sub                  hd, 2
    jg .v_w8_loop
    ; next 8-pixel column
    add                srcq, 8
    add                tmpq, 16
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
   3473 .hv:
   3474    RESET_STACK_STATE
   3475    cmp                  wd, 4
   3476    jg .hv_w8
   3477 %if ARCH_X86_32
   3478    and                 mxd, 0x7f
   3479 %else
   3480    movzx               mxd, mxb
   3481 %endif
   3482    dec                srcq
   3483    movd                 m1, [base_reg-prep_ssse3+subpel_filters+2+mxq*8]
   3484 %if ARCH_X86_32
   3485    mov                 mxd, myd
   3486    and                 mxd, 0x7f
   3487 %else
   3488    movzx               mxd, myb
   3489 %endif
   3490    shr                 myd, 16
   3491    cmp                  hd, 6
   3492    cmovs               myd, mxd
   3493    movq                 m0, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
   3494 %if ARCH_X86_32
   3495    mov                 ssq, ssmp
   3496 %define regs_used 6
   3497    ALLOC_STACK   -mmsize*4
   3498 %define regs_used 7
   3499    %define              m8  [rsp+mmsize*0]
   3500    %define              m9  [rsp+mmsize*1]
   3501    %define             m10  [rsp+mmsize*2]
   3502    punpcklbw            m0, m0
   3503    sub                srcq, ssq
   3504    psraw                m0, 8 ; sign-extend
   3505    sub                srcq, ssq
   3506    pshufd               m2, m0, q0000
   3507    mova                 m8, m2
   3508    pshufd               m2, m0, q1111
   3509    mova                 m9, m2
   3510    pshufd               m2, m0, q2222
   3511    mova                m10, m2
   3512    movq                 m3, [srcq+ssq*0]
   3513    movq                 m4, [srcq+ssq*1]
   3514    lea                srcq, [srcq+ssq*2]
   3515    %define             m11  [base+pw_8192]
   3516    %define             m12  [base+subpel_h_shufA]
   3517    %define             m13  [rsp+mmsize*3]
   3518    %define             m14  [base+pd_32]
   3519    pshufd               m1, m1, q0000
   3520    mova                m13, m1
   3521 %else
   3522    WIN64_SPILL_XMM      15
   3523    mov                 nsq, ssq
   3524    punpcklbw            m0, m0
   3525    neg                 nsq
   3526    psraw                m0, 8 ; sign-extend
   3527    pshufd               m8, m0, q0000
   3528    pshufd               m9, m0, q1111
   3529    pshufd              m10, m0, q2222
   3530    movq                 m3, [srcq+nsq*2]
   3531    movq                 m4, [srcq+nsq*1]
   3532    pshufd              m13, m1, q0000
   3533    mova                m12, [base+subpel_h_shufA]
   3534    mova                m11, [base+pw_8192]
   3535    mova                m14, [base+pd_32]
   3536 %endif
   3537    movq                 m0, [srcq+ssq*0]
   3538    movq                 m1, [srcq+ssq*1]
   3539    lea                srcq, [srcq+ssq*2]
   3540    movq                 m2, [srcq+ssq*0]
   3541 %if ARCH_X86_32
   3542    mova                 m5, m12
   3543    mova                 m6, m13
   3544    REPX {pshufb    x, m5 }, m3, m4, m0, m1, m2
   3545    mova                 m5, m11
   3546    REPX {pmaddubsw x, m6 }, m3, m4, m0, m1, m2
   3547 %else
   3548    REPX {pshufb    x, m12}, m3, m4, m0, m1, m2
   3549    REPX {pmaddubsw x, m13}, m3, m4, m0, m1, m2
   3550 %endif
   3551    phaddw               m3, m0      ; 0 2
   3552    phaddw               m4, m1      ; 1 3
   3553    phaddw               m0, m2      ; 2 4
   3554 %if ARCH_X86_32
   3555    REPX  {pmulhrsw x, m5 }, m3, m4, m0
   3556 %else
   3557    REPX  {pmulhrsw x, m11}, m3, m4, m0
   3558 %endif
   3559    punpcklwd            m1, m3, m4  ; 01
   3560    punpckhwd            m3, m4      ; 23
   3561    punpcklwd            m2, m4, m0  ; 12
   3562    punpckhwd            m4, m0      ; 34
   3563 .hv_w4_loop:
   3564    movq                 m7, [srcq+ssq*1]
   3565    lea                srcq, [srcq+ssq*2]
   3566    movq                 m6, [srcq+ssq*0]
   3567    pshufb               m7, m12
   3568    pshufb               m6, m12
   3569    pmaddubsw            m7, m13
   3570    pmaddubsw            m6, m13
   3571    pmaddwd              m5, m8, m1  ; a0
   3572    mova                 m1, m3
   3573    phaddw               m7, m6      ; 5 6
   3574    pmaddwd              m6, m8, m2  ; b0
   3575    mova                 m2, m4
   3576    pmaddwd              m3, m9      ; a1
   3577    pmaddwd              m4, m9      ; b1
   3578    pmulhrsw             m7, m11
   3579    paddd                m5, m14
   3580    paddd                m6, m14
   3581    paddd                m5, m3
   3582    paddd                m6, m4
   3583    shufpd               m4, m0, m7, 0x01 ; 4 5
   3584    mova                 m0, m7
   3585    punpcklwd            m3, m4, m7  ; 45
   3586    punpckhwd            m4, m7      ; 56
   3587    pmaddwd              m7, m10, m3 ; a2
   3588    paddd                m5, m7
   3589    pmaddwd              m7, m10, m4 ; b2
   3590    paddd                m6, m7
   3591    psrad                m5, 6
   3592    psrad                m6, 6
   3593    packssdw             m5, m6
   3594    mova             [tmpq], m5
   3595    add                tmpq, 16
   3596    sub                  hd, 2
   3597    jg .hv_w4_loop
   3598    RET
.hv_w8:
    ; Combined horizontal+vertical path for w >= 8, processed in 8-pixel
    ; columns.  Both H and V use the 6-tap kernel (offset +1 into the stored
    ; 8-tap filters).  Coefficients are kept on the stack; on x86-32 the
    ; intermediate line buffers live there too ([rsp+16*6..14]).
    RESET_STACK_STATE
    shr                 mxd, 16
    sub                srcq, 2     ; back up to the first H tap's pixel
    movq                 m0, [base_reg-prep_ssse3+subpel_filters+1+mxq*8]
%if ARCH_X86_32
    mov                 mxd, myd
    and                 mxd, 0x7f
%else
    movzx               mxd, myb
%endif
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd    ; h < 6: use the 4-tap variant index
    movq                 m1, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
%if ARCH_X86_32
    mov                 ssq, ssm
%assign regs_used 6
    ALLOC_STACK  -mmsize*16
%assign regs_used 7
    sub                srcq, ssq   ; start 2 rows above the output row
    sub                srcq, ssq
%if STACK_ALIGNMENT < 16
    %define            srcm  [esp+mmsize*15+gprsize*0]
    %define            tmpm  [esp+mmsize*15+gprsize*1]
    mov                tmpm, tmpq
%endif
    mov                srcm, srcq
%else
    ALLOC_STACK        16*6, 16
    mov                 nsq, ssq   ; nsq = -stride
    neg                 nsq
%endif
    mova                 m7, [base+pw_8192]
    ; r5d packs the column counter (bits 16+, via (w-8)<<13) and h (low bits)
    lea                 r5d, [wq-8]
    punpcklwd            m0, m0
    shl                 r5d, 13
    punpcklbw            m1, m1
    add                 r5d, hd
    psraw                m1, 8 ; sign-extend
    ; H coefficient pairs -> [rsp+16*0..2], V pairs -> [rsp+16*3..5]
    pshufd               m2, m0, q0000
    mova         [rsp+16*0], m2
    pshufd               m2, m0, q1111
    mova         [rsp+16*1], m2
    pshufd               m0, m0, q2222
    mova         [rsp+16*2], m0
    pshufd               m2, m1, q0000
    mova         [rsp+16*3], m2
    pshufd               m2, m1, q1111
    mova         [rsp+16*4], m2
    pshufd               m1, m1, q2222
    mova         [rsp+16*5], m1
; H-filter one 8-pixel row for the HV path; result stays in 16-bit
; intermediate precision (pmulhrsw by pw_8192).
%macro PREP_HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \
                          [rsp+16*0], [rsp+16*1], [rsp+16*2] ; src/dst, tmp[1-2], shuf[1-2], mul[1-3]
    pshufb               %2, %1, %4
    pshufb               %1, %5
    pmaddubsw            %3, %2, %6
    shufps               %2, %1, q2121 ; middle tap-pair layout
    pmaddubsw            %1, %8
    pmaddubsw            %2, %7
    paddw                %1, %3
    paddw                %1, %2
    pmulhrsw             %1, m7
%endmacro
.hv_w8_loop0: ; per 8-pixel column: H-filter setup rows 0-4, build row pairs
    mova                 m2, [base+subpel_h_shufD]
    mova                 m3, [base+subpel_h_shufF]
    mova                 m4, [rsp+16*0]
%if ARCH_X86_32
    ; x86-32: row pairs (lo/hi halves) are staged in [rsp+16*6..13], the
    ; latest H-filtered row in [rsp+16*14].
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    PREP_HV_H_6TAP       m0, m5, m6, m2, m3, m4
    PREP_HV_H_6TAP       m1, m5, m6, m2, m3, m4
    movu                 m5, [srcq+ssq*0]
    punpcklwd            m6, m0, m1   ; 01
    punpckhwd            m0, m1
    mova        [rsp+16* 6], m6
    mova        [rsp+16* 7], m0
    PREP_HV_H_6TAP       m5, m0, m6, m2, m3, m4
    movu                 m0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklwd            m6, m1, m5   ; 12
    punpckhwd            m1, m5
    mova        [rsp+16* 8], m6
    mova        [rsp+16* 9], m1
    PREP_HV_H_6TAP       m0, m1, m6, m2, m3, m4
    movu                 m1, [srcq+ssq*0]
    punpcklwd            m6, m5, m0   ; 23
    punpckhwd            m5, m0
    mova        [rsp+16*10], m6
    mova        [rsp+16*11], m5
    PREP_HV_H_6TAP       m1, m5, m6, m2, m3, m4
    mova        [rsp+16*14], m1
    punpcklwd            m6, m0, m1   ; 34
    punpckhwd            m0, m1
    mova        [rsp+16*12], m6
    mova        [rsp+16*13], m0
.hv_w8_loop: ; two output rows per iteration (a/b = rows, ' = high half)
    mova                 m3, [rsp+16* 3]
    pmaddwd              m0, m3, [rsp+16* 6] ; a0
    pmaddwd              m2, m3, [rsp+16* 7] ; a0'
    pmaddwd              m1, m3, [rsp+16* 8] ; b0
    pmaddwd              m3, [rsp+16* 9]     ; b0'
    mova                 m6, [rsp+16* 4]
    mova                 m4, [rsp+16*10]
    mova                 m5, [rsp+16*11]
    mova        [rsp+16* 6], m4  ; shift the pair queue down by one step
    pmaddwd              m4, m6       ; a1
    mova        [rsp+16* 7], m5
    pmaddwd              m5, m6       ; a1'
    paddd                m0, m4
    mova                 m4, [rsp+16*12]
    paddd                m2, m5
    mova                 m5, [rsp+16*13]
    mova        [rsp+16* 8], m4
    pmaddwd              m4, m6       ; b1
    mova        [rsp+16* 9], m5
    pmaddwd              m5, m6       ; b1'
    movu                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    paddd                m1, m4
    paddd                m3, m5
    PREP_HV_H_6TAP       m6, m4, m5
    mova                 m4, [base+pd_32]
    mova                 m5, [rsp+16*14]
    REPX      {paddd x, m4}, m0, m2, m1, m3 ; + pd_32 rounding bias
    punpcklwd            m4, m5, m6   ; 45
    punpckhwd            m5, m6
    mova        [rsp+16*10], m4
    mova        [rsp+16*11], m5
    pmaddwd              m4, [rsp+16*5] ; a2
    pmaddwd              m5, [rsp+16*5] ; a2'
    paddd                m0, m4
    movu                 m4, [srcq+ssq*0]
    paddd                m2, m5
    psrad                m0, 6
    psrad                m2, 6
    packssdw             m0, m2
    PREP_HV_H_6TAP       m4, m2, m5
    mova                 m2, [rsp+16*5]
    punpcklwd            m5, m6, m4   ; 56
    mova        [rsp+16*14], m4
    punpckhwd            m6, m4
    mova        [rsp+16*12], m5
    pmaddwd              m5, m2       ; b2
    mova        [rsp+16*13], m6
    pmaddwd              m6, m2       ; b2'
    paddd                m1, m5
    paddd                m3, m6
    psrad                m1, 6
    psrad                m3, 6
    packssdw             m1, m3
    mova        [tmpq+wq*0], m0
    mova        [tmpq+wq*2], m1
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .hv_w8_loop
    ; next 8-pixel column
    mov                srcq, srcm
    mov                tmpq, tmpm
    movzx                hd, r5w
    add                srcq, 8
    add                tmpq, 16
    mov                srcm, srcq
    mov                tmpm, tmpq
%else
    ; x86-64: the row-pair queue lives in registers (m8-m15).
    movu                 m9, [srcq+nsq*2]
    movu                m11, [srcq+nsq*1]
    lea                  r6, [srcq+ssq*2]
    movu                m13, [srcq+ssq*0]
    movu                m15, [srcq+ssq*1]
    mov                  r8, tmpq
    movu                 m6, [r6  +ssq*0]
    mova                 m5, [rsp+16*1]
    mova                 m8, [rsp+16*2]
    PREP_HV_H_6TAP       m9, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP      m11, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP      m13, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP      m15, m0, m1, m2, m3, m4, m5, m8
    PREP_HV_H_6TAP       m6, m0, m1, m2, m3, m4, m5, m8
    punpcklwd            m8, m9, m11  ; 01
    punpckhwd            m9, m11
    punpcklwd           m10, m11, m13 ; 12
    punpckhwd           m11, m13
    punpcklwd           m12, m13, m15 ; 23
    punpckhwd           m13, m15
    punpcklwd           m14, m15, m6  ; 34
    punpckhwd           m15, m6
.hv_w8_loop: ; two output rows per iteration (a/b = rows, ' = high half)
    mova                 m3, [rsp+16*3]
    mova                 m4, [rsp+16*4]
    mova                 m5, [base+pd_32]
    pmaddwd              m0, m8, m3  ; a0
    mova                 m8, m12     ; shift the pair queue down by one step
    pmaddwd              m2, m9, m3  ; a0'
    mova                 m9, m13
    pmaddwd              m1, m10, m3 ; b0
    mova                m10, m14
    pmaddwd              m3, m11     ; b0'
    mova                m11, m15
    REPX    {pmaddwd x, m4}, m12, m13, m14, m15
    REPX    {paddd   x, m5}, m0, m2, m1, m3 ; + pd_32 rounding bias
    paddd                m0, m12
    paddd                m2, m13
    paddd                m1, m14
    paddd                m3, m15
    movu                m15, [r6+ssq*1]
    lea                  r6, [r6+ssq*2]
    PREP_HV_H_6TAP      m15, m4, m5
    punpcklwd           m12, m6, m15
    punpckhwd           m13, m6, m15
    movu                 m6, [r6+ssq*0]
    PREP_HV_H_6TAP       m6, m4, m5
    mova                 m4, [rsp+16*5]
    punpcklwd           m14, m15, m6
    punpckhwd           m15, m6
    pmaddwd              m5, m12, m4  ; a2
    paddd                m0, m5
    pmaddwd              m5, m13, m4  ; a2'
    paddd                m2, m5
    pmaddwd              m5, m14, m4  ; b2
    paddd                m1, m5
    pmaddwd              m4, m15      ; b2'
    paddd                m3, m4
    REPX       {psrad x, 6}, m0, m2, m1, m3
    packssdw             m0, m2
    packssdw             m1, m3
    mova          [r8+wq*0], m0
    mova          [r8+wq*2], m1
    lea                  r8, [r8+wq*4]
    sub                  hd, 2
    jg .hv_w8_loop
    ; next 8-pixel column
    add                srcq, 8
    add                tmpq, 16
    movzx                hd, r5b
%endif
    sub                 r5d, 1<<16
    jg .hv_w8_loop0
    RET
   3838 
; Entry points for filter combinations that involve SHARP (true 8-tap):
; these cannot use the 6-tap fast path above.  Variants with a 4th argument
; fall through to prep_8tap_8bpc; the last one (sharp/sharp) falls into the
; cglobal prep_8tap_8bpc body directly below.
PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_8bpc
PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_8bpc
PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_8bpc
PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_8bpc
PREP_8TAP_FN sharp,          SHARP,   SHARP
   3844 
;-----------------------------------------------------------------------
; prep_8tap_8bpc (SSSE3)
; Prefilter ("prep") an 8-bit source block with an 8-tap subpel filter
; into the intermediate 16-bit tmp buffer, for horizontal (.h),
; vertical (.v), or combined (.hv) fractional offsets.
; Args (cglobal order): tmp, src, stride, w, h, mx, my (+stride3 on x64).
; If neither mx nor my has a fractional part, control is tail-jumped to
; the plain-copy path of the 6-tap prep function.
;-----------------------------------------------------------------------
   3845 cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
; Fold the filter-type selector (t0d/t1d) into the fractional position:
; mx*0x010101 replicates mx so the combined value indexes subpel_filters.
   3846    imul                mxd, mxm, 0x010101
   3847    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
   3848    imul                myd, mym, 0x010101
   3849    add                 myd, t1d ; 8tap_v, my, 4tap_v
   3850    mov                  wd, wm
   3851    movifnidn          srcd, srcm
   3852    movifnidn            hd, hm
   3853    LEA            base_reg, prep_ssse3
; Dispatch on fractional parts: bits 8-11 of mxd/myd are nonzero iff the
; corresponding direction needs filtering.
   3854    test                mxd, 0xf00
   3855    jnz .h
   3856    test                myd, 0xf00
   3857    jz mangle(private_prefix %+ _prep_6tap_8bpc_ssse3).prep
; ---- vertical-only path ----
   3858 .v:
   3859 %if ARCH_X86_32
   3860    mov                 mxd, myd
   3861    and                 mxd, 0x7f
   3862 %else
   3863    WIN64_SPILL_XMM      16
   3864    movzx               mxd, myb
   3865 %endif
; For h < 6 use the 4-tap variant of the filter (low byte of myd).
   3866    shr                 myd, 16
   3867    cmp                  hd, 6
   3868    cmovs               myd, mxd
   3869    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
   3870    mova                 m2, [base+pw_512]
   3871    mova                 m7, [base+pw_8192]
   3872    punpcklwd            m0, m0
   3873 %if ARCH_X86_32
; x86_32 has too few XMM regs: spill the four broadcast filter-tap pairs
; to stack slots instead of keeping them in m8-m11.
   3874 %define            subpel0  [rsp+mmsize*0]
   3875 %define            subpel1  [rsp+mmsize*1]
   3876 %define            subpel2  [rsp+mmsize*2]
   3877 %define            subpel3  [rsp+mmsize*3]
   3878 %assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
   3879    ALLOC_STACK   -mmsize*4
   3880 %assign regs_used 7
   3881    mov             strideq, [rstk+stack_offset+gprsize*3]
   3882    pshufd               m1, m0, q0000
   3883    mova            subpel0, m1
   3884    pshufd               m1, m0, q1111
   3885    mova            subpel1, m1
   3886    lea                  r5, [strideq*3]
   3887    pshufd               m1, m0, q2222
   3888    mova            subpel2, m1
   3889    pshufd               m1, m0, q3333
   3890    mova            subpel3, m1
; Start 3 rows above the block (8-tap needs 3 rows of top context).
   3891    sub                srcq, r5
   3892 %else
   3893 %define            subpel0  m8
   3894 %define            subpel1  m9
   3895 %define            subpel2  m10
   3896 %define            subpel3  m11
   3897    pshufd               m8, m0, q0000
   3898    pshufd               m9, m0, q1111
   3899    lea            stride3q, [strideq*3]
   3900    pshufd              m10, m0, q2222
   3901    pshufd              m11, m0, q3333
   3902    sub                srcq, stride3q
   3903    cmp                  wd, 8
   3904    jns .v_w8
   3905 %endif
; w==4 (on x86_32 also wider blocks, processed 4 columns at a time).
   3906 .v_w4:
   3907 %if ARCH_X86_32
   3908 %if STACK_ALIGNMENT < mmsize
   3909  %define srcm [esp+stack_size+gprsize*1]
   3910  %define tmpm [esp+stack_size+gprsize*2]
   3911 %endif
   3912    mov                tmpm, tmpq
   3913    mov                srcm, srcq
; Pack the loop counters into one register:
; r5d = ((w-4)/4) << 16 | h  (high half: remaining 4-wide columns).
   3913    mov                srcm, srcq
   3914    lea                 r5d, [wq - 4] ; horizontal loop
   3915    shl                 r5d, (16 - 2)  ; (wq / 4) << 16
   3916    mov                 r5w, hw
   3917 .v_w4_loop0:
   3918 %endif
; Load the first 7 rows and interleave into byte pairs 01/12, 23/34, 45/56
; ready for pmaddubsw against the paired filter taps.
   3919    movd                 m1, [srcq+strideq*0]
   3920    movd                 m0, [srcq+strideq*1]
   3921 %if ARCH_X86_32
   3922    lea                srcq, [srcq+strideq*2]
   3923    movd                 m2, [srcq+strideq*0]
   3924    movd                 m4, [srcq+strideq*1]
   3925    lea                srcq, [srcq+strideq*2]
   3926    movd                 m3, [srcq+strideq*0]
   3927    movd                 m5, [srcq+strideq*1]
   3928    lea                srcq, [srcq+strideq*2]
   3929 %else
   3930    movd                 m2, [srcq+strideq*2]
   3931    add                srcq, stride3q
   3932    movd                 m4, [srcq+strideq*0]
   3933    movd                 m3, [srcq+strideq*1]
   3934    movd                 m5, [srcq+strideq*2]
   3935    add                srcq, stride3q
   3936 %endif
   3937    punpckldq            m1, m0 ; 0 1
   3938    punpckldq            m0, m2 ; 1 2
   3939    punpcklbw            m1, m0 ; 01 12
   3940    movd                 m0, [srcq+strideq*0]
   3941    punpckldq            m2, m4 ; 2 3
   3942    punpckldq            m4, m3 ; 3 4
   3943    punpckldq            m3, m5 ; 4 5
   3944    punpckldq            m5, m0 ; 5 6
   3945    punpcklbw            m2, m4 ; 23 34
   3946    punpcklbw            m3, m5 ; 45 56
; Two output rows per iteration: accumulate a0..a3/b0..b3 tap products,
; round with pmulhrsw(pw_8192) to the prep intermediate range.
   3947 .v_w4_loop:
   3948    mova                 m5, m1
   3949    pmaddubsw            m5, subpel0      ; a0 b0
   3950    mova                 m1, m2
   3951    pmaddubsw            m2, subpel1      ; a1 b1
   3952    paddw                m5, m2
   3953    mova                 m2, m3
   3954    pmaddubsw            m3, subpel2      ; a2 b2
   3955    movd                 m4, [srcq+strideq*1]
   3956    lea                srcq, [srcq+strideq*2]
   3957    paddw                m5, m3
   3958    punpckldq            m3, m0, m4       ; 6 7 _ _
   3959    movd                 m0, [srcq+strideq*0]
   3960    punpckldq            m4, m0           ; 7 8 _ _
   3961    punpcklbw            m3, m4           ; 67 78
   3962    mova                 m4, m3
   3963    pmaddubsw            m4, subpel3      ; a3 b3
   3964    paddw                m5, m4
   3965    pmulhrsw             m5, m7
   3966    movq        [tmpq+wq*0], m5
   3967    movhps      [tmpq+wq*2], m5
   3968    lea                tmpq, [tmpq+wq*4]
   3969    sub                  hd, 2
   3970    jg .v_w4_loop
   3971 %if ARCH_X86_32
; Advance to the next 4-wide column strip; height restored from r5w.
   3972    mov                srcq, srcm
   3973    mov                tmpq, tmpm
   3974    movzx                hd, r5w
   3975    add                srcq, 4
   3976    add                tmpq, 8
   3977    mov                srcm, srcq
   3978    mov                tmpm, tmpq
   3979    sub                 r5d, 1<<16 ; horizontal--
   3980    jg .v_w4_loop0
   3981 %endif
   3982    RET
   3983 %if ARCH_X86_64
; w>=8 vertical path (x86_64 only): 8 columns per strip, counters packed
; as r6d = (w*8-64)*4 + h, i.e. strip count in the upper bits, h in r6b.
   3984 .v_w8:
   3985    lea                 r6d, [wq*8-64]
   3986    mov                  r5, srcq
   3987    mov                  r8, tmpq
   3988    lea                 r6d, [hq+r6*4]
   3989 .v_w8_loop0:
; Prime rows 0-6 as interleaved byte pairs 01..56.
   3990    movq                 m1, [srcq+strideq*0]
   3991    movq                 m2, [srcq+strideq*1]
   3992    movq                 m3, [srcq+strideq*2]
   3993    add                srcq, stride3q
   3994    movq                 m4, [srcq+strideq*0]
   3995    movq                 m5, [srcq+strideq*1]
   3996    movq                 m6, [srcq+strideq*2]
   3997    add                srcq, stride3q
   3998    movq                 m0, [srcq+strideq*0]
   3999    punpcklbw            m1, m2 ; 01
   4000    punpcklbw            m2, m3 ; 12
   4001    punpcklbw            m3, m4 ; 23
   4002    punpcklbw            m4, m5 ; 34
   4003    punpcklbw            m5, m6 ; 45
   4004    punpcklbw            m6, m0 ; 56
   4005 .v_w8_loop:
   4006    movq                m13, [srcq+strideq*1]
   4007    lea                srcq, [srcq+strideq*2]
   4008    pmaddubsw           m14, m1, subpel0 ; a0
   4009    pmaddubsw           m15, m2, subpel0 ; b0
   4010    mova                 m1, m3
   4011    mova                 m2, m4
   4012    pmaddubsw            m3, subpel1 ; a1
   4013    pmaddubsw            m4, subpel1 ; b1
   4014    paddw               m14, m3
   4015    paddw               m15, m4
   4016    mova                 m3, m5
   4017    mova                 m4, m6
   4018    pmaddubsw            m5, subpel2 ; a2
   4019    pmaddubsw            m6, subpel2 ; b2
   4020    punpcklbw           m12, m0, m13 ; 67
   4021    movq                 m0, [srcq+strideq*0]
   4022    punpcklbw           m13, m0      ; 78
   4023    paddw               m14, m5
   4024    mova                 m5, m12
   4025    pmaddubsw           m12, subpel3 ; a3
   4026    paddw               m15, m6
   4027    mova                 m6, m13
   4028    pmaddubsw           m13, subpel3 ; b3
   4029    paddw               m14, m12
   4030    paddw               m15, m13
   4031    pmulhrsw            m14, m7
   4032    pmulhrsw            m15, m7
   4033    movu        [tmpq+wq*0], m14
   4034    movu        [tmpq+wq*2], m15
   4035    lea                tmpq, [tmpq+wq*4]
   4036    sub                  hd, 2
   4037    jg .v_w8_loop
; Next 8-wide strip: bump the saved base pointers, reload h from r6b.
   4038    add                  r5, 8
   4039    add                  r8, 16
   4040    movzx                hd, r6b
   4041    mov                srcq, r5
   4042    mov                tmpq, r8
   4043    sub                 r6d, 1<<8
   4044    jg .v_w8_loop0
   4045    RET
   4046 %endif ;ARCH_X86_64
   4047 %undef subpel0
   4048 %undef subpel1
   4049 %undef subpel2
   4050 %undef subpel3
; ---- horizontal-only, w==4: single 4-tap filter via pshufb+pmaddubsw ----
   4051 .h_w4:
   4052    WIN64_SPILL_XMM       7
   4053 %if ARCH_X86_32
   4054    and                 mxd, 0x7f
   4055 %else
   4056    movzx               mxd, mxb
   4057 %endif
; -1: 4-tap filters need 1 pixel of left context (+2 skips the outer taps
; in the filter table).
   4058    dec                srcq
   4059    movd                 m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
   4060    mova                 m5, [base+subpel_h_shufA]
   4061    mova                 m6, [base+pw_8192]
   4062    movifnidn            r2, stridemp
   4063    pshufd               m4, m4, q0000
   4064    lea                  r3, [r2*3]
   4065 .h_w4_loop:
; Four rows per iteration, each shuffled into sliding 4-byte windows.
   4066    movq                 m0, [srcq+r2*0]
   4067    movq                 m1, [srcq+r2*1]
   4068    movq                 m2, [srcq+r2*2]
   4069    movq                 m3, [srcq+r3  ]
   4070    lea                srcq, [srcq+r2*4]
   4071    REPX  {pshufb    x, m5}, m0, m1, m2, m3
   4072    REPX  {pmaddubsw x, m4}, m0, m1, m2, m3
   4073    phaddw               m0, m1
   4074    phaddw               m2, m3
   4075    pmulhrsw             m0, m6
   4076    pmulhrsw             m2, m6
   4077    mova        [tmpq+16*0], m0
   4078    mova        [tmpq+16*1], m2
   4079    add                tmpq, 32
   4080    sub                  hd, 4
   4081    jg .h_w4_loop
   4082    RET
; ---- horizontal path (w>=8), or fall through to .hv if my is fractional ----
   4083 .h:
   4084    test                myd, 0xf00
   4085    jnz .hv
   4086    cmp                  wd, 4
   4087    je .h_w4
   4088    WIN64_SPILL_XMM      12
   4089 %if ARCH_X86_32
   4090 %define strideq r6
   4091    mov             strideq, stridem
   4092 %endif
; tzcnt(w) indexes the per-width jump table below.
   4093    tzcnt                wd, wd
   4094 %if ARCH_X86_64
   4095    mova                m10, [base+subpel_h_shufA]
   4096    mova                m11, [base+subpel_h_shufB]
   4097    mova                 m9, [base+subpel_h_shufC]
   4098 %else
   4099    %define             m10  [base+subpel_h_shufA]
   4100    %define             m11  [base+subpel_h_shufB]
   4101    %define              m9  [base+subpel_h_shufC]
   4102 %endif
   4103    shr                 mxd, 16
; -3: 8-tap filters need 3 pixels of left context.
   4104    sub                srcq, 3
   4105    movzx                wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
   4106    movq                 m6, [base_reg+mxq*8+subpel_filters-prep_ssse3]
   4107    mova                 m7, [base+pw_8192]
   4108    pshufd               m5, m6, q0000
   4109    pshufd               m6, m6, q1111
   4110    add                  wq, base_reg
   4111    jmp                  wq
; Filter 8 output pixels horizontally: three overlapping shuffles feed
; pmaddubsw with the low (m5) and high (m6) tap pairs, combined by phaddw.
   4112 %macro PREP_8TAP_H 2 ; dst, src_memloc
   4113    movu                m%1, [%2]
   4114    pshufb               m2, m%1, m11 ; subpel_h_shufB
   4115    pshufb               m3, m%1, m9  ; subpel_h_shufC
   4116    pshufb              m%1, m10      ; subpel_h_shufA
   4117    mova                 m4, m2
   4118    pmaddubsw            m4, m5       ; subpel +0 B0
   4119    pmaddubsw            m2, m6       ; subpel +4 B4
   4120    pmaddubsw            m3, m6       ; subpel +4 C4
   4121    pmaddubsw           m%1, m5       ; subpel +0 A0
   4122    paddw                m3, m4
   4123    paddw               m%1, m2
   4124    phaddw              m%1, m3
   4125    pmulhrsw            m%1, m7
   4126 %endmacro
   4127 .h_w8:
   4128    PREP_8TAP_H           0, srcq+strideq*0
   4129    PREP_8TAP_H           1, srcq+strideq*1
   4130    mova        [tmpq+16*0], m0
   4131    mova        [tmpq+16*1], m1
   4132    lea                srcq, [srcq+strideq*2]
   4133    add                tmpq, 32
   4134    sub                  hd, 2
   4135    jg .h_w8
   4136    RET
; Wider blocks: r3 holds -w (in bytes) and counts up to 0 across a row,
; so [srcq+r3] walks the row after srcq is biased by +w (sub srcq, r3).
   4137 .h_w16:
   4138    mov                  r3, -16*1
   4139    jmp .h_start
   4140 .h_w32:
   4141    mov                  r3, -16*2
   4142    jmp .h_start
   4143 .h_w64:
   4144    mov                  r3, -16*4
   4145    jmp .h_start
   4146 .h_w128:
   4147    mov                  r3, -16*8
   4148 .h_start:
   4149    sub                srcq, r3
   4150    mov                  r5, r3
   4151 .h_loop:
   4152    PREP_8TAP_H           0, srcq+r3+8*0
   4153    PREP_8TAP_H           1, srcq+r3+8*1
   4154    mova        [tmpq+16*0], m0
   4155    mova        [tmpq+16*1], m1
   4156    add                tmpq, 32
   4157    add                  r3, 16
   4158    jl .h_loop
   4159    add                srcq, strideq
   4160    mov                  r3, r5
   4161    dec                  hd
   4162    jg .h_loop
   4163    RET
; ---- combined horizontal+vertical path ----
   4164 .hv:
   4165    RESET_STACK_STATE
   4166    cmp                  wd, 4
   4167    jg .hv_w8
; w==4: 4-tap horizontal filter, 8-tap vertical filter.
   4168    and                 mxd, 0x7f
   4169    movd                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
   4170 %if ARCH_X86_32
   4171    mov                 mxd, myd
   4172    shr                 myd, 16
   4173    and                 mxd, 0x7f
   4174    cmp                  hd, 6
   4175    cmovs               myd, mxd
   4176    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
   4177    mov             strideq, stridem
   4178 %assign regs_used 6
   4179    ALLOC_STACK  -mmsize*14
   4180 %assign regs_used 7
; Start 3 rows up and 1 pixel left of the block.
   4181    lea                  r5, [strideq*3+1]
   4182    sub                srcq, r5
   4183 %define           subpelv0  [rsp+mmsize*0]
   4184 %define           subpelv1  [rsp+mmsize*1]
   4185 %define           subpelv2  [rsp+mmsize*2]
   4186 %define           subpelv3  [rsp+mmsize*3]
; Sign-extend the vertical taps to words for pmaddwd.
   4187    punpcklbw            m0, m0
   4188    psraw                m0, 8
   4189    pshufd               m6, m0, q0000
   4190    mova           subpelv0, m6
   4191    pshufd               m6, m0, q1111
   4192    mova           subpelv1, m6
   4193    pshufd               m6, m0, q2222
   4194    mova           subpelv2, m6
   4195    pshufd               m6, m0, q3333
   4196    mova           subpelv3, m6
   4197 %else
   4198    movzx               mxd, myb
   4199    shr                 myd, 16
   4200    cmp                  hd, 6
   4201    cmovs               myd, mxd
   4202    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
   4203    ALLOC_STACK   mmsize*14, 14
   4204    lea            stride3q, [strideq*3]
   4205    sub                srcq, stride3q
   4206    dec                srcq
   4207 %define           subpelv0  m10
   4208 %define           subpelv1  m11
   4209 %define           subpelv2  m12
   4210 %define           subpelv3  m13
   4211    punpcklbw            m0, m0
   4212    psraw                m0, 8
   4213    mova                 m8, [base+pw_8192]
   4214    mova                 m9, [base+pd_32]
   4215    pshufd              m10, m0, q0000
   4216    pshufd              m11, m0, q1111
   4217    pshufd              m12, m0, q2222
   4218    pshufd              m13, m0, q3333
   4219 %endif
   4220    pshufd               m7, m1, q0000
; Stack-slot indices used by SAVELINE_W4/RESTORELINE_W4: two banks
; (suffix _0/_1) hold the low/high halves of the intermediate rows.
   4221 %define hv4_line_0_0 4
   4222 %define hv4_line_0_1 5
   4223 %define hv4_line_0_2 6
   4224 %define hv4_line_0_3 7
   4225 %define hv4_line_0_4 8
   4226 %define hv4_line_0_5 9
   4227 %define hv4_line_1_0 10
   4228 %define hv4_line_1_1 11
   4229 %define hv4_line_1_2 12
   4230 %define hv4_line_1_3 13
   4231 %if ARCH_X86_32
   4232    %define        w8192reg  [base+pw_8192]
   4233    %define          d32reg  [base+pd_32]
   4234 %else
   4235    %define        w8192reg  m8
   4236    %define          d32reg  m9
   4237 %endif
   4238    ; lower shuffle 0 1 2 3 4
   4239    mova                 m6, [base+subpel_h_shuf4]
   4240    movq                 m5, [srcq+strideq*0]   ; 0 _ _ _
   4241    movhps               m5, [srcq+strideq*1]   ; 0 _ 1 _
   4242 %if ARCH_X86_32
   4243    lea                srcq, [srcq+strideq*2]
   4244    movq                 m4, [srcq+strideq*0]   ; 2 _ _ _
   4245    movhps               m4, [srcq+strideq*1]   ; 2 _ 3 _
   4246    lea                srcq, [srcq+strideq*2]
   4247 %else
   4248    movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
   4249    movhps               m4, [srcq+stride3q ]   ; 2 _ 3 _
   4250    lea                srcq, [srcq+strideq*4]
   4251 %endif
   4252    pshufb               m2, m5, m6             ;H subpel_h_shuf4 0~1~
   4253    pshufb               m0, m4, m6             ;H subpel_h_shuf4 2~3~
   4254    pmaddubsw            m2, m7                 ;H subpel_filters
   4255    pmaddubsw            m0, m7                 ;H subpel_filters
   4256    phaddw               m2, m0
   4257    pmulhrsw             m2, w8192reg
   4258    SAVELINE_W4          m2, 2, 0
   4259    ; upper shuffle 2 3 4 5 6
   4260    mova                 m6, [base+subpel_h_shuf4+16]
   4261    pshufb               m2, m5, m6             ;H subpel_h_shuf4 0~1~
   4262    pshufb               m0, m4, m6             ;H subpel_h_shuf4 2~3~
   4263    pmaddubsw            m2, m7                 ;H subpel_filters
   4264    pmaddubsw            m0, m7                 ;H subpel_filters
   4265    phaddw               m2, m0                 ;H 0 1 2 3
   4266    pmulhrsw             m2, w8192reg
   4267    ; lower shuffle
   4268    mova                 m6, [base+subpel_h_shuf4]
   4269    movq                 m5, [srcq+strideq*0]   ; 4 _ _ _
   4270    movhps               m5, [srcq+strideq*1]   ; 4 _ 5 _
   4271 %if ARCH_X86_32
   4272    lea                srcq, [srcq+strideq*2]
   4273    movq                 m4, [srcq+strideq*0]   ; 6 _ _ _
   4274    add                srcq, strideq
   4275 %else
   4276    movq                 m4, [srcq+strideq*2]   ; 6 _ _ _
   4277    add                srcq, stride3q
   4278 %endif
   4279    pshufb               m3, m5, m6             ;H subpel_h_shuf4 4~5~
   4280    pshufb               m0, m4, m6             ;H subpel_h_shuf4 6~6~
   4281    pmaddubsw            m3, m7                 ;H subpel_filters
   4282    pmaddubsw            m0, m7                 ;H subpel_filters
   4283    phaddw               m3, m0                 ;H 4 5 6 7
   4284    pmulhrsw             m3, w8192reg
   4285    SAVELINE_W4          m3, 3, 0
   4286    ; upper shuffle
   4287    mova                 m6, [base+subpel_h_shuf4+16]
   4288    pshufb               m3, m5, m6             ;H subpel_h_shuf4 4~5~
   4289    pshufb               m0, m4, m6             ;H subpel_h_shuf4 6~6~
   4290    pmaddubsw            m3, m7                 ;H subpel_filters
   4291    pmaddubsw            m0, m7                 ;H subpel_filters
   4292    phaddw               m3, m0                 ;H 4 5 6 7
   4293    pmulhrsw             m3, w8192reg
   4294    ;process high
; Build the vertical word-pair operands 01/12, 23/34, 45/56 and park the
; "high" bank in stack slots while the "low" bank is processed.
   4295    palignr              m4, m3, m2, 4;V 1 2 3 4
   4296    punpcklwd            m1, m2, m4  ; V 01 12
   4297    punpckhwd            m2, m4      ; V 23 34
   4298    pshufd               m0, m3, q2121;V 5 6 5 6
   4299    punpcklwd            m3, m0      ; V 45 56
   4300    SAVELINE_W4          m0, 0, 1
   4301    SAVELINE_W4          m1, 1, 1
   4302    SAVELINE_W4          m2, 2, 1
   4303    SAVELINE_W4          m3, 3, 1
   4304    ;process low
   4305    RESTORELINE_W4       m2, 2, 0
   4306    RESTORELINE_W4       m3, 3, 0
   4307    palignr              m4, m3, m2, 4;V 1 2 3 4
   4308    punpcklwd            m1, m2, m4  ; V 01 12
   4309    punpckhwd            m2, m4      ; V 23 34
   4310    pshufd               m0, m3, q2121;V 5 6 5 6
   4311    punpcklwd            m3, m0      ; V 45 56
   4312 .hv_w4_loop:
   4313    ;process low
   4314    pmaddwd              m5, m1, subpelv0 ; V a0 b0
   4315    mova                 m1, m2
   4316    pmaddwd              m2, subpelv1; V a1 b1
   4317    paddd                m5, m2
   4318    mova                 m2, m3
   4319    pmaddwd              m3, subpelv2; V a2 b2
   4320    paddd                m5, m3
   4321    mova                 m6, [base+subpel_h_shuf4]
   4322    movq                 m4, [srcq+strideq*0] ; 7
   4323    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
   4324    pshufb               m4, m6               ; H subpel_h_shuf4 7~8~
   4325    pmaddubsw            m4, m7               ; H subpel_filters
   4326    phaddw               m4, m4               ; H                7878
   4327    pmulhrsw             m4, w8192reg
   4328    palignr              m3, m4, m0, 12       ;                  6787
   4329    mova                 m0, m4
   4330    punpcklwd            m3, m4      ; 67 78
   4331    pmaddwd              m4, m3, subpelv3; a3 b3
   4332    paddd                m5, d32reg ; pd_32
   4333    paddd                m5, m4
   4334    psrad                m5, 6
   4335    SAVELINE_W4          m0, 0, 0
   4336    SAVELINE_W4          m1, 1, 0
   4337    SAVELINE_W4          m2, 2, 0
   4338    SAVELINE_W4          m3, 3, 0
   4339    SAVELINE_W4          m5, 5, 0
   4340    ;process high
   4341    RESTORELINE_W4       m0, 0, 1
   4342    RESTORELINE_W4       m1, 1, 1
   4343    RESTORELINE_W4       m2, 2, 1
   4344    RESTORELINE_W4       m3, 3, 1
   4345    pmaddwd              m5, m1, subpelv0; V a0 b0
   4346    mova                 m1, m2
   4347    pmaddwd              m2, subpelv1; V a1 b1
   4348    paddd                m5, m2
   4349    mova                 m2, m3
   4350    pmaddwd              m3, subpelv2; V a2 b2
   4351    paddd                m5, m3
   4352    mova                 m6, [base+subpel_h_shuf4+16]
   4353    movq                 m4, [srcq+strideq*0] ; 7
   4354    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
   4355    pshufb               m4, m6               ; H subpel_h_shuf4 7~8~
   4356    pmaddubsw            m4, m7               ; H subpel_filters
   4357    phaddw               m4, m4               ; H                7878
   4358    pmulhrsw             m4, w8192reg
   4359    palignr              m3, m4, m0, 12       ;                  6787
   4360    mova                 m0, m4
   4361    punpcklwd            m3, m4      ; 67 78
   4362    pmaddwd              m4, m3, subpelv3; a3 b3
   4363    paddd                m5, d32reg ; pd_32
   4364    paddd                m5, m4
   4365    psrad                m4, m5, 6
; Recombine low (saved) and high halves into two interleaved output rows.
   4366    RESTORELINE_W4       m5, 5, 0
   4367    packssdw             m5, m4
   4368    pshufd               m5, m5, q3120
   4369    movu             [tmpq], m5
   4370    lea                srcq, [srcq+strideq*2]
   4371    add                tmpq, 16
   4372    sub                  hd, 2
   4373    SAVELINE_W4          m0, 0, 1
   4374    SAVELINE_W4          m1, 1, 1
   4375    SAVELINE_W4          m2, 2, 1
   4376    SAVELINE_W4          m3, 3, 1
   4377    RESTORELINE_W4       m0, 0, 0
   4378    RESTORELINE_W4       m1, 1, 0
   4379    RESTORELINE_W4       m2, 2, 0
   4380    RESTORELINE_W4       m3, 3, 0
   4381    jg .hv_w4_loop
   4382    RET
   4383 %undef subpelv0
   4384 %undef subpelv1
   4385 %undef subpelv2
   4386 %undef subpelv3
; w>=8 combined path: 8-tap in both directions, processed 4 columns at a
; time (output qwords), strip counter packed in r5d as in the other paths.
   4387 .hv_w8:
   4388    RESET_STACK_STATE
   4389 %define hv8_line_1 0
   4390 %define hv8_line_2 1
   4391 %define hv8_line_3 2
   4392 %define hv8_line_4 3
   4393 %define hv8_line_6 4
   4394    shr                 mxd, 16
   4395 %if ARCH_X86_32
   4396 %define           subpelh0  [rsp+mmsize*5]
   4397 %define           subpelh1  [rsp+mmsize*6]
   4398 %define           subpelv0  [rsp+mmsize*7]
   4399 %define           subpelv1  [rsp+mmsize*8]
   4400 %define           subpelv2  [rsp+mmsize*9]
   4401 %define           subpelv3  [rsp+mmsize*10]
   4402 %define             accuv0  [rsp+mmsize*11]
   4403 %define             accuv1  [rsp+mmsize*12]
   4404    movq                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
   4405    mov                 mxd, myd
   4406    shr                 myd, 16
   4407    and                 mxd, 0x7f
   4408    cmp                  hd, 6
   4409    cmovs               myd, mxd
   4410    movq                 m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
   4411    mov             strideq, stridem
   4412 %assign regs_used 6
   4413    ALLOC_STACK  -mmsize*14
   4414 %assign regs_used 7
   4415 %if STACK_ALIGNMENT < mmsize
   4416  %define              tmpm  [rsp+mmsize*13+gprsize*1]
   4417  %define              srcm  [rsp+mmsize*13+gprsize*2]
   4418  %define           stridem  [rsp+mmsize*13+gprsize*3]
   4419    mov                tmpm, tmpq
   4420    mov             stridem, strideq
   4421 %endif
   4422    pshufd               m0, m1, q0000
   4423    pshufd               m1, m1, q1111
   4424    punpcklbw            m5, m5
   4425    psraw                m5, 8
   4426    pshufd               m2, m5, q0000
   4427    pshufd               m3, m5, q1111
   4428    pshufd               m4, m5, q2222
   4429    pshufd               m5, m5, q3333
   4430    mova           subpelh0, m0
   4431    mova           subpelh1, m1
   4432    mova           subpelv0, m2
   4433    mova           subpelv1, m3
   4434    mova           subpelv2, m4
   4435    mova           subpelv3, m5
; Start 3 rows up and 3 pixels left of the block.
   4436    lea                  r5, [strideq*3+3]
   4437    sub                srcq, r5
   4438    mov                srcm, srcq
   4439 %else
   4440    ALLOC_STACK    mmsize*5, 16
   4441 %define           subpelh0  m10
   4442 %define           subpelh1  m11
   4443 %define           subpelv0  m12
   4444 %define           subpelv1  m13
   4445 %define           subpelv2  m14
   4446 %define           subpelv3  m15
   4447 %define             accuv0  m8
   4448 %define             accuv1  m9
   4449    movq                 m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
   4450    movzx               mxd, myb
   4451    shr                 myd, 16
   4452    cmp                  hd, 6
   4453    cmovs               myd, mxd
   4454    movq                 m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
   4455    pshufd         subpelh0, m0, q0000
   4456    pshufd         subpelh1, m0, q1111
   4457    punpcklbw            m1, m1
   4458    psraw                m1, 8
   4459    pshufd         subpelv0, m1, q0000
   4460    pshufd         subpelv1, m1, q1111
   4461    pshufd         subpelv2, m1, q2222
   4462    pshufd         subpelv3, m1, q3333
   4463    lea            stride3q, [strideq*3]
   4464    sub                srcq, 3
   4465    sub                srcq, stride3q
   4466    mov                  r6, srcq
   4467    mov                  r8, tmpq
   4468 %endif
; r5d = ((w-4)/4) << 16 | h  (column-strip count | rows remaining).
   4469    lea                 r5d, [wq-4]
   4470    shl                 r5d, 14
   4471    add                 r5d, hd
   4472 .hv_w8_loop0:
   4473 %if ARCH_X86_64
   4474    mova                 m7, [base+subpel_h_shufA]
   4475    mova                 m8, [base+subpel_h_shufB]
   4476    mova                 m9, [base+subpel_h_shufC]
   4477    %define           shufA  m7
   4478    %define           shufB  m8
   4479    %define           shufC  m9
   4480 %else
   4481    %define           shufA  [base+subpel_h_shufA]
   4482    %define           shufB  [base+subpel_h_shufB]
   4483    %define           shufC  [base+subpel_h_shufC]
   4484 %endif
; Horizontally filter one row into words (same scheme as PREP_8TAP_H,
; but without the final rounding multiply — done by the caller).
   4485 %macro PREP_8TAP_HV 2 ; dst, src_memloc, tmp[1-2]
   4486    movu                 %1, [%2]
   4487    pshufb               m2, %1, shufB
   4488    pshufb               m3, %1, shufC
   4489    pshufb               %1, shufA
   4490    mova                 m1, m2
   4491    pmaddubsw            m1, subpelh0 ; subpel +0 C0
   4492    pmaddubsw            m3, subpelh1 ; subpel +4 B4
   4493    pmaddubsw            m2, subpelh1 ; C4
   4494    pmaddubsw            %1, subpelh0 ; A0
   4495    paddw                m1, m3       ; C0+B4
   4496    paddw                %1, m2       ; A0+C4
   4497    phaddw               %1, m1
   4498 %endmacro
; Prime rows 0-6 and build word pairs 01..56 (spilled via SAVELINE_W8).
   4499    PREP_8TAP_HV         m4, srcq+strideq*0
   4500    PREP_8TAP_HV         m5, srcq+strideq*1
   4501 %if ARCH_X86_64
   4502    PREP_8TAP_HV         m6, srcq+strideq*2
   4503    add                srcq, stride3q
   4504    PREP_8TAP_HV         m0, srcq+strideq*0
   4505 %else
   4506    lea                srcq, [srcq+strideq*2]
   4507    PREP_8TAP_HV         m6, srcq+strideq*0
   4508    PREP_8TAP_HV         m0, srcq+strideq*1
   4509    lea                srcq, [srcq+strideq*2]
   4510 %endif
   4511    mova                 m7, [base+pw_8192]
   4512    REPX   {pmulhrsw x, m7}, m4, m5, m6, m0
   4513    punpcklwd            m1, m4, m5 ; 01
   4514    punpcklwd            m2, m5, m6 ; 12
   4515    punpcklwd            m3, m6, m0 ; 23
   4516    SAVELINE_W8           1, m1
   4517    SAVELINE_W8           2, m2
   4518    SAVELINE_W8           3, m3
   4519    mova                 m7, [base+subpel_h_shufA]
   4520 %if ARCH_X86_64
   4521    PREP_8TAP_HV         m4, srcq+strideq*1
   4522    PREP_8TAP_HV         m5, srcq+strideq*2
   4523    add                srcq, stride3q
   4524    PREP_8TAP_HV         m6, srcq+strideq*0
   4525 %else
   4526    PREP_8TAP_HV         m4, srcq+strideq*0
   4527    PREP_8TAP_HV         m5, srcq+strideq*1
   4528    lea                srcq, [srcq+strideq*2]
   4529    PREP_8TAP_HV         m6, srcq+strideq*0
   4530 %endif
   4531    mova                 m3, [base+pw_8192]
   4532    pmulhrsw             m1, m3, m4
   4533    pmulhrsw             m2, m3, m5
   4534    pmulhrsw             m3, m6
   4535    punpcklwd            m4, m0, m1 ; 34
   4536    punpcklwd            m5, m1, m2 ; 45
   4537    punpcklwd            m6, m2, m3 ; 56
   4538    SAVELINE_W8           6, m3
   4539    RESTORELINE_W8        1, m1
   4540    RESTORELINE_W8        2, m2
   4541    RESTORELINE_W8        3, m3
   4542 .hv_w8_loop:
   4543    SAVELINE_W8           1, m3
   4544    SAVELINE_W8           2, m4
   4545    SAVELINE_W8           3, m5
   4546    SAVELINE_W8           4, m6
; Vertical accumulation of taps 0-2 (+pd_32 rounding bias); accuv0/1 are
; stack slots on x86_32 and registers m8/m9 on x86_64.
   4547 %if ARCH_X86_32
   4548    pmaddwd              m0, m1, subpelv0 ; a0
   4549    pmaddwd              m7, m2, subpelv0 ; b0
   4550    pmaddwd              m3, subpelv1     ; a1
   4551    pmaddwd              m4, subpelv1     ; b1
   4552    paddd                m0, m3
   4553    paddd                m7, m4
   4554    pmaddwd              m5, subpelv2     ; a2
   4555    pmaddwd              m6, subpelv2     ; b2
   4556    paddd                m0, m5
   4557    paddd                m7, m6
   4558    mova                 m5, [base+pd_32]
   4559    paddd                m0, m5
   4560    paddd                m7, m5
   4561    mova             accuv0, m0
   4562    mova             accuv1, m7
   4563 %else
   4564    pmaddwd          accuv0, m1, subpelv0 ; a0
   4565    pmaddwd          accuv1, m2, subpelv0 ; b0
   4566    pmaddwd              m3, subpelv1     ; a1
   4567    pmaddwd              m4, subpelv1     ; b1
   4568    paddd            accuv0, m3
   4569    paddd            accuv1, m4
   4570    pmaddwd              m5, subpelv2     ; a2
   4571    pmaddwd              m6, subpelv2     ; b2
   4572    paddd            accuv0, m5
   4573    paddd            accuv1, m6
   4574    mova                 m7, [base+pd_32]
   4575    paddd            accuv0, m7
   4576    paddd            accuv1, m7
; Reload the shuffle constants clobbered above before filtering new rows.
   4577    mova                 m7, [base+subpel_h_shufB]
   4578    mova                 m6, [base+subpel_h_shufC]
   4579    mova                 m5, [base+subpel_h_shufA]
   4580    %define           shufA  m5
   4581    %define           shufB  m7
   4582    %define           shufC  m6
   4583 %endif
   4584    PREP_8TAP_HV         m0, srcq+strideq*1
   4585    lea                srcq, [srcq+strideq*2]
   4586    PREP_8TAP_HV         m4, srcq+strideq*0
   4587    mova                 m5, [base+pw_8192]
   4588    pmulhrsw             m0, m5
   4589    pmulhrsw             m4, m5
   4590    RESTORELINE_W8        6, m6
   4591    punpcklwd            m5, m6, m0 ; 67
   4592    punpcklwd            m6, m0, m4 ; 78
   4593    pmaddwd              m1, m5, subpelv3 ; a3
   4594    paddd                m2, m1, accuv0
   4595    pmaddwd              m1, m6, subpelv3 ; b3
   4596    paddd                m1, m1, accuv1
   4597    psrad                m2, 6
   4598    psrad                m1, 6
   4599    packssdw             m2, m1
   4600    movq        [tmpq+wq*0], m2
   4601    movhps      [tmpq+wq*2], m2
   4602    lea                tmpq, [tmpq+wq*4]
   4603    sub                  hd, 2
   4604    jle .hv_w8_outer
   4605    SAVELINE_W8           6, m4
   4606    RESTORELINE_W8        1, m1
   4607    RESTORELINE_W8        2, m2
   4608    RESTORELINE_W8        3, m3
   4609    RESTORELINE_W8        4, m4
   4610    jmp .hv_w8_loop
; Advance to the next 4-wide column strip.
   4611 .hv_w8_outer:
   4612 %if ARCH_X86_32
   4613    mov                srcq, srcm
   4614    mov                tmpq, tmpm
   4615    movzx                hd, r5w
   4616    add                srcq, 4
   4617    add                tmpq, 8
   4618    mov                srcm, srcq
   4619    mov                tmpm, tmpq
   4620 %else
   4621    add                  r6, 4
   4622    add                  r8, 8
   4623    movzx                hd, r5b
   4624    mov                srcq, r6
   4625    mov                tmpq, r8
   4626 %endif
   4627    sub                 r5d, 1<<16
   4628    jg .hv_w8_loop0
   4629    RET
   4630 
; movifprep dst, src
; Emits "mov dst, src" only when assembling the prep (mct) variant
; (isprep == 1); expands to nothing in the put variant, where the
; move is unnecessary.
%macro movifprep 2
%if isprep
    mov %1, %2
%endif
%endmacro
   4636 
; SAVE_REG n
; Snapshot the current definitions of register alias n (r<n>, r<n>q, r<n>d,
; and on x86-32 also the stack-argument slot r<n>m) into *_save names, so
; they can be restored with LOAD_REG after REMAP_REG has renumbered the
; aliases. Used by the prep-variant register remapping below.
%macro SAVE_REG 1
%xdefine r%1_save  r%1
%xdefine r%1q_save r%1q
%xdefine r%1d_save r%1d
%if ARCH_X86_32
 ; on x86-32, r<n>m refers to the caller's stack-argument slot for arg n
 %define r%1m_save [rstk+stack_offset+(%1+1)*4]
%endif
%endmacro
   4645 
; LOAD_REG n
; Restore register alias n from the *_save names created by SAVE_REG,
; then undefine the saved copies so a stale snapshot cannot be reused.
%macro LOAD_REG 1
%xdefine r%1  r%1_save
%xdefine r%1q r%1q_save
%xdefine r%1d r%1d_save
%if ARCH_X86_32
 %define r%1m r%1m_save
%endif
%undef r%1d_save
%undef r%1q_save
%undef r%1_save
%endmacro
   4657 
; REMAP_REG dst, src[, own_slot]
; Redefine register alias dst so that r<dst>/r<dst>q/r<dst>d refer to
; register src. On x86-32 the optional third argument controls the
; argument-memory alias r<dst>m: 0 makes it an alias of r<src>m, while
; nonzero points it at dst's own stack-argument slot. The third argument
; is only evaluated on x86-32, so 64-bit callers may omit it.
%macro REMAP_REG 2-3
%xdefine r%1  r%2
%xdefine r%1q r%2q
%xdefine r%1d r%2d
%if ARCH_X86_32
 %if %3 == 0
  %xdefine r%1m r%2m
 %else
  %define r%1m [rstk+stack_offset+(%1+1)*4]
 %endif
%endif
%endmacro
   4670 
; MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
; For the prep variant only: shift every register alias down by one
; (r14->r13, ..., r1->r0 on x86-64; r5->r4, ..., r1->r0 on x86-32),
; saving the topmost alias first so the shift can be undone by
; MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT. This lets the prep entry point
; (which has one argument fewer than put) share the put code paths.
; No-op when isprep == 0.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
%if isprep
 %if ARCH_X86_64
  SAVE_REG 14
  %assign %%i 14
  %rep 14                      ; remap r14..r1 to r13..r0, highest first
   %assign %%j %%i-1
   REMAP_REG %%i, %%j
   %assign %%i %%i-1
  %endrep
 %else
  SAVE_REG 5
  %assign %%i 5
  %rep 5                       ; remap r5..r1 to r4..r0, highest first
   %assign %%j %%i-1
   REMAP_REG %%i, %%j, 0       ; r<i>m aliases r<j>m on x86-32
   %assign %%i %%i-1
  %endrep
 %endif
%endif
%endmacro
   4692 
; MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
; Inverse of MCT_8TAP_SCALED_REMAP_REGS_TO_PREV: shift every register
; alias back up by one (r1->r2, ..., lowest first) and restore the
; saved topmost alias (r14 on x86-64, r5 on x86-32) via LOAD_REG.
; No-op when isprep == 0.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
%if isprep
 %assign %%i 1
 %if ARCH_X86_64
  %rep 13                      ; remap r1..r13 back to r2..r14, lowest first
   %assign %%j %%i+1
   REMAP_REG %%i, %%j
   %assign %%i %%i+1
  %endrep
  LOAD_REG 14
 %else
  %rep 4                       ; remap r1..r4 back to r2..r5, lowest first
   %assign %%j %%i+1
   REMAP_REG %%i, %%j, 1       ; r<i>m points at its own stack slot again
   %assign %%i %%i+1
  %endrep
  LOAD_REG 5
 %endif
%endif
%endmacro
   4713 
; MC_8TAP_SCALED_RET [leave_mapping_unchanged=1]
; Return from an 8tap_scaled code path: restore the default register
; mapping (so x86inc's RET epilogue references the right registers),
; emit RET, and then — unless the argument is 0 — re-apply the "prev"
; mapping. The remaps are assembly-time (textual) effects, so the
; re-apply after RET affects the code assembled *after* this macro,
; not the returning path itself.
%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
%if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
%endif
%endmacro
   4721 
; MC_8TAP_SCALED_H — horizontal 8-tap filter of two source rows for the
; scaled motion-compensation paths. Each output column can have its own
; horizontal source offset (scaled MC), so pixels are gathered with
; movq/movhps from per-column offsets rather than one contiguous load.
; Results are filtered with pmaddubsw against per-column weight pairs,
; reduced with phaddw, and rounded via pmulhrsw with m12 (the pw_8192
; constant loaded by MC_8TAP_SCALED). srcq is advanced by two rows.
%if ARCH_X86_64
%macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
    SWAP                m%2, m%5
    ; row 0: gather 8 bytes per register from the per-column offsets
    ; held in r4/r6/r7/r9/r10/r11/r13/rX (set up by the caller)
    movq                m%1, [srcq+ r4]
    movq                m%2, [srcq+ r6]
    movhps              m%1, [srcq+ r7]
    movhps              m%2, [srcq+ r9]
    movq                m%3, [srcq+r10]
    movq                m%4, [srcq+r11]
    movhps              m%3, [srcq+r13]
    movhps              m%4, [srcq+ rX]
    add                srcq, ssq
    ; row 1: same gather pattern one source row down
    movq                m%5, [srcq+ r4]
    movq                m%6, [srcq+ r6]
    movhps              m%5, [srcq+ r7]
    movhps              m%6, [srcq+ r9]
    movq                m%7, [srcq+r10]
    movq                m%8, [srcq+r11]
    movhps              m%7, [srcq+r13]
    movhps              m%8, [srcq+ rX]
    add                srcq, ssq
    ; apply the four per-column-pair weight registers to both rows
    pmaddubsw           m%1, m%9
    pmaddubsw           m%5, m%9
    pmaddubsw           m%2, m%10
    pmaddubsw           m%6, m%10
    pmaddubsw           m%3, m%11
    pmaddubsw           m%7, m%11
    pmaddubsw           m%4, m%12
    pmaddubsw           m%8, m%12
    ; horizontal reduction: two phaddw stages sum the 8 taps per column
    phaddw              m%1, m%2
    phaddw              m%5, m%6
    phaddw              m%3, m%4
    phaddw              m%7, m%8
    phaddw              m%1, m%3
    phaddw              m%5, m%7
    pmulhrsw            m%1, m12     ; round/scale row 0
    pmulhrsw            m%5, m12     ; round/scale row 1
    SWAP                m%2, m%5
%endmacro
%else
; x86-32 variant: only 8 xmm registers and few GPRs, so the per-column
; offsets are read from the stack (esp+0..12 for columns 0-3, esp+16..28
; for columns 4-7) and the weights from [esp+%1+0..48]. Optionally
; stores the two filtered rows at [esp+%2].
%macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets
 %if %3 == 1
    ; (re)load the offsets for columns 0-3
    mov                  r0, [esp+ 0]
    mov                  rX, [esp+ 8]
    mov                  r4, [esp+ 4]
    mov                  r5, [esp+12]
 %endif
    ; rows 0 and 1, columns 0-3
    movq                 m0, [srcq+r0]
    movq                 m1, [srcq+rX]
    movhps               m0, [srcq+r4]
    movhps               m1, [srcq+r5]
    add                srcq, ssq
    movq                 m4, [srcq+r0]
    movq                 m5, [srcq+rX]
    movhps               m4, [srcq+r4]
    movhps               m5, [srcq+r5]
    ; switch to the offsets for columns 4-7
    mov                  r0, [esp+16]
    mov                  rX, [esp+24]
    mov                  r4, [esp+20]
    mov                  r5, [esp+28]
    sub                srcq, ssq     ; back to row 0 for the second gather
    movq                 m2, [srcq+r0]
    movq                 m3, [srcq+rX]
    movhps               m2, [srcq+r4]
    movhps               m3, [srcq+r5]
    add                srcq, ssq
    movq                 m6, [srcq+r0]
    movq                 m7, [srcq+rX]
    movhps               m6, [srcq+r4]
    movhps               m7, [srcq+r5]
    add                srcq, ssq
    ; weights live on the stack starting at [esp+%1]
    pmaddubsw            m0, [esp+%1+ 0]
    pmaddubsw            m4, [esp+%1+ 0]
    pmaddubsw            m1, [esp+%1+16]
    pmaddubsw            m5, [esp+%1+16]
    pmaddubsw            m2, [esp+%1+32]
    pmaddubsw            m6, [esp+%1+32]
    pmaddubsw            m3, [esp+%1+48]
    pmaddubsw            m7, [esp+%1+48]
    ; two phaddw stages sum the 8 taps per column
    phaddw               m0, m1
    phaddw               m4, m5
    phaddw               m2, m3
    phaddw               m6, m7
    phaddw               m0, m2
    phaddw               m4, m6
    pmulhrsw             m0, m12     ; round/scale row 0
    pmulhrsw             m4, m12     ; round/scale row 1
 %if %2 != 0
    ; spill both filtered rows for the vertical pass
    mova        [esp+%2+ 0], m0
    mova        [esp+%2+16], m4
 %endif
%endmacro
%endif
   4815 
   4816 %macro MC_8TAP_SCALED 1
   4817 %ifidn %1, put
   4818 %assign isprep 0
   4819 %if ARCH_X86_64
   4820  %if required_stack_alignment <= STACK_ALIGNMENT
   4821 cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
   4822  %else
   4823 cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
   4824  %endif
   4825 %else ; ARCH_X86_32
   4826  %if required_stack_alignment <= STACK_ALIGNMENT
   4827 cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy
   4828  %else
   4829 cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy
   4830  %endif
   4831 %endif
   4832 %xdefine base_reg r12
   4833 %define rndshift 10
   4834 %else ; prep
   4835 %assign isprep 1
   4836 %if ARCH_X86_64
   4837  %if required_stack_alignment <= STACK_ALIGNMENT
   4838 cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
   4839   %xdefine tmp_stridem r14q
   4840  %else
   4841 cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
   4842   %define tmp_stridem qword [rsp+0x138]
   4843  %endif
   4844  %xdefine base_reg r11
   4845 %else ; ARCH_X86_32
   4846  %if required_stack_alignment <= STACK_ALIGNMENT
   4847 cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
   4848  %else
   4849 cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
   4850  %endif
   4851  %define tmp_stridem dword [esp+0x138]
   4852 %endif
   4853 %define rndshift 6
   4854 %endif
   4855 %if ARCH_X86_32
   4856    mov         [esp+0x1f0], t0d
   4857    mov         [esp+0x1f4], t1d
   4858 %if !isprep && required_stack_alignment > STACK_ALIGNMENT
   4859    mov                dstd, dstm
   4860    mov                 dsd, dsm
   4861    mov                srcd, srcm
   4862    mov                 ssd, ssm
   4863    mov                  hd, hm
   4864    mov                  r4, mxm
   4865  %define r0m  [esp+0x200]
   4866  %define dsm  [esp+0x204]
   4867  %define dsmp dsm
   4868  %define r1m  dsm
   4869  %define r2m  [esp+0x208]
   4870  %define ssm  [esp+0x20c]
   4871  %define r3m  ssm
   4872  %define hm   [esp+0x210]
   4873  %define mxm  [esp+0x214]
   4874    mov                 r0m, dstd
   4875    mov                 dsm, dsd
   4876    mov                 r2m, srcd
   4877    mov                 ssm, ssd
   4878    mov                  hm, hd
   4879    mov                  r0, mym
   4880    mov                  r1, dxm
   4881    mov                  r2, dym
   4882  %define mym [esp+0x218]
   4883  %define dxm [esp+0x09c]
   4884  %define dym [esp+0x21c]
   4885    mov                 mxm, r4
   4886    mov                 mym, r0
   4887    mov                 dxm, r1
   4888    mov                 dym, r2
   4889    tzcnt                wd, wm
   4890 %endif
   4891 %if isprep && required_stack_alignment > STACK_ALIGNMENT
   4892  %xdefine base_reg r5
   4893 %else
   4894  %xdefine base_reg r6
   4895 %endif
   4896    mov                 ssd, ssm
   4897 %endif
   4898    LEA            base_reg, %1_8tap_scaled_8bpc_ssse3
   4899 %xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3
   4900 %if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
   4901    tzcnt                wd, wm
   4902 %endif
   4903 %if ARCH_X86_32
   4904 %define m8  m0
   4905 %define m9  m1
   4906 %define m14 m4
   4907 %define m15 m3
   4908 %endif
   4909    movd                 m8, dxm
   4910    movd                m14, mxm
   4911    pshufd               m8, m8, q0000
   4912    pshufd              m14, m14, q0000
   4913 %if isprep && UNIX64
   4914    mov                 r5d, t0d
   4915 DECLARE_REG_TMP 5, 7
   4916 %endif
   4917 %if ARCH_X86_64
   4918    mov                 dyd, dym
   4919 %endif
   4920 %ifidn %1, put
   4921 %if WIN64
   4922    mov                 r8d, hm
   4923  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
   4924  %define hm r5m
   4925  %define dxm r8m
   4926 %elif ARCH_X86_64
   4927  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
   4928  %define hm r6m
   4929 %endif
   4930 %if ARCH_X86_64
   4931  %if required_stack_alignment > STACK_ALIGNMENT
   4932   %define dsm [rsp+0x138]
   4933   %define rX r1
   4934   %define rXd r1d
   4935  %else
   4936   %define dsm dsq
   4937   %define rX r14
   4938   %define rXd r14d
   4939  %endif
   4940 %else
   4941  %define rX r1
   4942 %endif
   4943 %else ; prep
   4944 %if WIN64
   4945    mov                 r7d, hm
   4946  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
   4947  %define hm r4m
   4948  %define dxm r7m
   4949 %elif ARCH_X86_64
   4950  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
   4951  %define hm [rsp+0x94]
   4952 %endif
   4953 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
   4954 %if ARCH_X86_64
   4955  %define rX r14
   4956  %define rXd r14d
   4957 %else
   4958  %define rX r3
   4959 %endif
   4960 %endif
   4961 %if ARCH_X86_64
   4962    mova                m10, [base+pd_0x3ff]
   4963    mova                m12, [base+pw_8192]
   4964 %ifidn %1, put
   4965    mova                m13, [base+pd_512]
   4966 %else
   4967    mova                m13, [base+pd_32]
   4968 %endif
   4969 %else
   4970 %define m10 [base+pd_0x3ff]
   4971 %define m12 [base+pw_8192]
   4972 %ifidn %1, put
   4973  %define m13 [base+pd_512]
   4974 %else
   4975  %define m13 [base+pd_32]
   4976 %endif
   4977 %endif
   4978    pxor                 m9, m9
   4979 %if ARCH_X86_64
   4980    lea                ss3q, [ssq*3]
   4981    movzx               r7d, t1b
   4982    shr                 t1d, 16
   4983    cmp                  hd, 6
   4984    cmovs               t1d, r7d
   4985    sub                srcq, ss3q
   4986 %else
   4987 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
   4988    mov                  r1, [esp+0x1f4]
   4989    lea                  r0, [ssq*3]
   4990    movzx                r2, r1b
   4991    shr                  r1, 16
   4992    cmp            dword hm, 6
   4993    cmovs                r1, r2
   4994    mov         [esp+0x1f4], r1
   4995    mov                  r1, r1m
   4996    mov                  r2, r2m
   4997    sub                srcq, r0
   4998 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
   4999 %define ss3q r0
   5000 %define myd r4
   5001 %define dyd dword dym
   5002 %define hd  dword hm
   5003 %endif
   5004    cmp                 dyd, 1024
   5005    je .dy1
   5006    cmp                 dyd, 2048
   5007    je .dy2
   5008    movzx                wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
   5009    add                  wq, base_reg
   5010    jmp                  wq
   5011 %ifidn %1, put
   5012 .w2:
   5013 %if ARCH_X86_64
   5014    mov                 myd, mym
   5015    movzx               t0d, t0b
   5016    dec                srcq
   5017    movd                m15, t0d
   5018 %else
   5019    movzx                r4, byte [esp+0x1f0]
   5020    dec                srcq
   5021    movd                m15, r4
   5022 %endif
   5023    punpckldq            m9, m8
   5024    SWAP                 m8, m9
   5025    paddd               m14, m8 ; mx+dx*[0-1]
   5026 %if ARCH_X86_64
   5027    mova                m11, [base+pd_0x4000]
   5028 %else
   5029  %define m11 [base+pd_0x4000]
   5030 %endif
   5031    pshufd              m15, m15, q0000
   5032    pand                 m8, m14, m10
   5033    psrld                m8, 6
   5034    paddd               m15, m8
   5035    movd                r4d, m15
   5036    psrldq              m15, 4
   5037 %if ARCH_X86_64
   5038    movd                r6d, m15
   5039 %else
   5040    movd                r3d, m15
   5041 %endif
   5042    mova                 m5, [base+bdct_lb_dw]
   5043    mova                 m6, [base+subpel_s_shuf2]
   5044    movd                m15, [base+subpel_filters+r4*8+2]
   5045 %if ARCH_X86_64
   5046    movd                 m7, [base+subpel_filters+r6*8+2]
   5047 %else
   5048    movd                 m7, [base+subpel_filters+r3*8+2]
   5049 %endif
   5050    pxor                 m9, m9
   5051    pcmpeqd              m8, m9
   5052    psrld               m14, 10
   5053 %if ARCH_X86_32
   5054    mov                  r3, r3m
   5055    pshufb              m14, m5
   5056    paddb               m14, m6
   5057    mova        [rsp+0x180], m14
   5058    SWAP                 m5, m0
   5059    SWAP                 m6, m3
   5060  %define m8  m5
   5061  %define m15 m6
   5062 %endif
   5063    movq                 m0, [srcq+ssq*0]
   5064    movq                 m2, [srcq+ssq*2]
   5065    movhps               m0, [srcq+ssq*1]
   5066    movhps               m2, [srcq+ss3q ]
   5067    lea                srcq, [srcq+ssq*4]
   5068 %if ARCH_X86_64
   5069    pshufb              m14, m5
   5070    paddb               m14, m6
   5071 %endif
   5072    movq                 m1, [srcq+ssq*0]
   5073    movq                 m3, [srcq+ssq*2]
   5074    movhps               m1, [srcq+ssq*1]
   5075    movhps               m3, [srcq+ss3q ]
   5076    lea                srcq, [srcq+ssq*4]
   5077    punpckldq           m15, m7
   5078    punpcklqdq          m15, m15
   5079 %if ARCH_X86_64
   5080    pand                m11, m8
   5081    pandn                m8, m15
   5082    SWAP                m15, m8
   5083    por                 m15, m11
   5084 %else
   5085    pand                 m7, m8, m11
   5086    pandn                m8, m15
   5087  %define m8  m6
   5088  %define m15 m5
   5089    por                 m15, m7
   5090    mova        [rsp+0x190], m15
   5091 %endif
   5092    pshufb               m0, m14
   5093    pshufb               m2, m14
   5094    pshufb               m1, m14
   5095    pshufb               m3, m14
   5096    pmaddubsw            m0, m15
   5097    pmaddubsw            m2, m15
   5098    pmaddubsw            m1, m15
   5099    pmaddubsw            m3, m15
   5100    phaddw               m0, m2
   5101    phaddw               m1, m3
   5102    pmulhrsw             m0, m12       ; 0 1 2 3
   5103    pmulhrsw             m1, m12       ; 4 5 6 7
   5104    palignr              m2, m1, m0, 4 ; 1 2 3 4
   5105    punpcklwd            m3, m0, m2    ; 01 12
   5106    punpckhwd            m0, m2        ; 23 34
   5107    pshufd               m5, m1, q0321 ; 5 6 7 _
   5108    punpcklwd            m2, m1, m5    ; 45 56
   5109    punpckhwd            m4, m1, m5    ; 67 __
   5110 %if ARCH_X86_32
   5111    mov                 myd, mym
   5112    mov                  r0, r0m
   5113    mova        [rsp+0x1a0], m3
   5114    mova        [rsp+0x1b0], m0
   5115    mova        [rsp+0x1c0], m2
   5116    mova        [rsp+0x1d0], m4
   5117 %endif
   5118 .w2_loop:
   5119    and                 myd, 0x3ff
   5120 %if ARCH_X86_64
   5121    mov                 r6d, 64 << 24
   5122    mov                 r4d, myd
   5123    shr                 r4d, 6
   5124    lea                 r4d, [t1+r4]
   5125    cmovnz              r6q, [base+subpel_filters+r4*8]
   5126    movq                m11, r6q
   5127    punpcklbw           m11, m11
   5128    psraw               m11, 8
   5129    pshufd               m8, m11, q0000
   5130    pshufd               m9, m11, q1111
   5131    pshufd              m10, m11, q2222
   5132    pshufd              m11, m11, q3333
   5133    pmaddwd              m5, m3, m8
   5134    pmaddwd              m6, m0, m9
   5135    pmaddwd              m7, m2, m10
   5136    pmaddwd              m8, m4, m11
   5137    paddd                m5, m6
   5138    paddd                m7, m8
   5139 %else
   5140    mov                 mym, myd
   5141    mov                  r1, [esp+0x1f4]
   5142    xor                  r3, r3
   5143    shr                  r4, 6
   5144    lea                  r1, [r1+r4]
   5145    mov                  r4, 64 << 24
   5146    cmovnz               r4, [base+subpel_filters+r1*8+0]
   5147    cmovnz               r3, [base+subpel_filters+r1*8+4]
   5148    movd                 m7, r4
   5149    movd                 m6, r3
   5150    punpckldq            m7, m6
   5151    punpcklbw            m7, m7
   5152    psraw                m7, 8
   5153    pshufd               m5, m7, q0000
   5154    pshufd               m6, m7, q1111
   5155    pmaddwd              m3, m5
   5156    pmaddwd              m0, m6
   5157    pshufd               m5, m7, q2222
   5158    pshufd               m7, m7, q3333
   5159    pmaddwd              m2, m5
   5160    pmaddwd              m4, m7
   5161    paddd                m3, m0
   5162    paddd                m2, m4
   5163    SWAP                 m5, m3
   5164    SWAP                 m7, m2
   5165 %endif
   5166    paddd                m5, m13
   5167    paddd                m5, m7
   5168    psrad                m5, 10
   5169    packssdw             m5, m5
   5170    packuswb             m5, m5
   5171 %if ARCH_X86_64
   5172    pextrw              r6d, m5, 0
   5173    mov              [dstq], r6w
   5174    add                dstq, dsq
   5175    dec                  hd
   5176    jz .ret
   5177    add                 myd, dyd
   5178 %else
   5179    pextrw              r3d, m5, 0
   5180    mov              [dstq], r3w
   5181    add                dstq, dsm
   5182    dec                  hd
   5183    jz .ret
   5184    mov                 myd, mym
   5185    add                 myd, dym
   5186 %endif
   5187    test                myd, ~0x3ff
   5188 %if ARCH_X86_32
   5189    SWAP                 m3, m5
   5190    SWAP                 m2, m7
   5191    mova                 m3, [rsp+0x1a0]
   5192    mova                 m0, [rsp+0x1b0]
   5193    mova                 m2, [rsp+0x1c0]
   5194    mova                 m4, [rsp+0x1d0]
   5195  %define m14 [esp+0x180]
   5196  %define m15 [esp+0x190]
   5197 %endif
   5198    jz .w2_loop
   5199 %if ARCH_X86_32
   5200    mov                  r3, r3m
   5201 %endif
   5202    movq                 m5, [srcq]
   5203    test                myd, 0x400
   5204    jz .w2_skip_line
   5205    add                srcq, ssq
   5206    shufps               m3, m0, q1032      ; 01 12
   5207    shufps               m0, m2, q1032      ; 23 34
   5208    shufps               m2, m4, q1032      ; 45 56
   5209    pshufb               m5, m14
   5210    pmaddubsw            m5, m15
   5211    phaddw               m5, m5
   5212    pmulhrsw             m5, m12
   5213    palignr              m4, m5, m1, 12
   5214    punpcklqdq           m1, m4, m4         ; 6 7 6 7
   5215    punpcklwd            m4, m1, m5         ; 67 __
   5216 %if ARCH_X86_32
   5217    mova        [rsp+0x1a0], m3
   5218    mova        [rsp+0x1b0], m0
   5219    mova        [rsp+0x1c0], m2
   5220    mova        [rsp+0x1d0], m4
   5221 %endif
   5222    jmp .w2_loop
   5223 .w2_skip_line:
   5224    movhps               m5, [srcq+ssq*1]
   5225    lea                srcq, [srcq+ssq*2]
   5226    mova                 m3, m0             ; 01 12
   5227    mova                 m0, m2             ; 23 34
   5228    pshufb               m5, m14
   5229    pmaddubsw            m5, m15
   5230    phaddw               m5, m5
   5231    pmulhrsw             m5, m12            ; 6 7 6 7
   5232    palignr              m4, m5, m1, 8      ; 4 5 6 7
   5233    pshufd               m5, m4, q0321      ; 5 6 7 _
   5234    mova                 m1, m4
   5235    punpcklwd            m2, m4, m5         ; 45 56
   5236    punpckhwd            m4, m5             ; 67 __
   5237 %if ARCH_X86_32
   5238    mova        [rsp+0x1a0], m3
   5239    mova        [rsp+0x1b0], m0
   5240    mova        [rsp+0x1c0], m2
   5241    mova        [rsp+0x1d0], m4
   5242 %endif
   5243    jmp .w2_loop
   5244 %endif
   5245 INIT_XMM ssse3
   5246 .w4:
   5247 %if ARCH_X86_64
   5248    mov                 myd, mym
   5249    movzx               t0d, t0b
   5250    dec                srcq
   5251    movd                m15, t0d
   5252 %else
   5253 %define m8  m0
   5254 %xdefine m14 m4
   5255 %define m15 m3
   5256    movzx                r4, byte [esp+0x1f0]
   5257    dec                srcq
   5258    movd                m15, r4
   5259 %endif
   5260    pmaddwd              m8, [base+rescale_mul]
   5261 %if ARCH_X86_64
   5262    mova                m11, [base+pd_0x4000]
   5263 %else
   5264  %define m11 [base+pd_0x4000]
   5265 %endif
   5266    pshufd              m15, m15, q0000
   5267    paddd               m14, m8 ; mx+dx*[0-3]
   5268    pand                 m0, m14, m10
   5269    psrld                m0, 6
   5270    paddd               m15, m0
   5271    psrldq               m7, m15, 8
   5272 %if ARCH_X86_64
   5273    movd                r4d, m15
   5274    movd               r11d, m7
   5275    psrldq              m15, 4
   5276    psrldq               m7, 4
   5277    movd                r6d, m15
   5278    movd               r13d, m7
   5279    movd                m15, [base+subpel_filters+ r4*8+2]
   5280    movd                 m2, [base+subpel_filters+r11*8+2]
   5281    movd                 m3, [base+subpel_filters+ r6*8+2]
   5282    movd                 m4, [base+subpel_filters+r13*8+2]
   5283 %else
   5284    movd                 r0, m15
   5285    movd                 rX, m7
   5286    psrldq              m15, 4
   5287    psrldq               m7, 4
   5288    movd                 r4, m15
   5289    movd                 r5, m7
   5290    movd                 m1, [base+subpel_filters+r0*8+2]
   5291    movd                 m2, [base+subpel_filters+rX*8+2]
   5292    movd                 m3, [base+subpel_filters+r4*8+2]
   5293    movd                 m7, [base+subpel_filters+r5*8+2]
   5294    movifprep            r3, r3m
   5295    SWAP                 m4, m7
   5296 %define m15 m1
   5297 %endif
   5298    mova                 m5, [base+bdct_lb_dw]
   5299    movq                 m6, [base+subpel_s_shuf2]
   5300    psrld               m14, 10
   5301    punpckldq           m15, m3
   5302    punpckldq            m2, m4
   5303    punpcklqdq          m15, m2
   5304    punpcklqdq           m6, m6
   5305    pshufb              m14, m5
   5306    paddb               m14, m6
   5307 %if ARCH_X86_64
   5308    pcmpeqd              m0, m9
   5309    pand                m11, m0
   5310 %else
   5311    mova        [esp+0x180], m14
   5312    SWAP                 m7, m4
   5313    pxor                 m3, m3
   5314    pcmpeqd              m0, m3
   5315    pand                 m2, m11, m0
   5316 %define m11 m2
   5317 %endif
   5318    pandn                m0, m15
   5319 %if ARCH_X86_64
   5320    SWAP                m15, m0
   5321 %else
   5322 %define m15 m0
   5323 %endif
   5324    por                 m15, m11
   5325 %if ARCH_X86_64
   5326    movu                 m7, [srcq+ssq*0]
   5327    movu                 m9, [srcq+ssq*1]
   5328    movu                 m8, [srcq+ssq*2]
   5329    movu                m10, [srcq+ss3q ]
   5330    lea                srcq, [srcq+ssq*4]
   5331    movu                 m2, [srcq+ssq*0]
   5332    movu                 m4, [srcq+ssq*1]
   5333    movu                 m3, [srcq+ssq*2]
   5334    movu                 m5, [srcq+ss3q ]
   5335    lea                srcq, [srcq+ssq*4]
   5336    pshufb               m7, m14
   5337    pshufb               m9, m14
   5338    pshufb               m8, m14
   5339    pshufb              m10, m14
   5340    pshufb               m2, m14
   5341    pshufb               m4, m14
   5342    pshufb               m3, m14
   5343    pshufb               m5, m14
   5344    pmaddubsw            m7, m15
   5345    pmaddubsw            m9, m15
   5346    pmaddubsw            m8, m15
   5347    pmaddubsw           m10, m15
   5348    pmaddubsw            m2, m15
   5349    pmaddubsw            m4, m15
   5350    pmaddubsw            m3, m15
   5351    pmaddubsw            m5, m15
   5352    phaddw               m7, m9
   5353    phaddw               m8, m10
   5354    phaddw               m9, m2, m4
   5355    phaddw               m3, m5
   5356    pmulhrsw             m7, m12            ; 0 1
   5357    pmulhrsw             m8, m12            ; 2 3
   5358    pmulhrsw             m9, m12            ; 4 5
   5359    pmulhrsw             m3, m12            ; 6 7
   5360    shufps               m4, m7, m8, q1032  ; 1 2
   5361    shufps               m5, m8, m9, q1032  ; 3 4
   5362    shufps               m6, m9, m3, q1032  ; 5 6
   5363    psrldq              m11, m3, 8          ; 7 _
   5364    punpcklwd            m0, m7, m4 ; 01
   5365    punpckhwd            m7, m4     ; 12
   5366    punpcklwd            m1, m8, m5 ; 23
   5367    punpckhwd            m8, m5     ; 34
   5368    punpcklwd            m2, m9, m6 ; 45
   5369    punpckhwd            m9, m6     ; 56
   5370    punpcklwd            m3, m11    ; 67
   5371    mova         [rsp+0x00], m7
   5372    mova         [rsp+0x10], m8
   5373    mova         [rsp+0x20], m9
   5374 %else
   5375    mova        [esp+0x190], m15
   5376    lea                ss3q, [ssq*3]
   5377    movu                 m2, [srcq+ssq*0]
   5378    movu                 m3, [srcq+ssq*1]
   5379    movu                 m7, [srcq+ssq*2]
   5380    movu                 m6, [srcq+ss3q ]
   5381    lea                srcq, [srcq+ssq*4]
   5382    pshufb               m2, m14
   5383    pshufb               m3, m14
   5384    pshufb               m7, m14
   5385    pshufb               m6, m14
   5386    pmaddubsw            m2, m15
   5387    pmaddubsw            m3, m15
   5388    pmaddubsw            m7, m15
   5389    pmaddubsw            m6, m15
   5390    phaddw               m2, m3
   5391    phaddw               m7, m6
   5392    movu                 m1, [srcq+ssq*0]
   5393    movu                 m5, [srcq+ssq*1]
   5394    movu                 m3, [srcq+ssq*2]
   5395    movu                 m6, [srcq+ss3q ]
   5396    lea                srcq, [srcq+ssq*4]
   5397    pshufb               m1, m14
   5398    pshufb               m5, m14
   5399    pshufb               m3, m14
   5400    pshufb               m6, m14
   5401    pmaddubsw            m1, m15
   5402    pmaddubsw            m5, m15
   5403    pmaddubsw            m3, m15
   5404    pmaddubsw            m6, m15
   5405    phaddw               m1, m5
   5406    phaddw               m3, m6
   5407    pmulhrsw             m2, m12
   5408    pmulhrsw             m7, m12
   5409    pmulhrsw             m1, m12
   5410    pmulhrsw             m3, m12
   5411    shufps               m4, m2, m7, q1032  ; 1 2
   5412    shufps               m5, m7, m1, q1032  ; 3 4
   5413    shufps               m6, m1, m3, q1032  ; 5 6
   5414    psrldq               m0, m3, 8          ; 7 _
   5415    mova        [esp+0x1a0], m0
   5416 %define m11 [esp+0x1a0]
   5417    punpcklwd            m0, m2, m4      ; 01
   5418    punpckhwd            m2, m4          ; 12
   5419    punpcklwd            m4, m7, m5      ; 23
   5420    punpckhwd            m7, m5          ; 34
   5421    punpcklwd            m5, m1, m6      ; 45
   5422    punpckhwd            m1, m6          ; 56
   5423    punpcklwd            m3, [esp+0x1a0] ; 67
   5424    mov                 myd, mym
   5425    mov                  r0, r0m
   5426    mova        [esp+0x1b0], m0 ; 01
   5427    mova        [esp+0x1c0], m4 ; 23
   5428    mova        [esp+0x1d0], m5 ; 45
   5429    mova        [esp+0x1e0], m3 ; 67
   5430    mova         [rsp+0x00], m2 ; 12
   5431    mova         [rsp+0x10], m7 ; 34
   5432    mova         [rsp+0x20], m1 ; 56
   5433    SWAP                 m1, m4
   5434    SWAP                 m2, m5
   5435 %endif
; Vertical pass for w==4.  Each iteration produces one output row by
; applying an 8-tap vertical filter to the eight filtered source rows
; held as interleaved word pairs 01/23/45/67 in m0-m3 (x86-64) or in
; [esp+0x1b0..0x1e0] (x86-32).  myd carries the vertical position in
; 10-bit fixed point: the low 10 bits select the subpel filter, the
; integer part decides how many new source rows to shift in.
.w4_loop:
   and                 myd, 0x3ff
%if ARCH_X86_64
   ; 64<<24 is the identity filter (single 64 tap); the cmovnz keeps it
   ; when the fractional phase is 0 (ZF set by the 'and' above)
   mov                 r6d, 64 << 24
   mov                 r4d, myd
   shr                 r4d, 6
   lea                 r4d, [t1+r4]
   cmovnz              r6q, [base+subpel_filters+r4*8]
   movq                m10, r6q
   ; punpcklbw+psraw 8: sign-extend the int8 taps to int16
   punpcklbw           m10, m10
   psraw               m10, 8
   pshufd               m7, m10, q0000
   pshufd               m8, m10, q1111
   pshufd               m9, m10, q2222
   pshufd              m10, m10, q3333
   pmaddwd              m4, m0, m7
   pmaddwd              m5, m1, m8
   pmaddwd              m6, m2, m9
   pmaddwd              m7, m3, m10
   paddd                m4, m5
   paddd                m6, m7
   paddd                m4, m13 ; m13 = rounding bias applied before the final shift
   paddd                m4, m6
%else
   mov                 mym, myd
   mov                  r5, [esp+0x1f4]
   xor                  r3, r3
   shr                  r4, 6
   lea                  r5, [r5+r4]
   ; same identity-filter trick as the 64-bit path, split over two
   ; 32-bit halves of the 8-byte filter
   mov                  r4, 64 << 24
   cmovnz               r4, [base+subpel_filters+r5*8+0]
   cmovnz               r3, [base+subpel_filters+r5*8+4]
   movd                 m7, r4
   movd                 m6, r3
   punpckldq            m7, m6
   punpcklbw            m7, m7
   psraw                m7, 8
   pshufd               m4, m7, q0000
   pshufd               m5, m7, q1111
   pshufd               m6, m7, q2222
   pshufd               m7, m7, q3333
   pmaddwd              m0, m4
   pmaddwd              m1, m5
   pmaddwd              m2, m6
   pmaddwd              m3, m7
   paddd                m0, m1
   paddd                m2, m3
   paddd                m0, m13
   paddd                m0, m2
   SWAP                 m4, m0
%endif
   psrad                m4, rndshift
   packssdw             m4, m4
%ifidn %1, put
   ; put: clamp to 8 bit and store 4 pixels
   packuswb             m4, m4
   movd             [dstq], m4
   add                dstq, dsmp
%else
   ; prep: store 4 int16 intermediates
   movq             [tmpq], m4
   add                tmpq, 8
%endif
   dec                  hd
   jz .ret
%if ARCH_X86_64
   add                 myd, dyd
   ; integer row position unchanged -> reuse the same 8-row history
   test                myd, ~0x3ff
   jz .w4_loop
%else
   SWAP                 m0, m4
   mov                 myd, mym
   mov                  r3, r3m
   add                 myd, dym
   test                myd, ~0x3ff
   jnz .w4_next_line
   mova                 m0, [esp+0x1b0]
   mova                 m1, [esp+0x1c0]
   mova                 m2, [esp+0x1d0]
   mova                 m3, [esp+0x1e0]
   jmp .w4_loop
.w4_next_line:
 %define m14 [esp+0x180]
 %define m15 [esp+0x190]
%endif
   movu                 m4, [srcq]
   ; bit 10 of my: set -> shift the history by one source row (below),
   ; clear -> shift by two (.w4_skip_line)
   test                myd, 0x400
   jz .w4_skip_line
%if ARCH_X86_64
   ; rotate the odd row-pair history (12/34/56) stored on the stack
   mova                 m0, [rsp+0x00]
   mova         [rsp+0x00], m1
   mova                 m1, [rsp+0x10]
   mova         [rsp+0x10], m2
   mova                 m2, [rsp+0x20]
   mova         [rsp+0x20], m3
%else
   mova                 m5, [esp+0x1c0]
   mova                 m0, [rsp+0x000]
   mova         [rsp+0x00], m5
   mova        [esp+0x1b0], m0
   mova                 m6, [esp+0x1d0]
   mova                 m1, [rsp+0x010]
   mova         [rsp+0x10], m6
   mova        [esp+0x1c0], m1
   mova                 m7, [esp+0x1e0]
   mova                 m2, [rsp+0x020]
   mova         [rsp+0x20], m7
   mova        [esp+0x1d0], m2
%endif
   ; horizontally filter the one new row (m14 = shuffle, m15 = x taps)
   pshufb               m4, m14
   pmaddubsw            m4, m15
   phaddw               m4, m4
   pmulhrsw             m4, m12
   punpcklwd            m3, m11, m4
%if ARCH_X86_32
   mova        [esp+0x1e0], m3
%endif
   mova                m11, m4 ; m11 = last filtered row, kept for the next pairing
   add                srcq, ssq
   jmp .w4_loop
.w4_skip_line:
%if ARCH_X86_32
   mova                 m0, [esp+0x1c0]
   mova                 m1, [esp+0x1d0]
   mova                 m2, [esp+0x1e0]
%endif
   ; two new source rows: filter both at once and shift the history by 2
   movu                 m5, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   mova                 m6, [rsp+0x10]
   mova                 m7, [rsp+0x20]
   pshufb               m4, m14
   pshufb               m5, m14
   pmaddubsw            m4, m15
   pmaddubsw            m5, m15
   phaddw               m4, m5
   pmulhrsw             m4, m12
   punpcklwd            m5, m11, m4
   mova         [rsp+0x00], m6
   mova         [rsp+0x10], m7
   mova         [rsp+0x20], m5
%if ARCH_X86_64
   psrldq              m11, m4, 8
   mova                 m0, m1
   mova                 m1, m2
   mova                 m2, m3
   punpcklwd            m3, m4, m11
%else
   psrldq               m6, m4, 8
   punpcklwd            m3, m4, m6
   mova        [esp+0x1a0], m6
   mova        [esp+0x1b0], m0
   mova        [esp+0x1c0], m1
   mova        [esp+0x1d0], m2
   mova        [esp+0x1e0], m3
%endif
   jmp .w4_loop
INIT_XMM ssse3
; Wide blocks are processed in 8-pixel columns: [rsp+0x90] holds the
; column count (w/8) consumed by .hloop_prep, and tmp_stridem the prep
; output row stride in bytes (w*2, int16 output); movifprep only emits
; the store for the prep variant of the macro.
.w8:
   mov    dword [rsp+0x90], 1
   movifprep   tmp_stridem, 16
   jmp .w_start
.w16:
   mov    dword [rsp+0x90], 2
   movifprep   tmp_stridem, 32
   jmp .w_start
.w32:
   mov    dword [rsp+0x90], 4
   movifprep   tmp_stridem, 64
   jmp .w_start
.w64:
   mov    dword [rsp+0x90], 8
   movifprep   tmp_stridem, 128
   jmp .w_start
.w128:
   mov    dword [rsp+0x90], 16
   movifprep   tmp_stridem, 256
; Common setup for w>=8: extract mx from the high 16 bits of t0 (or
; [esp+0x1f0] on x86-32), precompute dx*4 (column step) and
; mx+dx*[0-3], and stash the loop state (src, dst/tmp pointer, h, my)
; so .hloop_prep can restore it at the start of each 8-pixel column.
.w_start:
%ifidn %1, put
   movifnidn           dsm, dsq
%endif
%if ARCH_X86_64
   shr                 t0d, 16
   movd                m15, t0d
%else
%define m8  m0
%xdefine m14 m4
%define m15 m3
%if isprep
 %define ssq ssm
%endif
   mov                  r4, [esp+0x1f0]
   shr                  r4, 16
   movd                m15, r4
   mov                  r0, r0m
   mov                 myd, mym
%endif
   sub                srcq, 3 ; back up to the first tap of the 8-tap x filter
   pslld                m7, m8, 2 ; dx*4
   pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
   pshufd              m15, m15, q0000
   paddd               m14, m8 ; mx+dx*[0-3]
   mova        [rsp+0x100], m7
   mova        [rsp+0x120], m15
   mov         [rsp+0x098], srcq
   mov         [rsp+0x130], r0q ; dstq / tmpq
%if ARCH_X86_64 && UNIX64
   mov                  hm, hd
%elif ARCH_X86_32
   mov                  r5, hm
   mov         [esp+0x094], myd
   mov         [esp+0x134], r5
%endif
   jmp .hloop
; Per-column epilogue for w>=8: decrement the column counter saved at
; [rsp+0x90]; when columns remain, advance the output pointer by
; 8 pixels (8 bytes for put, 16 for prep), restore the saved src/h/my
; state and step the x accumulator to the next group of 4 columns.
.hloop_prep:
   dec   dword [rsp+0x090]
   jz .ret
%if ARCH_X86_64
   add   qword [rsp+0x130], 8*(isprep+1) ; 8 output pixels per column
   mov                  hd, hm
%else
   add   dword [esp+0x130], 8*(isprep+1)
   mov                 myd, [esp+0x094]
   mov                  r5, [esp+0x134]
   mov                  r0, [esp+0x130]
%endif
   mova                 m7, [rsp+0x100] ; dx*4
   mova                m14, [rsp+0x110] ; last mx+dx accumulator of the previous column
%if ARCH_X86_64
   mova                m10, [base+pd_0x3ff]
%endif
   mova                m15, [rsp+0x120]
   pxor                 m9, m9
   mov                srcq, [rsp+0x098]
%if ARCH_X86_64
   mov                 r0q, [rsp+0x130] ; dstq / tmpq
%else
   mov                 mym, myd
   mov                  hm, r5
   mov                 r0m, r0
   mov                  r3, r3m
%endif
   paddd               m14, m7 ; advance to the next column's mx+dx*[0-3]
; Main loop for w>=8, one 8-pixel column at a time.  .hloop gathers the
; eight per-pixel horizontal filters (subpel_filters indexed by the x
; fractional phase; lanes with phase 0 get the pq_0x40000000 constant
; selected via the pcmpeqd/pand/pandn/por sequence), runs
; MC_8TAP_SCALED_H (defined elsewhere in this file) to horizontally
; filter the first 8 source rows, and interleaves them into row-pair
; form (01/23/45/67, a/b = low/high 4 pixels).  .vloop then applies the
; vertical 8-tap filter per output row; when the integer y position
; advances, bit 10 of my selects between shifting the row history by
; one source row (inline / .next_line) or by two (.skip_line).
.hloop:
%if ARCH_X86_64
   mova                m11, [base+pq_0x40000000]
%else
%define m11 [base+pq_0x40000000]
%endif
   psrld                m2, m14, 10 ; integer x offsets of pixels 0-3
   mova              [rsp], m2
   pand                 m6, m14, m10 ; m10 = pd_0x3ff, fractional x
   psrld                m6, 6
   paddd                m5, m15, m6 ; filter table indices
   pcmpeqd              m6, m9      ; mask of lanes with phase 0
   psrldq               m2, m5, 8
%if ARCH_X86_64
   ; fetch the four 8-tap x filters for pixels 0-3
   movd                r4d, m5
   movd                r6d, m2
   psrldq               m5, 4
   psrldq               m2, 4
   movd                r7d, m5
   movd                r9d, m2
   movq                 m0, [base+subpel_filters+r4*8]
   movq                 m1, [base+subpel_filters+r6*8]
   movhps               m0, [base+subpel_filters+r7*8]
   movhps               m1, [base+subpel_filters+r9*8]
%else
   movd                 r0, m5
   movd                 rX, m2
   psrldq               m5, 4
   psrldq               m2, 4
   movd                 r4, m5
   movd                 r5, m2
   movq                 m0, [base+subpel_filters+r0*8]
   movq                 m1, [base+subpel_filters+rX*8]
   movhps               m0, [base+subpel_filters+r4*8]
   movhps               m1, [base+subpel_filters+r5*8]
   pxor                 m2, m2
%define m9 m2
%endif
   paddd               m14, m7 ; mx+dx*[4-7]
   pand                 m5, m14, m10
   psrld                m5, 6
   paddd               m15, m5
   pcmpeqd              m5, m9
   mova        [rsp+0x110], m14 ; saved for .hloop_prep
   psrldq               m4, m15, 8
%if ARCH_X86_64
   ; fetch the four 8-tap x filters for pixels 4-7
   movd               r10d, m15
   movd               r11d, m4
   psrldq              m15, 4
   psrldq               m4, 4
   movd               r13d, m15
   movd                rXd, m4
   movq                 m2, [base+subpel_filters+r10*8]
   movq                 m3, [base+subpel_filters+r11*8]
   movhps               m2, [base+subpel_filters+r13*8]
   movhps               m3, [base+subpel_filters+ rX*8]
   ; integer x offsets of pixels 4-7 stay in r10/r11/r13/rX
   psrld               m14, 10
   psrldq               m4, m14, 8
   movd               r10d, m14
   movd               r11d, m4
   psrldq              m14, 4
   psrldq               m4, 4
   movd               r13d, m14
   movd                rXd, m4
   mov                 r4d, [rsp+ 0]
   mov                 r6d, [rsp+ 8]
   mov                 r7d, [rsp+ 4]
   mov                 r9d, [rsp+12]
   ; replace the filter of phase-0 lanes with the 0x40000000 constant
   pshufd               m4, m6, q1100
   pshufd               m6, m6, q3322
   pshufd              m14, m5, q1100
   pshufd               m5, m5, q3322
   pand                 m7, m11, m4
   pand                 m8, m11, m6
   pand                m15, m11, m14
   pand                m11, m11, m5
   pandn                m4, m0
   pandn                m6, m1
   pandn               m14, m2
   pandn                m5, m3
   por                  m7, m4
   por                  m8, m6
   por                 m15, m14
   por                 m11, m5
   mova         [rsp+0x10], m7
   mova         [rsp+0x20], m8
   mova         [rsp+0x30], m15
   mova         [rsp+0x40], m11
   ; horizontally filter the first 8 source rows, two at a time
   MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1
   mova         [rsp+0x50], m1
   mova         [rsp+0x60], m2
   MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3
   mova         [rsp+0x70], m3
   mova         [rsp+0x80], m4
   MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5
   MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7
   SWAP                 m7, m0
   SWAP                 m8, m14
   mova                 m1, [rsp+0x50]
   mova                 m2, [rsp+0x60]
   mova                 m3, [rsp+0x70]
   mova                 m9, [rsp+0x80]
   mov                 myd, mym
   mov                 dyd, dym
   ; interleave rows into word pairs for pmaddwd in .vloop
   punpcklwd            m4, m5, m6 ; 45a
   punpckhwd            m5, m6     ; 45b
   punpcklwd            m6, m7, m8 ; 67a
   punpckhwd            m7, m8     ; 67b
   punpcklwd            m0, m1, m2 ; 01a
   punpckhwd            m1, m2     ; 01b
   punpcklwd            m2, m3, m9 ; 23a
   punpckhwd            m3, m9     ; 23b
   mova         [rsp+0x50], m4
   mova         [rsp+0x60], m5
   mova         [rsp+0x70], m6
   mova         [rsp+0x80], m7
   SWAP                m14, m8
.vloop:
   and                 myd, 0x3ff
   ; 64<<24 = identity filter, kept when the y phase is 0
   mov                 r6d, 64 << 24
   mov                 r4d, myd
   shr                 r4d, 6
   lea                 r4d, [t1+r4]
   cmovnz              r6q, [base+subpel_filters+r4*8]
   movq                m11, r6q
   punpcklbw           m11, m11 ; sign-extend int8 taps to int16
   psraw               m11, 8
   pshufd               m5, m11, q0000
   pshufd               m7, m11, q1111
   pshufd              m10, m11, q2222
   pshufd              m11, m11, q3333
   pmaddwd              m4, m5, m0
   pmaddwd              m5, m5, m1
   pmaddwd              m6, m7, m2
   pmaddwd              m7, m7, m3
   paddd                m4, m13 ; rounding bias before the final shift
   paddd                m5, m13
   paddd                m4, m6
   paddd                m5, m7
   pmaddwd              m6, [rsp+0x50], m10
   pmaddwd              m7, [rsp+0x60], m10
   pmaddwd              m8, [rsp+0x70], m11
   pmaddwd              m9, [rsp+0x80], m11
   paddd                m4, m6
   paddd                m5, m7
   paddd                m4, m8
   paddd                m5, m9
%else
   ; x86-32: same filter gather for pixels 4-7, spilled through esp
   movd                 r0, m15
   movd                 rX, m4
   psrldq              m15, 4
   psrldq               m4, 4
   movd                 r4, m15
   movd                 r5, m4
   mova                m14, [esp+0x110]
   movq                 m2, [base+subpel_filters+r0*8]
   movq                 m3, [base+subpel_filters+rX*8]
   movhps               m2, [base+subpel_filters+r4*8]
   movhps               m3, [base+subpel_filters+r5*8]
   psrld               m14, 10 ; integer x offsets of pixels 4-7
   mova           [esp+16], m14
   mov                  r0, [esp+ 0]
   mov                  rX, [esp+ 8]
   mov                  r4, [esp+ 4]
   mov                  r5, [esp+12]
   mova         [esp+0x20], m0
   mova         [esp+0x30], m1
   mova         [esp+0x40], m2
   mova         [esp+0x50], m3
   ; select filter vs 0x40000000 for phase-0 lanes
   pshufd               m4, m6, q1100
   pshufd               m6, m6, q3322
   pshufd               m7, m5, q1100
   pshufd               m5, m5, q3322
   pand                 m0, m11, m4
   pand                 m1, m11, m6
   pand                 m2, m11, m7
   pand                 m3, m11, m5
   pandn                m4, [esp+0x20]
   pandn                m6, [esp+0x30]
   pandn                m7, [esp+0x40]
   pandn                m5, [esp+0x50]
   por                  m0, m4
   por                  m1, m6
   por                  m2, m7
   por                  m3, m5
   mova         [esp+0x20], m0
   mova         [esp+0x30], m1
   mova         [esp+0x40], m2
   mova         [esp+0x50], m3
   MC_8TAP_SCALED_H   0x20, 0x140, 0 ; 0-1
   MC_8TAP_SCALED_H   0x20, 0x160    ; 2-3
   MC_8TAP_SCALED_H   0x20, 0x180    ; 4-5
   MC_8TAP_SCALED_H   0x20, 0x1a0    ; 6-7
   mova                 m5, [esp+0x180]
   mova                 m6, [esp+0x190]
   mova                 m7, [esp+0x1a0]
   mova                 m0, [esp+0x1b0]
   mov                 myd, mym
   punpcklwd            m4, m5, m6      ; 45a
   punpckhwd            m5, m6          ; 45b
   punpcklwd            m6, m7, m0      ; 67a
   punpckhwd            m7, m0          ; 67b
   mova        [esp+0x180], m4
   mova        [esp+0x190], m5
   mova        [esp+0x1a0], m6
   mova        [esp+0x1b0], m7
   mova                 m1, [esp+0x140]
   mova                 m2, [esp+0x150]
   mova                 m3, [esp+0x160]
   mova                 m4, [esp+0x170]
   punpcklwd            m0, m1, m2      ; 01a
   punpckhwd            m1, m2          ; 01b
   punpcklwd            m2, m3, m4      ; 23a
   punpckhwd            m3, m4          ; 23b
   mova        [esp+0x140], m0
   mova        [esp+0x150], m1
   mova        [esp+0x160], m2
   mova        [esp+0x170], m3
.vloop:
   mov                  r0, r0m
   mov                  r5, [esp+0x1f4]
   and                 myd, 0x3ff
   mov                 mym, myd
   xor                  r3, r3
   shr                  r4, 6
   lea                  r5, [r5+r4]
   mov                  r4, 64 << 24 ; identity filter unless phase != 0
   cmovnz               r4, [base+subpel_filters+r5*8+0]
   cmovnz               r3, [base+subpel_filters+r5*8+4]
   movd                 m7, r4
   movd                 m6, r3
   punpckldq            m7, m6
   punpcklbw            m7, m7
   psraw                m7, 8
   pshufd               m4, m7, q0000
   pshufd               m5, m7, q1111
   pmaddwd              m0, m4
   pmaddwd              m1, m4
   pmaddwd              m2, m5
   pmaddwd              m3, m5
   pshufd               m6, m7, q2222
   pshufd               m7, m7, q3333
   paddd                m0, m2
   paddd                m1, m3
   pmaddwd              m2, [esp+0x180], m6
   pmaddwd              m3, [esp+0x190], m6
   pmaddwd              m4, [esp+0x1a0], m7
   pmaddwd              m5, [esp+0x1b0], m7
   paddd                m0, m2
   paddd                m1, m3
   paddd                m0, m13
   paddd                m1, m13
   paddd                m4, m0
   paddd                m5, m1
%endif
   psrad                m4, rndshift
   psrad                m5, rndshift
   packssdw             m4, m5
%ifidn %1, put
   ; put: clamp to 8 bit and store 8 pixels
   packuswb             m4, m4
   movq             [dstq], m4
   add                dstq, dsm
%else
   ; prep: store 8 int16 intermediates
   mova             [tmpq], m4
   add                tmpq, tmp_stridem
%endif
   dec                  hd
   jz .hloop_prep
%if ARCH_X86_64
   add                 myd, dyd
   test                myd, ~0x3ff
   jz .vloop ; integer row unchanged -> reuse the 8-row history
   test                myd, 0x400
   mov         [rsp+0x140], myd
   mov                 r4d, [rsp+ 0]
   mov                 r6d, [rsp+ 8]
   mov                 r7d, [rsp+ 4]
   mov                 r9d, [rsp+12]
   jz .skip_line
   ; shift the history by one row: gather + filter one new source row
   mova                m14, [base+unpckw]
   movq                 m6, [srcq+r10]
   movq                 m7, [srcq+r11]
   movhps               m6, [srcq+r13]
   movhps               m7, [srcq+ rX]
   movq                 m4, [srcq+ r4]
   movq                 m5, [srcq+ r6]
   movhps               m4, [srcq+ r7]
   movhps               m5, [srcq+ r9]
   add                srcq, ssq
   mov                 myd, [rsp+0x140]
   mov                 dyd, dym
   pshufd               m9, m14, q1032
   pshufb               m0, m14                ; 0a 1a
   pshufb               m1, m14                ; 0b 1b
   pshufb               m2, m9                 ; 3a 2a
   pshufb               m3, m9                 ; 3b 2b
   pmaddubsw            m6, [rsp+0x30]
   pmaddubsw            m7, [rsp+0x40]
   pmaddubsw            m4, [rsp+0x10]
   pmaddubsw            m5, [rsp+0x20]
   phaddw               m6, m7
   phaddw               m4, m5
   phaddw               m4, m6
   pmulhrsw             m4, m12
   pshufb               m5, [rsp+0x50], m14    ; 4a 5a
   pshufb               m6, [rsp+0x60], m14    ; 4b 5b
   pshufb               m7, [rsp+0x70], m9     ; 7a 6a
   pshufb               m8, [rsp+0x80], m9     ; 7b 6b
   ; re-pair rows shifted by one: 12/34/56/78
   punpckhwd            m0, m2 ; 12a
   punpckhwd            m1, m3 ; 12b
   punpcklwd            m2, m5 ; 34a
   punpcklwd            m3, m6 ; 34b
   punpckhwd            m5, m7 ; 56a
   punpckhwd            m6, m8 ; 56b
   punpcklwd            m7, m4 ; 78a
   punpckhqdq           m4, m4
   punpcklwd            m8, m4 ; 78b
   mova         [rsp+0x50], m5
   mova         [rsp+0x60], m6
   mova         [rsp+0x70], m7
   mova         [rsp+0x80], m8
   jmp .vloop
.skip_line:
   ; shift the history by two rows: filter two new source rows at once
   mova                 m0, [rsp+0x10]
   mova                 m1, [rsp+0x20]
   mova                m14, [rsp+0x30]
   mova                m15, [rsp+0x40]
   MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15
   mov                 myd, [rsp+0x140]
   mov                 dyd, dym
   mova                 m0, m2         ; 01a
   mova                 m1, m3         ; 01b
   mova                 m2, [rsp+0x50] ; 23a
   mova                 m3, [rsp+0x60] ; 23b
   mova                 m5, [rsp+0x70] ; 45a
   mova                 m6, [rsp+0x80] ; 45b
   punpcklwd            m7, m4, m8     ; 67a
   punpckhwd            m4, m8         ; 67b
   mova         [rsp+0x50], m5
   mova         [rsp+0x60], m6
   mova         [rsp+0x70], m7
   mova         [rsp+0x80], m4
%else
   mov                 r0m, r0
   mov                 myd, mym
   mov                  r3, r3m
   add                 myd, dym
   test                myd, ~0x3ff
   mov                 mym, myd
   jnz .next_line
   mova                 m0, [esp+0x140]
   mova                 m1, [esp+0x150]
   mova                 m2, [esp+0x160]
   mova                 m3, [esp+0x170]
   jmp .vloop
.next_line:
   test                myd, 0x400
   mov                  r0, [esp+ 0]
   mov                  rX, [esp+ 8]
   mov                  r4, [esp+ 4]
   mov                  r5, [esp+12]
   jz .skip_line
   ; shift by one row (x86-32 variant of the path above)
   mova                 m6, [base+unpckw]
   mova                 m0, [esp+0x140]
   mova                 m1, [esp+0x150]
   mova                 m7, [esp+0x180]
   movq                 m4, [srcq+r0]
   movq                 m5, [srcq+rX]
   movhps               m4, [srcq+r4]
   movhps               m5, [srcq+r5]
   pshufb               m0, m6         ; 0a 1a
   pshufb               m1, m6         ; 0b 1b
   pshufb               m7, m6         ; 4a 5a
   mov                  r0, [esp+16]
   mov                  rX, [esp+24]
   mov                  r4, [esp+20]
   mov                  r5, [esp+28]
   movq                 m3, [srcq+r0]
   movq                 m2, [srcq+rX]
   movhps               m3, [srcq+r4]
   movhps               m2, [srcq+r5]
   add                srcq, ssq
   pmaddubsw            m4, [esp+0x20]
   pmaddubsw            m5, [esp+0x30]
   pmaddubsw            m3, [esp+0x40]
   pmaddubsw            m2, [esp+0x50]
   phaddw               m4, m5
   phaddw               m3, m2
   mova                 m5, [esp+0x190]
   mova                 m2, [esp+0x160]
   phaddw               m4, m3
   mova                 m3, [esp+0x170]
   pmulhrsw             m4, m12        ; 8a 8b
   mov                 myd, mym
   pshufb               m5, m6         ; 4b 5b
   pshufd               m6, m6, q1032
   pshufb               m2, m6         ; 3a 2a
   pshufb               m3, m6         ; 3b 2b
   punpckhwd            m0, m2         ; 12a
   punpckhwd            m1, m3         ; 12b
   mova        [esp+0x140], m0
   mova        [esp+0x150], m1
   mova                 m0, [esp+0x1a0]
   mova                 m1, [esp+0x1b0]
   punpcklwd            m2, m7         ; 34a
   punpcklwd            m3, m5         ; 34b
   mova        [esp+0x160], m2
   mova        [esp+0x170], m3
   pshufb               m0, m6         ; 7a 6a
   pshufb               m1, m6         ; 7b 6b
   punpckhwd            m7, m0         ; 56a
   punpckhwd            m5, m1         ; 56b
   punpcklwd            m0, m4
   punpckhqdq           m4, m4
   punpcklwd            m1, m4
   mova        [esp+0x180], m7
   mova        [esp+0x190], m5
   mova        [esp+0x1a0], m0
   mova        [esp+0x1b0], m1
   mova                 m0, [esp+0x140]
   mova                 m1, [esp+0x150]
   jmp .vloop
.skip_line:
   ; shift by two rows (x86-32): filter two new rows into 0x1c0/0x1d0
   ; and slide the stored row-pair history down by one pair
   MC_8TAP_SCALED_H   0x20, 0x1c0, 0
   mov                 myd, mym
   mova                 m0, [esp+0x160]
   mova                 m1, [esp+0x170]
   mova                 m2, [esp+0x180]
   mova                 m3, [esp+0x190]
   mova         [esp+0x140], m0
   mova         [esp+0x150], m1
   mova                 m4, [esp+0x1a0]
   mova                 m5, [esp+0x1b0]
   mova        [esp+0x160], m2
   mova        [esp+0x170], m3
   mova                 m6, [esp+0x1c0]
   mova                 m7, [esp+0x1d0]
   mova        [esp+0x180], m4
   mova        [esp+0x190], m5
   punpcklwd            m4, m6, m7
   punpckhwd            m6, m7
   mova        [esp+0x1a0], m4
   mova        [esp+0x1b0], m6
%endif
   jmp .vloop
INIT_XMM ssse3
; dy==1 entry (one new source row per output row): dispatch on block
; width via the per-function dy1 jump table.
.dy1:
   movzx                wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
   add                  wq, base_reg
   jmp                  wq
; w==2, dy==1 path (put only).  The setup loads source rows 0-6,
; filters them horizontally with per-pixel 4-tap x filters
; (subpel_filters+...+2 selects the middle 4 taps) and arranges the
; results as word pairs 01/12, 23/34, 45/56; the loop then consumes two
; new source rows and emits two output rows per iteration through the
; 8-tap y filter held in m8-m11.
%ifidn %1, put
.dy1_w2:
%if ARCH_X86_64
   mov                 myd, mym
   movzx               t0d, t0b
   dec                srcq ; back up one pixel for the 4-tap x filter
   movd                m15, t0d
%else
 %define m8  m0
 %define m9  m1
 %define m14 m4
 %define m15 m3
   movzx                r5, byte [esp+0x1f0]
   dec                srcd
   movd                m15, r5
%endif
   punpckldq            m9, m8
   SWAP                 m8, m9
   paddd               m14, m8 ; mx+dx*[0-1]
%if ARCH_X86_64
   mova                m11, [base+pd_0x4000]
%else
 %define m11 [base+pd_0x4000]
%endif
   pshufd              m15, m15, q0000
   pand                 m8, m14, m10 ; fractional x phase per pixel
   psrld                m8, 6
   paddd               m15, m8
   movd                r4d, m15
   psrldq              m15, 4
%if ARCH_X86_64
   movd                r6d, m15
%else
   movd                r3d, m15
%endif
   mova                 m5, [base+bdct_lb_dw]
   mova                 m6, [base+subpel_s_shuf2]
   ; +2 selects the middle 4 taps of the 8-tap filter
   movd                m15, [base+subpel_filters+r4*8+2]
%if ARCH_X86_64
   movd                 m7, [base+subpel_filters+r6*8+2]
%else
   movd                 m7, [base+subpel_filters+r3*8+2]
%endif
   pxor                 m9, m9
   pcmpeqd              m8, m9 ; mask of phase-0 pixels
   psrld               m14, 10
%if ARCH_X86_32
   mov                  r3, r3m
   pshufb              m14, m5
   paddb               m14, m6
   mova         [esp+0x00], m14
 %define m14 [esp+0x00]
   SWAP                 m5, m0
   SWAP                 m6, m3
 %define m8  m5
 %define m15 m6
%endif
   ; load source rows 0-3 (two rows per register)
   movq                 m0, [srcq+ssq*0]
   movq                 m2, [srcq+ssq*2]
   movhps               m0, [srcq+ssq*1]
   movhps               m2, [srcq+ss3q ]
   lea                srcq, [srcq+ssq*4]
%if ARCH_X86_64
   shr                 myd, 6
   mov                 r4d, 64 << 24 ; identity y filter unless phase != 0
   lea                 myd, [t1+myq]
   cmovnz              r4q, [base+subpel_filters+myq*8]
   pshufb              m14, m5
   paddb               m14, m6
   movq                m10, r4
%else
   mov                 myd, mym
   mov                  r5, [esp+0x1f4]
   xor                  r3, r3
   shr                 myd, 6
   lea                  r5, [r5+myd]
   mov                  r4, 64 << 24
   cmovnz               r4, [base+subpel_filters+r5*8+0]
   cmovnz               r3, [base+subpel_filters+r5*8+4]
 %define m10 m4
   movd                m10, r4
   movd                 m3, r3
   mov                  r3, r3m
   punpckldq           m10, m3
%endif
   ; load source rows 4-6
   movq                 m1, [srcq+ssq*0]
   movq                 m3, [srcq+ssq*2]
   movhps               m1, [srcq+ssq*1]
   add                srcq, ss3q
   punpcklbw           m10, m10 ; sign-extend int8 y taps to int16
   psraw               m10, 8
   punpckldq           m15, m7
   punpcklqdq          m15, m15
%if ARCH_X86_64
   pand                m11, m8
%else
   pand                 m7, m11, m8
 %define m11 m7
%endif
   ; phase-0 pixels use the pd_0x4000 constant instead of a filter
   pandn                m8, m15
   SWAP                m15, m8
   por                 m15, m11
%if ARCH_X86_64
   m8-m11 = broadcast y-filter tap pairs 01/23/67/45
   pshufd               m8, m10, q0000
   pshufd               m9, m10, q1111
   pshufd              m11, m10, q3333
   pshufd              m10, m10, q2222
%else
   mova         [esp+0x10], m15
 %define m15 [esp+0x10]
   mov                  r0, r0m
   pshufd               m5, m4, q0000
   pshufd               m6, m4, q1111
   pshufd               m7, m4, q2222
   pshufd               m4, m4, q3333
 %define m8  [esp+0x20]
 %define m9  [esp+0x30]
 %define m10 [esp+0x40]
 %define m11 [esp+0x50]
   mova                 m8, m5
   mova                 m9, m6
   mova                m10, m7
   mova                m11, m4
%endif
   ; horizontal filter of rows 0-6
   pshufb               m0, m14
   pshufb               m2, m14
   pshufb               m1, m14
   pshufb               m3, m14
   pmaddubsw            m0, m15
   pmaddubsw            m2, m15
   pmaddubsw            m1, m15
   pmaddubsw            m3, m15
   phaddw               m0, m2
   phaddw               m1, m3
   pmulhrsw             m0, m12
   pmulhrsw             m1, m12
   palignr              m2, m1, m0, 4
   pshufd               m4, m1, q2121
   punpcklwd            m3, m0, m2     ; 01 12
   punpckhwd            m0, m2         ; 23 34
   punpcklwd            m2, m1, m4     ; 45 56
.dy1_w2_loop:
   ; two new source rows per iteration
   movq                 m1, [srcq+ssq*0]
   movhps               m1, [srcq+ssq*1]
   lea                srcq, [srcq+ssq*2]
   pmaddwd              m5, m3, m8
   pmaddwd              m6, m0, m9
   pmaddwd              m7, m2, m10
   mova                 m3, m0 ; slide the row-pair history
   mova                 m0, m2
   paddd                m5, m13 ; rounding bias before the final shift
   paddd                m6, m7
   pshufb               m1, m14
   pmaddubsw            m1, m15
   phaddw               m1, m1
   pmulhrsw             m1, m12
   palignr              m7, m1, m4, 12
   punpcklwd            m2, m7, m1     ; 67 78
   pmaddwd              m7, m2, m11
   mova                 m4, m1
   paddd                m5, m6
   paddd                m5, m7
   psrad                m5, rndshift
   packssdw             m5, m5
   packuswb             m5, m5
   ; store 2 pixels to each of two output rows
   movd                r4d, m5
   mov        [dstq+dsq*0], r4w
   shr                 r4d, 16
   mov        [dstq+dsq*1], r4w
   lea                dstq, [dstq+dsq*2]
   sub                  hd, 2
   jg .dy1_w2_loop
   RET
%endif
   6300 INIT_XMM ssse3
   6301 .dy1_w4:
   6302 %if ARCH_X86_64
   6303    mov                 myd, mym
   6304    movzx               t0d, t0b
   6305    dec                srcq
   6306    movd                m15, t0d
   6307 %else
   6308 %define m10 [base+pd_0x3ff]
   6309 %define m11 [base+pd_0x4000]
   6310 %define m8  m0
   6311 %xdefine m14 m4
   6312 %define m15 m3
   6313 %if isprep
   6314  %define ssq r3
   6315 %endif
   6316    movzx                r4, byte [esp+0x1f0]
   6317    dec                srcq
   6318    movd                m15, r4
   6319 %endif
   6320    pmaddwd              m8, [base+rescale_mul]
   6321 %if ARCH_X86_64
   6322    mova                m11, [base+pd_0x4000]
   6323 %endif
   6324    pshufd              m15, m15, q0000
   6325    paddd               m14, m8 ; mx+dx*[0-3]
   6326    pand                 m8, m14, m10
   6327    psrld                m8, 6
   6328    paddd               m15, m8
   6329    psrldq               m7, m15, 8
   6330 %if ARCH_X86_64
   6331    movd                r4d, m15
   6332    movd               r11d, m7
   6333    psrldq              m15, 4
   6334    psrldq               m7, 4
   6335    movd                r6d, m15
   6336    movd               r13d, m7
   6337    movd                m15, [base+subpel_filters+ r4*8+2]
   6338    movd                 m2, [base+subpel_filters+r11*8+2]
   6339    movd                 m3, [base+subpel_filters+ r6*8+2]
   6340    movd                 m4, [base+subpel_filters+r13*8+2]
   6341    shr                 myd, 6
   6342    mov                 r4d, 64 << 24
   6343    lea                 myd, [t1+myq]
   6344    cmovnz              r4q, [base+subpel_filters+myq*8]
   6345 %else
   6346    movd                 r1, m15
   6347    movd                 r3, m7
   6348    psrldq              m15, 4
   6349    psrldq               m7, 4
   6350    movd                 r4, m15
   6351    movd                 r5, m7
   6352 %define m15 m5
   6353    SWAP                 m4, m7
   6354    movd                m15, [base+subpel_filters+r1*8+2]
   6355    movd                 m2, [base+subpel_filters+r3*8+2]
   6356    movd                 m3, [base+subpel_filters+r4*8+2]
   6357    movd                 m4, [base+subpel_filters+r5*8+2]
   6358    mov                 myd, mym
   6359    mov                  rX, [esp+0x1f4]
   6360    xor                  r5, r5
   6361    shr                 myd, 6
   6362    lea                  rX, [rX+myd]
   6363    mov                  r4, 64 << 24
   6364    cmovnz               r4, [base+subpel_filters+rX*8+0]
   6365    cmovnz               r5, [base+subpel_filters+rX*8+4]
   6366    mov                  r3, r3m
   6367 %if isprep
   6368    lea                ss3q, [ssq*3]
   6369 %endif
   6370 %endif
   6371    punpckldq           m15, m3
   6372    punpckldq            m2, m4
   6373    punpcklqdq          m15, m2
   6374    movq                 m6, [base+subpel_s_shuf2]
   6375 %if ARCH_X86_64
   6376    pcmpeqd              m8, m9
   6377    psrld               m14, 10
   6378    pshufb              m14, [base+bdct_lb_dw]
   6379    movu                 m0, [srcq+ssq*0]
   6380    movu                 m1, [srcq+ssq*1]
   6381    movu                 m2, [srcq+ssq*2]
   6382    movu                 m3, [srcq+ss3q ]
   6383    lea                srcq, [srcq+ssq*4]
   6384    punpcklqdq           m6, m6
   6385    movu                 m4, [srcq+ssq*0]
   6386    movu                 m5, [srcq+ssq*1]
   6387    movu                 m7, [srcq+ssq*2]
   6388    add                srcq, ss3q
   6389    pand                m11, m8
   6390    pandn                m8, m15
   6391    SWAP                m15, m8
   6392    por                 m15, m11
   6393    paddb               m14, m6
   6394    movq                m10, r4q
   6395    punpcklbw           m10, m10
   6396    psraw               m10, 8
   6397    pshufb               m0, m14
   6398    pshufb               m1, m14
   6399    pshufb               m2, m14
   6400    pshufb               m3, m14
   6401    pshufb               m4, m14
   6402    pshufb               m5, m14
   6403    pshufb               m7, m14
   6404    pmaddubsw            m0, m15
   6405    pmaddubsw            m1, m15
   6406    pmaddubsw            m2, m15
   6407    pmaddubsw            m3, m15
   6408    pmaddubsw            m4, m15
   6409    pmaddubsw            m5, m15
   6410    pmaddubsw            m7, m15
   6411    phaddw               m0, m1
   6412    phaddw               m2, m3
   6413    phaddw               m4, m5
   6414    phaddw               m6, m7, m7
   6415    pmulhrsw             m0, m12    ; 0 1
   6416    pmulhrsw             m2, m12    ; 2 3
   6417    pmulhrsw             m4, m12    ; 4 5
   6418    pmulhrsw             m6, m12    ; 6 _
   6419    shufps               m1, m0, m2, q1032  ; 1 2
   6420    shufps               m3, m2, m4, q1032  ; 3 4
   6421    shufps               m5, m4, m6, q1032  ; 5 6
   6422    punpcklwd            m7, m0, m1 ; 01
   6423    punpckhwd            m0, m1     ; 12
   6424    punpcklwd            m8, m2, m3 ; 23
   6425    punpckhwd            m2, m3     ; 34
   6426    punpcklwd            m9, m4, m5 ; 45
   6427    punpckhwd            m4, m5     ; 56
   6428 %else
   6429    pxor                 m3, m3
   6430    pcmpeqd              m8, m3
   6431    psrld               m14, 10
   6432    pshufb              m14, [base+bdct_lb_dw]
   6433    movu                 m1, [srcq+ssq*0]
   6434    movu                 m2, [srcq+ssq*1]
   6435    movu                 m3, [srcq+ssq*2]
   6436    add                srcq, ss3q
   6437    punpcklqdq           m6, m6
   6438    SWAP                 m4, m7
   6439    pand                 m7, m11, m8
   6440    pandn                m8, m15
   6441    SWAP                 m5, m0
   6442    por                 m15, m7
   6443    paddb               m14, m6
   6444    movu                 m0, [srcq+ssq*0]
   6445    movu                 m7, [srcq+ssq*1]
   6446    movu                 m6, [srcq+ssq*2]
   6447    pshufb               m1, m14
   6448    pshufb               m2, m14
   6449    pshufb               m3, m14
   6450    pshufb               m0, m14
   6451    pshufb               m7, m14
   6452    pshufb               m6, m14
   6453    pmaddubsw            m1, m15
   6454    pmaddubsw            m2, m15
   6455    pmaddubsw            m3, m15
   6456    mova         [esp+0x00], m14
   6457    mova         [esp+0x10], m15
   6458    pmaddubsw            m0, m15
   6459    pmaddubsw            m7, m15
   6460    pmaddubsw            m6, m15
   6461    phaddw               m1, m2
   6462    movu                 m2, [srcq+ss3q ]
   6463    lea                srcq, [srcq+ssq*4]
   6464    mov                  r0, r0m
   6465    phaddw               m3, m0
   6466    pshufb               m2, m14
   6467    pmaddubsw            m2, m15
   6468 %define m14 [esp+0x00]
   6469 %define m15 [esp+0x10]
   6470    phaddw               m7, m6
   6471    phaddw               m2, m2
   6472    movd                 m6, r4
   6473    movd                 m0, r5
   6474    punpckldq            m6, m0
   6475    punpcklbw            m6, m6
   6476    psraw                m6, 8
   6477    mova         [esp+0x20], m6
   6478    pmulhrsw             m1, m12 ; 0 1
   6479    pmulhrsw             m3, m12 ; 2 3
   6480    pmulhrsw             m7, m12 ; 4 5
   6481    pmulhrsw             m2, m12 ; 6 _
   6482    shufps               m0, m1, m3, q1032  ; 1 2
   6483    shufps               m4, m3, m7, q1032  ; 3 4
   6484    shufps               m5, m7, m2, q1032  ; 5 6
   6485    punpcklwd            m6, m1, m0 ; 01
   6486    punpckhwd            m1, m0     ; 12
   6487    mova         [esp+0x30], m1
   6488    punpcklwd            m1, m3, m4 ; 23
   6489    punpckhwd            m3, m4     ; 34
   6490    mova         [esp+0x40], m3
   6491    punpcklwd            m3, m7, m5 ; 45
   6492    punpckhwd            m7, m5     ; 56
   6493    mova         [esp+0x50], m7
   6494    mova         [esp+0x60], m2
   6495    mova                 m0, [esp+0x20]
   6496 %xdefine m8 m1
   6497 %xdefine m9 m3
   6498 %xdefine m10 m0
   6499    SWAP                 m7, m6
   6500    SWAP                 m1, m4
   6501    SWAP                 m3, m2
   6502 %endif
   6503    pshufd               m1, m10, q0000
   6504    pshufd               m3, m10, q1111
   6505    pshufd               m5, m10, q2222
   6506    pshufd              m10, m10, q3333
   6507 %if ARCH_X86_64
   6508    mova         [rsp+0x00], m8
   6509    mova         [rsp+0x10], m2
   6510    mova         [rsp+0x20], m9
   6511    mova         [rsp+0x30], m4
   6512 %else
   6513    mova         [esp+0x70], m8
   6514    mova         [esp+0x80], m9
   6515    mova         [esp+0x90], m1
   6516    mova         [esp+0xa0], m3
   6517    mova         [esp+0xb0], m5
   6518    mova         [esp+0xc0], m10
   6519 %ifidn %1, put
   6520    mov                 dsd, dsm
   6521 %endif
   6522 %define m11 m6
   6523 %endif
   6524 .dy1_w4_loop:
   6525 %if ARCH_X86_64
   6526    movu                m11, [srcq+ssq*0]
   6527    pmaddwd              m7, m1
   6528    pmaddwd              m8, m3
   6529    pmaddwd              m0, m1
   6530    pmaddwd              m2, m3
   6531    pmaddwd              m9, m5
   6532    pmaddwd              m4, m5
   6533    paddd                m7, m8
   6534    paddd                m0, m2
   6535    movu                 m8, [srcq+ssq*1]
   6536    lea                srcq, [srcq+ssq*2]
   6537    pshufb              m11, m14
   6538    pmaddubsw           m11, m15
   6539    paddd                m7, m13
   6540    paddd                m0, m13
   6541    paddd                m7, m9
   6542    paddd                m0, m4
   6543    pshufb               m8, m14
   6544    pmaddubsw            m8, m15
   6545    phaddw              m11, m8
   6546    mova                 m8, [rsp+0x20]
   6547    pmulhrsw            m11, m12
   6548    punpcklwd            m9, m6, m11    ; 67
   6549    psrldq               m6, m11, 8
   6550    punpcklwd            m4, m11, m6    ; 78
   6551    pmaddwd              m2, m9, m10
   6552    pmaddwd             m11, m4, m10
   6553    paddd                m7, m2
   6554    mova                 m2, [rsp+0x30]
   6555    paddd                m0, m11
   6556 %else
   6557    SWAP                 m7, m6
   6558    SWAP                 m1, m4
   6559    SWAP                 m3, m2
   6560    movu                 m5, [srcq+ssq*0]
   6561    mova                 m0, [esp+0x30]
   6562    mova                 m2, [esp+0x40]
   6563    mova                 m4, [esp+0x50]
   6564    pmaddwd              m6, [esp+0x90]
   6565    pmaddwd              m1, [esp+0xa0]
   6566    pmaddwd              m0, [esp+0x90]
   6567    pmaddwd              m2, [esp+0xa0]
   6568    pmaddwd              m3, [esp+0xb0]
   6569    pmaddwd              m4, [esp+0xb0]
   6570    paddd                m6, m1
   6571    paddd                m0, m2
   6572    movu                 m7, [srcq+ssq*1]
   6573    lea                srcq, [srcq+ssq*2]
   6574    pshufb               m5, m14
   6575    pmaddubsw            m5, m15
   6576    paddd                m6, m13
   6577    paddd                m0, m13
   6578    paddd                m6, m3
   6579    paddd                m0, m4
   6580    pshufb               m7, m14
   6581    pmaddubsw            m7, m15
   6582    phaddw               m5, m7
   6583    mova                 m7, [rsp+0x80]
   6584    pmulhrsw             m5, m12
   6585    punpcklwd            m3, [esp+0x60], m5 ; 67
   6586    psrldq               m1, m5, 8
   6587    punpcklwd            m4, m5, m1         ; 78
   6588    pmaddwd              m2, m3, [esp+0xc0]
   6589    pmaddwd              m5, m4, [esp+0xc0]
   6590    mova         [esp+0x60], m1
   6591    paddd                m6, m2
   6592    mova                 m2, [esp+0x50]
   6593    paddd                m0, m5
   6594    SWAP                 m7, m6
   6595 %endif
   6596    psrad                m7, rndshift
   6597    psrad                m0, rndshift
   6598    packssdw             m7, m0
   6599 %if ARCH_X86_64
   6600    mova                 m0, [rsp+0x10]
   6601 %else
   6602    mova                 m0, [esp+0x40]
   6603 %define m11 m5
   6604 %endif
   6605 %ifidn %1, put
   6606    packuswb             m7, m7
   6607    psrldq              m11, m7, 4
   6608    movd       [dstq+dsq*0], m7
   6609    movd       [dstq+dsq*1], m11
   6610    lea                dstq, [dstq+dsq*2]
   6611 %else
   6612    mova             [tmpq], m7
   6613    add                tmpq, 16
   6614 %endif
   6615    sub                  hd, 2
   6616    jz .ret
   6617 %if ARCH_X86_64
   6618    mova                 m7, [rsp+0x00]
   6619    mova         [rsp+0x00], m8
   6620    mova         [rsp+0x10], m2
   6621    mova         [rsp+0x20], m9
   6622    mova         [rsp+0x30], m4
   6623 %else
   6624    mova                 m7, [esp+0x70] ; 01
   6625    mova                 m1, [esp+0x80] ; 23
   6626    mova                 m2, [esp+0x50] ; 34
   6627    mova         [esp+0x30], m0
   6628    mova         [esp+0x70], m1
   6629    mova         [esp+0x40], m2
   6630    mova         [esp+0x80], m3
   6631    mova         [esp+0x50], m4
   6632 %endif
   6633    jmp .dy1_w4_loop
   6634 INIT_XMM ssse3
   6635 .dy1_w8:
   6636    mov    dword [rsp+0x90], 1
   6637    movifprep   tmp_stridem, 16
   6638    jmp .dy1_w_start
   6639 .dy1_w16:
   6640    mov    dword [rsp+0x90], 2
   6641    movifprep   tmp_stridem, 32
   6642    jmp .dy1_w_start
   6643 .dy1_w32:
   6644    mov    dword [rsp+0x90], 4
   6645    movifprep   tmp_stridem, 64
   6646    jmp .dy1_w_start
   6647 .dy1_w64:
   6648    mov    dword [rsp+0x90], 8
   6649    movifprep   tmp_stridem, 128
   6650    jmp .dy1_w_start
   6651 .dy1_w128:
   6652    mov    dword [rsp+0x90], 16
   6653    movifprep   tmp_stridem, 256
   6654 .dy1_w_start:
   6655    mov                 myd, mym
   6656 %ifidn %1, put
   6657    movifnidn           dsm, dsq
   6658 %endif
   6659 %if ARCH_X86_64
   6660    shr                 t0d, 16
   6661    sub                srcq, 3
   6662    shr                 myd, 6
   6663    mov                 r4d, 64 << 24
   6664    lea                 myd, [t1+myq]
   6665    cmovnz              r4q, [base+subpel_filters+myq*8]
   6666    movd                m15, t0d
   6667 %else
   6668 %define m8   m0
   6669 %define m9   m1
   6670 %xdefine m14 m4
   6671 %xdefine m15 m3
   6672 %if isprep
   6673  %define ssq ssm
   6674 %endif
   6675    mov                  r5, [esp+0x1f0]
   6676    mov                  r3, [esp+0x1f4]
   6677    shr                  r5, 16
   6678    sub                srcq, 3
   6679    movd                m15, r5
   6680    xor                  r5, r5
   6681    shr                 myd, 6
   6682    lea                  r3, [r3+myd]
   6683    mov                  r4, 64 << 24
   6684    cmovnz               r4, [base+subpel_filters+r3*8+0]
   6685    cmovnz               r5, [base+subpel_filters+r3*8+4]
   6686    mov                  r0, r0m
   6687    mov                  r3, r3m
   6688 %endif
   6689    pslld                m7, m8, 2 ; dx*4
   6690    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
   6691    pshufd              m15, m15, q0000
   6692    paddd               m14, m8 ; mx+dx*[0-3]
   6693 %if ARCH_X86_64
   6694    movq                 m3, r4q
   6695    punpcklbw            m3, m3
   6696    psraw                m3, 8
   6697 %else
   6698    movd                 m5, r4
   6699    movd                 m6, r5
   6700    punpckldq            m5, m6
   6701    punpcklbw            m5, m5
   6702    psraw                m5, 8
   6703    SWAP                 m3, m5
   6704 %endif
   6705    mova        [rsp+0x100], m7
   6706    mova        [rsp+0x120], m15
   6707    mov         [rsp+0x098], srcq
   6708    mov         [rsp+0x130], r0q ; dstq / tmpq
   6709    pshufd               m0, m3, q0000
   6710    pshufd               m1, m3, q1111
   6711    pshufd               m2, m3, q2222
   6712    pshufd               m3, m3, q3333
   6713    mova        [rsp+0x140], m0
   6714    mova        [rsp+0x150], m1
   6715    mova        [rsp+0x160], m2
   6716    mova        [rsp+0x170], m3
   6717 %if ARCH_X86_64 && UNIX64
   6718    mov                  hm, hd
   6719 %elif ARCH_X86_32
   6720    SWAP                  m5, m3
   6721    mov                   r5, hm
   6722    mov          [esp+0x134], r5
   6723 %endif
   6724    jmp .dy1_hloop
   6725 .dy1_hloop_prep:
   6726    dec   dword [rsp+0x090]
   6727    jz .ret
   6728 %if ARCH_X86_64
   6729    add   qword [rsp+0x130], 8*(isprep+1)
   6730    mov                  hd, hm
   6731 %else
   6732    add   dword [rsp+0x130], 8*(isprep+1)
   6733    mov                  r5, [esp+0x134]
   6734    mov                  r0, [esp+0x130]
   6735 %endif
   6736    mova                 m7, [rsp+0x100]
   6737    mova                m14, [rsp+0x110]
   6738 %if ARCH_X86_64
   6739    mova                m10, [base+pd_0x3ff]
   6740 %else
   6741 %define m10 [base+pd_0x3ff]
   6742 %endif
   6743    mova                m15, [rsp+0x120]
   6744    mov                srcq, [rsp+0x098]
   6745 %if ARCH_X86_64
   6746    mov                 r0q, [rsp+0x130] ; dstq / tmpq
   6747 %else
   6748    mov                  hm, r5
   6749    mov                 r0m, r0
   6750    mov                  r3, r3m
   6751 %endif
   6752    paddd               m14, m7
   6753 .dy1_hloop:
   6754    pxor                 m9, m9
   6755 %if ARCH_X86_64
   6756    mova                m11, [base+pq_0x40000000]
   6757 %else
   6758 %define m11 [base+pq_0x40000000]
   6759 %endif
   6760    psrld                m2, m14, 10
   6761    mova              [rsp], m2
   6762    pand                 m6, m14, m10
   6763    psrld                m6, 6
   6764    paddd                m5, m15, m6
   6765    pcmpeqd              m6, m9
   6766    psrldq               m2, m5, 8
   6767 %if ARCH_X86_64
   6768    movd                r4d, m5
   6769    movd                r6d, m2
   6770    psrldq               m5, 4
   6771    psrldq               m2, 4
   6772    movd                r7d, m5
   6773    movd                r9d, m2
   6774    movq                 m0, [base+subpel_filters+r4*8]
   6775    movq                 m1, [base+subpel_filters+r6*8]
   6776    movhps               m0, [base+subpel_filters+r7*8]
   6777    movhps               m1, [base+subpel_filters+r9*8]
   6778 %else
   6779    movd                 r0, m5
   6780    movd                 rX, m2
   6781    psrldq               m5, 4
   6782    psrldq               m2, 4
   6783    movd                 r4, m5
   6784    movd                 r5, m2
   6785    movq                 m0, [base+subpel_filters+r0*8]
   6786    movq                 m1, [base+subpel_filters+rX*8]
   6787    movhps               m0, [base+subpel_filters+r4*8]
   6788    movhps               m1, [base+subpel_filters+r5*8]
   6789    pxor                 m2, m2
   6790 %define m9 m2
   6791 %endif
   6792    paddd               m14, m7 ; mx+dx*[4-7]
   6793    pand                 m5, m14, m10
   6794    psrld                m5, 6
   6795    paddd               m15, m5
   6796    pcmpeqd              m5, m9
   6797    mova        [rsp+0x110], m14
   6798    psrldq               m4, m15, 8
   6799 %if ARCH_X86_64
   6800    movd               r10d, m15
   6801    movd               r11d, m4
   6802    psrldq              m15, 4
   6803    psrldq               m4, 4
   6804    movd               r13d, m15
   6805    movd                rXd, m4
   6806    movq                 m2, [base+subpel_filters+r10*8]
   6807    movq                 m3, [base+subpel_filters+r11*8]
   6808    movhps               m2, [base+subpel_filters+r13*8]
   6809    movhps               m3, [base+subpel_filters+ rX*8]
   6810    psrld               m14, 10
   6811    psrldq               m4, m14, 8
   6812    movd               r10d, m14
   6813    movd               r11d, m4
   6814    psrldq              m14, 4
   6815    psrldq               m4, 4
   6816    movd               r13d, m14
   6817    movd                rXd, m4
   6818    mov                 r4d, [rsp+ 0]
   6819    mov                 r6d, [rsp+ 8]
   6820    mov                 r7d, [rsp+ 4]
   6821    mov                 r9d, [rsp+12]
   6822    pshufd               m4, m6, q1100
   6823    pshufd               m6, m6, q3322
   6824    pshufd               m7, m5, q1100
   6825    pshufd               m5, m5, q3322
   6826    pand                 m8, m11, m4
   6827    pand                 m9, m11, m6
   6828    pand                m15, m11, m7
   6829    pand                m11, m11, m5
   6830    pandn                m4, m0
   6831    pandn                m6, m1
   6832    pandn                m7, m2
   6833    pandn                m5, m3
   6834    por                  m8, m4
   6835    por                  m9, m6
   6836    por                 m15, m7
   6837    por                 m11, m5
   6838    mova         [rsp+0x10], m8
   6839    mova         [rsp+0x20], m9
   6840    mova         [rsp+0x30], m15
   6841    mova         [rsp+0x40], m11
   6842    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
   6843    mova         [rsp+0x50], m1
   6844    mova         [rsp+0x60], m2
   6845    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
   6846    mova         [rsp+0x70], m3
   6847    mova         [rsp+0x80], m4
   6848    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
   6849    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
   6850    SWAP                 m7, m0
   6851    SWAP                 m8, m14
   6852    mova                 m1, [rsp+0x50]
   6853    mova                 m2, [rsp+0x60]
   6854    mova                 m3, [rsp+0x70]
   6855    mova                m15, [rsp+0x80]
   6856    punpcklwd            m4, m5, m6 ; 45a
   6857    punpckhwd            m5, m6     ; 45b
   6858    punpcklwd            m6, m7, m8 ; 67a
   6859    punpckhwd            m7, m8     ; 67b
   6860    SWAP                m14, m8
   6861    mova                 m8, [rsp+0x140]
   6862    mova                 m9, [rsp+0x150]
   6863    mova                m10, [rsp+0x160]
   6864    mova                m11, [rsp+0x170]
   6865    punpcklwd            m0, m1, m2 ; 01a
   6866    punpckhwd            m1, m2     ; 01b
   6867    punpcklwd            m2, m3, m15; 23a
   6868    punpckhwd            m3, m15    ; 23b
   6869    mova         [rsp+0x50], m4
   6870    mova         [rsp+0x60], m5
   6871    mova         [rsp+0x70], m6
   6872    mova         [rsp+0x80], m7
   6873    mova                m14, [base+unpckw]
   6874 %else
   6875    movd                 r0, m15
   6876    movd                 rX, m4
   6877    psrldq              m15, 4
   6878    psrldq               m4, 4
   6879    movd                 r4, m15
   6880    movd                 r5, m4
   6881    mova                m14, [esp+0x110]
   6882    movq                 m2, [base+subpel_filters+r0*8]
   6883    movq                 m3, [base+subpel_filters+rX*8]
   6884    movhps               m2, [base+subpel_filters+r4*8]
   6885    movhps               m3, [base+subpel_filters+r5*8]
   6886    psrld               m14, 10
   6887    mova           [esp+16], m14
   6888    mov                  r0, [esp+ 0]
   6889    mov                  rX, [esp+ 8]
   6890    mov                  r4, [esp+ 4]
   6891    mov                  r5, [esp+12]
   6892    mova         [esp+0x20], m0
   6893    mova         [esp+0x30], m1
   6894    mova         [esp+0x40], m2
   6895    mova         [esp+0x50], m3
   6896    pshufd               m4, m6, q1100
   6897    pshufd               m6, m6, q3322
   6898    pshufd               m7, m5, q1100
   6899    pshufd               m5, m5, q3322
   6900    pand                 m0, m11, m4
   6901    pand                 m1, m11, m6
   6902    pand                 m2, m11, m7
   6903    pand                 m3, m11, m5
   6904    pandn                m4, [esp+0x20]
   6905    pandn                m6, [esp+0x30]
   6906    pandn                m7, [esp+0x40]
   6907    pandn                m5, [esp+0x50]
   6908    por                  m0, m4
   6909    por                  m1, m6
   6910    por                  m2, m7
   6911    por                  m3, m5
   6912    mova        [esp+0x20], m0
   6913    mova        [esp+0x30], m1
   6914    mova        [esp+0x40], m2
   6915    mova        [esp+0x50], m3
   6916    MC_8TAP_SCALED_H   0x20, 0x60, 0 ; 0-1
   6917    MC_8TAP_SCALED_H   0x20, 0x180   ; 2-3
   6918    MC_8TAP_SCALED_H   0x20, 0x1a0   ; 4-5
   6919    MC_8TAP_SCALED_H   0x20, 0x1c0   ; 6-7
   6920    mova                 m5, [esp+0x1a0]
   6921    mova                 m6, [esp+0x1b0]
   6922    mova                 m7, [esp+0x1c0]
   6923    mova                 m0, [esp+0x1d0]
   6924    punpcklwd            m4, m5, m6      ; 45a
   6925    punpckhwd            m5, m6          ; 45b
   6926    punpcklwd            m6, m7, m0      ; 67a
   6927    punpckhwd            m7, m0          ; 67b
   6928    mova        [esp+0x1a0], m4
   6929    mova        [esp+0x1b0], m5
   6930    mova        [esp+0x1c0], m6
   6931    mova        [esp+0x1d0], m7
   6932    mova                 m1, [esp+0x060]
   6933    mova                 m2, [esp+0x070]
   6934    mova                 m3, [esp+0x180]
   6935    mova                 m4, [esp+0x190]
   6936    punpcklwd            m0, m1, m2      ; 01a
   6937    punpckhwd            m1, m2          ; 01b
   6938    punpcklwd            m2, m3, m4      ; 23a
   6939    punpckhwd            m3, m4          ; 23b
   6940    mova        [esp+0x060], m0
   6941    mova        [esp+0x070], m1
   6942    mova        [esp+0x180], m2
   6943    mova        [esp+0x190], m3
   6944 %define m8  [esp+0x140]
   6945 %define m9  [esp+0x150]
   6946 %define m10 [esp+0x160]
   6947 %define m11 [esp+0x170]
   6948 %endif
   6949 .dy1_vloop:
   6950 %if ARCH_X86_32
   6951    mov                  r0, r0m
   6952 %endif
   6953    pmaddwd              m4, m0, m8
   6954    pmaddwd              m5, m1, m8
   6955    pmaddwd              m6, m2, m9
   6956    pmaddwd              m7, m3, m9
   6957    paddd                m4, m13
   6958    paddd                m5, m13
   6959    paddd                m4, m6
   6960    paddd                m5, m7
   6961 %if ARCH_X86_64
   6962    pmaddwd              m6, [rsp+0x50], m10
   6963    pmaddwd              m7, [rsp+0x60], m10
   6964 %else
   6965    pmaddwd              m6, [rsp+0x1a0], m10
   6966    pmaddwd              m7, [rsp+0x1b0], m10
   6967 %endif
   6968    paddd                m4, m6
   6969    paddd                m5, m7
   6970 %if ARCH_X86_64
   6971    pmaddwd              m6, [rsp+0x70], m11
   6972    pmaddwd              m7, [rsp+0x80], m11
   6973 %else
   6974    pmaddwd              m6, [rsp+0x1c0], m11
   6975    pmaddwd              m7, [rsp+0x1d0], m11
   6976 %endif
   6977    paddd                m4, m6
   6978    paddd                m5, m7
   6979    psrad                m4, rndshift
   6980    psrad                m5, rndshift
   6981    packssdw             m4, m5
   6982 %ifidn %1, put
   6983    packuswb             m4, m4
   6984    movq             [dstq], m4
   6985    add                dstq, dsm
   6986 %else
   6987    mova             [tmpq], m4
   6988    add                tmpq, tmp_stridem
   6989 %endif
   6990 %if ARCH_X86_32
   6991    mov                 r0m, r0
   6992 %endif
   6993    dec                  hd
   6994    jz .dy1_hloop_prep
   6995 %if ARCH_X86_64
   6996    movq                 m4, [srcq+ r4]
   6997    movq                 m5, [srcq+ r6]
   6998    movhps               m4, [srcq+ r7]
   6999    movhps               m5, [srcq+ r9]
   7000    movq                 m6, [srcq+r10]
   7001    movq                 m7, [srcq+r11]
   7002    movhps               m6, [srcq+r13]
   7003    movhps               m7, [srcq+ rX]
   7004    add                srcq, ssq
   7005    pshufd              m15, m14, q1032
   7006    pshufb               m0, m14                ; 0a 1a
   7007    pshufb               m1, m14                ; 0b 1b
   7008    pshufb               m2, m15                ; 3a 2a
   7009    pshufb               m3, m15                ; 3b 2b
   7010    pmaddubsw            m4, [rsp+0x10]
   7011    pmaddubsw            m5, [rsp+0x20]
   7012    pmaddubsw            m6, [rsp+0x30]
   7013    pmaddubsw            m7, [rsp+0x40]
   7014    phaddw               m4, m5
   7015    phaddw               m6, m7
   7016    phaddw               m4, m6
   7017    pmulhrsw             m4, m12
   7018    pshufb               m5, [rsp+0x70], m15    ; 7a 6a
   7019    pshufb               m7, [rsp+0x80], m15    ; 7b 6b
   7020    pshufb               m6, [rsp+0x50], m14    ; 4a 5a
   7021    pshufb              m15, [rsp+0x60], m14    ; 4b 5b
   7022    punpckhwd            m0, m2  ; 12a
   7023    punpckhwd            m1, m3  ; 12b
   7024    punpcklwd            m2, m6  ; 34a
   7025    punpcklwd            m3, m15 ; 34b
   7026    punpckhwd            m6, m5  ; 56a
   7027    punpckhwd           m15, m7  ; 56b
   7028    punpcklwd            m5, m4  ; 78a
   7029    psrldq               m4, 8
   7030    punpcklwd            m7, m4  ; 78b
   7031    mova         [rsp+0x50], m6
   7032    mova         [rsp+0x60], m15
   7033    mova         [rsp+0x70], m5
   7034    mova         [rsp+0x80], m7
   7035 %else
   7036    mov                  r0, [esp+ 0]
   7037    mov                  rX, [esp+ 8]
   7038    mov                  r4, [esp+ 4]
   7039    mov                  r5, [esp+12]
   7040    mova                 m6, [base+unpckw]
   7041    mova                 m0, [esp+0x060]
   7042    mova                 m1, [esp+0x070]
   7043    mova                 m7, [esp+0x1a0]
   7044    movq                 m4, [srcq+r0]
   7045    movq                 m5, [srcq+rX]
   7046    movhps               m4, [srcq+r4]
   7047    movhps               m5, [srcq+r5]
   7048    pshufb               m0, m6         ; 0a 1a
   7049    pshufb               m1, m6         ; 0b 1b
   7050    pshufb               m7, m6         ; 4a 5a
   7051    mov                  r0, [esp+16]
   7052    mov                  rX, [esp+24]
   7053    mov                  r4, [esp+20]
   7054    mov                  r5, [esp+28]
   7055    movq                 m3, [srcq+r0]
   7056    movq                 m2, [srcq+rX]
   7057    movhps               m3, [srcq+r4]
   7058    movhps               m2, [srcq+r5]
   7059    add                srcq, ssq
   7060    pmaddubsw            m4, [esp+0x20]
   7061    pmaddubsw            m5, [esp+0x30]
   7062    pmaddubsw            m3, [esp+0x40]
   7063    pmaddubsw            m2, [esp+0x50]
   7064    phaddw               m4, m5
   7065    phaddw               m3, m2
   7066    mova                 m5, [esp+0x1b0]
   7067    mova                 m2, [esp+0x180]
   7068    phaddw               m4, m3
   7069    mova                 m3, [esp+0x190]
   7070    pmulhrsw             m4, m12        ; 8a 8b
   7071    pshufb               m5, m6         ; 4b 5b
   7072    pshufd               m6, m6, q1032
   7073    pshufb               m2, m6         ; 3a 2a
   7074    pshufb               m3, m6         ; 3b 2b
   7075    punpckhwd            m0, m2         ; 12a
   7076    punpckhwd            m1, m3         ; 12b
   7077    mova         [esp+0x60], m0
   7078    mova         [esp+0x70], m1
   7079    mova                 m0, [esp+0x1c0]
   7080    mova                 m1, [esp+0x1d0]
   7081    punpcklwd            m2, m7         ; 34a
   7082    punpcklwd            m3, m5         ; 34b
   7083    mova        [esp+0x180], m2
   7084    mova        [esp+0x190], m3
   7085    pshufb               m0, m6         ; 7a 6a
   7086    pshufb               m1, m6         ; 7b 6b
   7087    punpckhwd            m7, m0         ; 56a
   7088    punpckhwd            m5, m1         ; 56b
   7089    punpcklwd            m0, m4
   7090    punpckhqdq           m4, m4
   7091    punpcklwd            m1, m4
   7092    mova        [esp+0x1a0], m7
   7093    mova        [esp+0x1b0], m5
   7094    mova        [esp+0x1c0], m0
   7095    mova        [esp+0x1d0], m1
   7096    mova                 m0, [esp+0x60]
   7097    mova                 m1, [esp+0x70]
   7098 %endif
   7099    jmp .dy1_vloop
   7100 INIT_XMM ssse3
; .dy2: entry point for the vertical-scaling case where exactly two source
; rows are stepped per output row. Dispatch to the width-specialized
; implementation via the per-width jump table (one word offset per width,
; rebased against base_reg).
   7101 .dy2:
   7102    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
   7103    add                  wq, base_reg
   7104    jmp                  wq
   7105 %ifidn %1, put
; .dy2_w2: 2-pixel-wide case (compiled for put only, see %ifidn above).
; Each loop iteration consumes 4 new source rows and emits 2 output rows
; (dy2 == two source rows per output row).
; On x86-32 the xmm registers m8-m15 do not exist, so they are remapped
; below onto m0-m7 / memory slots via %define; do not reorder code across
; those remappings.
   7106 .dy2_w2:
   7107 %if ARCH_X86_64
   7108    mov                 myd, mym
   7109    movzx               t0d, t0b
   7110    dec                srcq
   7111    movd                m15, t0d
   7112 %else
   7113  %define m10 [base+pd_0x3ff]
   7114  %define m11 [base+pd_0x4000]
   7115  %define m8  m0
   7116  %define m9  m1
   7117  %define m14 m4
   7118  %define m15 m3
   7119    movzx                r5, byte [esp+0x1f0]
   7120    dec                srcd
   7121    movd                m15, r5
   7122 %endif
   7123    punpckldq            m9, m8
   7124    SWAP                 m8, m9
   7125    paddd               m14, m8 ; mx+dx*[0-1]
   7126 %if ARCH_X86_64
   7127    mova                m11, [base+pd_0x4000]
   7128 %endif
; Horizontal filter selection: per-column subpel index = (mx & 0x3ff) >> 6,
; used to fetch a 4-tap slice (offset +2) of the 8-tap subpel_filters table.
   7129    pshufd              m15, m15, q0000
   7130    pand                 m8, m14, m10
   7131    psrld                m8, 6
   7132    paddd               m15, m8
   7133    movd                r4d, m15
   7134    psrldq              m15, 4
   7135 %if ARCH_X86_64
   7136    movd                r6d, m15
   7137 %else
   7138    movd                r3d, m15
   7139 %endif
   7140    mova                 m5, [base+bdct_lb_dw]
   7141    mova                 m6, [base+subpel_s_shuf2]
   7142    movd                m15, [base+subpel_filters+r4*8+2]
   7143 %if ARCH_X86_64
   7144    movd                 m7, [base+subpel_filters+r6*8+2]
   7145 %else
   7146    movd                 m7, [base+subpel_filters+r3*8+2]
   7147 %endif
; m8 becomes a mask of columns whose subpel index is zero; those columns
; later take the pd_0x4000 constant instead of the table coefficients
; (see the pand/pandn/por blend below).
   7148    pxor                 m9, m9
   7149    pcmpeqd              m8, m9
   7150    psrld               m14, 10
   7151 %if ARCH_X86_32
   7152    mov                  r3, r3m
   7153    pshufb              m14, m5
   7154    paddb               m14, m6
   7155    mova         [esp+0x00], m14
   7156  %define m14 [esp+0x00]
   7157    SWAP                 m5, m0
   7158    SWAP                 m6, m3
   7159  %define m8  m5
   7160  %define m15 m6
   7161 %endif
; Prologue loads: rows 0-3 packed as (0,2) and (1,3) ...
   7162    movq                 m0, [srcq+ssq*0]
   7163    movq                 m1, [srcq+ssq*1]
   7164    movhps               m0, [srcq+ssq*2]
   7165    movhps               m1, [srcq+ss3q ]
   7166    lea                srcq, [srcq+ssq*4]
; Vertical 8-tap coefficients: default is the identity filter (64 << 24);
; a nonzero my (after >> 6) selects subpel_filters[.] via cmovnz.
   7167 %if ARCH_X86_64
   7168    shr                 myd, 6
   7169    mov                 r4d, 64 << 24
   7170    lea                 myd, [t1+myq]
   7171    cmovnz              r4q, [base+subpel_filters+myq*8]
   7172    pshufb              m14, m5
   7173    paddb               m14, m6
   7174    movq                m10, r4q
   7175 %else
   7176    mov                 myd, mym
   7177    mov                  r3, [esp+0x1f4]
   7178    xor                  r5, r5
   7179    shr                 myd, 6
   7180    lea                  r3, [r3+myd]
   7181    mov                  r4, 64 << 24
   7182    cmovnz               r4, [base+subpel_filters+r3*8+0]
   7183    cmovnz               r5, [base+subpel_filters+r3*8+4]
   7184    mov                  r3, r3m
   7185  %define m10 m4
   7186    movd                m10, r4
   7187    movd                 m3, r5
   7188    punpckldq           m10, m3
   7189 %endif
; ... rows 4-5 packed into m3.
   7190    movq                 m3, [srcq+ssq*0]
   7191    movhps               m3, [srcq+ssq*1]
   7192    lea                srcq, [srcq+ssq*2]
; Sign-extend the byte coefficients to words (punpcklbw + psraw 8).
   7193    punpcklbw           m10, m10
   7194    psraw               m10, 8
   7195    punpckldq           m15, m7
   7196    punpcklqdq          m15, m15
; Blend: columns with zero subpel offset use pd_0x4000, others the table
; filter (m8 is the zero-offset mask built above).
   7197 %if ARCH_X86_64
   7198    pand                m11, m8
   7199 %else
   7200    pand                 m7, m11, m8
   7201  %define m11 m7
   7202 %endif
   7203    pandn                m8, m15
   7204    SWAP                m15, m8
   7205    por                 m15, m11
; Broadcast the four vertical coefficient pairs into m8/m9/m10/m11.
   7206 %if ARCH_X86_64
   7207    pshufd               m8, m10, q0000
   7208    pshufd               m9, m10, q1111
   7209    pshufd              m11, m10, q3333
   7210    pshufd              m10, m10, q2222
   7211 %else
   7212    mova         [esp+0x10], m15
   7213  %define m15 [esp+0x10]
   7214    mov                  r5, r0m
   7215  %define dstq r5
   7216    mov                 dsd, dsm
   7217    pshufd               m5, m4, q0000
   7218    pshufd               m6, m4, q1111
   7219    pshufd               m7, m4, q2222
   7220    pshufd               m4, m4, q3333
   7221  %define m8  [esp+0x20]
   7222  %define m9  [esp+0x30]
   7223  %define m10 [esp+0x40]
   7224  %define m11 [esp+0x50]
   7225    mova                 m8, m5
   7226    mova                 m9, m6
   7227    mova                m10, m7
   7228    mova                m11, m4
   7229 %endif
; Horizontal filter the 6 prologue rows and round (pmulhrsw by m12).
   7230    pshufb               m0, m14
   7231    pshufb               m1, m14
   7232    pshufb               m3, m14
   7233    pmaddubsw            m0, m15
   7234    pmaddubsw            m1, m15
   7235    pmaddubsw            m3, m15
   7236    pslldq               m2, m3, 8
   7237    phaddw               m0, m2
   7238    phaddw               m1, m3
   7239    pmulhrsw             m0, m12            ; 0 2 _ 4
   7240    pmulhrsw             m1, m12            ; 1 3 _ 5
   7241    pshufd               m2, m0, q3110      ; 0 2 2 4
   7242    pshufd               m1, m1, q3110      ; 1 3 3 5
   7243    punpcklwd            m3, m2, m1         ; 01 23
   7244    punpckhwd            m2, m1             ; 23 45
; Main loop: 4 new source rows in, 2 output rows out per iteration.
   7245 .dy2_w2_loop:
   7246    movq                 m6, [srcq+ssq*0]
   7247    movq                 m7, [srcq+ssq*1]
   7248    movhps               m6, [srcq+ssq*2]
   7249    movhps               m7, [srcq+ss3q ]
   7250    lea                srcq, [srcq+ssq*4]
   7251    pmaddwd              m4, m3, m8
   7252    pmaddwd              m5, m2, m9
   7253    pshufb               m6, m14
   7254    pshufb               m7, m14
   7255    pmaddubsw            m6, m15
   7256    pmaddubsw            m7, m15
   7257    phaddw               m6, m7
   7258    pmulhrsw             m6, m12
   7259    psrldq               m7, m6, 8
   7260    palignr              m6, m0, 8
   7261    palignr              m7, m1, 8
   7262    mova                 m0, m6
   7263    mova                 m1, m7
   7264    pshufd               m6, m6, q3221
   7265    pshufd               m7, m7, q3221
   7266    punpcklwd            m3, m6, m7       ; 45 67
   7267    punpckhwd            m2, m6, m7       ; 67 89
   7268    pmaddwd              m6, m3, m10
   7269    pmaddwd              m7, m2, m11
; Accumulate the four tap-pair products plus rounding bias m13, shift,
; pack to bytes, and store two 2-pixel rows as 16-bit words.
   7270    paddd                m4, m5
   7271    paddd                m4, m13
   7272    paddd                m6, m7
   7273    paddd                m4, m6
   7274    psrad                m4, rndshift
   7275    packssdw             m4, m4
   7276    packuswb             m4, m4
   7277    movd                r4d, m4
   7278    mov        [dstq+dsq*0], r4w
   7279    shr                 r4d, 16
   7280    mov        [dstq+dsq*1], r4w
   7281    lea                dstq, [dstq+dsq*2]
   7282    sub                  hd, 2
   7283    jg .dy2_w2_loop
   7284    RET
   7285 %endif
   7286 INIT_XMM ssse3
; .dy2_w4: 4-pixel-wide case for both put and prep (%1 selects the store
; path). Per iteration: 4 new source rows in, 2 output rows of 4 pixels out.
; Four per-column horizontal 4-tap filters are resolved from subpel_filters;
; the vertical 8-tap filter is applied in four coefficient-pair stages.
   7287 .dy2_w4:
   7288 %if ARCH_X86_64
   7289    mov                 myd, mym
   7290    movzx               t0d, t0b
   7291    dec                srcq
   7292    movd                m15, t0d
   7293 %else
; x86-32 register remapping: m8-m15 aliased onto low registers / memory.
   7294 %define m10 [base+pd_0x3ff]
   7295 %define m11 [base+pd_0x4000]
   7296 %define m8  m0
   7297 %xdefine m14 m4
   7298 %define m15 m3
   7299 %define dstq r0
   7300 %if isprep
   7301  %define ssq r3
   7302 %endif
   7303    movzx                r4, byte [esp+0x1f0]
   7304    dec                srcq
   7305    movd                m15, r4
   7306 %endif
; mx + dx*[0-3]: per-column x positions; filter index = (mx & 0x3ff) >> 6.
   7307    pmaddwd              m8, [base+rescale_mul]
   7308 %if ARCH_X86_64
   7309    mova                m11, [base+pd_0x4000]
   7310 %endif
   7311    pshufd              m15, m15, q0000
   7312    paddd               m14, m8 ; mx+dx*[0-3]
   7313    pand                 m8, m14, m10
   7314    psrld                m8, 6
   7315    paddd               m15, m8
   7316    psrldq               m7, m15, 8
; Extract the four filter indices to GPRs and load the 4-tap slices.
   7317 %if ARCH_X86_64
   7318    movd                r4d, m15
   7319    movd               r11d, m7
   7320    psrldq              m15, 4
   7321    psrldq               m7, 4
   7322    movd                r6d, m15
   7323    movd               r13d, m7
   7324    movd                m15, [base+subpel_filters+ r4*8+2]
   7325    movd                 m2, [base+subpel_filters+r11*8+2]
   7326    movd                 m3, [base+subpel_filters+ r6*8+2]
   7327    movd                 m4, [base+subpel_filters+r13*8+2]
   7328    movq                 m6, [base+subpel_s_shuf2]
; Vertical filter: identity (64 << 24) unless my >> 6 is nonzero.
   7329    shr                 myd, 6
   7330    mov                 r4d, 64 << 24
   7331    lea                 myd, [t1+myq]
   7332    cmovnz              r4q, [base+subpel_filters+myq*8]
   7333 %else
   7334    movd                 r1, m15
   7335    movd                 r3, m7
   7336    psrldq              m15, 4
   7337    psrldq               m7, 4
   7338    movd                 r4, m15
   7339    movd                 r5, m7
   7340 %define m15 m5
   7341    SWAP                 m4, m7
   7342    movd                m15, [base+subpel_filters+r1*8+2]
   7343    movd                 m2, [base+subpel_filters+r3*8+2]
   7344    movd                 m3, [base+subpel_filters+r4*8+2]
   7345    movd                 m4, [base+subpel_filters+r5*8+2]
   7346    movq                 m6, [base+subpel_s_shuf2]
   7347    mov                 myd, mym
   7348    mov                  r3, [esp+0x1f4]
   7349    xor                  r5, r5
   7350    shr                 myd, 6
   7351    lea                  r3, [r3+myd]
   7352    mov                  r4, 64 << 24
   7353    cmovnz               r4, [base+subpel_filters+r3*8+0]
   7354    cmovnz               r5, [base+subpel_filters+r3*8+4]
   7355    mov                  r3, r3m
   7356 %if isprep
   7357    lea                ss3q, [ssq*3]
   7358 %endif
   7359 %endif
; Interleave the four per-column 4-tap filters into one register.
   7360    punpckldq           m15, m3
   7361    punpckldq            m2, m4
   7362    punpcklqdq          m15, m2
   7363 %if ARCH_X86_64
; Zero-offset columns take pd_0x4000 instead of the table filter
; (m8 = zero mask, blended via pand/pandn/por below).
   7364    pcmpeqd              m8, m9
   7365    psrld               m14, 10
; Prologue: load and horizontally filter source rows 0-5.
   7366    movu                 m0, [srcq+ssq*0]
   7367    movu                 m2, [srcq+ssq*2]
   7368    movu                 m1, [srcq+ssq*1]
   7369    movu                 m3, [srcq+ss3q ]
   7370    lea                srcq, [srcq+ssq*4]
   7371    punpcklqdq           m6, m6
   7372    pshufb              m14, [base+bdct_lb_dw]
   7373    movu                 m4, [srcq+ssq*0]
   7374    movu                 m5, [srcq+ssq*1]
   7375    lea                srcq, [srcq+ssq*2]
   7376    pand                m11, m8
   7377    pandn                m8, m15
   7378    SWAP                m15, m8
   7379    por                 m15, m11
   7380    paddb               m14, m6
   7381    movq                m11, r4q
   7382    punpcklbw           m11, m11
   7383    psraw               m11, 8
   7384    pshufb               m0, m14
   7385    pshufb               m2, m14
   7386    pshufb               m1, m14
   7387    pshufb               m3, m14
   7388    pshufb               m4, m14
   7389    pshufb               m5, m14
   7390    pmaddubsw            m0, m15
   7391    pmaddubsw            m2, m15
   7392    pmaddubsw            m1, m15
   7393    pmaddubsw            m3, m15
   7394    pmaddubsw            m4, m15
   7395    pmaddubsw            m5, m15
   7396    phaddw               m0, m2
   7397    phaddw               m1, m3
   7398    phaddw               m4, m5
   7399    pmulhrsw             m0, m12    ; 0 2
   7400    pmulhrsw             m1, m12    ; 1 3
   7401    pmulhrsw             m4, m12    ; 4 5
; Broadcast vertical coefficient pairs into m8..m11.
   7402    pshufd               m8, m11, q0000
   7403    pshufd               m9, m11, q1111
   7404    pshufd              m10, m11, q2222
   7405    pshufd              m11, m11, q3333
   7406 %else
; x86-32 version of the same prologue, with the horizontal shuffle mask
; and blended filter spilled to [esp+0x00]/[esp+0x10].
   7407    pxor                 m3, m3
   7408    pcmpeqd              m8, m3
   7409    psrld               m14, 10
   7410    pshufb              m14, [base+bdct_lb_dw]
   7411    movu                 m1, [srcq+ssq*0]
   7412    movu                 m2, [srcq+ssq*2]
   7413    movu                 m3, [srcq+ssq*1]
   7414    add                srcq, ss3q
   7415    punpcklqdq           m6, m6
   7416    SWAP                 m4, m7
   7417    pand                 m7, m11, m8
   7418    pandn                m8, m15
   7419    SWAP                m15, m8
   7420    por                 m15, m7
   7421    paddb               m14, m6
   7422    movu                 m0, [srcq+ssq*0]
   7423    movu                 m7, [srcq+ssq*1]
   7424    movu                 m6, [srcq+ssq*2]
   7425    add                srcq, ss3q
   7426    pshufb               m1, m14
   7427    pshufb               m2, m14
   7428    pshufb               m3, m14
   7429    pshufb               m0, m14
   7430    pshufb               m7, m14
   7431    pshufb               m6, m14
   7432    pmaddubsw            m1, m15
   7433    pmaddubsw            m2, m15
   7434    pmaddubsw            m3, m15
   7435    mova         [esp+0x00], m14
   7436    mova         [esp+0x10], m15
   7437    pmaddubsw            m0, m15
   7438    pmaddubsw            m7, m15
   7439    pmaddubsw            m6, m15
   7440 %define m14 [esp+0x00]
   7441 %define m15 [esp+0x10]
   7442    phaddw               m1, m2
   7443    phaddw               m3, m0
   7444    phaddw               m7, m6
   7445 %ifidn %1, put
   7446    mov                 dsd, dsm
   7447  %define dstq r5
   7448 %else
   7449  %define tmpq r5
   7450 %endif
; Rebuild the 8 vertical coefficients from the r4/r5 halves.
   7451    movd                 m6, r4
   7452    movd                 m0, r5
   7453    punpckldq            m6, m0
   7454    punpcklbw            m6, m6
   7455    psraw                m6, 8
   7456    mov                  r5, r0m
   7457    pmulhrsw             m1, m12 ; 0 2
   7458    pmulhrsw             m3, m12 ; 1 3
   7459    pmulhrsw             m7, m12 ; 4 5
   7460    SWAP                 m0, m1, m3
   7461    SWAP                 m4, m7
   7462    pshufd               m2, m6, q0000
   7463    pshufd               m3, m6, q1111
   7464    pshufd               m7, m6, q2222
   7465    pshufd               m6, m6, q3333
   7466    mova         [esp+0x30], m2
   7467    mova         [esp+0x40], m3
   7468    mova         [esp+0x50], m7
   7469    mova         [esp+0x60], m6
   7470 %define m8  [esp+0x30]
   7471 %define m9  [esp+0x40]
   7472 %define m10 [esp+0x50]
   7473 %define m11 [esp+0x60]
   7474 %endif
; Pair rows for the vertical filter: 01 / 23 / 45.
   7475    psrldq               m5, m4, 8  ; 5 _
   7476    punpckhwd            m2, m0, m1 ; 23
   7477    punpcklwd            m0, m1     ; 01
   7478    punpcklwd            m4, m5     ; 45
; Main loop: two output rows (a and b accumulators) per iteration.
   7479 .dy2_w4_loop:
   7480    pmaddwd              m0, m8         ; a0
   7481    pmaddwd              m5, m2, m8     ; b0
   7482    pmaddwd              m2, m9         ; a1
   7483    pmaddwd              m7, m4, m9     ; b1
   7484    pmaddwd              m3, m4, m10    ; a2
   7485    paddd                m0, m13
   7486    paddd                m5, m13
   7487    paddd                m0, m2
   7488    paddd                m5, m7
   7489    paddd                m0, m3
; Load and horizontally filter 4 new source rows (6,7,8,9).
   7490    movu                 m6, [srcq+ssq*0]
   7491    movu                 m7, [srcq+ssq*1]
   7492    movu                 m3, [srcq+ssq*2]
   7493    movu                 m1, [srcq+ss3q ]
   7494    lea                srcq, [srcq+ssq*4]
   7495    pshufb               m6, m14
   7496    pshufb               m7, m14
   7497    pshufb               m3, m14
   7498    pshufb               m1, m14
   7499    pmaddubsw            m6, m15
   7500    pmaddubsw            m7, m15
   7501    pmaddubsw            m3, m15
   7502    pmaddubsw            m1, m15
   7503    phaddw               m6, m7
   7504    phaddw               m3, m1
   7505    pmulhrsw             m6, m12    ; 6 7
   7506    pmulhrsw             m3, m12    ; 8 9
   7507    psrldq               m7, m6, 8
   7508    psrldq               m1, m3, 8
   7509    punpcklwd            m6, m7     ; 67
   7510    punpcklwd            m3, m1     ; 89
   7511    mova                 m2, m6
   7512    pmaddwd              m1, m6, m10    ; b2
   7513    pmaddwd              m6, m11        ; a3
   7514    pmaddwd              m7, m3, m11    ; b3
   7515    paddd                m5, m1
   7516    paddd                m0, m6
   7517    paddd                m5, m7
   7518    psrad                m0, rndshift
   7519    psrad                m5, rndshift
   7520    packssdw             m0, m5
; put: pack to bytes and store two 4-pixel rows; prep: store 16 bytes of
; intermediate 16-bit samples.
   7521 %ifidn %1, put
   7522    packuswb             m0, m0
   7523    psrldq               m1, m0, 4
   7524    movd       [dstq+dsq*0], m0
   7525    movd       [dstq+dsq*1], m1
   7526    lea                dstq, [dstq+dsq*2]
   7527 %else
   7528    mova             [tmpq], m0
   7529    add                tmpq, 16
   7530 %endif
; Slide the row-pair window: 45->01-slot, 89->45-slot.
   7531    mova                 m0, m4
   7532    mova                 m4, m3
   7533    sub                  hd, 2
   7534    jg .dy2_w4_loop
   7535    MC_8TAP_SCALED_RET
   7536 INIT_XMM ssse3
; Widths 8..128 share one implementation that iterates over the block in
; 8-pixel column chunks. [rsp+0x90] holds the chunk count (w/8) and
; tmp_stridem the prep tmp row stride in bytes (w*2). .dy2_w128 falls
; through into .dy2_w_start.
   7537 .dy2_w8:
   7538    mov    dword [rsp+0x90], 1
   7539    movifprep   tmp_stridem, 16
   7540    jmp .dy2_w_start
   7541 .dy2_w16:
   7542    mov    dword [rsp+0x90], 2
   7543    movifprep   tmp_stridem, 32
   7544    jmp .dy2_w_start
   7545 .dy2_w32:
   7546    mov    dword [rsp+0x90], 4
   7547    movifprep   tmp_stridem, 64
   7548    jmp .dy2_w_start
   7549 .dy2_w64:
   7550    mov    dword [rsp+0x90], 8
   7551    movifprep   tmp_stridem, 128
   7552    jmp .dy2_w_start
   7553 .dy2_w128:
   7554    mov    dword [rsp+0x90], 16
   7555    movifprep   tmp_stridem, 256
; Common setup for the chunked (w >= 8) dy2 path. Resolves the vertical
; 8-tap filter, precomputes dx*4 (per-chunk mx step) and saves the loop
; state to the stack frame:
;   [rsp+0x100] = dx*4          [rsp+0x120] = mx filter base
;   [rsp+0x098] = srcq          [rsp+0x130] = dstq / tmpq
;   [rsp+0x140..0x170] = the four broadcast vertical coefficient pairs
   7556 .dy2_w_start:
   7557    mov                 myd, mym
   7558 %ifidn %1, put
   7559    movifnidn           dsm, dsq
   7560 %endif
   7561 %if ARCH_X86_64
   7562    shr                 t0d, 16
   7563    sub                srcq, 3
   7564    shr                 myd, 6
   7565    mov                 r4d, 64 << 24
   7566    lea                 myd, [t1+myq]
   7567    cmovnz              r4q, [base+subpel_filters+myq*8]
   7568    movd                m15, t0d
   7569 %else
; x86-32 register remapping for this path.
   7570 %define m10 [base+pd_0x3ff]
   7571 %define m11 [base+pd_0x4000]
   7572 %define m8   m0
   7573 %define m9   m1
   7574 %xdefine m14 m4
   7575 %xdefine m15 m3
   7576 %if isprep
   7577  %define tmpq r0
   7578  %define ssq ssm
   7579 %else
   7580  %define dstq r0
   7581 %endif
   7582    mov                  r5, [esp+0x1f0]
   7583    mov                  r3, [esp+0x1f4]
   7584    shr                  r5, 16
   7585    sub                srcq, 3
   7586    movd                m15, r5
   7587    xor                  r5, r5
   7588    shr                 myd, 6
   7589    lea                  r3, [r3+myd]
   7590    mov                  r4, 64 << 24
   7591    cmovnz               r4, [base+subpel_filters+r3*8+0]
   7592    cmovnz               r5, [base+subpel_filters+r3*8+4]
   7593    mov                  r0, r0m
   7594    mov                  r3, r3m
   7595 %endif
   7596    pslld                m7, m8, 2 ; dx*4
   7597    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
   7598    pshufd              m15, m15, q0000
   7599    paddd               m14, m8 ; mx+dx*[0-3]
; Sign-extend the vertical byte coefficients to words.
   7600 %if ARCH_X86_64
   7601    movq                 m3, r4q
   7602    punpcklbw            m3, m3
   7603    psraw                m3, 8
   7604 %else
   7605    movd                 m5, r4
   7606    movd                 m6, r5
   7607    punpckldq            m5, m6
   7608    punpcklbw            m5, m5
   7609    psraw                m5, 8
   7610    SWAP                 m3, m5
   7611 %endif
   7612    mova        [rsp+0x100], m7
   7613    mova        [rsp+0x120], m15
   7614    mov         [rsp+0x098], srcq
   7615    mov         [rsp+0x130], r0q ; dstq / tmpq
   7616    pshufd               m0, m3, q0000
   7617    pshufd               m1, m3, q1111
   7618    pshufd               m2, m3, q2222
   7619    pshufd               m3, m3, q3333
   7620    mova        [rsp+0x140], m0
   7621    mova        [rsp+0x150], m1
   7622    mova        [rsp+0x160], m2
   7623    mova        [rsp+0x170], m3
; Preserve the row count so each chunk can restart with the full height.
   7624 %if ARCH_X86_64 && UNIX64
   7625    mov                  hm, hd
   7626 %elif ARCH_X86_32
   7627    SWAP                  m5, m3
   7628    mov                   r5, hm
   7629    mov          [esp+0x134], r5
   7630 %endif
   7631    jmp .dy2_hloop
; Advance to the next 8-pixel column chunk: decrement the chunk counter,
; exit via .ret when done, otherwise bump the output pointer by 8 pixels
; (1 byte/px for put, 2 bytes/px for prep tmp — hence 8*(isprep+1)),
; restore the saved per-chunk state and step mx forward by dx*4 so m14
; again holds this chunk's mx+dx*[0-3].
   7632 .dy2_hloop_prep:
   7633    dec   dword [rsp+0x090]
   7634    jz .ret
   7635 %if ARCH_X86_64
   7636    add   qword [rsp+0x130], 8*(isprep+1)
   7637    mov                  hd, hm
   7638 %else
   7639    add   dword [rsp+0x130], 8*(isprep+1)
   7640    mov                  r5, [esp+0x134]
   7641    mov                  r0, [esp+0x130]
   7642 %endif
   7643    mova                 m7, [rsp+0x100]
   7644    mova                m14, [rsp+0x110]
   7645 %if ARCH_X86_64
   7646    mova                m10, [base+pd_0x3ff]
   7647 %else
   7648 %define m10 [base+pd_0x3ff]
   7649 %endif
   7650    mova                m15, [rsp+0x120]
   7651    mov                srcq, [rsp+0x098]
   7652 %if ARCH_X86_64
   7653    mov                 r0q, [rsp+0x130] ; dstq / tmpq
   7654 %else
   7655    mov                  hm, r5
   7656    mov                 r0m, r0
   7657    mov                  r3, r3m
   7658 %endif
   7659    paddd               m14, m7
; Per-chunk horizontal setup: resolve the eight per-column 8-tap horizontal
; filters (columns 0-3 from m14, then m14 += dx*4 for columns 4-7). Columns
; whose subpel index is zero take the pq_0x40000000 identity constant
; instead of a table filter (pand/pandn/por blends below). The first 8
; source rows are then horizontally filtered via MC_8TAP_SCALED_H and
; interleaved into 01/23/45/67 a/b word pairs for the vertical loop.
   7660 .dy2_hloop:
   7661    pxor                 m9, m9
   7662 %if ARCH_X86_64
   7663    mova                m11, [base+pq_0x40000000]
   7664 %else
   7665 %define m11 [base+pq_0x40000000]
   7666 %endif
; [rsp+0..15]: integer x positions (mx >> 10) for columns 0-3.
   7667    psrld                m2, m14, 10
   7668    mova              [rsp], m2
   7669    pand                 m6, m14, m10
   7670    psrld                m6, 6
   7671    paddd                m5, m15, m6
   7672    pcmpeqd              m6, m9
   7673    psrldq               m2, m5, 8
; Gather the four filter pointers for columns 0-3.
   7674 %if ARCH_X86_64
   7675    movd                r4d, m5
   7676    movd                r6d, m2
   7677    psrldq               m5, 4
   7678    psrldq               m2, 4
   7679    movd                r7d, m5
   7680    movd                r9d, m2
   7681    movq                 m0, [base+subpel_filters+r4*8]
   7682    movq                 m1, [base+subpel_filters+r6*8]
   7683    movhps               m0, [base+subpel_filters+r7*8]
   7684    movhps               m1, [base+subpel_filters+r9*8]
   7685 %else
   7686    movd                 r0, m5
   7687    movd                 rX, m2
   7688    psrldq               m5, 4
   7689    psrldq               m2, 4
   7690    movd                 r4, m5
   7691    movd                 r5, m2
   7692    movq                 m0, [base+subpel_filters+r0*8]
   7693    movq                 m1, [base+subpel_filters+rX*8]
   7694    movhps               m0, [base+subpel_filters+r4*8]
   7695    movhps               m1, [base+subpel_filters+r5*8]
   7696    pxor                 m2, m2
   7697 %define m9 m2
   7698 %endif
; Same for columns 4-7; m14 is saved so the next chunk can continue.
   7699    paddd               m14, m7 ; mx+dx*[4-7]
   7700    pand                 m5, m14, m10
   7701    psrld                m5, 6
   7702    paddd               m15, m5
   7703    pcmpeqd              m5, m9
   7704    mova        [rsp+0x110], m14
   7705    psrldq               m4, m15, 8
   7706 %if ARCH_X86_64
   7707    movd               r10d, m15
   7708    movd               r11d, m4
   7709    psrldq              m15, 4
   7710    psrldq               m4, 4
   7711    movd               r13d, m15
   7712    movd                rXd, m4
   7713    movq                 m2, [base+subpel_filters+r10*8]
   7714    movq                 m3, [base+subpel_filters+r11*8]
   7715    movhps               m2, [base+subpel_filters+r13*8]
   7716    movhps               m3, [base+subpel_filters+ rX*8]
; Integer x positions: r10/r11/r13/rX = columns 4-7, r4..r9 = columns 0-3.
   7717    psrld               m14, 10
   7718    psrldq               m4, m14, 8
   7719    movd               r10d, m14
   7720    movd               r11d, m4
   7721    psrldq              m14, 4
   7722    psrldq               m4, 4
   7723    movd               r13d, m14
   7724    movd                rXd, m4
   7725    mov                 r4d, [rsp+ 0]
   7726    mov                 r6d, [rsp+ 8]
   7727    mov                 r7d, [rsp+ 4]
   7728    mov                 r9d, [rsp+12]
; Blend identity constant vs table filter per column, then stash the
; four final filter registers at [rsp+0x10..0x40].
   7729    pshufd               m4, m6, q1100
   7730    pshufd               m6, m6, q3322
   7731    pshufd               m7, m5, q1100
   7732    pshufd               m5, m5, q3322
   7733    pand                 m8, m11, m4
   7734    pand                 m9, m11, m6
   7735    pand                m15, m11, m7
   7736    pand                m11, m11, m5
   7737    pandn                m4, m0
   7738    pandn                m6, m1
   7739    pandn                m7, m2
   7740    pandn                m5, m3
   7741    por                  m8, m4
   7742    por                  m9, m6
   7743    por                 m15, m7
   7744    por                 m11, m5
   7745    mova         [rsp+0x10], m8
   7746    mova         [rsp+0x20], m9
   7747    mova         [rsp+0x30], m15
   7748    mova         [rsp+0x40], m11
; Horizontal-filter rows 0-7 (two rows per macro invocation).
   7749    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
   7750    mova         [rsp+0x50], m1
   7751    mova         [rsp+0x60], m2
   7752    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
   7753    mova         [rsp+0x70], m3
   7754    mova         [rsp+0x80], m4
   7755    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
   7756    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
; Interleave row pairs into the a (low) / b (high) halves used by the
; vertical filter; reload the vertical coefficients into m8-m11.
   7757    SWAP                 m7, m0
   7758    SWAP                 m8, m14
   7759    mova                 m1, [rsp+0x50]
   7760    mova                 m2, [rsp+0x60]
   7761    mova                 m3, [rsp+0x70]
   7762    mova                m15, [rsp+0x80]
   7763    punpcklwd            m4, m5, m6 ; 45a
   7764    punpckhwd            m5, m6     ; 45b
   7765    punpcklwd            m6, m7, m8 ; 67a
   7766    punpckhwd            m7, m8     ; 67b
   7767    SWAP                m14, m8
   7768    mova                 m8, [rsp+0x140]
   7769    mova                 m9, [rsp+0x150]
   7770    mova                m10, [rsp+0x160]
   7771    mova                m11, [rsp+0x170]
   7772    punpcklwd            m0, m1, m2 ; 01a
   7773    punpckhwd            m1, m2     ; 01b
   7774    punpcklwd            m2, m3, m15; 23a
   7775    punpckhwd            m3, m15    ; 23b
   7776    mova         [rsp+0x50], m4
   7777    mova         [rsp+0x60], m5
   7778    mova         [rsp+0x70], m6
   7779    mova         [rsp+0x80], m7
   7780 %else
; x86-32 version of the same setup; the MC_8TAP_SCALED_H macro takes
; stack-slot offsets here instead of register numbers, and all row-pair
; state lives in esp slots (0x60/0x70, 0x180-0x1d0).
   7781    movd                 r0, m15
   7782    movd                 rX, m4
   7783    psrldq              m15, 4
   7784    psrldq               m4, 4
   7785    movd                 r4, m15
   7786    movd                 r5, m4
   7787    mova                m14, [esp+0x110]
   7788    movq                 m2, [base+subpel_filters+r0*8]
   7789    movq                 m3, [base+subpel_filters+rX*8]
   7790    movhps               m2, [base+subpel_filters+r4*8]
   7791    movhps               m3, [base+subpel_filters+r5*8]
   7792    psrld               m14, 10
   7793    mova           [esp+16], m14
   7794    mov                  r0, [esp+ 0]
   7795    mov                  rX, [esp+ 8]
   7796    mov                  r4, [esp+ 4]
   7797    mov                  r5, [esp+12]
   7798    mova         [esp+0x20], m0
   7799    mova         [esp+0x30], m1
   7800    mova         [esp+0x40], m2
   7801    mova         [esp+0x50], m3
   7802    pshufd               m4, m6, q1100
   7803    pshufd               m6, m6, q3322
   7804    pshufd               m7, m5, q1100
   7805    pshufd               m5, m5, q3322
   7806    pand                 m0, m11, m4
   7807    pand                 m1, m11, m6
   7808    pand                 m2, m11, m7
   7809    pand                 m3, m11, m5
   7810    pandn                m4, [esp+0x20]
   7811    pandn                m6, [esp+0x30]
   7812    pandn                m7, [esp+0x40]
   7813    pandn                m5, [esp+0x50]
   7814    por                  m0, m4
   7815    por                  m1, m6
   7816    por                  m2, m7
   7817    por                  m3, m5
   7818    mova        [esp+0x20], m0
   7819    mova        [esp+0x30], m1
   7820    mova        [esp+0x40], m2
   7821    mova        [esp+0x50], m3
   7822    MC_8TAP_SCALED_H   0x20, 0x60, 0 ; 0-1
   7823    MC_8TAP_SCALED_H   0x20, 0x180   ; 2-3
   7824    MC_8TAP_SCALED_H   0x20, 0x1a0   ; 4-5
   7825    MC_8TAP_SCALED_H   0x20, 0x1c0   ; 6-7
   7826    mova                 m5, [esp+0x1a0]
   7827    mova                 m6, [esp+0x1b0]
   7828    mova                 m7, [esp+0x1c0]
   7829    mova                 m0, [esp+0x1d0]
   7830    punpcklwd            m4, m5, m6      ; 45a
   7831    punpckhwd            m5, m6          ; 45b
   7832    punpcklwd            m6, m7, m0      ; 67a
   7833    punpckhwd            m7, m0          ; 67b
   7834    mova        [esp+0x1a0], m4
   7835    mova        [esp+0x1b0], m5
   7836    mova        [esp+0x1c0], m6
   7837    mova        [esp+0x1d0], m7
   7838    mova                 m1, [esp+0x060]
   7839    mova                 m2, [esp+0x070]
   7840    mova                 m3, [esp+0x180]
   7841    mova                 m4, [esp+0x190]
   7842    punpcklwd            m0, m1, m2      ; 01a
   7843    punpckhwd            m1, m2          ; 01b
   7844    punpcklwd            m2, m3, m4      ; 23a
   7845    punpckhwd            m3, m4          ; 23b
   7846    mova        [esp+0x180], m2
   7847    mova        [esp+0x190], m3
   7848 %define m8  [esp+0x140]
   7849 %define m9  [esp+0x150]
   7850 %define m10 [esp+0x160]
   7851 %define m11 [esp+0x170]
   7852 %endif
   7853 .dy2_vloop:
   7854 %if ARCH_X86_32
   7855    mov                  r0, r0m
   7856 %endif
   7857    pmaddwd              m4, m0, m8
   7858    pmaddwd              m5, m1, m8
   7859    pmaddwd              m6, m2, m9
   7860    pmaddwd              m7, m3, m9
   7861    paddd                m4, m13
   7862    paddd                m5, m13
   7863    paddd                m4, m6
   7864    paddd                m5, m7
   7865 %if ARCH_X86_64
   7866    pmaddwd              m6, [rsp+0x50], m10
   7867    pmaddwd              m7, [rsp+0x60], m10
   7868 %else
   7869    pmaddwd              m6, [esp+0x1a0], m10
   7870    pmaddwd              m7, [esp+0x1b0], m10
   7871 %endif
   7872    paddd                m4, m6
   7873    paddd                m5, m7
   7874 %if ARCH_X86_64
   7875    pmaddwd              m6, [rsp+0x70], m11
   7876    pmaddwd              m7, [rsp+0x80], m11
   7877 %else
   7878    pmaddwd              m6, [esp+0x1c0], m11
   7879    pmaddwd              m7, [esp+0x1d0], m11
   7880 %endif
   7881    paddd                m4, m6
   7882    paddd                m5, m7
   7883    psrad                m4, rndshift
   7884    psrad                m5, rndshift
   7885    packssdw             m4, m5
   7886 %ifidn %1, put
   7887    packuswb             m4, m4
   7888    movq             [dstq], m4
   7889    add                dstq, dsm
   7890 %else
   7891    mova             [tmpq], m4
   7892    add                tmpq, tmp_stridem
   7893 %endif
   7894 %if ARCH_X86_32
   7895    mov                 r0m, r0
   7896 %endif
   7897    dec                  hd
   7898    jz .dy2_hloop_prep
   7899 %if ARCH_X86_64
   7900    mova                 m8, [rsp+0x10]
   7901    mova                 m9, [rsp+0x20]
   7902    mova                m10, [rsp+0x30]
   7903    mova                m11, [rsp+0x40]
   7904    mova                 m0, m2             ; 01a
   7905    mova                 m1, m3             ; 01b
   7906    MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11
   7907    mova                 m3, [rsp+0x50] ; 23a
   7908    mova                 m4, [rsp+0x60] ; 23b
   7909    mova                 m5, [rsp+0x70] ; 45a
   7910    mova                 m7, [rsp+0x80] ; 45b
   7911    mova                 m8, [rsp+0x140]
   7912    mova                 m9, [rsp+0x150]
   7913    mova                m10, [rsp+0x160]
   7914    mova                m11, [rsp+0x170]
   7915    punpcklwd           m14, m2, m6     ; 67a
   7916    punpckhwd            m2, m6         ; 67b
   7917    mova         [rsp+0x50], m5
   7918    mova         [rsp+0x60], m7
   7919    mova         [rsp+0x70], m14
   7920    mova         [rsp+0x80], m2
   7921    mova                 m2, m3
   7922    mova                 m3, m4
   7923 %else
   7924    MC_8TAP_SCALED_H   0x20, 0
   7925    punpcklwd            m6, m0, m4
   7926    punpckhwd            m7, m0, m4
   7927    mova                 m0, [esp+0x180] ; 01a
   7928    mova                 m1, [esp+0x190] ; 01b
   7929    mova                 m2, [rsp+0x1a0]  ; 23a
   7930    mova                 m3, [esp+0x1b0]  ; 23b
   7931    mova                 m4, [esp+0x1c0]  ; 45a
   7932    mova                 m5, [esp+0x1d0]  ; 45b
   7933    mova        [esp+0x180], m2
   7934    mova        [esp+0x190], m3
   7935    mova        [esp+0x1a0], m4
   7936    mova        [esp+0x1b0], m5
   7937    mova        [esp+0x1c0], m6          ; 67a
   7938    mova        [esp+0x1d0], m7          ; 67b
   7939 %endif
   7940    jmp .dy2_vloop
   7941 .ret:
   7942    MC_8TAP_SCALED_RET 0
   7943 %if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
   7944 %define r0m [rstk+stack_offset+ 4]
   7945 %define r1m [rstk+stack_offset+ 8]
   7946 %define r2m [rstk+stack_offset+12]
   7947 %define r3m [rstk+stack_offset+16]
   7948 %endif
   7949 %undef isprep
   7950 %endmacro
   7951 
; Entry stub for %1_bilin_scaled_8bpc (%1 = put/prep): loads the packed
; filter selectors into t0d/t1d, then tail-jumps into the shared
; 8tap_scaled implementation so bilinear scaling reuses that code path.
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_8bpc
    ; NOTE(review): 5*15 packed as (v << 16) | h appears to select the
    ; bilinear entry of the filter tables for both directions -- confirm
    ; against the filter-index decoding in the 8tap_scaled code.
    mov                 t0d, (5*15 << 16) | 5*15
    mov                 t1d, (5*15 << 16) | 5*15
    jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
%endmacro
   7958 
; Scratch registers for the put-side entry stubs, chosen per ABI so they
; do not clash with that ABI's argument registers.
%if WIN64
DECLARE_REG_TMP 6, 5
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 8
%else
DECLARE_REG_TMP 1, 2
%endif
; Instantiate one entry point per (horizontal, vertical) filter-type
; combination via the FN helper; all of them fall through or jump into
; the single body emitted by MC_8TAP_SCALED put below.
%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED put
   7978 
; Scratch registers for the prep-side entry stubs (different choice than
; the put side because prep has a different argument layout).
%if WIN64
DECLARE_REG_TMP 5, 4
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 7
%else
DECLARE_REG_TMP 1, 2
%endif
; One entry point per filter-type pair, all sharing the body emitted by
; MC_8TAP_SCALED prep below.
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep
   7998 
%if ARCH_X86_32
; x86-32 lacks the registers to keep all warp parameters live, so
; alpha/beta (horizontal step) and delta/gamma (vertical step) are
; spilled to the stack slots defined in warp_affine_8x8 below and
; swapped in/out around the .h and WARP_V phases.
%macro SAVE_ALPHA_BETA 0
    mov              alpham, alphad
    mov               betam, betad
%endmacro

%macro SAVE_DELTA_GAMMA 0
    mov              deltam, deltad
    mov              gammam, gammad
%endmacro

; Switch to the horizontal-pass parameter set: stash my, reload
; alpha/beta/mx (alpha/delta share r0 and beta/gamma share r1).
%macro LOAD_ALPHA_BETA_MX 0
    mov                 mym, myd
    mov              alphad, alpham
    mov               betad, betam
    mov                 mxd, mxm
%endmacro

; Switch to the vertical-pass parameter set: stash mx, reload
; delta/gamma/my.
%macro LOAD_DELTA_GAMMA_MY 0
    mov                 mxm, mxd
    mov              deltad, deltam
    mov              gammad, gammam
    mov                 myd, mym
%endmacro

; Position-independent addressing on x86-32: r2 holds the runtime
; address of $$ (section start) and RODATA symbols are addressed
; relative to it.
%define PIC_reg r2
%define PIC_base_offset $$
%define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
%else
; On x86-64 the spill helpers are no-ops and symbols are addressed
; directly.
%define SAVE_ALPHA_BETA
%define SAVE_DELTA_GAMMA
%define PIC_sym(sym) sym
%endif
   8032 
; If the incoming x86-32 stack alignment is weaker than what these
; functions require, the first 8 dword arguments get copied into the
; realigned local frame; copy_args is the number of bytes reserved for
; those copies (0 when no relocation is needed).
%if ARCH_X86_32
%if STACK_ALIGNMENT < required_stack_alignment
 %assign copy_args 8*4
%else
 %assign copy_args 0
%endif
%endif
   8040 
; Copy the stack arguments (dst, ds, src, ss, mx, my) into the relocated
; slots of the aligned local frame; expands to nothing when copy_args
; is 0.
%macro RELOC_ARGS 0
%if copy_args
    mov                  r0, r0m
    mov                  r1, r1m
    mov                  r2, r2m
    mov                  r3, r3m
    mov                  r5, r5m
    mov                dstm, r0
    mov                 dsm, r1
    mov                srcm, r2
    mov                 ssm, r3
    mov                 mxm, r5
    mov                  r0, r6m        ; r6m (my) needs a bounce register
    mov                 mym, r0
%endif
%endmacro
   8057 
; Merge the high word of each dword of %2 into %1. SSE4 uses a single
; pblendw; the SSSE3 path masks %2 with m10 (0xffff0000 per dword, see
; blendmask) and ORs it in, relying on the call sites having cleared
; %1's high words beforehand (they are produced by psrld-by-16).
%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
%if cpuflag(sse4)
    pblendw              %1, %2, 0xAA
%else
    pand                 %2, m10
    por                  %1, %2
%endif
%endmacro
   8066 
; Vertical warp pass: fetch eight per-column 8-tap filters (a..h,
; stepped by delta, with gamma applied once at the end), apply them to
; the 8 rows of horizontal intermediates given in %3..%10, and store two
; dword accumulator vectors to %1/%2. On x86-32 the high m-registers are
; aliased onto m4..m7, so the inputs/outputs live in memory operands.
%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
%if ARCH_X86_32
 %define m8  m4
 %define m9  m5
 %define m14 m6
 %define m15 m7
 %define m11 m7
%endif
%if ARCH_X86_32
    pxor                m11, m11        ; m11 = 0 (on x86-64 it is zeroed once in .main)
%endif
    ; my advances by delta per column; >>10 converts the fixed-point
    ; position into a filter-table index.
    lea               tmp1d, [myq+deltaq*4]
    lea               tmp2d, [myq+deltaq*1]
    shr                 myd, 10
    shr               tmp1d, 10
    movq                 m2, [filterq+myq  *8] ; a
    movq                 m8, [filterq+tmp1q*8] ; e
    lea               tmp1d, [tmp2q+deltaq*4]
    lea                 myd, [tmp2q+deltaq*1]
    shr               tmp2d, 10
    shr               tmp1d, 10
    movq                 m3, [filterq+tmp2q*8] ; b
    movq                 m0, [filterq+tmp1q*8] ; f
    punpcklwd            m2, m3
    punpcklwd            m8, m0
    lea               tmp1d, [myq+deltaq*4]
    lea               tmp2d, [myq+deltaq*1]
    shr                 myd, 10
    shr               tmp1d, 10
    movq                 m0, [filterq+myq  *8] ; c
    movq                 m9, [filterq+tmp1q*8] ; g
    lea               tmp1d, [tmp2q+deltaq*4]
    lea                 myd, [tmp2q+gammaq]       ; my += gamma
    shr               tmp2d, 10
    shr               tmp1d, 10
    movq                 m3, [filterq+tmp2q*8] ; d
    movq                 m1, [filterq+tmp1q*8] ; h
    punpcklwd            m0, m3
    punpcklwd            m9, m1
    ; Interleave the a-d filter taps, widen bytes to words (<< 8 via
    ; punpck with zero in the low byte), and accumulate 4 taps per
    ; pmaddwd against the row intermediates.
    punpckldq            m1, m2, m0
    punpckhdq            m2, m0
    punpcklbw            m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
    punpckhbw            m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
    punpcklbw            m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
    punpckhbw           m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
    pmaddwd              m0, %3
    pmaddwd              m3, %5
    pmaddwd              m1, %7
    pmaddwd             m14, %9
    paddd                m0, m3
    paddd                m1, m14
    paddd                m0, m1
    mova                 %1, m0
%if ARCH_X86_64
    SWAP                 m3, m14
%endif
    ; Same dance for the e-h filters / second destination.
    punpckldq            m0, m8, m9
    punpckhdq            m8, m9
    punpcklbw            m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
    punpckhbw           m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
    punpcklbw            m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
    punpckhbw           m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
    pmaddwd              m1, %4
    pmaddwd             m14, %6
    pmaddwd              m2, %8
    pmaddwd             m15, %10
    paddd                m1, m14
    paddd                m2, m15
    paddd                m1, m2
    mova                 %2, m1
%if ARCH_X86_64
    SWAP                m14, m3
%endif
%endmacro
   8141 
; Row-pair loop counter for the warp functions: a register on x86-64;
; on x86-32 it lives in memory (the r4 argument slot, or a relocated
; frame slot when copy_args is in effect).
%if ARCH_X86_64
%define counterd r4d
%else
%if copy_args == 0
 %define counterd dword r4m
%else
 %define counterd dword [esp+stack_size-4*7]
%endif
%endif
   8151 
; Emits two entry points per instantiation (SSSE3/SSE4 via cpuflag):
;   warp_affine_8x8t_8bpc - stores 16-bit intermediates to tmp (for
;                           compound prediction), reusing the pixel
;                           version's .main/.main2/.end via mangle().
;   warp_affine_8x8_8bpc  - stores 8-bit pixels to dst.
; Shared internals:
;   .main  - filter-pointer/parameter setup plus the first 7 horizontal
;            rows; keeps a ring of row intermediates at [rsp+gprsize+..].
;   .main2 - per-iteration step: two more .h rows + two WARP_V vertical
;            passes producing m12/m13 and m14/m15 (stack slots on x86-32).
;   .h     - one horizontal row: 8 per-column 8-tap filters stepped by
;            alpha (beta added once per row).
%macro WARP_AFFINE_8X8 0
%if ARCH_X86_64
cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts
%else
cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts
%if copy_args
 %define tmpm [esp+stack_size-4*1]
 %define tsm  [esp+stack_size-4*2]
%endif
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main
.loop:
%if ARCH_X86_32
; On x86-32 the accumulators produced by WARP_V live on the stack.
%define m12 m4
%define m13 m5
%define m14 m6
%define m15 m7
    mova                m12, [esp+0xC0]
    mova                m13, [esp+0xD0]
    mova                m14, [esp+0xE0]
    mova                m15, [esp+0xF0]
%endif
    ; Round the dword accumulators down to the 16-bit intermediate range.
    psrad               m12, 13
    psrad               m13, 13
    psrad               m14, 13
    psrad               m15, 13
    packssdw            m12, m13
    packssdw            m14, m15
    mova                m13, [PIC_sym(pw_8192)]
    pmulhrsw            m12, m13 ; (x + (1 << 6)) >> 7
    pmulhrsw            m14, m13
    mova       [tmpq+tsq*0], m12
    mova       [tmpq+tsq*2], m14
    dec            counterd
    jz   mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end
%if ARCH_X86_32
    mov                tmpm, tmpd
    mov                  r0, [esp+0x100]   ; reload alpha
    mov                  r1, [esp+0x104]   ; reload beta
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2
    lea                tmpq, [tmpq+tsq*4]
    jmp .loop

%if ARCH_X86_64
cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \
                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
                              filter, tmp1, delta, my, gamma
%else
cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \
                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
                              filter, tmp1, delta, my, gamma
; x86-32 register/stack-slot assignments: alpha/delta share r0 and
; beta/gamma share r1, which is why the SAVE_*/LOAD_* helpers above
; shuttle them through these frame slots.
%define alphaq     r0
%define alphad     r0
%define alpham     [esp+gprsize+0x100]
%define betaq      r1
%define betad      r1
%define betam      [esp+gprsize+0x104]
%define deltaq     r0
%define deltad     r0
%define deltam     [esp+gprsize+0x108]
%define gammaq     r1
%define gammad     r1
%define gammam     [esp+gprsize+0x10C]
%define filterq    r3
%define tmp1q      r4
%define tmp1d      r4
%define tmp1m      [esp+gprsize+0x110]
%define myq        r5
%define myd        r5
%define mym        r6m
%if copy_args
 %define dstm [esp+stack_size-4*1]
 %define dsm  [esp+stack_size-4*2]
 %define srcm [esp+stack_size-4*3]
 %define ssm  [esp+stack_size-4*4]
 %define mxm  [esp+stack_size-4*5]
 %define mym  [esp+stack_size-4*6]
%endif
%endif
    call .main
    jmp .start
.loop:
%if ARCH_X86_32
    mov                dstm, dstd
    mov              alphad, [esp+0x100]
    mov               betad, [esp+0x104]
%endif
    call .main2
    lea                dstq, [dstq+dsq*2]
.start:
; Pixel output: round the accumulators to 8-bit. SSE4 uses
; packusdw+pavgw; the SSSE3 fallback rounds via pmulhrsw with pw_8192.
%if notcpuflag(sse4)
 %define roundval pw_8192
%if ARCH_X86_64
    mova                m10, [PIC_sym(roundval)]
%else
 %define m10 [PIC_sym(roundval)]
%endif
%endif
%if ARCH_X86_32
%define m12 m5
%define m13 m6
    mova                m12, [esp+0xC0]
    mova                m13, [esp+0xD0]
%endif
%if cpuflag(sse4)
%if ARCH_X86_32
 %define m11 m4
    pxor                m11, m11
%endif
    psrad               m12, 18
    psrad               m13, 18
    packusdw            m12, m13
    pavgw               m12, m11 ; (x + (1 << 10)) >> 11
%else
    psrad               m12, 17
    psrad               m13, 17
    packssdw            m12, m13
    pmulhrsw            m12, m10
%endif
%if ARCH_X86_32
%define m14 m6
%define m15 m7
    mova                m14, [esp+0xE0]
    mova                m15, [esp+0xF0]
%endif
%if cpuflag(sse4)
    psrad               m14, 18
    psrad               m15, 18
    packusdw            m14, m15
    pavgw               m14, m11 ; (x + (1 << 10)) >> 11
%else
    psrad               m14, 17
    psrad               m15, 17
    packssdw            m14, m15
    pmulhrsw            m14, m10
%endif
    packuswb            m12, m14
    movq       [dstq+dsq*0], m12
    movhps     [dstq+dsq*1], m12
    dec            counterd
    jg .loop
.end:
    RET
ALIGN function_align
.main:
; Entered via call: account for the pushed return address.
%assign stack_offset stack_offset+gprsize
%if ARCH_X86_32
%assign stack_size stack_size+4
%if copy_args
 %assign stack_offset stack_offset-4
%endif
    RELOC_ARGS
    LEA             PIC_reg, $$
%define PIC_mem [esp+gprsize+0x114]
    mov               abcdd, abcdm
%if copy_args == 0
    mov                 ssd, ssm
    mov                 mxd, mxm
%endif
    mov             PIC_mem, PIC_reg
    mov                srcd, srcm
%endif
    ; abcd[] holds the four 16-bit warp matrix deltas.
    movsx            deltad, word [abcdq+2*2]
    movsx            gammad, word [abcdq+2*3]
    lea               tmp1d, [deltaq*3]
    sub              gammad, tmp1d    ; gamma -= delta*3
    SAVE_DELTA_GAMMA
%if ARCH_X86_32
    mov               abcdd, abcdm
%endif
    movsx            alphad, word [abcdq+2*0]
    movsx             betad, word [abcdq+2*1]
    lea               tmp1q, [ssq*3+3]
    add                 mxd, 512+(64<<10)
    lea               tmp2d, [alphaq*3]
    sub                srcq, tmp1q    ; src -= src_stride*3 + 3
%if ARCH_X86_32
    mov                srcm, srcd
    mov             PIC_reg, PIC_mem
%endif
    sub               betad, tmp2d    ; beta -= alpha*3
    lea             filterq, [PIC_sym(mc_warp_filter2)]
%if ARCH_X86_64
    mov                 myd, r6m
    pxor                m11, m11      ; m11 = 0 for the punpck widening
%endif
    ; Prime the 7-row window of horizontal intermediates. Each .h call
    ; yields one row in m0/m1; psrld-by-16 + BLENDHWDW interleaves
    ; successive rows word-wise for the later pmaddwd vertical pass.
    call .h
    psrld                m2, m0, 16
    psrld                m3, m1, 16
%if ARCH_X86_32
    mova [esp+gprsize+0x10], m3
%endif
    call .h
    psrld                m4, m0, 16
    psrld                m5, m1, 16
%if ARCH_X86_32
    mova [esp+gprsize+0x20], m4
    mova [esp+gprsize+0x30], m5
%endif
    call .h
%if ARCH_X86_64
%define blendmask [rsp+gprsize+0x80]
%else
    mova                 m3, [esp+gprsize+0x10]
%define blendmask [esp+gprsize+0x120]
%define m10 m7
%endif
    ; blendmask = 0xffff0000 per dword, used by the SSSE3 BLENDHWDW path.
    pcmpeqd             m10, m10
    pslld               m10, 16
    mova          blendmask, m10
    BLENDHWDW            m2, m0 ; 0
    BLENDHWDW            m3, m1 ; 2
    mova [rsp+gprsize+0x00], m2
    mova [rsp+gprsize+0x10], m3
    call .h
%if ARCH_X86_32
    mova                 m4, [esp+gprsize+0x20]
    mova                 m5, [esp+gprsize+0x30]
%endif
    mova                m10, blendmask
    BLENDHWDW            m4, m0 ; 1
    BLENDHWDW            m5, m1 ; 3
    mova [rsp+gprsize+0x20], m4
    mova [rsp+gprsize+0x30], m5
    call .h
%if ARCH_X86_32
    mova                 m3, [esp+gprsize+0x10]
%define m10 m5
%endif
    psrld                m6, m2, 16
    psrld                m7, m3, 16
    mova                m10, blendmask
    BLENDHWDW            m6, m0 ; 2
    BLENDHWDW            m7, m1 ; 4
    mova [rsp+gprsize+0x40], m6
    mova [rsp+gprsize+0x50], m7
    call .h
%if ARCH_X86_32
    mova                m4, [esp+gprsize+0x20]
    mova                m5, [esp+gprsize+0x30]
%endif
    psrld               m2, m4, 16
    psrld               m3, m5, 16
    mova                m10, blendmask
    BLENDHWDW           m2, m0 ; 3
    BLENDHWDW           m3, m1 ; 5
    mova [rsp+gprsize+0x60], m2
    mova [rsp+gprsize+0x70], m3
    call .h
%if ARCH_X86_32
    mova                 m6, [esp+gprsize+0x40]
    mova                 m7, [esp+gprsize+0x50]
%define m10 m7
%endif
    psrld                m4, m6, 16
    psrld                m5, m7, 16
    mova                m10, blendmask
    BLENDHWDW            m4, m0 ; 4
    BLENDHWDW            m5, m1 ; 6
%if ARCH_X86_64
    add                 myd, 512+(64<<10)
    mova                 m6, m2
    mova                 m7, m3
%else
    mova [esp+gprsize+0x80], m4
    mova [esp+gprsize+0x90], m5
    add           dword mym, 512+(64<<10)
%endif
    mov            counterd, 4          ; 4 iterations x 2 output rows = 8 rows
    SAVE_ALPHA_BETA
.main2:
; Produce two output rows: two fresh .h rows plus two WARP_V vertical
; filter applications over the sliding 8-row window.
    call .h
%if ARCH_X86_32
    mova                 m6, [esp+gprsize+0x60]
    mova                 m7, [esp+gprsize+0x70]
%define m10 m5
%endif
    psrld                m6, 16
    psrld                m7, 16
    mova                m10, blendmask
    BLENDHWDW            m6, m0 ; 5
    BLENDHWDW            m7, m1 ; 7
%if ARCH_X86_64
    WARP_V              m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
                                  m4, m5, \
                                  [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
                                  m6, m7
%else
    mova [esp+gprsize+0xA0], m6
    mova [esp+gprsize+0xB0], m7
    LOAD_DELTA_GAMMA_MY
    WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
           [esp+gprsize+0x00], [esp+gprsize+0x10], \
           [esp+gprsize+0x80], [esp+gprsize+0x90], \
           [esp+gprsize+0x20], [esp+gprsize+0x30], \
           [esp+gprsize+0xA0], [esp+gprsize+0xB0]
    LOAD_ALPHA_BETA_MX
%endif
    call .h
    mova                 m2, [rsp+gprsize+0x40]
    mova                 m3, [rsp+gprsize+0x50]
%if ARCH_X86_32
    mova                 m4, [rsp+gprsize+0x80]
    mova                 m5, [rsp+gprsize+0x90]
%define m10 m7
%endif
    ; Rotate the row window down by two.
    mova [rsp+gprsize+0x00], m2
    mova [rsp+gprsize+0x10], m3
    mova [rsp+gprsize+0x40], m4
    mova [rsp+gprsize+0x50], m5
    psrld                m4, 16
    psrld                m5, 16
    mova                m10, blendmask
    BLENDHWDW            m4, m0 ; 6
    BLENDHWDW            m5, m1 ; 8
%if ARCH_X86_64
    WARP_V              m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
                                  m6, m7, \
                                  [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
                                  m4, m5
%else
    mova [esp+gprsize+0x80], m4
    mova [esp+gprsize+0x90], m5
    LOAD_DELTA_GAMMA_MY
    WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
           [esp+gprsize+0x20], [esp+gprsize+0x30], \
           [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
           [esp+gprsize+0x00], [esp+gprsize+0x10], \
           [esp+gprsize+0x80], [esp+gprsize+0x90]
    mov                 mym, myd
    mov                dstd, dstm
    mov                 dsd, dsm
    mov                 mxd, mxm
%endif
    mova                 m2, [rsp+gprsize+0x60]
    mova                 m3, [rsp+gprsize+0x70]
%if ARCH_X86_32
    mova                 m6, [esp+gprsize+0xA0]
    mova                 m7, [esp+gprsize+0xB0]
%endif
    mova [rsp+gprsize+0x20], m2
    mova [rsp+gprsize+0x30], m3
    mova [rsp+gprsize+0x60], m6
    mova [rsp+gprsize+0x70], m7
    ret
ALIGN function_align
.h:
; One horizontal row: load 8 per-column filters stepped by alpha
; (mx >> 10 indexes the table; beta is added once at the end), apply
; them to the 16 source bytes, and leave dword accumulators in m0/m1.
%if ARCH_X86_32
%define m8  m3
%define m9  m4
%define m10 m5
%define m14 m6
%define m15 m7
%endif
    lea               tmp1d, [mxq+alphaq*4]
    lea               tmp2d, [mxq+alphaq*1]
%if ARCH_X86_32
; .h is one call level deeper than .main; adjust the frame bookkeeping.
%assign stack_offset stack_offset+4
%assign stack_size stack_size+4
%define PIC_mem [esp+gprsize*2+0x114]
    mov             PIC_mem, PIC_reg
    mov                srcd, srcm
%endif
    movu                m10, [srcq]
%if ARCH_X86_32
    add                srcd, ssm
    mov                srcm, srcd
    mov             PIC_reg, PIC_mem
%else
    add                srcq, ssq
%endif
    shr                 mxd, 10
    shr               tmp1d, 10
    movq                 m1, [filterq+mxq  *8]  ; 0 X
    movq                 m8, [filterq+tmp1q*8]  ; 4 X
    lea               tmp1d, [tmp2q+alphaq*4]
    lea                 mxd, [tmp2q+alphaq*1]
    shr               tmp2d, 10
    shr               tmp1d, 10
    movhps               m1, [filterq+tmp2q*8]  ; 0 1
    movhps               m8, [filterq+tmp1q*8]  ; 4 5
    lea               tmp1d, [mxq+alphaq*4]
    lea               tmp2d, [mxq+alphaq*1]
    shr                 mxd, 10
    shr               tmp1d, 10
    movq                m14, [filterq+mxq  *8]  ; 2 X
    movq                 m9, [filterq+tmp1q*8]  ; 6 X
    lea               tmp1d, [tmp2q+alphaq*4]
    lea                 mxd, [tmp2q+betaq]  ; mx += beta
    shr               tmp2d, 10
    shr               tmp1d, 10
    movhps              m14, [filterq+tmp2q*8]  ; 2 3
    movhps               m9, [filterq+tmp1q*8]  ; 6 7
    ; Gather each column's 8 source bytes via the shuffle tables, then
    ; multiply-accumulate against the per-column filter taps.
    pshufb               m0, m10, [PIC_sym(warp_8x8_shufA)]
    pmaddubsw            m0, m1
    pshufb               m1, m10, [PIC_sym(warp_8x8_shufB)]
    pmaddubsw            m1, m8
    pshufb              m15, m10, [PIC_sym(warp_8x8_shufC)]
    pmaddubsw           m15, m14
    pshufb              m10, m10, [PIC_sym(warp_8x8_shufD)]
    pmaddubsw           m10, m9
    phaddw               m0, m15
    phaddw               m1, m10
    mova                m14, [PIC_sym(pw_8192)]
    mova                 m9, [PIC_sym(pd_32768)]
    pmaddwd              m0, m14 ; 17-bit intermediate, upshifted by 13
    pmaddwd              m1, m14
    paddd                m0, m9  ; rounded 14-bit result in upper 16 bits of dword
    paddd                m1, m9
    ret
%endmacro
   8564 
; Scratch registers for the bidirectional compositors (avg/w_avg/...)
; below, chosen per ABI.
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif
   8570 
; Shared store skeleton for the bidirectional compositors: computes a
; batch of output pixels in m0 via the %1 macro (AVG, W_AVG, ...), then
; dispatches through the width-indexed jump table already loaded in wq.
; %1_INC_PTR advances the tmp1/tmp2 coefficient pointers. hd counts
; remaining rows.
%macro BIDIR_FN 1 ; op
    %1                    0
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4_loop:
    %1_INC_PTR            2
    %1                    0
    lea                dstq, [dstq+strideq*4]
.w4: ; tile 4x
    movd   [dstq          ], m0      ; copy dw[0]
    pshuflw              m1, m0, q1032 ; swap dw[1] and dw[0]
    movd   [dstq+strideq*1], m1      ; copy dw[1]
    punpckhqdq           m0, m0      ; swap dw[3,2] with dw[1,0]
    movd   [dstq+strideq*2], m0      ; dw[2]
    psrlq                m0, 32      ; shift right in dw[3]
    movd   [dstq+stride3q ], m0      ; copy
    sub                  hd, 4
    jg .w4_loop
    RET
.w8_loop:
    %1_INC_PTR            2
    %1                    0
    lea                dstq, [dstq+strideq*2]
.w8:
    movq   [dstq          ], m0
    movhps [dstq+strideq*1], m0
    sub                  hd, 2
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR            2
    %1                    0
    lea                dstq, [dstq+strideq]
.w16:
    mova   [dstq          ], m0
    dec                  hd
    jg .w16_loop
    RET
.w32_loop:
    %1_INC_PTR            4
    %1                    0
    lea                dstq, [dstq+strideq]
.w32:
    mova   [dstq          ], m0
    %1                    2
    mova   [dstq + 16     ], m0
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    %1_INC_PTR            8
    %1                    0
    add                dstq, strideq
.w64:
    ; 4 x 16-byte stores per row; recompute m0 between stores.
    %assign i 0
    %rep 4
    mova   [dstq + i*16   ], m0
    %assign i i+1
    %if i < 4
    %1                    2*i
    %endif
    %endrep
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    %1_INC_PTR            16
    %1                    0
    add                dstq, strideq
.w128:
    ; 8 x 16-byte stores per row.
    %assign i 0
    %rep 8
    mova   [dstq + i*16   ], m0
    %assign i i+1
    %if i < 8
    %1                    2*i
    %endif
    %endrep
    dec                  hd
    jg .w128_loop
    RET
%endmacro
   8653 
; Average compositor: m0 = packed 8-bit average of the 16-bit
; intermediates at tmp1/tmp2 (+%1 mmsize-blocks), rounded via
; pmulhrsw with m2 (pw_1024, loaded by avg_8bpc).
%macro AVG 1 ; src_offset
    ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel
    mova                 m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1
    paddw                m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2
    mova                 m1, [tmp1q+(%1+1)*mmsize]
    paddw                m1, [tmp2q+(%1+1)*mmsize]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    packuswb             m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit
%endmacro
   8664 
; Advance both intermediate-buffer pointers by %1 vector-sized blocks.
%macro AVG_INC_PTR 1
    add               tmp1q, %1*mmsize
    add               tmp2q, %1*mmsize
%endmacro
   8669 
; avg_8bpc(dst, stride, tmp1, tmp2, w, h): average two buffers of 16-bit
; intermediates into 8-bit pixels; body generated by BIDIR_FN AVG.
cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
    LEA                  r6, avg_ssse3_table
    tzcnt                wd, wm ; trailing zeros = log2(width)
    movifnidn            hd, hm ; move h(stack) to h(register) if not already that register
    movsxd               wq, dword [r6+wq*4] ; sign-extend the jump-table offset for this width
    mova                 m2, [pw_1024+r6-avg_ssse3_table] ; rounding constant for pmulhrsw in AVG
    add                  wq, r6 ; wq = absolute jump target
    BIDIR_FN            AVG
   8678 
   ; W_AVG %1=src_offset (in mmsize units)
   ; Weighted average of one 16-pixel chunk of the two intermediate
   ; buffers, per the derivation below.
   ; In:  m4 = (weight-16) << 12 broadcast (or -weight << 12 after the
   ;      operand swap in w_avg_8bpc), m5 = pw_2048 rounding constant.
   ; Out: m0 = 16 packed uint8 pixels.  Clobbers m0-m3.
   8679 %macro W_AVG 1 ; src_offset
   8680    ; (a * weight + b * (16 - weight) + 128) >> 8
   8681    ; = ((a - b) * weight + (b << 4) + 128) >> 8
   8682    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
   8683    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
   8684    mova                 m2, [tmp1q+(%1+0)*mmsize]
   8685    mova                 m0, m2
   8686    psubw                m2, [tmp2q+(%1+0)*mmsize]
   8687    mova                 m3, [tmp1q+(%1+1)*mmsize]
   8688    mova                 m1, m3
   8689    psubw                m3, [tmp2q+(%1+1)*mmsize]
   ; signed high-half multiply realizes the ">> 16" of the derivation
   8690    pmulhw               m2, m4
   8691    pmulhw               m3, m4
   8692    paddw                m0, m2
   8693    paddw                m1, m3
   ; pmulhrsw with 2048 = rounded >>4 back to pixel range
   8694    pmulhrsw             m0, m5
   8695    pmulhrsw             m1, m5
   8696    packuswb             m0, m1
   8697 %endmacro
   8698 
   ; W_AVG advances the same two pointers as AVG
   8699 %define W_AVG_INC_PTR AVG_INC_PTR
   8700 
   ;-------------------------------------------------------------------
   ; w_avg_8bpc(dst, stride, tmp1, tmp2, w, h, weight)
   ; Weighted bidirectional average; weight is read from the stack
   ; (r6m).  For weight <= 7 the symmetric form of the W_AVG derivation
   ; is used instead: tmp1/tmp2 are swapped and the multiplier negated
   ; (presumably to keep the pmulhw product in signed range — see the
   ; two equivalent formulas in W_AVG).
   ;-------------------------------------------------------------------
   8701 cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
   8702    LEA                  r6, w_avg_ssse3_table
   8703    tzcnt                wd, wm ; log2(w)
   8704    movd                 m4, r6m ; weight
   8705    movifnidn            hd, hm
   8706    pxor                 m0, m0
   8707    movsxd               wq, dword [r6+wq*4]
   8708    mova                 m5, [pw_2048+r6-w_avg_ssse3_table]
   8709    pshufb               m4, m0 ; broadcast weight byte to all lanes
   8710    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
   8711    add                  wq, r6
   8712    cmp           dword r6m, 7
   8713    jg .weight_gt7
   ; low-weight path: swap the two sources and negate the multiplier
   8714    mov                  r6, tmp1q
   8715    psubw                m0, m4 ; m0 was zero -> m0 = -m4
   8716    mov               tmp1q, tmp2q
   8717    mova                 m4, m0 ; -weight
   8718    mov               tmp2q, r6
   8719 .weight_gt7:
   8720    BIDIR_FN          W_AVG
   8721 
   ; MASK %1=src_offset (in mmsize units)
   ; Per-pixel masked blend of one 16-pixel chunk of the two
   ; intermediate buffers, per the derivation below.
   ; In:  maskq = uint8 mask (one byte per pixel), m4 = 0 (for the
   ;      byte-negation/interleave tricks), m5 = pw_2048.
   ; Out: m0 = 16 packed uint8 pixels.  Clobbers m0-m3, m6.
   8722 %macro MASK 1 ; src_offset
   8723    ; (a * m + b * (64 - m) + 512) >> 10
   8724    ; = ((a - b) * m + (b << 6) + 512) >> 10
   8725    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
   8726    mova                 m3,     [maskq+(%1+0)*(mmsize/2)]
   8727    mova                 m0,     [tmp2q+(%1+0)*mmsize] ; b
   8728    psubw                m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
   8729    mova                 m6, m3      ; m
   8730    psubb                m3, m4, m6  ; -m
   8731    paddw                m1, m1     ; (b - a) << 1
   8732    paddb                m3, m3     ; -m << 1
   8733    punpcklbw            m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
   8734    pmulhw               m1, m2     ; (-m * (b - a)) << 10
   8735    paddw                m0, m1     ; + b
   ; same again for the high 8 mask bytes / second 8 pixels
   8736    mova                 m1,     [tmp2q+(%1+1)*mmsize] ; b
   8737    psubw                m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
   8738    paddw                m2, m2  ; (b - a) << 1
   8739    mova                 m6, m3  ; (-m << 1)
   8740    punpckhbw            m3, m4, m6 ; (-m << 9)
   8741    pmulhw               m2, m3 ; (-m * (b - a)) << 10
   8742    paddw                m1, m2 ; + b
   8743    pmulhrsw             m0, m5 ; round
   8744    pmulhrsw             m1, m5 ; round
   8745    packuswb             m0, m1 ; interleave 16 -> 8
   8746 %endmacro
   8747 
   ; MASK_INC_PTR %1=count: advance the three pointers consumed by MASK.
   ; The mask is one byte per pixel, so it advances at half the rate of
   ; the two int16 intermediate buffers.
   8748 %macro MASK_INC_PTR 1
   8749    add               maskq, %1*mmsize/2
   8750    add               tmp1q, %1*mmsize
   8751    add               tmp2q, %1*mmsize
   8752 %endmacro
   8753 
   ;-------------------------------------------------------------------
   ; mask_8bpc(dst, stride, tmp1, tmp2, w, h, mask)
   ; Per-pixel masked blend of two intermediate buffers (see MASK).
   ; On x86-32 there are not enough registers, so h stays on the stack
   ; (hd aliased to the r5m stack slot).
   ;-------------------------------------------------------------------
   8754 %if ARCH_X86_64
   8755 cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
   8756    movifnidn            hd, hm
   8757 %else
   8758 cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
   8759 %define hd dword r5m
   8760 %endif
   8761 %define base r6-mask_ssse3_table
   8762    LEA                  r6, mask_ssse3_table
   8763    tzcnt                wd, wm ; log2(w)
   8764    movsxd               wq, dword [r6+wq*4]
   8765    pxor                 m4, m4 ; zero reg used by MASK for negation/interleave
   8766    mova                 m5, [base+pw_2048]
   8767    add                  wq, r6
   8768    mov               maskq, r6m
   8769    BIDIR_FN           MASK
   8770 %undef hd
   8771 
   ; W_MASK_420_END mask_offset(s)...
   ; Finishes the second row of a w32/w64/w128 pair in w_mask_420:
   ; for each 16-byte mask slot given, it combines this row's (64-m)
   ; sums with the previous row's (stored in [maskq] and, for the odd
   ; 16-pixel chunks, temporarily stashed in the row-1 dst slots before
   ; they are overwritten with pixels), subtracts from m7 (258 - sign)
   ; and shifts right by 2 to produce the final 4:2:0 mask bytes.
   8772 %macro W_MASK_420_END 1-*
   8773 %rep %0
   8774    call .main
   8775    paddw                m2, [maskq+16*%1] ; row0 + row1 (64-m) sums
   8776    mova      [maskq+16*%1], m2
   8777    mova [dstq+strideq*1+16*(2*%1+0)], m0
   8778    call .main
   8779    psubw                m3, m7, m2
   8780    psubw                m1, m7, [maskq+16*%1]
   ; row-1 dst slot still holds the row0 (64-m) sums for this chunk
   8781    psubw                m3, [dstq+strideq*1+16*(2*%1+1)]
   8782    psrlw                m1, 2
   8783    psrlw                m3, 2
   8784    packuswb             m1, m3
   8785    mova      [maskq+16*%1], m1 ; final subsampled mask bytes
   8786    mova [dstq+strideq*1+16*(2*%1+1)], m0
   8787    %rotate 1
   8788 %endrep
   8789 %endmacro
   8790 
   ; pick the scratch/table register t0 per ABI: r7 on unix64, r5 on
   ; win64 and x86-32
   8791 %if UNIX64
   8792 DECLARE_REG_TMP 7
   8793 %else
   8794 DECLARE_REG_TMP 5
   8795 %endif
   8796 
   ;-------------------------------------------------------------------
   ; w_mask_420_8bpc(dst, stride, tmp1, tmp2, w, h, mask, sign)
   ; Bidirectional blend where the per-pixel weight m is derived from
   ; |tmp1 - tmp2| in .main; the blended pixels go to dst and a 2x2
   ; subsampled mask goes to maskq.  .main leaves per-2-pixel (64-m)
   ; sums in m2 (horizontal pairing via phaddw); the per-width loops do
   ; the vertical pairing, then compute mask = ((258-sign) - sums) >> 2.
   ; tmp2q is pre-biased to be an offset from tmp1q so .main only needs
   ; to advance one pointer.
   ;-------------------------------------------------------------------
   8797 cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
   8798 %define base t0-w_mask_420_ssse3_table
   8799    LEA                  t0, w_mask_420_ssse3_table
   8800    tzcnt                wd, wm
   8801    mov                 r6d, r7m ; sign
   8802    sub               tmp2q, tmp1q ; tmp2 becomes an offset from tmp1
   8803    movsxd               wq, [t0+wq*4]
   8804    mova                 m6, [base+pw_2048]
   8805    movddup              m7, [base+wm_420_sign+r6*8] ; 258 - sign
   8806    add                  wq, t0
   8807 %if ARCH_X86_64
   8808    mova                 m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
   8809    movifnidn            hd, hm
   8810 %else
   8811    %define              m8  [base+pw_6903]
   8812    %define              hd  dword hm
   8813 %endif
   8814    mov               maskq, maskmp
   8815    call .main
   8816    jmp                  wq
   8817 .w4_loop:
   8818    call .main
   8819    add               maskq, 4
   8820    lea                dstq, [dstq+strideq*2]
   8821 .w4:
   ; m2 holds 4 rows x 2 sums; split even/odd rows and pair vertically
   8822    pshufd               m3, m2, q2020
   8823    pshufd               m2, m2, q3131
   8824    psubw                m1, m7, m3
   8825    psubw                m1, m2
   8826    psrlw                m1, 2
   8827    packuswb             m1, m1
   8828    movd            [maskq], m1
   8829    movd   [dstq+strideq*0], m0
   8830    pshuflw              m1, m0, q1032
   8831    movd   [dstq+strideq*1], m1
   8832    punpckhqdq           m0, m0
   8833    lea                dstq, [dstq+strideq*2]
   8834    movd   [dstq+strideq*0], m0
   8835    pshuflw              m1, m0, q1032
   8836    movd   [dstq+strideq*1], m1
   8837    sub                  hd, 4
   8838    jg .w4_loop
   8839    RET
   8840 .w8_loop:
   8841    call .main
   8842    add               maskq, 4
   8843    lea                dstq, [dstq+strideq*2]
   8844 .w8:
   ; two rows of sums in m2: low half = row 0, high half = row 1
   8845    movhlps              m3, m2
   8846    psubw                m1, m7, m2
   8847    psubw                m1, m3
   8848    psrlw                m1, 2
   8849    packuswb             m1, m1
   8850    movd            [maskq], m1
   8851    movq   [dstq+strideq*0], m0
   8852    movhps [dstq+strideq*1], m0
   8853    sub                  hd, 2
   8854    jg .w8_loop
   8855    RET
   8856 .w16_loop:
   8857    call .main
   8858    add               maskq, 8
   8859    lea                dstq, [dstq+strideq*2]
   8860 .w16:
   ; stash row0 sums in the row-1 dst slot until row1 is computed
   8861    mova   [dstq+strideq*1], m2
   8862    mova   [dstq+strideq*0], m0
   8863    call .main
   8864    psubw                m1, m7, [dstq+strideq*1]
   8865    psubw                m1, m2
   8866    psrlw                m1, 2
   8867    packuswb             m1, m1
   8868    movq            [maskq], m1
   8869    mova   [dstq+strideq*1], m0
   8870    sub                  hd, 2
   8871    jg .w16_loop
   8872    RET
   8873 .w32_loop:
   8874    call .main
   8875    add               maskq, 16
   8876    lea                dstq, [dstq+strideq*2]
   8877 .w32:
   ; row0 sums for chunk0 go to maskq; chunk1 sums stashed in dst row 1
   8878    mova            [maskq], m2
   8879    mova [dstq+strideq*0+16*0], m0
   8880    call .main
   8881    mova [dstq+strideq*1+16*1], m2
   8882    mova [dstq+strideq*0+16*1], m0
   8883    W_MASK_420_END        0
   8884    sub                  hd, 2
   8885    jg .w32_loop
   8886    RET
   8887 .w64_loop:
   8888    call .main
   8889    add               maskq, 16*2
   8890    lea                dstq, [dstq+strideq*2]
   8891 .w64:
   ; even chunks' sums to maskq slots, odd chunks' stashed in dst row 1
   8892    mova       [maskq+16*0], m2
   8893    mova [dstq+strideq*0+16*0], m0
   8894    call .main
   8895    mova [dstq+strideq*1+16*1], m2
   8896    mova [dstq+strideq*0+16*1], m0
   8897    call .main
   8898    mova       [maskq+16*1], m2
   8899    mova [dstq+strideq*0+16*2], m0
   8900    call .main
   8901    mova [dstq+strideq*1+16*3], m2
   8902    mova [dstq+strideq*0+16*3], m0
   8903    W_MASK_420_END        0, 1
   8904    sub                  hd, 2
   8905    jg .w64_loop
   8906    RET
   8907 .w128_loop:
   8908    call .main
   8909    add               maskq, 16*4
   8910    lea                dstq, [dstq+strideq*2]
   8911 .w128:
   8912    mova       [maskq+16*0], m2
   8913    mova [dstq+strideq*0+16*0], m0
   8914    call .main
   8915    mova [dstq+strideq*1+16*1], m2
   8916    mova [dstq+strideq*0+16*1], m0
   8917    call .main
   8918    mova       [maskq+16*1], m2
   8919    mova [dstq+strideq*0+16*2], m0
   8920    call .main
   8921    mova [dstq+strideq*1+16*3], m2
   8922    mova [dstq+strideq*0+16*3], m0
   8923    call .main
   8924    mova       [maskq+16*2], m2
   8925    mova [dstq+strideq*0+16*4], m0
   8926    call .main
   8927    mova [dstq+strideq*1+16*5], m2
   8928    mova [dstq+strideq*0+16*5], m0
   8929    call .main
   8930    mova       [maskq+16*3], m2
   8931    mova [dstq+strideq*0+16*6], m0
   8932    call .main
   8933    mova [dstq+strideq*1+16*7], m2
   8934    mova [dstq+strideq*0+16*7], m0
   8935    W_MASK_420_END        0, 1, 2, 3
   8936    sub                  hd, 2
   8937    jg .w128_loop
   8938    RET
   8939 ALIGN function_align
   ; .main: process 16 pixels.
   ; Out: m0 = 16 packed blended uint8 pixels,
   ;      m2 = 8 uint16 horizontally-paired (64-m) sums (phaddw).
   ; Advances tmp1q by 32 bytes (tmp2 is an offset from tmp1).
   8940 .main:
   8941    mova                 m0, [tmp1q      +16*0]
   8942    mova                 m3, [tmp1q+tmp2q+16*0]
   8943    mova                 m1, [tmp1q      +16*1]
   8944    mova                 m4, [tmp1q+tmp2q+16*1]
   8945    add               tmp1q, 16*2
   8946    psubw                m3, m0
   8947    psubw                m4, m1
   8948    pabsw                m5, m3 ; |tmp2 - tmp1|
   8949    psubusw              m2, m8, m5
   8950    psrlw                m2, 8 ; 64 - m
   8951    psllw                m5, m2, 10
   8952    pmulhw               m3, m5 ; (diff * (64-m)) scaled
   8953    pabsw                m5, m4
   8954    paddw                m0, m3
   8955    psubusw              m3, m8, m5
   8956    psrlw                m3, 8
   8957    phaddw               m2, m3 ; sum horizontal (64-m) pairs: 2:1 subsample
   8958    psllw                m3, 10
   8959    pmulhw               m4, m3
   8960    paddw                m1, m4
   8961    pmulhrsw             m0, m6 ; round to pixel range
   8962    pmulhrsw             m1, m6
   8963    packuswb             m0, m1
   8964    ret
   8965 
   ; W_MASK_422_BACKUP %1=mask_offset
   ; Save the current chunk's (64-m) sums (m2) until the partner chunk
   ; is ready: in m10 on x86-64, or spilled to the mask buffer (later
   ; overwritten with the final bytes) on x86-32.
   8966 %macro W_MASK_422_BACKUP 1 ; mask_offset
   8967 %if ARCH_X86_64
   8968    mova                m10, m2
   8969 %else
   8970    mova      [maskq+16*%1], m2
   8971 %endif
   8972 %endmacro
   8973 
   ; W_MASK_422_END %1=mask_offset
   ; Combine the backed-up sums with the current chunk's (m2), pack to
   ; bytes, subtract from m7 (128 - sign) and halve with round (pavgb
   ; against zero) to produce the final 4:2:2 mask bytes at maskq.
   8974 %macro W_MASK_422_END 1 ; mask_offset
   8975 %if ARCH_X86_64
   8976    packuswb            m10, m2
   8977    psubb                m1, m7, m10
   8978    pavgb                m1, m9 ; m9 = 0 -> rounded >>1
   8979 %else
   ; no spare register on x86-32: reload the spill and make a zero reg
   8980    mova                 m3, [maskq+16*%1]
   8981    packuswb             m3, m2
   8982    pxor                 m2, m2
   8983    psubb                m1, m7, m3
   8984    pavgb                m1, m2
   8985 %endif
   8986    mova      [maskq+16*%1], m1
   8987 %endmacro
   8988 
   ;-------------------------------------------------------------------
   ; w_mask_422_8bpc(dst, stride, tmp1, tmp2, w, h, mask, sign)
   ; Same blend as w_mask_420 (it reuses that function's .main, which
   ; already pairs (64-m) horizontally via phaddw), but the mask is
   ; only subsampled horizontally: each pair sum is turned into a byte
   ; via ((128-sign) - sum + 1) >> 1 in W_MASK_422_END, no vertical
   ; pairing.  On x86-32 t0 is rebased so the 420 .main's [base+...]
   ; references resolve against the 420 table.
   ;-------------------------------------------------------------------
   8989 cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask
   8990 %define base t0-w_mask_422_ssse3_table
   8991    LEA                  t0, w_mask_422_ssse3_table
   8992    tzcnt                wd, wm
   8993    mov                 r6d, r7m ; sign
   8994    sub               tmp2q, tmp1q ; tmp2 becomes an offset from tmp1
   8995    movsxd               wq, [t0+wq*4]
   8996    mova                 m6, [base+pw_2048]
   8997    movddup              m7, [base+wm_422_sign+r6*8] ; 128 - sign
   8998    add                  wq, t0
   8999 %if ARCH_X86_64
   9000    mova                 m8, [base+pw_6903]
   9001    pxor                 m9, m9
   9002    movifnidn            hd, hm
   9003 %else
   9004    add                  t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table
   9005    %define              hd  dword hm
   9006 %endif
   9007    mov               maskq, maskmp
   9008    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9009    jmp                  wq
   9010 .w4_loop:
   9011    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9012    add               maskq, 8
   9013    lea                dstq, [dstq+strideq*2]
   9014 .w4:
   9015    packuswb             m2, m2
   9016    psubb                m1, m7, m2
   9017 %if ARCH_X86_64
   9018    pavgb                m1, m9
   9019 %else
   9020    pxor                 m2, m2
   9021    pavgb                m1, m2
   9022 %endif
   9023    movq            [maskq], m1
   9024    movd   [dstq+strideq*0], m0
   9025    pshuflw              m1, m0, q1032
   9026    movd   [dstq+strideq*1], m1
   9027    punpckhqdq           m0, m0
   9028    lea                dstq, [dstq+strideq*2]
   9029    movd   [dstq+strideq*0], m0
   9030    pshuflw              m1, m0, q1032
   9031    movd   [dstq+strideq*1], m1
   9032    sub                  hd, 4
   9033    jg .w4_loop
   9034    RET
   9035 .w8_loop:
   9036    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9037    add               maskq, 16
   9038    lea                dstq, [dstq+strideq*2]
   9039 .w8:
   9040    W_MASK_422_BACKUP     0
   9041    movq   [dstq+strideq*0], m0
   9042    movhps [dstq+strideq*1], m0
   9043    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9044    lea                dstq, [dstq+strideq*2]
   9045    W_MASK_422_END        0
   9046    movq   [dstq+strideq*0], m0
   9047    movhps [dstq+strideq*1], m0
   9048    sub                  hd, 4
   9049    jg .w8_loop
   9050    RET
   9051 .w16_loop:
   9052    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9053    add               maskq, 16
   9054    lea                dstq, [dstq+strideq*2]
   9055 .w16:
   9056    W_MASK_422_BACKUP     0
   9057    mova   [dstq+strideq*0], m0
   9058    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9059    W_MASK_422_END        0
   9060    mova   [dstq+strideq*1], m0
   9061    sub                  hd, 2
   9062    jg .w16_loop
   9063    RET
   9064 .w32_loop:
   9065    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9066    add               maskq, 16
   9067    add                dstq, strideq
   9068 .w32:
   9069    W_MASK_422_BACKUP     0
   9070    mova        [dstq+16*0], m0
   9071    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9072    W_MASK_422_END        0
   9073    mova        [dstq+16*1], m0
   9074    dec                  hd
   9075    jg .w32_loop
   9076    RET
   9077 .w64_loop:
   9078    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9079    add               maskq, 16*2
   9080    add                dstq, strideq
   9081 .w64:
   9082    W_MASK_422_BACKUP     0
   9083    mova        [dstq+16*0], m0
   9084    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9085    W_MASK_422_END        0
   9086    mova        [dstq+16*1], m0
   9087    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9088    W_MASK_422_BACKUP     1
   9089    mova        [dstq+16*2], m0
   9090    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9091    W_MASK_422_END        1
   9092    mova        [dstq+16*3], m0
   9093    dec                  hd
   9094    jg .w64_loop
   9095    RET
   9096 .w128_loop:
   9097    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9098    add               maskq, 16*4
   9099    add                dstq, strideq
   9100 .w128:
   9101    W_MASK_422_BACKUP     0
   9102    mova        [dstq+16*0], m0
   9103    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9104    W_MASK_422_END        0
   9105    mova        [dstq+16*1], m0
   9106    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9107    W_MASK_422_BACKUP     1
   9108    mova        [dstq+16*2], m0
   9109    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9110    W_MASK_422_END        1
   9111    mova        [dstq+16*3], m0
   9112    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9113    W_MASK_422_BACKUP     2
   9114    mova        [dstq+16*4], m0
   9115    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9116    W_MASK_422_END        2
   9117    mova        [dstq+16*5], m0
   9118    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9119    W_MASK_422_BACKUP     3
   9120    mova        [dstq+16*6], m0
   9121    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
   9122    W_MASK_422_END        3
   9123    mova        [dstq+16*7], m0
   9124    dec                  hd
   9125    jg .w128_loop
   9126    RET
   9127 
   ;-------------------------------------------------------------------
   ; w_mask_444_8bpc(dst, stride, tmp1, tmp2, w, h, mask)
   ; Same blend as w_mask_420, but the mask is stored at full
   ; resolution: this function's own .main packs (64-m) to bytes and
   ; writes m = 64 - (64-m) directly to maskq (no sign parameter, no
   ; subsampling), so the per-width loops only store pixels.
   ;-------------------------------------------------------------------
   9128 cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
   9129 %define base t0-w_mask_444_ssse3_table
   9130    LEA                  t0, w_mask_444_ssse3_table
   9131    tzcnt                wd, wm
   9132    mov               maskq, maskmp
   9133    sub               tmp2q, tmp1q ; tmp2 becomes an offset from tmp1
   9134    movsxd               wq, [t0+wq*4]
   9135    mova                 m6, [base+pw_6903]
   9136    mova                 m7, [base+pw_2048]
   9137    add                  wq, t0
   9138 %if ARCH_X86_64
   9139    mova                 m8, [base+pb_64]
   9140    movifnidn            hd, hm
   9141 %else
   9142    %define              m8  [base+pb_64]
   9143    %define              hd  dword hm
   9144 %endif
   9145    call .main
   9146    jmp                  wq
   9147 .w4_loop:
   9148    call .main
   9149    lea                dstq, [dstq+strideq*2]
   9150 .w4:
   9151    movd   [dstq+strideq*0], m0
   9152    pshuflw              m1, m0, q1032
   9153    movd   [dstq+strideq*1], m1
   9154    punpckhqdq           m0, m0
   9155    lea                dstq, [dstq+strideq*2]
   9156    movd   [dstq+strideq*0], m0
   9157    pshuflw              m1, m0, q1032
   9158    movd   [dstq+strideq*1], m1
   9159    sub                  hd, 4
   9160    jg .w4_loop
   9161    RET
   9162 .w8_loop:
   9163    call .main
   9164    lea                dstq, [dstq+strideq*2]
   9165 .w8:
   9166    movq   [dstq+strideq*0], m0
   9167    movhps [dstq+strideq*1], m0
   9168    sub                  hd, 2
   9169    jg .w8_loop
   9170    RET
   9171 .w16_loop:
   9172    call .main
   9173    lea                dstq, [dstq+strideq*2]
   9174 .w16:
   9175    mova   [dstq+strideq*0], m0
   9176    call .main
   9177    mova   [dstq+strideq*1], m0
   9178    sub                  hd, 2
   9179    jg .w16_loop
   9180    RET
   9181 .w32_loop:
   9182    call .main
   9183    add                dstq, strideq
   9184 .w32:
   9185    mova        [dstq+16*0], m0
   9186    call .main
   9187    mova        [dstq+16*1], m0
   9188    dec                  hd
   9189    jg .w32_loop
   9190    RET
   9191 .w64_loop:
   9192    call .main
   9193    add                dstq, strideq
   9194 .w64:
   9195    mova        [dstq+16*0], m0
   9196    call .main
   9197    mova        [dstq+16*1], m0
   9198    call .main
   9199    mova        [dstq+16*2], m0
   9200    call .main
   9201    mova        [dstq+16*3], m0
   9202    dec                  hd
   9203    jg .w64_loop
   9204    RET
   9205 .w128_loop:
   9206    call .main
   9207    add                dstq, strideq
   9208 .w128:
   9209    mova        [dstq+16*0], m0
   9210    call .main
   9211    mova        [dstq+16*1], m0
   9212    call .main
   9213    mova        [dstq+16*2], m0
   9214    call .main
   9215    mova        [dstq+16*3], m0
   9216    call .main
   9217    mova        [dstq+16*4], m0
   9218    call .main
   9219    mova        [dstq+16*5], m0
   9220    call .main
   9221    mova        [dstq+16*6], m0
   9222    call .main
   9223    mova        [dstq+16*7], m0
   9224    dec                  hd
   9225    jg .w128_loop
   9226    RET
   9227 ALIGN function_align
   ; .main: process 16 pixels; m0 = packed blended pixels, and the 16
   ; full-resolution mask bytes are written to maskq (which advances).
   9228 .main:
   9229    mova                 m0, [tmp1q      +16*0]
   9230    mova                 m3, [tmp1q+tmp2q+16*0]
   9231    mova                 m1, [tmp1q      +16*1]
   9232    mova                 m4, [tmp1q+tmp2q+16*1]
   9233    add               tmp1q, 16*2
   9234    psubw                m3, m0
   9235    psubw                m4, m1
   9236    pabsw                m5, m3 ; |tmp2 - tmp1|
   9237    psubusw              m2, m6, m5
   9238    psrlw                m2, 8 ; 64 - m
   9239    psllw                m5, m2, 10
   9240    pmulhw               m3, m5
   9241    pabsw                m5, m4
   9242    paddw                m0, m3
   9243    psubusw              m3, m6, m5
   9244    psrlw                m3, 8
   9245    packuswb             m2, m3 ; (64-m) as bytes, full resolution
   9246    psllw                m3, 10
   9247    pmulhw               m4, m3
   9248    psubb                m3, m8, m2 ; m = 64 - (64-m)
   9249    paddw                m1, m4
   9250    pmulhrsw             m0, m7
   9251    pmulhrsw             m1, m7
   9252    mova            [maskq], m3
   9253    add               maskq, 16
   9254    packuswb             m0, m1
   9255    ret
   9256 
   ; BLEND_64M %1=a, %2=b, %3=mask1, %4=mask2
   ; Blend 16 pixels of a with b using interleaved {m;(64-m)} byte
   ; weights via pmaddubsw; m5 = pw_512 rounding constant.
   ; Out: m0 = 16 packed uint8 pixels.  Clobbers m0 and %1.
   9257 %macro BLEND_64M 4; a, b, mask1, mask2
   9258    punpcklbw            m0, %1, %2; {b;a}[7..0]
   9259    punpckhbw            %1, %2    ; {b;a}[15..8]
   9260    pmaddubsw            m0, %3    ; {b*m[0] + (64-m[0])*a}[7..0] u16
   9261    pmaddubsw            %1, %4    ; {b*m[1] + (64-m[1])*a}[15..8] u16
   9262    pmulhrsw             m0, m5    ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
   9263    pmulhrsw             %1, m5    ; {((b*m[1] + (64-m[1])*a) + 1) / 32}[15..8] u16
   9264    packuswb             m0, %1    ; {blendpx}[15..0] u8
   9265 %endmacro
   9266 
   ; BLEND %1=a, %2=b
   ; Builds the interleaved {m;(64-m)} weights from the raw mask bytes
   ; in m0 (m4 = pb_64), then blends via BLEND_64M.  Result in m0.
   9267 %macro BLEND 2; a, b
   9268    psubb                m3, m4, m0 ; m3 = (64 - m)
   9269    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
   9270    punpckhbw            m3, m0     ; {m;(64-m)}[15..8]
   9271    BLEND_64M            %1, %2, m2, m3
   9272 %endmacro
   9273 
   ;-------------------------------------------------------------------
   ; blend_8bpc(dst, ds, tmp, w, h, mask)
   ; Per-pixel blend of dst with tmp using a full-resolution byte mask
   ; (dst = (tmp*m + dst*(64-m) + 32) >> 6, per BLEND/BLEND_64M).
   ; Dispatches through blend_ssse3_table (indexed by log2(w)).
   ;-------------------------------------------------------------------
   9274 cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
   9275 %define base r6-blend_ssse3_table
   9276    LEA                  r6, blend_ssse3_table
   9277    tzcnt                wd, wm
   9278    movifnidn            hd, hm
   9279    movifnidn         maskq, maskmp
   9280    movsxd               wq, dword [r6+wq*4]
   9281    mova                 m4, [base+pb_64]
   9282    mova                 m5, [base+pw_512]
   9283    add                  wq, r6
   9284    lea                  r6, [dsq*3]
   9285    jmp                  wq
   9286 .w4:
   ; 2 rows of 4 pixels per iteration
   9287    movq                 m0, [maskq]; m
   9288    movd                 m1, [dstq+dsq*0] ; a
   9289    movd                 m6, [dstq+dsq*1]
   9290    punpckldq            m1, m6
   9291    movq                 m6, [tmpq] ; b
   9292    psubb                m3, m4, m0 ; m3 = (64 - m)
   9293    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
   9294    punpcklbw            m1, m6    ; {b;a}[7..0]
   9295    pmaddubsw            m1, m2    ; {b*m[0] + (64-m[0])*a}[7..0] u16
   9296    pmulhrsw             m1, m5    ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
   9297    packuswb             m1, m0    ; {blendpx}[15..0] u8 (only low 8 bytes stored)
   9298    movd       [dstq+dsq*0], m1
   9299    psrlq                m1, 32
   9300    movd       [dstq+dsq*1], m1
   9301    add               maskq, 8
   9302    add                tmpq, 8
   9303    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
   9304    sub                  hd, 2
   9305    jg .w4
   9306    RET
   9307 .w8:
   ; 2 rows of 8 pixels per iteration
   9308    mova                 m0, [maskq]; m
   9309    movq                 m1, [dstq+dsq*0] ; a
   9310    movhps               m1, [dstq+dsq*1]
   9311    mova                 m6, [tmpq] ; b
   9312    BLEND                m1, m6
   9313    movq       [dstq+dsq*0], m0
   9314    movhps     [dstq+dsq*1], m0
   9315    add               maskq, 16
   9316    add                tmpq, 16
   9317    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
   9318    sub                  hd, 2
   9319    jg .w8
   9320    RET
   9321 .w16:
   ; 1 row of 16 pixels per iteration
   9322    mova                 m0, [maskq]; m
   9323    mova                 m1, [dstq] ; a
   9324    mova                 m6, [tmpq] ; b
   9325    BLEND                m1, m6
   9326    mova             [dstq], m0
   9327    add               maskq, 16
   9328    add                tmpq, 16
   9329    add                dstq, dsq ; dst_stride
   9330    dec                  hd
   9331    jg .w16
   9332    RET
   9333 .w32:
   ; 1 row of 32 pixels per iteration, unrolled as two 16-pixel halves
   9334    %assign i 0
   9335    %rep 2
   9336    mova                 m0, [maskq+16*i]; m
   9337    mova                 m1, [dstq+16*i] ; a
   9338    mova                 m6, [tmpq+16*i] ; b
   9339    BLEND                m1, m6
   9340    mova        [dstq+i*16], m0
   9341    %assign i i+1
   9342    %endrep
   9343    add               maskq, 32
   9344    add                tmpq, 32
   9345    add                dstq, dsq ; dst_stride
   9346    dec                  hd
   9347    jg .w32
   9348    RET
   9349 
   ;-------------------------------------------------------------------
   ; blend_v_8bpc(dst, ds, tmp, w, h)
   ; OBMC vertical-edge blend: weights come from the static obmc_masks
   ; table (stored with 64-m interleaved, see top of file) rather than
   ; a caller mask, and are constant down each column.
   ;-------------------------------------------------------------------
   9350 cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
   9351 %define base r5-blend_v_ssse3_table
   9352    LEA                  r5, blend_v_ssse3_table
   9353    tzcnt                wd, wm
   9354    movifnidn            hd, hm
   9355    movsxd               wq, dword [r5+wq*4]
   9356    mova                 m5, [base+pw_512]
   9357    add                  wq, r5
   9358    add               maskq, obmc_masks-blend_v_ssse3_table ; maskq -> obmc_masks
   9359    jmp                  wq
   9360 .w2:
   9361    movd                 m3, [maskq+4]
   9362    punpckldq            m3, m3
   9363    ; 2 mask blend is provided for 4 pixels / 2 lines
   9364 .w2_loop:
   9365    movd                 m1, [dstq+dsq*0] ; a {..;a;a}
   9366    pinsrw               m1, [dstq+dsq*1], 1
   9367    movd                 m2, [tmpq] ; b
   9368    punpcklbw            m0, m1, m2; {b;a}[7..0]
   9369    pmaddubsw            m0, m3    ; {b*m + (64-m)*a}[7..0] u16
   9370    pmulhrsw             m0, m5    ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
   9371    packuswb             m0, m1    ; {blendpx}[8..0] u8
   ; store 2 pixels per row through a GPR (no 2-byte SSE store)
   9372    movd                r3d, m0
   9373    mov        [dstq+dsq*0], r3w
   9374    shr                 r3d, 16
   9375    mov        [dstq+dsq*1], r3w
   9376    add                tmpq, 2*2
   9377    lea                dstq, [dstq + dsq * 2]
   9378    sub                  hd, 2
   9379    jg .w2_loop
   9380    RET
   9381 .w4:
   9382    movddup              m3, [maskq+8]
   9383    ; 4 mask blend is provided for 8 pixels / 2 lines
   9384 .w4_loop:
   9385    movd                 m1, [dstq+dsq*0] ; a
   9386    movd                 m2, [dstq+dsq*1] ;
   9387    punpckldq            m1, m2
   9388    movq                 m2, [tmpq] ; b
   9389    punpcklbw            m1, m2    ; {b;a}[7..0]
   9390    pmaddubsw            m1, m3    ; {b*m + (64-m)*a}[7..0] u16
   9391    pmulhrsw             m1, m5    ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
   9392    packuswb             m1, m1    ; {blendpx}[8..0] u8
   9393    movd             [dstq], m1
   9394    psrlq                m1, 32
   9395    movd       [dstq+dsq*1], m1
   9396    add                tmpq, 2*4
   9397    lea                dstq, [dstq+dsq*2]
   9398    sub                  hd, 2
   9399    jg .w4_loop
   9400    RET
   9401 .w8:
   9402    mova                 m3, [maskq+16]
   9403    ; 8 mask blend is provided for 16 pixels
   9404 .w8_loop:
   9405    movq                 m1, [dstq+dsq*0] ; a
   9406    movhps               m1, [dstq+dsq*1]
   9407    mova                 m2, [tmpq]; b
   9408    BLEND_64M            m1, m2, m3, m3
   9409    movq       [dstq+dsq*0], m0
   9410    movhps     [dstq+dsq*1], m0
   9411    add                tmpq, 16
   9412    lea                dstq, [dstq+dsq*2]
   9413    sub                  hd, 2
   9414    jg .w8_loop
   9415    RET
   9416 .w16:
   9417    ; 16 mask blend is provided for 32 pixels
   9418    mova                  m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
   9419    mova                  m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
   9420 .w16_loop:
   9421    mova                 m1, [dstq] ; a
   9422    mova                 m2, [tmpq] ; b
   9423    BLEND_64M            m1, m2, m3, m4
   9424    mova             [dstq], m0
   9425    add                tmpq, 16
   9426    add                dstq, dsq
   9427    dec                  hd
   9428    jg .w16_loop
   9429    RET
   9430 .w32:
   ; xmm6 is callee-saved on Windows x64; spill it before using m6
   9431 %if WIN64
   9432    mova            [rsp+8], xmm6
   9433 %endif
   9434    mova                 m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
   9435    mova                 m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
   9436    mova                 m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
   9437    ; 16 mask blend is provided for 64 pixels
   9438 .w32_loop:
   9439    mova                 m1, [dstq+16*0] ; a
   9440    mova                 m2, [tmpq+16*0] ; b
   9441    BLEND_64M            m1, m2, m3, m4
   ; only the first 8 pixels of the second half are blended
   9442    movq                 m1, [dstq+16*1] ; a
   9443    punpcklbw            m1, [tmpq+16*1] ; b
   9444    pmaddubsw            m1, m6
   9445    pmulhrsw             m1, m5
   9446    packuswb             m1, m1
   9447    mova        [dstq+16*0], m0
   9448    movq        [dstq+16*1], m1
   9449    add                tmpq, 32
   9450    add                dstq, dsq
   9451    dec                  hd
   9452    jg .w32_loop
   9453 %if WIN64
   9454    mova               xmm6, [rsp+8]
   9455 %endif
   9456    RET
   9457 
   9458 cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
        ; void blend_h(pixel *dst, ptrdiff_t stride, const pixel *tmp, int w, int h)
        ; Horizontal-edge OBMC blend: dst = (dst*(64-m) + tmp*m + 32) >> 6, with a
        ; per-row weight m taken from obmc_masks. The mask table stores interleaved
        ; (64-m, m) byte pairs (see obmc_masks at the top of this file), so one
        ; pmaddubsw per lane pair computes the whole weighted sum. Only the bottom
        ; 3/4 of the rows are blended; hq counts from -(h*3/4) up to 0.
   9459 %define base t0-blend_h_ssse3_table
   9460 %if ARCH_X86_32
   9461    ; We need to keep the PIC pointer for w4, reload wd from stack instead
   9462    DECLARE_REG_TMP 6
   9463 %else
   9464    DECLARE_REG_TMP 5
   9465    mov                 r6d, wd
   9466 %endif
   9467    LEA                  t0, blend_h_ssse3_table
   9468    tzcnt                wd, wm
   9469    mov                  hd, hm
   9470    movsxd               wq, dword [t0+wq*4]
   9471    mova                 m5, [base+pw_512]
        ; obmc_masks is grouped by block size at byte offset 2*h (2 bytes per row);
        ; advance by 2*(h*3/4) so that negative hq indexes the rows to blend.
   9472    add                  wq, t0
   9473    lea               maskq, [base+obmc_masks+hq*2]
   9474    lea                  hd, [hq*3]
   9475    shr                  hd, 2 ; h * 3/4
   9476    lea               maskq, [maskq+hq*2]
   9477    neg                  hq
   9478    jmp                  wq
   9479 .w2:
        ; 2 pixels wide, two rows per iteration; rows packed in one xmm register.
   9480    movd                 m0, [dstq+dsq*0]
   9481    pinsrw               m0, [dstq+dsq*1], 1
   9482    movd                 m2, [maskq+hq*2]
   9483    movd                 m1, [tmpq]
        ; duplicate each row's (64-m, m) pair, interleave dst/tmp bytes, then
        ; pmaddubsw gives dst*(64-m)+tmp*m; pmulhrsw by 512 is (x+32)>>6.
   9484    punpcklwd            m2, m2
   9485    punpcklbw            m0, m1
   9486    pmaddubsw            m0, m2
   9487    pmulhrsw             m0, m5
   9488    packuswb             m0, m0
   9489    movd                r3d, m0
   9490    mov        [dstq+dsq*0], r3w
   9491    shr                 r3d, 16
   9492    mov        [dstq+dsq*1], r3w
   9493    lea                dstq, [dstq+dsq*2]
   9494    add                tmpq, 2*2
   9495    add                  hq, 2
   9496    jl .w2
   9497    RET
   9498 .w4:
        ; blend_shuf broadcasts the two per-row mask pairs across the 4-pixel rows.
   9499 %if ARCH_X86_32
   9500    mova                 m3, [base+blend_shuf]
   9501 %else
   9502    mova                 m3, [blend_shuf]
   9503 %endif
   9504 .w4_loop:
   9505    movd                 m0, [dstq+dsq*0]
   9506    movd                 m2, [dstq+dsq*1]
   9507    punpckldq            m0, m2 ; a
   9508    movq                 m1, [tmpq] ; b
   9509    movq                 m2, [maskq+hq*2] ; m
   9510    pshufb               m2, m3
   9511    punpcklbw            m0, m1
   9512    pmaddubsw            m0, m2
   9513    pmulhrsw             m0, m5
   9514    packuswb             m0, m0
   9515    movd       [dstq+dsq*0], m0
   9516    psrlq                m0, 32
   9517    movd       [dstq+dsq*1], m0
   9518    lea                dstq, [dstq+dsq*2]
   9519    add                tmpq, 4*2
   9520    add                  hq, 2
   9521    jl .w4_loop
   9522    RET
   9523 .w8:
        ; 8 pixels wide, two rows per iteration; m3/m4 hold each row's mask pair
        ; broadcast to all lanes, blended via the shared BLEND_64M helper macro.
   9524    movd                 m4, [maskq+hq*2]
   9525    punpcklwd            m4, m4
   9526    pshufd               m3, m4, q0000
   9527    pshufd               m4, m4, q1111
   9528    movq                 m1, [dstq+dsq*0] ; a
   9529    movhps               m1, [dstq+dsq*1]
   9530    mova                 m2, [tmpq]
   9531    BLEND_64M            m1, m2, m3, m4
   9532    movq       [dstq+dsq*0], m0
   9533    movhps     [dstq+dsq*1], m0
   9534    lea                dstq, [dstq+dsq*2]
   9535    add                tmpq, 8*2
   9536    add                  hq, 2
   9537    jl .w8
   9538    RET
   9539 ; w16/w32/w64/w128
   9540 .w16:
        ; One row per outer iteration, 16 bytes per inner iteration. The stride is
        ; pre-decremented by w so that adding dsq after the inner loop (which has
        ; advanced dstq by w bytes) lands on the start of the next row.
   9541 %if ARCH_X86_32
   9542    mov                 r6d, wm
   9543 %endif
   9544    sub                 dsq, r6
   9545 .w16_loop0:
   9546    movd                 m3, [maskq+hq*2]
   9547    pshuflw              m3, m3, q0000
   9548    punpcklqdq           m3, m3
   9549    mov                  wd, r6d
   9550 .w16_loop:
   9551    mova                 m1, [dstq] ; a
   9552    mova                 m2, [tmpq] ; b
   9553    BLEND_64M            m1, m2, m3, m3
   9554    mova             [dstq], m0
   9555    add                dstq, 16
   9556    add                tmpq, 16
   9557    sub                  wd, 16
   9558    jg .w16_loop
   9559    add                dstq, dsq
   9560    inc                  hq
   9561    jl .w16_loop0
   9562    RET
   9563 
   9564 ; emu_edge args:
   9565 ; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
   9566 ; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
   9567 ; const pixel *ref, const ptrdiff_t ref_stride
   9568 ;
   9569 ; bw, bh total filled size
   9570 ; iw, ih, copied block -> fill bottom, right
   9571 ; x, y, offset in bw/bh -> fill top, left
   9572 cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \
   9573                                  y, dst, dstride, src, sstride, \
   9574                                  bottomext, rightext, blk
   9575    ; we assume that the buffer (stride) is larger than width, so we can
   9576    ; safely overwrite by a few bytes
        ; Edge emulation: copy the clipped iw x ih source block into the bw x bh
        ; destination, replicating edge pixels into the top/bottom/left/right
        ; extension regions (see the argument description above this function).
   9577    pxor                 m1, m1
        ; m1 = 0 is kept live for the whole function: it is the all-zero pshufb
        ; control used below to broadcast a single edge byte across a register.
   9578 
        ; On x86-32 there are too few registers, so several roles are aliased onto
        ; r0/r1 and spilled to the stack args (r*m) between uses; on x86-64 each
        ; role gets its own register. The %defines keep the code paths parallel.
   9579 %if ARCH_X86_64
   9580 %define reg_zero       r12q
   9581 %define reg_tmp        r10
   9582 %define reg_src        srcq
   9583 %define reg_bottomext  bottomextq
   9584 %define reg_rightext   rightextq
   9585 %define reg_blkm       r9m
   9586 %else
   9587 %define reg_zero       r6
   9588 %define reg_tmp        r0
   9589 %define reg_src        r1
   9590 %define reg_bottomext  r0
   9591 %define reg_rightext   r1
   9592 %define reg_blkm       r2m
   9593 %endif
   9594    ;
   9595    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
   9596    xor            reg_zero, reg_zero
   9597    lea             reg_tmp, [ihq-1]
   9598    cmp                  yq, ihq
   9599    cmovs           reg_tmp, yq
   9600    test                 yq, yq
   9601    cmovs           reg_tmp, reg_zero
   9602 %if ARCH_X86_64
   9603    imul            reg_tmp, sstrideq
   9604    add                srcq, reg_tmp
   9605 %else
   9606    imul            reg_tmp, sstridem
   9607    mov             reg_src, srcm
   9608    add             reg_src, reg_tmp
   9609 %endif
   9610    ;
   9611    ; ref += iclip(x, 0, iw - 1)
   9612    lea             reg_tmp, [iwq-1]
   9613    cmp                  xq, iwq
   9614    cmovs           reg_tmp, xq
   9615    test                 xq, xq
   9616    cmovs           reg_tmp, reg_zero
   9617    add             reg_src, reg_tmp
   9618 %if ARCH_X86_32
   9619    mov                srcm, reg_src
   9620 %endif
   9621    ;
   9622    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
   9623 %if ARCH_X86_32
   9624    mov                  r1, r1m ; restore bh
   9625 %endif
   9626    lea       reg_bottomext, [yq+bhq]
   9627    sub       reg_bottomext, ihq
   9628    lea                  r3, [bhq-1]
   9629    cmovs     reg_bottomext, reg_zero
   9630    ;
   9631 
        ; y is consumed; rename its slot to topext (top_ext is computed from -y).
   9632    DEFINE_ARGS bw, bh, iw, ih, x, \
   9633                topext, dst, dstride, src, sstride, \
   9634                bottomext, rightext, blk
   9635 
   9636    ; top_ext = iclip(-y, 0, bh - 1)
   9637    neg             topextq
   9638    cmovs           topextq, reg_zero
   9639    cmp       reg_bottomext, bhq
   9640    cmovns    reg_bottomext, r3
   9641    cmp             topextq, bhq
   9642    cmovg           topextq, r3
   9643 %if ARCH_X86_32
   9644    mov                 r4m, reg_bottomext
   9645    ;
   9646    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
   9647    mov                  r0, r0m ; restore bw
   9648 %endif
   9649    lea        reg_rightext, [xq+bwq]
   9650    sub        reg_rightext, iwq
   9651    lea                  r2, [bwq-1]
   9652    cmovs      reg_rightext, reg_zero
   9653 
        ; x is consumed; rename its slot to leftext (left_ext comes from -x).
   9654    DEFINE_ARGS bw, bh, iw, ih, leftext, \
   9655                topext, dst, dstride, src, sstride, \
   9656                bottomext, rightext, blk
   9657 
   9658    ; left_ext = iclip(-x, 0, bw - 1)
   9659    neg            leftextq
   9660    cmovs          leftextq, reg_zero
   9661    cmp        reg_rightext, bwq
   9662    cmovns     reg_rightext, r2
   9663 %if ARCH_X86_32
   9664    mov                 r3m, r1
   9665 %endif
   9666    cmp            leftextq, bwq
   9667    cmovns         leftextq, r2
   9668 
   9669 %undef reg_zero
   9670 %undef reg_tmp
   9671 %undef reg_src
   9672 %undef reg_bottomext
   9673 %undef reg_rightext
   9674 
   9675    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
   9676                topext, dst, dstride, src, sstride, \
   9677                bottomext, rightext, blk
   9678 
   9679    ; center_h = bh - top_ext - bottom_ext
   9680 %if ARCH_X86_64
   9681    lea                  r3, [bottomextq+topextq]
   9682    sub            centerhq, r3
   9683 %else
   9684    mov                   r1, centerhm ; restore r1
   9685    sub             centerhq, topextq
   9686    sub             centerhq, r4m
   9687    mov                  r1m, centerhq
   9688 %endif
   9689    ;
   9690    ; blk += top_ext * PXSTRIDE(dst_stride)
   9691    mov                  r2, topextq
   9692 %if ARCH_X86_64
   9693    imul                 r2, dstrideq
   9694 %else
   9695    mov                  r6, r6m ; restore dstq
   9696    imul                 r2, dstridem
   9697 %endif
   9698    add                dstq, r2
   9699    mov            reg_blkm, dstq ; save pointer for ext
   9700    ;
   9701    ; center_w = bw - left_ext - right_ext
   9702    mov            centerwq, bwq
   9703 %if ARCH_X86_64
   9704    lea                  r3, [rightextq+leftextq]
   9705    sub            centerwq, r3
   9706 %else
   9707    sub            centerwq, r3m
   9708    sub            centerwq, leftextq
   9709 %endif
   9710 
   9711 ; vloop Macro
        ; Emits one specialized copy of the per-row loop: for each of the centerh
        ; rows, optionally splat the leftmost source byte over the left extension,
        ; copy the center in mmsize chunks, and optionally splat the rightmost
        ; source byte over the right extension. %3 uniquely suffixes the labels.
   9712 %macro v_loop 3 ; need_left_ext, need_right_ext, suffix
   9713  %if ARCH_X86_64
   9714    %define reg_tmp        r12
   9715  %else
   9716    %define reg_tmp        r0
   9717  %endif
   9718 .v_loop_%3:
   9719  %if ARCH_X86_32
   9720    mov                  r0, r0m
   9721    mov                  r1, r1m
   9722  %endif
   9723 %if %1
   9724    ; left extension
   9725  %if ARCH_X86_64
   9726    movd                 m0, [srcq]
   9727  %else
   9728    mov                  r3, srcm
   9729    movd                 m0, [r3]
   9730  %endif
   9731    pshufb               m0, m1
   9732    xor                  r3, r3
   9733 .left_loop_%3:
   9734    mova          [dstq+r3], m0
   9735    add                  r3, mmsize
   9736    cmp                  r3, leftextq
   9737    jl .left_loop_%3
   9738    ; body
   9739    lea             reg_tmp, [dstq+leftextq]
   9740 %endif
   9741    xor                  r3, r3
   9742 .body_loop_%3:
   9743  %if ARCH_X86_64
   9744    movu                 m0, [srcq+r3]
   9745  %else
   9746    mov                  r1, srcm
   9747    movu                 m0, [r1+r3]
   9748  %endif
   9749 %if %1
   9750    movu       [reg_tmp+r3], m0
   9751 %else
   9752    movu          [dstq+r3], m0
   9753 %endif
   9754    add                  r3, mmsize
   9755    cmp                  r3, centerwq
   9756    jl .body_loop_%3
   9757 %if %2
   9758    ; right extension
   9759 %if %1
   9760    add             reg_tmp, centerwq
   9761 %else
   9762    lea             reg_tmp, [dstq+centerwq]
   9763 %endif
   9764  %if ARCH_X86_64
   9765    movd                 m0, [srcq+centerwq-1]
   9766  %else
   9767    mov                  r3, srcm
   9768    movd                 m0, [r3+centerwq-1]
   9769  %endif
   9770    pshufb               m0, m1
   9771    xor                  r3, r3
   9772 .right_loop_%3:
   9773    movu       [reg_tmp+r3], m0
   9774    add                  r3, mmsize
   9775  %if ARCH_X86_64
   9776    cmp                  r3, rightextq
   9777  %else
   9778    cmp                  r3, r3m
   9779  %endif
   9780    jl .right_loop_%3
   9781 %endif
   9782  %if ARCH_X86_64
   9783    add                dstq, dstrideq
   9784    add                srcq, sstrideq
   9785    dec            centerhq
   9786    jg .v_loop_%3
   9787  %else
   9788    add                dstq, dstridem
   9789    mov                  r0, sstridem
   9790    add                srcm, r0
   9791    sub       dword centerhm, 1
   9792    jg .v_loop_%3
   9793    mov                  r0, r0m ; restore r0
   9794  %endif
   9795 %endmacro ; vloop MACRO
   9796 
        ; Dispatch to one of four v_loop specializations depending on whether the
        ; left and/or right extensions are non-empty.
   9797    test           leftextq, leftextq
   9798    jnz .need_left_ext
   9799 %if ARCH_X86_64
   9800    test          rightextq, rightextq
   9801    jnz .need_right_ext
   9802 %else
   9803    cmp            leftextq, r3m ; leftextq == 0
   9804    jne .need_right_ext
   9805 %endif
   9806    v_loop                0, 0, 0
   9807    jmp .body_done
   9808 
   9809    ;left right extensions
   9810 .need_left_ext:
   9811 %if ARCH_X86_64
   9812    test          rightextq, rightextq
   9813 %else
   9814    mov                  r3, r3m
   9815    test                 r3, r3
   9816 %endif
   9817    jnz .need_left_right_ext
   9818    v_loop                1, 0, 1
   9819    jmp .body_done
   9820 
   9821 .need_left_right_ext:
   9822    v_loop                1, 1, 2
   9823    jmp .body_done
   9824 
   9825 .need_right_ext:
   9826    v_loop                0, 1, 3
   9827 
   9828 .body_done:
   9829 ; r0 ; bw
   9830 ; r1 ;; x loop
   9831 ; r4 ;; y loop
   9832 ; r5 ; topextq
   9833 ; r6 ;dstq
   9834 ; r7 ;dstrideq
   9835 ; r8 ; srcq
   9836 %if ARCH_X86_64
   9837 %define reg_dstride    dstrideq
   9838 %else
   9839 %define reg_dstride    r2
   9840 %endif
   9841    ;
   9842    ; bottom edge extension
        ; Replicate the last written row downward bottomext times, column-strip by
        ; column-strip (mmsize bytes wide).
   9843 %if ARCH_X86_64
   9844    test         bottomextq, bottomextq
   9845    jz .top
   9846 %else
   9847    xor                  r1, r1
   9848    cmp                  r1, r4m
   9849    je .top
   9850 %endif
   9851    ;
   9852 %if ARCH_X86_64
   9853    mov                srcq, dstq
   9854    sub                srcq, dstrideq
   9855    xor                  r1, r1
   9856 %else
   9857    mov                  r3, dstq
   9858    mov         reg_dstride, dstridem
   9859    sub                  r3, reg_dstride
   9860    mov                srcm, r3
   9861 %endif
   9862    ;
   9863 .bottom_x_loop:
   9864 %if ARCH_X86_64
   9865    mova                 m0, [srcq+r1]
   9866    lea                  r3, [dstq+r1]
   9867    mov                  r4, bottomextq
   9868 %else
   9869    mov                  r3, srcm
   9870    mova                 m0, [r3+r1]
   9871    lea                  r3, [dstq+r1]
   9872    mov                  r4, r4m
   9873 %endif
   9874    ;
   9875 .bottom_y_loop:
   9876    mova               [r3], m0
   9877    add                  r3, reg_dstride
   9878    dec                  r4
   9879    jg .bottom_y_loop
   9880    add                  r1, mmsize
   9881    cmp                  r1, bwq
   9882    jl .bottom_x_loop
   9883 
   9884 .top:
   9885    ; top edge extension
        ; Replicate the first filled row (saved in reg_blkm earlier) upward from
        ; the destination origin, topext rows per column strip.
   9886    test            topextq, topextq
   9887    jz .end
   9888 %if ARCH_X86_64
   9889    mov                srcq, reg_blkm
   9890 %else
   9891    mov                  r3, reg_blkm
   9892    mov         reg_dstride, dstridem
   9893 %endif
   9894    mov                dstq, dstm
   9895    xor                  r1, r1
   9896    ;
   9897 .top_x_loop:
   9898 %if ARCH_X86_64
   9899    mova                 m0, [srcq+r1]
   9900 %else
   9901    mov                  r3, reg_blkm
   9902    mova                 m0, [r3+r1]
   9903 %endif
   9904    lea                  r3, [dstq+r1]
   9905    mov                  r4, topextq
   9906    ;
   9907 .top_y_loop:
   9908    mova               [r3], m0
   9909    add                  r3, reg_dstride
   9910    dec                  r4
   9911    jg .top_y_loop
   9912    add                  r1, mmsize
   9913    cmp                  r1, bwq
   9914    jl .top_x_loop
   9915 
   9916 .end:
   9917    RET
   9918 
   9919 %undef reg_dstride
   9920 %undef reg_blkm
   9921 %undef reg_tmp
   9922 
   9923 cextern resize_filter
   9924 
   9925 %macro SCRATCH 3
        ; SCRATCH src, alias, slot
        ; Frees up register m<src> for reuse: on x86-32 (only 8 xmm regs) the
        ; value is spilled to stack slot %3 and m<alias> becomes a memory operand
        ; pointing at it; on x86-64 the value is simply swapped into the
        ; higher-numbered register m<alias>.
   9926 %if ARCH_X86_32
   9927    mova [rsp+%3*mmsize], m%1
   9928 %define m%2 [rsp+%3*mmsize]
   9929 %else
   9930    SWAP             %1, %2
   9931 %endif
   9932 %endmacro
   9933 
   9934 %if ARCH_X86_64
   9935 cglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \
   9936                                dst_w, h, src_w, dx, mx0
   9937 %elif STACK_ALIGNMENT >= 16
   9938 cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
   9939                                      dst_w, h, src_w, dx, mx0
   9940 %else
   9941 cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
   9942                                      dst_w, h, src_w, dx, mx0
   9943 %endif
        ; Horizontal resize of 8bpc rows. For each output pixel x, the source
        ; position mx = mx0 + x*dx in 14-bit fixed point selects 8 source taps
        ; and a filter from resize_filter (64 phases, 8 taps each; the low 6 bits
        ; of mx>>8 index the phase). Out-of-range positions are clamped and the
        ; loaded pixels edge-shuffled via resize_shuf/pb_8x0_8x8.
   9944    movifnidn          dstq, dstmp
   9945    movifnidn          srcq, srcmp
   9946 %if STACK_ALIGNMENT >= 16
   9947    movifnidn        dst_wd, dst_wm
   9948 %endif
   9949 %if ARCH_X86_64
   9950    movifnidn            hd, hm
   9951 %endif
        ; Pre-bias: mx0 -= 4<<14 (center the 8-tap window), src_w -= 8 (clamp
        ; limit so an 8-byte load never starts past the buffer end).
   9952    sub          dword mx0m, 4<<14
   9953    sub        dword src_wm, 8
   9954    movd                 m7, dxm
   9955    movd                 m6, mx0m
   9956    movd                 m5, src_wm
   9957    pshufd               m7, m7, q0000
   9958    pshufd               m6, m6, q0000
   9959    pshufd               m5, m5, q0000
   9960 
   9961 %if ARCH_X86_64
   9962    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
   9963    LEA                  r7, $$
   9964 %define base r7-$$
   9965 %else
   9966    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
   9967 %define hd dword r5m
   9968 %if STACK_ALIGNMENT >= 16
   9969    LEA                  r6, $$
   9970 %define base r6-$$
   9971 %else
   9972    LEA                  r4, $$
   9973 %define base r4-$$
   9974 %endif
   9975 %endif
   9976 
   9977 %if ARCH_X86_64
   9978    mova                m10, [base+pw_m256]
   9979    mova                 m9, [base+pd_63]
   9980    mova                 m8, [base+pb_8x0_8x8]
   9981 %else
   9982 %define m10 [base+pw_m256]
   9983 %define m9  [base+pd_63]
   9984 %define m8  [base+pb_8x0_8x8]
   9985 %endif
   9986    pmaddwd              m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
   9987    pslld                m7, 2                      ; dx*4
   9988    pslld                m5, 14
   9989    paddd                m6, m4                     ; mx+[0..3]*dx
   9990    SCRATCH               7, 13, 0
   9991    SCRATCH               6, 12, 1
   9992    SCRATCH               5, 11, 2
   9993 
   9994    ; m10 = pmulhrsw constant for x=(x+64)>>7
   9995    ; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = src_w, m9 = 0x3f, m8=0,8
   9996 
   9997 .loop_y:
   9998    xor                  xd, xd
   9999    mova                 m0, m12                    ; per-line working version of mx
  10000 
  10001 .loop_x:
        ; Clamp mx to [0, src_w<<14]: m1 = clamped position, m3 = (mx - clamped),
        ; which is non-zero only at the edges and becomes the pshufb fixup offset.
  10002    pxor                 m1, m1
  10003    pcmpgtd              m1, m0
  10004    pandn                m1, m0
  10005    psrad                m2, m0, 8                  ; filter offset (unmasked)
  10006    pcmpgtd              m3, m11, m1
  10007    pand                 m1, m3
  10008    pandn                m3, m11
  10009    por                  m1, m3
  10010    psubd                m3, m0, m1                 ; pshufb offset
  10011    psrad                m1, 14                     ; clipped src_x offset
  10012    psrad                m3, 14                     ; pshufb edge_emu offset
  10013    pand                 m2, m9                     ; filter offset (masked)
  10014 
  10015    ; load source pixels
        ; Quasi-gather: extract the four 32-bit offsets from m1 into GPRs and do
        ; four 8-byte loads (two movq + two movhps per register pair).
  10016 %if ARCH_X86_64
  10017    movd                r8d, m1
  10018    pshuflw              m1, m1, q3232
  10019    movd                r9d, m1
  10020    punpckhqdq           m1, m1
  10021    movd               r10d, m1
  10022    psrlq                m1, 32
  10023    movd               r11d, m1
  10024    movq                 m4, [srcq+r8]
  10025    movq                 m5, [srcq+r10]
  10026    movhps               m4, [srcq+r9]
  10027    movhps               m5, [srcq+r11]
  10028 %else
  10029    movd                r3d,  m1
  10030    pshufd               m1,  m1, q3312
  10031    movd                r1d,  m1
  10032    pshuflw              m1,  m1, q3232
  10033    movq                 m4, [srcq+r3]
  10034    movq                 m5, [srcq+r1]
  10035    movd                r3d,  m1
  10036    punpckhqdq           m1,  m1
  10037    movd                r1d,  m1
  10038    movhps               m4, [srcq+r3]
  10039    movhps               m5, [srcq+r1]
  10040 %endif
  10041 
  10042    ; if no emulation is required, we don't need to shuffle or emulate edges
  10043    ; this also saves 2 quasi-vpgatherdqs
  10044    pxor                 m6, m6
  10045    pcmpeqb              m6, m3
  10046 %if ARCH_X86_64
  10047    pmovmskb            r8d, m6
  10048    cmp                 r8d, 0xffff
  10049 %else
  10050    pmovmskb            r3d, m6
  10051    cmp                 r3d, 0xffff
  10052 %endif
  10053    je .filter
  10054 
        ; Edge emulation: build per-lane pshufb controls from resize_shuf indexed
        ; by the clamp delta, then replicate the boundary pixel into the taps
        ; that fell outside the image.
  10055 %if ARCH_X86_64
  10056    movd                r8d, m3
  10057    pshuflw              m3, m3, q3232
  10058    movd                r9d, m3
  10059    punpckhqdq           m3, m3
  10060    movd               r10d, m3
  10061    psrlq                m3, 32
  10062    movd               r11d, m3
  10063    movsxd               r8, r8d
  10064    movsxd               r9, r9d
  10065    movsxd              r10, r10d
  10066    movsxd              r11, r11d
  10067    movq                 m6, [base+resize_shuf+4+r8]
  10068    movq                 m7, [base+resize_shuf+4+r10]
  10069    movhps               m6, [base+resize_shuf+4+r9]
  10070    movhps               m7, [base+resize_shuf+4+r11]
  10071 %else
  10072    movd                r3d, m3
  10073    pshufd               m3, m3, q3312
  10074    movd                r1d, m3
  10075    pshuflw              m3, m3, q3232
  10076    movq                 m6, [base+resize_shuf+4+r3]
  10077    movq                 m7, [base+resize_shuf+4+r1]
  10078    movd                r3d, m3
  10079    punpckhqdq           m3, m3
  10080    movd                r1d, m3
  10081    movhps               m6, [base+resize_shuf+4+r3]
  10082    movhps               m7, [base+resize_shuf+4+r1]
  10083 %endif
  10084 
  10085    paddb                m6, m8
  10086    paddb                m7, m8
  10087    pshufb               m4, m6
  10088    pshufb               m5, m7
  10089 
  10090 .filter:
        ; Gather the four 8-tap filters (8 bytes each) selected by the masked
        ; phase indices in m2, then apply them to the gathered pixels.
  10091 %if ARCH_X86_64
  10092    movd                r8d, m2
  10093    pshuflw              m2, m2, q3232
  10094    movd                r9d, m2
  10095    punpckhqdq           m2, m2
  10096    movd               r10d, m2
  10097    psrlq                m2, 32
  10098    movd               r11d, m2
  10099    movq                 m6, [base+resize_filter+r8*8]
  10100    movq                 m7, [base+resize_filter+r10*8]
  10101    movhps               m6, [base+resize_filter+r9*8]
  10102    movhps               m7, [base+resize_filter+r11*8]
  10103 %else
  10104    movd                r3d, m2
  10105    pshufd               m2, m2, q3312
  10106    movd                r1d, m2
  10107    pshuflw              m2, m2, q3232
  10108    movq                 m6, [base+resize_filter+r3*8]
  10109    movq                 m7, [base+resize_filter+r1*8]
  10110    movd                r3d, m2
  10111    punpckhqdq           m2, m2
  10112    movd                r1d, m2
  10113    movhps               m6, [base+resize_filter+r3*8]
  10114    movhps               m7, [base+resize_filter+r1*8]
  10115 %endif
  10116 
  10117    pmaddubsw            m4, m6
  10118    pmaddubsw            m5, m7
  10119    phaddw               m4, m5
  10120    phaddsw              m4, m4
  10121    pmulhrsw             m4, m10                    ; x=(x+64)>>7
  10122    packuswb             m4, m4
  10123    movd          [dstq+xq], m4
  10124 
  10125    paddd                m0, m13
  10126    add                  xd, 4
  10127 %if STACK_ALIGNMENT >= 16
  10128    cmp                  xd, dst_wd
  10129 %else
  10130    cmp                  xd, dst_wm
  10131 %endif
  10132    jl .loop_x
  10133 
  10134    add                dstq, dst_stridemp
  10135    add                srcq, src_stridemp
  10136    dec                  hd
  10137    jg .loop_y
  10138    RET
  10139 
        ; Instantiate the warp_affine 8x8 kernel (WARP_AFFINE_8X8 macro, defined
        ; earlier in this file) once per supported ISA level; INIT_XMM sets the
        ; instruction set / name suffix used by the expanded cglobal.
  10140 INIT_XMM ssse3
  10141 WARP_AFFINE_8X8
  10142 
  10143 INIT_XMM sse4
  10144 WARP_AFFINE_8X8