tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mc16_avx512.asm (221233B)


      1 ; Copyright © 2020, VideoLAN and dav1d authors
      2 ; Copyright © 2020, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 SECTION_RODATA 64
     32 
        ; --------------------------------------------------------------------
        ; Byte-index control vectors (SECTION_RODATA 64) for the subpel,
        ; prep, w_mask and warp kernels.  Indices run up to 126, i.e. beyond
        ; one 64-byte register, so tables with values >= 64 are intended for
        ; two-source byte permutes (vpermi2b/vpermt2b); smaller ones fit a
        ; single-source vpermb/pshufb.  NOTE(review): the consuming kernels
        ; are outside this view - confirm each table against its user.
        ; --------------------------------------------------------------------
        ; spel_h_shufA/B/C/D: overlapping 16-bit pixel pairs for horizontal
        ; filtering; each 4-byte group is two adjacent words, advancing one
        ; pixel (2 bytes) per group.  A/B/C/D start at byte offsets
        ; 0/4/8/12 respectively (first index of each table).
     33 spel_h_shufA:  db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
     34                db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
     35 spel_h_shufC:  db  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
     36                db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
     37                db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
     38                db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
     39 spel_h_shufB:  db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
     40                db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
     41 spel_h_shufD:  db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
     42                db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
     43                db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
     44                db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
        ; spel_v_shuf8/16: interleave 16-bit elements of two row groups for
        ; the vertical pass (word n paired with word n+8 / n+16).
     45 spel_v_shuf8:  db  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23
     46                db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
     47                db  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
     48                db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
     49 spel_v_shuf16: db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
     50                db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
     51                db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
     52                db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
        ; prep_endA/B/C: take bytes 1-2 of every dword, i.e. narrow each
        ; 32-bit intermediate to 16 bits while dropping the low 8 bits
        ; (a fused >> 8).  A keeps dword order; B and C additionally
        ; interleave 16- resp. 32-byte blocks from the two sources.
     53 prep_endA:     db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
     54                db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
     55                db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
     56                db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
     57 prep_endB:     db  1,  2,  5,  6,  9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
     58                db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
     59                db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
     60                db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
     61 prep_endC:     db  1,  2,  5,  6,  9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
     62                db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
     63                db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
     64                db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
        ; spel_shuf*: combine/interleave filtered rows; the odd starting
        ; byte offsets (1,2 of each word pair) apply the same implicit
        ; >> 8 word extraction as prep_end* above.
     65 spel_shuf4a:   db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
     66                db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
     67                db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
     68                db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78
     69 spel_shuf4b:   db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78
     70                db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
     71                db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110
     72                db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
     73 spel_shuf8a:   db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
     74                db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
     75                db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
     76                db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
     77 spel_shuf8b:   db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
     78                db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
     79                db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
     80                db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
     81 spel_shuf16:   db  1,  2, 33, 34,  5,  6, 37, 38,  9, 10, 41, 42, 13, 14, 45, 46
     82                db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
     83                db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
     84                db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
     85 spel_shuf32:   db  1,  2, 65, 66,  5,  6, 69, 70,  9, 10, 73, 74, 13, 14, 77, 78
     86                db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
     87                db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
     88                db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
        ; w=2 specialisations of the h/combine shuffles above.
     89 spel_h_shuf2b: db  1,  2, 17, 18,  5,  6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
     90                db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50,  9, 10, 53, 54, 13, 14
     91                db  9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
     92 spel_shuf2:    db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
     93 spel_h_shuf2a: db  0,  1,  2,  3,  2,  3,  4,  5, 16, 17, 18, 19, 18, 19, 20, 21
     94                db  4,  5,  6,  7,  6,  7,  8,  9, 20, 21, 22, 23, 22, 23, 24, 25
        ; w_mask_end42x: byte 1 of every dword (bits 8..15 of each lane);
        ; w_mask_end444: every even byte (low byte of each word).
     95 w_mask_end42x: db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
     96                db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
     97 w_mask_end444: db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
     98                db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
     99                db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
    100                db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
        ; w_mask_shuf4/8/16: even indices only, so these move whole 16-bit
        ; words, interleaving the two halves at 8-/16-/32-byte stride.
    101 w_mask_shuf4:  db  0,  2,  8, 10,  4,  6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
    102                db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
    103                db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
    104                db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
    105 w_mask_shuf8:  db  0,  2, 16, 18,  4,  6, 20, 22,  8, 10, 24, 26, 12, 14, 28, 30
    106                db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
    107                db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
    108                db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
        ; warp8x8_permA/B: pair 4-byte groups from the two sources, sliding
        ; one pixel (2 bytes) per output dqword; B continues where A's last
        ; row left off (its first row repeats A's last row).
    109 w_mask_shuf16: db  0,  2, 32, 34,  4,  6, 36, 38,  8, 10, 40, 42, 12, 14, 44, 46
    110                db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
    111                db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
    112                db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
    113 warp8x8_permA: db  0,  1,  2,  3, 32, 33, 34, 35,  2,  3,  4,  5, 34, 35, 36, 37
    114                db  4,  5,  6,  7, 36, 37, 38, 39,  6,  7,  8,  9, 38, 39, 40, 41
    115                db  8,  9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
    116                db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
    117 warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
    118                db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
    119                db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
    120                db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
        ; warp8x8_end: word-granular (even indices) final packing of the
        ; warped 8x8 block.
    121 warp8x8_end:   db  0,  1,  4,  5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
    122                db  2,  3,  6,  7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
    123                db  8,  9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
    124                db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
        ; NOTE(review): overlapping-constant trick.  deint_q_shuf emits no
        ; data of its own (its logical value is the commented-out dq line);
        ; it aliases the dwords that follow, where the stray "dd 1"/"dd 3"
        ; interleaved with pw_2048/pw_8192 supply the odd qword indices.
        ; Only the low bits of each qword element should matter to the
        ; consumer (a qword permute) - confirm against its user, which is
        ; outside this view.  Do not reorder anything in this group.
    125 deint_q_shuf: ;dq  0,  2,  4,  6,  1,  3,  5,  7
    126 pd_0to7:       dd  0,  1,  2,  3,  4,  5,  6,  7
    127                dd  1
    128 pw_2048:       times 2 dw 2048
    129                dd  3
    130 pw_8192:       times 2 dw 8192
    131 avg_shift:     dw  5,  5,  3,  3
    132 pw_27615:      times 2 dw 27615
    133 pw_32766:      times 2 dw 32766
        ; warp8x8_permC/D: -1 (all-ones) in the high byte of each word zeroes
        ; that byte under pshufb-style semantics, zero-extending the picked
        ; bytes to 16 bits.  resize_shufA/B use the same -1 convention.
    134 warp8x8_permC: db -1,  0, -1,  1, -1,  8, -1,  9, -1,  4, -1,  5, -1, 12, -1, 13
    135 warp8x8_permD: db -1,  2, -1,  3, -1, 10, -1, 11, -1,  6, -1,  7, -1, 14, -1, 15
    136 warp_shift_h:  db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
    137 blend_shuf:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
        ; resize_perm*/shuf*: dword/qword gather patterns for the resize
        ; kernel; rescale_mul is a 0..15 dword ramp used as a multiplier.
    138 resize_permA:  dd  0,  4,  8, 12,  1,  5,  9, 13, 16, 20, 24, 28, 17, 21, 25, 29
    139 resize_permB:  dd  2,  6, 10, 14,  3,  7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
    140 resize_permC:  dq  0,  1,  4,  5,  8,  9, 12, 13
    141 resize_permD:  dq  2,  3,  6,  7, 10, 11, 14, 15
    142 resize_permE:  dq  0,  2,  4,  6
    143 resize_shufA:  db -1,  0, -1,  1, -1,  4, -1,  5, -1,  8, -1,  9, -1, 12, -1, 13
    144 resize_shufB:  db -1,  2, -1,  3, -1,  6, -1,  7, -1, 10, -1, 11, -1, 14, -1, 15
    145 rescale_mul:   dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        ; resize_shuf: edge-clamped word copy - repeats word 0 on the left
        ; and word 7 on the right of an 8-pixel row.
    146 resize_shuf:   db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
    147                db  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
    148 
        ; Rounding/shift constants.  Two-element tables are indexed per
        ; bitdepth: the .h path computes r6 = bitdepth_max >> 11, giving 0
        ; for 10-bit (max 1023) and 1 for 12-bit (max 4095), then loads
        ; [table + r6*4] - so dd pairs and 4-entry dw tables are laid out
        ; as {10-bit value(s), 12-bit value(s)}.
    149 prep_hv_shift:    dq  6,  4
    150 put_bilin_h_rnd:  dw  8,  8, 10, 10
    151 prep_mul:         dw 16, 16,  4,  4
    152 put_8tap_h_rnd:   dd 34, 40
    153 prep_8tap_rnd:    dd 128 - (8192 << 8)
    154 warp_8x8_rnd_h:   dd 512, 2048
    155 warp_8x8_rnd_v:   dd 262144, 65536
    156 warp_8x8t_rnd_v:  dd 16384 - (8192 << 15)
    157 avg_round:        dw -16400, -16400, -16388, -16388
    158 w_avg_round:      dd 128 + (8192 << 4),  32 + (8192 << 4)
    159 mask_round:       dd 512 + (8192 << 6), 128 + (8192 << 6)
    160 w_mask_round:     dd 128, 64
    161 bidir_shift:      dw  6,  6,  4,  4
    162 
        ; Small broadcast constants (pb_/pw_/pd_ = packed byte/word/dword).
    163 pb_64:    times 4 db 64
    164 pw_m512:  times 2 dw -512
    165 pw_2:     times 2 dw 2
    166 pw_64:    times 2 dw 64
    167 pd_32:    dd 32
    168 pd_63:    dd 63
    169 pd_128:   dd 128
    170 pd_640:   dd 640
    171 pd_2176:  dd 2176
    172 pd_16384: dd 16384
    173 pd_0_4:   dd 0, 4
    174 
        ; Rodata reuse: pw_16 reads the leading words of prep_mul (16,16)
        ; and pd_512 the first dword of warp_8x8_rnd_h (512), so no extra
        ; storage is needed for these two constants.
    175 %define pw_16 prep_mul
    176 %define pd_512 warp_8x8_rnd_h
    177 
        ; BASE_JMP_TABLE name, isa, w0, w1, ...
        ; Emits %1_%2_table: one 16-bit offset (dw) per width argument,
        ; each being the distance of the %1_%2-prefixed .._w<width> label
        ; from the %1_%2 base label.  The table symbol is biased by the
        ; first width (-%3) so that, with widths doubling from %3, the
        ; entry for width w sits at table + tzcnt(w)*2 (see the
        ; "word [r7+t0*2+table_offset(...)]" loads in the dispatch code).
    178 %macro BASE_JMP_TABLE 3-*
    179    %xdefine %1_%2_table (%%table - %3)
    180    %xdefine %%base %1_%2
    181    %%table:
    182    %rep %0 - 2
    183        dw %%base %+ _w%3 - %%base
    184        %rotate 1
    185    %endrep
    186 %endmacro
    187 
        ; HV_JMP_TABLE fn, type, isa, types, w0, w1, ...
        ; Emits up to three width-indexed dw offset tables for the mangled
        ; 16bpc function %1_%2 (e.g. put_bilin): %4 is a bitmask selecting
        ; which tables to generate - bit 0 = .h_w* table, bit 1 = .v_w*
        ; table, bit 2 = .hv_w* table.  Offsets are relative to the %1_%3
        ; base label, and each table symbol is biased by -%5 (the first
        ; width) for tzcnt-based indexing, as in BASE_JMP_TABLE.
        ; Each %rep loop rotates the width args %0-4 times; the trailing
        ; "%rotate 4" completes the full cycle so the next table sees the
        ; widths in their original order.
    188 %macro HV_JMP_TABLE 5-*
    189    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    190    %xdefine %%base %1_%3
    191    %assign %%types %4
    192    %if %%types & 1
    193        %xdefine %1_%2_h_%3_table  (%%h  - %5)
    194        %%h:
    195        %rep %0 - 4
    196            dw %%prefix %+ .h_w%5 - %%base
    197            %rotate 1
    198        %endrep
    199        %rotate 4
    200    %endif
    201    %if %%types & 2
    202        %xdefine %1_%2_v_%3_table  (%%v  - %5)
    203        %%v:
    204        %rep %0 - 4
    205            dw %%prefix %+ .v_w%5 - %%base
    206            %rotate 1
    207        %endrep
    208        %rotate 4
    209    %endif
    210    %if %%types & 4
    211        %xdefine %1_%2_hv_%3_table (%%hv - %5)
    212        %%hv:
    213        %rep %0 - 4
    214            dw %%prefix %+ .hv_w%5 - %%base
    215            %rotate 1
    216        %endrep
    217    %endif
    218 %endmacro
    219 
        ; BIDIR_JMP_TABLE fn, isa, w0, w1, ...
        ; Emits %1_%2_table: one 32-bit offset (dd) per width for the
        ; mangled 16bpc function's .w<width> entry points, relative to the
        ; table symbol itself (%%base expands to %1_%2_table).  The symbol
        ; is biased by -2*%3: with the first width %3 = 4 (tzcnt = 2), the
        ; first dd entry then lands at table + tzcnt(w)*4, matching
        ; tzcnt-indexed dword loads in the bidir dispatch code.
    220 %macro BIDIR_JMP_TABLE 2-*
    221    %xdefine %1_%2_table (%%table - 2*%3)
    222    %xdefine %%base %1_%2_table
    223    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    224    %%table:
    225    %rep %0 - 2
    226        dd %%prefix %+ .w%3 - %%base
    227        %rotate 1
    228    %endrep
    229 %endmacro
    230 
        ; Base labels used by the jump tables: the .put/.prep local labels
        ; inside the mangled bilin entry points serve as the common base
        ; address all dw/dd table offsets are taken from.
    231 %xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
    232 %xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)
    233 
        ; Jump-table instantiations.  For HV_JMP_TABLE the 4th argument is
        ; the type bitmask (7 = h+v+hv for bilin, 2 = v-only tables for the
        ; 6tap/8tap variants); the remaining arguments are the supported
        ; block widths, doubling from the first.
    234 BIDIR_JMP_TABLE avg,        avx512icl,       4, 8, 16, 32, 64, 128
    235 BIDIR_JMP_TABLE w_avg,      avx512icl,       4, 8, 16, 32, 64, 128
    236 BIDIR_JMP_TABLE mask,       avx512icl,       4, 8, 16, 32, 64, 128
    237 BIDIR_JMP_TABLE w_mask_420, avx512icl,       4, 8, 16, 32, 64, 128
    238 BIDIR_JMP_TABLE w_mask_422, avx512icl,       4, 8, 16, 32, 64, 128
    239 BIDIR_JMP_TABLE w_mask_444, avx512icl,       4, 8, 16, 32, 64, 128
    240 BIDIR_JMP_TABLE blend,      avx512icl,       4, 8, 16, 32
    241 BIDIR_JMP_TABLE blend_v,    avx512icl,    2, 4, 8, 16, 32
    242 BIDIR_JMP_TABLE blend_h,    avx512icl,    2, 4, 8, 16, 32, 64, 128
    243 BASE_JMP_TABLE put,         avx512icl,    2, 4, 8, 16, 32, 64, 128
    244 BASE_JMP_TABLE prep,        avx512icl,       4, 8, 16, 32, 64, 128
    245 HV_JMP_TABLE   put,  bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
    246 HV_JMP_TABLE   prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
    247 HV_JMP_TABLE   put,  6tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
    248 HV_JMP_TABLE   put,  8tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
    249 HV_JMP_TABLE   prep, 6tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
    250 HV_JMP_TABLE   prep, 8tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
    251 
        ; table_offset(type, fn): link-time constant distance between a
        ; generated jump table and its type's base label, so the dispatch
        ; code can index tables off the function base register (r7).
    252 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
    253 
    254 cextern mc_subpel_filters
        ; -8 bias: lets the subpel code index the filter table with
        ; 1-based mx/my values (8 bytes per filter entry).
        ; NOTE(review): stride assumption - confirm against the 8tap code.
    255 %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
    256 
    257 cextern mc_warp_filter
    258 cextern obmc_masks_avx2
    259 cextern resize_filter
    260 
    261 SECTION .text
    262 
        ; Scratch register t0: on Win64 the 5th+ args live on the stack so
        ; r4 is free early; on SysV r8 is the first unused arg register.
    263 %if WIN64
    264 DECLARE_REG_TMP 4
    265 %else
    266 DECLARE_REG_TMP 8
    267 %endif
    268 
    269 INIT_ZMM avx512icl
    270 cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
    271    mov                mxyd, r6m ; mx
    272    lea                  r7, [put_avx512icl]
    273    tzcnt               t0d, wm
    274    movifnidn            hd, hm
    275    test               mxyd, mxyd
    276    jnz .h
    277    mov                mxyd, r7m ; my
    278    test               mxyd, mxyd
    279    jnz .v
    280 .put:
    281    movzx               t0d, word [r7+t0*2+table_offset(put,)]
    282    add                  t0, r7
    283    jmp                  t0
    284 .put_w2:
    285    mov                 r6d, [srcq+ssq*0]
    286    mov                 r7d, [srcq+ssq*1]
    287    lea                srcq, [srcq+ssq*2]
    288    mov        [dstq+dsq*0], r6d
    289    mov        [dstq+dsq*1], r7d
    290    lea                dstq, [dstq+dsq*2]
    291    sub                  hd, 2
    292    jg .put_w2
    293    RET
    294 .put_w4:
    295    mov                  r6, [srcq+ssq*0]
    296    mov                  r7, [srcq+ssq*1]
    297    lea                srcq, [srcq+ssq*2]
    298    mov        [dstq+dsq*0], r6
    299    mov        [dstq+dsq*1], r7
    300    lea                dstq, [dstq+dsq*2]
    301    sub                  hd, 2
    302    jg .put_w4
    303    RET
    304 .put_w8:
    305    movu               xmm0, [srcq+ssq*0]
    306    movu               xmm1, [srcq+ssq*1]
    307    lea                srcq, [srcq+ssq*2]
    308    mova       [dstq+dsq*0], xmm0
    309    mova       [dstq+dsq*1], xmm1
    310    lea                dstq, [dstq+dsq*2]
    311    sub                  hd, 2
    312    jg .put_w8
    313    RET
    314 .put_w16:
    315    movu                ym0, [srcq+ssq*0]
    316    movu                ym1, [srcq+ssq*1]
    317    lea                srcq, [srcq+ssq*2]
    318    mova       [dstq+dsq*0], ym0
    319    mova       [dstq+dsq*1], ym1
    320    lea                dstq, [dstq+dsq*2]
    321    sub                  hd, 2
    322    jg .put_w16
    323    RET
    324 .put_w32:
    325    movu                 m0, [srcq+ssq*0]
    326    movu                 m1, [srcq+ssq*1]
    327    lea                srcq, [srcq+ssq*2]
    328    mova       [dstq+dsq*0], m0
    329    mova       [dstq+dsq*1], m1
    330    lea                dstq, [dstq+dsq*2]
    331    sub                  hd, 2
    332    jg .put_w32
    333    RET
    334 .put_w64:
    335    movu                 m0, [srcq+ssq*0+64*0]
    336    movu                 m1, [srcq+ssq*0+64*1]
    337    movu                 m2, [srcq+ssq*1+64*0]
    338    movu                 m3, [srcq+ssq*1+64*1]
    339    lea                srcq, [srcq+ssq*2]
    340    mova  [dstq+dsq*0+64*0], m0
    341    mova  [dstq+dsq*0+64*1], m1
    342    mova  [dstq+dsq*1+64*0], m2
    343    mova  [dstq+dsq*1+64*1], m3
    344    lea                dstq, [dstq+dsq*2]
    345    sub                  hd, 2
    346    jg .put_w64
    347    RET
    348 .put_w128:
    349    movu                 m0, [srcq+64*0]
    350    movu                 m1, [srcq+64*1]
    351    movu                 m2, [srcq+64*2]
    352    movu                 m3, [srcq+64*3]
    353    add                srcq, ssq
    354    mova        [dstq+64*0], m0
    355    mova        [dstq+64*1], m1
    356    mova        [dstq+64*2], m2
    357    mova        [dstq+64*3], m3
    358    add                dstq, dsq
    359    dec                  hd
    360    jg .put_w128
    361    RET
    362 .h:
    363    vpbroadcastw         m5, mxyd
    364    mov                mxyd, r7m ; my
    365    vpbroadcastd         m4, [pw_16]
    366    psubw                m4, m5
    367    test               mxyd, mxyd
    368    jnz .hv
    369    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    370    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
    371    mov                 r6d, r8m ; bitdepth_max
    372    add                  t0, r7
    373    shr                 r6d, 11
    374    vpbroadcastd         m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
    375    jmp                  t0
    376 .h_w2:
    377    movq               xmm1, [srcq+ssq*0]
    378    movhps             xmm1, [srcq+ssq*1]
    379    lea                srcq, [srcq+ssq*2]
    380    pmullw             xmm0, xmm1, xm4
    381    psrlq              xmm1, 16
    382    pmullw             xmm1, xm5
    383    paddw              xmm0, xm6
    384    paddw              xmm0, xmm1
    385    psrlw              xmm0, 4
    386    movd       [dstq+dsq*0], xmm0
    387    pextrd     [dstq+dsq*1], xmm0, 2
    388    lea                dstq, [dstq+dsq*2]
    389    sub                  hd, 2
    390    jg .h_w2
    391    RET
    392 .h_w4:
    393    movq               xmm0, [srcq+ssq*0+0]
    394    movhps             xmm0, [srcq+ssq*1+0]
    395    movq               xmm1, [srcq+ssq*0+2]
    396    movhps             xmm1, [srcq+ssq*1+2]
    397    lea                srcq, [srcq+ssq*2]
    398    pmullw             xmm0, xm4
    399    pmullw             xmm1, xm5
    400    paddw              xmm0, xm6
    401    paddw              xmm0, xmm1
    402    psrlw              xmm0, 4
    403    movq       [dstq+dsq*0], xmm0
    404    movhps     [dstq+dsq*1], xmm0
    405    lea                dstq, [dstq+dsq*2]
    406    sub                  hd, 2
    407    jg .h_w4
    408    RET
    409 .h_w8:
    410    movu                xm0, [srcq+ssq*0+0]
    411    vinserti32x4        ym0, [srcq+ssq*1+0], 1
    412    movu                xm1, [srcq+ssq*0+2]
    413    vinserti32x4        ym1, [srcq+ssq*1+2], 1
    414    lea                srcq, [srcq+ssq*2]
    415    pmullw              ym0, ym4
    416    pmullw              ym1, ym5
    417    paddw               ym0, ym6
    418    paddw               ym0, ym1
    419    psrlw               ym0, 4
    420    mova          [dstq+dsq*0], xm0
    421    vextracti32x4 [dstq+dsq*1], ym0, 1
    422    lea                dstq, [dstq+dsq*2]
    423    sub                  hd, 2
    424    jg .h_w8
    425    RET
    426 .h_w16:
    427    movu                ym0, [srcq+ssq*0+0]
    428    vinserti32x8         m0, [srcq+ssq*1+0], 1
    429    movu                ym1, [srcq+ssq*0+2]
    430    vinserti32x8         m1, [srcq+ssq*1+2], 1
    431    lea                srcq, [srcq+ssq*2]
    432    pmullw               m0, m4
    433    pmullw               m1, m5
    434    paddw                m0, m6
    435    paddw                m0, m1
    436    psrlw                m0, 4
    437    mova          [dstq+dsq*0], ym0
    438    vextracti32x8 [dstq+dsq*1], m0, 1
    439    lea                dstq, [dstq+dsq*2]
    440    sub                  hd, 2
    441    jg .h_w16
    442    RET
    443 .h_w32:
    444    pmullw               m0, m4, [srcq+ssq*0+0]
    445    pmullw               m2, m5, [srcq+ssq*0+2]
    446    pmullw               m1, m4, [srcq+ssq*1+0]
    447    pmullw               m3, m5, [srcq+ssq*1+2]
    448    lea                srcq, [srcq+ssq*2]
    449    paddw                m0, m6
    450    paddw                m1, m6
    451    paddw                m0, m2
    452    paddw                m1, m3
    453    psrlw                m0, 4
    454    psrlw                m1, 4
    455    mova       [dstq+dsq*0], m0
    456    mova       [dstq+dsq*1], m1
    457    lea                dstq, [dstq+dsq*2]
    458    sub                  hd, 2
    459    jg .h_w32
    460    RET
    461 .h_w64:
    462    pmullw               m0, m4, [srcq+64*0+0]
    463    pmullw               m2, m5, [srcq+64*0+2]
    464    pmullw               m1, m4, [srcq+64*1+0]
    465    pmullw               m3, m5, [srcq+64*1+2]
    466    add                srcq, ssq
    467    paddw                m0, m6
    468    paddw                m1, m6
    469    paddw                m0, m2
    470    paddw                m1, m3
    471    psrlw                m0, 4
    472    psrlw                m1, 4
    473    mova        [dstq+64*0], m0
    474    mova        [dstq+64*1], m1
    475    add                dstq, dsq
    476    dec                  hd
    477    jg .h_w64
    478    RET
    479 .h_w128:
    480    pmullw               m0, m4, [srcq+64*0+0]
    481    pmullw               m7, m5, [srcq+64*0+2]
    482    pmullw               m1, m4, [srcq+64*1+0]
    483    pmullw               m8, m5, [srcq+64*1+2]
    484    pmullw               m2, m4, [srcq+64*2+0]
    485    pmullw               m9, m5, [srcq+64*2+2]
    486    pmullw               m3, m4, [srcq+64*3+0]
    487    pmullw              m10, m5, [srcq+64*3+2]
    488    add                srcq, ssq
    489    REPX      {paddw x, m6}, m0, m1, m2, m3
    490    paddw                m0, m7
    491    paddw                m1, m8
    492    paddw                m2, m9
    493    paddw                m3, m10
    494    REPX       {psrlw x, 4}, m0, m1, m2, m3
    495    mova        [dstq+64*0], m0
    496    mova        [dstq+64*1], m1
    497    mova        [dstq+64*2], m2
    498    mova        [dstq+64*3], m3
    499    add                dstq, dsq
    500    dec                  hd
    501    jg .h_w128
    502    RET
    503 .v:
    504    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
    505    shl                mxyd, 11
    506    vpbroadcastw         m8, mxyd
    507    add                  t0, r7
    508    jmp                  t0
    509 .v_w2:
    510    movd               xmm0, [srcq+ssq*0]
    511 .v_w2_loop:
    512    movd               xmm1, [srcq+ssq*1]
    513    lea                srcq, [srcq+ssq*2]
    514    punpckldq          xmm2, xmm0, xmm1
    515    movd               xmm0, [srcq+ssq*0]
    516    punpckldq          xmm1, xmm0
    517    psubw              xmm1, xmm2
    518    pmulhrsw           xmm1, xm8
    519    paddw              xmm1, xmm2
    520    movd       [dstq+dsq*0], xmm1
    521    pextrd     [dstq+dsq*1], xmm1, 1
    522    lea                dstq, [dstq+dsq*2]
    523    sub                  hd, 2
    524    jg .v_w2_loop
    525    RET
    526 .v_w4:
    527    movq               xmm0, [srcq+ssq*0]
    528 .v_w4_loop:
    529    movq               xmm1, [srcq+ssq*1]
    530    lea                srcq, [srcq+ssq*2]
    531    punpcklqdq         xmm2, xmm0, xmm1
    532    movq               xmm0, [srcq+ssq*0]
    533    punpcklqdq         xmm1, xmm0
    534    psubw              xmm1, xmm2
    535    pmulhrsw           xmm1, xm8
    536    paddw              xmm1, xmm2
    537    movq       [dstq+dsq*0], xmm1
    538    movhps     [dstq+dsq*1], xmm1
    539    lea                dstq, [dstq+dsq*2]
    540    sub                  hd, 2
    541    jg .v_w4_loop
    542    RET
    543 .v_w8:
    544    movu               xmm0, [srcq+ssq*0]
    545 .v_w8_loop:
    546    vbroadcasti128     ymm1, [srcq+ssq*1]
    547    lea                srcq, [srcq+ssq*2]
    548    vpblendd           ymm2, ymm0, ymm1, 0xf0
    549    vbroadcasti128     ymm0, [srcq+ssq*0]
    550    vpblendd           ymm1, ymm0, 0xf0
    551    psubw              ymm1, ymm2
    552    pmulhrsw           ymm1, ym8
    553    paddw              ymm1, ymm2
    554    mova         [dstq+dsq*0], xmm1
    555    vextracti128 [dstq+dsq*1], ymm1, 1
    556    lea                dstq, [dstq+dsq*2]
    557    sub                  hd, 2
    558    jg .v_w8_loop
    559    vzeroupper
    560    RET
    561 .v_w16:
    562    movu                ym0, [srcq+ssq*0]
    563 .v_w16_loop:
    564    movu                ym3, [srcq+ssq*1]
    565    lea                srcq, [srcq+ssq*2]
    566    psubw               ym1, ym3, ym0
    567    pmulhrsw            ym1, ym8
    568    paddw               ym1, ym0
    569    movu                ym0, [srcq+ssq*0]
    570    psubw               ym2, ym0, ym3
    571    pmulhrsw            ym2, ym8
    572    paddw               ym2, ym3
    573    mova       [dstq+dsq*0], ym1
    574    mova       [dstq+dsq*1], ym2
    575    lea                dstq, [dstq+dsq*2]
    576    sub                  hd, 2
    577    jg .v_w16_loop
    578    RET
    579 .v_w32:
    580    movu                 m0, [srcq+ssq*0]
    581 .v_w32_loop:
    582    movu                 m3, [srcq+ssq*1]
    583    lea                srcq, [srcq+ssq*2]
    584    psubw                m1, m3, m0
    585    pmulhrsw             m1, m8
    586    paddw                m1, m0
    587    movu                 m0, [srcq+ssq*0]
    588    psubw                m2, m0, m3
    589    pmulhrsw             m2, m8
    590    paddw                m2, m3
    591    mova       [dstq+dsq*0], m1
    592    mova       [dstq+dsq*1], m2
    593    lea                dstq, [dstq+dsq*2]
    594    sub                  hd, 2
    595    jg .v_w32_loop
    596    RET
    597 .v_w64:
    598    movu                 m0, [srcq+ssq*0+64*0]
    599    movu                 m1, [srcq+ssq*0+64*1]
    600 .v_w64_loop:
    601    movu                 m2, [srcq+ssq*1+64*0]
    602    movu                 m3, [srcq+ssq*1+64*1]
    603    lea                srcq, [srcq+ssq*2]
    604    psubw                m4, m2, m0
    605    pmulhrsw             m4, m8
    606    paddw                m4, m0
    607    movu                 m0, [srcq+ssq*0+64*0]
    608    psubw                m5, m3, m1
    609    pmulhrsw             m5, m8
    610    paddw                m5, m1
    611    movu                 m1, [srcq+ssq*0+64*1]
    612    psubw                m6, m0, m2
    613    pmulhrsw             m6, m8
    614    psubw                m7, m1, m3
    615    pmulhrsw             m7, m8
    616    mova  [dstq+dsq*0+64*0], m4
    617    mova  [dstq+dsq*0+64*1], m5
    618    paddw                m6, m2
    619    paddw                m7, m3
    620    mova  [dstq+dsq*1+64*0], m6
    621    mova  [dstq+dsq*1+64*1], m7
    622    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w64_loop
    RET
; Vertical-only bilinear, w == 128: four 512-bit vectors per row.
; m0-m3 hold the previous row, m4-m7 the next row; each output is
; prev + pmulhrsw(next - prev, m8), where m8 is the broadcast vertical
; weight (set up in the .v entry code above this chunk -- not visible here).
; Loads for the following iteration are interleaved with the blend to hide
; memory latency; two output rows are produced per loop iteration.
.v_w128:
    movu                 m0, [srcq+ssq*0+64*0]
    movu                 m1, [srcq+ssq*0+64*1]
    movu                 m2, [srcq+ssq*0+64*2]
    movu                 m3, [srcq+ssq*0+64*3]
.v_w128_loop:
    movu                 m4, [srcq+ssq*1+64*0]
    movu                 m5, [srcq+ssq*1+64*1]
    movu                 m6, [srcq+ssq*1+64*2]
    movu                 m7, [srcq+ssq*1+64*3]
    lea                srcq, [srcq+ssq*2]
    psubw                m9, m4, m0
    pmulhrsw             m9, m8
    paddw                m9, m0
    movu                 m0, [srcq+ssq*0+64*0]   ; reload for next pair while m9 retires
    psubw               m10, m5, m1
    pmulhrsw            m10, m8
    paddw               m10, m1
    movu                 m1, [srcq+ssq*0+64*1]
    psubw               m11, m6, m2
    pmulhrsw            m11, m8
    paddw               m11, m2
    movu                 m2, [srcq+ssq*0+64*2]
    psubw               m12, m7, m3
    pmulhrsw            m12, m8
    paddw               m12, m3
    movu                 m3, [srcq+ssq*0+64*3]
    mova  [dstq+dsq*0+64*0], m9
    psubw                m9, m0, m4              ; second output row: blend rows 1 and 2
    pmulhrsw             m9, m8
    mova  [dstq+dsq*0+64*1], m10
    psubw               m10, m1, m5
    pmulhrsw            m10, m8
    mova  [dstq+dsq*0+64*2], m11
    psubw               m11, m2, m6
    pmulhrsw            m11, m8
    mova  [dstq+dsq*0+64*3], m12
    psubw               m12, m3, m7
    pmulhrsw            m12, m8
    paddw                m9, m4
    paddw               m10, m5
    mova  [dstq+dsq*1+64*0], m9
    mova  [dstq+dsq*1+64*1], m10
    paddw               m11, m6
    paddw               m12, m7
    mova  [dstq+dsq*1+64*2], m11
    mova  [dstq+dsq*1+64*3], m12
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w128_loop
    RET
; Combined horizontal+vertical bilinear. Setup:
;   m6 = pw_2      (rounding bias for the >>2 after the horizontal pass)
;   m7 = mxy << 11 (broadcast vertical fraction, consumed via pmulhw)
;   m8 = pw_8192   (12bpc) or pw_2048 (10bpc) final pmulhrsw scale
; m4/m5 are the horizontal weights, prepared before this chunk; for 10bpc
; they are doubled (psllw 2) so all paths share the same shift counts.
.hv:
    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
    shl                mxyd, 11
    vpbroadcastd         m6, [pw_2]
    vpbroadcastw         m7, mxyd
    vpbroadcastd         m8, [pw_8192]
    add                  t0, r7
    test          dword r8m, 0x800   ; bit 11 of bitdepth_max set -> 12bpc
    jnz .hv_12bpc
    psllw                m4, 2
    psllw                m5, 2
    vpbroadcastd         m8, [pw_2048]
.hv_12bpc:
    jmp                  t0
; w == 2: keep two pixels per row in an xmm lane; the psrlq 16 yields the
; "+1 pixel" offset needed for the horizontal tap pair.
.hv_w2:
    vpbroadcastq       xmm1, [srcq+ssq*0]
    pmullw             xmm0, xmm1, xm4
    psrlq              xmm1, 16
    pmullw             xmm1, xm5
    paddw              xmm0, xm6
    paddw              xmm0, xmm1
    psrlw              xmm0, 2
.hv_w2_loop:
    movq               xmm2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps             xmm2, [srcq+ssq*0]
    pmullw             xmm1, xmm2, xm4
    psrlq              xmm2, 16
    pmullw             xmm2, xm5
    paddw              xmm1, xm6
    paddw              xmm1, xmm2
    psrlw              xmm1, 2                ; 1 _ 2 _
    shufpd             xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
    mova               xmm0, xmm1
    ; vertical blend: prev + 2*(cur - prev)*pmulhw(m7), then final rounding
    psubw              xmm1, xmm2
    paddw              xmm1, xmm1
    pmulhw             xmm1, xm7
    paddw              xmm1, xmm2
    pmulhrsw           xmm1, xm8
    movd       [dstq+dsq*0], xmm1
    pextrd     [dstq+dsq*1], xmm1, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
; w == 4: row pairs packed low/high in an xmm; the -8/-6 prologue offsets
; position the first row in the high half (matching the shufpd below).
.hv_w4:
    pmullw             xmm0, xm4, [srcq+ssq*0-8]
    pmullw             xmm1, xm5, [srcq+ssq*0-6]
    paddw              xmm0, xm6
    paddw              xmm0, xmm1
    psrlw              xmm0, 2
.hv_w4_loop:
    movq               xmm1, [srcq+ssq*1+0]
    movq               xmm2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    movhps             xmm1, [srcq+ssq*0+0]
    movhps             xmm2, [srcq+ssq*0+2]
    pmullw             xmm1, xm4
    pmullw             xmm2, xm5
    paddw              xmm1, xm6
    paddw              xmm1, xmm2
    psrlw              xmm1, 2                ; 1 2
    shufpd             xmm2, xmm0, xmm1, 0x01 ; 0 1
    mova               xmm0, xmm1
    psubw              xmm1, xmm2
    paddw              xmm1, xmm1
    pmulhw             xmm1, xm7
    paddw              xmm1, xmm2
    pmulhrsw           xmm1, xm8
    movq       [dstq+dsq*0], xmm1
    movhps     [dstq+dsq*1], xmm1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
; w == 8: one row per 128-bit lane of a ymm; lanes shifted with vshufi32x4.
.hv_w8:
    pmullw             xmm0, xm4, [srcq+ssq*0+0]
    pmullw             xmm1, xm5, [srcq+ssq*0+2]
    paddw              xmm0, xm6
    paddw              xmm0, xmm1
    psrlw              xmm0, 2
    vinserti32x4        ym0, xmm0, 1
.hv_w8_loop:
    movu                xm1, [srcq+ssq*1+0]
    movu                xm2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym1, [srcq+ssq*0+0], 1
    vinserti32x4        ym2, [srcq+ssq*0+2], 1
    pmullw              ym1, ym4
    pmullw              ym2, ym5
    paddw               ym1, ym6
    paddw               ym1, ym2
    psrlw               ym1, 2              ; 1 2
    vshufi32x4          ym2, ym0, ym1, 0x01 ; 0 1
    mova                ym0, ym1
    psubw               ym1, ym2
    paddw               ym1, ym1
    pmulhw              ym1, ym7
    paddw               ym1, ym2
    pmulhrsw            ym1, ym8
    mova          [dstq+dsq*0], xm1
    vextracti32x4 [dstq+dsq*1], ym1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    RET
; w == 16: one row per 256-bit half of a zmm.
.hv_w16:
    pmullw              ym0, ym4, [srcq+ssq*0+0]
    pmullw              ym1, ym5, [srcq+ssq*0+2]
    paddw               ym0, ym6
    paddw               ym0, ym1
    psrlw               ym0, 2
    vinserti32x8         m0, ym0, 1
.hv_w16_loop:
    movu                ym1, [srcq+ssq*1+0]
    movu                ym2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m1, [srcq+ssq*0+0], 1
    vinserti32x8         m2, [srcq+ssq*0+2], 1
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m6
    paddw                m1, m2
    psrlw                m1, 2             ; 1 2
    vshufi32x4           m2, m0, m1, q1032 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    paddw                m1, m1
    pmulhw               m1, m7
    paddw                m1, m2
    pmulhrsw             m1, m8
    mova          [dstq+dsq*0], ym1
    vextracti32x8 [dstq+dsq*1], m1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    RET
; w >= 32: process the image in 64-byte-wide (32-pixel) column strips.
; r6d packs the strip counter in the high bits and h in the low byte
; (lea r6d, [hq+wq*8-256]); after each strip, h is restored from r6b and
; src/dst rewind to the saved r4/r7 base advanced by 64 bytes.
.hv_w32:
.hv_w64:
.hv_w128:
    movifnidn            wd, wm
    lea                 r6d, [hq+wq*8-256]
    mov                  r4, srcq
    mov                  r7, dstq
.hv_w32_loop0:
    pmullw               m0, m4, [srcq+ssq*0+0]
    pmullw               m1, m5, [srcq+ssq*0+2]
    paddw                m0, m6
    paddw                m0, m1
    psrlw                m0, 2
.hv_w32_loop:
    pmullw               m3, m4, [srcq+ssq*1+0]
    pmullw               m1, m5, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    paddw                m3, m6
    paddw                m3, m1
    psrlw                m3, 2
    psubw                m1, m3, m0
    paddw                m1, m1
    pmulhw               m1, m7
    paddw                m1, m0
    pmullw               m0, m4, [srcq+ssq*0+0]
    pmullw               m2, m5, [srcq+ssq*0+2]
    paddw                m0, m6
    paddw                m0, m2
    psrlw                m0, 2
    psubw                m2, m0, m3
    paddw                m2, m2
    pmulhw               m2, m7
    paddw                m2, m3
    pmulhrsw             m1, m8
    pmulhrsw             m2, m8
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w32_loop
    add                  r4, 64
    add                  r7, 64
    movzx                hd, r6b            ; restore row count for next strip
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8           ; one fewer 32-pixel strip remaining
    jg .hv_w32_loop0
    RET
    862 
; void prep_bilin_16bpc(int16_t *tmp, const pixel *src, ptrdiff_t stride,
;                       int w, int h, int mxy(r5m=mx, r6m=my),
;                       [r7m = bitdepth_max])
; Writes intermediate (pre-averaging) predictions to the packed tmp buffer.
; Dispatch: mx == 0 && my == 0 -> plain copy (.prep); mx only -> .h;
; my only -> .v; both -> .hv. Per-width entry points are fetched from
; jump tables relative to prep_avx512icl.
; Weight registers in the filtered paths: m5/m9 = fractional weight,
; m4/m8 = 16 - weight; both are doubled (psllw 2) for non-12bpc input so
; every path can use fixed shift counts (bit 11 of bitdepth_max
; distinguishes 4095 from 1023).
cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
    movifnidn          mxyd, r5m ; mx
    lea                  r6, [prep_avx512icl]
    tzcnt                wd, wm                   ; log2(w) indexes the jump tables
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r6m ; my
    test               mxyd, mxyd
    jnz .v
; Copy-only path: tmp = src * prep_mul[bitdepth] - 8192 (bitdepth-dependent
; scale selected by bitdepth_max >> 11).
.prep:
    movzx                wd, word [r6+wq*2+table_offset(prep,)]
    mov                 r5d, r7m ; bitdepth_max
    vpbroadcastd         m5, [r6-prep_avx512icl+pw_8192]
    add                  wq, r6
    shr                 r5d, 11
    vpbroadcastd         m4, [r6-prep_avx512icl+prep_mul+r5*4]
    lea            stride3q, [strideq*3]
    jmp                  wq
.prep_w4:
    mov                 r3d, 0x0c
    kmovb                k1, r3d                  ; mask selects qwords 2-3 of ym0
.prep_w4_loop:
    movq                xm0, [srcq+strideq*0]
    movhps              xm0, [srcq+strideq*1]
    vpbroadcastq        ym1, [srcq+strideq*2]
    vpunpcklqdq     ym0{k1}, ym1, [srcq+stride3q] {1to4} ; merge rows 2-3 into the top half
    lea                srcq, [srcq+strideq*4]
    pmullw              ym0, ym4
    psubw               ym0, ym5
    mova             [tmpq], ym0
    add                tmpq, 32
    sub                  hd, 4
    jg .prep_w4_loop
    RET
.prep_w8:
    movu                xm0, [srcq+strideq*0]
    vinserti32x4        ym0, [srcq+strideq*1], 1
    vinserti32x4         m0, [srcq+strideq*2], 2
    vinserti32x4         m0, [srcq+stride3q ], 3
    lea                srcq, [srcq+strideq*4]
    pmullw               m0, m4
    psubw                m0, m5
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 4
    jg .prep_w8
    RET
.prep_w16:
    movu                ym0, [srcq+strideq*0]
    vinserti32x8         m0, [srcq+strideq*1], 1
    movu                ym1, [srcq+strideq*2]
    vinserti32x8         m1, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    pmullw               m0, m4
    pmullw               m1, m4
    psubw                m0, m5
    psubw                m1, m5
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    add                tmpq, 64*2
    sub                  hd, 4
    jg .prep_w16
    RET
.prep_w32:
    pmullw               m0, m4, [srcq+strideq*0]
    pmullw               m1, m4, [srcq+strideq*1]
    pmullw               m2, m4, [srcq+strideq*2]
    pmullw               m3, m4, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    REPX      {psubw x, m5}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    sub                  hd, 4
    jg .prep_w32
    RET
.prep_w64:
    pmullw               m0, m4, [srcq+strideq*0+64*0]
    pmullw               m1, m4, [srcq+strideq*0+64*1]
    pmullw               m2, m4, [srcq+strideq*1+64*0]
    pmullw               m3, m4, [srcq+strideq*1+64*1]
    lea                srcq, [srcq+strideq*2]
    REPX      {psubw x, m5}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    sub                  hd, 2
    jg .prep_w64
    RET
.prep_w128:
    pmullw               m0, m4, [srcq+64*0]
    pmullw               m1, m4, [srcq+64*1]
    pmullw               m2, m4, [srcq+64*2]
    pmullw               m3, m4, [srcq+64*3]
    add                srcq, strideq
    REPX      {psubw x, m5}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    dec                  hd
    jg .prep_w128
    RET
; Horizontal bilinear setup: m5 = mx, m4 = 16 - mx, m6 = 32766 bias;
; output per pixel is (src[x]*(16-mx) + src[x+1]*mx - bias) >> 2 (psraw).
.h:
    vpbroadcastw         m5, mxyd
    mov                mxyd, r6m ; my
    vpbroadcastd         m4, [pw_16]
    vpbroadcastd         m6, [pw_32766]
    psubw                m4, m5
    test          dword r7m, 0x800   ; 12bpc if bit 11 of bitdepth_max is set
    jnz .h_12bpc
    psllw                m4, 2
    psllw                m5, 2
.h_12bpc:
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.h_w4:
    movu                xm1, [srcq+strideq*0]
    vinserti32x4        ym1, [srcq+strideq*2], 1
    movu                xm2, [srcq+strideq*1]
    vinserti32x4        ym2, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    punpcklqdq          ym0, ym1, ym2            ; aligned pixels of all 4 rows
    psrldq              ym1, 2                   ; +1-pixel offsets
    psrldq              ym2, 2
    pmullw              ym0, ym4
    punpcklqdq          ym1, ym2
    pmullw              ym1, ym5
    psubw               ym0, ym6
    paddw               ym0, ym1
    psraw               ym0, 2
    mova             [tmpq], ym0
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4
    RET
.h_w8:
    movu                xm0, [srcq+strideq*0+0]
    movu                xm1, [srcq+strideq*0+2]
    vinserti32x4        ym0, [srcq+strideq*1+0], 1
    vinserti32x4        ym1, [srcq+strideq*1+2], 1
    vinserti32x4         m0, [srcq+strideq*2+0], 2
    vinserti32x4         m1, [srcq+strideq*2+2], 2
    vinserti32x4         m0, [srcq+stride3q +0], 3
    vinserti32x4         m1, [srcq+stride3q +2], 3
    lea                srcq, [srcq+strideq*4]
    pmullw               m0, m4
    pmullw               m1, m5
    psubw                m0, m6
    paddw                m0, m1
    psraw                m0, 2
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 4
    jg .h_w8
    RET
.h_w16:
    movu                ym0, [srcq+strideq*0+0]
    vinserti32x8         m0, [srcq+strideq*1+0], 1
    movu                ym1, [srcq+strideq*0+2]
    vinserti32x8         m1, [srcq+strideq*1+2], 1
    lea                srcq, [srcq+strideq*2]
    pmullw               m0, m4
    pmullw               m1, m5
    psubw                m0, m6
    paddw                m0, m1
    psraw                m0, 2
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 2
    jg .h_w16
    RET
.h_w32:
    pmullw               m0, m4, [srcq+strideq*0+0]
    pmullw               m2, m5, [srcq+strideq*0+2]
    pmullw               m1, m4, [srcq+strideq*1+0]
    pmullw               m3, m5, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    psubw                m0, m6
    psubw                m1, m6
    paddw                m0, m2
    paddw                m1, m3
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    add                tmpq, 64*2
    sub                  hd, 2
    jg .h_w32
    RET
.h_w64:
    pmullw               m0, m4, [srcq+ 0]
    pmullw               m2, m5, [srcq+ 2]
    pmullw               m1, m4, [srcq+64]
    pmullw               m3, m5, [srcq+66]
    add                srcq, strideq
    psubw                m0, m6
    psubw                m1, m6
    paddw                m0, m2
    paddw                m1, m3
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    add                tmpq, 64*2
    dec                  hd
    jg .h_w64
    RET
.h_w128:
    pmullw               m0, m4, [srcq+  0]
    pmullw               m7, m5, [srcq+  2]
    pmullw               m1, m4, [srcq+ 64]
    pmullw               m8, m5, [srcq+ 66]
    pmullw               m2, m4, [srcq+128]
    pmullw               m9, m5, [srcq+130]
    pmullw               m3, m4, [srcq+192]
    pmullw              m10, m5, [srcq+194]
    add                srcq, strideq
    REPX      {psubw x, m6}, m0, m1, m2, m3
    paddw                m0, m7
    paddw                m1, m8
    paddw                m2, m9
    paddw                m3, m10
    REPX       {psraw x, 2}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    dec                  hd
    jg .h_w128
    RET
; Vertical bilinear setup: same weight scheme as .h but in m8 (16-my),
; m9 (my) and m10 (32766 bias), so m0-m7 stay free for row data.
.v:
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    vpbroadcastw         m9, mxyd
    vpbroadcastd         m8, [pw_16]
    vpbroadcastd        m10, [pw_32766]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    psubw                m8, m9
    test          dword r7m, 0x800
    jnz .v_12bpc
    psllw                m8, 2
    psllw                m9, 2
.v_12bpc:
    jmp                  wq
.v_w4:
    movq               xmm0, [srcq+strideq*0]
.v_w4_loop:
    vpbroadcastq       xmm2, [srcq+strideq*1]
    vpbroadcastq       ymm1, [srcq+strideq*2]
    vpbroadcastq       ymm3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vpblendd           ymm2, ymm1, 0x30
    vpblendd           ymm2, ymm3, 0xc0
    vpblendd           ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
    movq               xmm0, [srcq+strideq*0]
    valignq            ymm2, ymm0, ymm2, 1    ; 1 2 3 4
    pmullw             ymm1, ym8
    pmullw             ymm2, ym9
    psubw              ymm1, ym10
    paddw              ymm1, ymm2
    psraw              ymm1, 2
    mova             [tmpq], ymm1
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    vzeroupper                                 ; legacy-encoded ymm regs were used above
    RET
.v_w8:
    movu                xm0, [srcq+strideq*0]
.v_w8_loop:
    vinserti32x4        ym1, ym0, [srcq+strideq*1], 1
    vinserti32x4         m1, [srcq+strideq*2], 2
    vinserti32x4         m1, [srcq+stride3q ], 3 ; 0 1 2 3
    lea                srcq, [srcq+strideq*4]
    movu                xm0, [srcq+strideq*0]
    valignq              m2, m0, m1, 2           ; 1 2 3 4
    pmullw               m1, m8
    pmullw               m2, m9
    psubw                m1, m10
    paddw                m1, m2
    psraw                m1, 2
    mova             [tmpq], m1
    add                tmpq, 64
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    movu                ym0, [srcq+strideq*0]
.v_w16_loop:
    vinserti32x8         m1, m0, [srcq+strideq*1], 1 ; 0 1
    movu                ym3, [srcq+strideq*2]
    vinserti32x8         m2, m3, [srcq+stride3q ], 1 ; 2 3
    lea                srcq, [srcq+strideq*4]
    movu                ym0, [srcq+strideq*0]
    vshufi32x4           m3, m1, m3, q1032           ; 1 2
    vshufi32x4           m4, m2, m0, q1032           ; 3 4
    pmullw               m1, m8
    pmullw               m2, m8
    pmullw               m3, m9
    pmullw               m4, m9
    psubw                m1, m10
    psubw                m2, m10
    paddw                m1, m3
    paddw                m2, m4
    psraw                m1, 2
    psraw                m2, 2
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m2
    add                tmpq, 64*2
    sub                  hd, 4
    jg .v_w16_loop
    RET
.v_w32:
    movu                 m0, [srcq+strideq*0]
.v_w32_loop:
    movu                 m3, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    pmullw               m1, m8, m0
    movu                 m0, [srcq+strideq*0]    ; row 2 doubles as next iteration's row 0
    pmullw               m2, m8, m3
    pmullw               m3, m9
    pmullw               m4, m9, m0
    psubw                m1, m10
    psubw                m2, m10
    paddw                m1, m3
    paddw                m2, m4
    psraw                m1, 2
    psraw                m2, 2
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m2
    add                tmpq, 64*2
    sub                  hd, 2
    jg .v_w32_loop
    RET
.v_w64:
    movu                 m0, [srcq+64*0]
    movu                 m1, [srcq+64*1]
.v_w64_loop:
    add                srcq, strideq
    pmullw               m2, m8, m0
    movu                 m0, [srcq+64*0]
    pmullw               m3, m8, m1
    movu                 m1, [srcq+64*1]
    pmullw               m4, m9, m0
    pmullw               m5, m9, m1
    psubw                m2, m10
    psubw                m3, m10
    paddw                m2, m4
    paddw                m3, m5
    psraw                m2, 2
    psraw                m3, 2
    mova        [tmpq+64*0], m2
    mova        [tmpq+64*1], m3
    add                tmpq, 64*2
    dec                  hd
    jg .v_w64_loop
    RET
.v_w128:
    movu                 m0, [srcq+64*0]
    movu                 m1, [srcq+64*1]
    movu                 m2, [srcq+64*2]
    movu                 m3, [srcq+64*3]
.v_w128_loop:
    add                srcq, strideq
    pmullw               m4, m8, m0
    movu                 m0, [srcq+64*0]
    pmullw               m5, m8, m1
    movu                 m1, [srcq+64*1]
    pmullw               m6, m8, m2
    movu                 m2, [srcq+64*2]
    pmullw               m7, m8, m3
    movu                 m3, [srcq+64*3]
    pmullw              m11, m9, m0
    pmullw              m12, m9, m1
    pmullw              m13, m9, m2
    pmullw              m14, m9, m3
    REPX     {psubw x, m10}, m4, m5, m6, m7
    paddw                m4, m11
    paddw                m5, m12
    paddw                m6, m13
    paddw                m7, m14
    REPX       {psraw x, 2}, m4, m5, m6, m7
    mova        [tmpq+64*0], m4
    mova        [tmpq+64*1], m5
    mova        [tmpq+64*2], m6
    mova        [tmpq+64*3], m7
    add                tmpq, 64*4
    dec                  hd
    jg .v_w128_loop
    RET
; Combined h+v: the horizontal pass (m4/m5/m6, set up in .h) feeds a
; vertical blend prev + pmulhrsw(cur - prev, m7) with m7 = my << 11.
.hv:
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
    shl                mxyd, 11
    vpbroadcastw         m7, mxyd
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.hv_w4:
    movq               xmm0, [srcq+strideq*0+0]
    movq               xmm1, [srcq+strideq*0+2]
    pmullw             xmm0, xm4
    pmullw             xmm1, xm5
    psubw              xmm0, xm6
    paddw              xmm0, xmm1
    psraw              xmm0, 2
    vpbroadcastq        ym0, xmm0
.hv_w4_loop:
    movu                xm1, [srcq+strideq*1]
    vinserti128         ym1, [srcq+stride3q ], 1
    movu                xm2, [srcq+strideq*2]
    lea                srcq, [srcq+strideq*4]
    vinserti128         ym2, [srcq+strideq*0], 1
    punpcklqdq          ym3, ym1, ym2
    psrldq              ym1, 2
    psrldq              ym2, 2
    pmullw              ym3, ym4
    punpcklqdq          ym1, ym2
    pmullw              ym1, ym5
    psubw               ym3, ym6
    paddw               ym1, ym3
    psraw               ym1, 2           ; 1 2 3 4
    valignq             ym2, ym1, ym0, 3 ; 0 1 2 3
    mova                ym0, ym1
    psubw               ym1, ym2
    pmulhrsw            ym1, ym7
    paddw               ym1, ym2
    mova             [tmpq], ym1
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    pmullw              xm0, xm4, [srcq+strideq*0+0]
    pmullw              xm1, xm5, [srcq+strideq*0+2]
    psubw               xm0, xm6
    paddw               xm0, xm1
    psraw               xm0, 2
    vinserti32x4         m0, xm0, 3      ; park row 0 in the top lane for valignq
.hv_w8_loop:
    movu                xm1, [srcq+strideq*1+0]
    movu                xm2, [srcq+strideq*1+2]
    vinserti32x4        ym1, [srcq+strideq*2+0], 1
    vinserti32x4        ym2, [srcq+strideq*2+2], 1
    vinserti32x4         m1, [srcq+stride3q +0], 2
    vinserti32x4         m2, [srcq+stride3q +2], 2
    lea                srcq, [srcq+strideq*4]
    vinserti32x4         m1, [srcq+strideq*0+0], 3
    vinserti32x4         m2, [srcq+strideq*0+2], 3
    pmullw               m1, m4
    pmullw               m2, m5
    psubw                m1, m6
    paddw                m1, m2
    psraw                m1, 2         ; 1 2 3 4
    valignq              m2, m1, m0, 6 ; 0 1 2 3
    mova                 m0, m1
    psubw                m1, m2
    pmulhrsw             m1, m7
    paddw                m1, m2
    mova             [tmpq], m1
    add                tmpq, 64
    sub                  hd, 4
    jg .hv_w8_loop
    RET
.hv_w16:
    pmullw              ym0, ym4, [srcq+strideq*0+0]
    pmullw              ym1, ym5, [srcq+strideq*0+2]
    psubw               ym0, ym6
    paddw               ym0, ym1
    psraw               ym0, 2
    vinserti32x8         m0, ym0, 1
.hv_w16_loop:
    movu                ym1, [srcq+strideq*1+0]
    movu                ym2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    vinserti32x8         m1, [srcq+strideq*0+0], 1
    vinserti32x8         m2, [srcq+strideq*0+2], 1
    pmullw               m1, m4
    pmullw               m2, m5
    psubw                m1, m6
    paddw                m1, m2
    psraw                m1, 2             ; 1 2
    vshufi32x4           m2, m0, m1, q1032 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    pmulhrsw             m1, m7
    paddw                m1, m2
    mova             [tmpq], m1
    add                tmpq, 64
    sub                  hd, 2
    jg .hv_w16_loop
    RET
.hv_w32:
    pmullw               m0, m4, [srcq+strideq*0+0]
    pmullw               m1, m5, [srcq+strideq*0+2]
    psubw                m0, m6
    paddw                m0, m1
    psraw                m0, 2
.hv_w32_loop:
    pmullw               m3, m4, [srcq+strideq*1+0]
    pmullw               m1, m5, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    psubw                m3, m6
    paddw                m3, m1
    psraw                m3, 2
    psubw                m1, m3, m0
    pmulhrsw             m1, m7
    paddw                m1, m0
    pmullw               m0, m4, [srcq+strideq*0+0]
    pmullw               m2, m5, [srcq+strideq*0+2]
    psubw                m0, m6
    paddw                m0, m2
    psraw                m0, 2
    psubw                m2, m0, m3
    pmulhrsw             m2, m7
    paddw                m2, m3
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m2
    add                tmpq, 64*2
    sub                  hd, 2
    jg .hv_w32_loop
    RET
.hv_w64:
    pmullw               m0, m4, [srcq+ 0]
    pmullw               m2, m5, [srcq+ 2]
    pmullw               m1, m4, [srcq+64]
    pmullw               m3, m5, [srcq+66]
    psubw                m0, m6
    psubw                m1, m6
    paddw                m0, m2
    paddw                m1, m3
    psraw                m0, 2
    psraw                m1, 2
.hv_w64_loop:
    add                srcq, strideq
    pmullw               m2, m4, [srcq+ 0]
    pmullw               m8, m5, [srcq+ 2]
    pmullw               m3, m4, [srcq+64]
    pmullw               m9, m5, [srcq+66]
    psubw                m2, m6
    psubw                m3, m6
    paddw                m2, m8
    paddw                m3, m9
    psraw                m2, 2
    psraw                m3, 2
    psubw                m8, m2, m0
    psubw                m9, m3, m1
    pmulhrsw             m8, m7
    pmulhrsw             m9, m7
    paddw                m8, m0
    mova                 m0, m2                   ; current row becomes next prev
    paddw                m9, m1
    mova                 m1, m3
    mova        [tmpq+64*0], m8
    mova        [tmpq+64*1], m9
    add                tmpq, 64*2
    dec                  hd
    jg .hv_w64_loop
    RET
.hv_w128:
    pmullw               m0, m4, [srcq+  0]
    pmullw               m8, m5, [srcq+  2]
    pmullw               m1, m4, [srcq+ 64]
    pmullw               m9, m5, [srcq+ 66]
    pmullw               m2, m4, [srcq+128]
    pmullw              m10, m5, [srcq+130]
    pmullw               m3, m4, [srcq+192]
    pmullw              m11, m5, [srcq+194]
    REPX      {psubw x, m6}, m0, m1, m2, m3
    paddw                m0, m8
    paddw                m1, m9
    paddw                m2, m10
    paddw                m3, m11
    REPX       {psraw x, 2}, m0, m1, m2, m3
.hv_w128_loop:
    add                srcq, strideq
    pmullw               m8, m4, [srcq+  0]
    pmullw              m12, m5, [srcq+  2]
    pmullw               m9, m4, [srcq+ 64]
    pmullw              m13, m5, [srcq+ 66]
    pmullw              m10, m4, [srcq+128]
    pmullw              m14, m5, [srcq+130]
    pmullw              m11, m4, [srcq+192]
    pmullw              m15, m5, [srcq+194]
    REPX      {psubw x, m6}, m8, m9, m10, m11
    paddw                m8, m12
    paddw                m9, m13
    paddw               m10, m14
    paddw               m11, m15
    REPX       {psraw x, 2}, m8, m9, m10, m11
    psubw               m12, m8, m0
    psubw               m13, m9, m1
    psubw               m14, m10, m2
    psubw               m15, m11, m3
    REPX   {pmulhrsw x, m7}, m12, m13, m14, m15
    paddw               m12, m0
    mova                 m0, m8
    paddw               m13, m1
    mova                 m1, m9
    mova        [tmpq+64*0], m12
    mova        [tmpq+64*1], m13
    paddw               m14, m2
    mova                 m2, m10
    paddw               m15, m3
    mova                 m3, m11
    mova        [tmpq+64*2], m14
    mova        [tmpq+64*3], m15
    add                tmpq, 64*4
    dec                  hd
    jg .hv_w128_loop
    RET
   1485 
    1486 ; int8_t subpel_filters[5][15][8]
    ; Packed pair of row offsets into subpel_filters (15 rows of 8 taps per
    ; filter type): bits 31:16 select the full-tap filter row (extracted with
    ; `shr mxd, 16` on the wide paths), bits 15:0 — effectively the low byte,
    ; read via `movzx mxd, mxb` — presumably select the reduced-tap variant
    ; used for small widths. TODO(review): confirm against subpel_filters layout.
    1487 %assign FILTER_REGULAR (0*15 << 16) | 3*15
    1488 %assign FILTER_SMOOTH  (1*15 << 16) | 4*15
    1489 %assign FILTER_SHARP   (2*15 << 16) | 3*15
   1490 
   1491 %macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
   1492 cglobal %1_%2_16bpc
   1493    mov                 t0d, FILTER_%3
   1494 %ifidn %3, %4
   1495    mov                 t1d, t0d
   1496 %else
   1497    mov                 t1d, FILTER_%4
   1498 %endif
   1499 %if %0 == 5 ; skip the jump in the last filter
   1500    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
   1501 %endif
   1502 %endmacro
   1503 
    ; Per-ABI temporary registers and a small scratch area ("buf") for filter
    ; coefficients, placed in ABI-guaranteed scratch memory so no explicit
    ; stack adjustment is needed.
    1504 %if WIN64
    1505 DECLARE_REG_TMP 4, 5           ; t0/t1 = r4/r5
    1506 %define buf rsp+stack_offset+8 ; shadow space — the caller-reserved 32 bytes above the return address
    1507 %else
    1508 DECLARE_REG_TMP 7, 8           ; t0/t1 = r7/r8 (SysV)
    1509 %define buf rsp-40 ; red zone — below rsp, safe in leaf code on SysV
    1510 %endif
   1511 
    1512 %define PUT_8TAP_FN FN put_8tap,
    ; Each invocation emits a put_8tap_<name>_16bpc entry point that loads the
    ; packed h/v filter offsets and tail-jumps to the shared 6-tap code; the
    ; final variant (regular/regular) omits the jump target and falls through
    ; into the put_6tap_16bpc implementation that follows.
    1513 PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
    1514 PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
    1515 PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
    1516 PUT_8TAP_FN regular,        REGULAR, REGULAR
   1517 
   1518 cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
   1519 %define base r8-put_avx512icl
   1520    imul                mxd, mxm, 0x010101
   1521    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
   1522    imul                myd, mym, 0x010101
   1523    add                 myd, t1d ; 6tap_v, my, 4tap_v
   1524    lea                  r8, [put_avx512icl]
   1525    movifnidn            wd, wm
   1526    movifnidn            hd, hm
   1527    test                mxd, 0xf00
   1528    jnz .h
   1529    test                myd, 0xf00
   1530    jnz .v
   1531 .put:
   1532    tzcnt                wd, wd
   1533    movzx                wd, word [r8+wq*2+table_offset(put,)]
   1534    add                  wq, r8
   1535 %if WIN64
   1536    pop                  r8
   1537 %endif
   1538    jmp                  wq
   1539 .h_w8:
   1540    mova                 m4, [spel_h_shufA]
   1541    movu                 m5, [spel_h_shufB]
   1542    movu                 m6, [spel_h_shufC]
   1543 .h_w8_loop:
   1544    movu                ym2, [srcq+ssq*0]
   1545    vinserti32x8         m2, [srcq+ssq*1], 1
   1546    lea                srcq, [srcq+ssq*2]
   1547    mova                 m0, m8
   1548    vpermb               m1, m4, m2
   1549    vpdpwssd             m0, m10, m1
   1550    vpermb               m1, m5, m2
   1551    vpdpwssd             m0, m11, m1
   1552    vpermb               m1, m6, m2
   1553    vpdpwssd             m0, m12, m1
   1554    psrad                m0, 6
   1555    vextracti32x8       ym1, m0, 1
   1556    packusdw            ym0, ym1
   1557    pminsw              ym0, ym15
   1558    mova          [dstq+dsq*0], xm0
   1559    vextracti32x4 [dstq+dsq*1], ym0, 1
   1560    lea                dstq, [dstq+dsq*2]
   1561    sub                  hd, 2
   1562    jg .h_w8_loop
   1563    RET
   1564 .h:
   1565    vpbroadcastw        m15, r8m
   1566    test                myd, 0xf00
   1567    jnz .hv
   1568    mov                 r7d, r8m
   1569    shr                 r7d, 11
   1570    vpbroadcastd         m8, [base+put_8tap_h_rnd+r7*4]
   1571    cmp                  wd, 4
   1572    jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4
   1573    shr                 mxd, 16
   1574    sub                srcq, 4
   1575    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
   1576    mova              [buf], xmm0
   1577    vpbroadcastd        m10, xmm0
   1578    vpbroadcastd        m12, [buf+8]
   1579    vpbroadcastd        m11, [buf+4]
   1580    sub                  wd, 16
   1581    jl .h_w8
   1582    vbroadcasti32x4      m6, [spel_h_shufA]
   1583    vbroadcasti32x4      m7, [spel_h_shufB]
   1584    jg .h_w32
   1585 .h_w16_loop:
   1586    movu                ym2, [srcq+ssq*0+ 0]
   1587    vinserti32x8         m2, [srcq+ssq*1+ 0], 1
   1588    movu                ym3, [srcq+ssq*0+12]
   1589    vinserti32x8         m3, [srcq+ssq*1+12], 1
   1590    lea                srcq, [srcq+ssq*2]
   1591    mova                 m0, m8
   1592    mova                 m1, m8
   1593    pshufb               m4, m2, m6
   1594    vpdpwssd             m0, m10, m4 ; a0  b0
   1595    pshufb               m4, m3, m7
   1596    vpdpwssd             m1, m12, m4 ; a2' b2'
   1597    pshufb               m2, m7
   1598    pshufb               m3, m6
   1599    vpdpwssd             m0, m11, m2 ; a1  b1
   1600    vpdpwssd             m1, m11, m3 ; a1' b1'
   1601    shufpd               m2, m3, 0x55
   1602    vpdpwssd             m0, m12, m2 ; a2  b2
   1603    vpdpwssd             m1, m10, m2 ; a0' b0'
   1604    psrad                m0, 6
   1605    psrad                m1, 6
   1606    packusdw             m0, m1
   1607    pminsw               m0, m15
   1608    mova          [dstq+dsq*0], ym0
   1609    vextracti32x8 [dstq+dsq*1], m0, 1
   1610    lea                dstq, [dstq+dsq*2]
   1611    sub                  hd, 2
   1612    jg .h_w16_loop
   1613    RET
   1614 .h_w32:
   1615    lea                srcq, [srcq+wq*2]
   1616    lea                dstq, [dstq+wq*2]
   1617    neg                  wq
   1618 .h_w32_loop0:
   1619    mov                  r6, wq
   1620 .h_w32_loop:
   1621    movu                 m2, [srcq+r6*2+ 0]
   1622    movu                 m3, [srcq+r6*2+12]
   1623    mova                 m0, m8
   1624    mova                 m1, m8
   1625    pshufb               m4, m2, m6
   1626    vpdpwssd             m0, m10, m4 ; a0
   1627    pshufb               m4, m3, m7
   1628    vpdpwssd             m1, m12, m4 ; b2
   1629    pshufb               m2, m7
   1630    pshufb               m3, m6
   1631    vpdpwssd             m0, m11, m2 ; a1
   1632    vpdpwssd             m1, m11, m3 ; b1
   1633    shufpd               m2, m3, 0x55
   1634    vpdpwssd             m0, m12, m2 ; a2
   1635    vpdpwssd             m1, m10, m2 ; b0
   1636    psrad                m0, 6
   1637    psrad                m1, 6
   1638    packusdw             m0, m1
   1639    pminsw               m0, m15
   1640    mova        [dstq+r6*2], m0
   1641    add                  r6, 32
   1642    jl .h_w32_loop
   1643    add                srcq, ssq
   1644    add                dstq, dsq
   1645    dec                  hd
   1646    jg .h_w32_loop0
   1647    RET
   1648 .v:
   1649    movzx               mxd, myb
   1650    shr                 myd, 16
   1651    cmp                  hd, 6
   1652    cmovs               myd, mxd
   1653    vpbroadcastd        m11, [pd_32]
   1654    pmovsxbw           xmm0, [base+subpel_filters+1+myq*8]
   1655    tzcnt               r7d, wd
   1656    vpbroadcastw        m15, r8m
   1657    mov                  r6, ssq
   1658    movzx               r7d, word [r8+r7*2+table_offset(put, _6tap_v)]
   1659    neg                  r6
   1660    mova [rsp+stack_offset+8], xmm0
   1661    vpbroadcastd        m12, xmm0
   1662    add                  r7, r8
   1663    vpbroadcastd        m13, [rsp+stack_offset+12]
   1664    vpbroadcastd        m14, [rsp+stack_offset+16]
   1665    jmp                  r7
   1666 .v_w2:
   1667    movd               xmm2, [srcq+r6 *2]
   1668    pinsrd             xmm2, [srcq+r6 *1], 1
   1669    pinsrd             xmm2, [srcq+ssq*0], 2
   1670    pinsrd             xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
   1671    lea                srcq, [srcq+ssq*2]
   1672    movd               xmm0, [srcq+ssq*0]
   1673    palignr            xmm3, xmm0, xmm2, 4   ; 1 2 3 4
   1674    punpcklwd          xmm1, xmm2, xmm3      ; 01 12
   1675    punpckhwd          xmm2, xmm3            ; 23 34
   1676 .v_w2_loop:
   1677    movd               xmm3, [srcq+ssq*1]
   1678    mova               xmm4, xm11
   1679    vpdpwssd           xmm4, xmm1, xm12      ; a0 b0
   1680    lea                srcq, [srcq+ssq*2]
   1681    mova               xmm1, xmm2
   1682    vpdpwssd           xmm4, xmm2, xm13      ; a1 b1
   1683    punpckldq          xmm2, xmm0, xmm3      ; 4 5
   1684    movd               xmm0, [srcq+ssq*0]
   1685    punpckldq          xmm3, xmm0            ; 5 6
   1686    punpcklwd          xmm2, xmm3            ; 45 56
   1687    vpdpwssd           xmm4, xmm2, xm14      ; a2 b2
   1688    psrad              xmm4, 6
   1689    packusdw           xmm4, xmm4
   1690    pminsw             xmm4, xm15
   1691    movd       [dstq+dsq*0], xmm4
   1692    pextrd     [dstq+dsq*1], xmm4, 1
   1693    lea                dstq, [dstq+dsq*2]
   1694    sub                  hd, 2
   1695    jg .v_w2_loop
   1696    RET
   1697 .v_w4:
   1698    movq               xmm1, [srcq+r6 *2]
   1699    vpbroadcastq       ymm3, [srcq+r6 *1]
   1700    vpbroadcastq       ymm2, [srcq+ssq*0]
   1701    vpbroadcastq       ymm4, [srcq+ssq*1]
   1702    lea                srcq, [srcq+ssq*2]
   1703    vpbroadcastq       ymm0, [srcq+ssq*0]
   1704    vpblendd           ymm1, ymm3, 0x30
   1705    vpblendd           ymm3, ymm2, 0x30
   1706    punpcklwd          ymm1, ymm3       ; 01 12
   1707    vpblendd           ymm2, ymm4, 0x30
   1708    vpblendd           ymm4, ymm0, 0x30
   1709    punpcklwd          ymm2, ymm4       ; 23 34
   1710 .v_w4_loop:
   1711    vpbroadcastq       ymm3, [srcq+ssq*1]
   1712    mova               ymm4, ym11
   1713    vpdpwssd           ymm4, ymm1, ym12 ; a0 b0
   1714    lea                srcq, [srcq+ssq*2]
   1715    mova               ymm1, ymm2
   1716    vpdpwssd           ymm4, ymm2, ym13 ; a1 b1
   1717    vpblendd           ymm2, ymm0, ymm3, 0x30
   1718    vpbroadcastq       ymm0, [srcq+ssq*0]
   1719    vpblendd           ymm3, ymm0, 0x30
   1720    punpcklwd          ymm2, ymm3       ; 45 56
   1721    vpdpwssd           ymm4, ymm2, ym14 ; a2 b2
   1722    psrad              ymm4, 6
   1723    vextracti128       xmm3, ymm4, 1
   1724    packusdw           xmm4, xmm3
   1725    pminsw             xmm4, xm15
   1726    movq       [dstq+dsq*0], xmm4
   1727    movhps     [dstq+dsq*1], xmm4
   1728    lea                dstq, [dstq+dsq*2]
   1729    sub                  hd, 2
   1730    jg .v_w4_loop
   1731    vzeroupper
   1732    RET
   1733 .v_w8:
   1734    vbroadcasti32x4      m0, [srcq+ssq*0]
   1735    vinserti32x4         m1, m0, [srcq+r6 *2], 0
   1736    vinserti32x4         m1, [srcq+r6 *1], 1 ; 0 1 2
   1737    vinserti32x4        ym0, [srcq+ssq*1], 1
   1738    lea                srcq, [srcq+ssq*2]
   1739    mova                 m5, [spel_v_shuf8]
   1740    vinserti32x4         m0, [srcq+ssq*0], 2 ; 2 3 4
   1741    vpermb               m1, m5, m1          ; 01 12
   1742    vpermb               m2, m5, m0          ; 23 34
   1743 .v_w8_loop:
   1744    vinserti32x4         m0, [srcq+ssq*1], 3
   1745    lea                srcq, [srcq+ssq*2]
   1746    movu                xm3, [srcq+ssq*0]
   1747    mova                 m4, m11
   1748    vpdpwssd             m4, m12, m1         ; a0 b0
   1749    vshufi32x4           m0, m3, q1032       ; 4 5 6
   1750    mova                 m1, m2
   1751    vpdpwssd             m4, m13, m2         ; a1 b1
   1752    vpermb               m2, m5, m0          ; 45 56
   1753    vpdpwssd             m4, m14, m2         ; a2 b2
   1754    psrad                m4, 6
   1755    vextracti32x8       ym3, m4, 1
   1756    packusdw            ym4, ym3
   1757    pminsw              ym4, ym15
   1758    mova          [dstq+dsq*0], xm4
   1759    vextracti32x4 [dstq+dsq*1], ym4, 1
   1760    lea                dstq, [dstq+dsq*2]
   1761    sub                  hd, 2
   1762    jg .v_w8_loop
   1763    RET
   1764 .v_w16:
   1765    vbroadcasti32x8      m0, [srcq+r6 *1]
   1766    vinserti32x8         m1, m0, [srcq+ssq*0], 1
   1767    vinserti32x8         m0, [srcq+r6*2], 0
   1768    mova                 m6, [spel_v_shuf16]
   1769    movu                ym3, [srcq+ssq*1]
   1770    lea                srcq, [srcq+ssq*2]
   1771    vinserti32x8         m3, [srcq+ssq*0], 1
   1772    vpermb               m1, m6, m1     ; 12
   1773    vpermb               m0, m6, m0     ; 01
   1774    vpermb               m3, m6, m3     ; 34
   1775    mova                 m7, [deint_q_shuf]
   1776    vpshrdd              m2, m1, m3, 16 ; 23
   1777 .v_w16_loop:
   1778    mova                 m5, m11
   1779    vpdpwssd             m5, m12, m1    ; b0
   1780    mova                 m4, m11
   1781    vpdpwssd             m4, m12, m0    ; a0
   1782    mova                 m1, m3
   1783    vpdpwssd             m5, m13, m3    ; b1
   1784    mova                 m0, m2
   1785    vpdpwssd             m4, m13, m2    ; a1
   1786    movu                ym3, [srcq+ssq*1]
   1787    lea                srcq, [srcq+ssq*2]
   1788    vinserti32x8         m3, [srcq+ssq*0], 1
   1789    vpermb               m3, m6, m3     ; 56
   1790    vpshrdd              m2, m1, m3, 16 ; 45
   1791    vpdpwssd             m5, m14, m3    ; b2
   1792    vpdpwssd             m4, m14, m2    ; a2
   1793    psrad                m5, 6
   1794    psrad                m4, 6
   1795    packusdw             m4, m5
   1796    pminsw               m4, m15
   1797    vpermq               m4, m7, m4
   1798    mova          [dstq+dsq*0], ym4
   1799    vextracti32x8 [dstq+dsq*1], m4, 1
   1800    lea                dstq, [dstq+dsq*2]
   1801    sub                  hd, 2
   1802    jg .v_w16_loop
   1803    RET
   1804 .v_w32:
   1805 .v_w64:
   1806 .v_w128:
   1807    lea                  wd, [hq+wq*8-256]
   1808 .v_w32_loop0:
   1809    movu                m16, [srcq+r6 *2]
   1810    movu                m17, [srcq+r6 *1]
   1811    lea                  r7, [srcq+ssq*2]
   1812    movu                m18, [srcq+ssq*0]
   1813    movu                m19, [srcq+ssq*1]
   1814    mov                  r8, dstq
   1815    movu                m20, [r7  +ssq*0]
   1816    punpcklwd            m0, m16, m17 ; 01
   1817    punpckhwd           m16, m17
   1818    punpcklwd            m1, m17, m18 ; 12
   1819    punpckhwd           m17, m18
   1820    punpcklwd            m2, m18, m19 ; 23
   1821    punpckhwd           m18, m19
   1822    punpcklwd            m3, m19, m20 ; 34
   1823    punpckhwd           m19, m20
   1824 .v_w32_loop:
   1825    mova                 m4, m11
   1826    vpdpwssd             m4, m12, m0  ; a0
   1827    mova                 m6, m11
   1828    vpdpwssd             m6, m12, m16
   1829    mova                 m5, m11
   1830    vpdpwssd             m5, m12, m1  ; b0
   1831    mova                 m7, m11
   1832    vpdpwssd             m7, m12, m17
   1833    mova                 m0, m2
   1834    vpdpwssd             m4, m13, m2  ; a1
   1835    mova                m16, m18
   1836    vpdpwssd             m6, m13, m18
   1837    mova                 m1, m3
   1838    vpdpwssd             m5, m13, m3  ; b1
   1839    mova                m17, m19
   1840    vpdpwssd             m7, m13, m19
   1841    movu                m19, [r7+ssq*1]
   1842    lea                  r7, [r7+ssq*2]
   1843    punpcklwd            m2, m20, m19 ; 45
   1844    punpckhwd           m18, m20, m19
   1845    movu                m20, [r7+ssq*0]
   1846    vpdpwssd             m4, m14, m2  ; a2
   1847    vpdpwssd             m6, m14, m18
   1848    punpcklwd            m3, m19, m20 ; 56
   1849    punpckhwd           m19, m20
   1850    vpdpwssd             m5, m14, m3  ; b2
   1851    vpdpwssd             m7, m14, m19
   1852    REPX       {psrad x, 6}, m4, m6, m5, m7
   1853    packusdw             m4, m6
   1854    packusdw             m5, m7
   1855    pminsw               m4, m15
   1856    pminsw               m5, m15
   1857    mova         [r8+dsq*0], m4
   1858    mova         [r8+dsq*1], m5
   1859    lea                  r8, [r8+dsq*2]
   1860    sub                  hd, 2
   1861    jg .v_w32_loop
   1862    add                srcq, 64
   1863    add                dstq, 64
   1864    movzx                hd, wb
   1865    sub                  wd, 1<<8
   1866    jg .v_w32_loop0
   1867    vzeroupper
   1868    RET
   1869 .hv:
   1870    cmp                  wd, 4
   1871    jg .hv_w8
   1872    movzx               mxd, mxb
   1873    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
   1874    movzx               mxd, myb
   1875    shr                 myd, 16
   1876    cmp                  hd, 6
   1877    cmovs               myd, mxd
   1878    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
   1879    mov                  r6, ssq
   1880    sub                srcq, 2
   1881    neg                  r6
   1882    test          dword r8m, 0x800
   1883    jnz .hv_12bit
   1884    vpbroadcastd        m10, [pd_2176]
   1885    psllw              xmm0, 6
   1886    jmp .hv_main
   1887 .hv_12bit:
   1888    vpbroadcastd        m10, [pd_640]
   1889    psllw              xmm0, 4
   1890    psllw              xmm1, 2
   1891 .hv_main:
   1892    movu                xm4, [srcq+r6 *2]
   1893    vinserti32x4        ym4, [srcq+r6 *1], 1
   1894    vinserti32x4         m4, [srcq+ssq*0], 2
   1895    vbroadcasti32x4      m6, [spel_h_shufA]
   1896    vinserti32x4         m4, [srcq+ssq*1], 3 ; 0 1 2 3
   1897    lea                srcq, [srcq+ssq*2]
   1898    movu                xm5, [srcq+ssq*0]    ; 4
   1899    mova           [buf+ 0], xmm0
   1900    mova           [buf+16], xmm1
   1901    vpbroadcastd         m8, [buf+ 4]
   1902    vpbroadcastd         m9, [buf+ 8]
   1903    vpbroadcastd       ym12, xmm1
   1904    vpbroadcastd       ym13, [buf+20]
   1905    vpbroadcastd       ym14, [buf+24]
   1906    cmp                  wd, 4
   1907    je .hv_w4
   1908    vbroadcasti32x4      m2, [spel_h_shufA]
   1909    mova                 m3, [spel_h_shuf2b]
   1910    mova                 m1, m10
   1911    pshufb               m4, m6
   1912    pshufb              xm5, xm6
   1913    punpcklqdq           m2, m4, m5
   1914    vpdpwssd             m1, m8, m2    ; 04 1_ 2_ 3_
   1915    mova                ym6, [spel_h_shuf2a]
   1916    punpckhqdq           m4, m5
   1917    mova                xm5, [spel_shuf2]
   1918    vpdpwssd             m1, m9, m4
   1919    vpermb               m1, m3, m1    ; 01 12
   1920    vextracti32x4       xm2, ym1, 1    ; 23 34
   1921 .hv_w2_loop:
   1922    movu                xm3, [srcq+ssq*1]
   1923    lea                srcq, [srcq+ssq*2]
   1924    vinserti32x4        ym3, [srcq+ssq*0], 1
   1925    vpermb              ym3, ym6, ym3
   1926    pmaddwd            xmm0, xm12, xm1 ; a0 b0
   1927    mova                xm4, xm10
   1928    vpdpwssd            xm4, xm8, xm3
   1929    vextracti32x4       xm3, ym3, 1
   1930    mova                xm1, xm2
   1931    vpdpwssd           xmm0, xm13, xm2 ; a1 b1
   1932    vpdpwssd            xm4, xm9, xm3  ; 5 6
   1933    vpermt2b            xm2, xm5, xm4  ; 45 56
   1934    vpdpwssd           xmm0, xm14, xm2 ; a2 b2
   1935    psrad              xmm0, 10
   1936    packusdw           xmm0, xmm0
   1937    pminsw             xmm0, xm15
   1938    movd       [dstq+dsq*0], xmm0
   1939    pextrd     [dstq+dsq*1], xmm0, 1
   1940    lea                dstq, [dstq+dsq*2]
   1941    sub                  hd, 2
   1942    jg .hv_w2_loop
   1943    RET
   1944 .hv_w4:
   1945    vbroadcasti32x4      m7, [spel_h_shufB]
   1946    mova                ym0, [spel_shuf4a]
   1947    pshufb               m1, m4, m6
   1948    mova                 m2, m10
   1949    vpdpwssd             m2, m8, m1
   1950    pshufb              xm1, xm5, xm6
   1951    mova                xm3, xm10
   1952    vpdpwssd            xm3, xm8, xm1
   1953    pshufb               m4, m7
   1954    pshufb              xm5, xm7
   1955    vpdpwssd             m2, m9, m4    ; 0 1 2 3
   1956    vpdpwssd            xm3, xm9, xm5  ; 4
   1957    mova                ym5, [spel_shuf4b]
   1958    vpermb               m1, m0, m2    ; 01 12
   1959    vshufi32x4           m2, m3, q1032 ; 2 3 4
   1960    vpermb               m2, m0, m2    ; 23 34
   1961 .hv_w4_loop:
   1962    movu                xm3, [srcq+ssq*1]
   1963    lea                srcq, [srcq+ssq*2]
   1964    vinserti32x4        ym3, [srcq+ssq*0], 1
   1965    pmaddwd             ym0, ym12, ym1 ; a0 b0
   1966    mova                ym1, ym2
   1967    pshufb              ym4, ym3, ym6
   1968    mova                ym2, ym10
   1969    vpdpwssd            ym2, ym8, ym4
   1970    pshufb              ym3, ym7
   1971    vpdpwssd            ym0, ym13, ym1 ; a1 b1
   1972    vpdpwssd            ym2, ym9, ym3  ; 5 6
   1973    vpermt2b            ym2, ym5, ym1  ; 45 56
   1974    vpdpwssd            ym0, ym14, ym2 ; a2 b2
   1975    psrad               ym0, 10
   1976    vextracti32x4       xm4, ym0, 1
   1977    packusdw            xm0, xm4
   1978    pminsw             xmm0, xm0, xm15
   1979    movq       [dstq+dsq*0], xmm0
   1980    movhps     [dstq+dsq*1], xmm0
   1981    lea                dstq, [dstq+dsq*2]
   1982    sub                  hd, 2
   1983    jg .hv_w4_loop
   1984    RET
   1985 .hv_w8:
   1986    shr                 mxd, 16
   1987    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
   1988    movzx               mxd, myb
   1989    shr                 myd, 16
   1990    cmp                  hd, 6
   1991    cmovs               myd, mxd
   1992    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
   1993    mov                  r6, ssq
   1994    sub                srcq, 4
   1995    neg                  r6
   1996    test          dword r8m, 0x800
   1997    jnz .hv_w8_12bit
   1998    vpbroadcastd         m8, [pd_2176]
   1999    psllw              xmm0, 6
   2000    jmp .hv_w8_main
   2001 .hv_w8_12bit:
   2002    vpbroadcastd         m8, [pd_640]
   2003    psllw              xmm0, 4
   2004    psllw              xmm1, 2
   2005 .hv_w8_main:
   2006    mova           [buf+ 0], xmm0
   2007    mova           [buf+16], xmm1
   2008    vpbroadcastd         m9, xmm0
   2009    vpbroadcastd        m10, [buf+ 4]
   2010    vpbroadcastd        m11, [buf+ 8]
   2011    vpbroadcastd        m12, xmm1
   2012    vpbroadcastd        m13, [buf+20]
   2013    vpbroadcastd        m14, [buf+24]
   2014    cmp                  wd, 16
   2015    jge .hv_w16
   2016    mova                 m6, [spel_h_shufA]
   2017    movu               ym16, [srcq+r6 *2]
   2018    vinserti32x8        m16, [srcq+r6 *1], 1 ; 0 1
   2019    movu               ym17, [srcq+ssq*0]
   2020    vinserti32x8        m17, [srcq+ssq*1], 1 ; 2 3
   2021    lea                srcq, [srcq+ssq*2]
   2022    movu               ym18, [srcq+ssq*0]    ; 4
   2023    movu                 m7, [spel_h_shufC]
   2024    vpermb               m3, m6, m16
   2025    mova                 m1, m8
   2026    vpermb               m4, m6, m17
   2027    vpdpwssd             m1, m9, m3   ; a0 b0
   2028    mova                 m2, m8
   2029    vpermb               m5, m6, m18
   2030    vpdpwssd             m2, m9, m4   ; c0 d0
   2031    mova                 m0, m8
   2032    vpermb              m16, m7, m16
   2033    vpdpwssd             m0, m9, m5   ; e0
   2034    vpermb              m17, m7, m17
   2035    vpdpwssd             m1, m11, m16 ; a2 b2
   2036    vpermb              m18, m7, m18
   2037    vpdpwssd             m2, m11, m17 ; c2 d2
   2038    shufpd               m3, m16, 0x55
   2039    vpdpwssd             m0, m11, m18 ; e2
   2040    mova                m16, [spel_shuf8a]
   2041    shufpd               m4, m17, 0x55
   2042    vpdpwssd             m1, m10, m3  ; a1 b1
   2043    shufpd               m5, m18, 0x55
   2044    vpdpwssd             m2, m10, m4  ; c1 d1
   2045    vpdpwssd             m0, m10, m5  ; e1
   2046    mova                 m5, [spel_shuf8b]
   2047    vpermt2b             m1, m16, m2  ; 01 12
   2048    vpermt2b             m2, m16, m0  ; 23 34
   2049 .hv_w8_loop:
   2050    movu               ym18, [srcq+ssq*1]
   2051    lea                srcq, [srcq+ssq*2]
   2052    vinserti32x8        m18, [srcq+ssq*0], 1
   2053    mova                 m0, m8
   2054    vpermb              m17, m6, m18
   2055    vpdpwssd             m0, m9, m17  ; f0 g0
   2056    vpermb              m18, m7, m18
   2057    pmaddwd             m16, m12, m1  ; A0 B0
   2058    vpdpwssd             m0, m11, m18 ; f2 g2
   2059    shufpd              m17, m18, 0x55
   2060    mova                 m1, m2
   2061    vpdpwssd            m16, m13, m2  ; A1 B1
   2062    vpdpwssd             m0, m10, m17 ; f1 g1
   2063    vpermt2b             m2, m5, m0   ; 45 56
   2064    vpdpwssd            m16, m14, m2  ; A2 B2
   2065    psrad               m16, 10
   2066    vextracti32x8      ym17, m16, 1
   2067    packusdw           ym16, ym17
   2068    pminsw             ym16, ym15
   2069    mova         [dstq+dsq*0], xm16
   2070    vextracti128 [dstq+dsq*1], ym16, 1
   2071    lea                dstq, [dstq+dsq*2]
   2072    sub                  hd, 2
   2073    jg .hv_w8_loop
   2074    vzeroupper
   2075    RET
   2076 .hv_w16:
   2077    vbroadcasti32x4     m20, [spel_h_shufA]
   2078    vbroadcasti32x4     m21, [spel_h_shufB]
   2079    jg .hv_w32
   2080    vbroadcasti32x8      m6, [srcq+r6 *2+ 8]
   2081    vinserti32x8         m2, m6, [srcq+r6 *2+16], 1
   2082    vinserti32x8         m6, [srcq+r6 *2+ 0], 0 ; 0
   2083    movu               ym16, [srcq+r6 *1+ 0]
   2084    movu               ym17, [srcq+r6 *1+12]
   2085    vinserti32x8        m16, [srcq+ssq*0+ 0], 1
   2086    vinserti32x8        m17, [srcq+ssq*0+12], 1 ; 1 2
   2087    movu               ym18, [srcq+ssq*1+ 0]
   2088    movu               ym19, [srcq+ssq*1+12]
   2089    lea                srcq, [srcq+ssq*2]
   2090    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
   2091    vinserti32x8        m19, [srcq+ssq*0+12], 1 ; 3 4
   2092    pshufb               m2, m20
   2093    mova                 m1, m8
   2094    pshufb               m3, m16, m20
   2095    vpdpwssd             m1, m11, m2    ; a2
   2096    mova                 m2, m8
   2097    pshufb               m4, m17, m21
   2098    vpdpwssd             m2, m9, m3     ; b0  c0
   2099    mova                 m3, m8
   2100    pshufb               m5, m18, m20
   2101    vpdpwssd             m3, m11, m4    ; b2' c2'
   2102    mova                 m4, m8
   2103    pshufb               m7, m19, m21
   2104    vpdpwssd             m4, m9, m5     ; d0  e0
   2105    mova                 m5, m8
   2106    pshufb               m0, m6, m20
   2107    vpdpwssd             m5, m11, m7    ; d2' e2'
   2108    mova                 m7, [spel_shuf16]
   2109    pshufb              m16, m21
   2110    vpdpwssd             m1, m9, m0     ; a0
   2111    pshufb              m17, m20
   2112    vpdpwssd             m2, m10, m16   ; b1  c1
   2113    pshufb              m18, m21
   2114    vpdpwssd             m3, m10, m17   ; b1' c1'
   2115    pshufb              m19, m20
   2116    vpdpwssd             m4, m10, m18   ; d1  e1
   2117    pshufb               m6, m21
   2118    vpdpwssd             m5, m10, m19   ; d1' e1'
   2119    shufpd              m16, m17, 0x55
   2120    vpdpwssd             m1, m10, m6    ; a1
   2121    shufpd              m18, m19, 0x55
   2122    vpdpwssd             m2, m11, m16   ; b2  c2
   2123    vpdpwssd             m3, m9, m16    ; b0' c0'
   2124    vpdpwssd             m4, m11, m18   ; d2  e2
   2125    vpdpwssd             m5, m9, m18    ; d0' e0'
   2126    pslldq               m1, 1
   2127    vpermt2b             m2, m7, m3     ; 12
   2128    vpermt2b             m4, m7, m5     ; 34
   2129    vpshrdd              m1, m2, 16     ; 01
   2130    vpshrdd              m3, m2, m4, 16 ; 23
   2131 .hv_w16_loop:
   2132    movu               ym18, [srcq+ssq*1+ 0]
   2133    movu               ym19, [srcq+ssq*1+12]
   2134    lea                srcq, [srcq+ssq*2]
   2135    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
   2136    vinserti32x8        m19, [srcq+ssq*0+12], 1
   2137    mova                 m5, m8
   2138    mova                 m6, m8
   2139    pshufb              m17, m18, m20
   2140    vpdpwssd             m5, m9, m17    ; f0  g0
   2141    pshufb              m16, m19, m21
   2142    vpdpwssd             m6, m11, m16   ; f2' g2'
   2143    pmaddwd             m17, m12, m2    ; B0
   2144    mova                 m2, m4
   2145    pmaddwd             m16, m12, m1    ; A0
   2146    mova                 m1, m3
   2147    pshufb              m18, m21
   2148    vpdpwssd             m5, m10, m18   ; f1  g1
   2149    pshufb              m19, m20
   2150    vpdpwssd             m6, m10, m19   ; f1' g1'
   2151    vpdpwssd            m17, m13, m4    ; B1
   2152    vpdpwssd            m16, m13, m3    ; A1
   2153    shufpd              m18, m19, 0x55
   2154    vpdpwssd             m5, m11, m18   ; f2  g2
   2155    vpdpwssd             m6, m9, m18    ; f0' g0'
   2156    mova                 m4, m7
   2157    vpermi2b             m4, m5, m6     ; 56
   2158    vpshrdd              m3, m2, m4, 16 ; 45
   2159    vpdpwssd            m17, m14, m4    ; B2
   2160    vpdpwssd            m16, m14, m3    ; A2
   2161    psrad               m16, 10
   2162    psrad               m17, 10
   2163    vshufi32x4          m18, m16, m17, q3232
   2164    vinserti32x8        m16, ym17, 1
   2165    packusdw            m16, m18
   2166    pminsw              m16, m15
   2167    mova          [dstq+dsq*0], ym16
   2168    vextracti32x8 [dstq+dsq*1], m16, 1
   2169    lea                dstq, [dstq+dsq*2]
   2170    sub                  hd, 2
   2171    jg .hv_w16_loop
   2172    vzeroupper
   2173    RET
; .hv_w32: 2-D (horizontal+vertical) subpel filtering, widths >= 32.
; Three horizontal tap-pairs (m9/m10/m11) and three vertical tap-pairs
; (m12/m13/m14) are applied, i.e. 6 taps in each direction. Each row is
; filtered as two overlapping 64-byte loads (+0 / +12); the "primed"
; accumulators (a0', b1', ...) carry the upper half of the 32 output
; pixels. Two destination rows are written per inner-loop iteration.
; wd is a packed counter: low byte = height, upper bits = remaining
; width (see lea wd,[hq+wq*8-256] / movzx hd,wb / sub wd,1<<8); each
; outer iteration advances src/dst by 64 bytes (32 pixels).
.hv_w32:
    WIN64_SPILL_XMM      28
    mova                m27, [spel_shuf32]
    lea                  wd, [hq+wq*8-256]
.hv_w32_loop0:
    ; Prologue: horizontally filter the 7-row startup window
    ; (rows 0..6, addressed srcq+r6*2 .. r7+ssq*0) and interleave the
    ; results into the row-pair ring 01/12/23/34 (+ primed halves).
    movu                m16, [srcq+r6 *2+ 0]
    movu                 m7, [srcq+r6 *2+12]
    movu                 m6, [srcq+r6 *1+ 0]
    movu                m18, [srcq+r6 *1+12]
    lea                  r7, [srcq+ssq*2]
    movu                m17, [srcq+ssq*0+ 0]
    movu                m19, [srcq+ssq*0+12]
    movu                m22, [srcq+ssq*1+ 0]
    movu                m24, [srcq+ssq*1+12]
    mov                  r8, dstq
    movu                m23, [r7  +ssq*0+ 0]
    movu                m25, [r7  +ssq*0+12]
    pshufb               m1, m16, m20
    mova                 m0, m8
    pshufb               m2, m7, m21
    vpdpwssd             m0, m9, m1     ; a0
    mova                 m1, m8
    pshufb               m4, m6, m20
    vpdpwssd             m1, m11, m2    ; a2'
    mova                 m2, m8
    pshufb               m3, m17, m20
    vpdpwssd             m2, m9, m4     ; b0
    mova                 m4, m8
    pshufb               m5, m18, m21
    vpdpwssd             m4, m9, m3     ; c0
    mova                 m3, m8
    pshufb              m26, m19, m21
    vpdpwssd             m3, m11, m5    ; b2'
    mova                 m5, m8
    pshufb              m16, m21
    vpdpwssd             m5, m11, m26   ; c2'
    pshufb               m7, m20
    vpdpwssd             m0, m10, m16   ; a1
    pshufb               m6, m21
    vpdpwssd             m1, m10, m7    ; a1'
    pshufb              m17, m21
    vpdpwssd             m2, m10, m6    ; b1
    pshufb              m18, m20
    vpdpwssd             m4, m10, m17   ; c1
    pshufb              m19, m20
    vpdpwssd             m3, m10, m18   ; b1'
    ; shufpd 0x55 recombines the two overlapping loads so each source
    ; vector can feed both the low (a2) and high (a0') accumulators.
    shufpd              m16, m7, 0x55
    vpdpwssd             m5, m10, m19   ; c1'
    shufpd               m6, m18, 0x55
    vpdpwssd             m0, m11, m16   ; a2
    shufpd              m17, m19, 0x55
    vpdpwssd             m1, m9, m16    ; a0'
    pshufb              m16, m22, m20
    vpdpwssd             m2, m11, m6    ; b2
    pshufb               m7, m23, m20
    vpdpwssd             m4, m11, m17   ; c2
    vpdpwssd             m3, m9, m6     ; b0'
    mova                 m6, m8
    vpdpwssd             m5, m9, m17    ; c0'
    pshufb              m17, m24, m21
    vpdpwssd             m6, m9, m16    ; d0
    mova                m16, m8
    pshufb              m26, m25, m21
    vpdpwssd            m16, m9, m7     ; e0
    mova                 m7, m8
    pshufb              m22, m21
    vpdpwssd             m7, m11, m17   ; d2'
    mova                m17, m8
    pshufb              m23, m21
    vpdpwssd            m17, m11, m26   ; e2'
    pshufb              m24, m20
    vpdpwssd             m6, m10, m22   ; d1
    pshufb              m25, m20
    vpdpwssd            m16, m10, m23   ; e1
    shufpd              m22, m24, 0x55
    vpdpwssd             m7, m10, m24   ; d1'
    shufpd              m23, m25, 0x55
    vpdpwssd            m17, m10, m25   ; e1'
    pslldq               m0, 1
    vpdpwssd             m6, m11, m22   ; d2
    pslldq               m1, 1
    vpdpwssd            m16, m11, m23   ; e2
    ; vpermt2b (spel_shuf32) merges consecutive filtered rows into the
    ; word-interleaved row-pairs consumed by the vertical dot products.
    vpermt2b             m2, m27, m4    ; 12
    vpdpwssd             m7, m9, m22    ; d0'
    vpermt2b             m3, m27, m5    ; 12'
    vpdpwssd            m17, m9, m23    ; e0'
    vpshrdd              m0, m2, 16     ; 01
    vpermt2b             m6, m27, m16   ; 34
    vpshrdd              m1, m3, 16     ; 01'
    vpermt2b             m7, m27, m17   ; 34'
    vpshrdd              m4, m2, m6, 16 ; 23
    vpshrdd              m5, m3, m7, 16 ; 23'
.hv_w32_loop:
    ; Steady state: fetch two new rows (f,g), horizontally filter them,
    ; rotate the 01/23/45-pair ring, and vertically filter two outputs
    ; (A = even row, B = odd row; primed = upper 16 pixels).
    movu                m22, [r7+ssq*1+ 0]
    movu                m24, [r7+ssq*1+12]
    lea                  r7, [r7+ssq*2]
    movu                m23, [r7+ssq*0+ 0]
    movu                m25, [r7+ssq*0+12]
    pmaddwd             m17, m12, m2    ; B0
    mova                 m2, m6
    pmaddwd             m19, m12, m3    ; B0'
    mova                 m3, m7
    pmaddwd             m16, m12, m0    ; A0
    mova                 m0, m4
    pmaddwd             m18, m12, m1    ; A0'
    mova                 m1, m5
    vpdpwssd            m17, m13, m6    ; B1
    vpdpwssd            m19, m13, m7    ; B1'
    mova                 m6, m8
    vpdpwssd            m16, m13, m4    ; A1
    pshufb               m4, m22, m20
    vpdpwssd            m18, m13, m5    ; A1'
    pshufb               m7, m23, m20
    vpdpwssd             m6, m9, m4     ; f0
    mova                 m4, m8
    pshufb               m5, m24, m21
    vpdpwssd             m4, m9, m7     ; g0
    mova                 m7, m8
    pshufb              m26, m25, m21
    vpdpwssd             m7, m11, m5    ; f2'
    mova                 m5, m8
    pshufb              m22, m21
    vpdpwssd             m5, m11, m26   ; g2'
    pshufb              m23, m21
    vpdpwssd             m6, m10, m22   ; f1
    pshufb              m24, m20
    vpdpwssd             m4, m10, m23   ; g1
    pshufb              m25, m20
    vpdpwssd             m7, m10, m24   ; f1'
    shufpd              m22, m24, 0x55
    vpdpwssd             m5, m10, m25   ; g1'
    shufpd              m23, m25, 0x55
    vpdpwssd             m6, m11, m22   ; f2
    vpdpwssd             m4, m11, m23   ; g2
    vpdpwssd             m7, m9, m22    ; f0'
    vpdpwssd             m5, m9, m23    ; g0'
    vpermt2b             m6, m27, m4    ; 56
    vpermt2b             m7, m27, m5    ; 56'
    vpdpwssd            m17, m14, m6    ; B2
    vpshrdd              m4, m2, m6, 16 ; 45
    vpdpwssd            m19, m14, m7    ; B2'
    vpshrdd              m5, m3, m7, 16 ; 45'
    vpdpwssd            m16, m14, m4    ; A2
    vpdpwssd            m18, m14, m5    ; A2'
    ; Descale the combined H+V accumulation, pack to words, and clamp
    ; to the pixel maximum held in m15.
    REPX      {psrad x, 10}, m17, m19, m16, m18
    packusdw            m17, m19
    packusdw            m16, m18
    pminsw              m17, m15
    pminsw              m16, m15
    mova         [r8+dsq*0], m16
    mova         [r8+dsq*1], m17
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .hv_w32_loop
    add                srcq, 64
    add                dstq, 64
    movzx                hd, wb      ; restore height from packed counter
    sub                  wd, 1<<8    ; one fewer 32-pixel column remaining
    jg .hv_w32_loop0
    RET
   2334 
; Filter-type entry stubs for put_8tap (see the PUT_8TAP_FN macro
; defined earlier in this file). Each stub loads the horizontal and
; vertical filter-type constants into t0d/t1d and, when the fourth
; argument is present, tail-jumps into the shared implementation.
; NOTE: the final "sharp" invocation deliberately omits the jump
; target — it falls straight through into put_8tap_16bpc below.
PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
PUT_8TAP_FN sharp,          SHARP,   SHARP
   2340 
; put_8tap_16bpc(dst, ds, src, ss, w, h, mx, my, ...)
; 16-bpc 8-tap subpel filter dispatcher. t0d/t1d carry the filter
; types installed by the PUT_8TAP_FN stubs above. Multiplying mx/my
; by 0x010101 replicates the subpel index into three bytes so a
; single add of the type constant yields packed (8tap, mx, 4tap)
; selectors (see the inline comments). r8m is broadcast for the
; final pminsw clamp — presumably the bitdepth max pixel value.
cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    lea                  r8, [put_avx512icl]
    movifnidn            wd, wm
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put
.v:
    ; Vertical-only path. For h < 6 select the alternate (4tap_v)
    ; filter index from the low byte of the packed selector.
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    vpbroadcastd        m10, [pd_32]   ; rounding bias for the >>6 descale
    pmovsxbw           xmm0, [base+subpel_filters+myq*8]
    tzcnt               r7d, wd
    vpbroadcastw        m11, r8m       ; pixel clamp value
    lea                  r6, [ssq*3]
    movzx               r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
    sub                srcq, r6        ; rewind 3 rows for the filter window
    ; Spill the 8 filter taps and broadcast them as 4 tap-pairs into
    ; m12..m15 for the vpdpwssd word-pair dot products.
    mova [rsp+stack_offset+8], xmm0
    vpbroadcastd        m12, xmm0
    add                  r7, r8
    vpbroadcastd        m13, [rsp+stack_offset+12]
    vpbroadcastd        m14, [rsp+stack_offset+16]
    vpbroadcastd        m15, [rsp+stack_offset+20]
    jmp                  r7             ; width-indexed jump (.v_w2 .. .v_w128)
; .v_w2: vertical 8-tap, width 2, two rows per iteration (XMM only).
; Rows 0-6 are gathered as dwords (one row = one 2-pixel dword) and
; word-interleaved into row-pairs 01/12, 23/34, 45/56; the loop adds
; 67/78 each iteration and rotates the pair ring.
.v_w2:
    movd               xmm2, [srcq+ssq*0]
    pinsrd             xmm2, [srcq+ssq*1], 1
    pinsrd             xmm2, [srcq+ssq*2], 2
    add                srcq, r6
    pinsrd             xmm2, [srcq+ssq*0], 3  ; 0 1 2 3
    movd               xmm3, [srcq+ssq*1]
    vpbroadcastd       xmm1, [srcq+ssq*2]
    add                srcq, r6
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm3, xmm1, 0x02       ; 4 5
    vpblendd           xmm1, xmm0, 0x02       ; 5 6
    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
    punpcklwd          xmm3, xmm1             ; 45 56
    punpcklwd          xmm1, xmm2, xmm4       ; 01 12
    punpckhwd          xmm2, xmm4             ; 23 34
.v_w2_loop:
    vpbroadcastd       xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova               xmm5, xm10             ; start from rounding bias
    vpdpwssd           xmm5, xm12, xmm1       ; a0 b0
    mova               xmm1, xmm2
    vpdpwssd           xmm5, xm13, xmm2       ; a1 b1
    mova               xmm2, xmm3
    vpdpwssd           xmm5, xm14, xmm3       ; a2 b2
    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm4, xmm0, 0x02       ; 7 8
    punpcklwd          xmm3, xmm4             ; 67 78
    vpdpwssd           xmm5, xm15, xmm3       ; a3 b3
    psrad              xmm5, 6                ; descale
    packusdw           xmm5, xmm5
    pminsw             xmm5, xm11             ; clamp to pixel max
    movd       [dstq+dsq*0], xmm5
    pextrd     [dstq+dsq*1], xmm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
; .v_w4: vertical 8-tap, width 4, two rows per iteration (YMM).
; Rows are broadcast as qwords, blended into adjacent-row pairs and
; word-interleaved (01/12, 23/34, 45/56); the loop appends 67/78.
; Uses legacy ymm names, hence no vzeroupper until after the loop.
.v_w4:
    movq               xmm1, [srcq+ssq*0]
    vpbroadcastq       ymm0, [srcq+ssq*1]
    vpbroadcastq       ymm2, [srcq+ssq*2]
    add                srcq, r6
    vpbroadcastq       ymm4, [srcq+ssq*0]
    vpbroadcastq       ymm3, [srcq+ssq*1]
    vpbroadcastq       ymm5, [srcq+ssq*2]
    add                srcq, r6
    vpblendd           ymm1, ymm0, 0x30
    vpblendd           ymm0, ymm2, 0x30
    punpcklwd          ymm1, ymm0       ; 01 12
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm2, ymm4, 0x30
    vpblendd           ymm4, ymm3, 0x30
    punpcklwd          ymm2, ymm4       ; 23 34
    vpblendd           ymm3, ymm5, 0x30
    vpblendd           ymm5, ymm0, 0x30
    punpcklwd          ymm3, ymm5       ; 45 56
.v_w4_loop:
    vpbroadcastq       ymm5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova               ymm4, ym10       ; rounding bias
    vpdpwssd           ymm4, ym12, ymm1 ; a0 b0
    mova               ymm1, ymm2
    vpdpwssd           ymm4, ym13, ymm2 ; a1 b1
    mova               ymm2, ymm3
    vpdpwssd           ymm4, ym14, ymm3 ; a2 b2
    vpblendd           ymm3, ymm0, ymm5, 0x30
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm5, ymm0, 0x30
    punpcklwd          ymm3, ymm5       ; 67 78
    vpdpwssd           ymm4, ym15, ymm3 ; a3 b3
    psrad              ymm4, 6
    vextracti128       xmm5, ymm4, 1
    packusdw           xmm4, xmm5
    pminsw             xmm4, xm11       ; clamp to pixel max
    movq       [dstq+dsq*0], xmm4
    movhps     [dstq+dsq*1], xmm4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    vzeroupper
    RET
; .v_w8: vertical 8-tap, width 8, two rows per iteration (ZMM).
; Three rows are stacked per 512-bit register and vpermb with
; spel_v_shuf8 interleaves them into the two row-pairs each dot
; product needs (01|12, 23|34, 45|56 -> 67|78 in the loop).
.v_w8:
    vbroadcasti32x4      m2, [srcq+ssq*2]
    vinserti32x4         m1, m2, [srcq+ssq*0], 0
    vinserti32x4         m1, [srcq+ssq*1], 1 ; 0 1 2
    add                srcq, r6
    vinserti32x4        ym2, [srcq+ssq*0], 1
    vinserti32x4         m2, [srcq+ssq*1], 2 ; 2 3 4
    mova                 m6, [spel_v_shuf8]
    movu                xm0, [srcq+ssq*1]
    vinserti32x4        ym0, [srcq+ssq*2], 1
    add                srcq, r6
    vinserti32x4         m0, [srcq+ssq*0], 2 ; 4 5 6
    vpermb               m1, m6, m1          ; 01 12
    vpermb               m2, m6, m2          ; 23 34
    vpermb               m3, m6, m0          ; 45 56
.v_w8_loop:
    vinserti32x4         m0, [srcq+ssq*1], 3
    lea                srcq, [srcq+ssq*2]
    movu                xm5, [srcq+ssq*0]
    mova                 m4, m10             ; rounding bias
    vpdpwssd             m4, m12, m1         ; a0 b0
    mova                 m1, m2
    vshufi32x4           m0, m5, q1032       ; 6 7 8
    vpdpwssd             m4, m13, m2         ; a1 b1
    mova                 m2, m3
    vpdpwssd             m4, m14, m3         ; a2 b2
    vpermb               m3, m6, m0          ; 67 78
    vpdpwssd             m4, m15, m3         ; a3 b3
    psrad                m4, 6
    vextracti32x8       ym5, m4, 1
    packusdw            ym4, ym5
    pminsw              ym4, ym11            ; clamp to pixel max
    mova          [dstq+dsq*0], xm4
    vextracti32x4 [dstq+dsq*1], ym4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
; .v_w16: vertical 8-tap, width 16, two rows per iteration (ZMM).
; Row-pairs are built with vpermb (spel_v_shuf16); the odd-offset
; pairs (23, 45, 67) are derived for free from adjacent even pairs
; via vpshrdd by 16 bits. Separate a/b accumulators produce the two
; output rows, reordered for store by vpermq with deint_q_shuf.
.v_w16:
    vbroadcasti32x8      m0, [srcq+ssq*1]
    vinserti32x8         m1, m0, [srcq+ssq*2], 1
    vinserti32x8         m0, [srcq+ssq*0], 0
    mova                 m8, [spel_v_shuf16]
    add                srcq, r6
    movu                ym3, [srcq+ssq*0]
    vinserti32x8         m3, [srcq+ssq*1], 1
    movu                ym5, [srcq+ssq*2]
    add                srcq, r6
    vinserti32x8         m5, [srcq+ssq*0], 1
    vpermb               m1, m8, m1     ; 12
    vpermb               m0, m8, m0     ; 01
    vpermb               m3, m8, m3     ; 34
    vpermb               m5, m8, m5     ; 56
    mova                 m9, [deint_q_shuf]
    vpshrdd              m2, m1, m3, 16 ; 23
    vpshrdd              m4, m3, m5, 16 ; 45
.v_w16_loop:
    mova                 m7, m10        ; rounding bias
    vpdpwssd             m7, m12, m1    ; b0
    mova                 m6, m10
    vpdpwssd             m6, m12, m0    ; a0
    mova                 m1, m3
    vpdpwssd             m7, m13, m3    ; b1
    mova                 m0, m2
    vpdpwssd             m6, m13, m2    ; a1
    mova                 m3, m5
    vpdpwssd             m7, m14, m5    ; b2
    mova                 m2, m4
    vpdpwssd             m6, m14, m4    ; a2
    movu                ym5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m5, [srcq+ssq*0], 1
    vpermb               m5, m8, m5     ; 78
    vpshrdd              m4, m3, m5, 16 ; 67
    vpdpwssd             m7, m15, m5    ; b3
    vpdpwssd             m6, m15, m4    ; a3
    psrad                m7, 6
    psrad                m6, 6
    packusdw             m6, m7
    pminsw               m6, m11        ; clamp to pixel max
    vpermq               m6, m9, m6     ; deinterleave for linear stores
    mova          [dstq+dsq*0], ym6
    vextracti32x8 [dstq+dsq*1], m6, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    RET
; .v_w32/.v_w64/.v_w128: vertical 8-tap for widths >= 32, processed
; as 32-pixel columns. Rows are interleaved word-wise into low/high
; halves (NNl/NNh); each inner iteration filters two output rows with
; four accumulators (a/b x low/high). wd is the same packed
; height/columns counter as in .hv_w32 (lea wd,[hq+wq*8-256]).
.v_w32:
.v_w64:
.v_w128:
    WIN64_SPILL_XMM      23
    lea                  wd, [hq+wq*8-256]
.v_w32_loop0:
    ; Load the 7-row startup window and build the 01..56 pair ring.
    movu                m16, [srcq+ssq*0]
    movu                m17, [srcq+ssq*1]
    lea                  r7, [srcq+r6   ]
    movu                m18, [srcq+ssq*2]
    movu                m19, [r7  +ssq*0]
    mov                  r8, dstq
    movu                m20, [r7  +ssq*1]
    movu                m21, [r7  +ssq*2]
    add                  r7, r6
    movu                m22, [r7  +ssq*0]
    punpcklwd            m0, m16, m17 ; 01l
    punpckhwd           m16, m17      ; 01h
    punpcklwd            m1, m17, m18 ; 12l
    punpckhwd           m17, m18      ; 12h
    punpcklwd            m2, m18, m19 ; 23l
    punpckhwd           m18, m19      ; 23h
    punpcklwd            m3, m19, m20 ; 34l
    punpckhwd           m19, m20      ; 34h
    punpcklwd            m4, m20, m21 ; 45l
    punpckhwd           m20, m21      ; 45h
    punpcklwd            m5, m21, m22 ; 56l
    punpckhwd           m21, m22      ; 56h
.v_w32_loop:
    mova                 m6, m10      ; rounding bias
    vpdpwssd             m6, m12, m0  ; a0l
    mova                 m8, m10
    vpdpwssd             m8, m12, m16 ; a0h
    mova                 m7, m10
    vpdpwssd             m7, m12, m1  ; b0l
    mova                 m9, m10
    vpdpwssd             m9, m12, m17 ; b0h
    mova                 m0, m2
    vpdpwssd             m6, m13, m2  ; a1l
    mova                m16, m18
    vpdpwssd             m8, m13, m18 ; a1h
    mova                 m1, m3
    vpdpwssd             m7, m13, m3  ; b1l
    mova                m17, m19
    vpdpwssd             m9, m13, m19 ; b1h
    mova                 m2, m4
    vpdpwssd             m6, m14, m4  ; a2l
    mova                m18, m20
    vpdpwssd             m8, m14, m20 ; a2h
    mova                 m3, m5
    vpdpwssd             m7, m14, m5  ; b2l
    mova                m19, m21
    vpdpwssd             m9, m14, m21 ; b2h
    movu                m21, [r7+ssq*1]
    lea                  r7, [r7+ssq*2]
    punpcklwd            m4, m22, m21 ; 67l
    punpckhwd           m20, m22, m21 ; 67h
    movu                m22, [r7+ssq*0]
    vpdpwssd             m6, m15, m4  ; a3l
    vpdpwssd             m8, m15, m20 ; a3h
    punpcklwd            m5, m21, m22 ; 78l
    punpckhwd           m21, m22      ; 78h
    vpdpwssd             m7, m15, m5  ; b3l
    vpdpwssd             m9, m15, m21 ; b3h
    REPX       {psrad x, 6}, m6, m8, m7, m9
    packusdw             m6, m8
    packusdw             m7, m9
    pminsw               m6, m11      ; clamp to pixel max
    pminsw               m7, m11
    mova         [r8+dsq*0], m6
    mova         [r8+dsq*1], m7
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .v_w32_loop
    add                srcq, 64
    add                dstq, 64
    movzx                hd, wb      ; restore height from packed counter
    sub                  wd, 1<<8    ; one fewer 32-pixel column remaining
    jg .v_w32_loop0
    RET
; .h_w2: horizontal 4-tap, width 2, two rows per iteration.
; Entered from .h_w4 via "jl .h_w2" (flags still live from the
; "cmp wd, 4" in .h). Uses the middle taps of xmm0 (q1111/q2222)
; loaded by .h_w4, with m8 as the rounding bias from .h.
.h_w2:
    RESET_STACK_STATE
    mova                ym2, [spel_h_shuf2a]
    sub                srcq, 2            ; rewind to first filter tap
    pshufd             xmm3, xmm0, q1111
    pshufd             xmm4, xmm0, q2222
.h_w2_loop:
    movu                xm1, [srcq+ssq*0]
    vinserti32x4        ym1, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    mova               xmm0, xm8          ; rounding bias
    vpermb              ym1, ym2, ym1
    vpdpwssd           xmm0, xmm3, xm1
    vextracti32x4       xm1, ym1, 1
    vpdpwssd           xmm0, xmm4, xm1
    psrad              xmm0, 6
    packusdw           xmm0, xmm0
    pminsw             xmm0, xm15         ; clamp to pixel max
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
; .h_w4: horizontal 4-tap, width 4, two rows per iteration (YMM).
; Reached via "jle .h_w4" on "cmp wd, 4" in .h; the movzx/pmovsxbw
; below do not touch flags, so the following "jl .h_w2" still tests
; that same compare (w < 4, i.e. w == 2).
.h_w4:
    movzx               mxd, mxb
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    jl .h_w2                            ; w == 2 (flags from .h's cmp)
    vbroadcasti32x4     ym4, [spel_h_shufA]
    vbroadcasti32x4     ym5, [spel_h_shufB]
    sub                srcq, 2          ; rewind to first filter tap
    pshufd             xmm0, xmm0, q2211
    vpbroadcastq        ym6, xmm0
    vpermq              ym7, ymm0, q1111
.h_w4_loop:
    movu                xm2, [srcq+ssq*0]
    vinserti32x4        ym2, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    mova                ym0, ym8         ; rounding bias
    pshufb              ym1, ym2, ym4
    vpdpwssd            ym0, ym6, ym1
    pshufb              ym2, ym5
    vpdpwssd            ym0, ym7, ym2
    psrad               ym0, 6
    vextracti32x4       xm1, ym0, 1
    packusdw            xm0, xm1
    pminsw             xmm0, xm0, xm15   ; clamp to pixel max
    movq       [dstq+dsq*0], xmm0
    movhps     [dstq+dsq*1], xmm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4_loop
    RET
; .h_w8: horizontal 8-tap, width 8, two rows per iteration (ZMM).
; Four vpermb shuffles (spel_h_shufA..D) produce the four shifted
; pixel-pair views needed by the tap-pairs in m10..m13 (set up in .h).
.h_w8:
    mova                 m4, [spel_h_shufA]
    movu                 m5, [spel_h_shufB]
    movu                 m6, [spel_h_shufC]
    mova                 m7, [spel_h_shufD]
.h_w8_loop:
    movu                ym2, [srcq+ssq*0]
    vinserti32x8         m2, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    mova                 m0, m8          ; rounding bias
    vpermb               m1, m4, m2
    vpdpwssd             m0, m10, m1
    vpermb               m1, m5, m2
    vpdpwssd             m0, m11, m1
    vpermb               m1, m6, m2
    vpdpwssd             m0, m12, m1
    vpermb               m1, m7, m2
    vpdpwssd             m0, m13, m1
    psrad                m0, 6
    vextracti32x8       ym1, m0, 1
    packusdw            ym0, ym1
    pminsw              ym0, ym15        ; clamp to pixel max
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8_loop
    RET
; .h: horizontal-only setup and width dispatch. Selects the rounding
; constant by bitdepth (r8m >> 11 indexes put_8tap_h_rnd), broadcasts
; the 8-tap filter as 4 tap-pairs (m10..m13), then branches by width:
; w <= 4 -> .h_w4 (.h_w2 via live flags), w < 16 -> .h_w8,
; w > 16 -> .h_w32; w == 16 falls through into .h_w16_loop.
.h:
    vpbroadcastw        m15, r8m        ; pixel clamp value
    test                myd, 0xf00
    jnz .hv                             ; vertical subpel too -> 2-D path
    mov                 r7d, r8m
    shr                 r7d, 11         ; bitdepth-dependent rounding index
    vpbroadcastd         m8, [base+put_8tap_h_rnd+r7*4]
    cmp                  wd, 4
    jle .h_w4
    shr                 mxd, 16         ; use the 8-tap filter index
    sub                srcq, 6          ; rewind to first filter tap
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    mova              [buf], xmm0
    vpbroadcastd        m10, xmm0
    vpbroadcastd        m11, [buf+ 4]
    vpbroadcastd        m12, [buf+ 8]
    vpbroadcastd        m13, [buf+12]
    sub                  wd, 16
    jl .h_w8
    vbroadcasti32x4      m6, [spel_h_shufA]
    vbroadcasti32x4      m7, [spel_h_shufB]
    jg .h_w32
.h_w16_loop:
    ; w == 16: two rows per iteration; the +0/+16 loads are merged
    ; with shufpd so the a (low) and b (high) accumulators share the
    ; overlapping middle pixels.
    movu                ym2, [srcq+ssq*0+ 0]
    vinserti32x8         m2, [srcq+ssq*1+ 0], 1
    movu                ym3, [srcq+ssq*0+16]
    vinserti32x8         m3, [srcq+ssq*1+16], 1
    lea                srcq, [srcq+ssq*2]
    mova                 m0, m8
    mova                 m1, m8
    pshufb               m4, m2, m6
    vpdpwssd             m0, m10, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m12, m4 ; b2
    pshufb               m4, m2, m7
    vpdpwssd             m0, m11, m4 ; a1
    pshufb               m4, m3, m7
    vpdpwssd             m1, m13, m4 ; b3
    shufpd               m2, m3, 0x55
    pshufb               m4, m2, m6
    vpdpwssd             m0, m12, m4 ; a2
    vpdpwssd             m1, m10, m4 ; b0
    pshufb               m2, m7
    vpdpwssd             m0, m13, m2 ; a3
    vpdpwssd             m1, m11, m2 ; b1
    psrad                m0, 6
    psrad                m1, 6
    packusdw             m0, m1
    pminsw               m0, m15     ; clamp to pixel max
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16_loop
    RET
; .h_w32: horizontal 8-tap for widths > 16, one row per outer
; iteration, 32 pixels (two ZMM outputs) per inner iteration.
; r6 counts upward from -w to 0 so the loop needs no extra compare.
.h_w32:
    lea                srcq, [srcq+wq*2]
    lea                dstq, [dstq+wq*2]
    neg                  wq
.h_w32_loop0:
    mov                  r6, wq
.h_w32_loop:
    movu                 m2, [srcq+r6*2+ 0]
    movu                 m3, [srcq+r6*2+ 8]
    mova                 m0, m8          ; rounding bias
    mova                 m1, m8
    pshufb               m4, m2, m6
    vpdpwssd             m0, m10, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m10, m4 ; b0
    vpdpwssd             m0, m12, m4 ; a2
    movu                 m4, [srcq+r6*2+16]
    pshufb               m3, m7
    vpdpwssd             m1, m11, m3 ; b1
    vpdpwssd             m0, m13, m3 ; a3
    pshufb               m3, m4, m6
    vpdpwssd             m1, m12, m3 ; b2
    pshufb               m2, m7
    vpdpwssd             m0, m11, m2 ; a1
    pshufb               m4, m7
    vpdpwssd             m1, m13, m4 ; b3
    psrad                m0, 6
    psrad                m1, 6
    packusdw             m0, m1
    pminsw               m0, m15     ; clamp to pixel max
    mova        [dstq+r6*2], m0
    add                  r6, 32
    jl .h_w32_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w32_loop0
    RET
; .hv: 2-D (horizontal+vertical) setup for w <= 4, then the w == 2
; inner loop. Both filters are loaded; precision is balanced by
; bitdepth: 10-bit shifts H taps left by 6 (bias pd_2176), 12-bit
; shifts H by 4 and V by 2 (bias pd_640), so the final descale is a
; uniform >> 10. Horizontal results feed a vertical row-pair ring
; exactly like the pure-vertical paths.
.hv:
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb       ; 4-tap horizontal index
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd       ; h < 6 -> 4-tap vertical
    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
    lea                  r6, [ssq*3]
    sub                srcq, 2         ; rewind 2 cols (h taps)
    sub                srcq, r6        ; and 3 rows (v taps)
    test          dword r8m, 0x800     ; 12-bit?
    jnz .hv_12bit
    vpbroadcastd        m10, [pd_2176]
    psllw              xmm0, 6
    jmp .hv_main
.hv_12bit:
    vpbroadcastd        m10, [pd_640]
    psllw              xmm0, 4
    psllw              xmm1, 2
.hv_main:
    ; Spill both filters and broadcast tap-pairs: m8/m9 = horizontal,
    ; ym11..ym14 = vertical.
    mova           [buf+ 0], xmm0
    mova           [buf+16], xmm1
    vpbroadcastd         m8, [buf+ 4]
    vpbroadcastd         m9, [buf+ 8]
    vpbroadcastd       ym11, xmm1
    vpbroadcastd       ym12, [buf+20]
    vpbroadcastd       ym13, [buf+24]
    vpbroadcastd       ym14, [buf+28]
    movu                xm4, [srcq+ssq*0]
    vinserti32x4        ym4, [srcq+ssq*1], 1
    vinserti32x4         m4, [srcq+ssq*2], 2
    add                srcq, r6
    vinserti32x4         m4, [srcq+ssq*0], 3 ; 0 1 2 3
    movu                xm0, [srcq+ssq*1]
    vinserti32x4        ym0, [srcq+ssq*2], 1
    add                srcq, r6
    vinserti32x4         m0, [srcq+ssq*0], 2 ; 4 5 6
    cmp                  wd, 4
    je .hv_w4
    ; w == 2: horizontally filter the 7 setup rows in one go, then
    ; permute into vertical row-pairs 01/12, 23/34, 45/56.
    vbroadcasti32x4      m2, [spel_h_shufA]
    mova                 m3, [spel_h_shuf2b]
    mova                ym6, [spel_h_shuf2a]
    mova                xm7, [spel_shuf2]
    mova                 m1, m10
    pshufb               m4, m2
    pshufb               m0, m2
    punpcklqdq           m2, m4, m0
    vpdpwssd             m1, m8, m2    ; 04 15 26 3_
    punpckhqdq           m4, m0
    vpdpwssd             m1, m9, m4
    vpermb               m1, m3, m1    ; 01 12
    vextracti32x4       xm2, ym1, 1    ; 23 34
    vextracti32x4       xm3, m1, 2     ; 45 56
.hv_w2_loop:
    movu                xm5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym5, [srcq+ssq*0], 1
    mova                xm4, xm10       ; rounding bias
    vpermb              ym5, ym6, ym5
    pmaddwd            xmm0, xm11, xm1 ; a0 b0
    vpdpwssd            xm4, xm8, xm5
    vextracti32x4       xm5, ym5, 1
    mova                xm1, xm2
    vpdpwssd           xmm0, xm12, xm2 ; a1 b1
    vpdpwssd            xm4, xm9, xm5  ; 7 8
    mova                xm2, xm3
    vpdpwssd           xmm0, xm13, xm3 ; a2 b2
    vpermt2b            xm3, xm7, xm4  ; 67 78
    vpdpwssd           xmm0, xm14, xm3 ; a3 b3
    psrad              xmm0, 10        ; combined H+V descale
    packusdw           xmm0, xmm0
    pminsw             xmm0, xm15      ; clamp to pixel max
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
; .hv_w4: 2-D filtering, width 4, two rows per iteration. The setup
; rows in m4 (0-3) / m0 (4-6) come from .hv_main. Horizontal results
; are permuted (spel_shuf4a) into vertical row-pairs; the loop filters
; two new rows, merges them as 67/78 via vpermt2b (spel_shuf4b), and
; rotates the 01/23/45 ring.
.hv_w4:
    vbroadcasti32x4     m19, [spel_h_shufA]
    vbroadcasti32x4     m20, [spel_h_shufB]
    mova                ym6, [spel_shuf4a]
    mova                ym7, [spel_shuf4b]
    mova                 m2, m10        ; rounding bias
    mova                 m3, m10
    pshufb               m1, m4, m19
    vpdpwssd             m2, m8, m1
    pshufb               m1, m0, m19
    vpdpwssd             m3, m8, m1
    pshufb               m4, m20
    vpdpwssd             m2, m9, m4
    pshufb               m0, m20
    vpdpwssd             m3, m9, m0
    vpermb               m1, m6, m2    ; 01 12
    vshufi32x4           m2, m3, q1032
    vpermb               m3, m6, m3    ; 45 56
    vpermb               m2, m6, m2    ; 23 34
.hv_w4_loop:
    movu               xm18, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti128        ym18, [srcq+ssq*0], 1
    pmaddwd            ym16, ym11, ym1 ; a0 b0
    mova                ym1, ym2
    mova                ym2, ym3
    pshufb             ym17, ym18, ym19
    mova                ym3, ym10
    vpdpwssd            ym3, ym8, ym17
    pshufb             ym18, ym20
    vpdpwssd           ym16, ym12, ym1 ; a1 b1
    vpdpwssd            ym3, ym9, ym18 ; 7 8
    vpdpwssd           ym16, ym13, ym2 ; a2 b2
    vpermt2b            ym3, ym7, ym2  ; 67 78
    vpdpwssd           ym16, ym14, ym3 ; a3 b3
    psrad              ym16, 10        ; combined H+V descale
    vextracti128       xm17, ym16, 1
    packusdw           xm16, xm17
    pminsw             xm16, xm15      ; clamp to pixel max
    movq       [dstq+dsq*0], xm16
    movhps     [dstq+dsq*1], xm16
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    vzeroupper
    RET
; 2-D filter path for width >= 8: reload full 8-tap filters for both
; dimensions (the narrow paths above used reduced filters), then branch
; on bitdepth and width. mx/my high bits index subpel_filters; for short
; blocks (h < 6) the vertical filter falls back to the 4-tap variant.
.hv_w8:
   2924    shr                 mxd, 16
   2925    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
   2926    movzx               mxd, myb
   2927    shr                 myd, 16
   2928    cmp                  hd, 6
   2929    cmovs               myd, mxd
   2930    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
   2931    lea                  r6, [ssq*3]
   2932    sub                srcq, 6
   2933    sub                srcq, r6
; r8m (bitdepth_max) bit 11 distinguishes 12-bit from 10-bit content;
; it selects the rounding constant (m10) and the coefficient pre-shift.
   2934    test          dword r8m, 0x800
   2935    jnz .hv_w8_12bit
   2936    vpbroadcastd        m10, [pd_2176]
   2937    psllw              xmm0, 6
   2938    jmp .hv_w8_main
.hv_w8_12bit:
   2940    vpbroadcastd        m10, [pd_640]
   2941    psllw              xmm0, 4
   2942    psllw              xmm1, 2
.hv_w8_main:
; Spill both filters to the scratch buffer, then broadcast each of the
; four horizontal (m11-m14) and four vertical (m16-m19) tap pairs.
   2944    mova           [buf+ 0], xmm0
   2945    mova           [buf+16], xmm1
   2946    vpbroadcastd        m11, xmm0
   2947    vpbroadcastd        m12, [buf+ 4]
   2948    vpbroadcastd        m13, [buf+ 8]
   2949    vpbroadcastd        m14, [buf+12]
   2950    vpbroadcastd        m16, xmm1
   2951    vpbroadcastd        m17, [buf+20]
   2952    vpbroadcastd        m18, [buf+24]
   2953    vpbroadcastd        m19, [buf+28]
   2954    cmp                  wd, 8
   2955    jg .hv_w16
; Width 8: load the 7 setup rows (two rows per zmm) and run the
; horizontal 8-tap filter on rows a..g via the four spel_h_shuf patterns.
   2956    mova                 m5, [spel_h_shufA]
   2957    movu                ym0, [srcq+ssq*0]
   2958    vinserti32x8         m0, [srcq+ssq*1], 1 ; 0 1
   2959    movu                ym9, [srcq+ssq*2]
   2960    add                srcq, r6
   2961    vinserti32x8         m9, [srcq+ssq*0], 1 ; 2 3
   2962    movu               ym20, [srcq+ssq*1]
   2963    vinserti32x8        m20, [srcq+ssq*2], 1 ; 4 5
   2964    add srcq, r6
   2965    movu               ym21, [srcq+ssq*0]    ; 6
   2966    movu                 m6, [spel_h_shufB]
   2967    movu                 m7, [spel_h_shufC]
   2968    vpermb               m8, m5, m0
   2969    mova                 m1, m10
   2970    vpdpwssd             m1, m11, m8  ; a0 b0
   2971    vpermb               m8, m5, m9
   2972    mova                 m2, m10
   2973    vpdpwssd             m2, m11, m8  ; c0 d0
   2974    vpermb               m8, m5, m20
   2975    mova                 m3, m10
   2976    vpdpwssd             m3, m11, m8  ; e0 f0
   2977    vpermb               m8, m5, m21
   2978    mova                 m4, m10
   2979    vpdpwssd             m4, m11, m8  ; g0
   2980    vpermb               m8, m6, m0
   2981    vpdpwssd             m1, m12, m8  ; a1 b1
   2982    vpermb               m8, m6, m9
   2983    vpdpwssd             m2, m12, m8  ; c1 d1
   2984    vpermb               m8, m6, m20
   2985    vpdpwssd             m3, m12, m8  ; e1 f1
   2986    vpermb               m8, m6, m21
   2987    vpdpwssd             m4, m12, m8  ; g1
   2988    vpermb               m8, m7, m0
   2989    vpdpwssd             m1, m13, m8  ; a2 b2
   2990    vpermb               m8, m7, m9
   2991    vpdpwssd             m2, m13, m8  ; c2 d2
   2992    vpermb               m8, m7, m20
   2993    vpdpwssd             m3, m13, m8  ; e2 f2
   2994    vpermb               m8, m7, m21
   2995    vpdpwssd             m4, m13, m8  ; g2
   2996    mova                 m8, [spel_h_shufD]
   2997    vpermb               m0, m8, m0
   2998    vpdpwssd             m1, m14, m0  ; a3 b3
   2999    mova                 m0, [spel_shuf8a]
   3000    vpermb               m9, m8, m9
   3001    vpdpwssd             m2, m14, m9  ; c3 d3
   3002    mova                 m9, [spel_shuf8b]
   3003    vpermb              m20, m8, m20
   3004    vpdpwssd             m3, m14, m20 ; e3 f3
   3005    vpermb              m21, m8, m21
   3006    vpdpwssd             m4, m14, m21 ; g3
; Interleave the filtered rows into overlapping row-pair operands.
   3007    vpermt2b             m1, m0, m2   ; 01 12
   3008    vpermt2b             m2, m0, m3   ; 23 34
   3009    vpermt2b             m3, m0, m4   ; 45 56
.hv_w8_loop:
; Per iteration: horizontal-filter two new rows (h, i) into m4, shift the
; vertical window (m1..m3) and apply tap pairs m16-m19; store 2 rows.
   3011    movu                ym0, [srcq+ssq*1]
   3012    lea                srcq, [srcq+ssq*2]
   3013    vinserti32x8         m0, [srcq+ssq*0], 1
   3014    mova                 m4, m10
   3015    vpermb              m21, m5, m0
   3016    vpdpwssd             m4, m11, m21 ; h0 i0
   3017    vpermb              m21, m6, m0
   3018    pmaddwd             m20, m16, m1  ; A0 B0
   3019    vpdpwssd             m4, m12, m21 ; h1 i1
   3020    vpermb              m21, m7, m0
   3021    mova                 m1, m2
   3022    vpdpwssd            m20, m17, m2  ; A1 B1
   3023    vpdpwssd             m4, m13, m21 ; h2 i2
   3024    vpermb              m21, m8, m0
   3025    mova                 m2, m3
   3026    vpdpwssd            m20, m18, m3  ; A2 B2
   3027    vpdpwssd             m4, m14, m21 ; h3 i3
   3028    vpermt2b             m3, m9, m4   ; 67 78
   3029    vpdpwssd            m20, m19, m3  ; A3 B3
; Descale, pack, clamp to pixel max (m15), store two 8-pixel rows.
   3030    psrad               m20, 10
   3031    vextracti32x8      ym21, m20, 1
   3032    packusdw           ym20, ym21
   3033    pminsw             ym20, ym15
   3034    mova         [dstq+dsq*0], xm20
   3035    vextracti128 [dstq+dsq*1], ym20, 1
   3036    lea                dstq, [dstq+dsq*2]
   3037    sub                  hd, 2
   3038    jg .hv_w8_loop
   3039    vzeroupper
   3040    RET
; 2-D filter path for width >= 16, processed in 16-pixel (32-byte) column
; strips. Width and height are packed into one register
; (wd = h + (2*w)*8 - 256) so the column loop can restore h with movzx
; and count strips in the high byte. r7/r8 walk src/dst within a strip.
.hv_w16:
   3042    WIN64_SPILL_XMM 26
   3043    vbroadcasti32x4     m20, [spel_h_shufA]
   3044    vbroadcasti32x4     m21, [spel_h_shufB]
   3045    add                  wd, wd
   3046    mova                 m9, [spel_shuf16]
   3047    lea                  wd, [hq+wq*8-256]
.hv_w16_loop0:
; Load the 7 setup rows (0-6); each row needs three overlapping 32-byte
; loads (+0/+8/+16) so the 8-tap horizontal window is fully covered.
   3049    vbroadcasti32x8      m5, [srcq+ssq*0+ 8]
   3050    vinserti32x8         m4, m5, [srcq+ssq*0+ 0], 0
   3051    vinserti32x8         m5, [srcq+ssq*0+16], 1 ; 0
   3052    movu                ym6, [srcq+ssq*1+ 0]
   3053    movu                ym7, [srcq+ssq*1+16]
   3054    lea                  r7, [srcq+r6]
   3055    vinserti32x8         m6, [srcq+ssq*2+ 0], 1
   3056    vinserti32x8         m7, [srcq+ssq*2+16], 1 ; 1 2
   3057    movu               ym22, [r7  +ssq*0+ 0]
   3058    movu               ym23, [r7  +ssq*0+16]
   3059    mov                  r8, dstq
   3060    vinserti32x8        m22, [r7  +ssq*1+ 0], 1
   3061    vinserti32x8        m23, [r7  +ssq*1+16], 1 ; 3 4
   3062    movu               ym24, [r7  +ssq*2+ 0]
   3063    movu               ym25, [r7  +ssq*2+16]
   3064    add                  r7, r6
   3065    vinserti32x8        m24, [r7  +ssq*0+ 0], 1
   3066    vinserti32x8        m25, [r7  +ssq*0+16], 1 ; 5 6
; Horizontal 8-tap on rows a..g; the shufpd trick reuses the shifted
; halves so the upper lanes get their taps in swapped (2,3 then 0,1) order.
   3067    pshufb               m0, m4, m20
   3068    mova                 m1, m10
   3069    vpdpwssd             m1, m11, m0    ; a0
   3070    pshufb               m0, m6, m20
   3071    mova                 m2, m10
   3072    vpdpwssd             m2, m11, m0    ; b0
   3073    pshufb               m0, m7, m20
   3074    mova                 m3, m10
   3075    vpdpwssd             m3, m13, m0    ; c2
   3076    pshufb               m0, m4, m21
   3077    vpdpwssd             m1, m12, m0    ; a1
   3078    pshufb               m0, m6, m21
   3079    vpdpwssd             m2, m12, m0    ; b1
   3080    pshufb               m0, m7, m21
   3081    vpdpwssd             m3, m14, m0    ; c3
   3082    pshufb               m0, m5, m20
   3083    vpdpwssd             m1, m13, m0    ; a2
   3084    shufpd               m6, m7, 0x55
   3085    pshufb               m7, m6, m20
   3086    vpdpwssd             m2, m13, m7    ; b2
   3087    vpdpwssd             m3, m11, m7    ; c0
   3088    pshufb               m5, m21
   3089    vpdpwssd             m1, m14, m5    ; a3
   3090    pshufb               m6, m21
   3091    vpdpwssd             m2, m14, m6    ; b3
   3092    vpdpwssd             m3, m12, m6    ; c1
   3093    pshufb               m0, m22, m20
   3094    mova                 m4, m10
   3095    vpdpwssd             m4, m11, m0    ; d0
   3096    pshufb               m0, m23, m20
   3097    mova                 m5, m10
   3098    vpdpwssd             m5, m13, m0    ; e2
   3099    pshufb               m0, m24, m20
   3100    mova                 m6, m10
   3101    vpdpwssd             m6, m11, m0    ; f0
   3102    pshufb               m0, m25, m20
   3103    mova                 m7, m10
   3104    vpdpwssd             m7, m13, m0    ; g2
   3105    pshufb               m0, m22, m21
   3106    vpdpwssd             m4, m12, m0    ; d1
   3107    pshufb               m0, m23, m21
   3108    vpdpwssd             m5, m14, m0    ; e3
   3109    pshufb               m0, m24, m21
   3110    vpdpwssd             m6, m12, m0    ; f1
   3111    pshufb               m0, m25, m21
   3112    vpdpwssd             m7, m14, m0    ; g3
   3113    shufpd              m22, m23, 0x55
   3114    pshufb              m23, m22, m20
   3115    vpdpwssd             m4, m13, m23   ; d2
   3116    vpdpwssd             m5, m11, m23   ; e0
   3117    shufpd              m24, m25, 0x55
   3118    pshufb              m25, m24, m20
   3119    vpdpwssd             m6, m13, m25   ; f2
   3120    vpdpwssd             m7, m11, m25   ; g0
   3121    pshufb              m22, m21
   3122    vpdpwssd             m4, m14, m22   ; d3
   3123    vpdpwssd             m5, m12, m22   ; e1
   3124    pshufb              m24, m21
   3125    vpdpwssd             m6, m14, m24   ; f3
   3126    vpdpwssd             m7, m12, m24   ; g1
; Merge row pairs with spel_shuf16 and build overlapping pairs via
; vpshrdd (concat-shift-right by 16 bits) for the vertical pass.
   3127    pslldq               m1, 1
   3128    vpermt2b             m2, m9, m3     ; 12
   3129    vpermt2b             m4, m9, m5     ; 34
   3130    vpermt2b             m6, m9, m7     ; 56
   3131    vpshrdd              m1, m2, 16     ; 01
   3132    vpshrdd              m3, m2, m4, 16 ; 23
   3133    vpshrdd              m5, m4, m6, 16 ; 45
.hv_w16_loop:
; Per iteration: horizontal-filter two new rows (h, i), slide the
; vertical window, accumulate the two output rows A/B with taps m16-m19.
   3135    movu               ym24, [r7+ssq*1+ 0]
   3136    movu               ym25, [r7+ssq*1+16]
   3137    lea                  r7, [r7+ssq*2]
   3138    vinserti32x8        m24, [r7+ssq*0+ 0], 1
   3139    vinserti32x8        m25, [r7+ssq*0+16], 1
   3140    mova                 m7, m10
   3141    mova                 m8, m10
   3142    pshufb               m0, m24, m20
   3143    vpdpwssd             m7, m11, m0    ; h0
   3144    pshufb               m0, m25, m20
   3145    vpdpwssd             m8, m13, m0    ; i2
   3146    pmaddwd             m22, m16, m1    ; A0
   3147    mova                 m1, m3
   3148    pmaddwd             m23, m16, m2    ; B0
   3149    mova                 m2, m4
   3150    pshufb               m0, m24, m21
   3151    vpdpwssd             m7, m12, m0    ; h1
   3152    pshufb               m0, m25, m21
   3153    vpdpwssd             m8, m14, m0    ; i3
   3154    vpdpwssd            m22, m17, m3    ; A1
   3155    mova                 m3, m5
   3156    vpdpwssd            m23, m17, m4    ; B1
   3157    mova                 m4, m6
   3158    shufpd              m24, m25, 0x55
   3159    pshufb              m25, m24, m20
   3160    vpdpwssd             m7, m13, m25   ; h2
   3161    vpdpwssd             m8, m11, m25   ; i0
   3162    vpdpwssd            m22, m18, m5    ; A2
   3163    vpdpwssd            m23, m18, m6    ; B2
   3164    pshufb              m24, m21
   3165    vpdpwssd             m7, m14, m24   ; h3
   3166    vpdpwssd             m8, m12, m24   ; i1
   3167    vpermt2b             m7, m9, m8     ; 78
   3168    vpshrdd              m5, m6, m7, 16 ; 67
   3169    vpdpwssd            m22, m19, m5    ; A3
   3170    vpdpwssd            m23, m19, m7    ; B3
   3171    mova                 m6, m7
; Descale, pack, clamp (m15), store two 16-pixel rows of this strip.
   3172    psrad               m22, 10
   3173    psrad               m23, 10
   3174    vshufi32x4           m0, m22, m23, q3232
   3175    vinserti32x8        m22, ym23, 1
   3176    packusdw            m22, m0
   3177    pminsw              m22, m15
   3178    mova          [r8+dsq*0], ym22
   3179    vextracti32x8 [r8+dsq*1], m22, 1
   3180    lea                  r8, [r8+dsq*2]
   3181    sub                  hd, 2
   3182    jg .hv_w16_loop
; Advance to the next 16-pixel column strip; restore h from wd's low byte
; and decrement the strip count held in the high byte.
   3183    add                srcq, 32
   3184    add                dstq, 32
   3185    movzx                hd, wb
   3186    sub                  wd, 1<<8
   3187    jg .hv_w16_loop0
   3188    RET
   3189 
; Temp registers t0/t1 map onto the arch-specific argument registers
; used to pass the pre-encoded filter-type constants into the FN entries.
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

; Each PREP_8TAP_FN emits a tiny entry stub that loads the filter-type
; pair into t0/t1 and jumps to the named target. The final `regular`
; entry omits the target so it falls through directly into the
; prep_6tap_16bpc body that follows.
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
PREP_8TAP_FN regular,        REGULAR, REGULAR
; prep_6tap_16bpc(tmp, src, ss, w, h, mx, my):
; Prefilter stage writing intermediate (unclipped) coefficients to tmp.
; mx/my are combined with the filter-type constants in t0/t1 so that the
; subpel filter index and the 6tap/4tap selection live in one register.
; If neither mx nor my has a fractional part (bits 8-11), fall through to
; the plain copy/scale path dispatched through the `prep` jump table.
cglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my
%define base r7-prep_avx512icl
   3204    imul                mxd, mxm, 0x010101
   3205    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
   3206    imul                myd, mym, 0x010101
   3207    add                 myd, t1d ; 6tap_v, my, 4tap_v
   3208    lea                  r7, [prep_avx512icl]
   3209    mov                  wd, wm
   3210    movifnidn            hd, hm
   3211    test                mxd, 0xf00
   3212    jnz .h
   3213    test                myd, 0xf00
   3214    jnz .v
.prep:
; No subpel filtering: scale by prep_mul (selected by bitdepth) and
; bias by pw_8192; jump to the width-specific copy loop from the table.
   3216    tzcnt                wd, wd
   3217    mov                 r5d, r7m ; bitdepth_max
   3218    vpbroadcastd         m5, [pw_8192]
   3219    movzx                wd, word [r7+wq*2+table_offset(prep,)]
   3220    shr                 r5d, 11
   3221    vpbroadcastd         m4, [r7-prep_avx512icl+prep_mul+r5*4]
   3222    add                  wq, r7
   3223    lea                  r6, [ssq*3]
%if WIN64
   3225    pop                  r7
%endif
   3227    jmp                  wq
; Horizontal-only 6-tap path, width 8: four rows per iteration, two rows
; per zmm. Tap pairs are in m12-m14; m10 holds the rounding bias and
; prep_endB permutes/packs the dword accumulators into the tmp layout.
.h_w8:
   3229    mova                 m6, [spel_h_shufA]
   3230    movu                 m7, [spel_h_shufC]
   3231    mova                 m8, [prep_endB]
.h_w8_loop:
   3233    movu                ym4, [srcq+ssq*0]
   3234    vinserti32x8         m4, [srcq+ssq*1], 1
   3235    movu                ym5, [srcq+ssq*2]
   3236    vinserti32x8         m5, [srcq+r6   ], 1
   3237    lea                srcq, [srcq+ssq*4]
   3238    mova                 m0, m10
   3239    mova                 m1, m10
   3240    vpermb               m2, m6, m4
   3241    vpermb               m3, m6, m5
   3242    vpdpwssd             m0, m12, m2 ; a0 b0
   3243    vpdpwssd             m1, m12, m3 ; c0 d0
   3244    vpermb               m4, m7, m4
   3245    vpermb               m5, m7, m5
   3246    vpdpwssd             m0, m14, m4 ; a2 b2
   3247    vpdpwssd             m1, m14, m5 ; c2 d2
; The middle tap's input is formed by blending the A/C shuffle results
; (shufpd 0x55) instead of a third permute.
   3248    shufpd               m2, m4, 0x55
   3249    shufpd               m3, m5, 0x55
   3250    vpdpwssd             m0, m13, m2 ; a1 b1
   3251    vpdpwssd             m1, m13, m3 ; c1 d1
   3252    vpermt2b             m0, m8, m1
   3253    mova             [tmpq], m0
   3254    add                tmpq, 64
   3255    sub                  hd, 4
   3256    jg .h_w8_loop
   3257    RET
; Horizontal-only setup: load the 6-tap filter (subpel_filters+1 skips
; the outermost 8-tap coefficient), scale it by the bitdepth-dependent
; prep_hv_shift, broadcast the three tap pairs into m12-m14, then branch
; by width. Width 4 reuses the 8tap function's .h_w4 via mangled jump.
.h:
   3259    vpbroadcastd        m10, [prep_8tap_rnd]
   3260    test                myd, 0xf00
   3261    jnz .hv
   3262    lea                  r6, [ssq*3]
   3263    cmp                  wd, 4
   3264    je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4
   3265    shr                 mxd, 16
   3266    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
   3267    mov                 r5d, r7m
   3268    sub                srcq, 4
   3269    shr                 r5d, 11
   3270    psllw              xmm0, [base+prep_hv_shift+r5*8]
   3271    mova             [tmpq], xmm0
   3272    vpbroadcastd        m12, xmm0
   3273    vpbroadcastd        m13, [tmpq+ 4]
   3274    vpbroadcastd        m14, [tmpq+ 8]
   3275    cmp                  wd, 16
   3276    jl .h_w8
   3277    vbroadcasti32x4      m5, [spel_h_shufA]
   3278    vbroadcasti32x4      m6, [spel_h_shufB]
   3279    mova                 m7, [prep_endC]
   3280    jg .h_w32
.h_w16_loop:
; Width 16: two rows per iteration; the primed accumulators (a0'/a1'/a2')
; cover the upper 8 pixels, loaded 12 bytes in so the shuffles mirror the
; lower half with the tap order reversed.
   3282    movu                ym2, [srcq+ssq*0+ 0]
   3283    vinserti32x8         m2, [srcq+ssq*1+ 0], 1
   3284    movu                ym3, [srcq+ssq*0+12]
   3285    vinserti32x8         m3, [srcq+ssq*1+12], 1
   3286    lea                srcq, [srcq+ssq*2]
   3287    mova                 m0, m10
   3288    mova                 m1, m10
   3289    pshufb               m4, m2, m5   ; 01
   3290    vpdpwssd             m0, m12, m4  ; a0  b0
   3291    pshufb               m4, m3, m6   ; 89
   3292    vpdpwssd             m1, m14, m4  ; a2' b2'
   3293    pshufb               m2, m6       ; 23
   3294    pshufb               m3, m5       ; 67
   3295    vpdpwssd             m0, m13, m2  ; a1  b1
   3296    vpdpwssd             m1, m13, m3  ; a1' b1'
   3297    shufpd               m2, m3, 0x55 ; 45
   3298    vpdpwssd             m0, m14, m2  ; a2  b2
   3299    vpdpwssd             m1, m12, m2  ; a0' b0'
   3300    vpermt2b             m0, m7, m1
   3301    mova             [tmpq], m0
   3302    add                tmpq, 64
   3303    sub                  hd, 2
   3304    jg .h_w16_loop
   3305    RET
.h_w32:
; Widths >= 32: one row at a time, 32 pixels per inner iteration; r6 runs
; from -w to 0 so the end-of-row test is a simple sign check.
   3307    lea                srcq, [srcq+wq*2]
   3308    neg                  wq
.h_w32_loop0:
   3310    mov                  r6, wq
.h_w32_loop:
   3312    movu                 m2, [srcq+r6*2+ 0]
   3313    movu                 m3, [srcq+r6*2+12]
   3314    mova                 m0, m10
   3315    mova                 m1, m10
   3316    pshufb               m4, m2, m5
   3317    vpdpwssd             m0, m12, m4
   3318    pshufb               m4, m3, m6
   3319    vpdpwssd             m1, m14, m4
   3320    pshufb               m2, m6
   3321    pshufb               m3, m5
   3322    vpdpwssd             m0, m13, m2
   3323    vpdpwssd             m1, m13, m3
   3324    shufpd               m2, m3, 0x55
   3325    vpdpwssd             m0, m14, m2
   3326    vpdpwssd             m1, m12, m2
   3327    vpermt2b             m0, m7, m1
   3328    mova             [tmpq], m0
   3329    add                tmpq, 64
   3330    add                  r6, 32
   3331    jl .h_w32_loop
   3332    add                srcq, ssq
   3333    dec                  hd
   3334    jg .h_w32_loop0
   3335    RET
; Vertical-only setup: pick the 6-tap vertical filter (4-tap fallback for
; h <= 4 via cmove), scale by the bitdepth shift, broadcast tap pairs to
; m12-m14, set r6 = -ssq for addressing rows above srcq, and dispatch to
; the width-specific loop through the prep _6tap_v jump table.
.v:
   3337    movzx               mxd, myb
   3338    shr                 myd, 16
   3339    cmp                  hd, 4
   3340    cmove               myd, mxd
   3341    mov                 r5d, r7m
   3342    vpbroadcastd        m10, [prep_8tap_rnd]
   3343    pmovsxbw           xmm0, [base+subpel_filters+1+myq*8]
   3344    tzcnt               r6d, wd
   3345    shr                 r5d, 11
   3346    movzx               r6d, word [r7+r6*2+table_offset(prep, _6tap_v)]
   3347    psllw              xmm0, [base+prep_hv_shift+r5*8]
   3348    add                  r7, r6
   3349    mova             [tmpq], xmm0
   3350    vpbroadcastd        m12, xmm0
   3351    mov                  r6, ssq
   3352    vpbroadcastd        m13, [tmpq+ 4]
   3353    neg                  r6
   3354    vpbroadcastd        m14, [tmpq+ 8]
   3355    jmp                  r7
; Vertical-only, width 4: gathers 4-pixel rows into one zmm using a
; merge mask (k1 = 0x330c) so that punpcklwd yields four row pairs at
; once; four output rows per iteration via prep_endA.
.v_w4:
   3357    mov                 r3d, 0x330c
   3358    movq                xm1, [srcq+r6 *2]
   3359    kmovw                k1, r3d
   3360    vpbroadcastq    ym1{k1}, [srcq+r6 *1]
   3361    vpbroadcastq         m2, [srcq+ssq*0]
   3362    vinserti32x4     m1{k1}, m2, [srcq+ssq*1], 3
   3363    movq                xm0, [srcq+ssq*2]
   3364    mova                ym4, [prep_endA]
   3365    valignq              m0, m1, 2
   3366    punpcklwd            m1, m0        ; 01 12 23 34
.v_w4_loop:
   3368    lea                srcq, [srcq+ssq*4]
   3369    movq                xm2, [srcq+r6 *1]
   3370    vpbroadcastq    ym2{k1}, [srcq+ssq*0]
   3371    vpbroadcastq         m3, [srcq+ssq*1]
   3372    vinserti32x4     m2{k1}, m3, [srcq+ssq*2], 3
   3373    mova                 m3, m10
   3374    vpdpwssd             m3, m12, m1   ; a0 b0 c0 d0
   3375    valignq              m0, m2, m0, 6 ; 4 5 6 7
   3376    punpcklwd            m0, m2        ; 45 56 67 78
   3377    vpdpwssd             m3, m14, m0   ; a2 b2 c2 d2
   3378    vshufi32x4           m1, m0, q1032 ; 23 34 45 56
   3379    vpdpwssd             m3, m13, m1   ; a1 b1 c1 d1
   3380    mova                 m1, m0
   3381    mova                 m0, m2
   3382    vpermb               m3, m4, m3
   3383    mova             [tmpq], ym3
   3384    add                tmpq, 32
   3385    sub                  hd, 4
   3386    jg .v_w4_loop
   3387    RET
; Vertical-only, width 8: builds 3-row zmm groups with masked inserts
; (k1 = 0x33) and spel_v_shuf8 to form row-pair operands; four output
; rows per iteration via prep_endB.
.v_w8:
   3389    vbroadcasti32x4     ym1, [srcq+r6 *1]
   3390    mov                 r3d, 0x33
   3391    vbroadcasti32x4      m2, [srcq+ssq*0]
   3392    kmovb                k1, r3d
   3393    mova                 m6, [spel_v_shuf8]
   3394    vinserti64x2     m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2
   3395    vbroadcasti32x4     ym0, [srcq+ssq*1]
   3396    vinserti64x2     m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4
   3397    mova                 m7, [prep_endB]
   3398    vpermb               m1, m6, m1  ; 01 12
   3399    vpermb               m2, m6, m0  ; 23 34
.v_w8_loop:
   3401    lea                srcq, [srcq+ssq*4]
   3402    vbroadcasti32x4     ym3, [srcq+r6 *1]
   3403    movu                xm4, [srcq+ssq*0]
   3404    vshufi64x2       m3{k1}, m0, m4, q1032       ; 4 5 6
   3405    vbroadcasti32x4     ym0, [srcq+ssq*1]
   3406    vinserti64x2     m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8
   3407    mova                 m4, m10
   3408    vpdpwssd             m4, m12, m1 ; a0 b0
   3409    mova                 m5, m10
   3410    vpdpwssd             m5, m12, m2 ; c0 d0
   3411    vpermb               m1, m6, m3  ; 45 56
   3412    vpdpwssd             m4, m13, m2 ; a1 b1
   3413    vpermb               m2, m6, m0  ; 67 78
   3414    vpdpwssd             m5, m13, m1 ; c1 d1
   3415    vpdpwssd             m4, m14, m1 ; a2 b2
   3416    vpdpwssd             m5, m14, m2 ; c2 d2
   3417    vpermt2b             m4, m7, m5
   3418    mova             [tmpq], m4
   3419    add                tmpq, 64
   3420    sub                  hd, 4
   3421    jg .v_w8_loop
   3422    RET
; Vertical-only, width 16: two rows per zmm; spel_v_shuf16 interleaves
; them into row-pair operands and vpshrdd derives the odd pairs from
; adjacent even ones. Two output rows per iteration via prep_endA.
.v_w16:
   3424    vbroadcasti32x8      m0, [srcq+r6 *1]
   3425    vinserti32x8         m1, m0, [srcq+ssq*0], 1 ; 1 2
   3426    vinserti32x8         m0, [srcq+r6 *2], 0     ; 0 1
   3427    mova                 m6, [spel_v_shuf16]
   3428    movu                ym3, [srcq+ssq*1]
   3429    lea                srcq, [srcq+ssq*2]
   3430    vinserti32x8         m3, [srcq+ssq*0], 1     ; 3 4
   3431    mova                 m7, [prep_endA]
   3432    vpermb               m1, m6, m1     ; 12
   3433    vpermb               m0, m6, m0     ; 01
   3434    vpermb               m3, m6, m3     ; 34
   3435    vpshrdd              m2, m1, m3, 16 ; 23
.v_w16_loop:
   3437    mova                 m5, m10
   3438    vpdpwssd             m5, m12, m1    ; b0
   3439    mova                 m4, m10
   3440    vpdpwssd             m4, m12, m0    ; a0
   3441    mova                 m1, m3
   3442    vpdpwssd             m5, m13, m3    ; b1
   3443    movu                ym3, [srcq+ssq*1]
   3444    lea                srcq, [srcq+ssq*2]
   3445    vpdpwssd             m4, m13, m2    ; a1
   3446    vinserti32x8         m3, [srcq+ssq*0], 1
   3447    mova                 m0, m2
   3448    vpermb               m3, m6, m3     ; 56
   3449    vpshrdd              m2, m1, m3, 16 ; 45
   3450    vpdpwssd             m5, m14, m3    ; b2
   3451    vpdpwssd             m4, m14, m2    ; a2
   3452    vpermt2b             m4, m7, m5
   3453    mova             [tmpq], m4
   3454    add                tmpq, 64
   3455    sub                  hd, 2
   3456    jg .v_w16_loop
   3457    RET
; Vertical-only, widths 32/64/128: processed in 32-pixel column strips.
; Row pairs are built with punpcklwd/punpckhwd (low/high halves kept in
; separate accumulators). r5 packs h (low byte) and the strip count.
; r8 is callee-saved on Win64, hence the conditional push/pop.
.v_w32:
.v_w64:
.v_w128:
%if WIN64
   3462    push                 r8
%endif
   3464    mova                m11, [prep_endC]
   3465    lea                  r5, [hq+wq*8-256]
.v_w32_loop0:
; Load the 5 setup rows for this strip and form the 01/12/23/34 pairs.
   3467    movu                 m4, [srcq+r6 *2]
   3468    movu                 m5, [srcq+r6 *1]
   3469    lea                  r7, [srcq+ssq*2]
   3470    movu                 m6, [srcq+ssq*0]
   3471    movu                 m7, [srcq+ssq*1]
   3472    mov                  r8, tmpq
   3473    movu                 m8, [r7  +ssq*0]
   3474    punpcklwd            m0, m4, m5  ; 01
   3475    punpckhwd            m4, m5
   3476    punpcklwd            m1, m5, m6  ; 12
   3477    punpckhwd            m5, m6
   3478    punpcklwd            m2, m6, m7  ; 23
   3479    punpckhwd            m6, m7
   3480    punpcklwd            m3, m7, m8  ; 34
   3481    punpckhwd            m7, m8
.v_w32_loop:
; Two output rows (a, b) per iteration; m16/m18 accumulate row a's
; low/high halves, m17/m19 row b's.
   3483    mova                m16, m10
   3484    movu                 m9, [r7+ssq*1]
   3485    mova                m18, m10
   3486    vpdpwssd            m16, m12, m0 ; a0
   3487    mova                m17, m10
   3488    vpdpwssd            m18, m12, m4
   3489    mova                m19, m10
   3490    vpdpwssd            m17, m12, m1 ; b0
   3491    lea                  r7, [r7+ssq*2]
   3492    vpdpwssd            m19, m12, m5
   3493    mova                 m0, m2
   3494    vpdpwssd            m16, m13, m2 ; a1
   3495    punpcklwd            m2, m8, m9  ; 45
   3496    mova                 m4, m6
   3497    vpdpwssd            m18, m13, m6
   3498    punpckhwd            m6, m8, m9
   3499    movu                 m8, [r7+ssq*0]
   3500    vpdpwssd            m17, m13, m3 ; b1
   3501    mova                 m1, m3
   3502    vpdpwssd            m19, m13, m7
   3503    mova                 m5, m7
   3504    vpdpwssd            m16, m14, m2 ; a2
   3505    punpcklwd            m3, m9, m8  ; 56
   3506    vpdpwssd            m18, m14, m6
   3507    punpckhwd            m7, m9, m8
   3508    vpdpwssd            m17, m14, m3 ; b2
   3509    vpdpwssd            m19, m14, m7
   3510    vpermt2b            m16, m11, m18
   3511    vpermt2b            m17, m11, m19
   3512    mova          [r8+wq*0], m16
   3513    mova          [r8+wq*2], m17
   3514    lea                  r8, [r8+wq*4]
   3515    sub                  hd, 2
   3516    jg .v_w32_loop
; Next 32-pixel strip: restore h from r5's low byte, count in high byte.
   3517    add                srcq, 64
   3518    add                tmpq, 64
   3519    movzx                hd, r5b
   3520    sub                 r5d, 1<<8
   3521    jg .v_w32_loop0
%if WIN64
   3523    pop                  r8
%endif
   3525    vzeroupper
   3526    RET
; 2-D prep path, width 4: 4-tap horizontal (m8/m9) + 6-tap vertical
; (m12-m14, 4-tap fallback for h <= 4). Four rows are packed per zmm via
; the k1 = 0xf0 merge mask; four output rows per loop iteration.
.hv_w4:
   3528    movzx               mxd, mxb
   3529    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
   3530    movzx               mxd, myb
   3531    shr                 myd, 16
   3532    cmp                  hd, 4
   3533    cmove               myd, mxd
   3534    mov                 r5d, r7m
   3535    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
   3536    mov                  r6, ssq
   3537    sub                srcq, 2
   3538    shr                 r5d, 11
   3539    neg                  r6
   3540    psllw              xmm0, [base+prep_hv_shift+r5*8]
   3541    psllw              xmm1, 2
   3542    mova          [tmpq+ 0], xmm0
   3543    mova          [tmpq+16], xmm1
   3544    vpbroadcastd         m8, [tmpq+ 4]
   3545    mov                 r3d, 0xf0
   3546    vpbroadcastd         m9, [tmpq+ 8]
   3547    vpbroadcastd        m12, xmm1
; Setup rows 0-4: horizontal-filter them (bias from m11) and interleave
; into the 01/12/23/34 row-pair operand via spel_shuf4a.
   3548    movu                xm3, [srcq+r6 *2]
   3549    kmovb                k1, r3d
   3550    vinserti32x4        ym3, [srcq+r6 *1], 1
   3551    vbroadcasti32x4      m2, [srcq+ssq*0]
   3552    vinserti64x2     m3{k1}, m2, [srcq+ssq*1], 3
   3553    movu                xm4, [srcq+ssq*2]
   3554    vbroadcasti32x4      m5, [spel_h_shufA]
   3555    vbroadcasti32x4      m6, [spel_h_shufB]
   3556    mova                 m1, m11
   3557    mova                m15, [spel_shuf4a]
   3558    mova                xm2, xm11
   3559    pshufb               m0, m3, m5
   3560    vpdpwssd             m1, m8, m0
   3561    pshufb              xm0, xm4, xm5
   3562    vpdpwssd            xm2, xm8, xm0
   3563    vpbroadcastd        m13, [tmpq+20]
   3564    pshufb               m3, m6
   3565    vpbroadcastd        m14, [tmpq+24]
   3566    pshufb              xm4, xm6
   3567    mova                 m7, [spel_shuf4b]
   3568    vpdpwssd             m1, m9, m3    ; 0 1 2 3
   3569    vpdpwssd            xm2, xm9, xm4  ; 4
   3570    vpermt2b             m1, m15, m2   ; 01 12 23 34
   3571    mova               ym15, [prep_endA]
.hv_w4_loop:
; Per iteration: horizontal-filter rows 5-8, extend the row-pair window,
; apply the three vertical tap pairs, and emit four rows via prep_endA.
   3573    lea                srcq, [srcq+ssq*4]
   3574    movu                xm4, [srcq+r6 *1]
   3575    vinserti32x4        ym4, [srcq+ssq*0], 1
   3576    vbroadcasti32x4      m3, [srcq+ssq*1]
   3577    vinserti64x2     m4{k1}, m3, [srcq+ssq*2], 3
   3578    mova                 m2, m11
   3579    pshufb               m3, m4, m5
   3580    vpdpwssd             m2, m8, m3
   3581    mova                 m3, m10
   3582    vpdpwssd             m3, m12, m1   ; a0 b0 c0 d0
   3583    pshufb               m4, m6
   3584    vpdpwssd             m2, m9, m4    ; 5 6 7 8
   3585    mova                 m4, m1
   3586    vpermt2b             m1, m7, m2    ; 45 56 67 78
   3587    vpdpwssd             m3, m14, m1   ; a2 b2 c2 d2
   3588    vshufi32x4           m4, m1, q1032 ; 23 34 45 56
   3589    vpdpwssd             m3, m13, m4   ; a1 b1 c1 d1
   3590    vpermb               m3, m15, m3
   3591    mova             [tmpq], ym3
   3592    add                tmpq, 32
   3593    sub                  hd, 4
   3594    jg .hv_w4_loop
   3595    RET
; 2-D prep path, width 8: 6-tap in both dimensions. Horizontal taps in
; m12-m14 (bias m10), vertical taps in m15-m17 (bias m11, loaded by the
; .hv setup below). Two source rows per zmm; four output rows per loop.
.hv_w8:
   3597    mova                 m8, [spel_h_shufA]
   3598    movu               ym18, [srcq+r6 *2]
   3599    vinserti32x8        m18, [srcq+r6 *1], 1 ; 0 1
   3600    movu               ym19, [srcq+ssq*0]
   3601    vinserti32x8        m19, [srcq+ssq*1], 1 ; 2 3
   3602    movu               ym20, [srcq+ssq*2]    ; 4
   3603    movu                 m9, [spel_h_shufC]
   3604    mova                m21, [spel_shuf8a]
   3605    mova                 m0, [spel_shuf8b]
; Horizontal-filter setup rows a..e; the shufpd 0x55 blend of the A/C
; shuffle results supplies the middle tap's input.
   3606    vpermb               m4, m8, m18
   3607    mova                 m1, m10
   3608    vpermb               m5, m8, m19
   3609    vpdpwssd             m1, m12, m4  ; a0 b0
   3610    mova                 m2, m10
   3611    vpermb               m6, m8, m20
   3612    vpdpwssd             m2, m12, m5  ; c0 d0
   3613    mova                 m3, m10
   3614    vpermb              m18, m9, m18
   3615    vpdpwssd             m3, m12, m6  ; e0
   3616    mova                 m7, [prep_endB]
   3617    vpermb              m19, m9, m19
   3618    vpdpwssd             m1, m14, m18 ; a2 b2
   3619    vpermb              m20, m9, m20
   3620    vpdpwssd             m2, m14, m19 ; c2 d2
   3621    shufpd               m4, m18, 0x55
   3622    vpdpwssd             m3, m14, m20 ; e2
   3623    shufpd               m5, m19, 0x55
   3624    vpdpwssd             m1, m13, m4  ; a1 b1
   3625    shufpd               m6, m20, 0x55
   3626    vpdpwssd             m2, m13, m5  ; c1 d1
   3627    vpdpwssd             m3, m13, m6  ; e1
   3628    vpermt2b             m1, m21, m2  ; 01 12
   3629    vpermt2b             m2, m21, m3  ; 23 34
.hv_w8_loop:
; Per iteration: horizontal-filter rows f..i, slide the vertical window
; (A/B rows in m20, C/D rows in m21), pack via prep_endB, store 64 bytes.
   3631    lea                srcq, [srcq+ssq*4]
   3632    movu               ym18, [srcq+r6 *1]
   3633    vinserti32x8        m18, [srcq+ssq*0], 1
   3634    movu               ym19, [srcq+ssq*1]
   3635    vinserti32x8        m19, [srcq+ssq*2], 1
   3636    mova                 m3, m10
   3637    vpermb               m5, m8, m18
   3638    mova                 m4, m10
   3639    vpermb               m6, m8, m19
   3640    vpdpwssd             m3, m12, m5  ; f0 g0
   3641    mova                m20, m11
   3642    vpdpwssd             m4, m12, m6  ; h0 i0
   3643    mova                m21, m11
   3644    vpdpwssd            m20, m15, m1  ; A0 B0
   3645    vpermb              m18, m9, m18
   3646    vpdpwssd            m21, m15, m2  ; C0 D0
   3647    vpermb              m19, m9, m19
   3648    vpdpwssd             m3, m14, m18 ; f2 g2
   3649    vpdpwssd             m4, m14, m19 ; h2 i2
   3650    shufpd               m5, m18, 0x55
   3651    vpdpwssd            m20, m16, m2  ; A1 B1
   3652    shufpd               m6, m19, 0x55
   3653    vpdpwssd             m3, m13, m5  ; f1 g1
   3654    vpdpwssd             m4, m13, m6  ; h1 i1
   3655    vpermt2b             m2, m0, m3   ; 45 56
   3656    vpdpwssd            m21, m16, m2  ; C1 D1
   3657    mova                 m1, m2
   3658    vpermt2b             m2, m0, m4   ; 67 78
   3659    vpdpwssd            m20, m17, m1  ; A2 B2
   3660    vpdpwssd            m21, m17, m2  ; C2 D2
   3661    vpermt2b            m20, m7, m21
   3662    mova             [tmpq], m20
   3663    add                tmpq, 64
   3664    sub                  hd, 4
   3665    jg .hv_w8_loop
   3666    vzeroupper
   3667    RET
.hv:
    ; 6-tap H + 6-tap V: load filters (subpel_filters+1 skips the outermost
    ; tap of the 8-tap table), scale H filter by bitdepth-dependent shift and
    ; V filter by 2, then dispatch on width.
    vpbroadcastd        m11, [pd_128]
    cmp                  wd, 4
    je .hv_w4
    shr                 mxd, 16
    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd        ; h < 6: use the 4-tap variant of my
    mov                 r5d, r7m        ; bitdepth_max
    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
    mov                  r6, ssq
    sub                srcq, 4
    shr                 r5d, 11
    neg                  r6             ; r6 = -ssq, for rows above srcq
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    psllw              xmm1, 2
    mova          [tmpq+ 0], xmm0       ; spill filters so pairs can be re-broadcast
    mova          [tmpq+16], xmm1
    vpbroadcastd        m12, xmm0       ; H coeff pairs 0-2
    vpbroadcastd        m13, [tmpq+ 4]
    vpbroadcastd        m14, [tmpq+ 8]
    vpbroadcastd        m15, xmm1       ; V coeff pairs 0-2
    vpbroadcastd        m16, [tmpq+20]
    vpbroadcastd        m17, [tmpq+24]
    cmp                  wd, 16
    jl .hv_w8
    vbroadcasti32x4      m8, [spel_h_shufA]
    vbroadcasti32x4      m9, [spel_h_shufB]
    jg .hv_w32
    ; w == 16: rows 0-4 initial fill; primed/unprimed comments distinguish
    ; the two horizontal halves of each row pair.
    vbroadcasti32x8      m6, [srcq+r6 *2+ 8]
    vinserti32x8         m2, m6, [srcq+r6 *2+16], 1
    vinserti32x8         m6, [srcq+r6 *2+ 0], 0 ; 0
    movu               ym18, [srcq+r6 *1+ 0]
    movu               ym19, [srcq+r6 *1+12]
    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
    vinserti32x8        m19, [srcq+ssq*0+12], 1 ; 1 2
    movu               ym20, [srcq+ssq*1+ 0]
    movu               ym21, [srcq+ssq*1+12]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8        m20, [srcq+ssq*0+ 0], 1
    vinserti32x8        m21, [srcq+ssq*0+12], 1 ; 3 4
    pshufb               m2, m8
    mova                 m1, m10
    pshufb               m3, m18, m8
    vpdpwssd             m1, m14, m2    ; a2
    mova                 m2, m10
    pshufb               m4, m19, m9
    vpdpwssd             m2, m12, m3    ; b0  c0
    mova                 m3, m10
    pshufb               m5, m20, m8
    vpdpwssd             m3, m14, m4    ; b2' c2'
    mova                 m4, m10
    pshufb               m7, m21, m9
    vpdpwssd             m4, m12, m5    ; d0  e0
    mova                 m5, m10
    pshufb               m0, m6, m8
    vpdpwssd             m5, m14, m7    ; d2' e2'
    mova                 m7, [spel_shuf16]
    pshufb              m18, m9
    vpdpwssd             m1, m12, m0    ; a0
    pshufb              m19, m8
    vpdpwssd             m2, m13, m18   ; b1  c1
    pshufb              m20, m9
    vpdpwssd             m3, m13, m19   ; b1' c1'
    pshufb              m21, m8
    vpdpwssd             m4, m13, m20   ; d1  e1
    pshufb               m6, m9
    vpdpwssd             m5, m13, m21   ; d1' e1'
    mova                 m0, [prep_endB]
    shufpd              m18, m19, 0x55
    vpdpwssd             m1, m13, m6    ; a1
    shufpd              m20, m21, 0x55
    vpdpwssd             m2, m14, m18   ; b2  c2
    vpdpwssd             m3, m12, m18   ; b0' c0'
    vpdpwssd             m4, m14, m20   ; d2  e2
    vpdpwssd             m5, m12, m20   ; d0' e0'
    pslldq               m1, 1          ; align row 0 for the word-granular shift-merge
    vpermt2b             m2, m7, m3     ; 12
    vpermt2b             m4, m7, m5     ; 34
    vpshrdd              m1, m2, 16     ; 01
    vpshrdd              m3, m2, m4, 16 ; 23
.hv_w16_loop:
    ; 2 output rows per iteration; A/B are the two vertical accumulators
    movu               ym18, [srcq+ssq*1+ 0]
    movu               ym19, [srcq+ssq*1+12]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
    vinserti32x8        m19, [srcq+ssq*0+12], 1
    mova                 m5, m10
    mova                 m6, m10
    pshufb              m21, m18, m8
    vpdpwssd             m5, m12, m21   ; f0  g0
    pshufb              m20, m19, m9
    mova                m21, m11
    vpdpwssd             m6, m14, m20   ; f2' g2'
    mova                m20, m11
    vpdpwssd            m21, m15, m2    ; B0
    mova                 m2, m4         ; slide the row-pair history window
    vpdpwssd            m20, m15, m1    ; A0
    mova                 m1, m3
    pshufb              m18, m9
    vpdpwssd             m5, m13, m18   ; f1  g1
    pshufb              m19, m8
    vpdpwssd             m6, m13, m19   ; f1' g1'
    vpdpwssd            m21, m16, m4    ; B1
    vpdpwssd            m20, m16, m3    ; A1
    shufpd              m18, m19, 0x55
    vpdpwssd             m5, m14, m18   ; f2  g2
    vpdpwssd             m6, m12, m18   ; f0' g0'
    mova                 m4, m7
    vpermi2b             m4, m5, m6     ; 56
    vpshrdd              m3, m2, m4, 16 ; 45
    vpdpwssd            m21, m17, m4    ; B2
    vpdpwssd            m20, m17, m3    ; A2
    vpermt2b            m20, m0, m21    ; round/pack both rows
    mova             [tmpq], m20
    add                tmpq, 64
    sub                  hd, 2
    jg .hv_w16_loop
    vzeroupper
    RET
.hv_w32:
    ; w >= 32: process 32-pixel columns, two rows per iteration, iterating
    ; over column strips via the h/w counter packed into r5d. Primed (')
    ; comments refer to the high 16-pixel half of each 32-pixel row.
    WIN64_SPILL_XMM      29
%if WIN64
    push                 r8              ; r8 is used as the per-strip tmp pointer
%endif
    mova                m27, [spel_shuf32]
    lea                 r5d, [hq+wq*8-256] ; low byte = h, high bits = strip count
    mova                m28, [prep_endC]
.hv_w32_loop0:
    ; initial vertical fill: rows 0-4 (a-e), horizontally filtered
    movu                m18, [srcq+r6 *2+ 0]
    movu                 m7, [srcq+r6 *2+12]
    movu                 m6, [srcq+r6 *1+ 0]
    movu                m20, [srcq+r6 *1+12]
    lea                  r7, [srcq+ssq*2]
    movu                m19, [srcq+ssq*0+ 0]
    movu                m21, [srcq+ssq*0+12]
    movu                m22, [srcq+ssq*1+ 0]
    movu                m24, [srcq+ssq*1+12]
    mov                  r8, tmpq
    movu                m23, [r7  +ssq*0+ 0]
    movu                m25, [r7  +ssq*0+12]
    pshufb               m1, m18, m8
    mova                 m0, m10
    pshufb               m2, m7, m9
    vpdpwssd             m0, m12, m1    ; a0
    mova                 m1, m10
    pshufb               m4, m6, m8
    vpdpwssd             m1, m14, m2    ; a2'
    mova                 m2, m10
    pshufb               m3, m19, m8
    vpdpwssd             m2, m12, m4    ; b0
    mova                 m4, m10
    pshufb               m5, m20, m9
    vpdpwssd             m4, m12, m3    ; c0
    mova                 m3, m10
    pshufb              m26, m21, m9
    vpdpwssd             m3, m14, m5    ; b2'
    mova                 m5, m10
    pshufb              m18, m9
    vpdpwssd             m5, m14, m26   ; c2'
    pshufb               m7, m8
    vpdpwssd             m0, m13, m18   ; a1
    pshufb               m6, m9
    vpdpwssd             m1, m13, m7    ; a1'
    pshufb              m19, m9
    vpdpwssd             m2, m13, m6    ; b1
    pshufb              m20, m8
    vpdpwssd             m4, m13, m19   ; c1
    pshufb              m21, m8
    vpdpwssd             m3, m13, m20   ; b1'
    shufpd              m18, m7, 0x55
    vpdpwssd             m5, m13, m21   ; c1'
    shufpd               m6, m20, 0x55
    vpdpwssd             m0, m14, m18   ; a2
    shufpd              m19, m21, 0x55
    vpdpwssd             m1, m12, m18   ; a0'
    pshufb              m18, m22, m8
    vpdpwssd             m2, m14, m6    ; b2
    pshufb               m7, m23, m8
    vpdpwssd             m4, m14, m19   ; c2
    vpdpwssd             m3, m12, m6    ; b0'
    mova                 m6, m10
    vpdpwssd             m5, m12, m19   ; c0'
    pshufb              m19, m24, m9
    vpdpwssd             m6, m12, m18   ; d0
    mova                m18, m10
    pshufb              m26, m25, m9
    vpdpwssd            m18, m12, m7    ; e0
    mova                 m7, m10
    pshufb              m22, m9
    vpdpwssd             m7, m14, m19   ; d2'
    mova                m19, m10
    pshufb              m23, m9
    vpdpwssd            m19, m14, m26   ; e2'
    pshufb              m24, m8
    vpdpwssd             m6, m13, m22   ; d1
    pshufb              m25, m8
    vpdpwssd            m18, m13, m23   ; e1
    shufpd              m22, m24, 0x55
    vpdpwssd             m7, m13, m24   ; d1'
    shufpd              m23, m25, 0x55
    vpdpwssd            m19, m13, m25   ; e1'
    pslldq               m0, 1          ; align row 0 for the word-granular merge
    vpdpwssd             m6, m14, m22   ; d2
    pslldq               m1, 1
    vpdpwssd            m18, m14, m23   ; e2
    vpermt2b             m2, m27, m4    ; 12
    vpdpwssd             m7, m12, m22   ; d0'
    vpermt2b             m3, m27, m5    ; 12'
    vpdpwssd            m19, m12, m23   ; e0'
    vpshrdd              m0, m2, 16     ; 01
    vpermt2b             m6, m27, m18   ; 34
    vpshrdd              m1, m3, 16     ; 01'
    vpermt2b             m7, m27, m19   ; 34'
    vpshrdd              m4, m2, m6, 16 ; 23
    vpshrdd              m5, m3, m7, 16 ; 23'
.hv_w32_loop:
    ; 2 output rows per iteration; f/g are new input rows, A/B the two
    ; vertical accumulators per half.
    movu                m22, [r7+ssq*1+ 0]
    movu                m24, [r7+ssq*1+12]
    lea                  r7, [r7+ssq*2]
    movu                m23, [r7+ssq*0+ 0]
    movu                m25, [r7+ssq*0+12]
    mova                m19, m11
    vpdpwssd            m19, m15, m2    ; B0
    mova                m21, m11
    vpdpwssd            m21, m15, m3    ; B0'
    mova                m18, m11
    vpdpwssd            m18, m15, m0    ; A0
    mova                m20, m11
    vpdpwssd            m20, m15, m1    ; A0'
    mova                 m2, m6         ; slide row-pair history
    vpdpwssd            m19, m16, m6    ; B1
    mova                 m3, m7
    vpdpwssd            m21, m16, m7    ; B1'
    mova                 m0, m4
    vpdpwssd            m18, m16, m4    ; A1
    mova                 m1, m5
    pshufb               m4, m22, m8
    vpdpwssd            m20, m16, m5    ; A1'
    mova                 m6, m10
    pshufb               m7, m23, m8
    vpdpwssd             m6, m12, m4    ; f0
    mova                 m4, m10
    pshufb               m5, m24, m9
    vpdpwssd             m4, m12, m7    ; g0
    mova                 m7, m10
    pshufb              m26, m25, m9
    vpdpwssd             m7, m14, m5    ; f2'
    mova                 m5, m10
    pshufb              m22, m9
    vpdpwssd             m5, m14, m26   ; g2'
    pshufb              m23, m9
    vpdpwssd             m6, m13, m22   ; f1
    pshufb              m24, m8
    vpdpwssd             m4, m13, m23   ; g1
    pshufb              m25, m8
    vpdpwssd             m7, m13, m24   ; f1'
    shufpd              m22, m24, 0x55
    vpdpwssd             m5, m13, m25   ; g1'
    shufpd              m23, m25, 0x55
    vpdpwssd             m6, m14, m22   ; f2
    vpdpwssd             m4, m14, m23   ; g2
    vpdpwssd             m7, m12, m22   ; f0'
    vpdpwssd             m5, m12, m23   ; g0'
    vpermt2b             m6, m27, m4    ; 56
    vpermt2b             m7, m27, m5    ; 56'
    vpdpwssd            m19, m17, m6    ; B2
    vpshrdd              m4, m2, m6, 16 ; 45
    vpdpwssd            m21, m17, m7    ; B2'
    vpshrdd              m5, m3, m7, 16 ; 45'
    vpdpwssd            m18, m17, m4    ; A2
    vpdpwssd            m20, m17, m5    ; A2'
    vpermt2b            m19, m28, m21   ; round/pack B row
    vpermt2b            m18, m28, m20   ; round/pack A row
    mova          [r8+wq*0], m18
    mova          [r8+wq*2], m19
    lea                  r8, [r8+wq*4]
    sub                  hd, 2
    jg .hv_w32_loop
    ; advance to next 32-pixel column strip
    add                srcq, 64
    add                tmpq, 64
    movzx                hd, r5b        ; reload row count
    sub                 r5d, 1<<8       ; decrement strip counter
    jg .hv_w32_loop0
%if WIN64
    pop                  r8
%endif
    RET
   3958 
; Entry-point aliases for the filter combinations that require the full
; 8-tap path; the last invocation omits the function-name argument because
; the cglobal definition immediately follows.
PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
PREP_8TAP_FN sharp,          SHARP,   SHARP
; 8-tap prep: combine the mx/my subpel indices with the filter-type bits,
; then dispatch: no-h + no-v falls through to the shared 6-tap copy path,
; v-only handled below, h/hv handled at .h/.hv.
cglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my
%define base r7-prep_avx512icl
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    lea                  r7, [prep_avx512icl]
    mov                  wd, wm
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep ; plain copy
.v:
    ; vertical-only 8-tap: load filter (4-tap variant when h <= 4), scale by
    ; the bitdepth shift, broadcast the 4 coefficient pairs to m12-m15 and
    ; jump through the width table.
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd
    mov                 r5d, r7m        ; bitdepth_max
    vpbroadcastd        m10, [prep_8tap_rnd]
    pmovsxbw           xmm0, [base+subpel_filters+myq*8]
    tzcnt               r6d, wd
    shr                 r5d, 11
    movzx               r6d, word [r7+r6*2+table_offset(prep, _8tap_v)]
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    add                  r7, r6
    lea                  r6, [strideq*3]
    sub                srcq, r6         ; start 3 rows above the block
    mova             [tmpq], xmm0       ; spill filter so pairs can be re-broadcast
    vpbroadcastd        m12, xmm0
    vpbroadcastd        m13, [tmpq+ 4]
    vpbroadcastd        m14, [tmpq+ 8]
    vpbroadcastd        m15, [tmpq+12]
    jmp                  r7
.v_w4:
    ; 4 rows per zmm as interleaved word pairs (01 12 23 34 layout built via
    ; masked broadcasts/inserts); 4 output rows per loop iteration.
    mov                 r3d, 0x330c
    movq                xm1, [srcq+strideq*0]
    kmovw                k1, r3d
    vpbroadcastq    ym1{k1}, [srcq+strideq*1]
    vpbroadcastq         m0, [srcq+r6       ]
    vinserti32x4     m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3
    lea                srcq, [srcq+strideq*4]
    vpbroadcastq    ym0{k1}, [srcq+strideq*0]
    vpbroadcastq         m2, [srcq+strideq*1]
    vinserti32x4     m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6
    mova                ym5, [prep_endA]
    vshufi32x4           m3, m1, m0, q1021 ; 1 2 3 4
    vshufi32x4           m2, m1, m0, q2132 ; 2 3 4 5
    punpcklwd            m1, m3            ; 01 12 23 34
    punpcklwd            m2, m0            ; 23 34 45 56
.v_w4_loop:
    movq                xm4, [srcq+r6       ]
    lea                srcq, [srcq+strideq*4]
    vpbroadcastq    ym4{k1}, [srcq+strideq*0]
    vpbroadcastq         m3, [srcq+strideq*1]
    vinserti32x4     m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a
    mova                 m3, m10
    vpdpwssd             m3, m12, m1       ; a0 b0 c0 d0
    valignq              m1, m4, m0, 6     ; 6 7 8 9
    vpdpwssd             m3, m13, m2       ; a1 b1 c1 d1
    mova                 m0, m4
    punpcklwd            m4, m1, m4        ; 67 78 89 9a
    vpdpwssd             m3, m15, m4       ; a3 b3 c3 d3
    vshufi32x4           m1, m2, m4, q1032 ; 45 56 67 78
    vpdpwssd             m3, m14, m1       ; a2 b2 c2 d2
    mova                 m2, m4
    vpermb               m3, m5, m3        ; round/pack 4 rows
    mova             [tmpq], ym3
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    ; 3 rows per zmm; [spel_v_shuf8] interleaves consecutive rows into word
    ; pairs (01 12 etc.); 4 output rows (a-d) per loop iteration.
    movu                xm0, [srcq+strideq*0]
    mov                 r3d, 0x33
    vbroadcasti32x4     ym1, [srcq+strideq*1]
    kmovb                k1, r3d
    mova                 m7, [spel_v_shuf8]
    vinserti64x2     m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2
    add                srcq, r6
    vbroadcasti32x4     ym2, [srcq+strideq*0]
    vbroadcasti32x4      m3, [srcq+strideq*1]
    vbroadcasti32x4     ym0, [srcq+strideq*2]
    vshufi64x2       m2{k1}, m1, m3, q1032    ; 2 3 4
    vinserti64x2     m0{k1}, m3, [srcq+r6], 2 ; 4 5 6
    mova                 m8, [prep_endB]
    vpermb               m1, m7, m1  ; 01 12
    vpermb               m2, m7, m2  ; 23 34
    vpermb               m3, m7, m0  ; 45 56
.v_w8_loop:
    lea                srcq, [srcq+strideq*4]
    vbroadcasti32x4     ym4, [srcq+strideq*0]
    movu                xm5, [srcq+strideq*1]
    vshufi64x2       m4{k1}, m0, m5, q1032    ; 6 7 8
    vbroadcasti32x4     ym0, [srcq+strideq*2]
    vinserti64x2     m0{k1}, m5, [srcq+r6], 2 ; 8 9 a
    mova                 m5, m10
    vpdpwssd             m5, m12, m1 ; a0 b0
    mova                 m6, m10
    vpdpwssd             m6, m12, m2 ; c0 d0
    mova                 m1, m3
    vpdpwssd             m5, m13, m2 ; a1 b1
    vpdpwssd             m6, m13, m3 ; c1 d1
    vpermb               m2, m7, m4  ; 67 78
    vpdpwssd             m5, m14, m3 ; a2 b2
    vpermb               m3, m7, m0  ; 89 9a
    vpdpwssd             m6, m14, m2 ; c2 d2
    vpdpwssd             m5, m15, m2 ; a3 b3
    vpdpwssd             m6, m15, m3 ; c3 d3
    vpermt2b             m5, m8, m6  ; round/pack 4 rows
    mova             [tmpq], m5
    add                tmpq, 64
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    ; 2 rows per zmm; [spel_v_shuf16] interleaves, vpshrdd derives the odd
    ; row pairs from adjacent even pairs; 2 output rows (a b) per iteration.
    vbroadcasti32x8      m0, [srcq+strideq*1]
    vinserti32x8         m1, m0, [srcq+strideq*2], 1
    vinserti32x8         m0, [srcq+strideq*0], 0
    mova                 m8, [spel_v_shuf16]
    add                srcq, r6
    movu                ym3, [srcq+strideq*0]
    vinserti32x8         m3, [srcq+strideq*1], 1
    movu                ym5, [srcq+strideq*2]
    add                srcq, r6
    vinserti32x8         m5, [srcq+strideq*0], 1
    mova                m11, [prep_endA]
    vpermb               m1, m8, m1     ; 12
    vpermb               m0, m8, m0     ; 01
    vpermb               m3, m8, m3     ; 34
    vpermb               m5, m8, m5     ; 56
    vpshrdd              m2, m1, m3, 16 ; 23
    vpshrdd              m4, m3, m5, 16 ; 45
.v_w16_loop:
    mova                 m7, m10
    vpdpwssd             m7, m12, m1    ; b0
    mova                 m6, m10
    vpdpwssd             m6, m12, m0    ; a0
    mova                 m1, m3         ; slide row-pair history
    vpdpwssd             m7, m13, m3    ; b1
    mova                 m0, m2
    vpdpwssd             m6, m13, m2    ; a1
    mova                 m3, m5
    vpdpwssd             m7, m14, m5    ; b2
    mova                 m2, m4
    vpdpwssd             m6, m14, m4    ; a2
    movu                ym5, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vinserti32x8         m5, [srcq+strideq*0], 1
    vpermb               m5, m8, m5     ; 78
    vpshrdd              m4, m3, m5, 16 ; 67
    vpdpwssd             m7, m15, m5    ; b3
    vpdpwssd             m6, m15, m4    ; a3
    vpermt2b             m6, m11, m7    ; round/pack both rows
    mova             [tmpq], m6
    add                tmpq, 64
    sub                  hd, 2
    jg .v_w16_loop
    RET
.v_w32:
.v_w64:
.v_w128:
    ; w >= 32: one 32-pixel column strip at a time, rows interleaved with
    ; punpck{l,h}wd into low (l) / high (h) halves; 2 output rows (a b) per
    ; iteration. r5d packs h (low byte) and the strip counter (high bits).
    WIN64_PUSH_XMM       23
%if WIN64
    push                 r8              ; r8 is used as the per-strip tmp pointer
%endif
    mova                m11, [prep_endC]
    lea                  r5, [hq+wq*8-256]
.v_w32_loop0:
    movu                m16, [srcq+strideq*0]
    movu                m17, [srcq+strideq*1]
    lea                  r7, [srcq+r6]
    movu                m18, [srcq+strideq*2]
    movu                m19, [r7  +strideq*0]
    mov                  r8, tmpq
    movu                m20, [r7  +strideq*1]
    movu                m21, [r7  +strideq*2]
    add                  r7, r6
    movu                m22, [r7  +strideq*0]
    punpcklwd            m0, m16, m17 ; 01l
    punpckhwd           m16, m17      ; 01h
    punpcklwd            m1, m17, m18 ; 12l
    punpckhwd           m17, m18      ; 12h
    punpcklwd            m2, m18, m19 ; 23l
    punpckhwd           m18, m19      ; 23h
    punpcklwd            m3, m19, m20 ; 34l
    punpckhwd           m19, m20      ; 34h
    punpcklwd            m4, m20, m21 ; 45l
    punpckhwd           m20, m21      ; 45h
    punpcklwd            m5, m21, m22 ; 56l
    punpckhwd           m21, m22      ; 56h
.v_w32_loop:
    mova                 m6, m10
    vpdpwssd             m6, m12, m0  ; a0l
    mova                 m8, m10
    vpdpwssd             m8, m12, m16 ; a0h
    mova                 m7, m10
    vpdpwssd             m7, m12, m1  ; b0l
    mova                 m9, m10
    vpdpwssd             m9, m12, m17 ; b0h
    mova                 m0, m2       ; slide row-pair history
    vpdpwssd             m6, m13, m2  ; a1l
    mova                m16, m18
    vpdpwssd             m8, m13, m18 ; a1h
    mova                 m1, m3
    vpdpwssd             m7, m13, m3  ; b1l
    mova                m17, m19
    vpdpwssd             m9, m13, m19 ; b1h
    mova                 m2, m4
    vpdpwssd             m6, m14, m4  ; a2l
    mova                m18, m20
    vpdpwssd             m8, m14, m20 ; a2h
    mova                 m3, m5
    vpdpwssd             m7, m14, m5  ; b2l
    mova                m19, m21
    vpdpwssd             m9, m14, m21 ; b2h
    movu                m21, [r7+strideq*1]
    lea                  r7, [r7+strideq*2]
    punpcklwd            m4, m22, m21 ; 67l
    punpckhwd           m20, m22, m21 ; 67h
    movu                m22, [r7+strideq*0]
    vpdpwssd             m6, m15, m4  ; a3l
    vpdpwssd             m8, m15, m20 ; a3h
    punpcklwd            m5, m21, m22 ; 78l
    punpckhwd           m21, m22      ; 78h
    vpdpwssd             m7, m15, m5  ; b3l
    vpdpwssd             m9, m15, m21 ; b3h
    vpermt2b             m6, m11, m8  ; round/pack row a
    vpermt2b             m7, m11, m9  ; round/pack row b
    mova          [r8+wq*0], m6
    mova          [r8+wq*2], m7
    lea                  r8, [r8+wq*4]
    sub                  hd, 2
    jg .v_w32_loop
    ; advance to next 32-pixel column strip
    add                srcq, 64
    add                tmpq, 64
    movzx                hd, r5b
    sub                 r5d, 1<<8
    jg .v_w32_loop0
%if WIN64
    pop                  r8
%endif
    RET
.h_w4:
    ; horizontal-only, w==4: 4-tap filter (coeff pairs m6/m7), 4 rows per
    ; zmm; m10 (rounding) and r6 (stride*3) were set up at .h before the jump.
    RESET_STACK_STATE
    movzx               mxd, mxb
    sub                srcq, 2
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    mov                 r5d, r7m        ; bitdepth_max
    vbroadcasti32x4      m4, [spel_h_shufA]
    vbroadcasti32x4      m5, [spel_h_shufB]
    shr                 r5d, 11
    mova                ym9, [prep_endA]
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    mova             [tmpq], xmm0       ; spill filter so pairs can be re-broadcast
    vpbroadcastd         m6, [tmpq+4]
    vpbroadcastd         m7, [tmpq+8]
.h_w4_loop:
    movu                xm2, [srcq+strideq*0]
    vinserti32x4        ym2, [srcq+strideq*1], 1
    vinserti32x4         m2, [srcq+strideq*2], 2
    vinserti32x4         m2, [srcq+r6       ], 3
    lea                srcq, [srcq+strideq*4]
    mova                 m0, m10
    pshufb               m1, m2, m4
    vpdpwssd             m0, m6, m1
    pshufb               m2, m5
    vpdpwssd             m0, m7, m2
    vpermb               m0, m9, m0     ; round/pack 4 rows
    mova             [tmpq], ym0
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4_loop
    RET
.h_w8:
    ; horizontal-only, w==8: full 8-tap via four vpermb gathers (shufA-D),
    ; 2 rows per zmm, 4 output rows per iteration.
    mova                 m6, [spel_h_shufA]
    movu                 m7, [spel_h_shufB]
    movu                 m8, [spel_h_shufC]
    mova                 m9, [spel_h_shufD]
    mova                m11, [prep_endB]
.h_w8_loop:
    movu                ym4, [srcq+strideq*0]
    vinserti32x8         m4, [srcq+strideq*1], 1
    movu                ym5, [srcq+strideq*2]
    vinserti32x8         m5, [srcq+r6       ], 1
    lea                srcq, [srcq+strideq*4]
    mova                 m0, m10
    mova                 m1, m10
    vpermb               m2, m6, m4
    vpermb               m3, m6, m5
    vpdpwssd             m0, m12, m2    ; tap pair 0
    vpdpwssd             m1, m12, m3
    vpermb               m2, m7, m4
    vpermb               m3, m7, m5
    vpdpwssd             m0, m13, m2    ; tap pair 1
    vpdpwssd             m1, m13, m3
    vpermb               m2, m8, m4
    vpermb               m3, m8, m5
    vpdpwssd             m0, m14, m2    ; tap pair 2
    vpdpwssd             m1, m14, m3
    vpermb               m2, m9, m4
    vpermb               m3, m9, m5
    vpdpwssd             m0, m15, m2    ; tap pair 3
    vpdpwssd             m1, m15, m3
    vpermt2b             m0, m11, m1    ; round/pack 4 rows
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 4
    jg .h_w8_loop
    RET
.h:
    ; horizontal filtering requested; fall through to .hv if vertical too.
    vpbroadcastd        m10, [prep_8tap_rnd]
    test                myd, 0xf00
    jnz .hv
    lea                  r6, [strideq*3]
    cmp                  wd, 4
    je .h_w4
    ; w >= 8: load the 8-tap filter, scale by the bitdepth shift and
    ; broadcast the 4 coefficient pairs to m12-m15.
    shr                 mxd, 16
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    mov                 r5d, r7m        ; bitdepth_max
    sub                srcq, 6          ; start 3 pixels left of the block
    shr                 r5d, 11
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    mova             [tmpq], xmm0       ; spill filter so pairs can be re-broadcast
    vpbroadcastd        m12, xmm0
    vpbroadcastd        m13, [tmpq+ 4]
    vpbroadcastd        m14, [tmpq+ 8]
    vpbroadcastd        m15, [tmpq+12]
    cmp                  wd, 16
    jl .h_w8
    vbroadcasti32x4      m6, [spel_h_shufA]
    vbroadcasti32x4      m7, [spel_h_shufB]
    mova                m11, [prep_endC]
    jg .h_w32
.h_w16_loop:
    ; w==16: two rows per iteration; each row's 8-tap split into a low-half
    ; (a) and high-half (b) accumulator, sharing the shufpd-merged middle.
    movu                ym2, [srcq+strideq*0+ 0]
    vinserti32x8         m2, [srcq+strideq*1+ 0], 1
    movu                ym3, [srcq+strideq*0+16]
    vinserti32x8         m3, [srcq+strideq*1+16], 1
    lea                srcq, [srcq+strideq*2]
    mova                 m0, m10
    mova                 m1, m10
    pshufb               m4, m2, m6
    vpdpwssd             m0, m12, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m14, m4 ; b2
    pshufb               m4, m2, m7
    vpdpwssd             m0, m13, m4 ; a1
    pshufb               m4, m3, m7
    vpdpwssd             m1, m15, m4 ; b3
    shufpd               m2, m3, 0x55 ; middle pixels shared by both halves
    pshufb               m4, m2, m6
    vpdpwssd             m0, m14, m4 ; a2
    vpdpwssd             m1, m12, m4 ; b0
    pshufb               m2, m7
    vpdpwssd             m0, m15, m2 ; a3
    vpdpwssd             m1, m13, m2 ; b1
    vpermt2b             m0, m11, m1 ; round/pack both halves
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 2
    jg .h_w16_loop
    RET
.h_w32:
    ; w >= 32: one row at a time, 32 pixels per inner iteration, indexing
    ; from the row end with a negative offset so r6 counts up to 0.
    lea                srcq, [srcq+wq*2]
    neg                  wq
.h_w32_loop0:
    mov                  r6, wq
.h_w32_loop:
    movu                 m2, [srcq+r6*2+ 0]
    movu                 m3, [srcq+r6*2+ 8]
    mova                 m0, m10
    mova                 m1, m10
    pshufb               m4, m2, m6
    vpdpwssd             m0, m12, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m12, m4 ; b0
    vpdpwssd             m0, m14, m4 ; a2 (b's first taps are a's middle taps)
    movu                 m4, [srcq+r6*2+16]
    pshufb               m3, m7
    vpdpwssd             m1, m13, m3 ; b1
    vpdpwssd             m0, m15, m3 ; a3
    pshufb               m3, m4, m6
    vpdpwssd             m1, m14, m3 ; b2
    pshufb               m2, m7
    vpdpwssd             m0, m13, m2 ; a1
    pshufb               m4, m7
    vpdpwssd             m1, m15, m4 ; b3
    vpermt2b             m0, m11, m1 ; round/pack both 16-pixel halves
    mova             [tmpq], m0
    add                tmpq, 64
    add                  r6, 32
    jl .h_w32_loop
    add                srcq, strideq
    dec                  hd
    jg .h_w32_loop0
    RET
   4362 .hv:
   ; Separable horizontal+vertical (2-D) subpel prep.  Widths > 4 branch
   ; to .hv_w8; this fall-through handles w <= 4.  Horizontal and
   ; vertical 8-tap filters are fetched from subpel_filters, pre-scaled
   ; (prep_hv_shift is indexed by r7m >> 11; r7m appears to be pixel_max,
   ; as it is in the warp functions below -- selects the 10/12-bit
   ; variant), and staged through the tmp buffer so individual
   ; coefficient pairs can be re-broadcast as dwords.
   4363    vpbroadcastd        m11, [pd_128]
   4364    cmp                  wd, 4
   4365    jg .hv_w8
   4366    movzx               mxd, mxb
   4367    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
   4368    movzx               mxd, myb
   4369    shr                 myd, 16
   4370    cmp                  hd, 4
   4371    cmove               myd, mxd
   4372    mov                 r5d, r7m
   4373    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
   4374    lea                  r6, [strideq*3]
   4375    sub                srcq, 2
   4376    shr                 r5d, 11
   4377    sub                srcq, r6
   4378    psllw              xmm0, [base+prep_hv_shift+r5*8]
   4379    psllw              xmm1, 2
   4380    mova          [tmpq+ 0], xmm0
   4381    mova          [tmpq+16], xmm1
   4382    vpbroadcastd        m12, xmm1
   ; Gather the first 7 input rows into two zmm registers; the 0xff0
   ; write mask (k1) lets three separate sources be merged into one
   ; register via masked inserts.
   4383    movu               xm16, [srcq+strideq*0]
   4384    mov                 r3d, 0xff0
   4385    vinserti128        ym16, [srcq+strideq*1], 1
   4386    kmovw                k1, r3d
   4387    vbroadcasti32x4     m18, [srcq+strideq*2]
   4388    add                srcq, r6
   4389    vinserti64x2    m16{k1}, m18, [srcq+strideq*0], 3
   4390    movu               xm17, [srcq+strideq*1]
   4391    vbroadcasti32x4    ym18, [srcq+strideq*2]
   4392    add                srcq, r6
   4393    vinserti32x4    m17{k1}, m18, [srcq+strideq*0], 2
   4394    vbroadcasti32x4      m5, [spel_h_shufA]
   4395    vbroadcasti32x4      m6, [spel_h_shufB]
   ; m8/m9 = horizontal coefficient pairs (re-read from the scratch
   ; store above); m12-m15 = vertical coefficient pairs.
   4396    vpbroadcastd         m8, [tmpq+ 4]
   4397    vpbroadcastd         m9, [tmpq+ 8]
   4398    mova                 m1, m10
   4399    mova                m19, [spel_shuf4a]
   4400    mova                 m2, m10
   4401    pshufb               m0, m16, m5
   4402    vpdpwssd             m1, m8, m0
   4403    pshufb               m0, m17, m5
   4404    vpdpwssd             m2, m8, m0
   4405    vpbroadcastd        m13, [tmpq+20]
   4406    pshufb              m16, m6
   4407    vpbroadcastd        m14, [tmpq+24]
   4408    pshufb              m17, m6
   4409    vpbroadcastd        m15, [tmpq+28]
   4410    vpdpwssd             m1, m9, m16       ; 0 1 2 3
   4411    vpdpwssd             m2, m9, m17       ; 4 5 6
   4412    mova                 m7, [spel_shuf4b]
   4413    vpermt2b             m1, m19, m2       ; 01 12 23 34
   4414    vpermb               m2, m19, m2       ; 45 56
   4415    mova               ym19, [prep_endA]
   4416    vshufi32x4           m2, m1, m2, q1032 ; 23 34 45 56
   4417 .hv_w4_loop:
   ; Per iteration: horizontally filter 4 new rows (7 8 9 a), then run
   ; the 4-tap-pair vertical filter over the sliding row-pair window.
   4418    movu               xm17, [srcq+strideq*1]
   4419    vinserti128        ym17, [srcq+strideq*2], 1
   4420    vbroadcasti32x4     m16, [srcq+r6       ]
   4421    lea                srcq, [srcq+strideq*4]
   4422    vinserti64x2    m17{k1}, m16, [srcq+strideq*0], 3
   4423    mova                m18, m10
   4424    pshufb              m16, m17, m5
   4425    vpdpwssd            m18, m8, m16
   4426    mova                m16, m11
   4427    vpdpwssd            m16, m12, m1       ; a0 b0 c0 d0
   4428    pshufb              m17, m6
   4429    vpdpwssd            m18, m9, m17       ; 7 8 9 a
   4430    mova                 m1, m2
   4431    vpdpwssd            m16, m13, m2       ; a1 b1 c1 d1
   4432    vpermt2b             m2, m7, m18       ; 67 78 89 9a
   4433    vpdpwssd            m16, m15, m2       ; a3 b3 c3 d3
   4434    vshufi32x4           m1, m2, q1032     ; 45 56 67 78
   4435    vpdpwssd            m16, m14, m1       ; a2 b2 c2 d2
   4436    vpermb              m16, m19, m16
   4437    mova             [tmpq], ym16
   4438    add                tmpq, 32
   4439    sub                  hd, 4
   4440    jg .hv_w4_loop
   4441    vzeroupper
   4442    RET
   4443 .hv_w8:
   ; 2-D subpel prep for w == 8 (w > 8 branches to .hv_w16 after the
   ; shared filter setup below).  m12-m15 = horizontal coefficient
   ; pairs, m16-m19 = vertical coefficient pairs, both staged through
   ; the tmp scratch buffer.
   4444    shr                 mxd, 16
   4445    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
   4446    movzx               mxd, myb
   4447    shr                 myd, 16
   4448    cmp                  hd, 6
   4449    cmovs               myd, mxd
   4450    mov                 r5d, r7m
   4451    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
   4452    lea                  r6, [strideq*3]
   4453    sub                srcq, 6
   4454    shr                 r5d, 11
   4455    sub                srcq, r6
   4456    psllw              xmm0, [base+prep_hv_shift+r5*8]
   4457    psllw              xmm1, 2
   4458    mova          [tmpq+ 0], xmm0
   4459    mova          [tmpq+16], xmm1
   4460    vpbroadcastd        m12, xmm0
   4461    vpbroadcastd        m13, [tmpq+ 4]
   4462    vpbroadcastd        m14, [tmpq+ 8]
   4463    vpbroadcastd        m15, [tmpq+12]
   4464    vpbroadcastd        m16, xmm1
   4465    vpbroadcastd        m17, [tmpq+20]
   4466    vpbroadcastd        m18, [tmpq+24]
   4467    vpbroadcastd        m19, [tmpq+28]
   4468    cmp                  wd, 8
   4469    jg .hv_w16
   4470    WIN64_SPILL_XMM      23
   ; Load the 7 setup rows (two per zmm) and horizontally filter them,
   ; accumulating taps 0..3 via the spel_h_shufA-D byte permutes.
   4471    mova                 m5, [spel_h_shufA]
   4472    movu                ym0, [srcq+strideq*0]
   4473    vinserti32x8         m0, [srcq+strideq*1], 1 ; 0 1
   4474    movu                ym9, [srcq+strideq*2]
   4475    add                srcq, r6
   4476    vinserti32x8         m9, [srcq+strideq*0], 1 ; 2 3
   4477    movu               ym20, [srcq+strideq*1]
   4478    vinserti32x8        m20, [srcq+strideq*2], 1 ; 4 5
   4479    add                srcq, r6
   4480    movu               ym21, [srcq+strideq*0]    ; 6
   4481    movu                 m6, [spel_h_shufB]
   4482    movu                 m7, [spel_h_shufC]
   4483    mova               ym22, [prep_endB]
   4484    vpermb               m8, m5, m0
   4485    mova                 m1, m10
   4486    vpdpwssd             m1, m12, m8  ; a0 b0
   4487    vpermb               m8, m5, m9
   4488    mova                 m2, m10
   4489    vpdpwssd             m2, m12, m8  ; c0 d0
   4490    vpermb               m8, m5, m20
   4491    mova                 m3, m10
   4492    vpdpwssd             m3, m12, m8  ; e0 f0
   4493    vpermb               m8, m5, m21
   4494    mova                 m4, m10
   4495    vpdpwssd             m4, m12, m8  ; g0
   4496    vpermb               m8, m6, m0
   4497    vpdpwssd             m1, m13, m8  ; a1 b1
   4498    vpermb               m8, m6, m9
   4499    vpdpwssd             m2, m13, m8  ; c1 d1
   4500    vpermb               m8, m6, m20
   4501    vpdpwssd             m3, m13, m8  ; e1 f1
   4502    vpermb               m8, m6, m21
   4503    vpdpwssd             m4, m13, m8  ; g1
   4504    vpermb               m8, m7, m0
   4505    vpdpwssd             m1, m14, m8  ; a2 b2
   4506    vpermb               m8, m7, m9
   4507    vpdpwssd             m2, m14, m8  ; c2 d2
   4508    vpermb               m8, m7, m20
   4509    vpdpwssd             m3, m14, m8  ; e2 f2
   4510    vpermb               m8, m7, m21
   4511    vpdpwssd             m4, m14, m8  ; g2
   4512    mova                 m8, [spel_h_shufD]
   4513    vpermb               m0, m8, m0
   4514    vpdpwssd             m1, m15, m0  ; a3 b3
   4515    mova                 m0, [spel_shuf8a]
   4516    vpermb               m9, m8, m9
   4517    vpdpwssd             m2, m15, m9  ; c3 d3
   4518    mova                 m9, [spel_shuf8b]
   4519    vpermb              m20, m8, m20
   4520    vpdpwssd             m3, m15, m20 ; e3 f3
   4521    vpermb              m21, m8, m21
   4522    vpdpwssd             m4, m15, m21 ; g3
   ; Interleave adjacent filtered rows into the row-pair operands the
   ; vertical pass consumes.
   4523    vpermt2b             m1, m0, m2   ; 01 12
   4524    vpermt2b             m2, m0, m3   ; 23 34
   4525    vpermt2b             m3, m0, m4   ; 45 56
   4526 .hv_w8_loop:
   ; Two output rows per iteration: filter two new input rows
   ; horizontally (h i) while advancing the vertical tap window.
   4527    movu                ym0, [srcq+strideq*1]
   4528    lea                srcq, [srcq+strideq*2]
   4529    vinserti32x8         m0, [srcq+strideq*0], 1
   4530    mova                 m4, m10
   4531    mova                m20, m11
   4532    vpermb              m21, m5, m0
   4533    vpdpwssd             m4, m12, m21 ; h0 i0
   4534    vpermb              m21, m6, m0
   4535    vpdpwssd            m20, m16, m1  ; A0 B0
   4536    vpdpwssd             m4, m13, m21 ; h1 i1
   4537    vpermb              m21, m7, m0
   4538    mova                 m1, m2
   4539    vpdpwssd            m20, m17, m2  ; A1 B1
   4540    vpdpwssd             m4, m14, m21 ; h2 i2
   4541    vpermb              m21, m8, m0
   4542    mova                 m2, m3
   4543    vpdpwssd            m20, m18, m3  ; A2 B2
   4544    vpdpwssd             m4, m15, m21 ; h3 i3
   4545    vpermt2b             m3, m9, m4   ; 67 78
   4546    vpdpwssd            m20, m19, m3  ; A3 B3
   4547    vpermb              m20, m22, m20
   4548    mova             [tmpq], ym20
   4549    add                tmpq, 32
   4550    sub                  hd, 2
   4551    jg .hv_w8_loop
   4552    RET
   4553 .hv_w16:
   ; 2-D subpel prep for w >= 16, processed in 16-pixel-wide column
   ; strips.  After "add wd, wd" wq is the output row pitch in bytes;
   ; r5d packs the row count in its low byte and the remaining strip
   ; count (scaled by 256) above it, restored per strip via
   ; "movzx hd, r5b" / "sub r5d, 1<<8".  r8 is used as a per-strip tmp
   ; cursor, hence the extra save/restore on Win64.
   4554    WIN64_SPILL_XMM      27
   4555 %if WIN64
   4556    push                 r8
   4557 %endif
   4558    vbroadcasti32x4     m20, [spel_h_shufA]
   4559    vbroadcasti32x4     m21, [spel_h_shufB]
   4560    add                  wd, wd
   4561    mova                 m9, [spel_shuf16]
   4562    mova                m26, [prep_endB]
   4563    lea                 r5d, [hq+wq*8-256]
   4564 .hv_w16_loop0:
   ; Load and horizontally filter the 7 setup rows (a..g) of this strip.
   4565    vbroadcasti32x8      m5, [srcq+strideq*0+ 8]
   4566    vinserti32x8         m4, m5, [srcq+strideq*0+ 0], 0
   4567    vinserti32x8         m5, [srcq+strideq*0+16], 1 ; 0
   4568    movu                ym6, [srcq+strideq*1+ 0]
   4569    movu                ym7, [srcq+strideq*1+16]
   4570    lea                  r7, [srcq+r6]
   4571    vinserti32x8         m6, [srcq+strideq*2+ 0], 1
   4572    vinserti32x8         m7, [srcq+strideq*2+16], 1 ; 1 2
   4573    movu               ym22, [r7  +strideq*0+ 0]
   4574    movu               ym23, [r7  +strideq*0+16]
   4575    mov                  r8, tmpq
   4576    vinserti32x8        m22, [r7  +strideq*1+ 0], 1
   4577    vinserti32x8        m23, [r7  +strideq*1+16], 1 ; 3 4
   4578    movu               ym24, [r7  +strideq*2+ 0]
   4579    movu               ym25, [r7  +strideq*2+16]
   4580    add                  r7, r6
   4581    vinserti32x8        m24, [r7  +strideq*0+ 0], 1
   4582    vinserti32x8        m25, [r7  +strideq*0+16], 1 ; 5 6
   4583    pshufb               m0, m4, m20
   4584    mova                 m1, m10
   4585    vpdpwssd             m1, m12, m0    ; a0
   4586    pshufb               m0, m6, m20
   4587    mova                 m2, m10
   4588    vpdpwssd             m2, m12, m0    ; b0
   4589    pshufb               m0, m7, m20
   4590    mova                 m3, m10
   4591    vpdpwssd             m3, m14, m0    ; c2
   4592    pshufb               m0, m4, m21
   4593    vpdpwssd             m1, m13, m0    ; a1
   4594    pshufb               m0, m6, m21
   4595    vpdpwssd             m2, m13, m0    ; b1
   4596    pshufb               m0, m7, m21
   4597    vpdpwssd             m3, m15, m0    ; c3
   4598    pshufb               m0, m5, m20
   4599    vpdpwssd             m1, m14, m0    ; a2
   ; shufpd with imm 0x55 swaps the low/high 8-byte halves of each
   ; 16-byte lane pair to form the shifted source for the other taps.
   4600    shufpd               m6, m7, 0x55
   4601    pshufb               m7, m6, m20
   4602    vpdpwssd             m2, m14, m7    ; b2
   4603    vpdpwssd             m3, m12, m7    ; c0
   4604    pshufb               m5, m21
   4605    vpdpwssd             m1, m15, m5    ; a3
   4606    pshufb               m6, m21
   4607    vpdpwssd             m2, m15, m6    ; b3
   4608    vpdpwssd             m3, m13, m6    ; c1
   4609    pshufb               m0, m22, m20
   4610    mova                 m4, m10
   4611    vpdpwssd             m4, m12, m0    ; d0
   4612    pshufb               m0, m23, m20
   4613    mova                 m5, m10
   4614    vpdpwssd             m5, m14, m0    ; e2
   4615    pshufb               m0, m24, m20
   4616    mova                 m6, m10
   4617    vpdpwssd             m6, m12, m0    ; f0
   4618    pshufb               m0, m25, m20
   4619    mova                 m7, m10
   4620    vpdpwssd             m7, m14, m0    ; g2
   4621    pshufb               m0, m22, m21
   4622    vpdpwssd             m4, m13, m0    ; d1
   4623    pshufb               m0, m23, m21
   4624    vpdpwssd             m5, m15, m0    ; e3
   4625    pshufb               m0, m24, m21
   4626    vpdpwssd             m6, m13, m0    ; f1
   4627    pshufb               m0, m25, m21
   4628    vpdpwssd             m7, m15, m0    ; g3
   4629    shufpd              m22, m23, 0x55
   4630    pshufb              m23, m22, m20
   4631    vpdpwssd             m4, m14, m23   ; d2
   4632    vpdpwssd             m5, m12, m23   ; e0
   4633    shufpd              m24, m25, 0x55
   4634    pshufb              m25, m24, m20
   4635    vpdpwssd             m6, m14, m25   ; f2
   4636    vpdpwssd             m7, m12, m25   ; g0
   4637    pshufb              m22, m21
   4638    vpdpwssd             m4, m15, m22   ; d3
   4639    vpdpwssd             m5, m13, m22   ; e1
   4640    pshufb              m24, m21
   4641    vpdpwssd             m6, m15, m24   ; f3
   4642    vpdpwssd             m7, m13, m24   ; g1
   ; Pack the filtered rows into the interleaved row-pair operands for
   ; the vertical pass via spel_shuf16 + 16-bit concat-shifts (vpshrdd).
   4643    pslldq               m1, 1
   4644    vpermt2b             m2, m9, m3     ; 12
   4645    vpermt2b             m4, m9, m5     ; 34
   4646    vpermt2b             m6, m9, m7     ; 56
   4647    vpshrdd              m1, m2, 16     ; 01
   4648    vpshrdd              m3, m2, m4, 16 ; 23
   4649    vpshrdd              m5, m4, m6, 16 ; 45
   4650 .hv_w16_loop:
   ; Two output rows per iteration: horizontally filter rows h/i while
   ; the vertical 4-pair filter consumes the sliding window (A/B rows).
   4651    movu               ym24, [r7+strideq*1+ 0]
   4652    movu               ym25, [r7+strideq*1+16]
   4653    lea                  r7, [r7+strideq*2]
   4654    vinserti32x8        m24, [r7+strideq*0+ 0], 1
   4655    vinserti32x8        m25, [r7+strideq*0+16], 1
   4656    mova                 m7, m10
   4657    mova                 m8, m10
   4658    pshufb               m0, m24, m20
   4659    vpdpwssd             m7, m12, m0    ; h0
   4660    mova                m22, m11
   4661    pshufb               m0, m25, m20
   4662    vpdpwssd             m8, m14, m0    ; i2
   4663    mova                m23, m11
   4664    vpdpwssd            m22, m16, m1    ; A0
   4665    mova                 m1, m3
   4666    vpdpwssd            m23, m16, m2    ; B0
   4667    mova                 m2, m4
   4668    pshufb               m0, m24, m21
   4669    vpdpwssd             m7, m13, m0    ; h1
   4670    pshufb               m0, m25, m21
   4671    vpdpwssd             m8, m15, m0    ; i3
   4672    vpdpwssd            m22, m17, m3    ; A1
   4673    mova                 m3, m5
   4674    vpdpwssd            m23, m17, m4    ; B1
   4675    mova                 m4, m6
   4676    shufpd              m24, m25, 0x55
   4677    pshufb              m25, m24, m20
   4678    vpdpwssd             m7, m14, m25   ; h2
   4679    vpdpwssd             m8, m12, m25   ; i0
   4680    vpdpwssd            m22, m18, m5    ; A2
   4681    vpdpwssd            m23, m18, m6    ; B2
   4682    pshufb              m24, m21
   4683    vpdpwssd             m7, m15, m24   ; h3
   4684    vpdpwssd             m8, m13, m24   ; i1
   4685    vpermt2b             m7, m9, m8     ; 78
   4686    vpshrdd              m5, m6, m7, 16 ; 67
   4687    vpdpwssd            m22, m19, m5    ; A3
   4688    vpdpwssd            m23, m19, m7    ; B3
   4689    mova                 m6, m7
   4690    vpermt2b            m22, m26, m23
   4691    mova          [r8+wq*0], ym22
   4692    vextracti32x8 [r8+wq*1], m22, 1
   4693    lea                  r8, [r8+wq*2]
   4694    sub                  hd, 2
   4695    jg .hv_w16_loop
   ; Advance to the next 16-pixel column strip.
   4696    add                srcq, 32
   4697    add                tmpq, 32
   4698    movzx                hd, r5b
   4699    sub                 r5d, 1<<8
   4700    jg .hv_w16_loop0
   4701 %if WIN64
   4702    pop                  r8
   4703 %endif
   4704    RET
   4705 
   4706 %if WIN64
   4707 DECLARE_REG_TMP 5
   4708 %else
   4709 DECLARE_REG_TMP 7
   4710 %endif
   4711 
   ; warp_affine_8x8t: warped prediction into the intermediate (prep)
   ; buffer.  Shares .main/.main2/.end with warp_affine_8x8_16bpc below
   ; via mangled cross-function calls; differs in the rounding constant
   ; (warp_8x8t_rnd_v), the 15-bit (vs 13-bit) downshift, and signed
   ; (packssdw) rather than unsigned packing.  t0d = r7m >> 11 selects
   ; the 10/12-bit constant variant.
   4712 cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts
   4713 %define base r6-pd_0to7
   4714    mov                 t0d, r7m
   4715    lea                  r6, [pd_0to7]
   4716    shr                 t0d, 11
   4717    vpbroadcastd         m8, [base+warp_8x8t_rnd_v]
   4718    vpbroadcastd         m1, [base+warp_8x8_rnd_h+t0*4]
   ; .main produces rows 0-1 in m16; each .main2 call advances two rows.
   4719    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main
   4720    psrad               m14, m16, 15
   4721    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
   4722    psrad               m16, 15
   4723    packssdw            m14, m16
   4724    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
   4725    psrad               m15, m16, 15
   4726    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
   4727    add                 tsq, tsq
   4728    psrad               m16, 15
   4729    packssdw            m15, m16
   4730    jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end
   4731 
   ; warp_affine_8x8: 8x8 warped prediction to the destination picture.
   ; Args: dst, ds (dst stride), src, ss (src stride), abcd (affine
   ; params; alpha/beta at [abcd+0], gamma/delta at [abcd+4]); r6m/r7m =
   ; mx/my, r7m also carries pixel_max for constant selection (t0d =
   ; pixel_max >> 11 picks the 10/12-bit rounding/shift variants).
   ; Layout: .main does full setup + first two rows; .main2 produces two
   ; more rows in m16 per call; .h is one horizontal filter pass over
   ; two source rows; .end stores both packed 4-row halves (m14/m15).
   4732 cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd
   4733    mov                 t0d, r7m ; pixel_max
   4734    lea                  r6, [pd_0to7]
   4735    shr                 t0d, 11
   4736    vpbroadcastd         m1, [base+warp_8x8_rnd_h+t0*4]
   4737    vpbroadcastd         m8, [base+warp_8x8_rnd_v+t0*4]
   4738    call .main
   4739    psrad               m14, m16, 13
   4740    call .main2
   4741    psrad               m16, 13
   4742    packusdw            m14, m16
   4743    call .main2
   4744    psrad               m15, m16, 13
   4745    call .main2
   4746    vpbroadcastd         m0, [base+bidir_shift+t0*4]
   4747    vpsrlvw             m14, m0
   4748    psrad               m16, 13
   4749    packusdw            m15, m16
   4750    vpsrlvw             m15, m0
   4751 .end:
   ; Store 8 rows: m14 = rows 0-3, m15 = rows 4-7, reordered by the
   ; warp8x8_end byte permute into per-row xmm lanes.
   4752    mova                 m0, [base+warp8x8_end]
   4753    vpermb              m16, m0, m14
   4754    lea                  r2, [dsq*3]
   4755    mova          [dstq+dsq*0], xm16
   4756    vextracti128  [dstq+dsq*1], ym16, 1
   4757    vextracti32x4 [dstq+dsq*2], m16, 2
   4758    vextracti32x4 [dstq+r2   ], m16, 3
   4759    vpermb              m16, m0, m15
   4760    lea                dstq, [dstq+dsq*4]
   4761    mova          [dstq+dsq*0], xm16
   4762    vextracti128  [dstq+dsq*1], ym16, 1
   4763    vextracti32x4 [dstq+dsq*2], m16, 2
   4764    vextracti32x4 [dstq+r2   ], m16, 3
   4765    RET
   4766 .main:
   ; Setup: compute per-column filter positions tmx/tmy = (mx|my + 512)
   ; + alpha|gamma * [0..7], load permute tables, then horizontally
   ; filter the first 7 rows, building the 01/12, 23/34, 45/56 row-pair
   ; windows (m1-m3) for the vertical pass.
   4767    vpbroadcastd        ym3, [base+pd_512]
   4768 %if WIN64
   4769    mov               abcdq, r5mp
   4770    vpaddd             ym18, ym3, r6m {1to8} ; mx
   4771 %else
   4772    add                 r5d, 512
   4773    vpbroadcastd       ym18, r5d
   4774 %endif
   4775    vpaddd             ym20, ym3, r7m {1to8} ; my
   4776    mova               ym16, [base+pd_0to7]
   4777    vpbroadcastd       ym19, [abcdq+4*0]     ; alpha
   4778    vpbroadcastd       ym21, [abcdq+4*1]     ; gamma
   4779    lea                  r4, [ssq*3+6]
   4780    vpdpwssd           ym18, ym19, ym16      ; tmx
   4781    vpdpwssd           ym20, ym21, ym16      ; tmy
   4782    sub                srcq, r4
   4783    mova                m10, [base+warp8x8_permA]
   4784    lea                  r4, [mc_warp_filter+64*8]
   4785    vbroadcasti32x4     m12, [base+warp8x8_permC]
   4786    kxnorb               k1, k1, k1
   4787    vbroadcasti32x4     m13, [base+warp8x8_permD]
   4788    movu                ym5, [srcq+0]
   4789    vinserti32x8         m5, [srcq+8], 1
   4790    psrad              ym17, ym18, 10
   4791    mova                m11, [base+warp8x8_permB]
   ; k1/k2 ping-pong: gathers clear their mask, so two all-ones masks
   ; are kept alive by copying back and forth.
   4792    kmovb                k2, k1
   4793    vpgatherdq       m3{k1}, [r4+ym17*8]    ; filter_x0
   4794    psrad              ym19, 16             ; beta
   4795    psrad              ym21, 16             ; delta
   4796    paddd              ym18, ym19
   4797    vpermb               m4, m10, m5
   4798    vpbroadcastq         m9, [base+warp_shift_h+t0*8]
   4799    pshufd               m3, m3, q3120
   4800    paddd                m7, m1, m1
   4801    pshufb               m2, m3, m12
   4802    vpdpwssd             m1, m4, m2
   4803    vpermb               m5, m11, m5
   4804    vshufi32x4           m4, m5, q1021
   4805    pshufb               m3, m13
   4806    vpdpwssd             m1, m4, m3
   4807    call .h
   4808    psllq                m2, m1, 32
   4809    paddd                m1, m2
   4810    vpmultishiftqb       m1, m9, m1
   4811    vpshrdq              m1, m0, 48          ; 01 12
   4812    call .h
   4813    vpshrdq              m2, m1, m0, 48      ; 23 34
   4814    call .h
   4815    vpshrdq              m3, m2, m0, 48      ; 45 56
   4816 .main2:
   ; Produce two output rows: gather the two vertical filters selected
   ; by tmy, apply the 4 row-pair taps, and slide the window (m1-m3).
   4817    call .h
   4818    psrad               ym6, ym20, 10
   4819    kmovb                k1, k2
   4820    paddd              ym17, ym20, ym21      ; my += delta
   4821    vpgatherdq      m20{k2}, [r4+ym6*8]      ; filter_y0
   4822    psrad              ym16, ym17, 10
   4823    kmovb                k2, k1
   4824    vpgatherdq       m6{k1}, [r4+ym16*8]     ; filter_y1
   4825    shufps               m5, m20, m6, q2020
   4826    mova                m16, m8
   4827    pshufb               m4, m5, m12
   4828    vpdpwssd            m16, m1, m4          ; a0 b0
   4829    pshufb               m5, m13
   4830    mova                 m1, m2
   4831    vpdpwssd            m16, m2, m5          ; a1 b1
   4832    shufps               m6, m20, m6, q3131
   4833    paddd              ym20, ym17, ym21
   4834    pshufb               m4, m6, m12
   4835    mova                 m2, m3
   4836    vpdpwssd            m16, m3, m4          ; a2 b2
   4837    vpshrdq              m3, m0, 48          ; 67 78
   4838    pshufb               m6, m13
   4839    vpdpwssd            m16, m3, m6          ; a3 b3
   4840    ret
   4841 ALIGN function_align
   4842 .h:
   ; Horizontal pass over the next two source rows: gather the two
   ; x-filters selected by tmx (advancing mx by beta per row), apply 4
   ; tap pairs, and narrow via the per-bitdepth vpmultishiftqb shift.
   ; Result in m0 as "a a b b"; clobbers m4-m6, m16-m18.
   4843    movu               ym16, [srcq+ssq*1]
   4844    psrad               ym6, ym18, 10
   4845    lea                srcq, [srcq+ssq*2]
   4846    vinserti32x8         m5, m16, [srcq+ssq*0], 1
   4847    kmovb                k1, k2
   4848    paddd              ym17, ym18, ym19      ; mx += beta
   4849    vpgatherdq      m18{k2}, [r4+ym6*8]      ; filter_x1
   4850    psrad              ym16, ym17, 10
   4851    kmovb                k2, k1
   4852    vpgatherdq       m6{k1}, [r4+ym16*8]     ; filter_x2
   4853    vpermb               m4, m10, m5
   4854    shufps              m16, m18, m6, q2020
   4855    shufps               m6, m18, m6, q3131
   4856    mova                 m0, m7
   4857    pshufb              m18, m16, m12
   4858    vpdpwssd             m0, m4, m18         ; a0 b0
   4859    vpermb               m5, m11, m5
   4860    pshufb              m18, m6, m13
   4861    vpdpwssd             m0, m5, m18         ; a3 b3
   4862    paddd              ym18, ym17, ym19
   4863    vshufi32x4          m17, m4, m5, q1021
   4864    pshufb              m16, m13
   4865    vpdpwssd             m0, m17, m16        ; a1 b1
   4866    vshufi32x4           m4, m5, q2132
   4867    pshufb               m6, m12
   4868    vpdpwssd             m0, m4, m6          ; a2 b2
   4869    vpmultishiftqb       m0, m9, m0          ; a a b b
   4870    ret
   4871 
   ; Shared store/dispatch tail for the bidirectional compound functions
   ; (avg/w_avg/mask below).  Contract: the instantiating function set
   ; up wq as a jump-table target and hd as the row count, and provides
   ; a .main that computes the next batch of output pixels into m0/m1
   ; (128 16-bit pixels per call) while advancing its input pointers.
   ; Each .wN label stores those pixels for width N and loops via .main.
   4872 %macro BIDIR_FN 0
   4873    call .main
   4874    lea            stride3q, [strideq*3]
   4875    jmp                  wq
   4876 .w4:
   ; w4: one call of .main covers up to 16 rows (8 qwords per zmm);
   ; the h checks below select how much of m0/m1 is actually stored.
   4877    movq   [dstq          ], xm0
   4878    movhps [dstq+strideq*1], xm0
   4879    vextracti32x4       xm2, ym0, 1
   4880    movq   [dstq+strideq*2], xm2
   4881    movhps [dstq+stride3q ], xm2
   4882    cmp                  hd, 8
   4883    jl .w4_end
   4884    vextracti32x4       xm2, m0, 2
   4885    lea                dstq, [dstq+strideq*4]
   4886    movq   [dstq          ], xm2
   4887    movhps [dstq+strideq*1], xm2
   4888    vextracti32x4       xm0, m0, 3
   4889    movq   [dstq+strideq*2], xm0
   4890    movhps [dstq+stride3q ], xm0
   4891    je .w4_end
   4892    lea                dstq, [dstq+strideq*4]
   4893    movq   [dstq          ], xm1
   4894    movhps [dstq+strideq*1], xm1
   4895    vextracti32x4       xm0, ym1, 1
   4896    movq   [dstq+strideq*2], xm0
   4897    movhps [dstq+stride3q ], xm0
   4898    vextracti32x4       xm0, m1, 2
   4899    lea                dstq, [dstq+strideq*4]
   4900    movq   [dstq          ], xm0
   4901    movhps [dstq+strideq*1], xm0
   4902    vextracti32x4       xm1, m1, 3
   4903    movq   [dstq+strideq*2], xm1
   4904    movhps [dstq+stride3q ], xm1
   4905 .w4_end:
   4906    RET
   4907 .w8_loop:
   4908    call .main
   4909    lea                dstq, [dstq+strideq*4]
   4910 .w8:
   ; w8: 8 rows per .main call (4 rows per zmm).
   4911    mova          [dstq+strideq*0], xm0
   4912    vextracti32x4 [dstq+strideq*1], ym0, 1
   4913    vextracti32x4 [dstq+strideq*2], m0, 2
   4914    vextracti32x4 [dstq+stride3q ], m0, 3
   4915    sub                  hd, 8
   4916    jl .w8_end
   4917    lea                dstq, [dstq+strideq*4]
   4918    mova          [dstq+strideq*0], xm1
   4919    vextracti32x4 [dstq+strideq*1], ym1, 1
   4920    vextracti32x4 [dstq+strideq*2], m1, 2
   4921    vextracti32x4 [dstq+stride3q ], m1, 3
   4922    jg .w8_loop
   4923 .w8_end:
   4924    RET
   4925 .w16_loop:
   4926    call .main
   4927    lea                dstq, [dstq+strideq*4]
   4928 .w16:
   ; w16: 4 rows per .main call (2 rows per zmm).
   4929    mova          [dstq+strideq*0], ym0
   4930    vextracti32x8 [dstq+strideq*1], m0, 1
   4931    mova          [dstq+strideq*2], ym1
   4932    vextracti32x8 [dstq+stride3q ], m1, 1
   4933    sub                  hd, 4
   4934    jg .w16_loop
   4935    RET
   4936 .w32_loop:
   4937    call .main
   4938    lea                dstq, [dstq+strideq*2]
   4939 .w32:
   ; w32: 2 rows per .main call (1 row per zmm).
   4940    mova   [dstq+strideq*0], m0
   4941    mova   [dstq+strideq*1], m1
   4942    sub                  hd, 2
   4943    jg .w32_loop
   4944    RET
   4945 .w64_loop:
   4946    call .main
   4947    add                dstq, strideq
   4948 .w64:
   ; w64: 1 row per .main call.
   4949    mova        [dstq+64*0], m0
   4950    mova        [dstq+64*1], m1
   4951    dec                  hd
   4952    jg .w64_loop
   4953    RET
   4954 .w128_loop:
   4955    call .main
   4956    add                dstq, strideq
   4957 .w128:
   ; w128: two .main calls per row.
   4958    mova        [dstq+64*0], m0
   4959    mova        [dstq+64*1], m1
   4960    call .main
   4961    mova        [dstq+64*2], m0
   4962    mova        [dstq+64*3], m1
   4963    dec                  hd
   4964    jg .w128_loop
   4965    RET
   4966 %endmacro
   4967 
   4968 %if WIN64
   4969 DECLARE_REG_TMP 5
   4970 %else
   4971 DECLARE_REG_TMP 7
   4972 %endif
   4973 
   ; avg: plain bidirectional average of two intermediate buffers.
   ; r6m = pixel_max; r6m >> 11 selects the 10/12-bit rounding (m2) and
   ; per-word shift (m3) constants.  Dispatch/stores come from BIDIR_FN.
   4974 cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
   4975 %define base r6-avg_avx512icl_table
   4976    lea                  r6, [avg_avx512icl_table]
   4977    tzcnt                wd, wm
   4978    mov                 t0d, r6m ; pixel_max
   4979    movsxd               wq, [r6+wq*4]
   4980    shr                 t0d, 11
   4981    vpbroadcastd         m2, [base+avg_round+t0*4]
   4982    vpbroadcastd         m3, [base+avg_shift+t0*4]
   4983    movifnidn            hd, hm
   4984    add                  wq, r6
   4985    BIDIR_FN
   4986 ALIGN function_align
   4987 .main:
   ; m0/m1 = 128 averaged pixels: tmp1 + tmp2 with signed saturation,
   ; then pmaxsw/psubsw against the bias in m2 (clamps from below and
   ; removes the bias in one pair of ops), then a per-word right shift.
   4988    mova                 m0, [tmp1q+64*0]
   4989    paddsw               m0, [tmp2q+64*0]
   4990    mova                 m1, [tmp1q+64*1]
   4991    paddsw               m1, [tmp2q+64*1]
   4992    add               tmp1q, 64*2
   4993    add               tmp2q, 64*2
   4994    pmaxsw               m0, m2
   4995    pmaxsw               m1, m2
   4996    psubsw               m0, m2
   4997    psubsw               m1, m2
   4998    vpsrlvw              m0, m3
   4999    vpsrlvw              m1, m3
   5000    ret
   5001 
   ; w_avg: weighted bidirectional average.  r6m = weight; the scalar
   ; arithmetic below packs (weight << 16) + (16 - weight) into each
   ; dword of m6 so one vpdpwssd computes weight*tmp1 + (16-weight)*tmp2
   ; per pixel pair (tmp2 in the low word, tmp1 in the high word of
   ; each interleaved pair).  r7m = pixel_max selects the rounding (m5)
   ; and final shift (m7) constants.
   5002 cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
   5003 %define base r6-w_avg_avx512icl_table
   5004    lea                  r6, [w_avg_avx512icl_table]
   5005    tzcnt                wd, wm
   5006    mov                 t0d, r7m ; pixel_max
   5007    shr                 t0d, 11
   5008    movsxd               wq, [r6+wq*4]
   5009    vpbroadcastd         m5, [base+w_avg_round+t0*4]
   5010    vpbroadcastd         m7, [base+bidir_shift+t0*4]
   5011    add                  wq, r6
   5012    mov                 r6d, r6m ; weight
   5013    lea                 t0d, [r6-16]
   5014    shl                 r6d, 16
   5015    sub                 r6d, t0d ; 16-weight, weight
   5016    movifnidn            hd, hm
   5017    vpbroadcastd         m6, r6d
   5018    BIDIR_FN
   5019 ALIGN function_align
   5020 .main:
   ; Interleave tmp2/tmp1 words, dot-product against the packed weights
   ; with the rounding bias pre-loaded, then shift/pack back to words.
   5021    mova                 m3, [tmp1q+64*0]
   5022    mova                 m1, [tmp2q+64*0]
   5023    mova                 m0, [tmp1q+64*1]
   5024    mova                 m4, [tmp2q+64*1]
   5025    add               tmp1q, 64*2
   5026    add               tmp2q, 64*2
   5027    punpcklwd            m2, m1, m3
   5028    punpckhwd            m1, m3
   5029    punpcklwd            m3, m4, m0
   5030    punpckhwd            m4, m0
   5031    mova                 m0, m5
   5032    vpdpwssd             m0, m6, m2
   5033    mova                 m2, m5
   5034    vpdpwssd             m2, m6, m1
   5035    mova                 m1, m5
   5036    vpdpwssd             m1, m6, m3
   5037    mova                 m3, m5
   5038    vpdpwssd             m3, m6, m4
   5039    REPX       {psrad x, 2}, m0, m2, m1, m3
   5040    packusdw             m0, m2
   5041    packusdw             m1, m3
   5042    vpsrlvw              m0, m7
   5043    vpsrlvw              m1, m7
   5044    ret
   5045 
   ; mask: per-pixel masked blend of two intermediate buffers using a
   ; 6-bit weight mask: dst = (tmp1*m + tmp2*(64-m) + rnd) scaled back
   ; to pixels.  r7m = pixel_max selects the rounding (m9) and final
   ; shift (m10) constants; m8 = broadcast 64 for computing 64-m.
   5046 cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
   5047 %define base r7-mask_avx512icl_table
   5048    lea                  r7, [mask_avx512icl_table]
   5049    tzcnt                wd, wm
   5050    mov                 r6d, r7m ; pixel_max
   5051    movifnidn            hd, hm
   5052    shr                 r6d, 11
   5053    movsxd               wq, [r7+wq*4]
   5054    vpbroadcastd         m8, [base+pw_64]
   5055    vpbroadcastd         m9, [base+mask_round+r6*4]
   5056    vpbroadcastd        m10, [base+bidir_shift+r6*4]
   5057    mov               maskq, maskmp
   5058    add                  wq, r7
   5059    BIDIR_FN
   5060 ALIGN function_align
   5061 .main:
   ; Widen 64 mask bytes, interleave tmp1/tmp2 words against (m, 64-m)
   ; word pairs, and blend via vpdpwssd with the rounding pre-loaded.
   5062    pmovzxbw             m1, [maskq+32*0]
   5063    mova                 m4, [tmp1q+64*0]
   5064    mova                 m2, [tmp2q+64*0]
   5065    pmovzxbw             m6, [maskq+32*1]
   5066    mova                 m5, [tmp1q+64*1]
   5067    mova                 m3, [tmp2q+64*1]
   5068    add               maskq, 32*2
   5069    add               tmp1q, 64*2
   5070    add               tmp2q, 64*2
   5071    punpcklwd            m7, m4, m2
   5072    punpckhwd            m4, m2
   5073    psubw                m0, m8, m1
   5074    punpcklwd            m2, m1, m0 ; m, 64-m
   5075    punpckhwd            m1, m0
   5076    mova                 m0, m9
   5077    vpdpwssd             m0, m7, m2
   5078    mova                 m2, m9
   5079    vpdpwssd             m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
   5080    punpcklwd            m7, m5, m3
   5081    punpckhwd            m5, m3
   5082    psubw                m1, m8, m6
   5083    punpcklwd            m3, m6, m1
   5084    punpckhwd            m6, m1
   5085    mova                 m1, m9
   5086    vpdpwssd             m1, m7, m3
   5087    mova                 m3, m9
   5088    vpdpwssd             m3, m5, m6
   5089    REPX       {psrad x, 4}, m0, m2, m1, m3
   5090    packusdw             m0, m2
   5091    packusdw             m1, m3
   5092    vpsrlvw              m0, m10
   5093    vpsrlvw              m1, m10
   5094    ret
   5095 
   5096 cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
   5097 %define base r7-w_mask_420_avx512icl_table
; w_mask blend for 16-bpc with a 4:2:0-subsampled output mask.
; Blends two intermediate prediction buffers (tmp1, tmp2) into dst using a
; per-pixel weight m derived from |tmp1 - tmp2|, and stores one mask byte per
; 2x2 pixel block (hence the extra vertical/horizontal mask accumulation in
; the wide-width paths below).
; Register roles after setup:
;   m10 = pw_27615 clamp constant, m11 = pw_64, m12 = blend rounding constant,
;   m13 = final per-pixel right-shift (bitdepth dependent),
;   m14 = mask rounding constant (sign-dependent), ym15 = byte-gather shuffle
;   used to extract the finished mask bytes (w_mask_end42x).
   5098    lea                  r7, [w_mask_420_avx512icl_table]
   5099    tzcnt                wd, wm
   5100    mov                 r6d, r8m ; pixel_max
   5101    movifnidn            hd, hm
   5102    shr                 r6d, 11  ; 0 for 10-bit, 1 for 12-bit tables
   5103    movsxd               wq, [r7+wq*4]
   5104    vpbroadcastd        m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
   5105    vpbroadcastd        m11, [base+pw_64]
   5106    vpbroadcastd        m12, [base+mask_round+r6*4]
   5107    vpbroadcastd        m13, [base+bidir_shift+r6*4]
   5108    mov                 r6d, r7m ; sign
   5109    vpbroadcastd        m14, [base+w_mask_round+r6*4]
   5110    mova               ym15, [w_mask_end42x]
   5111    mov               maskq, maskmp
   5112    add                  wq, r7
   5113    call .main               ; first two rows of pixels + mask words in m2/m3
   5114    lea            stride3q, [strideq*3]
   5115    jmp                  wq  ; dispatch on width
   5116 .w4:
; 4-wide: the 2x2 mask reduction is done bytewise; w_mask_shuf4 packs the
; mask words so one vpdpbusd against pb_64 sums each 2x2 group at once.
   5117    mova                 m4, [w_mask_shuf4]
   5118    vpermt2b             m2, m4, m3
   5119    mova                 m3, m14
   5120    vpdpbusd             m3, m2, [pb_64] {1to16}
   5121    vpermb               m3, m15, m3 ; gather final mask bytes
   5122    movq   [dstq+strideq*0], xm0
   5123    movhps [dstq+strideq*1], xm0
   5124    vextracti32x4       xm2, ym0, 1
   5125    movq   [dstq+strideq*2], xm2
   5126    movhps [dstq+stride3q ], xm2
   5127    mova            [maskq], xm3
   5128    cmp                  hd, 8
   5129    jl .w4_end
   5130    vextracti32x4       xm2, m0, 2
   5131    lea                dstq, [dstq+strideq*4]
   5132    movq   [dstq+strideq*0], xm2
   5133    movhps [dstq+strideq*1], xm2
   5134    vextracti32x4       xm0, m0, 3
   5135    movq   [dstq+strideq*2], xm0
   5136    movhps [dstq+stride3q ], xm0
   5137    je .w4_end
   5138    lea                dstq, [dstq+strideq*4]
   5139    movq   [dstq+strideq*0], xm1
   5140    movhps [dstq+strideq*1], xm1
   5141    vextracti32x4       xm2, ym1, 1
   5142    movq   [dstq+strideq*2], xm2
   5143    movhps [dstq+stride3q ], xm2
   5144    vextracti32x4       xm2, m1, 2
   5145    lea                dstq, [dstq+strideq*4]
   5146    movq   [dstq+strideq*0], xm2
   5147    movhps [dstq+strideq*1], xm2
   5148    vextracti32x4       xm1, m1, 3
   5149    movq   [dstq+strideq*2], xm1
   5150    movhps [dstq+stride3q ], xm1
   5151 .w4_end:
   5152    RET
   5153 .w8:
   5154    mova                 m8, [w_mask_shuf8]
   5155    vpbroadcastd         m9, [pb_64]
   5156    jmp .w8_start
   5157 .w8_loop:
   5158    call .main
   5159    lea                dstq, [dstq+strideq*4]
   5160    add               maskq, 16
   5161 .w8_start:
; Same bytewise 2x2 mask reduction as .w4 but with the 8-wide shuffle table.
   5162    vpermt2b             m2, m8, m3
   5163    mova                 m3, m14
   5164    vpdpbusd             m3, m2, m9
   5165    vpermb               m3, m15, m3
   5166    mova          [dstq+strideq*0], xm0
   5167    vextracti32x4 [dstq+strideq*1], ym0, 1
   5168    vextracti32x4 [dstq+strideq*2], m0, 2
   5169    vextracti32x4 [dstq+stride3q ], m0, 3
   5170    mova            [maskq], xm3
   5171    sub                  hd, 8
   5172    jl .w8_end
   5173    lea                dstq, [dstq+strideq*4]
   5174    mova          [dstq+strideq*0], xm1
   5175    vextracti32x4 [dstq+strideq*1], ym1, 1
   5176    vextracti32x4 [dstq+strideq*2], m1, 2
   5177    vextracti32x4 [dstq+stride3q ], m1, 3
   5178    jg .w8_loop
   5179 .w8_end:
   5180    RET
   5181 .w16:
   5182    mova                 m8, [w_mask_shuf16]
   5183    vpbroadcastd         m9, [pb_64]
   5184    jmp .w16_start
   5185 .w16_loop:
   5186    call .main
   5187    lea                dstq, [dstq+strideq*4]
   5188    add               maskq, 16
   5189 .w16_start:
   5190    vpermt2b             m2, m8, m3
   5191    mova                 m3, m14
   5192    vpdpbusd             m3, m2, m9
   5193    vpermb               m3, m15, m3
   5194    mova          [dstq+strideq*0], ym0
   5195    vextracti32x8 [dstq+strideq*1], m0, 1
   5196    mova          [dstq+strideq*2], ym1
   5197    vextracti32x8 [dstq+stride3q ], m1, 1
   5198    mova            [maskq], xm3
   5199    sub                  hd, 4
   5200    jg .w16_loop
   5201    RET
   5202 .w32_loop:
   5203    call .main
   5204    lea                dstq, [dstq+strideq*4]
   5205    add               maskq, 32
   5206 .w32:
; 32-wide: m2/m3 hold the mask words of two successive rows; paddw sums them
; vertically, then the vpdpwssd against pw_64 folds horizontal pairs into the
; rounding constant m14 (m8/m3 held live across the second .main call).
   5207    paddw                m2, m3
   5208    mova                 m8, m14
   5209    vpdpwssd             m8, m11, m2
   5210    mova   [dstq+strideq*0], m0
   5211    mova   [dstq+strideq*1], m1
   5212    call .main
   5213    paddw                m2, m3
   5214    mova                 m3, m14
   5215    vpdpwssd             m3, m11, m2
   5216    vpermt2b             m8, m15, m3 ; extract final mask bytes for 4 rows
   5217    mova   [dstq+strideq*2], m0
   5218    mova   [dstq+stride3q ], m1
   5219    mova            [maskq], ym8
   5220    sub                  hd, 4
   5221    jg .w32_loop
   5222    RET
   5223 .w64_loop:
   5224    call .main
   5225    lea                dstq, [dstq+strideq*2]
   5226    add               maskq, 32
   5227 .w64:
; 64-wide: one .main call covers one full row; keep the first row's mask
; words in m8/m9 while the second row is produced, then combine as in .w32.
   5228    mova                 m8, m2
   5229    mova                 m9, m3
   5230    mova [dstq+strideq*0+64*0], m0
   5231    mova [dstq+strideq*0+64*1], m1
   5232    call .main
   5233    paddw                m8, m2
   5234    paddw                m9, m3
   5235    mova                 m2, m14
   5236    vpdpwssd             m2, m11, m8
   5237    mova                 m3, m14
   5238    vpdpwssd             m3, m11, m9
   5239    vpermt2b             m2, m15, m3
   5240    mova [dstq+strideq*1+64*0], m0
   5241    mova [dstq+strideq*1+64*1], m1
   5242    mova            [maskq], ym2
   5243    sub                  hd, 2
   5244    jg .w64_loop
   5245    RET
   5246 .w128_loop:
   5247    call .main
   5248    lea                dstq, [dstq+strideq*2]
   5249    add               maskq, 64
   5250 .w128:
; 128-wide: two .main calls per row; m16/m17 (beyond the declared 16 regs,
; hence the trailing vzeroupper) carry the left/right halves of the first
; row's mask words across the second row's calls.
   5251    mova               m16, m2
   5252    mova                m8, m3
   5253    mova [dstq+strideq*0+64*0], m0
   5254    mova [dstq+strideq*0+64*1], m1
   5255    call .main
   5256    mova                m17, m2
   5257    mova                 m9, m3
   5258    mova [dstq+strideq*0+64*2], m0
   5259    mova [dstq+strideq*0+64*3], m1
   5260    call .main
   5261    paddw                m2, m16
   5262    paddw                m3, m8
   5263    mova                m16, m14
   5264    vpdpwssd            m16, m11, m2
   5265    mova                 m8, m14
   5266    vpdpwssd             m8, m11, m3
   5267    mova [dstq+strideq*1+64*0], m0
   5268    mova [dstq+strideq*1+64*1], m1
   5269    call .main
   5270    paddw                m2, m17
   5271    paddw                m3, m9
   5272    mova                m17, m14
   5273    vpdpwssd            m17, m11, m2
   5274    mova                 m9, m14
   5275    vpdpwssd             m9, m11, m3
   5276    vpermt2b            m16, m15, m8
   5277    vpermt2b            m17, m15, m9
   5278    mova [dstq+strideq*1+64*2], m0
   5279    mova [dstq+strideq*1+64*3], m1
   5280    mova       [maskq+32*0], ym16
   5281    mova       [maskq+32*1], ym17
   5282    sub                  hd, 2
   5283    jg .w128_loop
   5284    vzeroupper
   5285    RET
   5286 ALIGN function_align
   5287 .main:
; Consumes 2x64 bytes from tmp1/tmp2 and produces:
;   m0/m1 = blended, shifted output pixels (ready to store)
;   m2/m3 = per-pixel weights m (words), for the caller's 420 mask reduction
; Weight derivation: m6 = (27615 - |tmp1-tmp2|) >> 10 gives 64-m (clamped),
; then m = 64 - (64-m). The blend is a VNNI word dot-product of interleaved
; (tmp2,tmp1) against (64-m,m) pairs accumulated onto the rounding constant.
   5288    mova                 m1, [tmp1q+64*0]
   5289    mova                 m3, [tmp2q+64*0]
   5290    mova                 m4, [tmp1q+64*1]
   5291    mova                 m7, [tmp2q+64*1]
   5292    add               tmp1q, 64*2
   5293    add               tmp2q, 64*2
   5294    psubsw               m6, m1, m3
   5295    punpcklwd            m5, m3, m1  ; interleave tmp2,tmp1 (low)
   5296    pabsw                m6, m6      ; |tmp1 - tmp2|
   5297    punpckhwd            m3, m1      ; interleave tmp2,tmp1 (high)
   5298    psubusw              m6, m10, m6 ; unsigned-saturating clamp
   5299    psrlw                m6, 10      ; 64-m
   5300    psubw                m2, m11, m6 ; m
   5301    punpcklwd            m1, m6, m2  ; (64-m, m) pairs (low)
   5302    punpckhwd            m6, m2
   5303    mova                 m0, m12
   5304    vpdpwssd             m0, m5, m1  ; tmp2*(64-m) + tmp1*m + round
   5305    mova                 m1, m12
   5306    vpdpwssd             m1, m3, m6
   5307    psubsw               m5, m4, m7  ; same for second 32 pixels
   5308    punpcklwd            m6, m7, m4
   5309    pabsw                m5, m5
   5310    punpckhwd            m7, m4
   5311    psubusw              m5, m10, m5
   5312    psrlw                m5, 10
   5313    psubw                m3, m11, m5
   5314    punpcklwd            m4, m5, m3
   5315    psrad                m0, 4
   5316    punpckhwd            m5, m3
   5317    psrad                m1, 4
   5318    packusdw             m0, m1
   5319    mova                 m1, m12
   5320    vpdpwssd             m1, m6, m4
   5321    mova                 m4, m12
   5322    vpdpwssd             m4, m7, m5
   5323    psrad                m1, 4
   5324    psrad                m4, 4
   5325    packusdw             m1, m4
   5326    vpsrlvw              m0, m13     ; final bitdepth shift
   5327    vpsrlvw              m1, m13
   5328    ret
   5329 
   5330 cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
   5331 %define base r7-w_mask_422_avx512icl_table
; w_mask blend for 16-bpc with a 4:2:2-subsampled output mask (one mask byte
; per horizontal pixel pair). Unlike the 420 variant, the mask reduction is
; done entirely inside .main, so all width paths only store pixels.
; Register roles after setup:
;   m8 = pw_27615 clamp, m9 = pw_64, m10 = blend rounding constant,
;   m11 = final shift, m12 = mask rounding constant (sign-dependent),
;   ym13 = w_mask_end42x byte-extract shuffle, m14 = pw_128 (computed as 2*64).
   5332    lea                  r7, [w_mask_422_avx512icl_table]
   5333    tzcnt                wd, wm
   5334    mov                 r6d, r8m ; pixel_max
   5335    movifnidn            hd, hm
   5336    shr                 r6d, 11
   5337    movsxd               wq, [r7+wq*4]
   5338    vpbroadcastd         m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
   5339    vpbroadcastd         m9, [base+pw_64]
   5340    vpbroadcastd        m10, [base+mask_round+r6*4]
   5341    vpbroadcastd        m11, [base+bidir_shift+r6*4]
   5342    mov                 r6d, r7m ; sign
   5343    vpbroadcastd        m12, [base+w_mask_round+r6*4]
   5344    mova               ym13, [w_mask_end42x]
   5345    mov               maskq, maskmp
   5346    add                  wq, r7
   5347    paddw               m14, m9, m9 ; pw_128
   5348    call .main
   5349    lea            stride3q, [strideq*3]
   5350    jmp                  wq  ; dispatch on width
   5351 .w4:
   5352    movq   [dstq+strideq*0], xm0
   5353    movhps [dstq+strideq*1], xm0
   5354    vextracti32x4       xm2, ym0, 1
   5355    movq   [dstq+strideq*2], xm2
   5356    movhps [dstq+stride3q ], xm2
   5357    cmp                  hd, 8
   5358    jl .w4_end
   5359    vextracti32x4       xm2, m0, 2
   5360    lea                dstq, [dstq+strideq*4]
   5361    movq   [dstq+strideq*0], xm2
   5362    movhps [dstq+strideq*1], xm2
   5363    vextracti32x4       xm0, m0, 3
   5364    movq   [dstq+strideq*2], xm0
   5365    movhps [dstq+stride3q ], xm0
   5366    je .w4_end
   5367    lea                dstq, [dstq+strideq*4]
   5368    movq   [dstq+strideq*0], xm1
   5369    movhps [dstq+strideq*1], xm1
   5370    vextracti32x4       xm2, ym1, 1
   5371    movq   [dstq+strideq*2], xm2
   5372    movhps [dstq+stride3q ], xm2
   5373    vextracti32x4       xm2, m1, 2
   5374    lea                dstq, [dstq+strideq*4]
   5375    movq   [dstq+strideq*0], xm2
   5376    movhps [dstq+strideq*1], xm2
   5377    vextracti32x4       xm1, m1, 3
   5378    movq   [dstq+strideq*2], xm1
   5379    movhps [dstq+stride3q ], xm1
   5380 .w4_end:
   5381    RET
   5382 .w8_loop:
   5383    call .main
   5384    lea                dstq, [dstq+strideq*4]
   5385 .w8:
   5386    mova          [dstq+strideq*0], xm0
   5387    vextracti32x4 [dstq+strideq*1], ym0, 1
   5388    vextracti32x4 [dstq+strideq*2], m0, 2
   5389    vextracti32x4 [dstq+stride3q ], m0, 3
   5390    sub                  hd, 8
   5391    jl .w8_end
   5392    lea                dstq, [dstq+strideq*4]
   5393    mova          [dstq+strideq*0], xm1
   5394    vextracti32x4 [dstq+strideq*1], ym1, 1
   5395    vextracti32x4 [dstq+strideq*2], m1, 2
   5396    vextracti32x4 [dstq+stride3q ], m1, 3
   5397    jg .w8_loop
   5398 .w8_end:
   5399    RET
   5400 .w16_loop:
   5401    call .main
   5402    lea                dstq, [dstq+strideq*4]
   5403 .w16:
   5404    mova          [dstq+strideq*0], ym0
   5405    vextracti32x8 [dstq+strideq*1], m0, 1
   5406    mova          [dstq+strideq*2], ym1
   5407    vextracti32x8 [dstq+stride3q ], m1, 1
   5408    sub                  hd, 4
   5409    jg .w16_loop
   5410    RET
   5411 .w32_loop:
   5412    call .main
   5413    lea                dstq, [dstq+strideq*2]
   5414 .w32:
   5415    mova   [dstq+strideq*0], m0
   5416    mova   [dstq+strideq*1], m1
   5417    sub                  hd, 2
   5418    jg .w32_loop
   5419    RET
   5420 .w64_loop:
   5421    call .main
   5422    add                dstq, strideq
   5423 .w64:
   5424    mova        [dstq+64*0], m0
   5425    mova        [dstq+64*1], m1
   5426    dec                  hd
   5427    jg .w64_loop
   5428    RET
   5429 .w128_loop:
   5430    call .main
   5431    add                dstq, strideq
   5432 .w128:
   5433    mova        [dstq+64*0], m0
   5434    mova        [dstq+64*1], m1
   5435    call .main
   5436    mova        [dstq+64*2], m0
   5437    mova        [dstq+64*3], m1
   5438    dec                  hd
   5439    jg .w128_loop
   5440    RET
   5441 ALIGN function_align
   5442 .main:
; Same per-pixel blend as the 420 .main (see that routine), but also folds
; horizontal mask pairs here: vpdpwssd with m14 (pw_128) sums each pair of
; adjacent weights onto the mask rounding constant, and vpermt2b extracts
; the final bytes, which are stored to maskq (32 bytes per call).
   5443    mova                 m1, [tmp1q+64*0]
   5444    mova                 m3, [tmp2q+64*0]
   5445    mova                 m4, [tmp1q+64*1]
   5446    mova                 m7, [tmp2q+64*1]
   5447    add               tmp1q, 64*2
   5448    add               tmp2q, 64*2
   5449    psubsw               m6, m1, m3
   5450    punpcklwd            m5, m3, m1
   5451    pabsw                m6, m6      ; |tmp1 - tmp2|
   5452    punpckhwd            m3, m1
   5453    psubusw              m6, m8, m6
   5454    psrlw                m6, 10      ; 64-m
   5455    psubw                m2, m9, m6  ; m
   5456    punpcklwd            m1, m6, m2
   5457    punpckhwd            m6, m2
   5458    mova                 m0, m10
   5459    vpdpwssd             m0, m5, m1  ; tmp2*(64-m) + tmp1*m + round
   5460    mova                 m1, m10
   5461    vpdpwssd             m1, m3, m6
   5462    psubsw               m5, m4, m7
   5463    punpcklwd            m6, m7, m4
   5464    pabsw                m5, m5
   5465    punpckhwd            m7, m4
   5466    psubusw              m5, m8, m5
   5467    psrlw                m5, 10
   5468    psubw                m3, m9, m5
   5469    punpcklwd            m4, m5, m3
   5470    psrad                m0, 4
   5471    punpckhwd            m5, m3
   5472    psrad                m1, 4
   5473    packusdw             m0, m1
   5474    mova                 m1, m10
   5475    vpdpwssd             m1, m6, m4
   5476    mova                 m4, m10
   5477    vpdpwssd             m4, m7, m5
   5478    mova                 m5, m12
   5479    vpdpwssd             m5, m14, m2 ; 128*(m_even + m_odd) + mask round
   5480    mova                 m2, m12
   5481    vpdpwssd             m2, m14, m3
   5482    psrad                m1, 4
   5483    psrad                m4, 4
   5484    packusdw             m1, m4
   5485    vpermt2b             m5, m13, m2 ; extract final mask bytes
   5486    vpsrlvw              m0, m11     ; final bitdepth shift
   5487    vpsrlvw              m1, m11
   5488    mova            [maskq], ym5
   5489    add               maskq, 32
   5490    ret
   5491 
   5492 cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
   5493 %define base r7-w_mask_444_avx512icl_table
; w_mask blend for 16-bpc with a full-resolution (4:4:4) output mask: one
; mask byte per pixel, no subsampling, so no sign/rounding parameters are
; needed and .main writes 64 mask bytes per call.
; Register roles after setup:
;   m8 = pw_27615 clamp, m9 = pw_64, m10 = blend rounding constant,
;   m11 = w_mask_end444 byte-extract shuffle, m12 = final shift.
   5494    lea                  r7, [w_mask_444_avx512icl_table]
   5495    tzcnt                wd, wm
   5496    mov                 r6d, r8m ; pixel_max
   5497    movifnidn            hd, hm
   5498    shr                 r6d, 11
   5499    movsxd               wq, [r7+wq*4]
   5500    vpbroadcastd         m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
   5501    vpbroadcastd         m9, [base+pw_64]
   5502    vpbroadcastd        m10, [base+mask_round+r6*4]
   5503    mova                m11, [w_mask_end444]
   5504    vpbroadcastd        m12, [base+bidir_shift+r6*4]
   5505    mov               maskq, maskmp
   5506    add                  wq, r7
   5507    call .main
   5508    lea            stride3q, [strideq*3]
   5509    jmp                  wq  ; dispatch on width
   5510 .w4:
   5511    movq   [dstq+strideq*0], xm0
   5512    movhps [dstq+strideq*1], xm0
   5513    vextracti32x4       xm2, ym0, 1
   5514    movq   [dstq+strideq*2], xm2
   5515    movhps [dstq+stride3q ], xm2
   5516    cmp                  hd, 8
   5517    jl .w4_end
   5518    vextracti32x4       xm2, m0, 2
   5519    lea                dstq, [dstq+strideq*4]
   5520    movq   [dstq+strideq*0], xm2
   5521    movhps [dstq+strideq*1], xm2
   5522    vextracti32x4       xm0, m0, 3
   5523    movq   [dstq+strideq*2], xm0
   5524    movhps [dstq+stride3q ], xm0
   5525    je .w4_end
   5526    lea                dstq, [dstq+strideq*4]
   5527    movq   [dstq+strideq*0], xm1
   5528    movhps [dstq+strideq*1], xm1
   5529    vextracti32x4       xm2, ym1, 1
   5530    movq   [dstq+strideq*2], xm2
   5531    movhps [dstq+stride3q ], xm2
   5532    vextracti32x4       xm2, m1, 2
   5533    lea                dstq, [dstq+strideq*4]
   5534    movq   [dstq+strideq*0], xm2
   5535    movhps [dstq+strideq*1], xm2
   5536    vextracti32x4       xm1, m1, 3
   5537    movq   [dstq+strideq*2], xm1
   5538    movhps [dstq+stride3q ], xm1
   5539 .w4_end:
   5540    RET
   5541 .w8_loop:
   5542    call .main
   5543    lea                dstq, [dstq+strideq*4]
   5544 .w8:
   5545    mova          [dstq+strideq*0], xm0
   5546    vextracti32x4 [dstq+strideq*1], ym0, 1
   5547    vextracti32x4 [dstq+strideq*2], m0, 2
   5548    vextracti32x4 [dstq+stride3q ], m0, 3
   5549    sub                  hd, 8
   5550    jl .w8_end
   5551    lea                dstq, [dstq+strideq*4]
   5552    mova          [dstq+strideq*0], xm1
   5553    vextracti32x4 [dstq+strideq*1], ym1, 1
   5554    vextracti32x4 [dstq+strideq*2], m1, 2
   5555    vextracti32x4 [dstq+stride3q ], m1, 3
   5556    jg .w8_loop
   5557 .w8_end:
   5558    RET
   5559 .w16_loop:
   5560    call .main
   5561    lea                dstq, [dstq+strideq*4]
   5562 .w16:
   5563    mova          [dstq+strideq*0], ym0
   5564    vextracti32x8 [dstq+strideq*1], m0, 1
   5565    mova          [dstq+strideq*2], ym1
   5566    vextracti32x8 [dstq+stride3q ], m1, 1
   5567    sub                  hd, 4
   5568    jg .w16_loop
   5569    RET
   5570 .w32_loop:
   5571    call .main
   5572    lea                dstq, [dstq+strideq*2]
   5573 .w32:
   5574    mova   [dstq+strideq*0], m0
   5575    mova   [dstq+strideq*1], m1
   5576    sub                  hd, 2
   5577    jg .w32_loop
   5578    RET
   5579 .w64_loop:
   5580    call .main
   5581    add                dstq, strideq
   5582 .w64:
   5583    mova        [dstq+64*0], m0
   5584    mova        [dstq+64*1], m1
   5585    dec                  hd
   5586    jg .w64_loop
   5587    RET
   5588 .w128_loop:
   5589    call .main
   5590    add                dstq, strideq
   5591 .w128:
   5592    mova        [dstq+64*0], m0
   5593    mova        [dstq+64*1], m1
   5594    call .main
   5595    mova        [dstq+64*2], m0
   5596    mova        [dstq+64*3], m1
   5597    dec                  hd
   5598    jg .w128_loop
   5599    RET
   5600 ALIGN function_align
   5601 .main:
; Same per-pixel blend as the 420/422 .main routines; here the per-pixel
; weights m2/m3 are packed to bytes via vpermt2b (w_mask_end444) and stored
; directly — 64 mask bytes for the 64 pixels produced per call.
   5602    mova                 m1, [tmp1q+64*0]
   5603    mova                 m3, [tmp2q+64*0]
   5604    mova                 m4, [tmp1q+64*1]
   5605    mova                 m7, [tmp2q+64*1]
   5606    add               tmp1q, 64*2
   5607    add               tmp2q, 64*2
   5608    psubsw               m6, m1, m3
   5609    punpcklwd            m5, m3, m1
   5610    pabsw                m6, m6      ; |tmp1 - tmp2|
   5611    punpckhwd            m3, m1
   5612    psubusw              m6, m8, m6
   5613    psrlw                m6, 10      ; 64-m
   5614    psubw                m2, m9, m6  ; m
   5615    punpcklwd            m1, m6, m2
   5616    punpckhwd            m6, m2
   5617    mova                 m0, m10
   5618    vpdpwssd             m0, m5, m1  ; tmp2*(64-m) + tmp1*m + round
   5619    mova                 m1, m10
   5620    vpdpwssd             m1, m3, m6
   5621    psubsw               m5, m4, m7
   5622    punpcklwd            m6, m7, m4
   5623    pabsw                m5, m5
   5624    punpckhwd            m7, m4
   5625    psubusw              m5, m8, m5
   5626    psrlw                m5, 10
   5627    psubw                m3, m9, m5
   5628    punpcklwd            m4, m5, m3
   5629    psrad                m0, 4
   5630    punpckhwd            m5, m3
   5631    psrad                m1, 4
   5632    packusdw             m0, m1
   5633    mova                 m1, m10
   5634    vpdpwssd             m1, m6, m4
   5635    mova                 m4, m10
   5636    vpdpwssd             m4, m7, m5
   5637    vpermt2b             m2, m11, m3 ; pack weight words to mask bytes
   5638    psrad                m1, 4
   5639    psrad                m4, 4
   5640    packusdw             m1, m4
   5641    vpsrlvw              m0, m12     ; final bitdepth shift
   5642    vpsrlvw              m1, m12
   5643    mova            [maskq], m2
   5644    add               maskq, 64
   5645    ret
   5646 
   5647 cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
   5648 %define base r6-blend_avx512icl_table
; Masked blend of a tmp buffer into dst for 16-bpc, with an explicit
; per-pixel byte mask: dst += (tmp - dst) * mask / 64 (rounded).
; The division is realized with pmulhrsw: the mask is pre-multiplied by
; pw_m512 (-512) so that ((dst-tmp) * -512*mask + 16384) >> 15 equals
; (tmp-dst)*mask >> 6 with rounding.
; r6 doubles as table base during setup and as stride*3 in the loops.
   5649    lea                  r6, [blend_avx512icl_table]
   5650    tzcnt                wd, wm
   5651    movifnidn            hd, hm
   5652    movsxd               wq, [r6+wq*4]
   5653    movifnidn         maskq, maskmp
   5654    vpbroadcastd         m6, [base+pw_m512]
   5655    add                  wq, r6
   5656    lea                  r6, [dsq*3]
   5657    jmp                  wq  ; dispatch on width
   5658 .w4:
; Four rows per iteration, gathered into one ymm; uses m16-m19 (hence the
; vzeroupper before returning from this path).
   5659    pmovzxbw           ym19, [maskq]
   5660    movq               xm16, [dstq+dsq*0]
   5661    movhps             xm16, [dstq+dsq*1]
   5662    vpbroadcastq       ym17, [dstq+dsq*2]
   5663    vpbroadcastq       ym18, [dstq+r6   ]
   5664    pmullw             ym19, ym6   ; mask * -512
   5665    vpblendd           ym16, ym17, 0x30
   5666    vpblendd           ym16, ym18, 0xc0
   5667    psubw              ym17, ym16, [tmpq] ; dst - tmp
   5668    add               maskq, 16
   5669    add                tmpq, 32
   5670    pmulhrsw           ym17, ym19  ; -> (tmp-dst)*mask >> 6
   5671    paddw              ym16, ym17
   5672    vextracti128       xm17, ym16, 1
   5673    movq       [dstq+dsq*0], xm16
   5674    movhps     [dstq+dsq*1], xm16
   5675    movq       [dstq+dsq*2], xm17
   5676    movhps     [dstq+r6   ], xm17
   5677    lea                dstq, [dstq+dsq*4]
   5678    sub                  hd, 4
   5679    jg .w4
   5680    vzeroupper
   5681    RET
   5682 .w8:
; Four rows per iteration in one zmm.
   5683    pmovzxbw             m2, [maskq]
   5684    mova                xm0, [dstq+dsq*0]
   5685    vinserti32x4        ym0, [dstq+dsq*1], 1
   5686    vinserti32x4         m0, [dstq+dsq*2], 2
   5687    vinserti32x4         m0, [dstq+r6   ], 3
   5688    pmullw               m2, m6
   5689    psubw                m1, m0, [tmpq]
   5690    add               maskq, 32
   5691    add                tmpq, 64
   5692    pmulhrsw             m1, m2
   5693    paddw                m0, m1
   5694    mova          [dstq+dsq*0], xm0
   5695    vextracti32x4 [dstq+dsq*1], ym0, 1
   5696    vextracti32x4 [dstq+dsq*2], m0, 2
   5697    vextracti32x4 [dstq+r6   ], m0, 3
   5698    lea                dstq, [dstq+dsq*4]
   5699    sub                  hd, 4
   5700    jg .w8
   5701    RET
   5702 .w16:
; Four rows per iteration, two zmm registers.
   5703    pmovzxbw             m4, [maskq+32*0]
   5704    pmovzxbw             m5, [maskq+32*1]
   5705    mova                ym0, [dstq+dsq*0]
   5706    vinserti32x8         m0, [dstq+dsq*1], 1
   5707    mova                ym1, [dstq+dsq*2]
   5708    vinserti32x8         m1, [dstq+r6   ], 1
   5709    pmullw               m4, m6
   5710    pmullw               m5, m6
   5711    psubw                m2, m0, [tmpq+64*0]
   5712    psubw                m3, m1, [tmpq+64*1]
   5713    add               maskq, 32*2
   5714    add                tmpq, 64*2
   5715    pmulhrsw             m2, m4
   5716    pmulhrsw             m3, m5
   5717    paddw                m0, m2
   5718    paddw                m1, m3
   5719    mova          [dstq+dsq*0], ym0
   5720    vextracti32x8 [dstq+dsq*1], m0, 1
   5721    mova          [dstq+dsq*2], ym1
   5722    vextracti32x8 [dstq+r6   ], m1, 1
   5723    lea                dstq, [dstq+dsq*4]
   5724    sub                  hd, 4
   5725    jg .w16
   5726    RET
   5727 .w32:
; Two rows per iteration, one full zmm per row.
   5728    pmovzxbw             m4, [maskq+32*0]
   5729    pmovzxbw             m5, [maskq+32*1]
   5730    mova                 m0, [dstq+dsq*0]
   5731    mova                 m1, [dstq+dsq*1]
   5732    pmullw               m4, m6
   5733    pmullw               m5, m6
   5734    psubw                m2, m0, [tmpq+ 64*0]
   5735    psubw                m3, m1, [tmpq+ 64*1]
   5736    add               maskq, 32*2
   5737    add                tmpq, 64*2
   5738    pmulhrsw             m2, m4
   5739    pmulhrsw             m3, m5
   5740    paddw                m0, m2
   5741    paddw                m1, m3
   5742    mova       [dstq+dsq*0], m0
   5743    mova       [dstq+dsq*1], m1
   5744    lea                dstq, [dstq+dsq*2]
   5745    sub                  hd, 2
   5746    jg .w32
   5747    RET
   5748 
   5749 cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
; Vertical-edge OBMC blend for 16-bpc: the mask varies only with the x
; position, so each width path loads a fixed weight vector from the
; obmc_masks_avx2 table (indexed by width) and applies it to every row via
; pmulhrsw: dst += pmulhrsw(dst - tmp, weight).
   5750    lea                  r5, [blend_v_avx512icl_table]
   5751    tzcnt                wd, wm
   5752    movifnidn            hd, hm
   5753    movsxd               wq, [r5+wq*4]
   5754    add                  wq, r5
   5755    jmp                  wq  ; dispatch on width
   5756 .w2:
   5757    vpbroadcastd       xmm2, [obmc_masks_avx2+2*2] ; per-column weights, w=2
   5758 .w2_loop:
   5759    movd               xmm0, [dstq+dsq*0]
   5760    pinsrd             xmm0, [dstq+dsq*1], 1
   5761    movq               xmm1, [tmpq]
   5762    add                tmpq, 4*2
   5763    psubw              xmm1, xmm0, xmm1 ; dst - tmp
   5764    pmulhrsw           xmm1, xmm2
   5765    paddw              xmm0, xmm1
   5766    movd       [dstq+dsq*0], xmm0
   5767    pextrd     [dstq+dsq*1], xmm0, 1
   5768    lea                dstq, [dstq+dsq*2]
   5769    sub                  hd, 2
   5770    jg .w2_loop
   5771    RET
   5772 .w4:
   5773    vpbroadcastq       xmm2, [obmc_masks_avx2+4*2] ; per-column weights, w=4
   5774 .w4_loop:
   5775    movq               xmm0, [dstq+dsq*0]
   5776    movhps             xmm0, [dstq+dsq*1]
   5777    psubw              xmm1, xmm0, [tmpq]
   5778    add                tmpq, 8*2
   5779    pmulhrsw           xmm1, xmm2
   5780    paddw              xmm0, xmm1
   5781    movq       [dstq+dsq*0], xmm0
   5782    movhps     [dstq+dsq*1], xmm0
   5783    lea                dstq, [dstq+dsq*2]
   5784    sub                  hd, 2
   5785    jg .w4_loop
   5786    RET
   5787 .w8:
   5788    vbroadcasti32x4     ym2, [obmc_masks_avx2+8*2] ; per-column weights, w=8
   5789 .w8_loop:
   5790    mova                xm0, [dstq+dsq*0]
   5791    vinserti32x4        ym0, [dstq+dsq*1], 1
   5792    psubw               ym1, ym0, [tmpq]
   5793    add                tmpq, 16*2
   5794    pmulhrsw            ym1, ym2
   5795    paddw               ym0, ym1
   5796    mova          [dstq+dsq*0], xm0
   5797    vextracti32x4 [dstq+dsq*1], ym0, 1
   5798    lea                dstq, [dstq+dsq*2]
   5799    sub                  hd, 2
   5800    jg .w8_loop
   5801    RET
   5802 .w16:
   5803    vbroadcasti32x8      m2, [obmc_masks_avx2+16*2] ; per-column weights, w=16
   5804 .w16_loop:
   5805    mova                ym0, [dstq+dsq*0]
   5806    vinserti32x8         m0, [dstq+dsq*1], 1
   5807    psubw                m1, m0, [tmpq]
   5808    add                tmpq, 32*2
   5809    pmulhrsw             m1, m2
   5810    paddw                m0, m1
   5811    mova          [dstq+dsq*0], ym0
   5812    vextracti32x8 [dstq+dsq*1], m0, 1
   5813    lea                dstq, [dstq+dsq*2]
   5814    sub                  hd, 2
   5815    jg .w16_loop
   5816    RET
   5817 .w32:
   5818    mova                 m4, [obmc_masks_avx2+32*2] ; per-column weights, w=32
   5819 .w32_loop:
   5820    mova                 m0,     [dstq+dsq*0]
   5821    psubw                m2, m0, [tmpq+ 64*0]
   5822    mova                 m1,     [dstq+dsq*1]
   5823    psubw                m3, m1, [tmpq+ 64*1]
   5824    add                tmpq, 64*2
   5825    pmulhrsw             m2, m4
   5826    pmulhrsw             m3, m4
   5827    paddw                m0, m2
   5828    paddw                m1, m3
   5829    mova       [dstq+dsq*0], m0
   5830    mova       [dstq+dsq*1], m1
   5831    lea                dstq, [dstq+dsq*2]
   5832    sub                  hd, 2
   5833    jg .w32_loop
   5834    RET
   5835 
   5836 cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
%define base r6-$$
; Horizontal-edge OBMC blend for 16-bpc: the mask varies only with the y
; position. Only the top 3/4 of the block is blended (hd is scaled by 3/4
; below), and the per-row weight is read from obmc_masks_avx2 via a negative
; row counter hq that runs up to 0. Blend formula per row, as in blend_v:
; dst += pmulhrsw(dst - tmp, weight).
   5837 %define base r6-$$
   5838    lea                  r6, [$$]
   5839    tzcnt                wd, wm
   5840    mov                  hd, hm
   5841    movsxd               wq, [base+blend_h_avx512icl_table+wq*4]
   5842    lea               maskq, [base+obmc_masks_avx2+hq*2]
   5843    lea                  hd, [hq*3]
   5844    lea                  wq, [base+blend_h_avx512icl_table+wq]
   5845    shr                  hd, 2 ; h * 3/4
   5846    lea               maskq, [maskq+hq*2]
   5847    neg                  hq  ; count rows upward toward zero
   5848    jmp                  wq  ; dispatch on width
   5849 .w2:
   5850    movd               xmm0, [dstq+dsq*0]
   5851    pinsrd             xmm0, [dstq+dsq*1], 1
   5852    movd               xmm2, [maskq+hq*2] ; weights for this row pair
   5853    movq               xmm1, [tmpq]
   5854    add                tmpq, 4*2
   5855    punpcklwd          xmm2, xmm2  ; duplicate weight per pixel
   5856    psubw              xmm1, xmm0, xmm1
   5857    pmulhrsw           xmm1, xmm2
   5858    paddw              xmm0, xmm1
   5859    movd       [dstq+dsq*0], xmm0
   5860    pextrd     [dstq+dsq*1], xmm0, 1
   5861    lea                dstq, [dstq+dsq*2]
   5862    add                  hq, 2
   5863    jl .w2
   5864    RET
   5865 .w4:
   5866    mova               xmm3, [blend_shuf] ; broadcasts row weight across lanes
   5867 .w4_loop:
   5868    movq               xmm0, [dstq+dsq*0]
   5869    movhps             xmm0, [dstq+dsq*1]
   5870    movd               xmm2, [maskq+hq*2]
   5871    psubw              xmm1, xmm0, [tmpq]
   5872    add                tmpq, 8*2
   5873    pshufb             xmm2, xmm3
   5874    pmulhrsw           xmm1, xmm2
   5875    paddw              xmm0, xmm1
   5876    movq       [dstq+dsq*0], xmm0
   5877    movhps     [dstq+dsq*1], xmm0
   5878    lea                dstq, [dstq+dsq*2]
   5879    add                  hq, 2
   5880    jl .w4_loop
   5881    RET
   5882 .w8:
   5883    vbroadcasti32x4     ym3, [blend_shuf]
   5884    shufpd              ym3, ym3, 0x0c ; row0 weight in low lane, row1 in high
   5885 .w8_loop:
   5886    mova                xm0, [dstq+dsq*0]
   5887    vinserti32x4        ym0, [dstq+dsq*1], 1
   5888    vpbroadcastd        ym2, [maskq+hq*2]
   5889    psubw               ym1, ym0, [tmpq]
   5890    add                tmpq, 16*2
   5891    pshufb              ym2, ym3
   5892    pmulhrsw            ym1, ym2
   5893    paddw               ym0, ym1
   5894    mova          [dstq+dsq*0], xm0
   5895    vextracti32x4 [dstq+dsq*1], ym0, 1
   5896    lea                dstq, [dstq+dsq*2]
   5897    add                  hq, 2
   5898    jl .w8_loop
   5899    RET
   5900 .w16:
   5901    vbroadcasti32x4      m3, [blend_shuf]
   5902    shufpd               m3, m3, 0xf0 ; row0 weight in low half, row1 in high
   5903 .w16_loop:
   5904    mova                ym0, [dstq+dsq*0]
   5905    vinserti32x8         m0, [dstq+dsq*1], 1
   5906    vpbroadcastd         m2, [maskq+hq*2]
   5907    psubw                m1, m0, [tmpq]
   5908    add                tmpq, 32*2
   5909    pshufb               m2, m3
   5910    pmulhrsw             m1, m2
   5911    paddw                m0, m1
   5912    mova          [dstq+dsq*0], ym0
   5913    vextracti32x8 [dstq+dsq*1], m0, 1
   5914    lea                dstq, [dstq+dsq*2]
   5915    add                  hq, 2
   5916    jl .w16_loop
   5917    RET
   5918 .w32:
; Two rows per iteration, one broadcast weight per row.
   5919    vpbroadcastw         m4, [maskq+hq*2]
   5920    vpbroadcastw         m5, [maskq+hq*2+2]
   5921    mova                 m0,     [dstq+dsq*0]
   5922    psubw                m2, m0, [tmpq+ 64*0]
   5923    mova                 m1,     [dstq+dsq*1]
   5924    psubw                m3, m1, [tmpq+ 64*1]
   5925    add                tmpq, 64*2
   5926    pmulhrsw             m2, m4
   5927    pmulhrsw             m3, m5
   5928    paddw                m0, m2
   5929    paddw                m1, m3
   5930    mova       [dstq+dsq*0], m0
   5931    mova       [dstq+dsq*1], m1
   5932    lea                dstq, [dstq+dsq*2]
   5933    add                  hq, 2
   5934    jl .w32
   5935    RET
   5936 .w64:
; One row (two zmm stores) per iteration; single weight for the whole row.
   5937    vpbroadcastw         m4, [maskq+hq*2]
   5938    mova                 m0,     [dstq+64*0]
   5939    psubw                m2, m0, [tmpq+64*0]
   5940    mova                 m1,     [dstq+64*1]
   5941    psubw                m3, m1, [tmpq+64*1]
   5942    add                tmpq, 64*2
   5943    pmulhrsw             m2, m4
   5944    pmulhrsw             m3, m4
   5945    paddw                m0, m2
   5946    paddw                m1, m3
   5947    mova        [dstq+64*0], m0
   5948    mova        [dstq+64*1], m1
   5949    add                dstq, dsq
   5950    inc                  hq
   5951    jl .w64
   5952    RET
   5953 .w128:
; One row (four zmm stores) per iteration; single weight for the whole row.
   5954    vpbroadcastw         m8, [maskq+hq*2]
   5955    mova                 m0,     [dstq+64*0]
   5956    psubw                m4, m0, [tmpq+64*0]
   5957    mova                 m1,     [dstq+64*1]
   5958    psubw                m5, m1, [tmpq+64*1]
   5959    mova                 m2,     [dstq+64*2]
   5960    psubw                m6, m2, [tmpq+64*2]
   5961    mova                 m3,     [dstq+64*3]
   5962    psubw                m7, m3, [tmpq+64*3]
   5963    add                tmpq, 64*4
   5964    REPX   {pmulhrsw x, m8}, m4, m5, m6, m7
   5965    paddw                m0, m4
   5966    paddw                m1, m5
   5967    paddw                m2, m6
   5968    paddw                m3, m7
   5969    mova        [dstq+64*0], m0
   5970    mova        [dstq+64*1], m1
   5971    mova        [dstq+64*2], m2
   5972    mova        [dstq+64*3], m3
   5973    add                dstq, dsq
   5974    inc                  hq
   5975    jl .w128
   5976    RET
   5977 
;------------------------------------------------------------------------------
; void resize_16bpc(pixel *dst, ptrdiff_t dst_stride,
;                   const pixel *src, ptrdiff_t src_stride,
;                   int dst_w, int h, int src_w, int dx, int mx0, int pxmax)
;
; Horizontal scaling of a 16bpc plane.  Source positions are tracked in .14
; fixed point: lane i of m4 holds mx0 + dx*(x+i).  For each output pixel an
; 8-tap filter is applied, selected from a 64-phase table via bits 13:8 of
; the position ((mx >> 8) & 63).  16 output pixels are produced per inner
; iteration.  Lanes whose taps would read outside [0, src_w) take the
; edge-emulation path, which re-shuffles gathered pixels so the border pixel
; is replicated.  pxmax is the bitdepth maximum used for the final clamp.
;------------------------------------------------------------------------------
   5978 cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
   5979                                 dst_w, h, src_w, dx, mx0, pxmax
   5980    sub          dword mx0m, 4<<14   ; step back 4 pixels (.14) to center the 8-tap filter
   5981    sub        dword src_wm, 8       ; clamp limit so an 8-tap read never passes the row end
   5982    mov                  r6, ~0
   5983    vpbroadcastd         m5, dxm     ; m5 = dx (.14 per-pixel step)
   5984    vpbroadcastd         m8, mx0m    ; m8 = adjusted start position
   5985    vpbroadcastd         m6, src_wm  ; m6 = src_w-8 (shifted to .14 below)
   5986    kmovq                k6, r6      ; k6 = all-ones; gathers consume their mask, so k6 is copied fresh before each gather
   5987 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
   5988    LEA                  r7, $$      ; r7 = module base for PIC-safe table addressing
   5989 %define base r7-$$
   5990    vpbroadcastd         m3, [base+pd_16384]
   5991    vpbroadcastd         m7, [base+pd_63]   ; phase mask (64 filter phases)
   5992    mova                m24, [base+resize_permA]
   5993    mova                m25, [base+resize_permB]
   5994    mova                m26, [base+resize_permC]
   5995    mova                m27, [base+resize_permD]
   5996    vbroadcasti32x4     m28, [base+resize_shufA]
   5997    vbroadcasti32x4     m29, [base+resize_shufB]
   5998    mova                m30, [base+resize_permE]
   5999    vpbroadcastw       ym31, pxmaxm  ; bitdepth max for the final clamp
   6000    vpdpwssd             m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
   6001    pslld                m5, 4                      ; dx*16
   6002    pslld                m6, 14     ; (src_w-8)<<14, same .14 scale as mx
   6003    pxor                 m2, m2     ; zero: lower clip bound / accumulator seed
   6004 .loop_y:
   6005    xor                  xd, xd     ; x = output column
   6006    mova                 m4, m8     ; per-line working version of mx
   6007 .loop_x:
   6008    pmaxsd               m0, m4, m2 ; clip mx below at 0
   6009    psrad                m9, m4, 8  ; filter offset (unmasked)
   6010    pminsd               m0, m6     ; iclip(mx, 0, src_w-8)
   6011    psubd                m1, m4, m0 ; pshufb offset
   6012    psrad                m0, 14     ; clipped src_x offset
   6013    psrad                m1, 14     ; pshufb edge_emu offset
   6014    vptestmd             k5, m1, m1 ; k5 = lanes where clipping changed the position (edge lanes)
   6015    pand                 m9, m7     ; filter offset (masked)
   6016    ktestw               k5, k5
   6017    jz .load                        ; fast path when no lane needs edge emulation
        ; Edge-emulation path: gather source qwords (4 pixels each) at
        ; duplicated per-lane offsets, then pshufb each with a per-lane
        ; shuffle gathered from resize_shuf so out-of-range taps replicate
        ; the border pixel.
   6018    vpbroadcastq        m14, [base+pd_0_4]
   6019    vpermq              m10, m0, q1100
   6020    vpermq              m11, m0, q3322
   6021    vpermq              m20, m1, q1100
   6022    vpermq              m21, m1, q3322
   6023    punpckldq           m10, m10
   6024    punpckldq           m11, m11
   6025    punpckldq           m20, m20
   6026    punpckldq           m21, m21
   6027    paddd               m10, m14    ; {src_x, src_x+4} pairs per lane
   6028    paddd               m11, m14
   6029    paddd               m20, m14
   6030    paddd               m21, m14
   6031    vextracti32x8      ym12, m10, 1
   6032    vextracti32x8      ym13, m11, 1
   6033    vextracti32x8      ym22, m20, 1
   6034    vextracti32x8      ym23, m21, 1
   6035    kmovq                k1, k6     ; refresh gather masks (consumed by vpgatherdq)
   6036    kmovq                k2, k6
   6037    kmovq                k3, k6
   6038    kmovq                k4, k6
   6039    vpgatherdq      m16{k1}, [srcq+ym10*2] ; 0 1 2 3
   6040    vpgatherdq      m17{k2}, [srcq+ym11*2] ; 4 5 6 7
   6041    vpgatherdq      m18{k3}, [srcq+ym12*2] ; 8 9 A B
   6042    vpgatherdq      m19{k4}, [srcq+ym13*2] ; C D E F
   6043    kmovq                k1, k6
   6044    kmovq                k2, k6
   6045    kmovq                k3, k6
   6046    kmovq                k4, k6
   6047    vpgatherdq       m0{k1}, [base+resize_shuf+8+ym20*2] ; per-lane edge shuffles
   6048    vpgatherdq       m1{k2}, [base+resize_shuf+8+ym21*2]
   6049    vpgatherdq      m14{k3}, [base+resize_shuf+8+ym22*2]
   6050    vpgatherdq      m15{k4}, [base+resize_shuf+8+ym23*2]
   6051    pshufb              m16, m0    ; apply edge replication
   6052    pshufb              m17, m1
   6053    pshufb              m18, m14
   6054    pshufb              m19, m15
        ; Transpose the gathered data into the tap-pair layout that .filter
        ; expects (same layout the fast-path dword gathers produce).
   6055    mova                m20, m24
   6056    mova                m22, m24
   6057    mova                m21, m25
   6058    mova                m23, m25
   6059    vpermi2d            m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
   6060    vpermi2d            m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
   6061    vpermi2d            m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
   6062    vpermi2d            m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
   6063    mova                m15, m26
   6064    mova                m17, m26
   6065    mova                m16, m27
   6066    mova                m18, m27
   6067    vpermi2q            m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
   6068    vpermi2q            m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
   6069    vpermi2q            m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
   6070    vpermi2q            m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
   6071    kmovq                k1, k6
   6072    kmovq                k2, k6
   6073    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0] ; per-lane filter coeffs, taps 0-3
   6074    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4] ; per-lane filter coeffs, taps 4-7
   6075    pshufb              m10, m11, m28  ; split coeffs into word pairs matching the pixel layout
   6076    pshufb              m11, m11, m29
   6077    pshufb              m12, m13, m28
   6078    pshufb              m13, m13, m29
   6079    jmp .filter
        ; Fast path: four overlapping dword gathers fetch the 8 taps of every
        ; lane directly as {tap0,tap1} .. {tap6,tap7} word pairs.
   6080 .load:
   6081    kmovq                k1, k6
   6082    kmovq                k2, k6
   6083    kmovq                k3, k6
   6084    kmovq                k4, k6
   6085    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0]
   6086    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4]
   6087    pshufb              m10, m11, m28
   6088    pshufb              m11, m11, m29
   6089    pshufb              m12, m13, m28
   6090    pshufb              m13, m13, m29
   6091    vpgatherdd      m15{k3}, [srcq+m0*2+ 0] ; taps 0-1
   6092    vpgatherdd      m16{k4}, [srcq+m0*2+ 4] ; taps 2-3
   6093    kmovq                k1, k6
   6094    kmovq                k2, k6
   6095    vpgatherdd      m17{k1}, [srcq+m0*2+ 8] ; taps 4-5
   6096    vpgatherdd      m18{k2}, [srcq+m0*2+12] ; taps 6-7
   6097 .filter:
   6098    mova                m14, m2    ; acc = 0
   6099    vpdpwssd            m14, m15, m10 ; acc += pixel*coef word-pair dot products
   6100    vpdpwssd            m14, m16, m11
   6101    vpdpwssd            m14, m17, m12
   6102    vpdpwssd            m14, m18, m13
   6103    psubd               m14, m3, m14 ; 16384 - acc; coefficients appear to be stored negated — TODO confirm against resize_filter table
   6104    psrad               m14, 15      ; descale to pixel range
   6105    packusdw            m14, m14     ; 32 -> 16 bit with unsigned saturation
   6106    vpermq              m14, m30, m14 ; undo in-lane packing order
   6107    pminsw             ym14, ym31    ; clamp to bitdepth max (packusdw only clamps at 0xffff)
   6108    mova        [dstq+xq*2], ym14    ; store 16 output pixels
   6109    paddd                m4, m5      ; mx += dx*16
   6110    add                  xd, 16
   6111    cmp                  xd, dst_wd
   6112    jl .loop_x
   6113    add                dstq, dst_strideq
   6114    add                srcq, src_strideq
   6115    dec                  hd
   6116    jg .loop_y
   6117    RET
   6118 
   6119 %endif ; ARCH_X86_64