tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mc_avx512.asm (201257B)


      1 ; Copyright © 2020, VideoLAN and dav1d authors
      2 ; Copyright © 2020, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 SECTION_RODATA 64
     32 
; OBMC blending weights, grouped by block size (the "; 2" .. "; 32" markers).
; Each byte pair is (w, 64-w), i.e. a 6-bit two-tap blend whose taps sum to 64,
; suitable for pmaddubsw. pw_512 aliases the first 4 bytes of the table
; (2x dw 512 == db 0,2,0,2), saving rodata space.
obmc_masks:
pw_512:         times 2 dw 512
               ; 2
               db 45, 19, 64,  0
               ; 4
               db 39, 25, 50, 14, 59,  5, 64,  0
               ; 8
               db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
               ; 16
               db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
               db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
               ; 32
               db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
               db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
               db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
               db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
     49 
; Byte permutations and pack/end patterns for the 8x8 warp filter.
; permA/permB gather sliding 4-byte windows from two interleaved source
; halves (indices >= 16 select the second register of a 2-register permute).
; permC/permD contain -1 (0xff) filler entries — presumably zeroed at the use
; site (pshufb-style shuffles zero on a set high bit); TODO confirm against
; the warp kernel that consumes them.
warp_8x8_permA: db  4,  5,  6,  7, 16, 17, 18, 19,  5,  6,  7,  8, 17, 18, 19, 20
               db  6,  7,  8,  9, 18, 19, 20, 21,  7,  8,  9, 10, 19, 20, 21, 22
               db  8,  9, 10, 11, 20, 21, 22, 23,  9, 10, 11, 12, 21, 22, 23, 24
               db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26
warp_8x8_permB: db  0,  1,  2,  3, 20, 21, 22, 23,  1,  2,  3,  4, 21, 22, 23, 24
               db  2,  3,  4,  5, 22, 23, 24, 25,  3,  4,  5,  6, 23, 24, 25, 26
               db  4,  5,  6,  7, 24, 25, 26, 27,  5,  6,  7,  8, 25, 26, 27, 28
               db  6,  7,  8,  9, 26, 27, 28, 29,  7,  8,  9, 10, 27, 28, 29, 30
warp_8x8_permC: db -1,  0, -1,  1, -1,  8, -1,  9, -1,  4, -1,  5, -1, 12, -1, 13
warp_8x8_permD: db -1,  2, -1,  3, -1, 10, -1, 11, -1,  6, -1,  7, -1, 14, -1, 15
pd_0to7:        dd  0,  1,  2,  3,  4,  5,  6,  7
warp_8x8_hpack: db  3, 11,  3, 11, 35, 43, 35, 43
pd_16384:       dd 16384
pd_262144:      dd 262144
warp_8x8_end:   db  0,  4, 16, 20, 32, 36, 48, 52,  2,  6, 18, 22, 34, 38, 50, 54
warp_8x8t_end:  db  2,  3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59
               db  6,  7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63
; Tables for the bidirectional averaging / w_mask functions.
; bidir_sctr_w4: dword scatter indices for width-4 output reordering.
; wm_420_perm{4,8,16}: byte permutations collecting the odd bytes (first two
; rows) and even bytes (last two rows) of width-grouped mask pairs, used when
; downsampling the blend mask to 4:2:0.
; wm_{420,422,444}_mask: byte selectors picking mask lanes out of interleaved
; intermediate data (indices >= 64 select the second source register).
bidir_sctr_w4:  dd  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
wm_420_perm4:   db  1,  3,  9, 11,  5,  7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
               db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
               db  0,  2,  8, 10,  4,  6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
               db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
wm_420_perm8:   db  1,  3, 17, 19,  5,  7, 21, 23,  9, 11, 25, 27, 13, 15, 29, 31
               db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
               db  0,  2, 16, 18,  4,  6, 20, 22,  8, 10, 24, 26, 12, 14, 28, 30
               db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
wm_420_perm16:  db  1,  3, 33, 35,  5,  7, 37, 39,  9, 11, 41, 43, 13, 15, 45, 47
               db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
               db  0,  2, 32, 34,  4,  6, 36, 38,  8, 10, 40, 42, 12, 14, 44, 46
               db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
wm_420_mask:    db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
               db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
               db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
               db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
wm_422_mask:    db  2,  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
               db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
               db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
               db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
wm_444_mask:    db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
               db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
               db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
               db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
; vpermb patterns for the bilinear filters.
; bilin_h_perm{16,32}: emit adjacent byte pairs (x, x+1) so a single
; pmaddubsw computes the two-tap horizontal blend; indices >= 32 address the
; second half of the zmm source (row 2 for the w16 variant).
; bilin_v_perm{8,16,32}: interleave bytes of consecutive rows (cur, next)
; for the vertical two-tap blend; indices >= 64 select the second register
; of a two-register (vpermt2b-style) permute.
; bilin_v_perm64: dword-granularity row pairing for the w64 vertical case.
bilin_h_perm16: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
               db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
               db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
               db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
bilin_h_perm32: db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
               db  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
               db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
               db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
bilin_v_perm8:  db  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23
               db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
               db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39
               db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71
bilin_v_perm16: db  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23
               db  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
               db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71
               db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79
bilin_v_perm32: db  0, 64,  1, 65,  2, 66,  3, 67,  4, 68,  5, 69,  6, 70,  7, 71
               db  8, 72,  9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79
               db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87
               db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95
bilin_v_perm64: dd  0,  0,  4,  8,  1,  1,  5,  9,  2,  2,  6, 10,  3,  3,  7, 11
; vpermb patterns for the subpel (6/8-tap) filters.
; spel_h_perm{16,32}: sliding 4-byte windows (x..x+3, x+1..x+4, ...) feeding
; the horizontal MAC chain.
; spel_v_perm{8,16a,16b,32}: interleave bytes of adjacent rows for the
; vertical filter; the 16a/16b variants differ only in row ordering.
; spel_hv_perm*: word-pair gathers combining the horizontal intermediate with
; previous rows for the 2-D (hv) case; indices >= 32/64 address further
; source registers — exact register roles depend on the use sites.
; spel_hv_end{16,}: final odd-byte selection packing words back to bytes.
spel_h_perm16:  db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
               db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
               db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
               db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
spel_h_perm32:  db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
               db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
               db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
               db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
spel_v_perm8:   db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
               db  8, 16,  9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23
               db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
               db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39
spel_v_perm16a: db 32,  0, 33,  1, 34,  2, 35,  3, 36,  4, 37,  5, 38,  6, 39,  7
               db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
               db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
               db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
spel_v_perm16b: db 32,  0, 33,  1, 34,  2, 35,  3, 36,  4, 37,  5, 38,  6, 39,  7
               db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
               db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
               db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
spel_v_perm32:  db  0, 32,  1, 33,  2, 34,  3, 35,  4, 36,  5, 37,  6, 38,  7, 39
               db  8, 40,  9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
               db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
               db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
spel_hv_perm4a: db  8,  9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
               db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
               db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
               db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
spel_hv_perm4d: db 18, 19,  0,  1, 22, 23,  4,  5, 26, 27,  8,  9, 30, 31, 12, 13
               db  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29
spel_hv_perm8a: db  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23
               db  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
               db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
               db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_hv_perm8b: db 34, 35,  0,  1, 38, 39,  4,  5, 42, 43,  8,  9, 46, 47, 12, 13
               db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29
               db  0,  1, 32, 33,  4,  5, 36, 37,  8,  9, 40, 41, 12, 13, 44, 45
               db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61
spel_hv_perm16a:db  0,  1,  2,  3, 32, 33, 34, 35,  1,  2,  3,  4, 33, 34, 35, 36
               db  2,  3,  4,  5, 34, 35, 36, 37,  3,  4,  5,  6, 35, 36, 37, 38
               db  8,  9, 10, 11, 40, 41, 42, 43,  9, 10, 11, 12, 41, 42, 43, 44
               db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46
spel_hv_perm16b:db  0,  1,  2,  3,  1,  2,  3,  4,  4,  5,  6,  7,  5,  6,  7,  8
               db  2,  3,  4,  5,  3,  4,  5,  6,  6,  7,  8,  9,  7,  8,  9, 10
               db  8,  9, 10, 11,  9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16
               db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18
spel_hv_end16:  db  1,  3, 17, 19,  5,  7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
               db  9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
spel_hv_end:    db  1,  3,  5,  7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
; Assorted 128-bit pshufb patterns and small permutes.
; subpel_h_shuf{4,A,B,C}: sliding 4-byte windows for the horizontal subpel
; filter (A/B/C cover taps 0-3, 4-7, 8-11 respectively).
; bilin_h_shuf4 / bilin_v_shuf4: two-tap byte pairings for width-4 bilinear.
; blend_shuf: broadcasts word 0 / word 1 across each half.
; resize_perm{A,B}: even/odd dword deinterleave; resize_shuf clamps by
; replicating the edge bytes.
; pb_02461357: even bytes then odd bytes.
; wm_420_perm64: nibble-packed reverse qword permutation (0xf..0).
; wm_sign: sign-flip dword constants; bytes at +4 (-64) and +8 (64) are
; aliased below as pb_m64 / pb_64.
deint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
subpel_h_shuf4: db  0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
               db  2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
subpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
subpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
bilin_h_shuf4:  db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
bilin_v_shuf4:  db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
rescale_mul:    dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
resize_permA:   dd  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
resize_permB:   dd  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
resize_permC:   dd  0,  4,  8, 12
resize_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7
pb_02461357:    db  0,  2,  4,  6,  1,  3,  5,  7

wm_420_perm64:  dq 0xfedcba9876543210
wm_sign:        dd 0x40804080, 0xc0c0c0c0, 0x40404040
    182 
    183 pb_8x0_8x8: times 8 db 0
    184            times 8 db 8
    185 pb_4:       times 4 db 4
    186 pb_32:      times 4 db 32
    187 pb_127:     times 4 db 127
    188 pw_m128     times 2 dw -128
    189 pw_m256:    times 2 dw -256
    190 pw_1024:    times 2 dw 1024
    191 pw_2048:    times 2 dw 2048
    192 pw_6903:    times 2 dw 6903
    193 pw_8192:    times 2 dw 8192
    194 pd_32:              dd 32
    195 pd_34:              dd 34
    196 pd_63:              dd 63
    197 pd_512:             dd 512
    198 
; Aliases into existing rodata to avoid duplicate constants:
;   wm_sign+4 -> bytes 0xc0 (= -64 signed), wm_sign+8 -> bytes 0x40 (= 64),
;   pd_0to7+8 -> the dword 2.
%define pb_m64 (wm_sign+4)
%define pb_64  (wm_sign+8)
%define pd_2   (pd_0to7+8)

; Shared filter-coefficient tables defined in C.
cextern mc_subpel_filters
; NOTE(review): the -8 bias presumably folds a constant into the index
; arithmetic at the lookup sites — confirm against the 6/8-tap loaders.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
cextern mc_warp_filter
cextern resize_filter
    207 
; BASE_JMP_TABLE fn, isa, width...
; Emits a table of 16-bit offsets, one per width, for the size-generic
; entry points of function %1 with ISA suffix %2. The exported symbol
; %1_%2_table is biased by the first width %3 so that indexing with
; 2*tzcnt(w) (as done in the .put/.prep dispatchers) lands the smallest
; width on entry 0 (valid for first widths 2 and 4).
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2              ; e.g. put_avx512icl (xdefined below)
    %%table:
    %rep %0 - 2                        ; one entry per width argument
        dw %%base %+ _w%3 - %%base     ; offset of the _w<width> entry point
        %rotate 1
    %endrep
%endmacro
    217 
; HV_JMP_TABLE fn, filter, isa, types, width...
; Emits up to three dw jump tables for the subpel function %1 (put/prep),
; filter %2 (bilin/6tap/8tap), ISA %3. %4 is a bitmask selecting which
; tables to emit: bit0 = .h_w*, bit1 = .v_w*, bit2 = .hv_w*. %5... are the
; widths. Entries are offsets of the mangled .h_w<n>/.v_w<n>/.hv_w<n>
; labels relative to %%base = %1_%3; each table symbol is biased by the
; first width (same 2*tzcnt(w) indexing trick as BASE_JMP_TABLE).
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table  (%%h  - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4               ; %rep rotated %0-4 times; this completes a
    %endif                      ; full cycle so the width list is restored
    %if %%types & 2
        %xdefine %1_%2_v_%3_table  (%%v  - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro
    249 
; BIDIR_JMP_TABLE fn, isa, width...
; Emits a table of 32-bit offsets for bidir function %1 (avg/w_avg/mask/
; w_mask_*/blend*), one entry per width. Unlike the tables above, offsets
; are relative to the table symbol itself (%%base aliases %1_%2_table), so
; the dispatcher adds the table address at runtime. The symbol is biased by
; 2*%3 so that indexing with 4*tzcnt(w) (4-byte entries) lands the smallest
; width on entry 0 (valid for first widths 2 and 4).
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro
    260 
; Base labels used by BASE_JMP_TABLE offsets: the shared .put/.prep tails
; of the bilin functions, reached by every size-specialized path.
%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)

; table_offset(type, fn): distance from a function's base label to its jump
; table, evaluated with the current SUFFIX (see dispatch code below).
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

; Jump-table instantiations. For HV_JMP_TABLE the 4th argument is the type
; bitmask (bit0=h, bit1=v, bit2=hv): 7 = h+v+hv, 3 = h+v, 2 = v only.
; put tables start at width 2, prep tables at width 4.
BASE_JMP_TABLE put,  avx512icl,         2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl,            4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  6tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  8tap,  avx512icl, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 6tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap,  avx512icl, 3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg, avx512icl,            4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl,          4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl,           4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl,     4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl,     4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl,     4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx512icl,          4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, avx512icl,     2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, avx512icl,     2, 4, 8, 16, 32, 64, 128
    283 
    284 SECTION .text
    285 
; WRAP_YMM code...
; Assembles the wrapped line with the register width temporarily switched
; to ymm (INIT_YMM), then restores zmm for subsequent code.
%macro WRAP_YMM 1+
INIT_YMM cpuname
    %1
INIT_ZMM cpuname
%endmacro
    291 
    292 INIT_ZMM avx512icl
    293 cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
    294    movifnidn          mxyd, r6m ; mx
    295    lea                  r7, [put_avx512icl]
    296    tzcnt                wd, wm
    297    movifnidn            hd, hm
    298    test               mxyd, mxyd
    299    jnz .h
    300    mov                mxyd, r7m ; my
    301    test               mxyd, mxyd
    302    jnz .v
    303 .put:
    304    movzx                wd, word [r7+wq*2+table_offset(put,)]
    305    add                  wq, r7
    306    jmp                  wq
    307 .put_w2:
    308    movzx               r6d, word [srcq+ssq*0]
    309    movzx               r7d, word [srcq+ssq*1]
    310    lea                srcq, [srcq+ssq*2]
    311    mov        [dstq+dsq*0], r6w
    312    mov        [dstq+dsq*1], r7w
    313    lea                dstq, [dstq+dsq*2]
    314    sub                  hd, 2
    315    jg .put_w2
    316    RET
    317 .put_w4:
    318    mov                 r6d, [srcq+ssq*0]
    319    mov                 r7d, [srcq+ssq*1]
    320    lea                srcq, [srcq+ssq*2]
    321    mov        [dstq+dsq*0], r6d
    322    mov        [dstq+dsq*1], r7d
    323    lea                dstq, [dstq+dsq*2]
    324    sub                  hd, 2
    325    jg .put_w4
    326    RET
    327 .put_w8:
    328    mov                  r6, [srcq+ssq*0]
    329    mov                  r7, [srcq+ssq*1]
    330    lea                srcq, [srcq+ssq*2]
    331    mov        [dstq+dsq*0], r6
    332    mov        [dstq+dsq*1], r7
    333    lea                dstq, [dstq+dsq*2]
    334    sub                  hd, 2
    335    jg .put_w8
    336    RET
    337 .put_w16:
    338    movu               xmm0, [srcq+ssq*0]
    339    movu               xmm1, [srcq+ssq*1]
    340    lea                srcq, [srcq+ssq*2]
    341    mova       [dstq+dsq*0], xmm0
    342    mova       [dstq+dsq*1], xmm1
    343    lea                dstq, [dstq+dsq*2]
    344    sub                  hd, 2
    345    jg .put_w16
    346    RET
    347 .put_w32:
    348    movu                ym0, [srcq+ssq*0]
    349    movu                ym1, [srcq+ssq*1]
    350    lea                srcq, [srcq+ssq*2]
    351    mova       [dstq+dsq*0], ym0
    352    mova       [dstq+dsq*1], ym1
    353    lea                dstq, [dstq+dsq*2]
    354    sub                  hd, 2
    355    jg .put_w32
    356    RET
    357 .put_w64:
    358    movu                 m0, [srcq+ssq*0]
    359    movu                 m1, [srcq+ssq*1]
    360    lea                srcq, [srcq+ssq*2]
    361    mova       [dstq+dsq*0], m0
    362    mova       [dstq+dsq*1], m1
    363    lea                dstq, [dstq+dsq*2]
    364    sub                  hd, 2
    365    jg .put_w64
    366    RET
    367 .put_w128:
    368    movu                 m0, [srcq+ssq*0+64*0]
    369    movu                 m1, [srcq+ssq*0+64*1]
    370    movu                 m2, [srcq+ssq*1+64*0]
    371    movu                 m3, [srcq+ssq*1+64*1]
    372    lea                srcq, [srcq+ssq*2]
    373    mova  [dstq+dsq*0+64*0], m0
    374    mova  [dstq+dsq*0+64*1], m1
    375    mova  [dstq+dsq*1+64*0], m2
    376    mova  [dstq+dsq*1+64*1], m3
    377    lea                dstq, [dstq+dsq*2]
    378    sub                  hd, 2
    379    jg .put_w128
    380    RET
    381 .h:
    382    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
    383    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
    384    imul               mxyd, 255
    385    vbroadcasti128       m4, [bilin_h_perm16]
    386    add                mxyd, 16
    387    vpbroadcastw         m5, mxyd
    388    mov                mxyd, r7m ; my
    389    test               mxyd, mxyd
    390    jnz .hv
    391    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)]
    392    vpbroadcastd         m3, [pw_2048]
    393    add                  wq, r7
    394    jmp                  wq
    395 .h_w2:
    396    movd               xmm0, [srcq+ssq*0]
    397    pinsrd             xmm0, [srcq+ssq*1], 1
    398    lea                srcq, [srcq+ssq*2]
    399    pshufb             xmm0, xm4
    400    pmaddubsw          xmm0, xm5
    401    pmulhrsw           xmm0, xm3
    402    packuswb           xmm0, xmm0
    403    pextrw     [dstq+dsq*0], xmm0, 0
    404    pextrw     [dstq+dsq*1], xmm0, 2
    405    lea                dstq, [dstq+dsq*2]
    406    sub                  hd, 2
    407    jg .h_w2
    408    RET
    409 .h_w4:
    410    mova               xmm4, [bilin_h_shuf4]
    411 .h_w4_loop:
    412    movq               xmm0, [srcq+ssq*0]
    413    movhps             xmm0, [srcq+ssq*1]
    414    lea                srcq, [srcq+ssq*2]
    415    pshufb             xmm0, xmm4
    416    pmaddubsw          xmm0, xm5
    417    pmulhrsw           xmm0, xm3
    418    packuswb           xmm0, xmm0
    419    movd       [dstq+dsq*0], xmm0
    420    pextrd     [dstq+dsq*1], xmm0, 1
    421    lea                dstq, [dstq+dsq*2]
    422    sub                  hd, 2
    423    jg .h_w4_loop
    424    RET
    425 .h_w8:
    426    movu                xm0, [srcq+ssq*0]
    427    vinserti32x4        ym0, [srcq+ssq*1], 1
    428    lea                srcq, [srcq+ssq*2]
    429    pshufb              ym0, ym4
    430    pmaddubsw           ym0, ym5
    431    pmulhrsw            ym0, ym3
    432    vpmovuswb           xm0, ym0
    433    movq       [dstq+dsq*0], xm0
    434    movhps     [dstq+dsq*1], xm0
    435    lea                dstq, [dstq+dsq*2]
    436    sub                  hd, 2
    437    jg .h_w8
    438    RET
    439 .h_w16:
    440    mova                 m4, [bilin_h_perm16]
    441 .h_w16_loop:
    442    movu                ym0, [srcq+ssq*0]
    443    vinserti32x8         m0, [srcq+ssq*1], 1
    444    lea                srcq, [srcq+ssq*2]
    445    vpermb               m0, m4, m0
    446    pmaddubsw            m0, m5
    447    pmulhrsw             m0, m3
    448    vpmovuswb           ym0, m0
    449    mova         [dstq+dsq*0], xm0
    450    vextracti128 [dstq+dsq*1], ym0, 1
    451    lea                dstq, [dstq+dsq*2]
    452    sub                  hd, 2
    453    jg .h_w16_loop
    454    RET
    455 .h_w32:
    456    movu                ym0, [srcq+ssq*0+8*0]
    457    vinserti32x8         m0, [srcq+ssq*1+8*0], 1
    458    movu                ym1, [srcq+ssq*0+8*1]
    459    vinserti32x8         m1, [srcq+ssq*1+8*1], 1
    460    lea                srcq, [srcq+ssq*2]
    461    pshufb               m0, m4
    462    pshufb               m1, m4
    463    pmaddubsw            m0, m5
    464    pmaddubsw            m1, m5
    465    pmulhrsw             m0, m3
    466    pmulhrsw             m1, m3
    467    packuswb             m0, m1
    468    mova          [dstq+dsq*0], ym0
    469    vextracti32x8 [dstq+dsq*1], m0, 1
    470    lea                dstq, [dstq+dsq*2]
    471    sub                  hd, 2
    472    jg .h_w32
    473    RET
    474 .h_w64:
    475    movu                 m0, [srcq+8*0]
    476    movu                 m1, [srcq+8*1]
    477    pshufb               m0, m4
    478    pshufb               m1, m4
    479    pmaddubsw            m0, m5
    480    pmaddubsw            m1, m5
    481    pmulhrsw             m0, m3
    482    pmulhrsw             m1, m3
    483    packuswb             m0, m1
    484    add                srcq, ssq
    485    mova             [dstq], m0
    486    add                dstq, dsq
    487    dec                  hd
    488    jg .h_w64
    489    RET
    490 .h_w128:
    491    movu                 m0, [srcq+8*0]
    492    movu                 m2, [srcq+8*1]
    493    movu                 m1, [srcq+8*8]
    494    movu                 m6, [srcq+8*9]
    495    add                srcq, ssq
    496    REPX  {pshufb    x, m4}, m0, m2, m1, m6
    497    REPX  {pmaddubsw x, m5}, m0, m2, m1, m6
    498    REPX  {pmulhrsw  x, m3}, m0, m2, m1, m6
    499    packuswb             m0, m2
    500    packuswb             m1, m6
    501    mova        [dstq+64*0], m0
    502    mova        [dstq+64*1], m1
    503    add                dstq, dsq
    504    dec                  hd
    505    jg .h_w128
    506    RET
    507 .v:
    508    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
    509    imul               mxyd, 255
    510    vpbroadcastd         m5, [pw_2048]
    511    add                mxyd, 16
    512    add                  wq, r7
    513    vpbroadcastw         m4, mxyd
    514    jmp                  wq
    515 .v_w2:
    516    movd               xmm0,       [srcq+ssq*0]
    517 .v_w2_loop:
    518    pinsrw             xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
    519    lea                srcq,       [srcq+ssq*2]
    520    pinsrw             xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
    521    pshuflw            xmm1, xmm1, q2301           ; 1 0
    522    punpcklbw          xmm1, xmm0
    523    pmaddubsw          xmm1, xm4
    524    pmulhrsw           xmm1, xm5
    525    packuswb           xmm1, xmm1
    526    pextrw     [dstq+dsq*0], xmm1, 1
    527    pextrw     [dstq+dsq*1], xmm1, 0
    528    lea                dstq, [dstq+dsq*2]
    529    sub                  hd, 2
    530    jg .v_w2_loop
    531    RET
    532 .v_w4:
    533    movd               xmm0, [srcq+ssq*0]
    534 .v_w4_loop:
    535    vpbroadcastd       xmm2, [srcq+ssq*1]
    536    lea                srcq, [srcq+ssq*2]
    537    vpblendd           xmm1, xmm2, xmm0, 0x01 ; 0 1
    538    vpbroadcastd       xmm0, [srcq+ssq*0]
    539    vpblendd           xmm2, xmm0, 0x02       ; 1 2
    540    punpcklbw          xmm1, xmm2
    541    pmaddubsw          xmm1, xm4
    542    pmulhrsw           xmm1, xm5
    543    packuswb           xmm1, xmm1
    544    movd       [dstq+dsq*0], xmm1
    545    pextrd     [dstq+dsq*1], xmm1, 1
    546    lea                dstq, [dstq+dsq*2]
    547    sub                  hd, 2
    548    jg .v_w4_loop
    549    RET
    550 .v_w8:
    551    movq               xmm0, [srcq+ssq*0]
    552 .v_w8_loop:
    553    movq               xmm2, [srcq+ssq*1]
    554    lea                srcq, [srcq+ssq*2]
    555    punpcklbw          xmm1, xmm0, xmm2
    556    movq               xmm0, [srcq+ssq*0]
    557    punpcklbw          xmm2, xmm0
    558    pmaddubsw          xmm1, xm4
    559    pmaddubsw          xmm2, xm4
    560    pmulhrsw           xmm1, xm5
    561    pmulhrsw           xmm2, xm5
    562    packuswb           xmm1, xmm2
    563    movq       [dstq+dsq*0], xmm1
    564    movhps     [dstq+dsq*1], xmm1
    565    lea                dstq, [dstq+dsq*2]
    566    sub                  hd, 2
    567    jg .v_w8_loop
    568    RET
    569 .v_w16:
    570    movu               xmm0, [srcq+ssq*0]
    571 .v_w16_loop:
    572    vbroadcasti128     ymm3, [srcq+ssq*1]
    573    lea                srcq, [srcq+ssq*2]
    574    vpblendd           ymm2, ymm3, ymm0, 0x0f ; 0 1
    575    vbroadcasti128     ymm0, [srcq+ssq*0]
    576    vpblendd           ymm3, ymm0, 0xf0       ; 1 2
    577    punpcklbw          ymm1, ymm2, ymm3
    578    punpckhbw          ymm2, ymm3
    579    pmaddubsw          ymm1, ym4
    580    pmaddubsw          ymm2, ym4
    581    pmulhrsw           ymm1, ym5
    582    pmulhrsw           ymm2, ym5
    583    packuswb           ymm1, ymm2
    584    mova         [dstq+dsq*0], xmm1
    585    vextracti128 [dstq+dsq*1], ymm1, 1
    586    lea                dstq, [dstq+dsq*2]
    587    sub                  hd, 2
    588    jg .v_w16_loop
    589    vzeroupper
    590    RET
    591 .v_w32:
    592    movu                ym0, [srcq+ssq*0]
   ; k1 selects the low 256-bit half (8 dword lanes) for the blends below.
    593    kxnorb               k1, k1, k1
   ; .v_w32: vertical-only bilinear, 32-pixel rows, two output rows/iter.
   ; m4/m5 hold the two-tap vertical weights and rounding constant set up
   ; in the .v entry point (not visible in this chunk) -- TODO confirm.
    594 .v_w32_loop:
    595    vbroadcasti32x8      m3, [srcq+ssq*1]
    596    lea                srcq, [srcq+ssq*2]
   ; build row pairs across the two 256-bit halves: m2 = rows (0,1),
   ; m3 = rows (1,2), so one pmaddubsw pass filters two output rows.
    597    vpblendmd        m2{k1}, m3, m0 ; 0 1
    598    vbroadcasti32x8      m0, [srcq+ssq*0]
    599    vpblendmd        m3{k1}, m0, m3 ; 1 2
   ; interleave adjacent rows byte-wise, then weighted-sum with pmaddubsw.
    600    punpcklbw            m1, m2, m3
    601    punpckhbw            m2, m3
    602    pmaddubsw            m1, m4
    603    pmaddubsw            m2, m4
   ; round/scale and pack back to 8-bit.
    604    pmulhrsw             m1, m5
    605    pmulhrsw             m2, m5
    606    packuswb             m1, m2
    607    mova          [dstq+dsq*0], ym1
    608    vextracti32x8 [dstq+dsq*1], m1, 1
    609    lea                dstq, [dstq+dsq*2]
    610    sub                  hd, 2
    611    jg .v_w32_loop
    612    RET
   ; .v_w64: one full zmm per row; two output rows per iteration.
    613 .v_w64:
    614    movu                 m0, [srcq+ssq*0]
    615 .v_w64_loop:
    616    movu                 m3, [srcq+ssq*1]
    617    lea                srcq, [srcq+ssq*2]
   ; rows 0/1 -> m1/m6 (low/high byte interleave), rows 1/2 -> m2/m3.
    618    punpcklbw            m1, m0, m3
    619    punpckhbw            m6, m0, m3
    620    movu                 m0, [srcq+ssq*0]
    621    pmaddubsw            m1, m4
    622    pmaddubsw            m6, m4
    623    punpcklbw            m2, m3, m0
    624    punpckhbw            m3, m0
    625    pmaddubsw            m2, m4
    626    pmaddubsw            m3, m4
    627    REPX   {pmulhrsw x, m5}, m1, m6, m2, m3
    628    packuswb             m1, m6
    629    packuswb             m2, m3
    630    mova       [dstq+dsq*0], m1
    631    mova       [dstq+dsq*1], m2
    632    lea                dstq, [dstq+dsq*2]
    633    sub                  hd, 2
    634    jg .v_w64_loop
    635    RET
   ; .v_w128: two zmms per row; one output row per iteration, keeping the
   ; previous row live in m0/m1 across iterations.
    636 .v_w128:
    637    movu                 m0, [srcq+64*0]
    638    movu                 m1, [srcq+64*1]
    639 .v_w128_loop:
    640    add                srcq, ssq
    641    movu                 m2, [srcq+64*0]
    642    movu                 m3, [srcq+64*1]
    643    punpcklbw            m6, m0, m2
    644    pmaddubsw            m6, m4
    645    punpckhbw            m0, m2
    646    pmaddubsw            m0, m4
    647    punpcklbw            m7, m1, m3
    648    pmaddubsw            m7, m4
    649    punpckhbw            m1, m3
    650    pmaddubsw            m1, m4
    651    REPX   {pmulhrsw x, m5}, m6, m0, m7, m1
    652    packuswb             m6, m0
   ; rotate: current row becomes previous row for the next iteration.
    653    mova                 m0, m2
    654    packuswb             m7, m1
    655    mova                 m1, m3
    656    mova        [dstq+64*0], m6
    657    mova        [dstq+64*1], m7
    658    add                dstq, dsq
    659    dec                  hd
    660    jg .v_w128_loop
    661    RET
   ; .hv: horizontal pass was already applied on entry (m4/m5 = h shuffle
   ; and h weights set up earlier, off-screen -- TODO confirm); this path
   ; adds the vertical interpolation on the 16-bit intermediates.
    662 .hv:
    663    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
    664    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
    665    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
    666    WIN64_SPILL_XMM       8
    667    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
    668    vpbroadcastd         m7, [pw_2048]
    669    add                  wq, r7
   ; m6 = my vertical weight (broadcast), m7 = final rounding constant.
    670    vpbroadcastw         m6, mxyd
    671    jmp                  wq
   ; .hv_w2: keep the previous h-filtered row in xmm0; each iteration
   ; h-filters rows 1 and 2 together, then blends with row 0/1 via
   ; the doubled-difference * my (pmulhw) per the formula above.
    672 .hv_w2:
    673    vpbroadcastd       xmm0, [srcq+ssq*0]
    674    pshufb             xmm0, xm4
    675    pmaddubsw          xmm0, xm5
    676 .hv_w2_loop:
    677    movd               xmm1, [srcq+ssq*1]
    678    lea                srcq, [srcq+ssq*2]
    679    pinsrd             xmm1, [srcq+ssq*0], 1
    680    pshufb             xmm1, xm4
    681    pmaddubsw          xmm1, xm5               ; 1 _ 2 _
    682    shufps             xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _
    683    mova               xmm0, xmm1
   ; v = a + ((my * (b - a)) >> 4), computed as pmulhw of 2*(b-a) by
   ; (my << 11), then rounded by pmulhrsw with 2048.
    684    psubw              xmm1, xmm2
    685    paddw              xmm1, xmm1
    686    pmulhw             xmm1, xm6
    687    paddw              xmm1, xmm2
    688    pmulhrsw           xmm1, xm7
    689    packuswb           xmm1, xmm1
    690    pextrw     [dstq+dsq*0], xmm1, 0
    691    pextrw     [dstq+dsq*1], xmm1, 2
    692    lea                dstq, [dstq+dsq*2]
    693    sub                  hd, 2
    694    jg .hv_w2_loop
    695    RET
   ; .hv_w4: same scheme, 4 pixels per row, two rows per xmm.
    696 .hv_w4:
    697    mova               xmm4, [bilin_h_shuf4]
    698    movddup            xmm0, [srcq+ssq*0]
    699    pshufb             xmm0, xmm4
    700    pmaddubsw          xmm0, xm5
    701 .hv_w4_loop:
    702    movq               xmm1, [srcq+ssq*1]
    703    lea                srcq, [srcq+ssq*2]
    704    movhps             xmm1, [srcq+ssq*0]
    705    pshufb             xmm1, xmm4
    706    pmaddubsw          xmm1, xm5               ; 1 2
    707    shufps             xmm2, xmm0, xmm1, q1032 ; 0 1
    708    mova               xmm0, xmm1
    709    psubw              xmm1, xmm2
    710    paddw              xmm1, xmm1
    711    pmulhw             xmm1, xm6
    712    paddw              xmm1, xmm2
    713    pmulhrsw           xmm1, xm7
    714    packuswb           xmm1, xmm1
    715    movd       [dstq+dsq*0], xmm1
    716    pextrd     [dstq+dsq*1], xmm1, 1
    717    lea                dstq, [dstq+dsq*2]
    718    sub                  hd, 2
    719    jg .hv_w4_loop
    720    RET
   ; .hv_w8: two rows per ymm; valignq shifts the previous row into
   ; position instead of shufps.
    721 .hv_w8:
    722    vbroadcasti128      ym0, [srcq+ssq*0]
    723    pshufb              ym0, ym4
    724    pmaddubsw           ym0, ym5
    725 .hv_w8_loop:
    726    movu                xm1, [srcq+ssq*1]
    727    lea                srcq, [srcq+ssq*2]
    728    vinserti128         ym1, [srcq+ssq*0], 1
    729    pshufb              ym1, ym4
    730    pmaddubsw           ym1, ym5            ; 1 2
    731    valignq             ym2, ym1, ym0, 2
    732    mova                ym0, ym1
    733    psubw               ym1, ym2
    734    paddw               ym1, ym1
    735    pmulhw              ym1, ym6
    736    paddw               ym1, ym2
    737    pmulhrsw            ym1, ym7
   ; saturating word->byte narrow of the unsigned result.
    738    vpmovuswb           xm1, ym1
    739    movq       [dstq+dsq*0], xm1
    740    movhps     [dstq+dsq*1], xm1
    741    lea                dstq, [dstq+dsq*2]
    742    sub                  hd, 2
    743    jg .hv_w8_loop
    744    RET
   ; .hv_w16: two 16-px rows per zmm; vpermb with bilin_h_perm16 replaces
   ; the pshufb-based horizontal gather.
    745 .hv_w16:
    746    vbroadcasti32x8      m0, [srcq+ssq*0]
    747    mova                 m4, [bilin_h_perm16]
    748    vpermb               m0, m4, m0
    749    pmaddubsw            m0, m5
    750 .hv_w16_loop:
    751    movu                ym1, [srcq+ssq*1]
    752    lea                srcq, [srcq+ssq*2]
    753    vinserti32x8         m1, [srcq+ssq*0], 1
    754    vpermb               m1, m4, m1
    755    pmaddubsw            m1, m5        ; 1 2
    756    valignq              m2, m1, m0, 4 ; 0 1
    757    mova                 m0, m1
   ; vertical blend: a + ((my * (b - a)) >> 4), then round via pw_2048.
    758    psubw                m1, m2
    759    paddw                m1, m1
    760    pmulhw               m1, m6
    761    paddw                m1, m2
    762    pmulhrsw             m1, m7
    763    vpmovuswb           ym1, m1
    764    mova          [dstq+dsq*0], xm1
    765    vextracti32x4 [dstq+dsq*1], ym1, 1
    766    lea                dstq, [dstq+dsq*2]
    767    sub                  hd, 2
    768    jg .hv_w16_loop
    769    RET
   ; .hv_w32: one 32-px row per zmm; m8 un-interleaves the packuswb
   ; lane ordering so the two rows can be stored contiguously.
    770 .hv_w32:
    771    mova                 m4, [bilin_h_perm32]
    772    vpermb               m0, m4, [srcq+ssq*0]
    773    pmovzxbq             m8, [pb_02461357]
    774    pmaddubsw            m0, m5
    775 .hv_w32_loop:
    776    vpermb               m2, m4, [srcq+ssq*1]
    777    lea                srcq, [srcq+ssq*2]
    778    vpermb               m3, m4, [srcq+ssq*0]
    779    pmaddubsw            m2, m5
   ; first output row from (m0 = prev, m2 = row 1)...
    780    psubw                m1, m2, m0
    781    paddw                m1, m1
    782    pmulhw               m1, m6
    783    paddw                m1, m0
   ; ...second output row from (m2 = row 1, m0 = row 2), reusing m0 as
   ; both the new h-filtered row and next iteration's "previous" row.
    784    pmaddubsw            m0, m3, m5
    785    psubw                m3, m0, m2
    786    paddw                m3, m3
    787    pmulhw               m3, m6
    788    paddw                m3, m2
    789    pmulhrsw             m1, m7
    790    pmulhrsw             m3, m7
    791    packuswb             m1, m3
    792    vpermq               m1, m8, m1
    793    mova          [dstq+dsq*0], ym1
    794    vextracti32x8 [dstq+dsq*1], m1, 1
    795    lea                dstq, [dstq+dsq*2]
    796    sub                  hd, 2
    797    jg .hv_w32_loop
    798    RET
   ; .hv_w64: one row = two overlapping 64-byte loads (+8 offset gives the
   ; x+1 neighbours for the horizontal tap); one output row per iteration.
    799 .hv_w64:
    800    movu                 m0, [srcq+8*0]
    801    movu                 m1, [srcq+8*1]
    802    pshufb               m0, m4
    803    pshufb               m1, m4
    804    pmaddubsw            m0, m5
    805    pmaddubsw            m1, m5
    806 .hv_w64_loop:
    807    add                srcq, ssq
    808    movu                 m2, [srcq+8*0]
    809    movu                 m3, [srcq+8*1]
    810    pshufb               m2, m4
    811    pshufb               m3, m4
    812    pmaddubsw            m2, m5
    813    pmaddubsw            m3, m5
   ; vertical blend of previous (m0/m1) against current (m2/m3).
    814    psubw                m8, m2, m0
    815    psubw                m9, m3, m1
    816    paddw                m8, m8
    817    pmulhw               m8, m6
    818    paddw                m9, m9
    819    pmulhw               m9, m6
    820    paddw                m8, m0
    821    pmulhrsw             m8, m7
    822    paddw                m9, m1
    823    pmulhrsw             m9, m7
   ; current row becomes previous row.
    824    mova                 m0, m2
    825    mova                 m1, m3
    826    packuswb             m8, m9
    827    mova             [dstq], m8
    828    add                dstq, dsq
    829    dec                  hd
    830    jg .hv_w64_loop
    831    RET
   ; .hv_w128: same as .hv_w64 but four register pairs per row
   ; (offsets 8*0/8*1 and 8*8/8*9 cover the two 64-byte halves).
    832 .hv_w128:
    833    movu                 m0, [srcq+8*0]
    834    movu                 m1, [srcq+8*1]
    835    movu                 m2, [srcq+8*8]
    836    movu                 m3, [srcq+8*9]
    837    REPX  {pshufb    x, m4}, m0, m1, m2, m3
    838    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
    839 .hv_w128_loop:
    840    add                srcq, ssq
    841    movu                 m8, [srcq+8*0]
    842    movu                 m9, [srcq+8*1]
    843    movu                m10, [srcq+8*8]
    844    movu                m11, [srcq+8*9]
    845    REPX  {pshufb    x, m4}, m8, m9, m10, m11
    846    REPX  {pmaddubsw x, m5}, m8, m9, m10, m11
    847    psubw               m12, m8, m0
    848    psubw               m13, m9, m1
    849    psubw               m14, m10, m2
    850    psubw               m15, m11, m3
    851    paddw               m12, m12
    852    pmulhw              m12, m6
    853    paddw               m13, m13
    854    pmulhw              m13, m6
    855    paddw               m14, m14
    856    pmulhw              m14, m6
    857    paddw               m15, m15
    858    pmulhw              m15, m6
    859    paddw               m12, m0
    860    pmulhrsw            m12, m7
    861    paddw               m13, m1
    862    pmulhrsw            m13, m7
    863    paddw               m14, m2
    864    pmulhrsw            m14, m7
    865    paddw               m15, m3
    866    pmulhrsw            m15, m7
   ; rotate row state for the next iteration.
    867    mova                 m0, m8
    868    mova                 m1, m9
    869    mova                 m2, m10
    870    mova                 m3, m11
    871    packuswb            m12, m13
    872    packuswb            m14, m15
    873    mova        [dstq+64*0], m12
    874    mova        [dstq+64*1], m14
    875    add                dstq, dsq
    876    dec                  hd
    877    jg .hv_w128_loop
    878    RET
    879 
   ; temp register aliases for the prep functions (t0..t2 -> r3/r5/r6).
    880 DECLARE_REG_TMP 3, 5, 6
    881 
   ; prep_bilin: writes 16-bit intermediates (16*src, or bilin-filtered)
   ; to tmp for later compound averaging. Dispatch mirrors put_bilin:
   ; no mx/my -> .prep (copy), mx -> .h, my -> .v, both -> .hv.
    882 cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    883    movifnidn          mxyd, r5m ; mx
    884    lea                  t2, [prep_avx512icl]
    885    tzcnt                wd, wm
    886    movifnidn            hd, hm
    887    test               mxyd, mxyd
    888    jnz .h
    889    mov                mxyd, r6m ; my
    890    test               mxyd, mxyd
    891    jnz .v
   ; copy-only path: jump into the width-specific loop via the jump table.
    892 .prep:
    893    movzx                wd, word [t2+wq*2+table_offset(prep,)]
    894    add                  wq, t2
    895    lea            stride3q, [strideq*3]
    896    jmp                  wq
   ; .prep_w4: gather 4 rows of 4 bytes, zero-extend to words, scale by 16.
    897 .prep_w4:
    898    movd               xmm0, [srcq+strideq*0]
    899    pinsrd             xmm0, [srcq+strideq*1], 1
    900    pinsrd             xmm0, [srcq+strideq*2], 2
    901    pinsrd             xmm0, [srcq+stride3q ], 3
    902    lea                srcq, [srcq+strideq*4]
    903    pmovzxbw            ym0, xmm0
    904    psllw               ym0, 4
    905    mova             [tmpq], ym0
    906    add                tmpq, 32
    907    sub                  hd, 4
    908    jg .prep_w4
    909    RET
   ; .prep_w8: pack 4 rows of 8 bytes into one ymm, widen to a zmm of
   ; words, scale by 16 and store.
    910 .prep_w8:
    911    movq               xmm0, [srcq+strideq*0]
    912    movq               xmm1, [srcq+strideq*1]
    913    vinserti128         ym0, ymm0, [srcq+strideq*2], 1
    914    vinserti128         ym1, ymm1, [srcq+stride3q ], 1
    915    lea                srcq, [srcq+strideq*4]
    916    punpcklqdq          ym0, ym1
    917    pmovzxbw             m0, ym0
    918    psllw                m0, 4
    919    mova             [tmpq], m0
    920    add                tmpq, 32*2
    921    sub                  hd, 4
    922    jg .prep_w8
    923    RET
   ; .prep_w16: 2 rows per ymm, widened to one zmm each; 4 rows/iter.
    924 .prep_w16:
    925    movu               xmm0, [srcq+strideq*0]
    926    vinserti128         ym0, ymm0, [srcq+strideq*1], 1
    927    movu               xmm1, [srcq+strideq*2]
    928    vinserti128         ym1, ymm1, [srcq+stride3q ], 1
    929    lea                srcq, [srcq+strideq*4]
    930    pmovzxbw             m0, ym0
    931    pmovzxbw             m1, ym1
    932    psllw                m0, 4
    933    psllw                m1, 4
    934    mova        [tmpq+64*0], m0
    935    mova        [tmpq+64*1], m1
    936    add                tmpq, 32*4
    937    sub                  hd, 4
    938    jg .prep_w16
    939    RET
   ; .prep_w32: one 32-px row widens to one zmm of words; 4 rows/iter.
    940 .prep_w32:
    941    pmovzxbw             m0, [srcq+strideq*0]
    942    pmovzxbw             m1, [srcq+strideq*1]
    943    pmovzxbw             m2, [srcq+strideq*2]
    944    pmovzxbw             m3, [srcq+stride3q ]
    945    lea                srcq, [srcq+strideq*4]
    946    REPX       {psllw x, 4}, m0, m1, m2, m3
    947    mova        [tmpq+64*0], m0
    948    mova        [tmpq+64*1], m1
    949    mova        [tmpq+64*2], m2
    950    mova        [tmpq+64*3], m3
    951    add                tmpq, 64*4
    952    sub                  hd, 4
    953    jg .prep_w32
    954    RET
   ; .prep_w64: two zmms per row, 2 rows/iter.
    955 .prep_w64:
    956    pmovzxbw             m0, [srcq+strideq*0+32*0]
    957    pmovzxbw             m1, [srcq+strideq*0+32*1]
    958    pmovzxbw             m2, [srcq+strideq*1+32*0]
    959    pmovzxbw             m3, [srcq+strideq*1+32*1]
    960    lea                srcq, [srcq+strideq*2]
    961    REPX       {psllw x, 4}, m0, m1, m2, m3
    962    mova        [tmpq+64*0], m0
    963    mova        [tmpq+64*1], m1
    964    mova        [tmpq+64*2], m2
    965    mova        [tmpq+64*3], m3
    966    add                tmpq, 64*4
    967    sub                  hd, 2
    968    jg .prep_w64
    969    RET
   ; .prep_w128: four zmms per row, 1 row/iter.
    970 .prep_w128:
    971    pmovzxbw             m0, [srcq+32*0]
    972    pmovzxbw             m1, [srcq+32*1]
    973    pmovzxbw             m2, [srcq+32*2]
    974    pmovzxbw             m3, [srcq+32*3]
    975    REPX       {psllw x, 4}, m0, m1, m2, m3
    976    mova    [tmpq+64*0], m0
    977    mova    [tmpq+64*1], m1
    978    mova    [tmpq+64*2], m2
    979    mova    [tmpq+64*3], m3
    980    add                tmpq, 64*4
    981    add                srcq, strideq
    982    dec                  hd
    983    jg .prep_w128
    984    RET
   ; .h: horizontal-only bilinear prep. m5 = packed byte weight pair
   ; (16-mx, mx) for pmaddubsw; if my is also set, fall through to .hv.
    985 .h:
    986    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
    987    ; = (16 - mx) * src[x] + mx * src[x + 1]
    988    imul               mxyd, 255
    989    add                mxyd, 16
    990    vpbroadcastw         m5, mxyd
    991    mov                mxyd, r6m ; my
    992    test               mxyd, mxyd
    993    jnz .hv
    994    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
    995    add                  wq, t2
    996    lea            stride3q, [strideq*3]
    997    jmp                  wq
   ; .h_w4: 4 rows packed into one ymm; bilin_h_shuf4 pairs each pixel
   ; with its right neighbour for the pmaddubsw tap.
    998 .h_w4:
    999    vbroadcasti32x4     ym4, [bilin_h_shuf4]
   1000 .h_w4_loop:
   1001    movq               xmm0, [srcq+strideq*0]
   1002    movq               xmm1, [srcq+strideq*1]
   1003    vinserti32x4        ym0, ymm0, [srcq+strideq*2], 1
   1004    vinserti32x4        ym1, ymm1, [srcq+stride3q ], 1
   1005    lea                srcq, [srcq+strideq*4]
   1006    punpcklqdq          ym0, ym1
   1007    pshufb              ym0, ym4
   1008    pmaddubsw           ym0, ym5
   1009    mova             [tmpq], ym0
   1010    add                tmpq, 32
   1011    sub                  hd, 4
   1012    jg .h_w4_loop
   1013    RET
   ; .h_w8: 4 rows in one zmm (one 128-bit lane per row).
   1014 .h_w8:
   1015    vbroadcasti32x4      m4, [bilin_h_perm16]
   1016 .h_w8_loop:
   1017    movu               xmm0, [srcq+strideq*0]
   1018    vinserti32x4        ym0, ymm0, [srcq+strideq*1], 1
   1019    vinserti32x4         m0, [srcq+strideq*2], 2
   1020    vinserti32x4         m0, [srcq+stride3q ], 3
   1021    lea                srcq, [srcq+strideq*4]
   1022    pshufb               m0, m4
   1023    pmaddubsw            m0, m5
   1024    mova             [tmpq], m0
   1025    add                tmpq, 64
   1026    sub                  hd, 4
   1027    jg .h_w8_loop
   1028    RET
   ; .h_w16: vpermb gathers pixel/neighbour byte pairs across the full
   ; zmm; 4 rows per iteration (2 rows per zmm).
   1029 .h_w16:
   1030    mova                 m4, [bilin_h_perm16]
   1031 .h_w16_loop:
   1032    movu                ym0, [srcq+strideq*0]
   1033    vinserti32x8         m0, [srcq+strideq*1], 1
   1034    movu                ym1, [srcq+strideq*2]
   1035    vinserti32x8         m1, [srcq+stride3q ], 1
   1036    lea                srcq, [srcq+strideq*4]
   1037    vpermb               m0, m4, m0
   1038    vpermb               m1, m4, m1
   1039    pmaddubsw            m0, m5
   1040    pmaddubsw            m1, m5
   1041    mova        [tmpq+64*0], m0
   1042    mova        [tmpq+64*1], m1
   1043    add                tmpq, 64*2
   1044    sub                  hd, 4
   1045    jg .h_w16_loop
   1046    RET
   ; .h_w32: one row per zmm, permute straight from memory; 4 rows/iter.
   1047 .h_w32:
   1048    mova                 m4, [bilin_h_perm32]
   1049 .h_w32_loop:
   1050    vpermb               m0, m4, [srcq+strideq*0]
   1051    vpermb               m1, m4, [srcq+strideq*1]
   1052    vpermb               m2, m4, [srcq+strideq*2]
   1053    vpermb               m3, m4, [srcq+stride3q ]
   1054    lea                srcq, [srcq+strideq*4]
   1055    pmaddubsw            m0, m5
   1056    pmaddubsw            m1, m5
   1057    pmaddubsw            m2, m5
   1058    pmaddubsw            m3, m5
   1059    mova        [tmpq+64*0], m0
   1060    mova        [tmpq+64*1], m1
   1061    mova        [tmpq+64*2], m2
   1062    mova        [tmpq+64*3], m3
   1063    add                tmpq, 64*4
   1064    sub                  hd, 4
   1065    jg .h_w32_loop
   1066    RET
   ; .h_w64: two 32-px permutes per row, 2 rows/iter.
   1067 .h_w64:
   1068    mova                 m4, [bilin_h_perm32]
   1069 .h_w64_loop:
   1070    vpermb               m0, m4, [srcq+strideq*0+32*0]
   1071    vpermb               m1, m4, [srcq+strideq*0+32*1]
   1072    vpermb               m2, m4, [srcq+strideq*1+32*0]
   1073    vpermb               m3, m4, [srcq+strideq*1+32*1]
   1074    lea                srcq, [srcq+strideq*2]
   1075    pmaddubsw            m0, m5
   1076    pmaddubsw            m1, m5
   1077    pmaddubsw            m2, m5
   1078    pmaddubsw            m3, m5
   1079    mova        [tmpq+64*0], m0
   1080    mova        [tmpq+64*1], m1
   1081    mova        [tmpq+64*2], m2
   1082    mova        [tmpq+64*3], m3
   1083    add                tmpq, 64*4
   1084    sub                  hd, 2
   1085    jg .h_w64_loop
   1086    RET
   ; .h_w128: four 32-px permutes per row, 1 row/iter.
   1087 .h_w128:
   1088    mova                 m4, [bilin_h_perm32]
   1089 .h_w128_loop:
   1090    vpermb               m0, m4, [srcq+32*0]
   1091    vpermb               m1, m4, [srcq+32*1]
   1092    vpermb               m2, m4, [srcq+32*2]
   1093    vpermb               m3, m4, [srcq+32*3]
   1094    pmaddubsw            m0, m5
   1095    pmaddubsw            m1, m5
   1096    pmaddubsw            m2, m5
   1097    pmaddubsw            m3, m5
   1098    mova        [tmpq+64*0], m0
   1099    mova        [tmpq+64*1], m1
   1100    mova        [tmpq+64*2], m2
   1101    mova        [tmpq+64*3], m3
   1102    add                tmpq, 64*4
   1103    add                srcq, strideq
   1104    dec                  hd
   1105    jg .h_w128_loop
   1106    RET
   ; .v: vertical-only bilinear prep. m6 = packed byte weight pair
   ; (16-my, my) broadcast for pmaddubsw on interleaved row pairs.
   1107 .v:
   1108    WIN64_SPILL_XMM       7
   1109    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
   1110    imul               mxyd, 255
   1111    add                mxyd, 16
   1112    add                  wq, t2
   1113    lea            stride3q, [strideq*3]
   1114    vpbroadcastw         m6, mxyd
   1115    jmp                  wq
   ; .v_w4: masked broadcasts/blends (k1 = 0x29) assemble rows 0..4 into
   ; one ymm so bilin_v_shuf4 can pair consecutive rows per lane; the
   ; inline comments track which rows occupy which dwords.
   1116 .v_w4:
   1117    vpbroadcastd        xm0, [srcq+strideq*0]
   1118    mov                 r3d, 0x29
   1119    vbroadcasti32x4     ym3, [bilin_v_shuf4]
   1120    kmovb                k1, r3d
   1121 .v_w4_loop:
   1122    vpblendmd       xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
   1123    vpbroadcastd        ym2, [srcq+strideq*2]
   1124    vpbroadcastd    ym2{k1}, [srcq+stride3q ]             ; __2_ 23__
   1125    lea                srcq, [srcq+strideq*4]
   1126    vpbroadcastd        ym0, [srcq+strideq*0]
   1127    punpckhqdq      ym2{k1}, ym1, ym0                     ; 012_ 234_
   1128    pshufb              ym2, ym3
   1129    pmaddubsw           ym2, ym6
   1130    mova             [tmpq], ym2
   1131    add                tmpq, 32
   1132    sub                  hd, 4
   1133    jg .v_w4_loop
   1134    RET
   ; .v_w8: bilin_v_perm8 interleaves the 5 loaded rows (0..4) into
   ; row-pair byte order via a single vpermt2b.
   1135 .v_w8:
   1136    mova                 m5, [bilin_v_perm8]
   1137    vbroadcasti32x4     ym0, [srcq+strideq*0]
   1138 .v_w8_loop:
   1139    vinserti32x4        ym1, ym0, [srcq+strideq*1], 1
   1140    vpbroadcastq        ym0, [srcq+strideq*2]
   1141    vinserti32x4         m1, [srcq+stride3q ], 2
   1142    lea                srcq, [srcq+strideq*4]
   1143    vinserti32x4        ym0, [srcq+strideq*0], 0
   1144    vpermt2b             m1, m5, m0
   1145    pmaddubsw            m1, m6
   1146    mova             [tmpq], m1
   1147    add                tmpq, 64
   1148    sub                  hd, 4
   1149    jg .v_w8_loop
   1150    RET
   ; .v_w16: vpermt2b combines two consecutive rows per zmm; the previous
   ; iteration's last row is carried in xm0.
   1151 .v_w16:
   1152    mova                 m5, [bilin_v_perm16]
   1153    movu                xm0, [srcq+strideq*0]
   1154 .v_w16_loop:
   1155    movu                xm2, [srcq+strideq*2]
   1156    vinserti32x4        ym1, ym0, [srcq+strideq*1], 1
   1157    vpermt2b             m1, m5, m2
   1158    vinserti32x4        ym2, [srcq+stride3q ], 1
   1159    lea                srcq, [srcq+strideq*4]
   1160    movu                xm0, [srcq+strideq*0]
   1161    vpermt2b             m2, m5, m0
   1162    pmaddubsw            m1, m6
   1163    pmaddubsw            m2, m6
   1164    mova        [tmpq+64*0], m1
   1165    mova        [tmpq+64*1], m2
   1166    add                tmpq, 64*2
   1167    sub                  hd, 4
   1168    jg .v_w16_loop
   1169    RET
   ; .v_w32: each vpermt2b interleaves one row with its successor; four
   ; row pairs (0/1..3/4) produced per iteration.
   1170 .v_w32:
   1171    mova                 m5, [bilin_v_perm32]
   1172    movu                ym0, [srcq+strideq*0]
   1173 .v_w32_loop:
   1174    movu                ym2, [srcq+strideq*1]
   1175    movu                ym3, [srcq+strideq*2]
   1176    movu                ym4, [srcq+stride3q ]
   1177    lea                srcq, [srcq+strideq*4]
   1178    vpermt2b             m0, m5, m2
   1179    vpermt2b             m2, m5, m3
   1180    vpermt2b             m3, m5, m4
   1181    pmaddubsw            m1, m0, m6
   1182    movu                ym0, [srcq+strideq*0]
   1183    vpermt2b             m4, m5, m0
   1184    pmaddubsw            m2, m6
   1185    pmaddubsw            m3, m6
   1186    pmaddubsw            m4, m6
   1187    mova        [tmpq+64*0], m1
   1188    mova        [tmpq+64*1], m2
   1189    mova        [tmpq+64*2], m3
   1190    mova        [tmpq+64*3], m4
   1191    add                tmpq, 64*4
   1192    sub                  hd, 4
   1193    jg .v_w32_loop
   1194    RET
   ; .v_w64: rows are pre-permuted with bilin_v_perm64 so that
   ; punpck{l,h}bw on consecutive rows yields the pmaddubsw input order;
   ; 2 output rows (4 zmm stores) per iteration.
   1195 .v_w64:
   1196    mova                 m5, [bilin_v_perm64]
   1197    vpermq               m0, m5, [srcq+strideq*0]
   1198 .v_w64_loop:
   1199    vpermq               m1, m5, [srcq+strideq*1]
   1200    lea                srcq, [srcq+strideq*2]
   1201    punpcklbw            m4, m0, m1
   1202    punpckhbw            m2, m0, m1
   1203    vpermq               m0, m5, [srcq+strideq*0]
   1204    punpcklbw            m3, m1, m0
   1205    punpckhbw            m1, m0
   1206    pmaddubsw            m4, m6
   1207    pmaddubsw            m2, m6
   1208    pmaddubsw            m3, m6
   1209    pmaddubsw            m1, m6
   1210    mova        [tmpq+64*0], m4
   1211    mova        [tmpq+64*1], m2
   1212    mova        [tmpq+64*2], m3
   1213    mova        [tmpq+64*3], m1
   1214    add                tmpq, 64*4
   1215    sub                  hd, 2
   1216    jg .v_w64_loop
   1217    RET
   ; .v_w128: same scheme with two permuted zmms per row; 8 zmm stores
   ; (2 output rows) per iteration.
   1218 .v_w128:
   1219    mova                 m5, [bilin_v_perm64]
   1220    vpermq               m0, m5, [srcq+strideq*0+ 0]
   1221    vpermq               m1, m5, [srcq+strideq*0+64]
   1222 .v_w128_loop:
   1223    vpermq               m2, m5, [srcq+strideq*1+ 0]
   1224    vpermq               m3, m5, [srcq+strideq*1+64]
   1225    lea                srcq, [srcq+strideq*2]
   1226    punpcklbw            m4, m0, m2
   1227    punpckhbw            m0, m2
   1228    pmaddubsw            m4, m6
   1229    pmaddubsw            m0, m6
   1230    mova        [tmpq+64*0], m4
   1231    mova        [tmpq+64*1], m0
   1232    punpcklbw            m4, m1, m3
   1233    punpckhbw            m1, m3
   1234    pmaddubsw            m4, m6
   1235    pmaddubsw            m1, m6
   1236    mova        [tmpq+64*2], m4
   1237    mova        [tmpq+64*3], m1
   1238    vpermq               m0, m5, [srcq+strideq*0+ 0]
   1239    vpermq               m1, m5, [srcq+strideq*0+64]
   1240    punpcklbw            m4, m2, m0
   1241    punpckhbw            m2, m0
   1242    pmaddubsw            m4, m6
   1243    pmaddubsw            m2, m6
   1244    mova        [tmpq+64*4], m4
   1245    mova        [tmpq+64*5], m2
   1246    punpcklbw            m4, m3, m1
   1247    punpckhbw            m3, m1
   1248    pmaddubsw            m4, m6
   1249    pmaddubsw            m3, m6
   1250    mova        [tmpq+64*6], m4
   1251    mova        [tmpq+64*7], m3
   1252    add                tmpq, 64*8
   1253    sub                  hd, 2
   1254    jg .v_w128_loop
   1255    RET
   ; .hv: horizontal weights already live in m5 (set in .h); m6 becomes
   ; the vertical weight my << 11 for the pmulhrsw-based blend below.
   1256 .hv:
   1257    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
   1258    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
   1259    WIN64_SPILL_XMM       7
   1260    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
   1261    shl                mxyd, 11
   1262    vpbroadcastw         m6, mxyd
   1263    add                  wq, t2
   1264    lea            stride3q, [strideq*3]
   1265    jmp                  wq
   ; .hv_w4: h-filter 4 new rows per iteration into ym1; valignq shifts
   ; the previous row in so rows N and N+1 line up for the vertical blend.
   1266 .hv_w4:
   1267    vbroadcasti32x4     ym4, [bilin_h_shuf4]
   1268    vpbroadcastq        ym0, [srcq+strideq*0]
   1269    pshufb              ym0, ym4
   1270    pmaddubsw           ym0, ym5
   1271 .hv_w4_loop:
   1272    movq               xmm1, [srcq+strideq*1]
   1273    movq               xmm2, [srcq+strideq*2]
   1274    vinserti32x4        ym1, ymm1, [srcq+stride3q ], 1
   1275    lea                srcq, [srcq+strideq*4]
   1276    vinserti32x4        ym2, ymm2, [srcq+strideq*0], 1
   1277    punpcklqdq          ym1, ym2
   1278    pshufb              ym1, ym4
   1279    pmaddubsw           ym1, ym5         ; 1 2 3 4
   1280    valignq             ym2, ym1, ym0, 3 ; 0 1 2 3
   1281    mova                ym0, ym1
   ; intermediate = a + ((my * (b - a) + 8) >> 4), via pmulhrsw by my<<11.
   1282    psubw               ym1, ym2
   1283    pmulhrsw            ym1, ym6
   1284    paddw               ym1, ym2
   1285    mova             [tmpq], ym1
   1286    add                tmpq, 32
   1287    sub                  hd, 4
   1288    jg .hv_w4_loop
   1289    RET
   ; .hv_w8: 4 rows per zmm (one per 128-bit lane); valignq by 6 qwords
   ; aligns rows 0-3 against rows 1-4 for the vertical blend.
   1290 .hv_w8:
   1291    vbroadcasti32x4      m4, [bilin_h_perm16]
   1292    vbroadcasti32x4      m0, [srcq+strideq*0]
   1293    pshufb               m0, m4
   1294    pmaddubsw            m0, m5
   1295 .hv_w8_loop:
   1296    movu               xmm1, [srcq+strideq*1]
   1297    vinserti128         ym1, ymm1, [srcq+strideq*2], 1
   1298    vinserti128          m1, [srcq+stride3q ], 2
   1299    lea                srcq, [srcq+strideq*4]
   1300    vinserti128          m1, [srcq+strideq*0], 3
   1301    pshufb               m1, m4
   1302    pmaddubsw            m1, m5        ; 1 2 3 4
   1303    valignq              m2, m1, m0, 6 ; 0 1 2 3
   1304    mova                 m0, m1
   1305    psubw                m1, m2
   1306    pmulhrsw             m1, m6
   1307    paddw                m1, m2
   1308    mova             [tmpq], m1
   1309    add                tmpq, 64
   1310    sub                  hd, 4
   1311    jg .hv_w8_loop
   1312    RET
   ; .hv_w16: two rows per zmm; vshufi32x4 rotates 256-bit halves to pair
   ; each h-filtered row with its predecessor (lane comments track rows).
   1313 .hv_w16:
   1314    mova                 m4, [bilin_h_perm16]
   1315    vbroadcasti32x8      m0, [srcq+strideq*0]
   1316    vpermb               m0, m4, m0
   1317    pmaddubsw            m0, m5
   1318 .hv_w16_loop:
   1319    movu                ym1, [srcq+strideq*1]
   1320    vinserti32x8         m1, [srcq+strideq*2], 1
   1321    movu                ym2, [srcq+stride3q ]
   1322    lea                srcq, [srcq+strideq*4]
   1323    vinserti32x8         m2, [srcq+strideq*0], 1
   1324    vpermb               m1, m4, m1
   1325    vpermb               m2, m4, m2
   1326    pmaddubsw            m1, m5            ; 1 2
   1327    vshufi32x4           m3, m0, m1, q1032 ; 0 1
   1328    pmaddubsw            m0, m2, m5        ; 3 4
   1329    vshufi32x4           m2, m1, m0, q1032 ; 2 3
   1330    psubw                m1, m3
   1331    pmulhrsw             m1, m6
   1332    paddw                m1, m3
   1333    psubw                m3, m0, m2
   1334    pmulhrsw             m3, m6
   1335    paddw                m3, m2
   1336    mova        [tmpq+64*0], m1
   1337    mova        [tmpq+64*1], m3
   1338    add                tmpq, 64*2
   1339    sub                  hd, 4
   1340    jg .hv_w16_loop
   1341    RET
   ; .hv_w32: one 32-px row per zmm; produces two intermediate rows per
   ; iteration, reusing m0 as both row 2's h-filter result and the
   ; carried "previous row" for the next iteration.
   1342 .hv_w32:
   1343    mova                 m4, [bilin_h_perm32]
   1344    vpermb               m0, m4, [srcq+strideq*0]
   1345    pmaddubsw            m0, m5
   1346 .hv_w32_loop:
   1347    vpermb               m1, m4, [srcq+strideq*1]
   1348    lea                srcq, [srcq+strideq*2]
   1349    vpermb               m2, m4, [srcq+strideq*0]
   1350    pmaddubsw            m1, m5
   1351    psubw                m3, m1, m0
   1352    pmulhrsw             m3, m6
   1353    paddw                m3, m0
   1354    pmaddubsw            m0, m2, m5
   1355    psubw                m2, m0, m1
   1356    pmulhrsw             m2, m6
   1357    paddw                m2, m1
   1358    mova        [tmpq+64*0], m3
   1359    mova        [tmpq+64*1], m2
   1360    add                tmpq, 64*2
   1361    sub                  hd, 2
   1362    jg .hv_w32_loop
   1363    RET
   ; .hv_w64: two zmms per row (m0/m1 = previous row halves), one
   ; intermediate row per iteration.
   1364 .hv_w64:
   1365    mova                 m4, [bilin_h_perm32]
   1366    vpermb               m0, m4, [srcq+32*0]
   1367    vpermb               m1, m4, [srcq+32*1]
   1368    pmaddubsw            m0, m5
   1369    pmaddubsw            m1, m5
   1370 .hv_w64_loop:
   1371    add                srcq, strideq
   1372    vpermb               m2, m4, [srcq+32*0]
   1373    vpermb               m3, m4, [srcq+32*1]
   1374    pmaddubsw            m2, m5
   1375    pmaddubsw            m3, m5
   1376    psubw                m7, m2, m0
   1377    psubw                m8, m3, m1
   1378    pmulhrsw             m7, m6
   1379    pmulhrsw             m8, m6
   1380    paddw                m7, m0
   1381    mova                 m0, m2
   1382    paddw                m8, m1
   1383    mova                 m1, m3
   1384    mova        [tmpq+64*0], m7
   1385    mova        [tmpq+64*1], m8
   1386    add                tmpq, 64*2
   1387    dec                  hd
   1388    jg .hv_w64_loop
   1389    RET
   ; .hv_w128: four zmms per row (m0-m3 = previous row), one intermediate
   ; row per iteration; m7-m14 are scratch for the new row and blends.
   1390 .hv_w128:
   1391    mova                 m4, [bilin_h_perm32]
   1392    vpermb               m0, m4, [srcq+32*0]
   1393    vpermb               m1, m4, [srcq+32*1]
   1394    vpermb               m2, m4, [srcq+32*2]
   1395    vpermb               m3, m4, [srcq+32*3]
   1396    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
   1397 .hv_w128_loop:
   1398    add                srcq, strideq
   1399    vpermb               m7, m4, [srcq+32*0]
   1400    vpermb               m8, m4, [srcq+32*1]
   1401    vpermb               m9, m4, [srcq+32*2]
   1402    vpermb              m10, m4, [srcq+32*3]
   1403    REPX  {pmaddubsw x, m5}, m7, m8, m9, m10
   1404    psubw               m11, m7, m0
   1405    psubw               m12, m8, m1
   1406    psubw               m13, m9, m2
   1407    psubw               m14, m10, m3
   1408    REPX  {pmulhrsw  x, m6}, m11, m12, m13, m14
   ; add back the previous row and rotate row state in the same pass.
   1409    paddw               m11, m0
   1410    mova                 m0, m7
   1411    paddw               m12, m1
   1412    mova                 m1, m8
   1413    paddw               m13, m2
   1414    mova                 m2, m9
   1415    paddw               m14, m3
   1416    mova                 m3, m10
   1417    mova        [tmpq+64*0], m11
   1418    mova        [tmpq+64*1], m12
   1419    mova        [tmpq+64*2], m13
   1420    mova        [tmpq+64*3], m14
   1421    add                tmpq, 64*4
   1422    dec                  hd
   1423    jg .hv_w128_loop
   1424    RET
   1425 
; int8_t subpel_filters[5][15][8]
; Packed filter-row selectors: the high 16 bits hold the byte offset (in
; units of 15 rows) of the full-tap filter table, the low 16 bits the
; offset of the reduced (4-tap) table used for small block sizes.
; FN stores these in t0d/t1d; the entry code adds mx/my and later splits
; them again with movzx/shr (see .v in put_6tap_8bpc below).
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15
   1430 
; Emit the entry stub for one horizontal/vertical filter-type combination.
; %1 = function base name, %2 = combination name, %3 = horizontal filter
; type, %4 = vertical filter type (REGULAR/SMOOTH/SHARP), optional
; %5 = shared implementation to tail-jump into. The stub only loads the
; packed FILTER_* selectors into t0d (horizontal) and t1d (vertical);
; the last stub in a group omits %5 and falls through into the shared body.
%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d ; same type both ways: reuse the constant
%else
    mov                 t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
   1443 
; Horizontal 8-tap filter of one register of pixels using AVX512-VNNI
; vpdpbusd (four taps accumulated per dword lane).
; %1 = dst/src register, %2-%4 = temporaries, %5 nonzero selects vpermb
; gathers (permutes in m6-m8) instead of in-lane pshufb shuffles.
; Assumes m5 = dword rounding bias, m9/m10 = packed filter coefficients.
; Output: words in m%1, downshifted by 6.
%if %5
    vpermb              m%2, m6, m%1
    vpermb              m%3, m7, m%1
    vpermb              m%4, m8, m%1
%else
%if %2 < %4 ; reuse a previous value if possible
    pshufb              m%2, m%1, m6
%endif
    pshufb              m%3, m%1, m7
    pshufb              m%4, m%1, m8
%endif
    mova                m%1, m5
    vpdpbusd            m%1, m%2, m9
    mova                m%2, m5
    vpdpbusd            m%2, m%3, m9
    vpdpbusd            m%1, m%3, m10
    vpdpbusd            m%2, m%4, m10
    packusdw            m%1, m%2
    psrlw               m%1, 6
%endmacro
   1465 
; Scratch registers t0/t1 for the FN stubs: the ABIs pass arguments in
; different registers, so pick two that are free in each case.
%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif
   1471 
; Due to the use of vpdpbusd (which does 4 pixels per instruction) in
; the horizontal filter, 6-tap is only used for the vertical filter.
; All combinations whose vertical filter is not SHARP (8-tap) jump into
; put_6tap_8bpc; the final stub falls through into it.
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_6tap_8bpc
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_6tap_8bpc
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_8bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_8bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR
   1481 
; put_6tap_8bpc(dst, dst_stride, src, src_stride, w, h, mx, my)
; Dispatch: no subpel -> .put (plain copy table), mx only -> .h,
; my only -> .v, both -> .h then .hv. t0d/t1d carry the packed FILTER_*
; selectors loaded by the FN stubs.
cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
%define base r8-put_avx512icl
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 6tap_v, my, 4tap_v
    lea                  r8, [put_avx512icl]
    movsxd               wq, wm
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jnz .v
.put:
    ; No subpel offset: tail-jump into the width-indexed copy loop.
    tzcnt                wd, wd
    movzx                wd, word [r8+wq*2+table_offset(put,)]
    add                  wq, r8
    lea                  r6, [ssq*3]
    lea                  r7, [dsq*3]
%if WIN64
    pop                  r8
%endif
    jmp                  wq
.v:
    ; Vertical-only path. For h < 6 the sign from (hd - 6) selects the
    ; reduced-tap filter index (low byte) via cmovs.
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    tzcnt               r6d, wd
    movzx               r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
    vpbroadcastd         m6, [pw_512]              ; pmulhrsw by 512 ~= rounded >> 6
    lea                 myq, [base+subpel_filters+1+myq*8] ; +1: 6 middle taps of the 8-tap row
    vpbroadcastw         m7, [myq+0]               ; taps 0,1
    add                  r6, r8
    vpbroadcastw         m8, [myq+2]               ; taps 2,3
    mov                 nsq, ssq
    vpbroadcastw         m9, [myq+4]               ; taps 4,5
    neg                 nsq                        ; nsq = -stride, to address rows above src
    jmp                  r6
; 2-px-wide vertical 6-tap: two output rows per iteration, row pairs
; interleaved as bytes so each pmaddubsw applies one coefficient pair.
.v_w2:
    movd               xmm2, [srcq+nsq*2]
    pinsrw             xmm2, [srcq+nsq*1], 2
    pinsrw             xmm2, [srcq+ssq*0], 4
    pinsrw             xmm2, [srcq+ssq*1], 6  ; 0 1 2 3
    lea                srcq, [srcq+ssq*2]
    vpbroadcastd       xmm0, [srcq+ssq*0]
    palignr            xmm3, xmm0, xmm2, 4    ; 1 2 3 4
    punpcklbw          xmm1, xmm2, xmm3       ; 01 12
    punpckhbw          xmm2, xmm3             ; 23 34
.v_w2_loop:
    vpbroadcastd       xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw          xmm3, xmm1, xm7        ; a0 b0
    mova               xmm1, xmm2
    pmaddubsw          xmm2, xm8              ; a1 b1
    paddw              xmm3, xmm2
    vpblendd           xmm2, xmm0, xmm4, 0x02 ; 4 5
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm4, xmm0, 0x02       ; 5 6
    punpcklbw          xmm2, xmm4             ; 45 56
    pmaddubsw          xmm4, xmm2, xm9        ; a2 b2
    paddw              xmm3, xmm4
    pmulhrsw           xmm3, xm6
    packuswb           xmm3, xmm3
    pextrw     [dstq+dsq*0], xmm3, 0
    pextrw     [dstq+dsq*1], xmm3, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
; 4-px-wide vertical 6-tap: same structure as .v_w2 but with dword
; inserts/stores.
.v_w4:
    movd               xmm2, [srcq+nsq*2]
    pinsrd             xmm2, [srcq+nsq*1], 1
    pinsrd             xmm2, [srcq+ssq*0], 2
    pinsrd             xmm2, [srcq+ssq*1], 3  ; 0 1 2 3
    lea                srcq, [srcq+ssq*2]
    vpbroadcastd       xmm0, [srcq+ssq*0]
    palignr            xmm3, xmm0, xmm2, 4    ; 1 2 3 4
    punpcklbw          xmm1, xmm2, xmm3       ; 01 12
    punpckhbw          xmm2, xmm3             ; 23 34
.v_w4_loop:
    vpbroadcastd       xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw          xmm3, xmm1, xm7        ; a0 b0
    mova               xmm1, xmm2
    pmaddubsw          xmm2, xm8              ; a1 b1
    paddw              xmm3, xmm2
    vpblendd           xmm2, xmm0, xmm4, 0x02 ; 4 5
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm4, xmm0, 0x02       ; 5 6
    punpcklbw          xmm2, xmm4             ; 45 56
    pmaddubsw          xmm4, xmm2, xm9        ; a2 b2
    paddw              xmm3, xmm4
    pmulhrsw           xmm3, xm6
    packuswb           xmm3, xmm3
    movd       [dstq+dsq*0], xmm3
    pextrd     [dstq+dsq*1], xmm3, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
; 8-px-wide vertical 6-tap using 256-bit registers (two rows per half);
; legacy ymm names are used on purpose, hence the trailing vzeroupper.
.v_w8:
    movq               xmm1, [srcq+nsq*2]
    vpbroadcastq       ymm3, [srcq+nsq*1]
    vpbroadcastq       ymm2, [srcq+ssq*0]
    vpbroadcastq       ymm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm1, ymm3, 0x30
    vpblendd           ymm3, ymm2, 0x30
    punpcklbw          ymm1, ymm3      ; 01 12
    vpblendd           ymm2, ymm4, 0x30
    vpblendd           ymm4, ymm0, 0x30
    punpcklbw          ymm2, ymm4      ; 23 34
.v_w8_loop:
    vpbroadcastq       ymm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw          ymm3, ymm1, ym7 ; a0 b0
    mova               ymm1, ymm2
    pmaddubsw          ymm2, ym8       ; a1 b1
    paddw              ymm3, ymm2
    vpblendd           ymm2, ymm0, ymm4, 0x30
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm4, ymm0, 0x30
    punpcklbw          ymm2, ymm4      ; 45 56
    pmaddubsw          ymm4, ymm2, ym9 ; a2 b2
    paddw              ymm3, ymm4
    pmulhrsw           ymm3, ym6
    vextracti128       xmm4, ymm3, 1
    packuswb           xmm3, xmm4
    movq       [dstq+dsq*0], xmm3
    movhps     [dstq+dsq*1], xmm3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    vzeroupper
    RET
; 16-px-wide vertical 6-tap: rows are merged with masked vshufpd
; (k1 = 0x0f selects the low half) and interleaved via spel_v_perm16a.
.v_w16:
    mova                 m5, [spel_v_perm16a]
    vbroadcasti32x4      m1, [srcq+nsq*2]
    vbroadcasti32x4     ym3, [srcq+nsq*1]
    mov                 r6d, 0x0f
    vbroadcasti32x4      m2, [srcq+ssq*0]
    kmovb                k1, r6d
    vbroadcasti32x4     ym4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vbroadcasti32x4      m0, [srcq+ssq*0]
    vshufpd          m1{k1}, m3, m2, 0xcc
    vshufpd          m2{k1}, m4, m0, 0xcc
    vpermb               m1, m5, m1 ; 01 12
    vpermb               m2, m5, m2 ; 23 34
.v_w16_loop:
    vbroadcasti32x4     ym4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m3, m1, m7 ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, m8     ; a1 b1
    paddw                m3, m2
    mova                 m2, m0
    vbroadcasti32x4      m0, [srcq+ssq*0]
    vshufpd          m2{k1}, m4, m0, 0xcc
    vpermb               m2, m5, m2 ; 45 56
    pmaddubsw            m4, m2, m9 ; a2 b2
    paddw                m3, m4
    pmulhrsw             m3, m6
    vextracti32x8       ym4, m3, 1
    packuswb            ym3, ym4
    mova          [dstq+dsq*0], xm3
    vextracti32x4 [dstq+dsq*1], ym3, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    RET
; 32-px-wide vertical 6-tap: two rows per zmm, byte-interleaved through
; spel_v_perm32 (m10) and its word-rotated variant (m11, via vpshrdw);
; m5 reorders the packed result back into row order for the stores.
.v_w32:
    mova                m10, [spel_v_perm32]
    pmovzxbq             m5, [pb_02461357]
    vpshrdw             m11, m10, m10, 8
    movu                ym0, [srcq+nsq*2]
    vinserti32x8         m0, [srcq+nsq*1], 1
    vpermb               m1, m10, m0 ; 01
    vinserti32x8         m0, [srcq+ssq*0], 0
    vpermb               m2, m11, m0 ; 12
    vinserti32x8         m0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    vpermb               m3, m10, m0 ; 23
    vinserti32x8         m0, [srcq+ssq*0], 0
    vpermb               m4, m11, m0 ; 34
.v_w32_loop:
    vinserti32x8         m0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    pmaddubsw           m12, m1, m7
    mova                 m1, m3
    pmaddubsw           m13, m2, m7
    mova                 m2, m4
    pmaddubsw           m14, m3, m8
    vpermb               m3, m10, m0 ; 45
    vinserti32x8         m0, [srcq+ssq*0], 0
    pmaddubsw           m15, m4, m8
    vpermb               m4, m11, m0 ; 56
    paddw               m12, m14
    pmaddubsw           m14, m3, m9
    paddw               m13, m15
    pmaddubsw           m15, m4, m9
    paddw               m12, m14
    paddw               m13, m15
    pmulhrsw            m12, m6
    pmulhrsw            m13, m6
    packuswb            m12, m13
    vpermq              m12, m5, m12
    mova          [dstq+dsq*0], ym12
    vextracti32x8 [dstq+dsq*1], m12, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w32_loop
    RET
; 64/128-px-wide vertical 6-tap: processed in 64-px column strips.
; r6d packs the remaining strip count (bits 8+) with the height (low
; byte); each outer iteration subtracts 256 and reloads h from r6b.
.v_w64:
.v_w128:
    lea                 r6d, [hq+wq*4-256]
.v_loop0:
    movu                 m2, [srcq+nsq*2]
    movu                 m4, [srcq+nsq*1]
    lea                  r4, [srcq+ssq*2]
    movu                m11, [srcq+ssq*0]
    movu                m13, [srcq+ssq*1]
    mov                  r7, dstq
    movu                 m0, [r4  +ssq*0]
    punpcklbw            m1, m2, m4   ; 01l
    punpckhbw            m2, m4       ; 01h
    punpcklbw            m3, m4, m11  ; 12l
    punpckhbw            m4, m11      ; 12h
    punpcklbw           m10, m11, m13 ; 23l
    punpckhbw           m11, m13      ; 23h
    punpcklbw           m12, m13, m0  ; 34l
    punpckhbw           m13, m0       ; 34h
.v_loop:
    ; Two output rows (a, b) per iteration, low/high byte halves (l/h)
    ; filtered separately.
    movu                 m5, [r4+ssq*1]
    pmaddubsw           m14, m1, m7   ; a0l
    mova                 m1, m10
    pmaddubsw           m10, m8       ; a1l
    lea                  r4, [r4+ssq*2]
    pmaddubsw           m15, m2, m7   ; a0h
    mova                 m2, m11
    pmaddubsw           m11, m8       ; a1h
    paddw               m14, m10
    punpcklbw           m10, m0, m5   ; 45l
    paddw               m15, m11
    punpckhbw           m11, m0, m5   ; 45h
    pmaddubsw            m0, m10, m9  ; a2l
    paddw               m14, m0
    pmaddubsw            m0, m11, m9  ; a2h
    paddw               m15, m0
    movu                 m0, [r4+ssq*0]
    pmulhrsw            m14, m6
    pmulhrsw            m15, m6
    packuswb            m14, m15
    pmaddubsw           m15, m3, m7   ; b0l
    mova                 m3, m12
    pmaddubsw           m12, m8       ; b1l
    mova         [r7+dsq*0], m14
    pmaddubsw           m14, m4, m7   ; b0h
    mova                 m4, m13
    pmaddubsw           m13, m8       ; b1h
    paddw               m15, m12
    punpcklbw           m12, m5, m0   ; 56l
    paddw               m14, m13
    punpckhbw           m13, m5, m0   ; 56h
    pmaddubsw            m5, m12, m9  ; b2l
    paddw               m15, m5
    pmaddubsw            m5, m13, m9  ; b2h
    paddw               m14, m5
    pmulhrsw            m15, m6
    pmulhrsw            m14, m6
    packuswb            m15, m14
    mova         [r7+dsq*1], m15
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .v_loop
    add                srcq, 64
    add                dstq, 64
    movzx                hd, r6b
    sub                 r6d, 256
    jg .v_loop0
    RET
.h:
    ; Horizontal-only filtering is shared with the 8-tap implementation.
    test                myd, 0xf00
    jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2
.hv:
    ; Combined H+V path: 4-tap horizontal (vpdpbusd) feeding a 6-tap
    ; vertical word filter. pd_34 is the horizontal rounding bias.
    vpbroadcastd         m9, [pd_34]
    mova               xm10, [spel_hv_end]
    pxor                xm0, xm0
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb
    dec                srcq                     ; center the 4-tap horizontal window
    vpbroadcastd         m7, [base+subpel_filters+mxq*8+2] ; middle 4 taps
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd                ; h < 6: use reduced-tap vertical
    vpbroadcastq        ym1, [base+subpel_filters+1+myq*8]
    mov                 nsq, ssq
    punpcklbw           ym0, ym1
    neg                 nsq
    psraw               ym0, 2 ; << 6
    pshufd             ym11, ym0, q0000         ; vertical taps 0,1 (words)
    pshufd             ym12, ym0, q1111         ; vertical taps 2,3
    pshufd             ym13, ym0, q2222         ; vertical taps 4,5
    cmp                  wd, 4
    je .hv_w4
    ; w == 2 falls through.
    vbroadcasti128      ym5, [subpel_h_shuf4]
    movq               xmm0, [srcq+nsq*2]
    movhps             xmm0, [srcq+nsq*1]
    movq               xmm2, [srcq+ssq*0]
    movhps             xmm2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpbroadcastq       ymm1, [srcq+ssq*0]
    vpblendd           ymm0, ymm1, 0x30
    pshufb             xmm2, xm5        ; 2 3
    pshufb             ymm0, ym5        ; 0 1   4
    mova               xmm1, xm9
    vpdpbusd           xmm1, xmm2, xm7
    mova               ymm2, ym9
    vpdpbusd           ymm2, ymm0, ym7
    packssdw           ymm2, ymm1
    psraw              ymm2, 2
    vextracti128       xmm0, ymm2, 1
    vzeroupper
    palignr            xmm0, xmm2, 4
    punpcklwd          xmm1, xmm2, xmm0 ; 01 12
    punpckhwd          xmm2, xmm0       ; 23 34
.hv_w2_loop:
    movq               xmm3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps             xmm3, [srcq+ssq*0]
    pmaddwd            xmm4, xmm1, xm11 ; a0 b0
    mova               xmm1, xmm2
    vpdpwssd           xmm4, xmm2, xm12 ; a1 b1
    pshufb             xmm3, xm5
    mova               xmm2, xm9
    vpdpbusd           xmm2, xmm3, xm7
    packssdw           xmm3, xmm2, xmm2
    psraw              xmm3, 2
    palignr            xmm2, xmm3, xmm0, 12
    mova               xmm0, xmm3
    punpcklwd          xmm2, xmm3       ; 45 56
    vpdpwssd           xmm4, xmm2, xm13 ; a2 b2
    packuswb           xmm4, xmm4
    pshufb             xmm4, xm10
    pextrw     [dstq+dsq*0], xmm4, 0
    pextrw     [dstq+dsq*1], xmm4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
; 4-px-wide H+V: horizontal results for rows 0-4 are produced up front,
; then two new rows per loop are merged in with a masked psraw
; (k1 = 0x5555) and repacked by the spel_hv_perm4* tables.
.hv_w4:
    movq                xm2, [srcq+nsq*2]
    vpbroadcastq        ym1, [srcq+nsq*1]
    vinserti32x4        ym2, [srcq+ssq*0], 1
    vinserti32x4         m1, [srcq+ssq*1], 2 ; _ 1 3
    lea                srcq, [srcq+ssq*2]
    vbroadcasti32x4      m5, [subpel_h_shufA]
    vinserti32x4         m2, [srcq+ssq*0], 2 ; 0 2 4
    pshufb               m1, m5
    mova                 m0, m9
    pshufb               m2, m5
    mova                 m3, m9
    vpdpbusd             m0, m1, m7
    mova                ym1, [spel_hv_perm4a]
    vpdpbusd             m3, m2, m7
    mova                ym2, [spel_hv_perm4b]
    mov                 r6d, 0x5555
    mova                ym6, [spel_hv_perm4d]
    packssdw             m0, m3
    kmovw                k1, r6d
    psraw                m0, 2 ; _ 0   1 2   3 4   5 6
    vpermb              ym1, ym1, ym0 ; 01 12
    vpermb               m2, m2, m0   ; 23 34
.hv_w4_loop:
    movq                xm3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym3, [srcq+ssq*0], 1
    pmaddwd             ym4, ym1, ym11 ; a0 b0
    mova                ym1, ym2
    pshufb              ym3, ym5
    mova                ym0, ym9
    vpdpbusd            ym0, ym3, ym7
    vpdpwssd            ym4, ym2, ym12 ; a1 b1
    vpsraw          ym2{k1}, ym0, 2    ; 5 6
    vpermb              ym2, ym6, ym2  ; 45 56
    vpdpwssd            ym4, ym2, ym13 ; a2 b2
    packuswb            ym4, ym4
    vpermb              ym4, ym10, ym4
    movd       [dstq+dsq*0], xm4
    pextrd     [dstq+dsq*1], xm4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
; w >= 8 H+V setup: full 8-tap horizontal (m11/m12 = two dword tap
; groups) with 6-tap vertical (m13-m15 = word tap pairs); w == 8 handled
; here, wider blocks branch to .hv_w16.
.hv_w8:
    shr                 mxd, 16
    sub                srcq, 3
    vpbroadcastd        m11, [base+subpel_filters+mxq*8+0]
    vpbroadcastd        m12, [base+subpel_filters+mxq*8+4]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd               ; h < 6: reduced-tap vertical
    vpbroadcastq         m1, [base+subpel_filters+1+myq*8]
    mov                 nsq, ssq
    punpcklbw            m0, m1
    neg                 nsq
    psraw                m0, 2 ; << 6
    pshufd              m13, m0, q0000
    pshufd              m14, m0, q1111
    pshufd              m15, m0, q2222
    cmp                  wd, 8
    jne .hv_w16
    movu                xm0, [srcq+nsq*2]
    vinserti32x4        ym0, [srcq+nsq*1], 1
    vbroadcasti32x4      m1, [subpel_h_shufA]
    vinserti32x4         m0, [srcq+ssq*0], 2
    vbroadcasti32x4      m4, [subpel_h_shufB]
    vinserti32x4         m0, [srcq+ssq*1], 3
    lea                srcq, [srcq+ssq*2]
    vbroadcasti32x4      m7, [subpel_h_shufC]
    vbroadcasti32x4     ym5, [srcq+ssq*0]
    vbroadcasti32x8      m6, [subpel_h_shufA]
    pshufb               m1, m0, m1   ; 0 1 2 3    0123
    mova                 m2, m9
    vpdpbusd             m2, m1, m11
    pshufb               m4, m0, m4   ; 0 1 2 3    4567
    mova                 m1, m9
    vpdpbusd             m1, m4, m11
    pshufb               m0, m7       ; 0 1 2 3    89ab
    pshufb              ym7, ym5, ym6 ; 4     0123 4567
    mova                ym3, ym9
    vpdpbusd            ym3, ym7, ym11
    vbroadcasti32x8      m7, [subpel_h_shufB]
    vpdpbusd             m2, m4, m12
    mova                 m4, [spel_hv_perm8a]
    pshufb              ym5, ym7      ; 4     4567 89ab
    vpdpbusd             m1, m0, m12
    vpaddd               m0, m4, [pb_32] {1to16}
    vpdpbusd            ym3, ym5, ym12
    mova                 m5, [spel_hv_perm8b]
    mov                  r6, 0x55555555ff00
    packssdw             m2, m1
    vpmovsdw            xm3, ym3
    kmovq                k1, r6
    psraw                m2, 2        ; 0 1 2 3
    psraw               xm3, 2        ; 4
    vpermb               m1, m4, m2   ; 01 12
    kshiftrq             k2, k1, 16
    vpermt2b             m2, m0, m3   ; 23 34
.hv_w8_loop:
    vbroadcasti32x4     ym3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vbroadcasti32x4  m3{k1}, [srcq+ssq*0]
    pmaddwd              m0, m1, m13  ; a0 b0
    pshufb               m1, m3, m6   ; 5 6   0123 4567
    mova                 m4, m9
    vpdpbusd             m4, m1, m11
    pshufb               m3, m7       ; 5 6   4567 89ab
    vpdpwssd             m0, m2, m14  ; a1 b1
    mova                 m1, m2
    vpdpbusd             m4, m3, m12
    psraw            m2{k2}, m4, 2    ; 53 64
    vpermb               m2, m5, m2   ; 45 56
    vpdpwssd             m0, m2, m15  ; a2 b2
    packuswb             m0, m0
    vpermb               m0, m10, m0
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    RET
; w >= 16 H+V: processed in 16-px column strips; r6d packs strip count
; (bits 8+) with height (low byte), like the wide vertical path.
; m19-m21 gather the three 4-px horizontal tap windows per row pair,
; m6/m7 the interleaved windows for the single leading row.
.hv_w16:
    movu                m19, [spel_hv_perm16a]
    vpbroadcastd         m7, [pb_4]
    lea                 r6d, [wq*2-32]
    mova                 m6, [spel_hv_perm16b]
    paddb               m20, m7, m19
    lea                 r6d, [hq+r6*8]
    paddb               m21, m7, m20
    mova               ym10, [spel_hv_end16]
    paddb                m7, m6
.hv_w16_loop0:
    movu               ym16, [srcq+nsq*2]
    vinserti32x8        m16, [srcq+nsq*1], 1
    lea                  r4, [srcq+ssq*2]
    movu               ym17, [srcq+ssq*0]
    vinserti32x8        m17, [srcq+ssq*1], 1
    mov                  r7, dstq
    movu               ym18, [r4  +ssq*0]
    vpermb               m2, m19, m16    ; 0 1   0123   89ab
    mova                 m1, m9
    vpermb               m3, m21, m16    ; 0 1   89ab   ghij
    vpdpbusd             m1, m2, m11
    mova                 m2, m9
    vpermb               m4, m19, m17    ; 2 3   0123   89ab
    vpdpbusd             m2, m3, m12
    mova                 m3, m9
    vpermb               m5, m21, m17    ; 2 3   89ab   ghij
    vpdpbusd             m3, m4, m11
    mova                 m4, m9
    vpermb               m0, m6, m18     ; 4     0145   2367   89cd   abef
    vpdpbusd             m4, m5, m12
    mova                 m5, m9
    vpermb              m16, m20, m16    ; 0 1   4567   cdef
    vpdpbusd             m5, m0, m11
    vpermb              m17, m20, m17    ; 2 3   4567   cdef
    vpdpbusd             m1, m16, m12
    vpermb              m18, m7, m18     ; 4     4589   67ab   cdgh   efij
    vpdpbusd             m2, m16, m11
    vpdpbusd             m3, m17, m12
    vpdpbusd             m4, m17, m11
    vpdpbusd             m5, m18, m12
    packssdw             m1, m2          ; 01
    packssdw             m3, m4          ; 23
    REPX       {psraw x, 2}, m1, m3, m5
    vpshrdd              m2, m1, m3, 16  ; 12
    vpshrdd              m4, m3, m5, 16  ; 34
.hv_w16_loop:
    movu               ym18, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
    vinserti32x8        m18, [r4+ssq*0], 1
    pmaddwd             m16, m1, m13     ; a0
    vpermb               m1, m19, m18    ; 5 6   0123   89ab
    pmaddwd             m17, m2, m13     ; b0
    vpermb               m2, m20, m18    ; 5 6   4567   cdef
    mova                 m0, m9
    vpdpbusd             m0, m1, m11
    vpermb              m18, m21, m18
    mova                 m1, m9
    vpdpbusd             m1, m2, m11
    vpdpwssd            m16, m3, m14    ; a1
    vpdpwssd            m17, m4, m14    ; b1
    vpdpbusd             m0, m2, m12
    mova                 m2, m4
    vpdpbusd             m1, m18, m12
    packssdw             m0, m1
    mova                 m1, m3
    psraw                m4, m0, 2      ; 5 6
    vpshrdd              m3, m2, m4, 16 ; 4 5
    vpdpwssd            m17, m4, m15    ; b2
    vpdpwssd            m16, m3, m15    ; a2
    packuswb            m16, m17
    vpermb              m16, m10, m16
    mova         [r7+dsq*0], xm16
    vextracti128 [r7+dsq*1], ym16, 1
    lea                  r7, [r7+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    add                srcq, 16
    add                dstq, 16
    movzx                hd, r6b
    sub                 r6d, 1<<8
    jg .hv_w16_loop0
    vzeroupper
    RET
   2043 
; Combinations with a SHARP (8-tap) vertical filter use the full 8-tap
; implementation below; the final stub falls through into it.
PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_8bpc
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_8bpc
PUT_8TAP_FN sharp,          SHARP,   SHARP
   2047 
   2048 cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
   2049    imul                mxd, mxm, 0x010101
   2050    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
   2051    imul                myd, mym, 0x010101
   2052    add                 myd, t1d ; 8tap_v, my, 4tap_v
   2053    lea                  r8, [put_avx512icl]
   2054    movsxd               wq, wm
   2055    movifnidn            hd, hm
   2056    test                mxd, 0xf00
   2057    jnz .h
   2058    test                myd, 0xf00
   2059    jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put
   2060 .v:
   2061    movzx               mxd, myb
   2062    shr                 myd, 16
   2063    cmp                  hd, 6
   2064    cmovs               myd, mxd
   2065    tzcnt               r6d, wd
   2066    lea                 myq, [base+subpel_filters+myq*8]
   2067    movzx               r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
   2068    vpbroadcastd         m7, [pw_512]
   2069    vpbroadcastw         m8, [myq+0]
   2070    add                  r6, r8
   2071    vpbroadcastw         m9, [myq+2]
   2072    lea                ss3q, [ssq*3]
   2073    vpbroadcastw        m10, [myq+4]
   2074    sub                srcq, ss3q
   2075    vpbroadcastw        m11, [myq+6]
   2076    jmp                  r6
   2077 .v_w2:
   2078    movd               xmm2, [srcq+ssq*0]
   2079    pinsrw             xmm2, [srcq+ssq*1], 2
   2080    pinsrw             xmm2, [srcq+ssq*2], 4
   2081    add                srcq, ss3q
   2082    pinsrw             xmm2, [srcq+ssq*0], 6  ; 0 1 2 3
   2083    movd               xmm3, [srcq+ssq*1]
   2084    vpbroadcastd       xmm1, [srcq+ssq*2]
   2085    add                srcq, ss3q
   2086    vpbroadcastd       xmm0, [srcq+ssq*0]
   2087    vpblendd           xmm3, xmm3, xmm1, 0x02 ; 4 5
   2088    vpblendd           xmm1, xmm1, xmm0, 0x02 ; 5 6
   2089    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
   2090    punpcklbw          xmm3, xmm1             ; 45 56
   2091    punpcklbw          xmm1, xmm2, xmm4       ; 01 12
   2092    punpckhbw          xmm2, xmm4             ; 23 34
   2093 .v_w2_loop:
   2094    pmaddubsw          xmm5, xmm1, xm8        ; a0 b0
   2095    mova               xmm1, xmm2
   2096    pmaddubsw          xmm2, xm9              ; a1 b1
   2097    paddw              xmm5, xmm2
   2098    mova               xmm2, xmm3
   2099    pmaddubsw          xmm3, xm10             ; a2 b2
   2100    paddw              xmm5, xmm3
   2101    vpbroadcastd       xmm4, [srcq+ssq*1]
   2102    lea                srcq, [srcq+ssq*2]
   2103    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
   2104    vpbroadcastd       xmm0, [srcq+ssq*0]
   2105    vpblendd           xmm4, xmm4, xmm0, 0x02 ; 7 8
   2106    punpcklbw          xmm3, xmm4             ; 67 78
   2107    pmaddubsw          xmm4, xmm3, xm11       ; a3 b3
   2108    paddw              xmm5, xmm4
   2109    pmulhrsw           xmm5, xm7
   2110    packuswb           xmm5, xmm5
   2111    pextrw     [dstq+dsq*0], xmm5, 0
   2112    pextrw     [dstq+dsq*1], xmm5, 2
   2113    lea                dstq, [dstq+dsq*2]
   2114    sub                  hd, 2
   2115    jg .v_w2_loop
   2116    RET
   2117 .v_w4:
        ; put, 8-tap vertical filter, width 4: produces 2 output rows per
        ; loop iteration. Setup loads rows 0-6 and interleaves neighbouring
        ; rows into byte pairs (01 12, 23 34, 45 56) so that one pmaddubsw
        ; applies two filter taps to both output rows at once.
   2118    movd               xmm2, [srcq+ssq*0]
   2119    pinsrd             xmm2, [srcq+ssq*1], 1
   2120    pinsrd             xmm2, [srcq+ssq*2], 2
   2121    add                srcq, ss3q
   2122    pinsrd             xmm2, [srcq+ssq*0], 3  ; 0 1 2 3
   2123    movd               xmm3, [srcq+ssq*1]
   2124    vpbroadcastd       xmm1, [srcq+ssq*2]
   2125    add                srcq, ss3q
   2126    vpbroadcastd       xmm0, [srcq+ssq*0]
   2127    vpblendd           xmm3, xmm3, xmm1, 0x02 ; 4 5
   2128    vpblendd           xmm1, xmm1, xmm0, 0x02 ; 5 6
   2129    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
   2130    punpcklbw          xmm3, xmm1             ; 45 56
   2131    punpcklbw          xmm1, xmm2, xmm4       ; 01 12
   2132    punpckhbw          xmm2, xmm4             ; 23 34
   2133 .v_w4_loop:
        ; xm8-xm11 hold the four 2-tap coefficient pairs (set up by the
        ; caller path before this block); aN/bN = tap N applied to the two
        ; rows being produced. Rolling window: 01/12 -> 23/34 -> 45/56.
   2134    vpbroadcastd       xmm4, [srcq+ssq*1]
   2135    lea                srcq, [srcq+ssq*2]
   2136    pmaddubsw          xmm5, xmm1, xm8        ; a0 b0
   2137    mova               xmm1, xmm2
   2138    pmaddubsw          xmm2, xm9              ; a1 b1
   2139    paddw              xmm5, xmm2
   2140    mova               xmm2, xmm3
   2141    pmaddubsw          xmm3, xm10             ; a2 b2
   2142    paddw              xmm5, xmm3
   2143    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
   2144    vpbroadcastd       xmm0, [srcq+ssq*0]
   2145    vpblendd           xmm4, xmm4, xmm0, 0x02 ; 7 8
   2146    punpcklbw          xmm3, xmm4             ; 67 78
   2147    pmaddubsw          xmm4, xmm3, xm11       ; a3 b3
   2148    paddw              xmm5, xmm4
        ; round with pmulhrsw against xm7 (rounding factor loaded earlier),
        ; pack words to bytes, then store 4 pixels to each of the two rows.
   2149    pmulhrsw           xmm5, xm7
   2150    packuswb           xmm5, xmm5
   2151    movd       [dstq+dsq*0], xmm5
   2152    pextrd     [dstq+dsq*1], xmm5, 1
   2153    lea                dstq, [dstq+dsq*2]
   2154    sub                  hd, 2
   2155    jg .v_w4_loop
   2156    RET
   2157 .v_w8:
        ; put, vertical filter, width 8: same rolling-window scheme as
        ; .v_w4 but with ymm registers; each 256-bit register holds two
        ; interleaved row pairs (low lane / high lane merged via vpblendd
        ; mask 0x30, i.e. the upper lane's low qword).
   2158    movq               xmm1, [srcq+ssq*0]
   2159    vpbroadcastq       ymm0, [srcq+ssq*1]
   2160    vpbroadcastq       ymm2, [srcq+ssq*2]
   2161    add                srcq, ss3q
   2162    vpbroadcastq       ymm5, [srcq+ssq*0]
   2163    vpbroadcastq       ymm3, [srcq+ssq*1]
   2164    vpbroadcastq       ymm4, [srcq+ssq*2]
   2165    add                srcq, ss3q
   2166    vpblendd           ymm1, ymm0, 0x30
   2167    vpblendd           ymm0, ymm2, 0x30
   2168    punpcklbw          ymm1, ymm0 ; 01 12
   2169    vpbroadcastq       ymm0, [srcq+ssq*0]
   2170    vpblendd           ymm2, ymm5, 0x30
   2171    vpblendd           ymm5, ymm3, 0x30
   2172    punpcklbw          ymm2, ymm5 ; 23 34
   2173    vpblendd           ymm3, ymm4, 0x30
   2174    vpblendd           ymm4, ymm0, 0x30
   2175    punpcklbw          ymm3, ymm4 ; 45 56
   2176 .v_w8_loop:
        ; 2 rows in, 2 rows out per iteration; ym8-ym11 = tap pairs.
   2177    vpbroadcastq       ymm4, [srcq+ssq*1]
   2178    lea                srcq, [srcq+ssq*2]
   2179    pmaddubsw          ymm5, ymm1, ym8  ; a0 b0
   2180    mova               ymm1, ymm2
   2181    pmaddubsw          ymm2, ym9        ; a1 b1
   2182    paddw              ymm5, ymm2
   2183    mova               ymm2, ymm3
   2184    pmaddubsw          ymm3, ym10       ; a2 b2
   2185    paddw              ymm5, ymm3
   2186    vpblendd           ymm3, ymm0, ymm4, 0x30
   2187    vpbroadcastq       ymm0, [srcq+ssq*0]
   2188    vpblendd           ymm4, ymm4, ymm0, 0x30
   2189    punpcklbw          ymm3, ymm4       ; 67 78
   2190    pmaddubsw          ymm4, ymm3, ym11 ; a3 b3
   2191    paddw              ymm5, ymm4
   2192    pmulhrsw           ymm5, ym7
   2193    vextracti128       xmm4, ymm5, 1
   2194    packuswb           xmm5, xmm4
   2195    movq       [dstq+dsq*0], xmm5
   2196    movhps     [dstq+dsq*1], xmm5
   2197    lea                dstq, [dstq+dsq*2]
   2198    sub                  hd, 2
        ; vzeroupper: this path used legacy-width ymm ops; clear upper
        ; state before returning to the caller.
   2199    jg .v_w8_loop
   2200    vzeroupper
   2201    RET
   2202 .v_w16:
        ; put, vertical filter, width 16: rows are gathered as 128-bit
        ; broadcasts, merged pairwise with masked vshufpd (k1 = 0x0f ->
        ; low four qwords), then vpermb with spel_v_perm16a interleaves
        ; them into the byte-pair layout (01 12 / 23 34 / 45 56).
   2203    mova                m12, [spel_v_perm16a]
   2204    vbroadcasti32x4      m1, [srcq+ssq*0]
   2205    vbroadcasti32x4     ym4, [srcq+ssq*1]
   2206    mov                 r6d, 0x0f
   2207    vbroadcasti32x4      m2, [srcq+ssq*2]
   2208    add                srcq, ss3q
   2209    vbroadcasti32x4     ym5, [srcq+ssq*0]
   2210    kmovb                k1, r6d
   2211    vbroadcasti32x4      m3, [srcq+ssq*1]
   2212    vbroadcasti32x4     ym6, [srcq+ssq*2]
   2213    add                srcq, ss3q
   2214    vbroadcasti32x4      m0, [srcq+ssq*0]
   2215    vshufpd          m1{k1}, m4, m2, 0xcc
   2216    vshufpd          m2{k1}, m5, m3, 0xcc
   2217    vshufpd          m3{k1}, m6, m0, 0xcc
   2218    vpermb               m1, m12, m1 ; 01 12
   2219    vpermb               m2, m12, m2 ; 23 34
   2220    vpermb               m3, m12, m3 ; 45 56
   2221 .v_w16_loop:
        ; Standard 4-pair accumulate (m8-m11 taps), 2 rows per iteration;
        ; m0 carries the newest row into the next 67/78 pair.
   2222    pmaddubsw            m4, m1, m8  ; a0 b0
   2223    mova                 m1, m2
   2224    pmaddubsw            m5, m2, m9  ; a1 b1
   2225    mova                 m2, m3
   2226    pmaddubsw            m6, m3, m10 ; a2 b2
   2227    mova                 m3, m0
   2228    paddw                m4, m5
   2229    vbroadcasti32x4     ym5, [srcq+ssq*1]
   2230    lea                srcq, [srcq+ssq*2]
   2231    vbroadcasti32x4      m0, [srcq+ssq*0]
   2232    vshufpd          m3{k1}, m5, m0, 0xcc
   2233    vpermb               m3, m12, m3 ; 67 78
   2234    pmaddubsw            m5, m3, m11 ; a3 b3
   2235    paddw                m4, m6
   2236    paddw                m4, m5
   2237    pmulhrsw             m4, m7
   2238    vextracti32x8       ym5, m4, 1
   2239    packuswb            ym4, ym5
   2240    mova          [dstq+dsq*0], xm4
   2241    vextracti32x4 [dstq+dsq*1], ym4, 1
   2242    lea                dstq, [dstq+dsq*2]
   2243    sub                  hd, 2
   2244    jg .v_w16_loop
   2245    RET
   2246 .v_w32:
        ; put, vertical filter, width 32: two full zmm accumulators per
        ; iteration (m15/m16 for the two output rows). m12/m13 are a
        ; complementary pair of byte permutes (m13 derived from m12 via
        ; vpshrdw by 8) producing even/odd row pairings; m14 holds qword
        ; indices (from pb_02461357) to re-order the packed output.
   2247    mova                m12, [spel_v_perm32]
   2248    pmovzxbq            m14, [pb_02461357]
   2249    vpshrdw             m13, m12, m12, 8
   2250    movu                ym0, [srcq+ssq*0]
   2251    vinserti32x8         m0, [srcq+ssq*1], 1
   2252    vpermb               m1, m12, m0 ; 01
   2253    vinserti32x8         m0, [srcq+ssq*2], 0
   2254    add                srcq, ss3q
   2255    vpermb               m2, m13, m0 ; 12
   2256    vinserti32x8         m0, [srcq+ssq*0], 1
   2257    vpermb               m3, m12, m0 ; 23
   2258    vinserti32x8         m0, [srcq+ssq*1], 0
   2259    vpermb               m4, m13, m0 ; 34
   2260    vinserti32x8         m0, [srcq+ssq*2], 1
   2261    add                srcq, ss3q
   2262    vpermb               m5, m12, m0 ; 45
   2263    vinserti32x8         m0, [srcq+ssq*0], 0
   2264    vpermb               m6, m13, m0 ; 56
   2265 .v_w32_loop:
        ; m15/m17/m19 accumulate row a, m16/m18/m20 row b; taps m8-m11.
   2266    vinserti32x8         m0, [srcq+ssq*1], 1
   2267    lea                srcq, [srcq+ssq*2]
   2268    pmaddubsw           m15, m1, m8
   2269    mova                 m1, m3
   2270    pmaddubsw           m16, m2, m8
   2271    mova                 m2, m4
   2272    pmaddubsw           m17, m3, m9
   2273    mova                 m3, m5
   2274    pmaddubsw           m18, m4, m9
   2275    mova                 m4, m6
   2276    pmaddubsw           m19, m5, m10
   2277    vpermb               m5, m12, m0 ; 67
   2278    vinserti32x8         m0, [srcq+ssq*0], 0
   2279    pmaddubsw           m20, m6, m10
   2280    vpermb               m6, m13, m0 ; 78
   2281    paddw               m15, m17
   2282    pmaddubsw           m17, m5, m11
   2283    paddw               m16, m18
   2284    pmaddubsw           m18, m6, m11
   2285    paddw               m15, m19
   2286    paddw               m16, m20
   2287    paddw               m15, m17
   2288    paddw               m16, m18
   2289    pmulhrsw            m15, m7
   2290    pmulhrsw            m16, m7
   2291    packuswb            m15, m16
        ; packuswb interleaves per-lane; m14 restores row order so the two
        ; 32-byte rows can be stored contiguously.
   2292    vpermq              m15, m14, m15
   2293    mova          [dstq+dsq*0], ym15
   2294    vextracti32x8 [dstq+dsq*1], m15, 1
   2295    lea                dstq, [dstq+dsq*2]
   2296    sub                  hd, 2
   2297    jg .v_w32_loop
   2298    vzeroupper
   2299    RET
   2300 .v_w64:
   2301 .v_w128:
        ; put, vertical filter, widths 64/128: processed in 64-pixel-wide
        ; column strips. r6d packs the loop counters: low byte = row count
        ; (restored each strip via movzx hd, r6b), upper bits = remaining
        ; strip count in units of 256 (w*4-256: one strip for w=64, two
        ; for w=128). r4/r7 remember the strip base src/dst pointers.
   2302    lea                 r6d, [hq+wq*4-256]
   2303    mov                  r4, srcq
   2304    mov                  r7, dstq
   2305 .v_loop0:
        ; Load 7 setup rows and split each neighbouring-row pair into
        ; low/high byte-interleaved halves (l/h suffixes below), since a
        ; full 64-byte row needs two zmm word accumulators.
   2306    movu                 m2, [srcq+ssq*0]
   2307    movu                 m4, [srcq+ssq*1]
   2308    movu                 m6, [srcq+ssq*2]
   2309    add                srcq, ss3q
   2310    movu                m13, [srcq+ssq*0]
   2311    movu                m15, [srcq+ssq*1]
   2312    movu                m17, [srcq+ssq*2]
   2313    add                srcq, ss3q
   2314    movu                 m0, [srcq+ssq*0]
   2315    punpcklbw            m1, m2, m4    ; 01l
   2316    punpckhbw            m2, m4        ; 01h
   2317    punpcklbw            m3, m4, m6    ; 12l
   2318    punpckhbw            m4, m6        ; 12h
   2319    punpcklbw            m5, m6, m13   ; 23l
   2320    punpckhbw            m6, m13       ; 23h
   2321    punpcklbw           m12, m13, m15  ; 34l
   2322    punpckhbw           m13, m15       ; 34h
   2323    punpcklbw           m14, m15, m17  ; 45l
   2324    punpckhbw           m15, m17       ; 45h
   2325    punpcklbw           m16, m17, m0   ; 56l
   2326    punpckhbw           m17, m0        ; 56h
   2327 .v_loop:
        ; Four accumulators: a (row 0) low/high = m18/m19, b (row 1)
        ; low/high = m20/m21; taps m8-m11 as in the narrower paths.
   2328    pmaddubsw           m18, m1, m8    ; a0l
   2329    mova                 m1, m5
   2330    pmaddubsw           m19, m2, m8    ; a0h
   2331    mova                 m2, m6
   2332    pmaddubsw           m20, m3, m8    ; b0l
   2333    mova                 m3, m12
   2334    pmaddubsw           m21, m4, m8    ; b0h
   2335    mova                 m4, m13
   2336    pmaddubsw            m5, m9        ; a1l
   2337    pmaddubsw            m6, m9        ; a1h
   2338    pmaddubsw           m12, m9        ; b1l
   2339    pmaddubsw           m13, m9        ; b1h
   2340    paddw               m18, m5
   2341    mova                 m5, m14
   2342    pmaddubsw           m14, m10       ; a2l
   2343    paddw               m19, m6
   2344    mova                 m6, m15
   2345    pmaddubsw           m15, m10       ; a2h
   2346    paddw               m20, m12
   2347    mova                m12, m16
   2348    pmaddubsw           m16, m10       ; b2l
   2349    paddw               m21, m13
   2350    mova                m13, m17
   2351    pmaddubsw           m17, m10       ; b2h
   2352    paddw               m18, m14
   2353    paddw               m19, m15
   2354    paddw               m20, m16
   2355    paddw               m21, m17
   2356    movu                m17, [srcq+ssq*1]
   2357    lea                srcq, [srcq+ssq*2]
   2358    punpcklbw           m14, m0, m17  ; 67l
   2359    punpckhbw           m15, m0, m17  ; 67h
   2360    pmaddubsw           m16, m14, m11 ; a3l
   2361    pmaddubsw            m0, m15, m11 ; a3h
   2362    paddw               m18, m16
   2363    paddw               m19, m0
   2364    movu                 m0, [srcq+ssq*0]
   2365    punpcklbw           m16, m17, m0  ; 78l
   2366    punpckhbw           m17, m0       ; 78h
   2367    pmulhrsw            m18, m7
   2368    pmulhrsw            m19, m7
   2369    packuswb            m18, m19
   2370    mova       [dstq+dsq*0], m18
   2371    pmaddubsw           m18, m16, m11 ; b3l
   2372    pmaddubsw           m19, m17, m11 ; b3h
   2373    paddw               m18, m20
   2374    paddw               m19, m21
   2375    pmulhrsw            m18, m7
   2376    pmulhrsw            m19, m7
   2377    packuswb            m18, m19
   2378    mova       [dstq+dsq*1], m18
   2379    lea                dstq, [dstq+dsq*2]
   2380    sub                  hd, 2
   2381    jg .v_loop
        ; Advance to the next 64-pixel column strip, restore the row
        ; counter from r6's low byte, and loop while strips remain.
   2382    add                  r4, 64
   2383    add                  r7, 64
   2384    movzx                hd, r6b
   2385    mov                srcq, r4
   2386    mov                dstq, r7
   2387    sub                 r6d, 256
   2388    jg .v_loop0
   2389    vzeroupper
   2390    RET
   2391 .h:
        ; Horizontal filter entry: if a vertical subpel offset is also
        ; present (myd & 0xf00), fall through to the combined .hv path.
   2392    test                myd, 0xf00
   2393    jnz .hv
   2394 .h2:
        ; m5 = dword rounding bias for the vpdpbusd accumulators.
   2395    vpbroadcastd         m5, [pd_34] ; 2 + (8 << 2)
   2396    cmp                  wd, 4
   2397    jl .h_w2
   2398    vbroadcasti128       m6, [subpel_h_shufA]
   2399    je .h_w4
        ; w >= 8: load full 8-tap coefficients (mx high byte) and the
        ; three source-shuffle patterns, then dispatch via the width
        ; jump table at r8.
   2400    tzcnt                wd, wd
   2401    vbroadcasti128       m7, [subpel_h_shufB]
   2402    vbroadcasti128       m8, [subpel_h_shufC]
   2403    shr                 mxd, 16
   2404    sub                srcq, 3
   2405    movzx                wd, word [r8+wq*2+table_offset(put, _8tap_h)]
   2406    vpbroadcastd         m9, [base+mxq*8+subpel_filters+0]
   2407    vpbroadcastd        m10, [base+mxq*8+subpel_filters+4]
   2408    add                  wq, r8
   2409    jmp                  wq
   2410 .h_w2:
        ; Horizontal, width 2: uses the middle 4 filter taps
        ; (subpel_filters+2), hence srcq is backed up by 1 instead of 3.
        ; Two rows are packed into one xmm and filtered together with a
        ; single vpdpbusd (dot-product of 4 unsigned bytes x 4 signed
        ; coefficients per dword lane).
   2411    movzx               mxd, mxb
   2412    dec                srcq
   2413    mova               xmm4, [subpel_h_shuf4]
   2414    vpbroadcastd       xmm3, [base+mxq*8+subpel_filters+2]
   2415 .h_w2_loop:
   2416    movq               xmm0, [srcq+ssq*0]
   2417    movhps             xmm0, [srcq+ssq*1]
   2418    lea                srcq, [srcq+ssq*2]
   2419    pshufb             xmm0, xmm4
   2420    mova               xmm1, xm5
   2421    vpdpbusd           xmm1, xmm0, xmm3
        ; pack dwords to words, shift out the 6 fractional bits, clamp to
        ; bytes, then store 2 pixels per row.
   2422    packssdw           xmm0, xmm1, xmm1
   2423    psraw              xmm0, 6
   2424    packuswb           xmm0, xm0
   2425    pextrw     [dstq+dsq*0], xmm0, 0
   2426    pextrw     [dstq+dsq*1], xmm0, 1
   2427    lea                dstq, [dstq+dsq*2]
   2428    sub                  hd, 2
   2429    jg .h_w2_loop
   2430    RET
   2431 .h_w4:
        ; Horizontal, width 4: 4-tap filter (subpel_filters+2), one row
        ; per vpdpbusd accumulator, two rows per iteration. m6 was loaded
        ; with subpel_h_shufA by the .h2 dispatcher.
   2432    movzx               mxd, mxb
   2433    dec                srcq
   2434    vpbroadcastd       xmm3, [base+mxq*8+subpel_filters+2]
   2435 .h_w4_loop:
   2436    movq               xmm0, [srcq+ssq*0]
   2437    movq               xmm1, [srcq+ssq*1]
   2438    lea                srcq, [srcq+ssq*2]
   2439    pshufb             xmm0, xm6
   2440    pshufb             xmm1, xm6
   2441    mova               xmm2, xm5
   2442    vpdpbusd           xmm2, xmm0, xmm3
   2443    mova               xmm0, xm5
   2444    vpdpbusd           xmm0, xmm1, xmm3
        ; merge both rows, drop the 6 fractional bits, pack and store.
   2445    packssdw           xmm0, xmm2, xmm0
   2446    psraw              xmm0, 6
   2447    packuswb           xmm0, xmm0
   2448    movd       [dstq+dsq*0], xmm0
   2449    pextrd     [dstq+dsq*1], xmm0, 1
   2450    lea                dstq, [dstq+dsq*2]
   2451    sub                  hd, 2
   2452    jg .h_w4_loop
   2453    RET
   2454 .h_w8:
        ; Horizontal, width 8: two rows in one ymm; the shared PUT_8TAP_H
        ; macro (wrapped to ymm width) does the shuffle/dot-product work,
        ; then vpmovuswb narrows words to bytes with unsigned saturation.
   2455    movu                xm0, [srcq+ssq*0]
   2456    vinserti32x4        ym0, [srcq+ssq*1], 1
   2457    lea                srcq, [srcq+ssq*2]
   2458    WRAP_YMM PUT_8TAP_H   0, 1, 2, 3
   2459    vpmovuswb           xm0, ym0
   2460    movq       [dstq+dsq*0], xm0
   2461    movhps     [dstq+dsq*1], xm0
   2462    lea                dstq, [dstq+dsq*2]
   2463    sub                  hd, 2
   2464    jg .h_w8
   2465    RET
   2466 .h_w16:
        ; Horizontal, width 16: m6/m7/m8 are three byte-permutes offset
        ; from each other by 4 (pb_4), feeding the permute-based variant
        ; of PUT_8TAP_H (last macro arg = 1). Two rows per iteration.
   2467    mova                 m6, [spel_h_perm16]
   2468    vpbroadcastd         m8, [pb_4]
   2469    paddb                m7, m8, m6
   2470    paddb                m8, m7
   2471 .h_w16_loop:
   2472    movu                ym0, [srcq+ssq*0]
   2473    vinserti32x8         m0, [srcq+ssq*1], 1
   2474    lea                srcq, [srcq+ssq*2]
   2475    PUT_8TAP_H            0, 1, 2, 3, 1
   2476    vpmovuswb           ym0, m0
   2477    mova         [dstq+dsq*0], xm0
   2478    vextracti128 [dstq+dsq*1], ym0, 1
   2479    lea                dstq, [dstq+dsq*2]
   2480    sub                  hd, 2
   2481    jg .h_w16_loop
   2482    RET
   2483 .h_w32:
        ; Horizontal, width 32: two rows per iteration; each zmm holds one
        ; 32-byte half of both rows (m0 = bytes 0-31, m1 = bytes 8..39
        ; region for the second macro pass), filtered by two PUT_8TAP_H
        ; invocations and packed back together.
   2484    movu                ym0, [srcq+ssq*0+8*0]
   2485    vinserti32x8         m0, [srcq+ssq*1+8*0], 1
   2486    movu                ym1, [srcq+ssq*0+8*1]
   2487    vinserti32x8         m1, [srcq+ssq*1+8*1], 1
   2488    lea                srcq, [srcq+ssq*2]
   2489    PUT_8TAP_H            0, 2, 3, 4
   2490    PUT_8TAP_H            1, 4, 3, 2
   2491    packuswb             m0, m1
   2492    mova          [dstq+dsq*0], ym0
   2493    vextracti32x8 [dstq+dsq*1], m0, 1
   2494    lea                dstq, [dstq+dsq*2]
   2495    sub                  hd, 2
   2496    jg .h_w32
   2497    RET
   2498 .h_w64:
        ; Horizontal, width 64: one row per iteration, two overlapping
        ; 64-byte loads filtered by two PUT_8TAP_H passes, packed into a
        ; single 64-byte store.
   2499    movu                 m0, [srcq+8*0]
   2500    movu                 m1, [srcq+8*1]
   2501    add                srcq, ssq
   2502    PUT_8TAP_H            0, 2, 3, 4
   2503    PUT_8TAP_H            1, 4, 3, 2
   2504    packuswb             m0, m1
   2505    mova             [dstq], m0
   2506    add                dstq, dsq
   2507    dec                  hd
   2508    jg .h_w64
   2509    RET
   2510 .h_w128:
        ; Horizontal, width 128: as .h_w64 but for both 64-byte halves of
        ; the row (offsets 8*0/8*1 and 8*8/8*9); one row per iteration.
   2511    movu                 m0, [srcq+8*0]
   2512    movu                 m2, [srcq+8*1]
   2513    movu                 m1, [srcq+8*8]
   2514    movu                 m3, [srcq+8*9]
   2515    add                srcq, ssq
   2516    PUT_8TAP_H            0,  4, 11, 12
   2517    PUT_8TAP_H            2, 12, 11,  4
   2518    PUT_8TAP_H            1,  4, 11, 12
   2519    PUT_8TAP_H            3, 12, 11,  4
   2520    packuswb             m0, m2
   2521    packuswb             m1, m3
   2522    mova        [dstq+64*0], m0
   2523    mova        [dstq+64*1], m1
   2524    add                dstq, dsq
   2525    dec                  hd
   2526    jg .h_w128
   2527    RET
   2528 .hv:
        ; Combined horizontal+vertical filter. Shared w<=4 setup: 4-tap h
        ; filter (subpel_filters+2), v filter chosen from my's high byte
        ; unless h < 6 (cmovs picks the 4-tap variant from the low byte).
        ; The v coefficients are widened to words scaled by 64 (zero-
        ; interleave = <<8, then psraw 2) for the pmaddwd/vpdpwssd stage.
   2529    vpbroadcastd         m9, [pd_34]
   2530    pxor                xm0, xm0
   2531    cmp                  wd, 4
   2532    jg .hv_w8
   2533    movzx               mxd, mxb
   2534    dec                srcq
   2535    vpbroadcastd         m7, [base+subpel_filters+mxq*8+2]
   2536    movzx               mxd, myb
   2537    shr                 myd, 16
   2538    cmp                  hd, 6
   2539    cmovs               myd, mxd
   2540    vpbroadcastq        ym1, [base+subpel_filters+myq*8]
   2541    lea                ss3q, [ssq*3]
   2542    mov                  r6, srcq
   2543    punpcklbw           ym0, ym1
   2544    sub                  r6, ss3q
   2545    psraw               ym0, 2 ; << 6
   2546    mova               xm14, [spel_hv_end]
        ; ym10-ym13 = the four v-tap word pairs, broadcast per dword.
   2547    pshufd             ym10, ym0, q0000
   2548    pshufd             ym11, ym0, q1111
   2549    pshufd             ym12, ym0, q2222
   2550    pshufd             ym13, ym0, q3333
   2551    cmp                  wd, 4
   2552    je .hv_w4
        ; Width 2: horizontally filter the 7 setup rows (r6 = srcq-3*ss)
        ; in one ymm batch, then build the 01/12, 23/34, 45/56 word pairs.
   2553    vbroadcasti128      ym6, [subpel_h_shuf4]
   2554    movq               xmm2, [r6+ssq*0]
   2555    movhps             xmm2, [r6+ssq*1]
   2556    movq               xmm0, [r6+ssq*2]
   2557    movhps             xmm0, [srcq+ssq*0]
   2558    vpbroadcastq       ymm3, [srcq+ssq*1]
   2559    vpbroadcastq       ymm4, [srcq+ssq*2]
   2560    add                srcq, ss3q
   2561    vpbroadcastq       ymm1, [srcq+ssq*0]
   2562    vpblendd           ymm2, ymm3, 0x30
   2563    vpblendd           ymm0, ymm1, 0x30 ; 2 3   6 _
   2564    vpblendd           ymm2, ymm4, 0xc0 ; 0 1   4 5
   2565    pshufb             ymm2, ym6
   2566    pshufb             ymm0, ym6
   2567    mova               ymm1, ym9
   2568    vpdpbusd           ymm1, ymm2, ym7
   2569    mova               ymm2, ym9
   2570    vpdpbusd           ymm2, ymm0, ym7
   2571    packssdw           ymm2, ymm1, ymm2
   2572    psraw              ymm2, 2
   2573    vextracti128       xmm3, ymm2, 1
   2574    palignr            xmm4, xmm3, xmm2, 4
   2575    punpcklwd          xmm1, xmm2, xmm4 ; 01 12
   2576    punpckhwd          xmm2, xmm4       ; 23 34
   2577    pshufd             xmm0, xmm3, q2121
   2578    punpcklwd          xmm3, xmm0       ; 45 56
   2579 .hv_w2_loop:
        ; 2 rows per iteration: h-filter the 2 new rows, then run the
        ; 4-pair vertical word dot-product (taps ym10-ym13 / xm10-xm13).
   2580    movq               xmm4, [srcq+ssq*1]
   2581    lea                srcq, [srcq+ssq*2]
   2582    movhps             xmm4, [srcq+ssq*0]
   2583    pmaddwd            xmm5, xmm1, xm10 ; a0 b0
   2584    mova               xmm1, xmm2
   2585    vpdpwssd           xmm5, xmm2, xm11 ; a1 b1
   2586    pshufb             xmm4, xm6
   2587    mova               xmm2, xmm3
   2588    vpdpwssd           xmm5, xmm3, xm12 ; a2 b2
   2589    mova               xmm3, xm9
   2590    vpdpbusd           xmm3, xmm4, xm7
   2591    packssdw           xmm4, xmm3, xmm3
   2592    psraw              xmm4, 2
   2593    palignr            xmm3, xmm4, xmm0, 12
   2594    mova               xmm0, xmm4
   2595    punpcklwd          xmm3, xmm4       ; 67 78
   2596    vpdpwssd           xmm5, xmm3, xm13 ; a3 b3
        ; final rounding/packing is folded into the spel_hv_end shuffle.
   2597    packuswb           xmm5, xmm5
   2598    pshufb             xmm5, xm14
   2599    pextrw     [dstq+dsq*0], xmm5, 0
   2600    pextrw     [dstq+dsq*1], xmm5, 1
   2601    lea                dstq, [dstq+dsq*2]
   2602    sub                  hd, 2
   2603    jg .hv_w2_loop
   2604    vzeroupper
   2605    RET
   2606 .hv_w4:
        ; H+V, width 4: h-filter all 7 setup rows in one zmm batch, then
        ; build the vertical word pairs with the spel_hv_perm4* byte
        ; permutes. k1 = 0x5555 selects alternating words so the masked
        ; vpsraw in the loop merges the two new rows into ym3 in place.
   2607    movq               xmm1, [r6+ssq*0]
   2608    vpbroadcastq        ym2, [r6+ssq*1]
   2609    vinserti32x4        ym1, ymm1, [r6+ssq*2], 1
   2610    vinserti32x4         m2, [srcq+ssq*0], 2
   2611    vinserti32x4         m1, [srcq+ssq*1], 2
   2612    vinserti32x4         m2, [srcq+ssq*2], 3 ; _ 1 3 5
   2613    vbroadcasti32x4      m6, [subpel_h_shufA]
   2614    add                srcq, ss3q
   2615    vinserti32x4         m1, [srcq+ssq*0], 3 ; 0 2 4 6
   2616    pshufb               m2, m6
   2617    pshufb               m1, m6
   2618    mova                 m0, m9
   2619    vpdpbusd             m0, m2, m7
   2620    mova                 m4, m9
   2621    vpdpbusd             m4, m1, m7
   2622    mova                ym1, [spel_hv_perm4a]
   2623    mova                ym2, [spel_hv_perm4b]
   2624    mova                ym3, [spel_hv_perm4c]
   2625    packssdw             m0, m4
   2626    psraw                m0, 2 ; _ 0   1 2   3 4   5 6
   2627    mov                 r6d, 0x5555
   2628    vpermb              ym1, ym1, ym0 ; 01 12
   2629    vpermb               m2, m2, m0   ; 23 34
   2630    vpermb               m3, m3, m0   ; 45 56
   2631    kmovw                k1, r6d
   2632    mova               ym15, [spel_hv_perm4d]
   2633 .hv_w4_loop:
        ; 2 rows per iteration: vertical word dot-products with taps
        ; ym10-ym13 while the new rows are h-filtered in parallel.
   2634    movq               xmm4, [srcq+ssq*1]
   2635    lea                srcq, [srcq+ssq*2]
   2636    vinserti32x4        ym4, ymm4, [srcq+ssq*0], 1
   2637    pmaddwd             ym5, ym1, ym10 ; a0 b0
   2638    mova                ym1, ym2
   2639    pshufb              ym4, ym6
   2640    mova                ym0, ym9
   2641    vpdpbusd            ym0, ym4, ym7
   2642    vpdpwssd            ym5, ym2, ym11 ; a1 b1
   2643    mova                ym2, ym3
   2644    vpdpwssd            ym5, ym3, ym12 ; a2 b2
   2645    vpsraw          ym3{k1}, ym0, 2    ; 7 8
   2646    vpermb              ym3, ym15, ym3 ; 67 78
   2647    vpdpwssd            ym5, ym3, ym13 ; a3 b3
   2648    packuswb            ym5, ym5
   2649    vpermb              ym5, ym14, ym5
   2650    movd       [dstq+dsq*0], xm5
   2651    pextrd     [dstq+dsq*1], xm5, 1
   2652    lea                dstq, [dstq+dsq*2]
   2653    sub                  hd, 2
   2654    jg .hv_w4_loop
   2655    RET
   2656 .hv_w8:
        ; H+V, widths >= 8: full 8-tap in both directions. m10/m11 = h-tap
        ; dword pairs, m12-m15 = v-tap word pairs (scaled by 64 as in the
        ; narrow paths). Widths > 8 branch to .hv_w16.
   2657    shr                 mxd, 16
   2658    sub                srcq, 3
   2659    vpbroadcastd        m10, [base+subpel_filters+mxq*8+0]
   2660    vpbroadcastd        m11, [base+subpel_filters+mxq*8+4]
   2661    movzx               mxd, myb
   2662    shr                 myd, 16
   2663    cmp                  hd, 6
   2664    cmovs               myd, mxd
   2665    vpbroadcastq         m1, [base+subpel_filters+myq*8]
   2666    punpcklbw            m0, m1
   2667    lea                ss3q, [ssq*3]
   2668    psraw                m0, 2 ; << 6
   2669    pshufd              m12, m0, q0000
   2670    pshufd              m13, m0, q1111
   2671    pshufd              m14, m0, q2222
   2672    pshufd              m15, m0, q3333
   2673    cmp                  wd, 8
   2674    jne .hv_w16
        ; Width 8 setup: gather rows 0-6 (r6 = srcq-3*ss) into two zmms
        ; and h-filter them with the shufA/B/C pattern pair, yielding
        ; 16-bit rows 0-3 (m2) and 4-6 (m3).
   2675    mov                  r6, srcq
   2676    sub                  r6, ss3q
   2677    movu               xmm1, [r6+ssq*0]
   2678    vinserti128        ymm1, [r6+ssq*1], 1
   2679    movu               xmm2, [srcq+ssq*1]
   2680    vinserti32x4         m6, zmm1, [r6+ssq*2], 2
   2681    vinserti128        ymm2, [srcq+ssq*2], 1
   2682    vinserti32x4         m6, [srcq+ssq*0], 3 ; 0 1 2 3
   2683    add                srcq, ss3q
   2684    vbroadcasti32x4      m4, [subpel_h_shufA]
   2685    vinserti32x4         m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _
   2686    vbroadcasti32x4      m7, [subpel_h_shufB]
   2687    vbroadcasti32x4      m8, [subpel_h_shufC]
   2688    pshufb               m1, m6, m4  ; 0 1 2 3   0123
   2689    mova                 m2, m9
   2690    vpdpbusd             m2, m1, m10
   2691    pshufb               m5, m6, m7  ; 0 1 2 3   4567
   2692    mova                 m1, m9
   2693    vpdpbusd             m1, m5, m10
   2694    pshufb               m4, m0, m4  ; 4 5 6 _   0123
   2695    mova                 m3, m9
   2696    vpdpbusd             m3, m4, m10
   2697    pshufb               m7, m0, m7  ; 4 5 6 _   4567
   2698    mova                 m4, m9
   2699    vpdpbusd             m4, m7, m10
   2700    pshufb               m6, m8
   2701    vpdpbusd             m2, m5, m11
   2702    vpdpbusd             m1, m6, m11
   2703    pshufb               m6, m0, m8
   2704    vpdpbusd             m3, m7, m11
   2705    vpdpbusd             m4, m6, m11
        ; Build word-pair registers via spel_hv_perm8a/b. r6's 48-bit
        ; mask splits into k1 (0xff00 part: merge row 8 into the upper
        ; lanes of the masked broadcast) and k2 (0x55555555 after
        ; kshiftrq: alternating-word merge for the psraw in the loop).
   2706    mova                 m5, [spel_hv_perm8a]
   2707    vpaddd               m0, m5, [pb_32] {1to16}
   2708    mov                  r6, 0x55555555ff00
   2709    packssdw             m2, m1
   2710    packssdw             m3, m4
   2711    mova                 m8, [spel_hv_perm8b]
   2712    psraw                m2, 2 ; 0 1 2 3
   2713    psraw                m3, 2 ; 4 5 6 _
   2714    vpermb               m1, m5, m2 ; 01 12
   2715    vbroadcasti32x8      m6, [subpel_h_shufA]
   2716    kmovq                k1, r6
   2717    vpermt2b             m2, m0, m3 ; 23 34
   2718    vbroadcasti32x8      m7, [subpel_h_shufB]
   2719    kshiftrq             k2, k1, 16
   2720    mova               xm16, [spel_hv_end]
   2721    vpermb               m3, m5, m3 ; 45 56
   2722 .hv_w8_loop:
        ; 2 rows per iteration: h-filter rows 7/8 while accumulating the
        ; vertical dot-products with taps m12-m15.
   2723    vbroadcasti32x4     ym4, [srcq+ssq*1]
   2724    lea                srcq, [srcq+ssq*2]
   2725    vbroadcasti32x4  m4{k1}, [srcq+ssq*0]
   2726    pmaddwd              m0, m1, m12 ; a0 b0
   2727    pshufb               m1, m4, m6  ; 7 8   0123 4567
   2728    mova                 m5, m9
   2729    vpdpbusd             m5, m1, m10
   2730    pshufb               m4, m7      ; 7 8   4567 89ab
   2731    vpdpwssd             m0, m2, m13 ; a1 b1
   2732    mova                 m1, m2
   2733    vpdpbusd             m5, m4, m11
   2734    mova                 m2, m3
   2735    vpdpwssd             m0, m3, m14 ; a2 b2
   2736    psraw            m3{k2}, m5, 2   ; 75 86
   2737    vpermb               m3, m8, m3  ; 67 78
   2738    vpdpwssd             m0, m3, m15 ; a3 b3
   2739    packuswb             m0, m0
   2740    vpermb             zmm1, m16, m0
   2741    movq       [dstq+dsq*0], xmm1
   2742    movhps     [dstq+dsq*1], xmm1
   2743    lea                dstq, [dstq+dsq*2]
   2744    sub                  hd, 2
   2745    jg .hv_w8_loop
   2746    vzeroupper
   2747    RET
   2748 .hv_w16:
        ; H+V, widths >= 16: processed in 16-pixel column strips. r6d
        ; packs h in its low byte and the strip counter above (w*2-32,
        ; scaled by 8 and decremented by 1<<8 per strip). m20-m22 and
        ; m7/m8 are the spel_hv_perm16* byte gathers (+pb_4 offsets) used
        ; by the permute-based horizontal pass.
   2749    WIN64_SPILL_XMM      23
   2750    movu                m22, [spel_hv_perm16a]
   2751    sub                srcq, ss3q
   2752    vpbroadcastd         m8, [pb_4]
   2753    lea                 r6d, [wq*2-32]
   2754    mova                 m7, [spel_hv_perm16b]
   2755    paddb               m20, m8, m22
   2756    mova               ym16, [spel_hv_end16]
   2757    paddb               m21, m8, m20
   2758    lea                 r6d, [hq+r6*8]
   2759    paddb                m8, m7
   2760 .hv_w16_loop0:
        ; Per strip: h-filter setup rows 0-6 (rows paired two per zmm),
        ; producing word rows 01/23/45 and, via vpshrdd of adjacent
        ; results, the odd pairs 12/34/56.
   2761    movu               ym17, [srcq+ssq*0]
   2762    vinserti32x8        m17, [srcq+ssq*1], 1 ; 0 1
   2763    lea                  r4, [srcq+ss3q]
   2764    movu               ym18, [srcq+ssq*2]
   2765    vinserti32x8        m18, [r4  +ssq*0], 1 ; 2 3
   2766    mov                  r7, dstq
   2767    movu               ym19, [r4  +ssq*1]
   2768    vinserti32x8        m19, [r4  +ssq*2], 1 ; 4 5
   2769    add                  r4, ss3q
   2770    vpermb               m2, m22, m17    ; 0 1   0123   89ab
   2771    mova                 m1, m9
   2772    vpermb               m3, m21, m17    ; 0 1   89ab   ghij
   2773    vpdpbusd             m1, m2, m10
   2774    mova                 m2, m9
   2775    vpermb               m4, m22, m18    ; 2 3   0123   89ab
   2776    vpdpbusd             m2, m3, m11
   2777    mova                 m3, m9
   2778    vpermb               m5, m21, m18    ; 2 3   89ab   ghij
   2779    vpdpbusd             m3, m4, m10
   2780    mova                 m4, m9
   2781    vpermb               m6, m22, m19    ; 4 5   0123   89ab
   2782    vpdpbusd             m4, m5, m11
   2783    mova                 m5, m9
   2784    vpermb              m17, m20, m17    ; 0 1   4567   cdef
   2785    vpdpbusd             m5, m6, m10
   2786    mova                 m6, m9
   2787    vpermb               m0, m21, m19    ; 4 5   89ab   ghij
   2788    vpdpbusd             m1, m17, m11
   2789    vpdpbusd             m2, m17, m10
   2790    movu               ym17, [r4+ssq*0]  ; 6
   2791    vpermb              m18, m20, m18    ; 2 3   4567   cdef
   2792    vpdpbusd             m6, m0, m11
   2793    vpermb               m0, m7, m17     ; 6     0145   2367   89cd   abef
   2794    vpdpbusd             m3, m18, m11
   2795    vpermb              m19, m20, m19    ; 4 5   4567   cdef
   2796    vpdpbusd             m4, m18, m10
   2797    mova                m18, m9
   2798    vpermb              m17, m8, m17     ; 6     4589   67ab   cdgh   efij
   2799    vpdpbusd            m18, m0, m10
   2800    packssdw             m1, m2
   2801    vpdpbusd             m5, m19, m11
   2802    vpdpbusd             m6, m19, m10
   2803    packssdw             m3, m4
   2804    vpdpbusd            m18, m17, m11
   2805    psraw                m1, 2           ; 01
   2806    psraw                m3, 2           ; 23
   2807    packssdw             m5, m6
   2808    vpshrdd              m2, m1, m3, 16  ; 12
   2809    psraw                m5, 2           ; 45
   2810    vpshrdd              m4, m3, m5, 16  ; 34
   2811    psraw               m18, 2
   2812    vpshrdd              m6, m5, m18, 16 ; 56
   2813 .hv_w16_loop:
        ; 2 rows per iteration: h-filter rows 7/8, accumulate a/b rows
        ; with v-taps m12-m15, derive the 67 pair from 56/78 via vpshrdd.
   2814    movu               ym19, [r4+ssq*1]
   2815    lea                  r4, [r4+ssq*2]
   2816    vinserti32x8        m19, [r4+ssq*0], 1
   2817    pmaddwd             m17, m1, m12     ; a0
   2818    vpermb               m1, m22, m19    ; 7 8   0123   89ab
   2819    pmaddwd             m18, m2, m12     ; b0
   2820    mova                 m0, m9
   2821    vpermb               m2, m21, m19    ; 7 8   89ab   ghij
   2822    vpdpbusd             m0, m1, m10
   2823    mova                 m1, m9
   2824    vpermb              m19, m20, m19    ; 7 8   4567   cdef
   2825    vpdpbusd             m1, m2, m11
   2826    mova                 m2, m4
   2827    vpdpwssd            m17, m3, m13     ; a1
   2828    vpdpwssd            m18, m4, m13     ; b1
   2829    mova                 m4, m6
   2830    vpdpbusd             m0, m19, m11
   2831    vpdpbusd             m1, m19, m10
   2832    vpdpwssd            m17, m5, m14     ; a2
   2833    vpdpwssd            m18, m6, m14     ; b2
   2834    packssdw             m0, m1
   2835    mova                 m1, m3
   2836    psraw                m6, m0, 2       ; 78
   2837    mova                 m3, m5
   2838    vpshrdd              m5, m4, m6, 16  ; 67
   2839    vpdpwssd            m18, m6, m15     ; b3
   2840    vpdpwssd            m17, m5, m15     ; a3
   2841    packuswb            m17, m18
   2842    vpermb              m17, m16, m17
   2843    mova         [r7+dsq*0], xm17
   2844    vextracti128 [r7+dsq*1], ym17, 1
   2845    lea                  r7, [r7+dsq*2]
   2846    sub                  hd, 2
   2847    jg .hv_w16_loop
        ; next 16-pixel strip: restore h from r6's low byte.
   2848    add                srcq, 16
   2849    add                dstq, 16
   2850    movzx                hd, r6b
   2851    sub                 r6d, 1<<8
   2852    jg .hv_w16_loop0
   2853    RET
   2854 
        ; t0/t1 alias the ABI registers carrying the mx/my filter-type
        ; words (Win64 and SysV pass them in different registers).
   2855 %if WIN64
   2856 DECLARE_REG_TMP 6, 4
   2857 %else
   2858 DECLARE_REG_TMP 6, 7
   2859 %endif
   2860 
        ; Declare the prep entry points for each h/v filter-type combo;
        ; all sharp/smooth variants alias into prep_6tap_8bpc below.
   2861 %define PREP_8TAP_FN FN prep_8tap,
   2862 PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_6tap_8bpc
   2863 PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_6tap_8bpc
   2864 PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_8bpc
   2865 PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_8bpc
   2866 PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_8bpc
   2867 PREP_8TAP_FN regular,        REGULAR, REGULAR
        ; prep_6tap_8bpc(tmp, src, ss, w, h, mx, my): 6-tap subpel prep.
        ; mx/my are combined with the filter-type selectors (t0d/t1d) so
        ; bits 8-11 being non-zero indicates a subpel offset in that
        ; direction; with none set, .prep is a plain copy dispatched by
        ; width through the jump table.
   2869 cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my, ss3
   2870 %define base r7-prep_avx512icl
   2871    imul                mxd, mxm, 0x010101
   2872    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
   2873    imul                myd, mym, 0x010101
   2874    add                 myd, t1d ; 6tap_v, my, 4tap_v
   2875    lea                  r7, [prep_avx512icl]
   2876    movifnidn            hd, hm
   2877    test                mxd, 0xf00
   2878    jnz .h
   2879    test                myd, 0xf00
   2880    jnz .v
   2881 .prep:
   2882    tzcnt                wd, wd
   2883    movzx                wd, word [r7+wq*2+table_offset(prep,)]
   2884    add                  wq, r7
   2885    lea                  r6, [ssq*3]
   2886 %if WIN64
   2887    pop                  r7
   2888 %endif
   2889    jmp                  wq
; Vertical-only 6-tap filtering. Loads the three 16-bit coefficient pairs of
; the 6-tap filter into m8/m9/m10 (broadcast words), sets up the rounding
; multiplier pw_8192 (pmulhrsw by 8192 == rounded >>2), rewinds srcq by two
; rows (the two taps above the center), then dispatches on width.
.v:
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd ; h <= 4 uses the 4-tap variant of the filter
    tzcnt               r5d, wd
    lea                 myq, [base+subpel_filters+1+myq*8] ; +1: center 6 of 8 taps
    movzx               r5d, word [r7+r5*2+table_offset(prep, _6tap_v)]
    vpbroadcastd         m7, [pw_8192]
    sub                srcq, ssq
    vpbroadcastw         m8, [myq+0] ; taps 0+1
    add                  r5, r7
    vpbroadcastw         m9, [myq+2] ; taps 2+3
    lea                ss3q, [ssq*3]
    vpbroadcastw        m10, [myq+4] ; taps 4+5
    sub                srcq, ssq ; srcq now 2 rows above the first output row
    jmp                  r5
; w == 4: process 4 rows per iteration with 256-bit registers. Rows are
; gathered as dwords and shuffled (deint_shuf4) into interleaved row-pair
; form so each pmaddubsw computes one tap-pair for 4 output rows at once.
.v_w4:
    movd               xmm2, [srcq+ssq*0]
    pinsrd             xmm2, [srcq+ssq*1], 1
    vpbroadcastd       ymm1, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastd       ymm3, [srcq+ssq*0]
    vpbroadcastd       ymm0, [srcq+ssq*1]
    vbroadcasti128     ymm5, [deint_shuf4]
    vpblendd           ymm1, ymm2, 0xeb
    punpcklqdq         ymm3, ymm0
    vpblendd           ymm1, ymm3, 0x60 ; 0 1 2 _   2 3 4 _
    pshufb             ymm1, ymm5       ; 01 12 23 34
.v_w4_loop:
    ; Load the next 4 rows and build the 45/56/67/78 interleave.
    pinsrd             xmm0, [srcq+ssq*2], 1
    vpbroadcastd       ymm2, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    vpbroadcastd       ymm3, [srcq+ssq*0]
    vpblendd           ymm2, ymm0, 0xeb
    vpbroadcastd       ymm0, [srcq+ssq*1]
    punpcklqdq         ymm3, ymm0
    vpblendd           ymm2, ymm3, 0x60 ; 4 5 6 _   6 7 8 _
    pshufb             ymm2, ymm5       ; 45 56 67 78
    pmaddubsw          ymm3, ymm1, ym8  ; a0 b0 c0 d0
    vperm2i128         ymm1, ymm2, 0x21 ; 23 34 45 56
    pmaddubsw          ymm4, ymm2, ym10 ; a2 b2 c2 d2
    pmaddubsw          ymm1, ym9        ; a1 b1 c1 d1
    paddw              ymm3, ymm4
    paddw              ymm3, ymm1
    pmulhrsw           ymm3, ym7        ; rounded >>2 via pw_8192
    mova               ymm1, ymm2       ; slide the row window forward
    mova             [tmpq], ymm3
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    vzeroupper                          ; leaving legacy-ymm code
    RET
; w == 8: 4 rows per iteration in one zmm. Rows (qwords) are merged with
; masked vpunpcklqdq ({1to8} broadcasts + k-masks) and spel_v_perm8 produces
; the adjacent-row byte interleaves consumed by pmaddubsw.
.v_w8:
    mova                 m6, [spel_v_perm8]
    movq                xm1, [srcq+ssq*0]
    mov                 r6d, 0x3e
    movq                xm2, [srcq+ssq*1]
    kmovb                k1, r6d        ; lane-merge mask for the qword stitch
    vpbroadcastq        ym3, [srcq+ssq*2]
    add                srcq, ss3q
    vpunpcklqdq         ym2, [srcq+ssq*0] {1to4}
    vpunpcklqdq      m1{k1}, m3, [srcq+ssq*1] {1to8}
    movq                xm0, [srcq+ssq*1]
    kshiftlb             k2, k1, 2
    shufpd               m1, m2, 0x18  ; 0 1   2 3   4
    vpermb               m1, m6, m1    ; 01 12 23 34
.v_w8_loop:
    vpbroadcastq        ym3, [srcq+ss3q ]
    vpunpcklqdq     ym0{k1}, ym3, [srcq+ssq*2] {1to4}
    lea                srcq, [srcq+ssq*4]
    vpbroadcastq         m3, [srcq+ssq*1]
    vpunpcklqdq      m0{k2}, m3, [srcq+ssq*0] {1to8}
    pmaddubsw            m4, m1, m8    ; a0 b0 c0 d0
    vpermb               m2, m6, m0    ; 45 56 67 78
    mova                xm0, xm3       ; keep newest row for the next iteration
    vshufi32x4           m1, m2, q1032 ; 23 34 45 56
    pmaddubsw            m3, m2, m10   ; a3 b3 c3 d3
    pmaddubsw            m5, m1, m9    ; a2 b2 c2 d2
    mova                 m1, m2
    paddw                m4, m3
    paddw                m4, m5
    pmulhrsw             m4, m7        ; rounded >>2 via pw_8192
    mova             [tmpq], m4
    add                tmpq, 64
    sub                  hd, 4
    jg .v_w8_loop
    RET
; w == 16: 4 rows per iteration, two zmm outputs. Pairs of 16-byte rows are
; combined with masked vshufpd, then spel_v_perm16b interleaves adjacent rows
; bytewise for the tap-pair multiplies.
.v_w16:
    mova                m11, [spel_v_perm16b]
    vbroadcasti32x4      m1, [srcq+ssq*0]
    mov                 r6d, 0x0f
    vbroadcasti32x4     ym3, [srcq+ssq*1]
    vbroadcasti32x4      m2, [srcq+ssq*2]
    kmovb                k1, r6d        ; low-qword-lane merge mask
    add                srcq, ss3q
    vbroadcasti32x4     ym4, [srcq+ssq*0]
    vbroadcasti32x4      m0, [srcq+ssq*1]
    vshufpd          m1{k1}, m3, m2, 0xcc
    vshufpd          m2{k1}, m4, m0, 0xcc
    vpermb               m1, m11, m1 ; 01 12
    vpermb               m2, m11, m2 ; 23 34
.v_w16_loop:
    pmaddubsw            m3, m1, m8  ; a0 b0
    pmaddubsw            m5, m2, m9  ; a1 b1
    vbroadcasti32x4     ym6, [srcq+ssq*2]
    pmaddubsw            m4, m2, m8  ; c0 d0
    vbroadcasti32x4      m2, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    vshufpd          m0{k1}, m6, m2, 0xcc
    vbroadcasti32x4     ym6, [srcq+ssq*0]
    vpermb               m1, m11, m0 ; 45 56
    vbroadcasti32x4      m0, [srcq+ssq*1]
    vshufpd          m2{k1}, m6, m0, 0xcc
    pmaddubsw            m6, m1, m9  ; c1 d1
    vpermb               m2, m11, m2 ; 67 78
    paddw                m3, m5
    pmaddubsw            m5, m1, m10 ; a2 b2
    paddw                m4, m6
    pmaddubsw            m6, m2, m10 ; c2 d2
    paddw                m3, m5
    paddw                m4, m6
    pmulhrsw             m3, m7      ; rounded >>2
    pmulhrsw             m4, m7
    mova          [tmpq+ 0], m3
    mova          [tmpq+64], m4
    add                tmpq, 64*2
    sub                  hd, 4
    jg .v_w16_loop
    RET
; w == 32: 2 rows per iteration. Each 32-byte row lives in a ymm; vpermt2q
; with (the shuffled) bilin_v_perm64 packs two rows into one zmm so the
; punpck{l,h}bw pairs form the row interleaves for the three tap pairs.
.v_w32:
    movshdup             m6, [bilin_v_perm64]
    movu               ym16, [srcq+ssq*0]
    movu               ym17, [srcq+ssq*1]
    movu               ym18, [srcq+ssq*2]
    add                srcq, ss3q
    movu               ym19, [srcq+ssq*0]
    add                srcq, ssq
    movu               ym20, [srcq+ssq*0]
    vpermt2q            m16, m6, m18   ; 0 2
    vpermt2q            m17, m6, m19   ; 1 3
    vpermt2q            m18, m6, m20   ; 2 4
    punpcklbw            m0, m16, m17  ; 01
    punpcklbw            m1, m17, m18  ; 12
    punpckhbw            m2, m16, m17  ; 23
    punpckhbw            m3, m17, m18  ; 34
.v_w32_loop:
    movu               ym16, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movu               ym17, [srcq+ssq*0]
    pmaddubsw            m4, m0, m8    ; a0
    mova                 m0, m2        ; shift the row-pair pipeline
    pmaddubsw            m2, m9        ; a1
    vpermt2q            m16, m6, m17   ; 5 6
    pmaddubsw            m5, m1, m8    ; b0
    mova                 m1, m3
    pmaddubsw            m3, m9        ; b1
    shufpd              m18, m16, 0x55 ; 4 5
    paddw                m4, m2
    punpcklbw            m2, m18, m16  ; 45
    paddw                m5, m3
    punpckhbw            m3, m18, m16  ; 56
    mova                m18, m16       ; newest row pair becomes "previous"
    pmaddubsw           m16, m2, m10   ; a2
    pmaddubsw           m17, m3, m10   ; b2
    paddw                m4, m16
    paddw                m5, m17
    pmulhrsw             m4, m7        ; rounded >>2
    pmulhrsw             m5, m7
    mova          [tmpq+ 0], m4
    mova          [tmpq+64], m5
    add                tmpq, 64*2
    sub                  hd, 2
    jg .v_w32_loop
    vzeroupper
    RET
; w >= 64: process in 64-pixel-wide columns, 2 rows per inner iteration.
; r6d packs the column counter in its high byte and the row count in its low
; byte (the movzx/sub 1<<8 dance at the bottom). wd is doubled up front so it
; doubles as the 16-bit-element output row pitch.
.v_w64:
.v_w128:
    mova                 m6, [bilin_v_perm64]
    add                  wd, wd
    lea                 r6d, [hq+wq]
.v_loop0:
    vpermq              m12, m6, [srcq+ssq*0]
    vpermq              m13, m6, [srcq+ssq*1]
    lea                  r5, [srcq+ssq*2]
    vpermq              m14, m6, [r5  +ssq*0]
    vpermq              m15, m6, [r5  +ssq*1]
    lea                  r5, [r5+ssq*2]
    vpermq              m16, m6, [r5  +ssq*0]
    mov                  r7, tmpq
    punpcklbw            m0, m12, m13 ; 01
    punpckhbw           m12, m13
    punpcklbw            m1, m13, m14 ; 12
    punpckhbw           m13, m14
    punpcklbw            m2, m14, m15 ; 23
    punpckhbw           m14, m15
    punpcklbw            m3, m15, m16 ; 34
    punpckhbw           m15, m16
.v_loop:
    ; Low-half (punpcklbw) products in m17/m19, high-half in m18/m20;
    ; rows a and b are produced together.
    pmaddubsw           m17, m0, m8   ; a0
    vpermq               m5, m6, [r5+ssq*1]
    pmaddubsw           m18, m12, m8
    mova                 m0, m2
    pmaddubsw            m2, m9       ; a1
    mova                m12, m14
    pmaddubsw           m14, m9
    lea                  r5, [r5+ssq*2]
    pmaddubsw           m19, m1, m8   ; b0
    pmaddubsw           m20, m13, m8
    mova                 m1, m3
    pmaddubsw            m3, m9       ; b1
    mova                m13, m15
    pmaddubsw           m15, m9
    paddw               m17, m2
    punpcklbw            m2, m16, m5  ; 67
    paddw               m18, m14
    punpckhbw           m14, m16, m5
    vpermq              m16, m6, [r5+ssq*0]
    paddw               m19, m3
    pmaddubsw            m3, m2, m10  ; a3
    paddw               m20, m15
    pmaddubsw           m15, m14, m10
    paddw               m17, m3
    punpcklbw            m3, m5, m16  ; 78
    pmaddubsw            m4, m3, m10  ; b3
    paddw               m18, m15
    punpckhbw           m15, m5, m16
    pmaddubsw            m5, m15, m10
    paddw               m19, m4
    paddw               m20, m5
    REPX   {pmulhrsw x, m7}, m17, m18, m19, m20 ; rounded >>2
    mova       [r7+wq*0+ 0], m17
    mova       [r7+wq*0+64], m18
    mova       [r7+wq*1+ 0], m19
    mova       [r7+wq*1+64], m20
    lea                  r7, [r7+wq*2]
    sub                  hd, 2
    jg .v_loop
    add                srcq, 64      ; next 64-pixel column
    add                tmpq, 128
    movzx                hd, r6b     ; reload row count for the new column
    sub                 r6d, 1<<8    ; decrement the column counter
    jg .v_loop0
    vzeroupper
    RET
; Horizontal filtering requested. Pure-horizontal cases share the 8-tap
; function's .h2 path (tail jump); otherwise fall through to the 2-D (.hv)
; 6-tap kernel below.
.h:
    test                myd, 0xf00
    jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2
; 2-D 6-tap filtering. Horizontal stage: vpdpbusd dot-products with pd_2
; rounding bias, then >>2 to a 16-bit intermediate. Vertical stage: vpdpwssd
; on the 16-bit rows with pd_32 bias, then >>6. This section handles w <= 4;
; wider blocks branch to .hv_w8.
.hv:
    vpbroadcastd         m8, [pd_2]  ; horizontal rounding bias (+2 >>2)
    vpbroadcastd         m9, [pd_32] ; vertical rounding bias  (+32 >>6)
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb
    vpbroadcastd        m11, [base+subpel_filters+mxq*8+2] ; 4-tap h coeffs
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd    ; h <= 4: 4-tap vertical filter
    vpbroadcastq         m3, [base+subpel_filters+1+myq*8] ; center 6 v taps
    vbroadcasti128      m10, [subpel_h_shufA]
    lea                  r6, [ssq*2+1]
    mov                 r3d, 0x30
    sub                srcq, r6     ; back up 2 rows and 1 column
    kmovb                k1, r3d
    vpbroadcastq        ym2, [srcq+ssq*0]
    lea                ss3q, [ssq*3]
    vpbroadcastq         m1, [srcq+ssq*1]
    kaddb                k2, k1, k1
    vpbroadcastq     m2{k1}, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq     m1{k2}, [srcq+ssq*0] ; _ _ 1 3
    punpcklbw            m3, m3
    vpbroadcastq     m2{k2}, [srcq+ssq*1] ; _ 0 2 4
    psraw                m3, 8 ; sign-extend
    mova                 m6, [spel_hv_perm4a]
    kshiftrb             k1, k1, 2
    movu                 m7, [spel_hv_perm4b]
    pshufb               m1, m10
    mova                 m0, m8
    vpdpbusd             m0, m1, m11
    pshufb               m2, m10
    mova                 m1, m8
    vpdpbusd             m1, m2, m11
    pshufd              m12, m3, q0000 ; v taps 0+1
    pshufd              m13, m3, q1111 ; v taps 2+3
    pshufd              m14, m3, q2222 ; v taps 4+5
    packssdw             m0, m1           ; _ _   _ 0   1 2   3 4
    psraw                m0, 2
    vpermb               m1, m7, m0       ; 01 12 23 34
.hv_w4_loop:
    movq                xm3, [srcq+ssq*2]
    movq                xm4, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    vpbroadcastq    ym3{k1}, [srcq+ssq*0] ; 5 7
    vpbroadcastq    ym4{k1}, [srcq+ssq*1] ; 6 8
    pshufb              ym3, ym10
    mova                ym2, ym8
    vpdpbusd            ym2, ym3, ym11
    pshufb              ym4, ym10
    mova                ym3, ym8
    vpdpbusd            ym3, ym4, ym11
    mova                 m4, m9
    vpdpwssd             m4, m1, m12      ; a0 b0 c0 d0
    packssdw            ym2, ym3          ; 5 6   7 8
    psraw               ym2, 2
    vshufi32x4           m0, m2, q1032    ; _ 2   3 4   5 6   7 8
    vpermb               m2, m6, m0       ; 23 34 45 56
    vpermb               m1, m7, m0       ; 45 56 67 78
    vpdpwssd             m4, m2, m13      ; a1 b1 c1 d1
    vpdpwssd             m4, m1, m14      ; a2 b2 c2 d2
    psrad                m4, 6
    vpmovdw          [tmpq], m4           ; narrow dwords to words and store
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    RET
; 2-D path for w > 4. Loads the 8-tap horizontal coefficients in two dword
; halves (m10/m11), sign-extends the 6 vertical taps into word pairs
; m12/m13/m14, then splits into the w == 8 kernel here or .hv_w16 for wider.
.hv_w8:
    shr                 mxd, 16
    vpbroadcastd        m10, [base+subpel_filters+mxq*8+0] ; h taps 0-3
    vpbroadcastd        m11, [base+subpel_filters+mxq*8+4] ; h taps 4-7
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd    ; h <= 4: 4-tap vertical filter
    vpbroadcastq         m0, [base+subpel_filters+1+myq*8] ; center 6 v taps
    lea                  r6, [ssq*2+3]
    punpcklbw            m0, m0
    sub                srcq, r6     ; back up 2 rows and 3 columns
    psraw                m0, 8 ; sign-extend
    lea                ss3q, [ssq*3]
    pshufd              m12, m0, q0000 ; v taps 0+1
    pshufd              m13, m0, q1111 ; v taps 2+3
    pshufd              m14, m0, q2222 ; v taps 4+5
    cmp                  wd, 8
    jg .hv_w16
    ; w == 8: horizontally filter rows 0-4 to prime the vertical pipeline.
    movu               xm16, [srcq+ssq*0]
    vbroadcasti32x4     m19, [subpel_h_shufA]
    vinserti128        ym16, [srcq+ssq*1], 1
    vbroadcasti32x4     m21, [subpel_h_shufC]
    vinserti32x4        m16, [srcq+ssq*2], 2
    add                srcq, ss3q
    vinserti32x4        m16, [srcq+ssq*0], 3
    movu               xm17, [srcq+ssq*1]
    vbroadcasti32x4     m20, [subpel_h_shufB]
    pshufb               m3, m16, m19   ; 0 1 2 3   0123
    mova                 m2, m8
    pshufb               m0, m16, m21   ; 0 1 2 3   89ab
    vpdpbusd             m2, m3, m10
    mova                 m3, m8
    pshufb              xm1, xm17, xm19 ; 3 4 5 6   0123
    vpdpbusd             m3, m0, m11
    mova                xm0, xm8
    pshufb             xm18, xm17, xm21 ; 3 4 5 6   89ab
    vpdpbusd            xm0, xm1, xm10
    mova                xm1, xm8
    pshufb              m16, m20        ; 0 1 2 3   4567
    vpdpbusd            xm1, xm18, xm11
    pshufb             xm17, xm20       ; 3 4 5 6   4567
    vpdpbusd             m2, m16, m11
    vpdpbusd             m3, m16, m10
    vpdpbusd            xm0, xm17, xm11
    vpdpbusd            xm1, xm17, xm10
    packssdw             m2, m3
    packssdw            xm0, xm1
    psraw                m2, 2          ; 0 1 2 3
    psraw               xm0, 2          ; 4
    valignq              m0, m2, 2      ; 1 2 3 4
    punpcklwd            m1, m2, m0     ; 01 12 23 34
    punpckhwd            m2, m0
.hv_w8_loop:
    ; Horizontal-filter rows 5-8, then run the 3 vertical tap pairs.
    movu               xm16, [srcq+ssq*2]
    vinserti128        ym16, [srcq+ss3q ], 1
    lea                srcq, [srcq+ssq*4]
    vinserti32x4        m16, [srcq+ssq*0], 2
    vinserti32x4        m16, [srcq+ssq*1], 3
    pshufb               m6, m16, m19   ; 5 6 7 8   0123
    mova                 m5, m8
    pshufb               m3, m16, m21   ; 5 6 7 8   89ab
    vpdpbusd             m5, m6, m10
    mova                 m6, m8
    pshufb              m16, m20        ; 5 6 7 8   4567
    vpdpbusd             m6, m3, m11
    mova                 m3, m9
    vpdpwssd             m3, m1, m12    ; a0 b0 c0 d0
    mova                 m4, m9
    vpdpwssd             m4, m2, m12
    vpdpbusd             m5, m16, m11
    vpdpbusd             m6, m16, m10
    mova                m16, m1
    packssdw             m5, m6
    mova                 m6, m2
    psraw                m5, 2          ; 5 6 7 8
    valignq              m2, m5, m0, 6  ; 4 5 6 7
    mova                 m0, m5
    punpcklwd            m1, m2, m5     ; 45 56 67 78
    punpckhwd            m2, m5
    vpdpwssd             m3, m1, m14    ; a2 b2 c2 d2
    vpdpwssd             m4, m2, m14
    vshufi32x4          m16, m1, q1032  ; 23 34 45 56
    vshufi32x4           m6, m2, q1032
    vpdpwssd             m3, m16, m13   ; a1 b1 c1 d1
    vpdpwssd             m4, m6, m13
    psrad                m3, 6
    psrad                m4, 6
    packssdw             m3, m4
    mova             [tmpq], m3
    add                tmpq, 64
    sub                  hd, 4
    jg .hv_w8_loop
    vzeroupper
    RET
; 2-D path for w >= 16: process 16-pixel columns, 2 rows per inner iteration.
; m16/m17/m18 are staggered spel_h_perm16 byte-gather patterns (offset by
; pb_4) selecting the 0123/4567/89ab tap windows. r6d packs the column count
; (high byte) with the row count (low byte), as in .v_w64.
.hv_w16:
    mova                m16, [spel_h_perm16]
    vpbroadcastd        m18, [pb_4]
    add                  wd, wd        ; 16-bit output pitch
    paddb               m17, m18, m16
    lea                 r6d, [hq+wq*8-256]
    paddb               m18, m17
.hv_w16_loop0:
    ; Prime the pipeline: horizontally filter rows 0-4 of this column.
    movu               ym19, [srcq+ssq*0]
    vinserti32x8        m19, [srcq+ssq*1], 1
    lea                  r5, [srcq+ssq*2]
    movu               ym20, [r5  +ssq*0]
    vinserti32x8        m20, [r5  +ssq*1], 1
    lea                  r5, [r5  +ssq*2]
    movu               ym21, [r5  +ssq*0]
    mov                  r7, tmpq
    vpermb               m3, m16, m19      ; 0 1   0123   89ab
    mova                 m2, m8
    vpermb               m4, m18, m19      ; 0 1   89ab   ghij
    vpdpbusd             m2, m3, m10
    mova                 m3, m8
    vpermb               m5, m16, m20      ; 2 3   0123   89ab
    vpdpbusd             m3, m4, m11
    mova                 m4, m8
    vpermb               m0, m18, m20      ; 2 3   89ab   ghij
    vpdpbusd             m4, m5, m10
    mova                 m5, m8
    vpermb              ym1, ym16, ym21    ; 4     0123   89ab
    vpdpbusd             m5, m0, m11
    mova                ym0, ym8
    vpermb              ym6, ym18, ym21    ; 4     89ab   ghij
    vpdpbusd            ym0, ym1, ym10
    mova                ym1, ym8
    vpermb              m19, m17, m19      ; 0 1   4567   cdef
    vpdpbusd            ym1, ym6, ym11
    vpermb              m20, m17, m20      ; 2 3   4567   cdef
    vpdpbusd             m2, m19, m11
    vpdpbusd             m3, m19, m10
    vpermb             ym21, ym17, ym21    ; 4     4567   cdef
    vpdpbusd             m4, m20, m11
    vpdpbusd             m5, m20, m10
    vpdpbusd            ym0, ym21, ym11
    vpdpbusd            ym1, ym21, ym10
    packssdw             m2, m3            ; 0 1
    packssdw             m4, m5            ; 2 3
    packssdw            ym0, ym1           ; 4
    REPX       {psraw x, 2}, m2, m4, ym0
    vshufi32x4           m3, m2, m4, q1032 ; 1 2
    vshufi32x4           m0, m4, m0, q1032 ; 3 4
    punpcklwd            m1, m2, m3        ; 01 12
    punpckhwd            m2, m3
    punpcklwd            m3, m4, m0        ; 23 34
    punpckhwd            m4, m0
.hv_w16_loop:
    ; Horizontal-filter rows 5-6, then the three vertical tap pairs.
    movu               ym19, [r5+ssq*1]
    lea                  r5, [r5+ssq*2]
    vinserti32x8        m19, [r5+ssq*0], 1
    vpermb               m6, m16, m19      ; 5 6   0123   89ab
    mova                 m5, m8
    vpermb              m20, m18, m19      ; 5 6   89ab   ghij
    vpdpbusd             m5, m6, m10
    mova                 m6, m8
    vpermb              m19, m17, m19      ; 5 6   4567   cdef
    vpdpbusd             m6, m20, m11
    mova                m20, m9
    vpdpwssd            m20, m1, m12       ; a0 b0
    mova                m21, m9
    vpdpwssd            m21, m2, m12
    vpdpbusd             m5, m19, m11
    vpdpbusd             m6, m19, m10
    vpdpwssd            m20, m3, m13       ; a1 b1
    vpdpwssd            m21, m4, m13
    packssdw             m5, m6
    mova                 m1, m3            ; slide the row-pair window
    psraw                m5, 2             ; 5 6
    mova                 m2, m4
    vshufi32x4           m4, m0, m5, q1032 ; 4 5
    mova                 m0, m5
    punpcklwd            m3, m4, m0        ; 45 56
    punpckhwd            m4, m0
    vpdpwssd            m20, m3, m14       ; a2 b2
    vpdpwssd            m21, m4, m14
    psrad               m20, 6
    psrad               m21, 6
    packssdw            m20, m21
    mova          [r7+wq*0], ym20
    vextracti32x8 [r7+wq*1], m20, 1
    lea                  r7, [r7+wq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    add                srcq, 16      ; next 16-pixel column
    add                tmpq, 32
    movzx                hd, r6b     ; reload row count
    sub                 r6d, 1<<8    ; decrement column counter
    jg .hv_w16_loop0
    vzeroupper
    RET
   3399 
; Horizontal 8-tap filter of two zmm-loads of source pixels (m0/m1).
; Inputs:  m0/m1 = source bytes, m4 = rounding bias, m5/m6/m7 = vpermb
;          gather patterns for staggered tap windows, m8/m9 = coefficient
;          halves. Output: filtered words, rounded >>2, stored to [tmpq].
; Clobbers m0-m3 and m10-m15; does not advance tmpq.
%macro PREP_8TAP_H 0
    vpermb              m10, m5, m0
    vpermb              m11, m5, m1
    vpermb              m12, m6, m0
    vpermb              m13, m6, m1
    vpermb              m14, m7, m0
    vpermb              m15, m7, m1
    mova                 m0, m4       ; seed accumulators with the bias
    vpdpbusd             m0, m10, m8
    mova                 m2, m4
    vpdpbusd             m2, m12, m8
    mova                 m1, m4
    vpdpbusd             m1, m11, m8
    mova                 m3, m4
    vpdpbusd             m3, m13, m8
    vpdpbusd             m0, m12, m9  ; second coefficient half, shifted window
    vpdpbusd             m2, m14, m9
    vpdpbusd             m1, m13, m9
    vpdpbusd             m3, m15, m9
    packssdw             m0, m2
    packssdw             m1, m3
    psraw                m0, 2        ; drop the 2 fractional bits
    psraw                m1, 2
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
%endmacro
   3426 
; Filter combinations that require the full 8-tap kernel; the last one
; (sharp/sharp) omits the target name because prep_8tap_8bpc itself follows.
PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_8bpc
PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_8bpc
PREP_8TAP_FN sharp,          SHARP,   SHARP
   3430 
   3431 cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my, stride3
   3432    imul                mxd, mxm, 0x010101
   3433    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
   3434    imul                myd, mym, 0x010101
   3435    add                 myd, t1d ; 8tap_v, my, 4tap_v
   3436    lea                  r7, [prep_avx512icl]
   3437    movifnidn            hd, hm
   3438    test                mxd, 0xf00
   3439    jnz .h
   3440    test                myd, 0xf00
   3441    jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep
   3442 .v:
   3443    movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.
   3444    shr                 myd, 16  ; Note that the code is 8-tap only, having
   3445    cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4
   3446    cmove               myd, mxd ; had a negligible effect on performance.
   3447    tzcnt               r5d, wd
   3448    lea                 myq, [base+subpel_filters+myq*8]
   3449    movzx               r5d, word [r7+r5*2+table_offset(prep, _8tap_v)]
   3450    vpbroadcastd         m7, [pw_8192]
   3451    vpbroadcastw         m8, [myq+0]
   3452    add                  r5, r7
   3453    vpbroadcastw         m9, [myq+2]
   3454    lea            stride3q, [strideq*3]
   3455    vpbroadcastw        m10, [myq+4]
   3456    sub                srcq, stride3q
   3457    vpbroadcastw        m11, [myq+6]
   3458    jmp                  r5
   3459 .v_w4:
   3460    movd               xmm0, [srcq+strideq*0]
   3461    vpbroadcastd       ymm1, [srcq+strideq*2]
   3462    vpbroadcastd       xmm2, [srcq+strideq*1]
   3463    vpbroadcastd       ymm3, [srcq+stride3q ]
   3464    lea                srcq, [srcq+strideq*4]
   3465    vpblendd           ymm1, ymm0, 0x01       ; 0 2 2 _   2 _ _ _
   3466    vpblendd           ymm3, ymm2, 0x03       ; 1 1 3 3   3 3 _ _
   3467    vpbroadcastd       ymm0, [srcq+strideq*0]
   3468    vpbroadcastd       ymm2, [srcq+strideq*1]
   3469    vpblendd           ymm1, ymm0, 0x68       ; 0 2 2 4   2 4 4 _
   3470    vpbroadcastd       ymm0, [srcq+strideq*2]
   3471    vbroadcasti128     ymm5, [deint_shuf4]
   3472    vpblendd           ymm3, ymm2, 0xc0       ; 1 1 3 3   3 3 5 5
   3473    vpblendd           ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3   2 3 4 5
   3474    vpblendd           ymm3, ymm1, 0xaa       ; 1 2 3 4   3 4 5 _
   3475    punpcklbw          ymm1, ymm2, ymm3       ; 01  12    23  34
   3476    vpblendd           ymm3, ymm0, 0x80       ; 1 2 3 4   3 4 5 6
   3477    punpckhbw          ymm2, ymm3             ; 23  34    45  56
   3478 .v_w4_loop:
   3479    pinsrd             xmm0, [srcq+stride3q ], 1
   3480    lea                srcq, [srcq+strideq*4]
   3481    vpbroadcastd       ymm3, [srcq+strideq*0]
   3482    vpbroadcastd       ymm4, [srcq+strideq*1]
   3483    vpblendd           ymm3, ymm4, 0x20       ; _ _ 8 _   8 9 _ _
   3484    vpblendd           ymm3, ymm0, 0x03       ; 6 7 8 _   8 9 _ _
   3485    vpbroadcastd       ymm0, [srcq+strideq*2]
   3486    vpblendd           ymm3, ymm0, 0x40       ; 6 7 8 _   8 9 a _
   3487    pshufb             ymm3, ymm5             ; 67  78    89  9a
   3488    pmaddubsw          ymm4, ymm1, ym8
   3489    vperm2i128         ymm1, ymm2, ymm3, 0x21 ; 45  56    67  78
   3490    pmaddubsw          ymm2, ym9
   3491    paddw              ymm4, ymm2
   3492    mova               ymm2, ymm3
   3493    pmaddubsw          ymm3, ym11
   3494    paddw              ymm3, ymm4
   3495    pmaddubsw          ymm4, ymm1, ym10
   3496    paddw              ymm3, ymm4
   3497    pmulhrsw           ymm3, ym7
   3498    mova             [tmpq], ymm3
   3499    add                tmpq, 32
   3500    sub                  hd, 4
   3501    jg .v_w4_loop
   3502    vzeroupper
   3503    RET
   3504 .v_w8:
   3505    mova                 m6, [spel_v_perm8]
   3506    movq                xm1, [srcq+strideq*0]
   3507    mov                 r6d, 0x3e
   3508    movq                xm2, [srcq+strideq*1]
   3509    vpbroadcastq        ym3, [srcq+strideq*2]
   3510    kmovb                k1, r6d
   3511    vpbroadcastq        ym4, [srcq+stride3q ]
   3512    lea                srcq, [srcq+strideq*4]
   3513    vpunpcklqdq      m1{k1}, m3, [srcq+strideq*0] {1to8}
   3514    vpunpcklqdq      m2{k1}, m4, [srcq+strideq*1] {1to8}
   3515    movq                xm0, [srcq+strideq*2]
   3516    kshiftlb             k2, k1, 2
   3517    shufpd               m1, m2, 0x30      ; 0 1   2 3   4 5
   3518    vshufi32x4           m2, m1, m0, q0021 ; 2 3   4 5   6 _
   3519    vpermb               m1, m6, m1        ; 01 12 23 34
   3520    vpermb               m2, m6, m2        ; 23 34 45 56
   3521 .v_w8_loop:
   3522    vpbroadcastq        ym3, [srcq+strideq*4]
   3523    vpunpcklqdq     ym0{k1}, ym3, [srcq+stride3q] {1to4}
   3524    lea                srcq, [srcq+strideq*4]
   3525    vpbroadcastq         m3, [srcq+strideq*2]
   3526    vpunpcklqdq      m0{k2}, m3, [srcq+strideq*1] {1to8}
   3527    pmaddubsw            m4, m1, m8        ; a0 b0 c0 d0
   3528    mova                 m1, m2
   3529    pmaddubsw            m5, m2, m9        ; a1 b1 c1 d1
   3530    vpermb               m2, m6, m0        ; 67 78 89 9a
   3531    mova                xm0, xm3
   3532    vshufi32x4           m1, m2, q1032     ; 45 56 67 78
   3533    pmaddubsw            m3, m2, m11       ; a3 b3 c3 d3
   3534    paddw                m4, m5
   3535    pmaddubsw            m5, m1, m10       ; a2 b2 c2 d2
   3536    paddw                m4, m3
   3537    paddw                m4, m5
   3538    pmulhrsw             m4, m7
   3539    mova             [tmpq], m4
   3540    add                tmpq, 64
   3541    sub                  hd, 4
   3542    jg .v_w8_loop
   3543    RET
   3544 .v_w16:
; Vertical 8-tap filter, width 16, 4 output rows per iteration.
; Each source row is broadcast as a 128-bit lane; masked vshufpd with
; k1 = 0x0f merges three rows into one zmm, and the spel_v_perm16b byte
; permute interleaves them into adjacent-row byte pairs ("01 12" etc.)
; ready for pmaddubsw. A sliding window of row-pairs (m1/m2/m3) is
; rolled forward each iteration.
   3545    mova                m12, [spel_v_perm16b]
   3546    vbroadcasti32x4      m1, [srcq+strideq*0]
   3547    mov                 r6d, 0x0f
   3548    vbroadcasti32x4     ym4, [srcq+strideq*1]
   3549    vbroadcasti32x4      m2, [srcq+strideq*2]
   3550    kmovb                k1, r6d
   3551    vbroadcasti32x4     ym5, [srcq+stride3q ]
   3552    lea                srcq, [srcq+strideq*4]
   3553    vbroadcasti32x4      m3, [srcq+strideq*0]
   3554    vbroadcasti32x4     ym6, [srcq+strideq*1]
   3555    vbroadcasti32x4      m0, [srcq+strideq*2]
   3556    vshufpd          m1{k1}, m4, m2, 0xcc
   3557    vshufpd          m2{k1}, m5, m3, 0xcc
   3558    vshufpd          m3{k1}, m6, m0, 0xcc
   3559    vpermb               m1, m12, m1 ; 01 12
   3560    vpermb               m2, m12, m2 ; 23 34
   3561    vpermb               m3, m12, m3 ; 45 56
   3562 .v_w16_loop:
; Taps 0-3 live in m8..m11 (set up outside this view); four new source
; rows are loaded, merged and permuted the same way as in the preamble,
; and the accumulated word sums are rounded via pmulhrsw with m7.
   3563    pmaddubsw            m4, m1, m8  ; a0 b0
   3564    mova                 m1, m3
   3565    pmaddubsw           m13, m2, m9  ; a1 b1
   3566    vbroadcasti32x4     ym6, [srcq+stride3q ]
   3567    pmaddubsw            m5, m2, m8  ; c0 d0
   3568    lea                srcq, [srcq+strideq*4]
   3569    pmaddubsw           m14, m3, m9  ; c1 d1
   3570    vbroadcasti32x4      m3, [srcq+strideq*0]
   3571    vshufpd          m0{k1}, m6, m3, 0xcc
   3572    vbroadcasti32x4     ym6, [srcq+strideq*1]
   3573    vpermb               m2, m12, m0 ; 67 78
   3574    vbroadcasti32x4      m0, [srcq+strideq*2]
   3575    vshufpd          m3{k1}, m6, m0, 0xcc
   3576    paddw                m4, m13
   3577    pmaddubsw           m13, m1, m10 ; a2 b2
   3578    vpermb               m3, m12, m3 ; 89 9a
   3579    paddw                m5, m14
   3580    pmaddubsw           m14, m2, m10 ; c2 d2
   3581    pmaddubsw           m15, m2, m11 ; a3 b3
   3582    pmaddubsw            m6, m3, m11 ; c3 d3
   3583    paddw                m4, m13
   3584    paddw                m5, m14
   3585    paddw                m4, m15
   3586    paddw                m5, m6
   3587    pmulhrsw             m4, m7
   3588    pmulhrsw             m5, m7
; 4 rows * 16 pixels * 2 bytes = 128 bytes of intermediate output.
   3589    mova          [tmpq+ 0], m4
   3590    mova          [tmpq+64], m5
   3591    add                tmpq, 64*2
   3592    sub                  hd, 4
   3593    jg .v_w16_loop
   3594    RET
   3595 .v_w32:
; Vertical 8-tap filter, width 32, 2 output rows per iteration.
; movshdup of the bilin_v_perm64 table builds a qword permute (m21) that
; stacks two 32-byte rows into one zmm (comments "0 3", "1 4", ... give
; which source rows share a register). punpck{l,h}bw then forms the
; adjacent-row byte pairs 01/12/.../56 consumed by pmaddubsw.
   3596    movshdup            m21, [bilin_v_perm64]
   3597    movu               ym16, [srcq+strideq*0]
   3598    movu               ym17, [srcq+strideq*1]
   3599    movu               ym18, [srcq+strideq*2]
   3600    add                srcq, stride3q
   3601    movu               ym19, [srcq+strideq*0]
   3602    vpermt2q            m16, m21, m19  ; 0 3
   3603    movu               ym20, [srcq+strideq*1]
   3604    vpermt2q            m17, m21, m20  ; 1 4
   3605    movu               ym20, [srcq+strideq*2]
   3606    add                srcq, stride3q
   3607    vpermt2q            m18, m21, m20  ; 2 5
   3608    movu               ym20, [srcq+strideq*0]
   3609    vpermt2q            m19, m21, m20  ; 3 6
   3610    punpcklbw            m0, m16, m17  ; 01
   3611    punpcklbw            m1, m17, m18  ; 12
   3612    punpcklbw            m2, m18, m19  ; 23
   3613    punpckhbw            m3, m16, m17  ; 34
   3614    punpckhbw            m4, m17, m18  ; 45
   3615    punpckhbw            m5, m18, m19  ; 56
   3616 .v_w32_loop:
; m19 carries row 6 (the newest fully-consumed row) across iterations;
; shufpd reconstitutes the "6 7" pair from it and the freshly permuted
; "7 8" register before the byte-pair interleaves.
   3617    movu               ym16, [srcq+strideq*1]
   3618    lea                srcq, [srcq+strideq*2]
   3619    movu               ym17, [srcq+strideq*0]
   3620    pmaddubsw           m14, m0, m8
   3621    mova                 m0, m2
   3622    pmaddubsw           m15, m1, m8
   3623    mova                 m1, m3
   3624    pmaddubsw            m2, m9
   3625    vpermt2q            m16, m21, m17  ; 7 8
   3626    pmaddubsw            m3, m9
   3627    pmaddubsw           m12, m4, m10
   3628    pmaddubsw           m13, m5, m10
   3629    shufpd              m19, m16, 0x55 ; 6 7
   3630    paddw               m14, m2
   3631    mova                 m2, m4
   3632    punpcklbw            m4, m19, m16  ; 67
   3633    paddw               m15, m3
   3634    mova                 m3, m5
   3635    punpckhbw            m5, m19, m16  ; 78
   3636    paddw               m14, m12
   3637    paddw               m15, m13
   3638    pmaddubsw           m12, m4, m11
   3639    pmaddubsw           m13, m5, m11
   3640    mova                m19, m16
   3641    paddw               m14, m12
   3642    paddw               m15, m13
   3643    pmulhrsw            m14, m7
   3644    pmulhrsw            m15, m7
   3645    mova          [tmpq+ 0], m14
   3646    mova          [tmpq+64], m15
   3647    add                tmpq, 64*2
   3648    sub                  hd, 2
   3649    jg .v_w32_loop
; Clear upper ymm state before returning to non-AVX-512 code.
   3650    vzeroupper
   3651    RET
   3652 .v_w64:
   3653 .v_w128:
; Vertical 8-tap filter for widths 64/128, processed as 64-pixel column
; strips. wd is doubled because the intermediate buffer holds 16-bit
; values; r6d packs the strip counter in its upper bits and the row
; count h in its low byte (restored via movzx hd, r6b after each strip).
; WIN64_SPILL_XMM saves the callee-saved xmm regs required by the
; Windows x64 ABI since up to m23 is used here.
   3654    WIN64_SPILL_XMM      24
   3655    mova                m23, [bilin_v_perm64]
   3656    add                  wd, wd
   3657    lea                 r6d, [hq+wq]
   3658 .v_loop0:
; Load rows 0-6 of the current strip; bilin_v_perm64 (m23) rearranges
; each 64-byte row into the lane order the byte-pair interleaves expect.
; The low halves (m0..m5) and high halves (m12..m17) of the row pairs
; are filtered as two independent accumulator chains.
   3659    vpermq              m12, m23, [srcq+strideq*0]
   3660    vpermq              m13, m23, [srcq+strideq*1]
   3661    lea                  r5, [srcq+strideq*2]
   3662    vpermq              m14, m23, [r5  +strideq*0]
   3663    vpermq              m15, m23, [r5  +strideq*1]
   3664    lea                  r5, [r5+strideq*2]
   3665    vpermq              m16, m23, [r5  +strideq*0]
   3666    vpermq              m17, m23, [r5  +strideq*1]
   3667    lea                  r5, [r5+strideq*2]
   3668    vpermq              m18, m23, [r5  +strideq*0]
   3669    mov                  r7, tmpq
   3670    punpcklbw            m0, m12, m13 ; 01
   3671    punpckhbw           m12, m13
   3672    punpcklbw            m1, m13, m14 ; 12
   3673    punpckhbw           m13, m14
   3674    punpcklbw            m2, m14, m15 ; 23
   3675    punpckhbw           m14, m15
   3676    punpcklbw            m3, m15, m16 ; 34
   3677    punpckhbw           m15, m16
   3678    punpcklbw            m4, m16, m17 ; 45
   3679    punpckhbw           m16, m17
   3680    punpcklbw            m5, m17, m18 ; 56
   3681    punpckhbw           m17, m18
   3682 .v_loop:
; 2 output rows (a and b) per iteration; the sliding window of row
; pairs is rolled forward as each tap product is consumed.
   3683    pmaddubsw           m19, m0, m8   ; a0
   3684    vpermq               m6, m23, [r5+strideq*1]
   3685    pmaddubsw           m20, m12, m8
   3686    mova                 m0, m2
   3687    pmaddubsw            m2, m9       ; a1
   3688    mova                m12, m14
   3689    pmaddubsw           m14, m9
   3690    lea                  r5, [r5+strideq*2]
   3691    pmaddubsw           m21, m1, m8   ; b0
   3692    pmaddubsw           m22, m13, m8
   3693    mova                 m1, m3
   3694    pmaddubsw            m3, m9       ; b1
   3695    mova                m13, m15
   3696    pmaddubsw           m15, m9
   3697    paddw               m19, m2
   3698    mova                 m2, m4
   3699    pmaddubsw            m4, m10      ; a2
   3700    paddw               m20, m14
   3701    mova                m14, m16
   3702    pmaddubsw           m16, m10
   3703    paddw               m21, m3
   3704    mova                 m3, m5
   3705    pmaddubsw            m5, m10      ; b2
   3706    paddw               m22, m15
   3707    mova                m15, m17
   3708    pmaddubsw           m17, m10
   3709    paddw               m19, m4
   3710    punpcklbw            m4, m18, m6  ; 67
   3711    paddw               m20, m16
   3712    punpckhbw           m16, m18, m6
   3713    vpermq              m18, m23, [r5+strideq*0]
   3714    paddw               m21, m5
   3715    pmaddubsw            m5, m4, m11  ; a3
   3716    paddw               m22, m17
   3717    pmaddubsw           m17, m16, m11
   3718    paddw               m19, m5
   3719    punpcklbw            m5, m6, m18  ; 78
   3720    paddw               m20, m17
   3721    punpckhbw           m17, m6, m18
   3722    pmaddubsw            m6, m5, m11  ; b3
   3723    paddw               m21, m6
   3724    pmaddubsw            m6, m17, m11
   3725    paddw               m22, m6
   3726    REPX   {pmulhrsw x, m7}, m19, m20, m21, m22
; tmp rows are wq bytes apart (wq was doubled above for 16-bit output).
   3727    mova       [r7+wq*0+ 0], m19
   3728    mova       [r7+wq*0+64], m20
   3729    mova       [r7+wq*1+ 0], m21
   3730    mova       [r7+wq*1+64], m22
   3731    lea                  r7, [r7+wq*2]
   3732    sub                  hd, 2
   3733    jg .v_loop
; Advance to the next 64-pixel strip; the packed counter's high byte
; hits zero when all strips are done.
   3734    add                srcq, 64
   3735    add                tmpq, 128
   3736    movzx                hd, r6b
   3737    sub                 r6d, 1<<8
   3738    jg .v_loop0
   3739    RET
   3740 .h:
; Horizontal filter entry. If the vertical subpel field of my is set,
; fall through to the combined horizontal+vertical path instead.
   3741    RESET_STACK_STATE
   3742    test                myd, 0xf00
   3743    jnz .hv
   3744 .h2:
; Horizontal-only: dispatch by width through the per-width jump table
; (r7 holds the table base, set up before this view). src is rebased by
; -3 so the 8-tap window is centered on the output pixel; mx>>16 indexes
; the 8-tap filter pair loaded into m8/m9. pd_2 in m4 is the VNNI
; rounding bias added before the >>2 in each width path.
   3745    vpbroadcastd         m4, [pd_2]
   3746    cmp                  wd, 4
   3747    je .h_w4
   3748    tzcnt                wd, wd
   3749    shr                 mxd, 16
   3750    sub                srcq, 3
   3751    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
   3752    vpbroadcastd         m8, [base+subpel_filters+mxq*8+0]
   3753    vpbroadcastd         m9, [base+subpel_filters+mxq*8+4]
   3754    add                  wq, r7
   3755    jmp                  wq
   3756 .h_w4:
; Width 4 uses the 4-tap mid portion of the filter (+2 offset into the
; subpel_filters entry) and only needs src-1. Mask k1 = 0x4 lets the
; masked vpbroadcastq drop rows 2/3 into the upper ymm lane, so 4 rows
; are filtered per iteration with two vpdpbusd (VNNI) dot products.
   3757    movzx               mxd, mxb
   3758    vbroadcasti128      ym5, [subpel_h_shufA]
   3759    mov                 r3d, 0x4
   3760    dec                srcq
   3761    vpbroadcastd        ym6, [base+subpel_filters+mxq*8+2]
   3762    kmovb                k1, r3d
   3763    lea            stride3q, [strideq*3]
   3764 .h_w4_loop:
   3765    movq                xm2, [srcq+strideq*0]
   3766    movq                xm3, [srcq+strideq*1]
   3767    vpbroadcastq    ym2{k1}, [srcq+strideq*2]
   3768    vpbroadcastq    ym3{k1}, [srcq+stride3q ]
   3769    lea                srcq, [srcq+strideq*4]
   3770    pshufb              ym2, ym5
   3771    pshufb              ym3, ym5
; ym4 = pd_2 rounding bias; accumulate u8*s8 dot products, then pack
; and shift down to the intermediate precision.
   3772    mova                ym0, ym4
   3773    vpdpbusd            ym0, ym2, ym6
   3774    mova                ym1, ym4
   3775    vpdpbusd            ym1, ym3, ym6
   3776    packssdw            ym0, ym1
   3777    psraw               ym0, 2
   3778    mova             [tmpq], ym0
   3779    add                tmpq, 32
   3780    sub                  hd, 4
   3781    jg .h_w4_loop
   3782    RET
   3783 .h_w8:
; Width 8, 4 rows per iteration (one 128-bit lane per row). The three
; subpel_h_shuf tables produce the overlapping 4-byte source windows
; (0123 / 4567 / 89ab) so that two vpdpbusd per accumulator cover all
; 8 taps (m8 = taps 0-3, m9 = taps 4-7).
   3784    vbroadcasti128       m5, [subpel_h_shufA]
   3785    vbroadcasti128       m6, [subpel_h_shufB]
   3786    vbroadcasti128       m7, [subpel_h_shufC]
   3787    lea            stride3q, [strideq*3]
   3788 .h_w8_loop:
   3789    movu               xmm3, [srcq+strideq*0]
   3790    vinserti128         ym3, ymm3, [srcq+strideq*1], 1
   3791    vinserti128          m3, [srcq+strideq*2], 2
   3792    vinserti128          m3, [srcq+stride3q ], 3
   3793    lea                srcq, [srcq+strideq*4]
   3794    pshufb               m1, m3, m5
   3795    pshufb               m2, m3, m6
; m4 = pd_2 rounding bias.
   3796    mova                 m0, m4
   3797    vpdpbusd             m0, m1, m8
   3798    mova                 m1, m4
   3799    vpdpbusd             m1, m2, m8
   3800    pshufb               m3, m7
   3801    vpdpbusd             m0, m2, m9
   3802    vpdpbusd             m1, m3, m9
   3803    packssdw             m0, m1
   3804    psraw                m0, 2
   3805    mova             [tmpq], m0
   3806    add                tmpq, 64
   3807    sub                  hd, 4
   3808    jg .h_w8_loop
   3809    RET
   3810 .h_w16:
; Width 16, 4 rows per iteration. spel_h_perm16 plus two +pb_4 offset
; copies (m5/m6/m7) give the three staggered byte-window permutes that
; the shared PREP_8TAP_H macro (defined earlier in the file) consumes.
   3811    mova                 m5, [spel_h_perm16]
   3812    vpbroadcastd         m7, [pb_4]
   3813    lea            stride3q, [strideq*3]
   3814    paddb                m6, m7, m5
   3815    paddb                m7, m6
   3816 .h_w16_loop:
   3817    movu                ym0, [srcq+strideq*0]
   3818    movu                ym1, [srcq+strideq*2]
   3819    vinserti32x8         m0, [srcq+strideq*1], 1
   3820    vinserti32x8         m1, [srcq+stride3q ], 1
   3821    lea                srcq, [srcq+strideq*4]
   3822    PREP_8TAP_H
   3823    add                tmpq, 64*2
   3824    sub                  hd, 4
   3825    jg .h_w16_loop
   3826    RET
   3827 .h_w32:
; Width 32, 2 rows per iteration; same staggered-permute setup as
; .h_w16 but with the 32-pixel permute table, one full zmm per row.
   3828    mova                 m5, [spel_h_perm32]
   3829    vpbroadcastd         m7, [pb_4]
   3830    paddb                m6, m7, m5
   3831    paddb                m7, m6
   3832 .h_w32_loop:
   3833    movu                 m0, [srcq+strideq*0]
   3834    movu                 m1, [srcq+strideq*1]
   3835    lea                srcq, [srcq+strideq*2]
   3836    PREP_8TAP_H
   3837    add                tmpq, 64*2
   3838    sub                  hd, 2
   3839    jg .h_w32_loop
   3840    RET
   3841 .h_w64:
; Widths 64/128 share one row loop that walks 64 pixels at a time using
; a negative offset r5 counting up to 0: r6 = 0 for w64 (one iteration)
; and -64 for w128 (two iterations). src is pre-biased by -r6 so
; [srcq+r5] always addresses the current 64-byte chunk of the row.
   3842    xor                 r6d, r6d
   3843    jmp .h_start
   3844 .h_w128:
   3845    mov                  r6, -64*1
   3846 .h_start:
   3847    mova                 m5, [spel_h_perm32]
   3848    vpbroadcastd         m7, [pb_4]
   3849    sub                srcq, r6
   3850    paddb                m6, m7, m5
   3851    paddb                m7, m6
   3852 .h_loop0:
   3853    mov                  r5, r6
   3854 .h_loop:
   3855    movu                 m0, [srcq+r5+32*0]
   3856    movu                 m1, [srcq+r5+32*1]
   3857    PREP_8TAP_H
   3858    add                tmpq, 64*2
   3859    add                  r5, 64
   3860    jle .h_loop
   3861    add                srcq, strideq
   3862    dec                  hd
   3863    jg .h_loop0
   3864    RET
   3865 .hv:
; Combined horizontal+vertical 8-tap path. pd_2 (m8) is the rounding
; bias for the horizontal VNNI pass, pd_32 (m9) for the vertical pass
; (final >>6). Widths <= 4 are handled inline below; wider goes to
; .hv_w8. For h <= 4 the mid filter index (mxd = myb) replaces my's
; high half — presumably selecting the reduced-tap vertical filter for
; short blocks (same cmove pattern as .hv_w8).
   3866    RESET_STACK_STATE
   3867    vpbroadcastd         m8, [pd_2]
   3868    vpbroadcastd         m9, [pd_32]
   3869    cmp                  wd, 4
   3870    jg .hv_w8
   3871    movzx               mxd, mxb
   3872    dec                srcq
   3873    vpbroadcastd        m11, [base+subpel_filters+mxq*8+2]
   3874    movzx               mxd, myb
   3875    shr                 myd, 16
   3876    cmp                  hd, 4
   3877    cmove               myd, mxd
   3878    vpbroadcastq         m0, [base+subpel_filters+myq*8]
   3879    lea            stride3q, [strideq*3]
   3880    sub                srcq, stride3q
; k1/k2/k3 = 0x04 shifted by 0/2/4: lane-select masks used to scatter
; successive rows into the 128-bit lanes of ym/zmm via masked
; vpbroadcastq.
   3881    mov                 r3d, 0x04
   3882    kmovb                k1, r3d
   3883    kshiftlb             k2, k1, 2
   3884    kshiftlb             k3, k1, 4
   3885    vbroadcasti128      m10, [subpel_h_shufA]
; Widen the packed 8-bit vertical filter taps to words, then split the
; four tap-pairs into m12..m15 for the vpdpwssd vertical accumulation.
   3886    punpcklbw            m0, m0
   3887    psraw                m0, 8 ; sign-extend
   3888    pshufd              m12, m0, q0000
   3889    pshufd              m13, m0, q1111
   3890    pshufd              m14, m0, q2222
   3891    pshufd              m15, m0, q3333
; Gather the 7 setup rows (0-6), run the horizontal pass on all of
; them at once, and permute the packed results into the sliding
; row-pair layout (01 12 23 34 / 23 34 45 56) for the vertical loop.
   3892    movq                xm3, [srcq+strideq*0]
   3893    vpbroadcastq        ym2, [srcq+strideq*1]
   3894    vpbroadcastq    ym3{k1}, [srcq+strideq*2]
   3895    vpbroadcastq     m2{k2}, [srcq+stride3q ]
   3896    lea                srcq, [srcq+strideq*4]
   3897    vpbroadcastq     m3{k2}, [srcq+strideq*0]
   3898    vpbroadcastq     m2{k3}, [srcq+strideq*1]
   3899    vpbroadcastq     m3{k3}, [srcq+strideq*2]
   3900    mova                 m6, [spel_hv_perm4a]
   3901    movu                 m7, [spel_hv_perm4b]
   3902    mova                 m0, m8
   3903    mova                 m1, m8
   3904    pshufb               m2, m10
   3905    pshufb               m3, m10
   3906    vpdpbusd             m0, m2, m11
   3907    vpdpbusd             m1, m3, m11
   3908    packssdw             m0, m1        ; _ 0  1 2  3 4  5 6
   3909    psraw                m0, 2
   3910    vpermb               m1, m6, m0    ; 01 12 23 34
   3911    vpermb               m2, m7, m0    ; 23 34 45 56
   3912 .hv_w4_loop:
; 4 output rows per iteration: horizontally filter 4 new rows (7-a),
; splice them onto the retained rows via vshufi32x4, and accumulate the
; four vertical tap-pairs into m5 (biased by pd_32, final >>6,
; narrowed to words with vpmovdw).
   3913    movq                xm3, [srcq+stride3q ]
   3914    lea                srcq, [srcq+strideq*4]
   3915    movq                xm4, [srcq+strideq*0]
   3916    vpbroadcastq    ym3{k1}, [srcq+strideq*1]
   3917    vpbroadcastq    ym4{k1}, [srcq+strideq*2]
   3918    mova                 m5, m9
   3919    pshufb              ym3, ym10
   3920    vpdpwssd             m5, m1, m12   ; a0 b0 c0 d0
   3921    mova                ym1, ym8
   3922    pshufb              ym4, ym10
   3923    vpdpbusd            ym1, ym3, ym11
   3924    mova                ym3, ym8
   3925    vpdpbusd            ym3, ym4, ym11
   3926    vpdpwssd             m5, m2, m13   ; a1 b1 c1 d1
   3927    packssdw            ym1, ym3       ; 7 8  9 a
   3928    psraw               ym1, 2
   3929    vshufi32x4           m0, m1, q1032 ; _ 4  5 6  7 8  9 a
   3930    vpermb               m1, m6, m0    ; 45 56 67 78
   3931    vpermb               m2, m7, m0    ; 67 78 89 9a
   3932    vpdpwssd             m5, m1, m14   ; a2 b2 c2 d2
   3933    vpdpwssd             m5, m2, m15   ; a3 b3 c3 d3
   3934    psrad                m5, 6
   3935    vpmovdw          [tmpq], m5
   3936    add                tmpq, 32
   3937    sub                  hd, 4
   3938    jg .hv_w4_loop
   3939    RET
   3940 .hv_w8:
; HV path for w >= 8. Loads the full 8-tap horizontal filter (m10/m11)
; and the word-widened vertical tap-pairs (m12..m15); h <= 4 again
; swaps in the alternate vertical filter via cmove. w > 8 branches to
; the strip-based .hv_w16 loop.
   3941    shr                 mxd, 16
   3942    sub                srcq, 3
   3943    vpbroadcastd        m10, [base+subpel_filters+mxq*8+0]
   3944    vpbroadcastd        m11, [base+subpel_filters+mxq*8+4]
   3945    movzx               mxd, myb
   3946    shr                 myd, 16
   3947    cmp                  hd, 4
   3948    cmove               myd, mxd
   3949    vpbroadcastq         m0, [base+subpel_filters+myq*8]
   3950    lea            stride3q, [strideq*3]
   3951    sub                srcq, stride3q
   3952    punpcklbw            m0, m0
   3953    psraw                m0, 8 ; sign-extend
   3954    pshufd              m12, m0, q0000
   3955    pshufd              m13, m0, q1111
   3956    pshufd              m14, m0, q2222
   3957    pshufd              m15, m0, q3333
   3958    cmp                  wd, 8
   3959    jg .hv_w16
; w == 8 setup: rows 0-3 in m16, rows 3-6 in m17 (row 3 shared), one
; 128-bit lane per row. The three subpel_h_shuf windows feed four
; vpdpbusd per register pair; after pack/shift, vshufi32x4 +
; punpck{l,h}wd build the interleaved row-pair words 01 12 23 34 and
; 23 34 45 56 for the vertical vpdpwssd chain.
   3960    vbroadcasti32x4     m17, [srcq+stride3q ]
   3961    vinserti32x4        m16, m17, [srcq+strideq*0], 0
   3962    vbroadcasti32x4     m19, [subpel_h_shufA]
   3963    vinserti32x4        m16, [srcq+strideq*1], 1
   3964    vbroadcasti32x4     m21, [subpel_h_shufC]
   3965    vinserti32x4        m16, [srcq+strideq*2], 2
   3966    lea                srcq, [srcq+strideq*4]
   3967    vinserti128        ym17, [srcq+strideq*0], 1
   3968    vbroadcasti32x4     m20, [subpel_h_shufB]
   3969    vinserti32x4        m17, [srcq+strideq*1], 2
   3970    vinserti32x4        m17, [srcq+strideq*2], 3
   3971    pshufb               m3, m16, m19      ; 0 1 2 3   0123
   3972    mova                 m2, m8
   3973    pshufb               m0, m16, m21      ; 0 1 2 3   89ab
   3974    vpdpbusd             m2, m3, m10
   3975    mova                 m3, m8
   3976    pshufb               m1, m17, m19      ; 3 4 5 6   0123
   3977    vpdpbusd             m3, m0, m11
   3978    mova                 m0, m8
   3979    pshufb               m4, m17, m21      ; 3 4 5 6   89ab
   3980    vpdpbusd             m0, m1, m10
   3981    mova                 m1, m8
   3982    pshufb              m16, m20           ; 0 1 2 3   4567
   3983    vpdpbusd             m1, m4, m11
   3984    pshufb              m17, m20           ; 3 4 5 6   4567
   3985    vpdpbusd             m2, m16, m11
   3986    vpdpbusd             m3, m16, m10
   3987    vpdpbusd             m0, m17, m11
   3988    vpdpbusd             m1, m17, m10
   3989    packssdw             m2, m3
   3990    packssdw             m0, m1
   3991    psraw                m2, 2             ; 0 1 2 3
   3992    psraw                m0, 2             ; 3 4 5 6
   3993    vshufi32x4           m4, m2, m0, q2132 ; 2 3 4 5
   3994    vshufi32x4           m5, m2, m0, q1021 ; 1 2 3 4
   3995    punpcklwd            m3, m4, m0        ; 23 34 45 56
   3996    punpckhwd            m4, m0
   3997    punpcklwd            m1, m2, m5        ; 01 12 23 34
   3998    punpckhwd            m2, m5
   3999 .hv_w8_loop:
; 4 output rows per iteration: horizontally filter rows 7-a, align them
; against the retained row 6 (valignq) to form the 67 78 89 9a pairs,
; and finish the vertical pass with pd_32 bias and >>6.
   4000    movu               xm18, [srcq+stride3q ]
   4001    lea                srcq, [srcq+strideq*4]
   4002    vinserti128        ym18, [srcq+strideq*0], 1
   4003    vinserti32x4        m18, [srcq+strideq*1], 2
   4004    vinserti32x4        m18, [srcq+strideq*2], 3
   4005    pshufb              m17, m18, m19      ; 7 8 9 a   0123
   4006    mova                m16, m8
   4007    pshufb               m5, m18, m21      ; 7 8 9 a   89ab
   4008    vpdpbusd            m16, m17, m10
   4009    mova                m17, m8
   4010    pshufb              m18, m20           ; 7 8 9 a   4567
   4011    vpdpbusd            m17, m5, m11
   4012    mova                 m5, m9
   4013    vpdpwssd             m5, m3, m13       ; a1 b1 c1 d1
   4014    mova                 m6, m9
   4015    vpdpwssd             m6, m4, m13
   4016    vpdpbusd            m16, m18, m11
   4017    vpdpbusd            m17, m18, m10
   4018    vpdpwssd             m5, m1, m12       ; a0 b0 c0 d0
   4019    mova                 m1, m3
   4020    vpdpwssd             m6, m2, m12
   4021    mova                 m2, m4
   4022    packssdw            m16, m17
   4023    psraw               m16, 2             ; 7 8 9 a
   4024    valignq              m4, m16, m0, 6    ; 6 7 8 9
   4025    mova                 m0, m16
   4026    punpcklwd            m3, m4, m16       ; 67 78 89 9a
   4027    punpckhwd            m4, m16
   4028    vpdpwssd             m5, m3, m15       ; a3 b3 c3 d3
   4029    vpdpwssd             m6, m4, m15
   4030    vshufi32x4           m1, m3, q1032     ; 45 56 67 78
   4031    vshufi32x4           m2, m4, q1032
   4032    vpdpwssd             m5, m1, m14       ; a2 b2 c2 d2
   4033    vpdpwssd             m6, m2, m14
   4034    psrad                m5, 6
   4035    psrad                m6, 6
   4036    packssdw             m5, m6
   4037    mova             [tmpq], m5
   4038    add                tmpq, 64
   4039    sub                  hd, 4
   4040    jg .hv_w8_loop
   4041    vzeroupper
   4042    RET
   4043 .hv_w16:
; HV path for w >= 16, processed as 16-pixel column strips, 2 output
; rows per iteration. wd is doubled (16-bit output); r6d packs the
; strip count and h the same way as the wide vertical paths. m16/m17/
; m18 are the three staggered spel_h_perm16 byte-window permutes.
; WIN64_SPILL_XMM saves callee-saved xmm regs (Windows x64 ABI) since
; registers up to m22 are used.
   4044    WIN64_SPILL_XMM      23
   4045    mova                m16, [spel_h_perm16]
   4046    vpbroadcastd        m18, [pb_4]
   4047    add                  wd, wd
   4048    paddb               m17, m18, m16
   4049    lea                 r6d, [hq+wq*8-256]
   4050    paddb               m18, m17
   4051 .hv_w16_loop0:
; Per-strip setup: horizontally filter rows 0-6 (two rows per zmm,
; row 6 alone in a ymm), then interleave into the word row-pairs
; 01 12 / 23 34 / 45 56 (low halves m1/m3/m5, high halves m2/m4/m6)
; for the vertical vpdpwssd chain.
   4052    movu               ym19, [srcq+strideq*0]
   4053    vinserti32x8        m19, [srcq+strideq*1], 1
   4054    lea                  r5, [srcq+strideq*2]
   4055    movu               ym20, [r5  +strideq*0]
   4056    vinserti32x8        m20, [r5  +strideq*1], 1
   4057    lea                  r5, [r5  +strideq*2]
   4058    movu               ym21, [r5  +strideq*0]
   4059    vinserti32x8        m21, [r5  +strideq*1], 1
   4060    lea                  r5, [r5  +strideq*2]
   4061    movu               ym22, [r5  +strideq*0]
   4062    mov                  r7, tmpq
   4063    vpermb               m3, m16, m19      ; 0 1   0123   89ab
   4064    mova                 m2, m8
   4065    vpermb               m4, m18, m19      ; 0 1   89ab   ghij
   4066    vpdpbusd             m2, m3, m10
   4067    mova                 m3, m8
   4068    vpermb               m5, m16, m20      ; 2 3   0123   89ab
   4069    vpdpbusd             m3, m4, m11
   4070    mova                 m4, m8
   4071    vpermb               m6, m18, m20      ; 2 3   89ab   ghij
   4072    vpdpbusd             m4, m5, m10
   4073    mova                 m5, m8
   4074    vpermb               m7, m16, m21      ; 4 5   0123   89ab
   4075    vpdpbusd             m5, m6, m11
   4076    mova                 m6, m8
   4077    vpermb               m0, m18, m21      ; 4 5   89ab   ghij
   4078    vpdpbusd             m6, m7, m10
   4079    mova                 m7, m8
   4080    vpermb              ym1, ym16, ym22    ; 6     0123   89ab
   4081    vpdpbusd             m7, m0, m11
   4082    mova                ym0, ym8
   4083    vpermb              m19, m17, m19      ; 0 1   4567   cdef
   4084    vpdpbusd            ym0, ym1, ym10
   4085    vpermb              ym1, ym18, ym22    ; 6     89ab   ghij
   4086    vpdpbusd             m2, m19, m11
   4087    vpdpbusd             m3, m19, m10
   4088    mova               ym19, ym8
   4089    vpermb              m20, m17, m20      ; 2 3   4567   cdef
   4090    vpdpbusd           ym19, ym1, ym11
   4091    vpermb              m21, m17, m21      ; 4 5   4567   cdef
   4092    vpdpbusd             m4, m20, m11
   4093    vpdpbusd             m5, m20, m10
   4094    vpermb             ym22, ym17, ym22    ; 6     4567   cdef
   4095    vpdpbusd             m6, m21, m11
   4096    vpdpbusd             m7, m21, m10
   4097    packssdw             m2, m3            ; 0 1
   4098    vpdpbusd            ym0, ym22, ym11
   4099    packssdw             m4, m5            ; 2 3
   4100    vpdpbusd           ym19, ym22, ym10
   4101    packssdw             m6, m7            ; 4 5
   4102    packssdw            ym0, ym19          ; 6
   4103    REPX       {psraw x, 2}, m2, m4, m6, ym0
   4104    vshufi32x4           m3, m2, m4, q1032 ; 1 2
   4105    vshufi32x4           m5, m4, m6, q1032 ; 3 4
   4106    vshufi32x4           m0, m6, m0, q1032 ; 5 6
   4107    punpcklwd            m1, m2, m3  ; 01 12
   4108    punpckhwd            m2, m3
   4109    punpcklwd            m3, m4, m5  ; 23 34
   4110    punpckhwd            m4, m5
   4111    punpcklwd            m5, m6, m0  ; 45 56
   4112    punpckhwd            m6, m0
   4113 .hv_w16_loop:
; Horizontally filter rows 7-8, roll the row-pair window forward, and
; finish the vertical pass (pd_32 bias in m9, >>6); the two output rows
; go to consecutive tmp rows wq bytes apart.
   4114    movu               ym19, [r5+strideq*1]
   4115    lea                  r5, [r5+strideq*2]
   4116    vinserti32x8        m19, [r5+strideq*0], 1
   4117    mova                m20, m9
   4118    vpdpwssd            m20, m1, m12 ; a0
   4119    vpermb               m1, m16, m19
   4120    mova                m21, m9
   4121    vpdpwssd            m21, m2, m12 ; b0
   4122    vpermb               m2, m17, m19
   4123    mova                m22, m8
   4124    vpdpbusd            m22, m1, m10
   4125    mova                 m1, m8
   4126    vpermb              m19, m18, m19
   4127    vpdpbusd             m1, m2, m10
   4128    vpdpwssd            m20, m3, m13 ; a1
   4129    vpdpwssd            m21, m4, m13 ; b1
   4130    vpdpbusd            m22, m2, m11
   4131    mova                 m2, m4
   4132    vpdpbusd             m1, m19, m11
   4133    mova                 m4, m6
   4134    vpdpwssd            m20, m5, m14 ; a2
   4135    vpdpwssd            m21, m6, m14 ; b2
   4136    packssdw            m22, m1
   4137    mova                 m1, m3
   4138    psraw               m22, 2              ; 7 8
   4139    mova                 m3, m5
   4140    vshufi32x4           m6, m0, m22, q1032 ; 6 7
   4141    mova                 m0, m22
   4142    punpcklwd            m5, m6, m0  ; 67 78
   4143    punpckhwd            m6, m0
   4144    vpdpwssd            m20, m5, m15 ; a3
   4145    vpdpwssd            m21, m6, m15 ; b3
   4146    psrad               m20, 6
   4147    psrad               m21, 6
   4148    packssdw            m20, m21
   4149    mova          [r7+wq*0], ym20
   4150    vextracti32x8 [r7+wq*1], m20, 1
   4151    lea                  r7, [r7+wq*2]
   4152    sub                  hd, 2
   4153    jg .hv_w16_loop
; Next 16-pixel strip; packed counter's high bits reach zero when done.
   4154    add                srcq, 16
   4155    add                tmpq, 32
   4156    movzx                hd, r6b
   4157    sub                 r6d, 1<<8
   4158    jg .hv_w16_loop0
   4159    RET
   4160 
cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts
; 8x8 affine warp producing intermediate ("t") 16-bit output rather
; than pixels. Shares the filtering core with warp_affine_8x8_8bpc via
; its .main/.main2 entry points: m16 returns two rows of 8 dword sums
; (pre-biased with pd_16384 in m9); doubling and the warp_8x8t_end
; permute narrow them to the word layout stored at tmp (ts = tmp
; stride, scaled *2/*4 here since entries are 2 bytes).
   4162    vpbroadcastd         m9, [pd_16384]
   4163    mova               ym15, [warp_8x8t_end]
   4164    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main
   4165    jmp .start
   4166 .loop:
   4167    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2
   4168    lea                tmpq, [tmpq+tsq*4]
   4169 .start:
   4170    paddd               m16, m16
   4171    vpermb              m16, m15, m16
   4172    mova         [tmpq+tsq*0], xm16
   4173    vextracti128 [tmpq+tsq*2], ym16, 1
; r6d is the row/step counter maintained by .main/.main2 (set up in the
; shared core); 0x1800 is subtracted per 2-row batch until done.
   4174    sub                 r6d, 0x1800
   4175    jg .loop
   4176    RET
   4177 
cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter
; 8x8 affine warp writing 8-bit pixels. The shared core (.main/.main2/
; .h below) produces two rows of dword sums in m16, biased by pd_262144
; (m9) so that >>19 plus packuswb yields the final pixels; the
; warp_8x8_end permute puts the two rows into the low xmm for the two
; movq/movhps stores.
   4179    vpbroadcastd         m9, [pd_262144]
   4180    mova               xm15, [warp_8x8_end]
   4181    call .main
   4182    jmp .start
   4183 .loop:
   4184    call .main2
   4185    lea                dstq, [dstq+dsq*2]
   4186 .start:
   4187    psrad               m16, 19
   4188    packuswb            m16, m16
   4189    vpermb              m16, m15, m16
   4190    movq       [dstq+dsq*0], xm16
   4191    movhps     [dstq+dsq*1], xm16
; r6d row counter maintained by the core; 0x1800 per 2-row batch.
   4192    sub                 r6d, 0x1800
   4193    jg .loop
   4194    RET
ALIGN function_align
   4196 .main:
; Shared warp core. Sets up the affine parameters: mx/my (stack args
; r6m/r7m; on Win64 abcd itself is a stack arg) are biased by pd_512,
; then alpha/gamma per-column steps (from [abcd]) are folded in with
; vpdpwssd against pd_0to7, while beta/delta (row steps) are kept in
; ym19/ym21. Per-column 8-tap filters are fetched from mc_warp_filter
; with vpgatherdq; k2/k3 are kept as all-ones (kxnorb) ping-pong masks
; because a gather clears its mask register. The first call primes the
; 01/12, 23/34, 45/56 row-pair pipeline (m1/m2/m3) via three .h calls.
   4197    vpbroadcastd         m1, [pd_512]
%if WIN64
   4199    mov               abcdq, r5mp
   4200    vpaddd             ym18, ym1, r6m {1to8} ; mx
%else
   4202    add                 r5d, 512
   4203    vpbroadcastd       ym18, r5d
%endif
   4205    vpaddd             ym20, ym1, r7m {1to8} ; my
   4206    mova               ym16, [pd_0to7]
   4207    vpbroadcastd       ym19, [abcdq+4*0]
   4208    vpbroadcastd       ym21, [abcdq+4*1]
   4209    lea                  r4, [ssq*3+3]
   4210    mova                m10, [warp_8x8_permA]
   4211    mov                 r6d, 0x5555
   4212    mova                m11, [warp_8x8_permB]
   4213    lea             filterq, [mc_warp_filter+64*8]
   4214    vpbroadcastq        m12, [warp_8x8_hpack]
   4215    sub                srcq, r4               ; src -= src_stride*3 + 3
   4216    vbroadcasti32x4     m13, [warp_8x8_permC]
   4217    kxnorb               k2, k2, k2
   4218    vbroadcasti32x4     m14, [warp_8x8_permD]
   4219    vpdpwssd           ym18, ym19, ym16       ; alpha
   4220    vpdpwssd           ym20, ym21, ym16       ; gamma
   4221    vbroadcasti32x4      m0, [srcq]
   4222    psrad              ym19, 16               ; beta
   4223    psrad              ym21, 16               ; delta
   4224    kmovw                k1, r6d
   4225    psrad              ym16, ym18, 10
   4226    kmovb                k3, k2
   4227    paddd              ym18, ym19
   4228    vpgatherdq       m2{k2}, [filterq+ym16*8] ; filter_x0
; Reuse pd_512 in m1: >>8 gives pd_2 (horizontal rounding bias), and
; doubling that gives pd_4 in m8 for subsequent .h calls.
   4229    psrld                m1, 8                ; pd_2
   4230    pshufb               m0, m11
   4231    paddd                m8, m1, m1           ; pd_4
   4232    vpdpbusd             m1, m0, m2
   4233    call .h
   4234    psllq                m2, m1, 45
   4235    pslld                m1, 13
   4236    paddd                m1, m2
   4237    vpshrdq              m1, m0, 48           ; 01 12
   4238    call .h
   4239    vpshrdq              m2, m1, m0, 48       ; 23 34
   4240    call .h
   4241    vpshrdq              m3, m2, m0, 48       ; 45 56
   4242 .main2:
; Per-iteration entry: one new source row pair (.h) plus the vertical
; pass. Two row filters (filter_y0/y1) are gathered using the
; gamma/delta-stepped offsets, reshuffled by warp_8x8_permC/D into
; tap-pair order, and accumulated with vpdpwssd into m16 on top of the
; bias preloaded from m9 by the caller.
   4243    call .h
   4244    psrad              ym17, ym20, 10
   4245    kmovb                k2, k3
   4246    paddd              ym20, ym21
   4247    vpgatherdq       m7{k3}, [filterq+ym17*8] ; filter_y0
   4248    psrad              ym16, ym20, 10
   4249    kmovb                k3, k2
   4250    paddd              ym20, ym21
   4251    vpgatherdq      m17{k2}, [filterq+ym16*8] ; filter_y1
   4252    shufps               m5, m7, m17, q2020   ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3
   4253    mova                m16, m9
   4254    pshufb               m4, m5, m13          ;    a0    a1    A0    A1    b0    b1    B0    B1
   4255    vpdpwssd            m16, m1, m4
   4256    pshufb               m5, m14              ;    a2    a3    A2    A3    b2    b3    B2    B3
   4257    mova                 m1, m2
   4258    vpdpwssd            m16, m2, m5
   4259    shufps               m5, m7, m17, q3131   ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7
   4260    mova                 m2, m3
   4261    pshufb               m4, m5, m13          ;    a4    a5    A4    A5    b4    b5    B4    B5
   4262    vpdpwssd            m16, m3, m4
   4263    vpshrdq              m3, m0, 48           ; 67 78
   4264    pshufb               m5, m14              ;    a6    a7    A6    A7    b6    b7    B6    B7
   4265    vpdpwssd            m16, m3, m5
   4266    ret
ALIGN function_align
   4268 .h:
; Horizontal pass for the next two source rows: gathers the two
; alpha-stepped per-column filters (filter_x1/x2), merges them with
; vpshldq / masked vmovdqa32 (k1 = 0x5555) into the interleaved layout
; matching the warp_8x8_permA/B source shuffles, dot-products with
; vpdpbusd on top of the pd_4 bias (m8), and finally vpmultishiftqb
; with warp_8x8_hpack packs the sums (>>3) into m0.
   4269    movu                xm5, [srcq+ssq*1]
   4270    psrad              ym16, ym18, 10
   4271    lea                srcq, [srcq+ssq*2]
   4272    vinserti32x4        ym5, [srcq+ssq*0], 1
   4273    kmovb                k2, k3
   4274    paddd              ym18, ym19
   4275    vpgatherdq       m6{k3}, [filterq+ym16*8] ; filter_x1
   4276    psrad              ym17, ym18, 10
   4277    kmovb                k3, k2
   4278    paddd              ym18, ym19
   4279    vpgatherdq      m16{k2}, [filterq+ym17*8] ; filter_x2
   4280    mova                 m0, m8
   4281    vpermb               m4, m10, m5          ; a4 b0 a5 b1   a6 b2 a7 b3   a8 b4 a9 b5   aa b6 ab b7
   4282    vpshldq             m17, m16, m6, 32      ; a4 a5 a6 a7   b0 b1 b2 b3
   4283    vpdpbusd             m0, m4, m17
   4284    vpermb               m5, m11, m5          ; a0 b4 a1 b5   a2 b6 a3 b7   a4 b8 a5 b9   a6 ba a7 bb
   4285    vmovdqa32       m16{k1}, m6               ; a0 a1 a2 a3   b4 b5 b6 b7
   4286    vpdpbusd             m0, m5, m16
   4287    vpmultishiftqb       m0, m12, m0          ; 1 1 2 2 (>> 3)
   4288    ret
   4289 
        ; BIDIR_FN %1=op — shared store stage for the bidirectional
        ; compositors (avg / w_avg / mask). %1 is a per-block compute macro
        ; that leaves packed u8 pixels in m0, and %1_INC_PTR advances the
        ; tmp buffer pointers. Dispatches on block width via the wq jump
        ; table prepared by the caller; stride3q = 3*strideq.
   4290 %macro BIDIR_FN 1 ; op
   4291    lea            stride3q, [strideq*3]
   4292    jmp                  wq
   4293 .w4:
        ; h is a power of two; h <= 8 handled inline, h == 16 via scatter
   4294    cmp                  hd, 8
   4295    jg .w4_h16
   4296    WRAP_YMM %1           0
   4297    vextracti32x4       xm1, ym0, 1
   4298    movd   [dstq          ], xm0
   4299    pextrd [dstq+strideq*1], xm0, 1
   4300    movd   [dstq+strideq*2], xm1
   4301    pextrd [dstq+stride3q ], xm1, 1
        ; flags are still those of "cmp hd, 8" above (the intervening
        ; vector/store ops don't touch EFLAGS): jl taken => h == 4
   4302    jl .w4_ret
   4303    lea                dstq, [dstq+strideq*4]
   4304    pextrd [dstq          ], xm0, 2
   4305    pextrd [dstq+strideq*1], xm0, 3
   4306    pextrd [dstq+strideq*2], xm1, 2
   4307    pextrd [dstq+stride3q ], xm1, 3
   4308 .w4_ret:
   4309    RET
   4310 .w4_h16:
        ; m7 = per-dword destination offsets (row scatter pattern scaled
        ; by the pixel stride); stores 16 rows of 4 pixels in one scatter
   4311    vpbroadcastd         m7, strided
   4312    pmulld               m7, [bidir_sctr_w4]
   4313    %1                    0
   4314    kxnorw               k1, k1, k1
   4315    vpscatterdd [dstq+m7]{k1}, m0
   4316    RET
   4317 .w8:
   4318    cmp                  hd, 4
   4319    jne .w8_h8
   4320    WRAP_YMM %1           0
   4321    vextracti32x4       xm1, ym0, 1
   4322    movq   [dstq          ], xm0
   4323    movq   [dstq+strideq*1], xm1
   4324    movhps [dstq+strideq*2], xm0
   4325    movhps [dstq+stride3q ], xm1
   4326    RET
   4327 .w8_loop:
   4328    %1_INC_PTR            2
   4329    lea                dstq, [dstq+strideq*4]
   4330 .w8_h8:
        ; one zmm of output covers 8 rows of 8 pixels (qword per row)
   4331    %1                    0
   4332    vextracti32x4       xm1, ym0, 1
   4333    vextracti32x4       xm2, m0, 2
   4334    vextracti32x4       xm3, m0, 3
   4335    movq   [dstq          ], xm0
   4336    movq   [dstq+strideq*1], xm1
   4337    movq   [dstq+strideq*2], xm2
   4338    movq   [dstq+stride3q ], xm3
   4339    lea                dstq, [dstq+strideq*4]
   4340    movhps [dstq          ], xm0
   4341    movhps [dstq+strideq*1], xm1
   4342    movhps [dstq+strideq*2], xm2
   4343    movhps [dstq+stride3q ], xm3
   4344    sub                  hd, 8
   4345    jg .w8_loop
   4346    RET
   4347 .w16_loop:
   4348    %1_INC_PTR            2
   4349    lea                dstq, [dstq+strideq*4]
   4350 .w16:
        ; packuswb interleaved the two source halves per 128-bit lane;
        ; vpermq restores row order before the 4 per-row stores
   4351    %1                    0
   4352    vpermq               m0, m0, q3120
   4353    mova          [dstq          ], xm0
   4354    vextracti32x4 [dstq+strideq*1], m0, 2
   4355    vextracti32x4 [dstq+strideq*2], ym0, 1
   4356    vextracti32x4 [dstq+stride3q ], m0, 3
   4357    sub                  hd, 4
   4358    jg .w16_loop
   4359    RET
   4360 .w32:
        ; m7 = qword permute indices undoing the pack interleave
   4361    pmovzxbq             m7, [pb_02461357]
   4362 .w32_loop:
   4363    %1                    0
   4364    %1_INC_PTR            2
   4365    vpermq               m0, m7, m0
   4366    mova          [dstq+strideq*0], ym0
   4367    vextracti32x8 [dstq+strideq*1], m0, 1
   4368    lea                dstq, [dstq+strideq*2]
   4369    sub                  hd, 2
   4370    jg .w32_loop
   4371    RET
   4372 .w64:
   4373    pmovzxbq             m7, [pb_02461357]
   4374 .w64_loop:
   4375    %1                    0
   4376    %1_INC_PTR            2
   4377    vpermq               m0, m7, m0
   4378    mova             [dstq], m0
   4379    add                dstq, strideq
   4380    dec                  hd
   4381    jg .w64_loop
   4382    RET
   4383 .w128:
        ; two 64-byte batches per row; first result is permuted into m6
        ; before %1 overwrites m0 with the second batch
   4384    pmovzxbq             m7, [pb_02461357]
   4385 .w128_loop:
   4386    %1                    0
   4387    vpermq               m6, m7, m0
   4388    %1                    2
   4389    mova        [dstq+64*0], m6
   4390    %1_INC_PTR            4
   4391    vpermq               m6, m7, m0
   4392    mova        [dstq+64*1], m6
   4393    add                dstq, strideq
   4394    dec                  hd
   4395    jg .w128_loop
   4396    RET
   4397 %endmacro
   4398 
        ; AVG %1=src_offset — plain average of two intermediate (16-bit)
        ; buffers. pmulhrsw with m4 (pw_1024, set by avg_8bpc) computes
        ; (tmp1 + tmp2) * 1024 >> 15 with rounding; result packed to u8
        ; in m0 (two mmsize loads per call).
   4399 %macro AVG 1 ; src_offset
   4400    mova                 m0, [tmp1q+(%1+0)*mmsize]
   4401    paddw                m0, [tmp2q+(%1+0)*mmsize]
   4402    mova                 m1, [tmp1q+(%1+1)*mmsize]
   4403    paddw                m1, [tmp2q+(%1+1)*mmsize]
   4404    pmulhrsw             m0, m4
   4405    pmulhrsw             m1, m4
   4406    packuswb             m0, m1
   4407 %endmacro
   4408 
        ; AVG_INC_PTR %1=blocks — advance both tmp pointers by %1 vector
        ; registers' worth of intermediate pixels (shared with W_AVG).
   4409 %macro AVG_INC_PTR 1
   4410    add               tmp1q, %1*mmsize
   4411    add               tmp2q, %1*mmsize
   4412 %endmacro
   4413 
        ; void avg_8bpc(dst, stride, tmp1, tmp2, w, h)
        ; Averages two prep()-format intermediate buffers into dst.
        ; Loads the width-indexed jump-table entry (stored base-relative),
        ; broadcasts the pmulhrsw rounding constant into m4, then falls
        ; into the shared BIDIR_FN store stage.
   4414 cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
   4415 %define base r6-avg_avx512icl_table
   4416    lea                  r6, [avg_avx512icl_table]
   4417    tzcnt                wd, wm
   4418    movifnidn            hd, hm
   4419    movsxd               wq, dword [r6+wq*4]
   4420    vpbroadcastd         m4, [base+pw_1024]
   4421    add                  wq, r6
   4422    BIDIR_FN            AVG
   4423 
        ; W_AVG %1=src_offset — weighted average of the two intermediate
        ; buffers. m4 holds (weight-16) << 12 (or -weight << 12 with the
        ; operands swapped, see w_avg_8bpc); m5 = pw_2048 so the final
        ; pmulhrsw implements the (x + 8) >> 4 rounding step below.
   4424 %macro W_AVG 1 ; src_offset
   4425    ; (a * weight + b * (16 - weight) + 128) >> 8
   4426    ; = ((a - b) * weight + (b << 4) + 128) >> 8
   4427    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
   4428    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
   4429    mova                 m0,     [tmp1q+(%1+0)*mmsize]
   4430    psubw                m2, m0, [tmp2q+(%1+0)*mmsize]
   4431    mova                 m1,     [tmp1q+(%1+1)*mmsize]
   4432    psubw                m3, m1, [tmp2q+(%1+1)*mmsize]
   4433    pmulhw               m2, m4
   4434    pmulhw               m3, m4
   4435    paddw                m0, m2
   4436    paddw                m1, m3
   4437    pmulhrsw             m0, m5
   4438    pmulhrsw             m1, m5
   4439    packuswb             m0, m1
   4440 %endmacro
        ; w_avg uses the same pointer advance as avg
   4442 %define W_AVG_INC_PTR AVG_INC_PTR
   4443 
        ; void w_avg_8bpc(dst, stride, tmp1, tmp2, w, h, weight)
        ; Weighted average of two prep()-format buffers. For weight <= 7
        ; it swaps tmp1/tmp2 and negates the multiplier (third vs. fourth
        ; identity in the W_AVG comment) so the pmulhw factor stays large
        ; in magnitude; the vector math is otherwise identical.
   4444 cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
   4445 %define base r6-w_avg_avx512icl_table
   4446    lea                  r6, [w_avg_avx512icl_table]
   4447    tzcnt                wd, wm
   4448    movifnidn            hd, hm
   4449    vpbroadcastw         m4, r6m ; weight
   4450    movsxd               wq, dword [r6+wq*4]
   4451    vpbroadcastd         m5, [base+pw_2048]
   4452    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
   4453    add                  wq, r6
        ; r6m = weight (7th argument); branch picks which identity to use
   4454    cmp           dword r6m, 7
   4455    jg .weight_gt7
        ; swap tmp1q <-> tmp2q and make m4 = -weight << 12
   4456    mov                  r6, tmp1q
   4457    pxor                 m0, m0
   4458    mov               tmp1q, tmp2q
   4459    psubw                m4, m0, m4 ; -weight
   4460    mov               tmp2q, r6
   4461 .weight_gt7:
   4462    BIDIR_FN          W_AVG
   4463 
        ; MASK %1=src_offset — per-pixel masked blend of the two
        ; intermediate buffers. m4 = 0 (pxor in mask_8bpc), m5 = pw_2048,
        ; m8 = qword permute (mmsize==64 only) aligning the mask bytes
        ; with the pixel order. The byte mask m is turned into a signed
        ; word multiplier -m << 9 via negate, double, and interleave with
        ; zero, matching the identity below.
   4464 %macro MASK 1 ; src_offset
   4465    ; (a * m + b * (64 - m) + 512) >> 10
   4466    ; = ((a - b) * m + (b << 6) + 512) >> 10
   4467    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
   4468 %if mmsize == 64
   4469    vpermq               m3, m8, [maskq+%1*32]
   4470 %else
   4471    vpermq               m3,     [maskq+%1*16], q3120
   4472 %endif
   4473    mova                 m0,     [tmp2q+(%1+0)*mmsize]
   4474    psubw                m1, m0, [tmp1q+(%1+0)*mmsize]
   4475    psubb                m3, m4, m3             ; -m (bytes)
   4476    paddw                m1, m1     ; (b - a) << 1
   4477    paddb                m3, m3                 ; -2m
   4478    punpcklbw            m2, m4, m3 ; -m << 9
   4479    pmulhw               m1, m2
   4480    paddw                m0, m1
   4481    mova                 m1,     [tmp2q+(%1+1)*mmsize]
   4482    psubw                m2, m1, [tmp1q+(%1+1)*mmsize]
   4483    paddw                m2, m2
   4484    punpckhbw            m3, m4, m3             ; high half of -m << 9
   4485    pmulhw               m2, m3
   4486    paddw                m1, m2
        ; pmulhrsw with 2048 = (x + 8) >> 4 with rounding
   4487    pmulhrsw             m0, m5
   4488    pmulhrsw             m1, m5
   4489    packuswb             m0, m1
   4490 %endmacro
   4491 
        ; MASK_INC_PTR %1=blocks — advance the mask (1 byte/pixel) and
        ; both tmp pointers (2 bytes/pixel) past %1 processed batches.
   4492 %macro MASK_INC_PTR 1
   4493    add               maskq, %1*32
   4494    add               tmp2q, %1*64
   4495    add               tmp1q, %1*64
   4496 %endmacro
   4497 
        ; void mask_8bpc(dst, stride, tmp1, tmp2, w, h, mask)
        ; Blend of two prep()-format buffers weighted by an external
        ; per-pixel byte mask. Sets up m4 = 0, m5 = pw_2048 and the
        ; mask-reordering permute in m8 for the MASK macro, then falls
        ; into the shared BIDIR_FN store stage.
   4498 cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
   4499 %define base r7-mask_avx512icl_table
   4500    lea                  r7, [mask_avx512icl_table]
   4501    tzcnt                wd, wm
   4502    movifnidn            hd, hm
   4503    mov               maskq, maskmp
   4504    movsxd               wq, dword [r7+wq*4]
   4505    pxor                 m4, m4
   4506    mova                 m8, [base+bilin_v_perm64]
   4507    vpbroadcastd         m5, [base+pw_2048]
   4508    add                  wq, r7
   4509    BIDIR_FN           MASK
   4510 
        ; W_MASK %1=dst, %2=mask, %3/%4=tmp offsets, %5=4:4:4 flag —
        ; derives the blend mask from the difference of the two
        ; intermediate buffers and applies it. m6 = pw_6903 (see caller
        ; comment), m7 = pw_2048. Produces packed u8 pixels in m%1 and
        ; the per-pixel "64 - m" values (two words packed per byte pair
        ; via vpshldw) in m%2; with %5 set, m%2 is flipped to "m" using
        ; m5 = pb_64 for direct 4:4:4 mask output.
   4511 %macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
   4512    mova                m%1, [tmp1q+mmsize*%3]
   4513    mova                 m1, [tmp2q+mmsize*%3]
   4514    psubw                m1, m%1
   4515    pabsw               m%2, m1
   4516    psubusw             m%2, m6, m%2           ; saturating: clamps large diffs
   4517    psrlw               m%2, 8 ; 64 - m
   4518    psllw                m2, m%2, 10
   4519    pmulhw               m1, m2
   4520    paddw               m%1, m1
   4521    mova                 m1, [tmp1q+mmsize*%4]
   4522    mova                 m2, [tmp2q+mmsize*%4]
   4523    psubw                m2, m1
   4524    pabsw                m3, m2
   4525    psubusw              m3, m6, m3
        ; pack this batch's mask words into the high bytes of m%2
   4526    vpshldw             m%2, m3, 8
   4527    psllw                m3, m%2, 10
   4528 %if %5
   4529    psubb               m%2, m5, m%2           ; 64 - (64 - m) = m
   4530 %endif
   4531    pmulhw               m2, m3
   4532    paddw                m1, m2
   4533    pmulhrsw            m%1, m7
   4534    pmulhrsw             m1, m7
   4535    packuswb            m%1, m1
   4536 %endmacro
   4537 
        ; void w_mask_420_8bpc(dst, stride, tmp1, tmp2, w, h, mask, sign)
        ; Difference-weighted blend that also emits a 2x2-subsampled mask
        ; (4:2:0). Per 2x2 block the four mask values are summed with
        ; rounding via vpdpbusd against pb_m64 into the wm_sign bias in
        ; m8, then byte-selected through wm_420_mask (m10).
   4538 cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
   4539 %define base r7-w_mask_420_avx512icl_table
   4540    lea                  r7, [w_mask_420_avx512icl_table]
   4541    tzcnt                wd, wm
   4542    mov                 r6d, r7m ; sign
   4543    movifnidn            hd, hm
   4544    movsxd               wq, [r7+wq*4]
   4545    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
   4546    vpbroadcastd         m7, [base+pw_2048]
   4547    vpbroadcastd         m9, [base+pb_m64]             ; -1 << 6
   4548    mova               ym10, [base+wm_420_mask+32]
   4549    vpbroadcastd         m8, [base+wm_sign+r6*8] ; (258 - sign) << 6
   4550    add                  wq, r7
   4551    mov               maskq, maskmp
   4552    lea            stride3q, [strideq*3]
   4553    jmp                  wq
   4554 .w4:
        ; m5 = byte permute gathering each 2x2 block's mask values
   4555    mova                 m5, [wm_420_perm4]
   4556    cmp                  hd, 8
   4557    jg .w4_h16
   4558    WRAP_YMM W_MASK       0, 4, 0, 1
   4559    vinserti128         ym5, [wm_420_perm4+32], 1
   4560    vpermb              ym4, ym5, ym4
   4561    vpdpbusd            ym8, ym4, ym9
   4562    vextracti32x4       xm1, m0, 1
   4563    movd   [dstq+strideq*0], xm0
   4564    pextrd [dstq+strideq*1], xm0, 1
   4565    movd   [dstq+strideq*2], xm1
   4566    pextrd [dstq+stride3q ], xm1, 1
        ; flags still from "cmp hd, 8"; jl => h == 4
   4567    jl .w4_end
   4568    lea                dstq, [dstq+strideq*4]
   4569    pextrd [dstq+strideq*0], xm0, 2
   4570    pextrd [dstq+strideq*1], xm0, 3
   4571    pextrd [dstq+strideq*2], xm1, 2
   4572    pextrd [dstq+stride3q ], xm1, 3
   4573 .w4_end:
   4574    vpermb              ym8, ym10, ym8
   4575    movq            [maskq], xm8
   4576    RET
   4577 .w4_h16:
        ; scatter 16 rows of 4 pixels; m11 = per-row dword offsets
   4578    vpbroadcastd        m11, strided
   4579    pmulld              m11, [bidir_sctr_w4]
   4580    W_MASK                0, 4, 0, 1
   4581    vpermb               m4, m5, m4
   4582    vpdpbusd             m8, m4, m9
   4583    kxnorw               k1, k1, k1
   4584    vpermb               m8, m10, m8
   4585    mova            [maskq], xm8
   4586    vpscatterdd [dstq+m11]{k1}, m0
   4587    RET
   4588 .w8:
   4589    mova                 m5, [wm_420_perm8]
   4590    cmp                  hd, 4
   4591    jne .w8_h8
   4592    WRAP_YMM W_MASK       0, 4, 0, 1
   4593    vinserti128         ym5, [wm_420_perm8+32], 1
   4594    vpermb              ym4, ym5, ym4
   4595    vpdpbusd            ym8, ym4, ym9
   4596    vpermb               m8, m10, m8
   4597    mova            [maskq], xm8
   4598    vextracti32x4       xm1, ym0, 1
   4599    movq   [dstq+strideq*0], xm0
   4600    movq   [dstq+strideq*1], xm1
   4601    movhps [dstq+strideq*2], xm0
   4602    movhps [dstq+stride3q ], xm1
   4603    RET
   4604 .w8_loop:
   4605    add               tmp1q, 128
   4606    add               tmp2q, 128
   4607    add               maskq, 16
   4608    lea                dstq, [dstq+strideq*4]
   4609 .w8_h8:
        ; m8 (sign bias) is preserved across iterations; accumulate into
        ; a fresh copy (m1) each time
   4610    W_MASK                0, 4, 0, 1
   4611    vpermb               m4, m5, m4
   4612    mova                 m1, m8
   4613    vpdpbusd             m1, m4, m9
   4614    vpermb               m1, m10, m1
   4615    mova            [maskq], xm1
   4616    vextracti32x4       xm1, ym0, 1
   4617    vextracti32x4       xm2, m0, 2
   4618    vextracti32x4       xm3, m0, 3
   4619    movq   [dstq+strideq*0], xm0
   4620    movq   [dstq+strideq*1], xm1
   4621    movq   [dstq+strideq*2], xm2
   4622    movq   [dstq+stride3q ], xm3
   4623    lea                dstq, [dstq+strideq*4]
   4624    movhps [dstq+strideq*0], xm0
   4625    movhps [dstq+strideq*1], xm1
   4626    movhps [dstq+strideq*2], xm2
   4627    movhps [dstq+stride3q ], xm3
   4628    sub                  hd, 8
   4629    jg .w8_loop
   4630    RET
   4631 .w16:
   4632    mova                 m5, [wm_420_perm16]
   4633 .w16_loop:
   4634    W_MASK                0, 4, 0, 1
   4635    vpermb               m4, m5, m4
   4636    mova                 m1, m8
   4637    vpdpbusd             m1, m4, m9
   4638    add               tmp1q, 128
   4639    add               tmp2q, 128
   4640    vpermb               m1, m10, m1
   4641    vpermq               m0, m0, q3120
   4642    mova            [maskq], xm1
   4643    add               maskq, 16
   4644    mova          [dstq+strideq*0], xm0
   4645    vextracti32x4 [dstq+strideq*1], m0, 2
   4646    vextracti32x4 [dstq+strideq*2], ym0, 1
   4647    vextracti32x4 [dstq+stride3q ], m0, 3
   4648    lea                dstq, [dstq+strideq*4]
   4649    sub                  hd, 4
   4650    jg .w16_loop
   4651    RET
   4652 .w32:
        ; at w >= 32 the mask rows are already adjacent in m4; only the
        ; qword un-interleave permute is needed for the pixels
   4653    pmovzxbq             m5, [pb_02461357]
   4654 .w32_loop:
   4655    W_MASK                0, 4, 0, 1
   4656    mova                 m1, m8
   4657    vpdpbusd             m1, m4, m9
   4658    add               tmp1q, 128
   4659    add               tmp2q, 128
   4660    vpermb               m1, m10, m1
   4661    vpermq               m0, m5, m0
   4662    mova            [maskq], xm1
   4663    add               maskq, 16
   4664    mova          [dstq+strideq*0], ym0
   4665    vextracti32x8 [dstq+strideq*1], m0, 1
   4666    lea                dstq, [dstq+strideq*2]
   4667    sub                  hd, 2
   4668    jg .w32_loop
   4669    RET
   4670 .w64:
        ; two W_MASK batches per row pair; m12/m13 select even/odd qwords
        ; to rebuild the two full 64-pixel rows from m0/m11
   4671    pmovzxbq            m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
   4672    psrlq               m13, m12, 4          ; 1, 3, 5, 7, 9, 11, 13, 15
   4673 .w64_loop:
   4674    W_MASK                0, 4, 0, 2
   4675    W_MASK               11, 5, 1, 3
   4676    mova                 m2, m8
   4677    vpdpbusd             m2, m4, m9
   4678    mova                 m3, m8
   4679    vpdpbusd             m3, m5, m9
   4680    add               tmp1q, 256
   4681    add               tmp2q, 256
   4682    vpermt2b             m2, m10, m3
   4683    mova                 m1, m0
   4684    vpermt2q             m0, m12, m11
   4685    vpermt2q             m1, m13, m11
   4686    mova            [maskq], ym2
   4687    add               maskq, 32
   4688    mova   [dstq+strideq*0], m0
   4689    mova   [dstq+strideq*1], m1
   4690    lea                dstq, [dstq+strideq*2]
   4691    sub                  hd, 2
   4692    jg .w64_loop
   4693    RET
   4694 .w128:
        ; left and right 64-pixel halves processed separately; the left
        ; half's mask accumulators (m4/m5) are rotated by 16 bits so the
        ; right half's vpdpbusd sums land in the other word lane before
        ; the final byte select through the full wm_420_mask
   4695    pmovzxbq            m14, [wm_420_perm64]
   4696    mova                m10, [wm_420_mask]
   4697    psrlq               m15, m14, 4
   4698 .w128_loop:
   4699    W_MASK                0, 12, 0, 4
   4700    W_MASK               11, 13, 1, 5
   4701    mova                 m4, m8
   4702    vpdpbusd             m4, m12, m9
   4703    mova                 m5, m8
   4704    vpdpbusd             m5, m13, m9
   4705    mova                 m1, m0
   4706    vpermt2q             m0, m14, m11
   4707    vpermt2q             m1, m15, m11
   4708    mova [dstq+strideq*0+64*0], m0
   4709    mova [dstq+strideq*1+64*0], m1
   4710    W_MASK                0, 12, 2, 6
   4711    W_MASK               11, 13, 3, 7
   4712    vprold               m4, 16
   4713    vprold               m5, 16
   4714    vpdpbusd             m4, m12, m9
   4715    vpdpbusd             m5, m13, m9
   4716    add               tmp1q, 512
   4717    add               tmp2q, 512
   4718    vpermt2b             m4, m10, m5
   4719    mova                 m1, m0
   4720    vpermt2q             m0, m14, m11
   4721    vpermt2q             m1, m15, m11
   4722    mova            [maskq], m4
   4723    add               maskq, 64
   4724    mova [dstq+strideq*0+64*1], m0
   4725    mova [dstq+strideq*1+64*1], m1
   4726    lea                dstq, [dstq+strideq*2]
   4727    sub                  hd, 2
   4728    jg .w128_loop
   4729    RET
   4730 
        ; void w_mask_422_8bpc(dst, stride, tmp1, tmp2, w, h, mask, sign)
        ; Difference-weighted blend emitting a horizontally subsampled
        ; (4:2:2) mask. Horizontal pairs are summed via vpdpwssd against
        ; pw_m128 (m9) into the wm_sign bias (m8), byte-selected through
        ; wm_422_mask (m10), then clamped with pb_127 (m11).
   4731 cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
   4732 %define base r7-w_mask_422_avx512icl_table
   4733    lea                  r7, [w_mask_422_avx512icl_table]
   4734    tzcnt                wd, wm
   4735    mov                 r6d, r7m ; sign
   4736    movifnidn            hd, hm
   4737    movsxd               wq, dword [r7+wq*4]
   4738    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
   4739    vpbroadcastd         m7, [base+pw_2048]
   4740    vpbroadcastd         m9, [base+pw_m128]
   4741    mova                m10, [base+wm_422_mask]
   4742    vpbroadcastd        m11, [base+pb_127]
   4743    add                  wq, r7
   4744    vpbroadcastd         m8, [base+wm_sign+4+r6*4]
   4745    mov               maskq, maskmp
   4746    lea            stride3q, [strideq*3]
   4747    jmp                  wq
   4748 .w4:
   4749    cmp                  hd, 8
   4750    jg .w4_h16
   4751    WRAP_YMM W_MASK       0, 4, 0, 1
   4752    movhps             xm10, [wm_422_mask+16]
   4753    vpdpwssd            ym8, ym4, ym9
   4754    vpermb              ym8, ym10, ym8
   4755    vextracti32x4       xm1, m0, 1
   4756    movd   [dstq+strideq*0], xm0
   4757    pextrd [dstq+strideq*1], xm0, 1
   4758    movd   [dstq+strideq*2], xm1
   4759    pextrd [dstq+stride3q ], xm1, 1
        ; flags still from "cmp hd, 8"; jl => h == 4
   4760    jl .w4_end
   4761    lea                dstq, [dstq+strideq*4]
   4762    pextrd [dstq+strideq*0], xm0, 2
   4763    pextrd [dstq+strideq*1], xm0, 3
   4764    pextrd [dstq+strideq*2], xm1, 2
   4765    pextrd [dstq+stride3q ], xm1, 3
   4766 .w4_end:
   4767    pand                xm8, xm11
   4768    mova            [maskq], xm8
   4769    RET
   4770 .w4_h16:
        ; scatter 16 rows of 4 pixels; m5 = per-row dword offsets
   4771    vpbroadcastd         m5, strided
   4772    pmulld               m5, [bidir_sctr_w4]
   4773    W_MASK                0, 4, 0, 1
   4774    vpdpwssd             m8, m4, m9
   4775    kxnorw               k1, k1, k1
   4776    vpermb               m8, m10, m8
   4777    pand                ym8, ym11
   4778    mova            [maskq], ym8
   4779    vpscatterdd [dstq+m5]{k1}, m0
   4780    RET
   4781 .w8:
   4782    cmp                  hd, 4
   4783    jne .w8_h8
   4784    WRAP_YMM W_MASK       0, 4, 0, 1
   4785    movhps             xm10, [wm_422_mask+16]
   4786    vpdpwssd            ym8, ym4, ym9
   4787    vpermb              ym8, ym10, ym8
   4788    pand                xm8, xm11
   4789    mova            [maskq], xm8
   4790    vextracti32x4       xm1, ym0, 1
   4791    movq   [dstq+strideq*0], xm0
   4792    movq   [dstq+strideq*1], xm1
   4793    movhps [dstq+strideq*2], xm0
   4794    movhps [dstq+stride3q ], xm1
   4795    RET
   4796 .w8_loop:
   4797    add               tmp1q, 128
   4798    add               tmp2q, 128
   4799    add               maskq, 32
   4800    lea                dstq, [dstq+strideq*4]
   4801 .w8_h8:
        ; m8 (sign bias) is loop-invariant; accumulate into a copy (m1)
   4802    W_MASK                0, 4, 0, 1
   4803    mova                 m1, m8
   4804    vpdpwssd             m1, m4, m9
   4805    vpermb               m1, m10, m1
   4806    pand                ym1, ym11
   4807    mova            [maskq], ym1
   4808    vextracti32x4       xm1, ym0, 1
   4809    vextracti32x4       xm2, m0, 2
   4810    vextracti32x4       xm3, m0, 3
   4811    movq   [dstq+strideq*0], xm0
   4812    movq   [dstq+strideq*1], xm1
   4813    movq   [dstq+strideq*2], xm2
   4814    movq   [dstq+stride3q ], xm3
   4815    lea                dstq, [dstq+strideq*4]
   4816    movhps [dstq+strideq*0], xm0
   4817    movhps [dstq+strideq*1], xm1
   4818    movhps [dstq+strideq*2], xm2
   4819    movhps [dstq+stride3q ], xm3
   4820    sub                  hd, 8
   4821    jg .w8_loop
   4822    RET
   4823 .w16_loop:
   4824    add               tmp1q, 128
   4825    add               tmp2q, 128
   4826    add               maskq, 32
   4827    lea                dstq, [dstq+strideq*4]
   4828 .w16:
   4829    W_MASK                0, 4, 0, 1
   4830    mova                 m1, m8
   4831    vpdpwssd             m1, m4, m9
   4832    vpermb               m1, m10, m1
   4833    vpermq               m0, m0, q3120
   4834    pand                ym1, ym11
   4835    mova            [maskq], ym1
   4836    mova          [dstq+strideq*0], xm0
   4837    vextracti32x4 [dstq+strideq*1], m0, 2
   4838    vextracti32x4 [dstq+strideq*2], ym0, 1
   4839    vextracti32x4 [dstq+stride3q ], m0, 3
   4840    sub                  hd, 4
   4841    jg .w16_loop
   4842    RET
   4843 .w32:
        ; m5 = qword permute undoing the pack interleave for the pixels
   4844    pmovzxbq             m5, [pb_02461357]
   4845 .w32_loop:
   4846    W_MASK                0, 4, 0, 1
   4847    mova                 m1, m8
   4848    vpdpwssd             m1, m4, m9
   4849    add               tmp1q, 128
   4850    add               tmp2q, 128
   4851    vpermb               m1, m10, m1
   4852    vpermq               m0, m5, m0
   4853    pand                ym1, ym11
   4854    mova            [maskq], ym1
   4855    add               maskq, 32
   4856    mova          [dstq+strideq*0], ym0
   4857    vextracti32x8 [dstq+strideq*1], m0, 1
   4858    lea                dstq, [dstq+strideq*2]
   4859    sub                  hd, 2
   4860    jg .w32_loop
   4861    RET
   4862 .w64:
   4863    pmovzxbq             m5, [pb_02461357]
   4864 .w64_loop:
   4865    W_MASK                0, 4, 0, 1
   4866    mova                 m1, m8
   4867    vpdpwssd             m1, m4, m9
   4868    add               tmp1q, 128
   4869    add               tmp2q, 128
   4870    vpermb               m1, m10, m1
   4871    vpermq               m0, m5, m0
   4872    pand                ym1, ym11
   4873    mova            [maskq], ym1
   4874    add               maskq, 32
   4875    mova             [dstq], m0
   4876    add                dstq, strideq
   4877    dec                  hd
   4878    jg .w64_loop
   4879    RET
   4880 .w128:
        ; left/right 64-pixel halves; their mask sums are merged with a
        ; single two-source byte permute (vpermt2b) before the clamp
   4881    pmovzxbq            m13, [pb_02461357]
   4882 .w128_loop:
   4883    W_MASK                0, 4, 0, 1
   4884    W_MASK               12, 5, 2, 3
   4885    mova                 m2, m8
   4886    vpdpwssd             m2, m4, m9
   4887    mova                 m3, m8
   4888    vpdpwssd             m3, m5, m9
   4889    add               tmp1q, 256
   4890    add               tmp2q, 256
   4891    vpermt2b             m2, m10, m3
   4892    vpermq               m0, m13, m0
   4893    vpermq               m1, m13, m12
   4894    pand                 m2, m11
   4895    mova            [maskq], m2
   4896    add               maskq, 64
   4897    mova        [dstq+64*0], m0
   4898    mova        [dstq+64*1], m1
   4899    add                dstq, strideq
   4900    dec                  hd
   4901    jg .w128_loop
   4902    RET
   4903 
        ; void w_mask_444_8bpc(dst, stride, tmp1, tmp2, w, h, mask)
        ; Difference-weighted blend emitting the full-resolution (4:4:4)
        ; mask. W_MASK is invoked with the 4:4:4 flag so m4 already holds
        ; the per-pixel "m" values (m5 = pb_64 flips 64-m); wm_444_mask
        ; (m8) reorders them into row order before the store. No sign
        ; parameter is used for 4:4:4.
   4904 cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
   4905 %define base r7-w_mask_444_avx512icl_table
   4906    lea                  r7, [w_mask_444_avx512icl_table]
   4907    tzcnt                wd, wm
   4908    movifnidn            hd, hm
   4909    movsxd               wq, dword [r7+wq*4]
   4910    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
   4911    vpbroadcastd         m5, [base+pb_64]
   4912    vpbroadcastd         m7, [base+pw_2048]
   4913    mova                 m8, [base+wm_444_mask]
   4914    add                  wq, r7
   4915    mov               maskq, maskmp
   4916    lea            stride3q, [strideq*3]
   4917    jmp                  wq
   4918 .w4:
   4919    cmp                  hd, 8
   4920    jg .w4_h16
   4921    WRAP_YMM W_MASK       0, 4, 0, 1, 1
   4922    vinserti128         ym8, [wm_444_mask+32], 1
   4923    vpermb              ym4, ym8, ym4
   4924    mova            [maskq], ym4
   4925    vextracti32x4      xm1, m0, 1
   4926    movd   [dstq+strideq*0], xm0
   4927    pextrd [dstq+strideq*1], xm0, 1
   4928    movd   [dstq+strideq*2], xm1
   4929    pextrd [dstq+stride3q ], xm1, 1
        ; flags still from "cmp hd, 8"; jl => h == 4
   4930    jl .w4_end
   4931    lea                dstq, [dstq+strideq*4]
   4932    pextrd [dstq+strideq*0], xm0, 2
   4933    pextrd [dstq+strideq*1], xm0, 3
   4934    pextrd [dstq+strideq*2], xm1, 2
   4935    pextrd [dstq+stride3q ], xm1, 3
   4936 .w4_end:
   4937    RET
   4938 .w4_h16:
        ; scatter 16 rows of 4 pixels; m9 = per-row dword offsets
   4939    vpbroadcastd         m9, strided
   4940    pmulld               m9, [bidir_sctr_w4]
   4941    W_MASK                0, 4, 0, 1, 1
   4942    vpermb               m4, m8, m4
   4943    kxnorw               k1, k1, k1
   4944    mova            [maskq], m4
   4945    vpscatterdd [dstq+m9]{k1}, m0
   4946    RET
   4947 .w8:
   4948    cmp                  hd, 4
   4949    jne .w8_h8
   4950    WRAP_YMM W_MASK       0, 4, 0, 1, 1
   4951    vinserti128         ym8, [wm_444_mask+32], 1
   4952    vpermb              ym4, ym8, ym4
   4953    mova            [maskq], ym4
   4954    vextracti32x4       xm1, ym0, 1
   4955    movq   [dstq+strideq*0], xm0
   4956    movq   [dstq+strideq*1], xm1
   4957    movhps [dstq+strideq*2], xm0
   4958    movhps [dstq+stride3q ], xm1
   4959    RET
   4960 .w8_loop:
   4961    add               tmp1q, 128
   4962    add               tmp2q, 128
   4963    add               maskq, 64
   4964    lea                dstq, [dstq+strideq*4]
   4965 .w8_h8:
   4966    W_MASK                0, 4, 0, 1, 1
   4967    vpermb               m4, m8, m4
   4968    mova            [maskq], m4
   4969    vextracti32x4       xm1, ym0, 1
   4970    vextracti32x4       xm2, m0, 2
   4971    vextracti32x4       xm3, m0, 3
   4972    movq   [dstq+strideq*0], xm0
   4973    movq   [dstq+strideq*1], xm1
   4974    movq   [dstq+strideq*2], xm2
   4975    movq   [dstq+stride3q ], xm3
   4976    lea                dstq, [dstq+strideq*4]
   4977    movhps [dstq+strideq*0], xm0
   4978    movhps [dstq+strideq*1], xm1
   4979    movhps [dstq+strideq*2], xm2
   4980    movhps [dstq+stride3q ], xm3
   4981    sub                  hd, 8
   4982    jg .w8_loop
   4983    RET
   4984 .w16_loop:
   4985    add               tmp1q, 128
   4986    add               tmp2q, 128
   4987    add               maskq, 64
   4988    lea                dstq, [dstq+strideq*4]
   4989 .w16:
   4990    W_MASK                0, 4, 0, 1, 1
   4991    vpermb               m4, m8, m4
   4992    vpermq               m0, m0, q3120
   4993    mova            [maskq], m4
   4994    mova          [dstq+strideq*0], xm0
   4995    vextracti32x4 [dstq+strideq*1], m0, 2
   4996    vextracti32x4 [dstq+strideq*2], ym0, 1
   4997    vextracti32x4 [dstq+stride3q ], m0, 3
   4998    sub                  hd, 4
   4999    jg .w16_loop
   5000    RET
   5001 .w32:
        ; m9 = qword permute undoing the pack interleave for the pixels
   5002    pmovzxbq             m9, [pb_02461357]
   5003 .w32_loop:
   5004    W_MASK                0, 4, 0, 1, 1
   5005    vpermb               m4, m8, m4
   5006    add               tmp1q, 128
   5007    add               tmp2q, 128
   5008    vpermq               m0, m9, m0
   5009    mova            [maskq], m4
   5010    add               maskq, 64
   5011    mova          [dstq+strideq*0], ym0
   5012    vextracti32x8 [dstq+strideq*1], m0, 1
   5013    lea                dstq, [dstq+strideq*2]
   5014    sub                  hd, 2
   5015    jg .w32_loop
   5016    RET
   5017 .w64:
   5018    pmovzxbq             m9, [pb_02461357]
   5019 .w64_loop:
   5020    W_MASK                0, 4, 0, 1, 1
   5021    vpermb               m4, m8, m4
   5022    add               tmp1q, 128
   5023    add               tmp2q, 128
   5024    vpermq               m0, m9, m0
   5025    mova            [maskq], m4
   5026    add               maskq, 64
   5027    mova             [dstq], m0
   5028    add                dstq, strideq
   5029    dec                  hd
   5030    jg .w64_loop
   5031    RET
   5032 .w128:
        ; left/right 64-pixel halves per row; 128 mask bytes stored per row
   5033    pmovzxbq            m11, [pb_02461357]
   5034 .w128_loop:
   5035    W_MASK                0, 4, 0, 1, 1
   5036    W_MASK               10, 9, 2, 3, 1
   5037    vpermb               m4, m8, m4
   5038    vpermb               m9, m8, m9
   5039    add               tmp1q, 256
   5040    add               tmp2q, 256
   5041    vpermq               m0, m11, m0
   5042    vpermq              m10, m11, m10
   5043    mova       [maskq+64*0], m4
   5044    mova       [maskq+64*1], m9
   5045    add               maskq, 128
   5046    mova        [dstq+64*0], m0
   5047    mova        [dstq+64*1], m10
   5048    add                dstq, strideq
   5049    dec                  hd
   5050    jg .w128_loop
   5051    RET
   5052 
   5053 cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
   5054 %define base r6-blend_avx512icl_table
   5055    lea                  r6, [blend_avx512icl_table]
   5056    tzcnt                wd, wm
   5057    movifnidn         maskq, maskmp
   5058    movifnidn            hd, hm
   5059    movsxd               wq, [r6+wq*4]
   5060    vpbroadcastd         m6, [base+pb_64]
   5061    vpbroadcastd         m7, [base+pw_512]
   5062    sub                tmpq, maskq
   5063    add                  wq, r6
   5064    lea                  r6, [dsq*3]
   5065    jmp                  wq
   5066 .w4:
   5067    movd               xmm0, [dstq+dsq*0]
   5068    pinsrd             xmm0, [dstq+dsq*1], 1
   5069    vpbroadcastd       xmm1, [dstq+dsq*2]
   5070    pinsrd             xmm1, [dstq+r6   ], 3
   5071    mova               xmm4, [maskq]
   5072    mova               xmm5, [maskq+tmpq]
   5073    add               maskq, 4*4
   5074    psubb              xmm3, xm6, xmm4
   5075    punpcklbw          xmm0, xmm5
   5076    punpcklbw          xmm2, xmm3, xmm4
   5077    punpckhbw          xmm1, xmm5
   5078    punpckhbw          xmm3, xmm4
   5079    pmaddubsw          xmm0, xmm2
   5080    pmaddubsw          xmm1, xmm3
   5081    pmulhrsw           xmm0, xm7
   5082    pmulhrsw           xmm1, xm7
   5083    packuswb           xmm0, xmm1
   5084    movd       [dstq+dsq*0], xmm0
   5085    pextrd     [dstq+dsq*1], xmm0, 1
   5086    pextrd     [dstq+dsq*2], xmm0, 2
   5087    pextrd     [dstq+r6   ], xmm0, 3
   5088    lea                dstq, [dstq+dsq*4]
   5089    sub                  hd, 4
   5090    jg .w4
   5091    RET
   5092 .w8:
   5093    movq               xmm0, [dstq+dsq*0]
   5094    vpbroadcastq       xmm1, [dstq+dsq*1]
   5095    vpbroadcastq       ymm2, [dstq+dsq*2]
   5096    vpbroadcastq       ymm3, [dstq+r6   ]
   5097    mova               ymm4, [maskq]
   5098    mova               ymm5, [maskq+tmpq]
   5099    add               maskq, 8*4
   5100    vpblendd           ymm0, ymm2, 0x30
   5101    vpblendd           ymm1, ymm3, 0xc0
   5102    psubb              ymm3, ym6, ymm4
   5103    punpcklbw          ymm0, ymm5
   5104    punpcklbw          ymm2, ymm3, ymm4
   5105    punpckhbw          ymm1, ymm5
   5106    punpckhbw          ymm3, ymm4
   5107    pmaddubsw          ymm0, ymm2
   5108    pmaddubsw          ymm1, ymm3
   5109    pmulhrsw           ymm0, ym7
   5110    pmulhrsw           ymm1, ym7
   5111    packuswb           ymm0, ymm1
   5112    vextracti128       xmm1, ymm0, 1
   5113    movq       [dstq+dsq*0], xmm0
   5114    movhps     [dstq+dsq*1], xmm0
   5115    movq       [dstq+dsq*2], xmm1
   5116    movhps     [dstq+r6   ], xmm1
   5117    lea                dstq, [dstq+dsq*4]
   5118    sub                  hd, 4
   5119    jg .w8
   5120    vzeroupper
   5121    RET
   5122 .w16:
   5123    mova                xm1, [dstq+dsq*0]
   5124    vinserti32x4        ym1, [dstq+dsq*1], 1
   5125    vinserti32x4         m1, [dstq+dsq*2], 2
   5126    mova                 m4, [maskq]
   5127    vinserti32x4         m1, [dstq+r6   ], 3
   5128    mova                 m5, [maskq+tmpq]
   5129    add               maskq, 16*4
   5130    psubb                m3, m6, m4
   5131    punpcklbw            m0, m1, m5
   5132    punpcklbw            m2, m3, m4
   5133    punpckhbw            m1, m5
   5134    punpckhbw            m3, m4
   5135    pmaddubsw            m0, m2
   5136    pmaddubsw            m1, m3
   5137    pmulhrsw             m0, m7
   5138    pmulhrsw             m1, m7
   5139    packuswb             m0, m1
   5140    mova          [dstq+dsq*0], xm0
   5141    vextracti32x4 [dstq+dsq*1], ym0, 1
   5142    vextracti32x4 [dstq+dsq*2], m0, 2
   5143    vextracti32x4 [dstq+r6   ], m0, 3
   5144    lea                dstq, [dstq+dsq*4]
   5145    sub                  hd, 4
   5146    jg .w16
   5147    RET
   5148 .w32:
   5149    mova                ym1, [dstq+dsq*0]
   5150    vinserti32x8         m1, [dstq+dsq*1], 1
   5151    mova                 m4, [maskq]
   5152    mova                 m5, [maskq+tmpq]
   5153    add               maskq, 32*2
   5154    psubb                m3, m6, m4
   5155    punpcklbw            m0, m1, m5
   5156    punpcklbw            m2, m3, m4
   5157    punpckhbw            m1, m5
   5158    punpckhbw            m3, m4
   5159    pmaddubsw            m0, m2
   5160    pmaddubsw            m1, m3
   5161    pmulhrsw             m0, m7
   5162    pmulhrsw             m1, m7
   5163    packuswb             m0, m1
   5164    mova          [dstq+dsq*0], ym0
   5165    vextracti32x8 [dstq+dsq*1], m0, 1
   5166    lea                dstq, [dstq+dsq*2]
   5167    sub                  hd, 2
   5168    jg .w32
   5169    RET
   5170 
   5171 cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
; OBMC vertical-edge blend: dst[x] = (dst[x]*(64-m[x]) + tmp[x]*m[x] + 32) >> 6
; The mask depends only on the x position, so each width case loads it once
; and reuses it for every row. obmc_masks stores interleaved (64-m, m) byte
; pairs (e.g. 45,19 / 64,0 — each pair sums to 64), which pmaddubsw combines
; with interleaved (dst, tmp) bytes; pmulhrsw with 512 then performs the
; rounded shift: (x*512 + 0x4000) >> 15 == (x + 32) >> 6.
   5172 %define base r5-blend_v_avx512icl_table
   5173    lea                  r5, [blend_v_avx512icl_table]
   5174    tzcnt                wd, wm                 ; jump-table index = log2(w)
   5175    movifnidn            hd, hm
   5176    movsxd               wq, [r5+wq*4]
   5177    vpbroadcastd         m5, [base+pw_512]      ; pmulhrsw rounding constant
   5178    add                  wq, r5
   5179    add               maskq, obmc_masks-blend_v_avx512icl_table ; maskq (r5) -> obmc_masks
   5180    jmp                  wq
   5181 .w2:
   5182    vpbroadcastd       xmm2, [maskq+2*2]       ; (64-m, m) pairs for w=2, fixed for all rows
   5183 .w2_s0_loop:
   5184    movd               xmm0, [dstq+dsq*0]
   5185    pinsrw             xmm0, [dstq+dsq*1], 1   ; pack 2 rows of dst into one reg
   5186    movd               xmm1, [tmpq]
   5187    add                tmpq, 2*2
   5188    punpcklbw          xmm0, xmm1              ; interleave dst/tmp bytes
   5189    pmaddubsw          xmm0, xmm2              ; dst*(64-m) + tmp*m
   5190    pmulhrsw           xmm0, xm5               ; (x + 32) >> 6
   5191    packuswb           xmm0, xmm0
   5192    pextrw     [dstq+dsq*0], xmm0, 0
   5193    pextrw     [dstq+dsq*1], xmm0, 1
   5194    lea                dstq, [dstq+dsq*2]
   5195    sub                  hd, 2
   5196    jg .w2_s0_loop
   5197    RET
   5198 .w4:
   5199    vpbroadcastq       xmm2, [maskq+4*2]       ; mask pairs for w=4
   5200 .w4_loop:
   5201    movd               xmm0, [dstq+dsq*0]
   5202    pinsrd             xmm0, [dstq+dsq*1], 1
   5203    movq               xmm1, [tmpq]
   5204    add                tmpq, 4*2
   5205    punpcklbw          xmm0, xmm1
   5206    pmaddubsw          xmm0, xmm2
   5207    pmulhrsw           xmm0, xm5
   5208    packuswb           xmm0, xmm0
   5209    movd       [dstq+dsq*0], xmm0
   5210    pextrd     [dstq+dsq*1], xmm0, 1
   5211    lea                dstq, [dstq+dsq*2]
   5212    sub                  hd, 2
   5213    jg .w4_loop
   5214    RET
   5215 .w8:
   5216    mova               xmm3, [maskq+8*2]       ; mask pairs for w=8
   5217 .w8_loop:
   5218    movq               xmm0, [dstq+dsq*0]
   5219    vpbroadcastq       xmm1, [dstq+dsq*1]
   5220    mova               xmm2, [tmpq]
   5221    add                tmpq, 8*2
   5222    punpcklbw          xmm0, xmm2              ; row 0 with low tmp half
   5223    punpckhbw          xmm1, xmm2              ; row 1 with high tmp half
   5224    pmaddubsw          xmm0, xmm3
   5225    pmaddubsw          xmm1, xmm3
   5226    pmulhrsw           xmm0, xm5
   5227    pmulhrsw           xmm1, xm5
   5228    packuswb           xmm0, xmm1
   5229    movq       [dstq+dsq*0], xmm0
   5230    movhps     [dstq+dsq*1], xmm0
   5231    lea                dstq, [dstq+dsq*2]
   5232    sub                  hd, 2
   5233    jg .w8_loop
   5234    RET
   5235 .w16:
   5236    vbroadcasti32x4     ym3, [maskq+16*2]      ; mask pairs feeding punpcklbw output
   5237    vbroadcasti32x4     ym4, [maskq+16*3]      ; mask pairs feeding punpckhbw output
   5238 .w16_loop:
   5239    mova                xm1, [dstq+dsq*0]
   5240    vinserti32x4        ym1, [dstq+dsq*1], 1   ; 2 rows in one ymm
   5241    mova                ym2, [tmpq]
   5242    add                tmpq, 16*2
   5243    punpcklbw           ym0, ym1, ym2
   5244    punpckhbw           ym1, ym2
   5245    pmaddubsw           ym0, ym3
   5246    pmaddubsw           ym1, ym4
   5247    pmulhrsw            ym0, ym5
   5248    pmulhrsw            ym1, ym5
   5249    packuswb            ym0, ym1
   5250    mova          [dstq+dsq*0], xm0
   5251    vextracti32x4 [dstq+dsq*1], m0, 1
   5252    lea                dstq, [dstq+dsq*2]
   5253    sub                  hd, 2
   5254    jg .w16_loop
   5255    RET
   5256 .w32:
   5257    mova                 m4, [maskq+32*2]
   5258    vshufi32x4           m3, m4, m4, q2020      ; reorder mask lanes to match punpcklbw
   5259    vshufi32x4           m4, m4, q3131          ; ... and punpckhbw in-lane interleaving
   5260 .w32_loop:
   5261    mova                ym1, [dstq+dsq*0]
   5262    vinserti32x8         m1, [dstq+dsq*1], 1   ; 2 rows in one zmm
   5263    mova                 m2, [tmpq]
   5264    add                tmpq, 32*2
   5265    punpcklbw            m0, m1, m2
   5266    punpckhbw            m1, m2
   5267    pmaddubsw            m0, m3
   5268    pmaddubsw            m1, m4
   5269    pmulhrsw             m0, m5
   5270    pmulhrsw             m1, m5
   5271    packuswb             m0, m1
   5272    mova          [dstq+dsq*0], ym0
   5273    vextracti32x8 [dstq+dsq*1], m0, 1
   5274    lea                dstq, [dstq+dsq*2]
   5275    sub                  hd, 2
   5276    jg .w32_loop
   5277    RET
   5278 
   5279 cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
; OBMC horizontal-edge blend: dst = (dst*(64-m) + tmp*m + 32) >> 6, where the
; (64-m, m) mask byte pair is selected per ROW from the h-specific obmc_masks
; run and broadcast across the full width. Only the first h*3/4 rows are
; blended (computed below); the loop counter hq runs from -(h*3/4) up to 0.
   5280 %define base r6-blend_h_avx512icl_table
   5281    lea                  r6, [blend_h_avx512icl_table]
   5282    tzcnt                wd, wm                 ; jump-table index = log2(w)
   5283    mov                  hd, hm
   5284    movsxd               wq, [r6+wq*4]
   5285    lea               maskq, [base+obmc_masks+hq*2] ; mask run for this h (2 bytes/row)
   5286    vpbroadcastd         m5, [base+pw_512]      ; pmulhrsw x*512 == (x+32)>>6
   5287    lea                  hd, [hq*3]
   5288    add                  wq, r6
   5289    shr                  hd, 2 ; h * 3/4
   5290    lea               maskq, [maskq+hq*2]      ; point past the rows we will blend...
   5291    neg                  hq                     ; ...so [maskq+hq*2] indexes row 0 at hq=-3h/4
   5292    jmp                  wq
   5293 .w2:
   5294    movd               xmm0, [dstq+dsq*0]
   5295    pinsrw             xmm0, [dstq+dsq*1], 1   ; 2 rows of dst
   5296    movd               xmm2, [maskq+hq*2]      ; mask pairs for these 2 rows
   5297    movd               xmm1, [tmpq]
   5298    add                tmpq, 2*2
   5299    punpcklwd          xmm2, xmm2              ; duplicate each row's pair across its pixels
   5300    punpcklbw          xmm0, xmm1              ; interleave dst/tmp bytes
   5301    pmaddubsw          xmm0, xmm2              ; dst*(64-m) + tmp*m
   5302    pmulhrsw           xmm0, xm5               ; (x + 32) >> 6
   5303    packuswb           xmm0, xmm0
   5304    pextrw     [dstq+dsq*0], xmm0, 0
   5305    pextrw     [dstq+dsq*1], xmm0, 1
   5306    lea                dstq, [dstq+dsq*2]
   5307    add                  hq, 2
   5308    jl .w2
   5309    RET
   5310 .w4:
   5311    mova               xmm3, [blend_shuf]      ; shuffle: spread per-row pair across 4 pixels
   5312 .w4_loop:
   5313    movd               xmm0, [dstq+dsq*0]
   5314    pinsrd             xmm0, [dstq+dsq*1], 1
   5315    movd               xmm2, [maskq+hq*2]
   5316    movq               xmm1, [tmpq]
   5317    add                tmpq, 4*2
   5318    pshufb             xmm2, xmm3
   5319    punpcklbw          xmm0, xmm1
   5320    pmaddubsw          xmm0, xmm2
   5321    pmulhrsw           xmm0, xm5
   5322    packuswb           xmm0, xmm0
   5323    movd       [dstq+dsq*0], xmm0
   5324    pextrd     [dstq+dsq*1], xmm0, 1
   5325    lea                dstq, [dstq+dsq*2]
   5326    add                  hq, 2
   5327    jl .w4_loop
   5328    RET
   5329 .w8:
   5330    vbroadcasti128     ymm4, [blend_shuf]
   5331    shufpd             ymm4, ymm4, 0x03        ; swap halves: mask rows follow swapped lane order
   5332 .w8_loop:
; Note: the two rows are placed in swapped lane order (row 1 low, row 0 high);
; the tmp loads, mask shuffle and the final movhps/movq stores all match.
   5333    vpbroadcastq       ymm1, [dstq+dsq*0]
   5334    movq               xmm0, [dstq+dsq*1]
   5335    vpblendd           ymm0, ymm1, 0x30
   5336    vpbroadcastd       ymm3, [maskq+hq*2]
   5337    movq               xmm1, [tmpq+8*1]
   5338    vinserti128        ymm1, [tmpq+8*0], 1
   5339    add                tmpq, 8*2
   5340    pshufb             ymm3, ymm4
   5341    punpcklbw          ymm0, ymm1
   5342    pmaddubsw          ymm0, ymm3
   5343    pmulhrsw           ymm0, ym5
   5344    vextracti128       xmm1, ymm0, 1
   5345    packuswb           xmm0, xmm1
   5346    movhps     [dstq+dsq*0], xmm0
   5347    movq       [dstq+dsq*1], xmm0
   5348    lea                dstq, [dstq+dsq*2]
   5349    add                  hq, 2
   5350    jl .w8_loop
   5351    vzeroupper
   5352    RET
   5353 .w16:
   5354    vbroadcasti32x4     ym4, [blend_shuf]
   5355    shufpd              ym4, ym4, 0x0c          ; row 0 pair -> low lane, row 1 pair -> high lane
   5356 .w16_loop:
   5357    mova                xm1, [dstq+dsq*0]
   5358    vinserti32x4        ym1, [dstq+dsq*1], 1
   5359    vpbroadcastd        ym3, [maskq+hq*2]
   5360    mova                ym2, [tmpq]
   5361    add                tmpq, 16*2
   5362    pshufb              ym3, ym4
   5363    punpcklbw           ym0, ym1, ym2
   5364    punpckhbw           ym1, ym2
   5365    pmaddubsw           ym0, ym3
   5366    pmaddubsw           ym1, ym3
   5367    pmulhrsw            ym0, ym5
   5368    pmulhrsw            ym1, ym5
   5369    packuswb            ym0, ym1
   5370    mova          [dstq+dsq*0], xm0
   5371    vextracti32x4 [dstq+dsq*1], m0, 1
   5372    lea                dstq, [dstq+dsq*2]
   5373    add                  hq, 2
   5374    jl .w16_loop
   5375    RET
   5376 .w32:
   5377    vbroadcasti32x4      m4, [blend_shuf]
   5378    shufpd               m4, m4, 0xf0           ; row 0 pair -> low 256, row 1 pair -> high 256
   5379 .w32_loop:
   5380    mova                ym1, [dstq+dsq*0]
   5381    vinserti32x8         m1, [dstq+dsq*1], 1
   5382    vpbroadcastd         m3, [maskq+hq*2]
   5383    mova                 m2, [tmpq]
   5384    add                tmpq, 32*2
   5385    pshufb               m3, m4
   5386    punpcklbw            m0, m1, m2
   5387    punpckhbw            m1, m2
   5388    pmaddubsw            m0, m3
   5389    pmaddubsw            m1, m3
   5390    pmulhrsw             m0, m5
   5391    pmulhrsw             m1, m5
   5392    packuswb             m0, m1
   5393    mova          [dstq+dsq*0], ym0
   5394    vextracti32x8 [dstq+dsq*1], m0, 1
   5395    lea                dstq, [dstq+dsq*2]
   5396    add                  hq, 2
   5397    jl .w32_loop
   5398    RET
   5399 .w64:
; One full 64-pixel row per iteration; the mask pair is broadcast to all words.
   5400    vpbroadcastw         m3, [maskq+hq*2]
   5401    mova                 m1, [dstq]
   5402    mova                 m2, [tmpq]
   5403    add                tmpq, 32*2
   5404    punpcklbw            m0, m1, m2
   5405    punpckhbw            m1, m2
   5406    pmaddubsw            m0, m3
   5407    pmaddubsw            m1, m3
   5408    pmulhrsw             m0, m5
   5409    pmulhrsw             m1, m5
   5410    packuswb             m0, m1
   5411    mova             [dstq], m0
   5412    add                dstq, dsq
   5413    inc                  hq
   5414    jl .w64
   5415    RET
   5416 .w128:
; 128 pixels per row, processed as two 64-byte halves (m6 holds the mask since
; m1-m4 are all in use here).
   5417    vpbroadcastw         m6, [maskq+hq*2]
   5418    mova                 m2, [dstq+64*0]
   5419    mova                 m1, [tmpq+64*0]
   5420    mova                 m3, [dstq+64*1]
   5421    mova                 m4, [tmpq+64*1]
   5422    add                tmpq, 64*2
   5423    punpcklbw            m0, m2, m1
   5424    punpckhbw            m2, m1
   5425    pmaddubsw            m0, m6
   5426    pmaddubsw            m2, m6
   5427    punpcklbw            m1, m3, m4
   5428    punpckhbw            m3, m4
   5429    pmaddubsw            m1, m6
   5430    pmaddubsw            m3, m6
   5431    REPX   {pmulhrsw x, m5}, m0, m2, m1, m3
   5432    packuswb             m0, m2
   5433    packuswb             m1, m3
   5434    mova        [dstq+64*0], m0
   5435    mova        [dstq+64*1], m1
   5436    add                dstq, dsq
   5437    inc                  hq
   5438    jl .w128
   5439    RET
   5440 
   5441 cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
   5442                                dst_w, h, src_w, dx, mx0
; Horizontal resize of h rows: each output pixel applies an 8-tap filter whose
; phase is chosen from the fractional source position. Positions are tracked
; in 14-bit fixed point: src_x = mx >> 14, filter phase = (mx >> 8) & 63,
; mx advancing by dx per output pixel. 16 output pixels per inner iteration.
; NOTE(review): each .loop_x iteration stores a full 16-byte chunk; assumes
; dst rows are padded when dst_w is not a multiple of 16 — confirm callers.
   5443    sub          dword mx0m, 4<<14  ; bias mx so src_x addresses the leftmost tap
   5444    sub        dword src_wm, 8      ; clamp limit keeps 8-pixel tap loads in bounds
   5445    mov                  r6, ~0
   5446    vpbroadcastd         m5, dxm
   5447    vpbroadcastd         m8, mx0m
   5448    vpbroadcastd         m6, src_wm
   5449    kmovq                k3, r6     ; all-ones mask, reloaded before every gather
   5450 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
   5451    LEA                  r7, $$
   5452 %define base r7-$$
   5453    vpbroadcastd         m3, [base+pw_m256]
   5454    vpbroadcastd         m7, [base+pd_63]  ; 6-bit filter-phase mask
   5455    vbroadcasti32x4     m15, [base+pb_8x0_8x8]
   5456    vpdpwssd             m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
   5457    pslld                m5, 4                      ; dx*16
   5458    pslld                m6, 14                     ; (src_w-8) in 14-bit fixed point
   5459    pxor                 m2, m2                     ; zero, reused as max(,0) bound
   5460    mova                m16, [base+resize_permA]
   5461    mova                m17, [base+resize_permB]
   5462    mova               xm18, [base+resize_permC]
   5463 .loop_y:
   5464    xor                  xd, xd
   5465    mova                 m4, m8     ; per-line working version of mx
   5466 .loop_x:
   5467    pmaxsd               m0, m4, m2
   5468    psrad                m9, m4, 8  ; filter offset (unmasked)
   5469    pminsd               m0, m6     ; iclip(mx, 0, src_w-8)
   5470    psubd                m1, m4, m0 ; pshufb offset
   5471    psrad                m0, 14     ; clipped src_x offset
   5472    psrad                m1, 14     ; pshufb edge_emu offset
   5473    vptestmd             k4, m1, m1
   5474    pand                 m9, m7     ; filter offset (masked)
   5475    ktestw               k4, k4
   5476    jz .load                        ; fast path: no pixel was clipped at an edge
   5477 ; Edge path: gather 8 source bytes per pixel as qwords, then pshufb with a
; per-pixel shuffle from resize_shuf to replicate the boundary pixels.
   5477    vextracti32x8      ym12, m0, 1
   5478    vextracti32x8      ym13, m1, 1
   5479    kmovq                k1, k3
   5480    kmovq                k2, k3
   5481    vpgatherdq      m10{k1}, [srcq+ym0]
   5482    vpgatherdq      m11{k2}, [srcq+ym12]
   5483    kmovq                k1, k3
   5484    kmovq                k2, k3
   5485    vpgatherdq      m14{k1}, [base+resize_shuf+4+ym1]
   5486    vpgatherdq       m0{k2}, [base+resize_shuf+4+ym13]
   5487    mova                m12, m16
   5488    mova                m13, m17
   5489    paddb               m14, m15
   5490    paddb                m0, m15
   5491    pshufb              m10, m14
   5492    pshufb              m11, m0
   5493    vpermi2d            m12, m10, m11  ; repack into the same layout .load produces
   5494    vpermi2d            m13, m10, m11
   5495    jmp .filter
   5496 .load:
   5497    kmovq                k1, k3
   5498    kmovq                k2, k3
   5499    vpgatherdd      m12{k1}, [srcq+m0+0]  ; taps 0-3 of each pixel
   5500    vpgatherdd      m13{k2}, [srcq+m0+4]  ; taps 4-7
   5501 .filter:
   5502    kmovq                k1, k3
   5503    kmovq                k2, k3
   5504    vpgatherdd      m10{k1}, [base+resize_filter+m9*8+0] ; 8 coeffs (8 bytes) per pixel
   5505    vpgatherdd      m11{k2}, [base+resize_filter+m9*8+4]
   5506    mova                m14, m2
   5507    vpdpbusd            m14, m12, m10  ; VNNI dot product: u8 pixels x s8 coeffs
   5508    vpdpbusd            m14, m13, m11
   5509    packssdw            m14, m14
   5510    pmulhrsw            m14, m3        ; *-256 rounded; NOTE(review): assumes coeffs stored negated — confirm resize_filter
   5511    packuswb            m14, m14
   5512    vpermd              m14, m18, m14  ; restore linear pixel order after the packs
   5513    mova          [dstq+xq], xm14      ; 16 output pixels
   5514    paddd                m4, m5        ; mx += dx*16
   5515    add                  xd, 16
   5516    cmp                  xd, dst_wd
   5517    jl .loop_x
   5518    add                dstq, dst_strideq
   5519    add                srcq, src_strideq
   5520    dec                  hd
   5521    jg .loop_y
   5522    RET
   5523 
   5524 %endif ; ARCH_X86_64