tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

itx_avx512.asm (304805B)


      1 ; Copyright © 2020-2023, VideoLAN and dav1d authors
      2 ; Copyright © 2020-2023, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 SECTION_RODATA 64
        ; Byte-index permutation tables (64-byte aligned for full-zmm loads).
        ; Each db value is a byte index into a 64-byte register, for use with
        ; AVX-512 byte/word permute instructions.
     32 const \
     33 dup16_perm,  db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
     34             db  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
     35             db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
     36             db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
     37 const \
     38 int8_permA,  db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
     39             db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
     40             db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
     41             db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
        ; int8_permB is int8_permA with the middle two 16-byte rows swapped.
     42 int8_permB:  db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
     43             db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
     44             db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
     45             db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
     46 int16_perm:  db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
     47             db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
     48             db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
     49             db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
        ; Transform-specific output/row orderings (consumers are in later parts
        ; of this file, outside this chunk).
     50 idtx_16x4p:  db  0,  1,  4,  5, 16, 17, 20, 21,  2,  3,  6,  7, 18, 19, 22, 23
     51             db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55
     52             db  8,  9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31
     53             db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63
     54 idct_8x32p:  db 60, 61,  4,  5, 32, 33,  0,  1, 28, 29, 36, 37, 56, 57,  8,  9
     55             db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17
     56             db 62, 63,  2,  3,  6,  7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51
     57             db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35
     58 idct_16x32p: db  6,  7, 58, 59, 38, 39, 26, 27, 32, 33,  0,  1, 30, 31, 34, 35
     59             db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21
     60             db 62, 63,  2,  3, 48, 49, 16, 17, 56, 57,  8,  9, 14, 15, 50, 51
     61             db 54, 55, 10, 11, 60, 61,  4,  5, 12, 13, 52, 53, 28, 29, 36, 37
     62 end_16x32p:  db  0, 32,  1, 48,  2, 36,  3, 52, 16, 40, 17, 56, 18, 44, 19, 60
     63             db  4, 33,  5, 49,  6, 37,  7, 53, 20, 41, 21, 57, 22, 45, 23, 61
     64             db  8, 35,  9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63
     65             db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62
     66 
     67 ; packed 4-bit qword shuffle indices
        ; NOTE(review): the tail bytes of permD double as packed coefficient
        ; storage — see the %define aliases (pw_5, pd_m1, ...) further down.
     68 permA:       dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262
     69             dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373
     70             dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb
     71             dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea
     72 permB:       dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604
     73             dq 0xc824352d56128751, 0xd906171e74301e15
     74             dq 0x6271604b03472d62, 0x735342782165b426
     75             dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37
     76 permC:       dq 0x9d409d041551c2e0, 0xbf62bf263773a486
     77             dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597
     78             dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e
     79             dq 0x5115049dd9045b79, 0x733726bffb263d1f
     80 permD:       dq 0x0cda098800041504, 0x0edb09b2028c3726
     81             dq 0x0f11fa9c01150415, 0x0988f326039d2637
     82             dq 0x05640f1108269d8c, 0x05290edb0aaebfae
     83             dq 0x0005000509378c9d, 0xffffffff0bbfaebf
     84 
        ; pd_0to15: dword iota, scaled by the stride at runtime to build
        ; per-row offsets for vpgatherdd/vpscatterdd (see iadst_4x8 .end3).
     85 pd_0to15:    dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        ; dword gather index tables; consumers are outside this chunk —
        ; presumably vpgatherdd offset patterns (TODO confirm against users).
     86 gather8a:    dd  0,  2,  1,  3,  8, 10,  9, 11
     87 gather8b:    dd  0,  1,  4,  5,  8,  9, 12, 13
     88 gather8c:    dd  0,  4,  2,  6, 12,  8, 14, 10
     89 gather8d:    dd  0, 19,  1, 18,  2, 17,  3, 16
     90 
        ; 16-byte pshufb control vectors for (de)interleaving packed words.
     91 int_shuf1:   db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
     92 int_shuf2:   db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
     93 int_shuf3:   db  0,  1,  8,  9,  4,  5, 12, 13,  2,  3, 10, 11,  6,  7, 14, 15
     94 int_shuf4:   db  8,  9,  0,  1, 12, 13,  4,  5, 10, 11,  2,  3, 14, 15,  6,  7
     95 deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
     96 int_mshift:  db 12, 20,  0,  0, 44, 52,  0,  0
     97 
        ; Broadcastable rounding/scaling constants. pw_* names encode the word
        ; value (xN = value*N); pmulhrsw by pw_<v>x8 computes x*v >> 12 with
        ; rounding (pmulhrsw is (x*c*2 + 32768) >> 16, and c is pre-scaled by 8).
     98 pb_32:           times 4 db 32
     99 pw_2048:         times 2 dw 2048
    100 pw_4096:         times 2 dw 4096
    101 pw_8192:         times 2 dw 8192
    102 pw_16384:        times 2 dw 16384
    103 pw_1697x16:      times 2 dw 1697*16
    104 pw_1697x8:       times 2 dw 1697*8
    105 pw_2896x8:       times 2 dw 2896*8
    106 pd_2048:         dd  2048
    107 
        ; These constants happen to exist as byte patterns inside permD, so the
        ; %defines alias into that table instead of spending extra rodata.
    108 %define pw_5          (permD+52)
    109 %define pd_m1         (permD+60)
    110 %define pw_3803_1321  (permD+44)
    111 %define pw_2482_3803  (permD+12)
    112 %define pw_2440_3290  (permD+ 4)
    113 %define pw_m3290_2440 (permD+28)
    114 %define pw_3857_1380  (permD+36)
    115 %define pw_m1380_3857 (permD+20)
    116 
        ; Signed word pairs, broadcast as dwords ({bcstd}) for vpdpwssd
        ; (multiply-add of adjacent word pairs). Name encodes the two words,
        ; m = negative.
    117 pw_8192_m8192:   dw   8192,  -8192
    118 pw_m8192_8192:   dw  -8192,   8192
    119 pw_16384_m16384: dw  16384, -16384
    120 pw_m16384_16384: dw -16384,  16384
    121 
    122 pw_m1321_2482:   dw  -1321,  2482
    123 pw_m3344_3344:   dw  -3344,  3344
    124 pw_2482_3344:    dw   2482,  3344
    125 pw_m3803_3344:   dw  -3803,  3344
    126 pd_3344:         dd   3344
    127 pw_m1321_m3344:  dw  -1321, -3344
    128 pw_2896_m2896:   dw   2896, -2896
    129 
    130 pw_1567_m3784:   dw   1567, -3784
    131 pw_3784_m1567:   dw   3784, -1567
    132 pw_4017_m799:    dw   4017,  -799
    133 pw_2276_m3406:   dw   2276, -3406
    134 pw_m799_m4017:   dw   -799, -4017
    135 pw_m3406_m2276:  dw  -3406, -2276
    136 
        ; COEF_PAIR a, b [, extra]: emits the word pairs
        ;   pw_<a>_<b>:  ( a,  b)
        ;   pw_m<b>_<a>: (-b,  a)
        ; and, when the optional third arg is nonzero, additionally
        ;   pw_m<a>_m<b>: (-a, -b)
    137 %macro COEF_PAIR 2-3 0
    138 pw_%1_%2:   dw  %1,  %2
    139 pw_m%2_%1:  dw -%2,  %1
    140 %if %3
    141 pw_m%1_m%2: dw -%1, -%2
    142 %endif
    143 %endmacro
    144 
    145 COEF_PAIR 2896, 2896
    146 COEF_PAIR 1567, 3784, 1
    147 COEF_PAIR 3784, 1567
    148 COEF_PAIR  201, 4091
    149 COEF_PAIR  995, 3973
    150 COEF_PAIR 1751, 3703
    151 COEF_PAIR 3035, 2751
    152 COEF_PAIR 3513, 2106
    153 COEF_PAIR 4052,  601
    154 COEF_PAIR 3166, 2598, 1
    155 COEF_PAIR 3920, 1189, 1
    156 COEF_PAIR 2276, 3406
    157 COEF_PAIR 4017,  799
    158 
        ; COEF_X8 v...: for each argument, emits the duplicated word pair
        ; (v*8, v*8) — the x8 pre-scale used by pmulhrsw-based multiplies.
    159 %macro COEF_X8 1-*
    160 %rep %0
    161    dw %1*8, %1*8
    162    %rotate 1
    163 %endrep
    164 %endmacro
    165 
    166 pw_m2276x8: COEF_X8 -2276
    167 pw_3406x8:  COEF_X8  3406
    168 pw_4017x8:  COEF_X8  4017
    169 pw_799x8:   COEF_X8   799
    170 pw_3784x8:  COEF_X8  3784
    171 pw_1567x8:  COEF_X8  1567
    172 
    173 pw_4076x8:  COEF_X8  4076
    174 pw_401x8:   COEF_X8   401
    175 pw_m2598x8: COEF_X8 -2598
    176 pw_3166x8:  COEF_X8  3166
    177 pw_3612x8:  COEF_X8  3612
    178 pw_1931x8:  COEF_X8  1931
    179 pw_m1189x8: COEF_X8 -1189
    180 pw_3920x8:  COEF_X8  3920
    181 
    182 pw_4091x8:  COEF_X8  4091
    183 pw_201x8:   COEF_X8   201
    184 pw_m2751x8: COEF_X8 -2751
    185 pw_3035x8:  COEF_X8  3035
    186 pw_3703x8:  COEF_X8  3703
    187 pw_1751x8:  COEF_X8  1751
    188 pw_m1380x8: COEF_X8 -1380
    189 pw_3857x8:  COEF_X8  3857
    190 pw_3973x8:  COEF_X8  3973
    191 pw_995x8:   COEF_X8   995
    192 pw_m2106x8: COEF_X8 -2106
    193 pw_3513x8:  COEF_X8  3513
    194 pw_3290x8:  COEF_X8  3290
    195 pw_2440x8:  COEF_X8  2440
    196 pw_m601x8:  COEF_X8  -601
    197 pw_4052x8:  COEF_X8  4052
    198 
        ; Mixed word pairs: first word negated-or-not, both pre-scaled by 8.
    199 pw_401_4076x8:   dw   401*8, 4076*8
    200 pw_m2598_3166x8: dw -2598*8, 3166*8
    201 pw_1931_3612x8:  dw  1931*8, 3612*8
    202 pw_m1189_3920x8: dw -1189*8, 3920*8
    203 pw_799_4017x8:   dw   799*8, 4017*8
    204 pw_m2276_3406x8: dw -2276*8, 3406*8
    205 
    206 pw_201_4091x8:   dw   201*8, 4091*8
    207 pw_m601_4052x8:  dw  -601*8, 4052*8
    208 pw_995_3973x8:   dw   995*8, 3973*8
    209 pw_m1380_3857x8: dw -1380*8, 3857*8
    210 pw_1751_3703x8:  dw  1751*8, 3703*8
    211 pw_m2106_3513x8: dw -2106*8, 3513*8
    212 pw_2440_3290x8:  dw  2440*8, 3290*8
    213 pw_m2751_3035x8: dw -2751*8, 3035*8
    214 
    215 pw_101_4095x8:   dw   101*8, 4095*8
    216 pw_m2824_2967x8: dw -2824*8, 2967*8
    217 pw_1660_3745x8:  dw  1660*8, 3745*8
    218 pw_m1474_3822x8: dw -1474*8, 3822*8
    219 pw_897_3996x8:   dw   897*8, 3996*8
    220 pw_m2191_3461x8: dw -2191*8, 3461*8
    221 pw_2359_3349x8:  dw  2359*8, 3349*8
    222 pw_m700_4036x8:  dw  -700*8, 4036*8
    223 pw_501_4065x8:   dw   501*8, 4065*8
    224 pw_m2520_3229x8: dw -2520*8, 3229*8
    225 pw_2019_3564x8:  dw  2019*8, 3564*8
    226 pw_m1092_3948x8: dw -1092*8, 3948*8
    227 pw_1285_3889x8:  dw  1285*8, 3889*8
    228 pw_m1842_3659x8: dw -1842*8, 3659*8
    229 pw_2675_3102x8:  dw  2675*8, 3102*8
    230 pw_m301_4085x8:  dw  -301*8, 4085*8
    231 
        ; idct64 coefficient layout: COEF_X8/COEF_PAIR/raw dw rows are
        ; deliberately interleaved so the labelled pairs emitted in between
        ; land at their usual pw_* addresses while idct64_mul stays one
        ; contiguous table. NOTE(review): consumers are outside this chunk.
    232 idct64_mul: COEF_X8  4095,   101,  2967, -2824,  3745,  1660,  3822, -1474
    233 COEF_PAIR  401, 4076, 1
    234 COEF_PAIR  799, 4017
    235            COEF_X8  -700,  4036,  2359,  3349, -2191,  3461,   897,  3996
    236 dw    -2598, -3166,  3166, -2598,  2598,  3166, -4017,  -799,   799, -4017
    237            COEF_X8  4065,   501,  3229, -2520,  3564,  2019,  3948, -1092
    238 COEF_PAIR 1931, 3612, 1
    239 COEF_PAIR 3406, 2276
    240            COEF_X8  -301,  4085,  2675,  3102, -1842,  3659,  1285,  3889
    241 dw    -1189, -3920,  3920, -1189,  1189,  3920, -2276, -3406,  3406, -2276
    242 
    243 SECTION .text
    244 
        ; r5 (baseq) is loaded with o_base by INV_TXFM_FN; o(x) then addresses
        ; rodata relative to r5. Anchoring at int8_permA+64*18 centers the
        ; constant pool so most o() displacements stay small.
    245 %define o_base int8_permA+64*18
    246 %define o(x) (r5 - (o_base) + (x))
        ; m(x): full mangled symbol name for function x (for call/jmp targets).
    247 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
    248 
    249 ; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
    250 ;        16 = special_mul1, 32 = special_mul2
        ;
        ; Packed butterfly rotation of word pairs in m%1 by (coef1, coef2),
        ; using vpdpwssd (AVX512-VNNI dot-product of adjacent words) with
        ; rounding from m%4 (pd_2048), then >> 12 and repack to words.
        ; coef args < 32 are treated as mmreg numbers holding the coefficient
        ; pair; otherwise they are literal coefficients broadcast from rodata.
    251 %macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
    252    mova                m%2, m%4        ; seed both accumulators with rnd
    253 %if %7 & 16
    254    vpdpwssd            m%2, m%1, [o(pw_%5)] {bcstd}
    255    mova                m%3, m%4
    256 %if %7 & 32
    257    vpdpwssd            m%3, m%1, [o(pw_%6)] {bcstd}
    258 %else
    259    vpdpwssd            m%3, m%1, m%6
    260 %endif
    261 %elif %7 & 32
    262    vpdpwssd            m%2, m%1, m%5
    263    mova                m%3, m%4
    264    vpdpwssd            m%3, m%1, [o(pw_%6)] {bcstd}
    265 %elif %6 < 32                           ; coefficients passed in registers
    266    vpdpwssd            m%2, m%1, m%5
    267    mova                m%3, m%4
    268    vpdpwssd            m%3, m%1, m%6
    269 %elif %7 & 1                            ; swapped output order
    270    vpdpwssd            m%2, m%1, [o(pw_%5_%6)] {bcstd}
    271    mova                m%3, m%4
    272    vpdpwssd            m%3, m%1, [o(pw_m%6_%5)] {bcstd}
    273 %else
    274    vpdpwssd            m%2, m%1, [o(pw_m%6_%5)] {bcstd}
    275    mova                m%3, m%4
    276    vpdpwssd            m%3, m%1, [o(pw_%5_%6)] {bcstd}
    277 %endif
    278 %if %7 & 2
        ; interleave via funnel shift: drop the 12 fraction bits of each half
        ; and merge the two dword streams into packed words
    279    psrld               m%2, 12
    280    pslld               m%3, 4
    281    vpshrdd             m%1, m%3, m%2, 16
    282 %elif %7 & 4
    283    ; compared to using shifts (as above) this has better throughput,
    284    ; but worse latency and requires setting up the opmask/index
    285    ; registers, so only use this method for the larger transforms
        ; NOTE(review): k7 and m13 must be preset by the caller for this path
    286    pslld               m%1, m%2, 4
    287    vpmultishiftqb  m%1{k7}, m13, m%3
    288 %else
    289    psrad               m%2, 12
    290    psrad               m%3, 12
    291 %if %7 & 8 == 0
    292    packssdw            m%1, m%3, m%2
    293 %endif
    294 %endif
    295 %endmacro
    296 
    297 ; flags: same as ITX_MUL2X_PACK
        ; Four-coefficient variant: builds per-lane coefficient registers by
        ; broadcasting one pair and overlaying the other under opmask k1
        ; (k1 must be preset by the caller), then defers to ITX_MUL2X_PACK
        ; with the coefficients passed as registers (%4/%5 < 32).
    298 %macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags
    299 %if %11 & 1
    300    vpbroadcastd        m%4, [o(pw_%9_%10)]
    301    vpbroadcastd    m%4{k1}, [o(pw_%7_%8)]
    302    vpbroadcastd        m%5, [o(pw_m%10_%9)]
    303    vpbroadcastd    m%5{k1}, [o(pw_m%8_%7)]
    304 %else
    305    vpbroadcastd        m%4, [o(pw_m%10_%9)]
    306    vpbroadcastd    m%4{k1}, [o(pw_m%8_%7)]
    307    vpbroadcastd        m%5, [o(pw_%9_%10)]
    308    vpbroadcastd    m%5{k1}, [o(pw_%7_%8)]
    309 %endif
    310    ITX_MUL2X_PACK       %1, %2, %3, %6, %4, %5, %11
    311 %endmacro
    312 
    313 ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
    314 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
        ; Unpacked (two-register) butterfly rotation. m%5 holds the dword
        ; rounding constant. coef2 (%7) < 32 means the coefficient pairs are
        ; passed in registers; otherwise they are literal values broadcast
        ; from rodata via {bcstd}.
    315 %macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
    316    punpcklwd           m%3, m%2, m%1   ; interleave src2/src1 word pairs
    317    punpckhwd           m%2, m%1
    318 %if %7 < 32
    319    mova                m%1, m%5
    320    vpdpwssd            m%1, m%3, m%7
    321    mova                m%4, m%5
    322    vpdpwssd            m%4, m%2, m%7
    323 %else
    324    mova                m%1, m%5
    325    vpdpwssd            m%1, m%3, [o(pw_m%7_%6)] {bcstd}
    326    mova                m%4, m%5
    327    vpdpwssd            m%4, m%2, [o(pw_m%7_%6)] {bcstd}
    328 %endif
    329    psrad               m%1, 12
    330    psrad               m%4, 12
    331    packssdw            m%1, m%4        ; dst1
    332    mova                m%4, m%5
    333 %if %7 < 32
    334    vpdpwssd            m%4, m%2, m%6
    335    mova                m%2, m%5
    336    vpdpwssd            m%2, m%3, m%6
    337 %else
    338    vpdpwssd            m%4, m%2, [o(pw_%6_%7)] {bcstd}
    339    mova                m%2, m%5
    340    vpdpwssd            m%2, m%3, [o(pw_%6_%7)] {bcstd}
    341 %endif
    342    psrad               m%4, 12
    343    psrad               m%2, 12
    344 %if %0 == 8                             ; optional separate dst2 register
    345    packssdw            m%8, m%2, m%4
    346 %else
    347    packssdw            m%2, m%4        ; dst2 in-place
    348 %endif
    349 %endmacro
    350 
        ; WRAP_XMM/WRAP_YMM: execute the wrapped macro invocation with the
        ; m# register aliases temporarily switched to xmm/ymm width, then
        ; restore the previous (zmm/ymm) register mapping. AVX512_MM_PERMUTATION
        ; keeps the m# names mapped onto the EVEX-only registers (xmm16+).
    351 %macro WRAP_XMM 1+
    352    %xdefine %%reset RESET_MM_PERMUTATION
    353    INIT_XMM cpuname
    354    DEFINE_MMREGS xmm
    355    AVX512_MM_PERMUTATION
    356    %1
    357    %%reset                              ; restore caller's mm mapping
    358 %endmacro
    359 
    360 %macro WRAP_YMM 1+
    361    INIT_YMM cpuname
    362    %1
    363    INIT_ZMM cpuname
    364 %endmacro
    365 
        ; ITX4_END row1-4 [, rnd]: final stage of a 4x4 transform. Rounds
        ; m0/m1 (two output rows per register) by pw_<rnd> via pmulhrsw,
        ; adds them to the 4-byte dst rows given by the row arguments
        ; (row order parameterized so flipped transforms can reuse this),
        ; packs back to bytes and stores. Ends with ret (used as a tail).
    366 %macro ITX4_END 4-5 2048 ; row[1-4], rnd
    367 %if %5
    368    vpbroadcastd         m2, [o(pw_%5)]
    369    pmulhrsw             m0, m2
    370    pmulhrsw             m1, m2
    371 %endif
    372    lea                  r2, [dstq+strideq*2]
        ; compute the four destination row addresses: bit 1 of each row index
        ; selects dst vs dst+2*stride, bit 0 adds one stride
    373 %assign %%i 1
    374 %rep 4
    375    %if %1 & 2
    376        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
    377    %else
    378        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
    379    %endif
    380    %assign %%i %%i + 1
    381    %rotate 1
    382 %endrep
    383    movd                 m2, [%%row_adr1]  ; load dst rows as packed bytes
    384    pinsrd               m2, [%%row_adr2], 1
    385    movd                 m3, [%%row_adr3]
    386    pinsrd               m3, [%%row_adr4], 1
    387    pmovzxbw             m2, m2
    388    pmovzxbw             m3, m3
    389    paddw                m0, m2            ; add residual to prediction
    390    paddw                m1, m3
    391    packuswb             m0, m1            ; clamp to u8
    392    movd       [%%row_adr1], m0
    393    pextrd     [%%row_adr2], m0, 1
    394    pextrd     [%%row_adr3], m0, 2
    395    pextrd     [%%row_adr4], m0, 3
    396    ret
    397 %endmacro
    398 
        ; INV_TXFM_FN type1, type2, size: emits the public entry point
        ; inv_txfm_add_<t1>_<t2>_<size>_8bpc. It loads the rodata base into
        ; baseq (r5) and tx2q with the pass2 entry of the 2nd transform, then
        ; falls through into the type-specific fast-path code appended by the
        ; per-size wrapper macros (e.g. the dct_dct dc-only path).
    399 %macro INV_TXFM_FN 3 ; type1, type2, size
    400 cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base
    401    %define %%p1 m(i%1_%3_internal_8bpc)
    402    lea               baseq, [o_base]
    403    ; Jump to the 1st txfm function if we're not taking the fast path, which
    404    ; in turn performs an indirect jump to the 2nd txfm function.
    405    lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
    406 %ifidn %1_%2, dct_dct
    407    test               eobd, eobd
    408    jnz %%p1                             ; eob != 0: run the full transform
    409 %else
    410    ; jump to the 1st txfm function unless it's located directly after this
        ; (the sign bit of the label distance is 1 only for a forward target
        ; beyond %%end, so "times ... jmp" emits either 0 or 1 instructions)
    411    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
    412 ALIGN function_align
    413 %%end:
    414 %endif
    415 %endmacro
    416 
        ; 4x4 wrapper: for dct_dct this emits the dc-only fast path —
        ; broadcast the dc coefficient, scale by 2896/4096 twice (once per
        ; pass) via pmulhrsw, clear the dc coefficient by storing eobd (0),
        ; and jump to the shared store tail.
    417 %macro INV_TXFM_4X4_FN 2 ; type1, type2
    418    INV_TXFM_FN          %1, %2, 4x4
    419 %ifidn %1_%2, dct_dct
    420    vpbroadcastw         m0, [cq]
    421    vpbroadcastd         m1, [o(pw_2896x8)]
    422    pmulhrsw             m0, m1
    423    mov                [cq], eobd        ; eobd == 0 here: zero the dc coef
    424    pmulhrsw             m0, m1
    425    mova                 m1, m0
    426    jmp m(iadst_4x4_internal_8bpc).end2
    427 %endif
    428 %endmacro
    429 
        ; 4-point idct on packed rows: m0/m1 in, m0/m1 out.
        ; Odd half rotated by (1567, 3784), even half by (2896, 2896),
        ; then one butterfly produces (out0 out1) / (out3 out2).
    430 %macro IDCT4_1D_PACKED 0
    431    vpbroadcastd         m4, [o(pd_2048)]
    432    punpckhwd            m2, m1, m0      ; in3 in1
    433    punpcklwd            m1, m0          ; in2 in0
    434    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784
    435    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896
    436    paddsw               m0, m1, m2 ; out0 out1
    437    psubsw               m1, m2     ; out3 out2
    438 %endmacro
    439 
        ; 4-point iadst on packed rows: m0/m1 in, m0/m1 out. Each output is a
        ; vpdpwssd dot-product of the (in2,in0)/(in3,in1) word pairs against
        ; the adst coefficient pairs, rounded by pd_2048 and shifted >> 12.
        ; .main2 entry skips the input interleave.
    440 %macro IADST4_1D_PACKED 0
    441    punpcklwd            m4, m1, m0 ; in2 in0
    442    punpckhwd            m5, m1, m0 ; in3 in1
    443 .main2:
    444    vpbroadcastd         m3, [o(pd_2048)]
    445    mova                 m0, m3
    446    vpdpwssd             m0, m4, [o(pw_3803_1321)] {bcstd}
    447    mova                 m2, m3
    448    vpdpwssd             m2, m4, [o(pw_m1321_2482)] {bcstd}
    449    mova                 m1, m3
    450    vpdpwssd             m1, m4, [o(pw_m3344_3344)] {bcstd}
    451    vpdpwssd             m3, m4, [o(pw_2482_3803)] {bcstd}
    452    vpdpwssd             m0, m5, [o(pw_2482_3344)] {bcstd}
    453    vpdpwssd             m2, m5, [o(pw_m3803_3344)] {bcstd}
    454    vpdpwssd             m1, m5, [o(pd_3344)] {bcstd}  ; (3344, 0) pair
    455    vpdpwssd             m3, m5, [o(pw_m1321_m3344)] {bcstd}
    456    REPX      {psrad x, 12}, m0, m2, m1, m3
    457    packssdw             m0, m2 ; out0 out1
    458    packssdw             m1, m3 ; out2 out3
    459 %endmacro
    460 
    461 INIT_XMM avx512icl
    462 INV_TXFM_4X4_FN dct, dct
    463 INV_TXFM_4X4_FN dct, adst
    464 INV_TXFM_4X4_FN dct, flipadst
    465 INV_TXFM_4X4_FN dct, identity
    466 
        ; 4x4 idct pass 1: transform the two coefficient rows, then transpose
        ; via shufps+pshufb and tail-jump to the 2nd transform's .pass2.
    467 cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    468    mova                 m0, [cq+16*0]
    469    mova                 m1, [cq+16*1]
    470    IDCT4_1D_PACKED
    471    mova                 m2, [o(deint_shuf)]
    472    shufps               m3, m0, m1, q1331
    473    shufps               m0, m0, m1, q0220
    474    pshufb               m0, m2
    475    pshufb               m1, m3, m2
    476    jmp                tx2q
    477 .pass2:
    478    IDCT4_1D_PACKED
        ; NOTE(review): under AVX512_MM_PERMUTATION the m# regs live in
        ; xmm16+, so ymm16 here is the full-width view of m0's register;
        ; zeroing it clears the 32-byte coefficient buffer in one store.
    479    pxor              ymm16, ymm16
    480    mova               [cq], ymm16
    481    ITX4_END              0, 1, 3, 2
    482 
    483 INV_TXFM_4X4_FN adst, dct
    484 INV_TXFM_4X4_FN adst, adst
    485 INV_TXFM_4X4_FN adst, flipadst
    486 INV_TXFM_4X4_FN adst, identity
    487 
        ; 4x4 iadst. Pass 1: transform + word-transpose, jump to 2nd pass.
        ; Pass 2: transform, clear coefficients, store via ITX4_END.
        ; .end/.end2 are shared tails used by the other 4x4 functions.
    488 cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    489    mova                 m0, [cq+16*0]
    490    mova                 m1, [cq+16*1]
    491    call .main
    492    punpckhwd            m3, m0, m1      ; transpose 4x4 words
    493    punpcklwd            m0, m1
    494    punpckhwd            m1, m0, m3
    495    punpcklwd            m0, m3
    496    jmp                tx2q
    497 .pass2:
    498    call .main
    499 .end:
        ; ymm16 = full-width view of m0's register (regs remapped to xmm16+);
        ; one 32-byte store clears the coefficient buffer
    500    pxor              ymm16, ymm16
    501    mova               [cq], ymm16
    502 .end2:
    503    ITX4_END              0, 1, 2, 3
    504 ALIGN function_align
    505 .main:
    506    IADST4_1D_PACKED
    507    ret
    508 
    509 INV_TXFM_4X4_FN flipadst, dct
    510 INV_TXFM_4X4_FN flipadst, adst
    511 INV_TXFM_4X4_FN flipadst, flipadst
    512 INV_TXFM_4X4_FN flipadst, identity
    513 
        ; 4x4 flipadst: same 1-D transform as iadst, but the transpose swaps
        ; operand order (pass 1) and ITX4_END stores rows in reverse (pass 2)
        ; to realize the flip.
    514 cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    515    mova                 m0, [cq+16*0]
    516    mova                 m1, [cq+16*1]
    517    call m(iadst_4x4_internal_8bpc).main
    518    punpcklwd            m2, m1, m0      ; reversed-order transpose
    519    punpckhwd            m1, m0
    520    punpcklwd            m0, m1, m2
    521    punpckhwd            m1, m2
    522    jmp                tx2q
    523 .pass2:
    524    call m(iadst_4x4_internal_8bpc).main
    525 .end:
        ; clear the 32-byte coefficient buffer (regs remapped to xmm16+)
    526    pxor              ymm16, ymm16
    527    mova               [cq], ymm16
    528 .end2:
    529    ITX4_END              3, 2, 1, 0     ; rows stored bottom-to-top
    530 
    531 INV_TXFM_4X4_FN identity, dct
    532 INV_TXFM_4X4_FN identity, adst
    533 INV_TXFM_4X4_FN identity, flipadst
    534 INV_TXFM_4X4_FN identity, identity
    535 
        ; 4x4 identity: out = x + pmulhrsw(x, 1697*8) = x * (1 + 1697/4096),
        ; i.e. the sqrt(2) scaling of the 4-point identity transform,
        ; followed by a word transpose in pass 1.
    536 cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    537    mova                 m0, [cq+16*0]
    538    mova                 m1, [cq+16*1]
    539    vpbroadcastd         m3, [o(pw_1697x8)]
    540    pmulhrsw             m2, m3, m0
    541    pmulhrsw             m3, m1
    542    paddsw               m0, m2
    543    paddsw               m1, m3
    544    punpckhwd            m2, m0, m1      ; transpose 4x4 words
    545    punpcklwd            m0, m1
    546    punpckhwd            m1, m0, m2
    547    punpcklwd            m0, m2
    548    jmp                tx2q
    549 .pass2:
    550    vpbroadcastd         m3, [o(pw_1697x8)]
    551    pmulhrsw             m2, m3, m0
    552    pmulhrsw             m3, m1
    553    paddsw               m0, m2
    554    paddsw               m1, m3
    555    jmp m(iadst_4x4_internal_8bpc).end  ; shared clear-and-store tail
    556 
        ; 4x8 wrapper: the dct_dct path handles a dc-only block — three
        ; pmulhrsw by 2896x8 (rectangular-size scale plus one per pass),
        ; one by 2048 for the final rounding, then broadcast the dc value
        ; to all outputs and reuse the iadst_4x8 store tail.
    557 %macro INV_TXFM_4X8_FN 2 ; type1, type2
    558    INV_TXFM_FN          %1, %2, 4x8
    559 %ifidn %1_%2, dct_dct
    560    movd               xmm1, [o(pw_2896x8)]
    561    pmulhrsw           xmm0, xmm1, [cq]
    562    movd               xmm2, [o(pw_2048)]
    563    pmulhrsw           xmm0, xmm1
    564    pmulhrsw           xmm0, xmm1
    565    pmulhrsw           xmm0, xmm2
    566    vpbroadcastw        ym0, xmm0
    567    mova                ym1, ym0
    568    jmp m(iadst_4x8_internal_8bpc).end3
    569 %endif
    570 %endmacro
    571 
        ; 8-point idct on packed rows (two transforms per register pair when
        ; mmsize > 16). m0-m3 in, m0-m3 out; .main2 entry skips the input
        ; interleave. Odd butterflies via ITX_MUL2X_PACK with flag 3
        ; (swap + funnel-shift interleave).
    572 %macro IDCT8_1D_PACKED 0
    573    punpckhwd            m5, m3, m0 ; in7 in1
    574    punpckhwd            m4, m1, m2 ; in3 in5
    575    punpcklwd            m3, m1     ; in6 in2
    576    punpcklwd            m2, m0     ; in4 in0
    577 .main2:
    578    vpbroadcastd         m6, [o(pd_2048)]
    579    ITX_MUL2X_PACK        5, 0, 1, 6,  799, 4017, 3 ; t4a t7a
    580    ITX_MUL2X_PACK        4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
    581    ITX_MUL2X_PACK        3, 0, 1, 6, 1567, 3784    ; t3 t2
    582    psubsw               m0, m5, m4 ; t5a t6a (interleaved)
    583    paddsw               m4, m5     ; t4  t7  (interleaved)
    584    ITX_MUL2X_PACK        2, 1, 5, 6, 2896, 2896    ; t0 t1
    585    ITX_MUL2X_PACK        0, 1, 5, 6, 2896, 2896, 1 ; t6 t5
    586 %if mmsize > 16
    587    vbroadcasti32x4      m1, [o(deint_shuf)]
    588    pshufb               m4, m1
    589 %else
    590    pshufb               m4, [o(deint_shuf)]
    591 %endif
    592    psubsw               m1, m2, m3 ; tmp3 tmp2
    593    paddsw               m3, m2     ; tmp0 tmp1
    594    punpckhqdq           m2, m4, m0 ; t7 t6
    595    punpcklqdq           m4, m0     ; t4 t5
    596    paddsw               m0, m3, m2 ; out0 out1
    597    psubsw               m3, m2     ; out7 out6
    598    psubsw               m2, m1, m4 ; out4 out5
    599    paddsw               m1, m4     ; out3 out2
    600 %endmacro
    601 
        ; 8-point iadst on packed rows. %1 selects the pass-specific variant:
        ; pass 1 expects pre-interleaved inputs in m0-m3 and resolves the
        ; final 2896 stage with full vpdpwssd precision; pass 2 interleaves
        ; m2-m5 itself and finishes with a pmulhrsw by 2896x8 instead.
        ; Outputs carry alternating signs (-out odd) as noted per line.
    602 %macro IADST8_1D_PACKED 1 ; pass
    603    vpbroadcastd         m6, [o(pd_2048)]
    604 %if %1 == 1
    605    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076, 3 ; t1a t0a
    606    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
    607    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
    608    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
    609    psubsw               m4, m0, m2 ; t5 t4
    610    paddsw               m0, m2     ; t1 t0
    611    psubsw               m5, m1, m3 ; t6 t7
    612    paddsw               m1, m3     ; t2 t3
    613    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
    614    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
    615 %if mmsize > 16
    616    vbroadcasti32x4      m2, [o(deint_shuf)]
    617 %else
    618    mova                 m2, [o(deint_shuf)]
    619 %endif
    620    vprord               m1, 16          ; swap words within dwords
    621    psubsw               m3, m0, m1 ; t3 t2
    622    paddsw               m0, m1     ; -out7  out0
    623    psubsw               m1, m4, m5 ; t7 t6
    624    paddsw               m4, m5     ;  out6 -out1
    625    pshufb               m0, m2
    626    pshufb               m4, m2
        ; final +/-2896 stage at full dword precision (vpdpwssd + >> 12)
    627    mova                 m2, m6
    628    vpdpwssd             m2, m3, [o(pw_m2896_2896)] {bcstd}
    629    mova                 m5, m6
    630    vpdpwssd             m5, m1, [o(pw_m2896_2896)] {bcstd}
    631    psrad                m2, 12
    632    psrad                m5, 12
    633    packssdw             m2, m5     ; out4 -out5
    634    mova                 m5, m6
    635    vpdpwssd             m5, m3, [o(pw_2896_2896)] {bcstd}
    636    mova                 m3, m6
    637    vpdpwssd             m3, m1, [o(pw_2896_2896)] {bcstd}
    638    psrad                m5, 12
    639    psrad                m3, 12
    640    packssdw             m1, m3, m5 ; out2 -out3
    641 %else
    642    punpckhwd            m0, m4, m3 ; 0 7
    643    punpckhwd            m1, m5, m2 ; 2 5
    644    punpcklwd            m2, m5     ; 4 3
    645    punpcklwd            m3, m4     ; 6 1
    646    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
    647    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
    648    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
    649    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
    650    psubsw               m4, m0, m2 ; t4 t5
    651    paddsw               m0, m2     ; t0 t1
    652    psubsw               m5, m1, m3 ; t6 t7
    653    paddsw               m1, m3     ; t2 t3
    654    shufps               m2, m5, m4, q1032
    655    punpckhwd            m4, m2
    656    punpcklwd            m5, m2
    657    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784    ; t4a t5a
    658    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a
    659    psubsw               m2, m0, m1 ; t2 t3
    660    paddsw               m0, m1     ; out0 -out7
    661    psubsw               m1, m4, m5 ; t6 t7
    662    paddsw               m4, m5     ; -out1 out6
        ; cheaper final stage: add/sub then a single pmulhrsw by 2896x8
    663    vpbroadcastd         m5, [o(pw_2896x8)]
    664    punpckhqdq           m3, m2, m1 ; t3 t7
    665    punpcklqdq           m2, m1     ; t2 t6
    666    paddsw               m1, m2, m3 ; t2+t3 t6+t7
    667    psubsw               m2, m3     ; t2-t3 t6-t7
    668    punpckhqdq           m3, m4, m0 ; out6 -out7
    669    punpcklqdq           m0, m4     ; out0 -out1
    670    pmulhrsw             m2, m5     ; out4 -out5
    671    pshufd               m1, m1, q1032
    672    pmulhrsw             m1, m5     ; out2 -out3
    673 %endif
    674 %endmacro
    675 
    676 INIT_YMM avx512icl
    677 INV_TXFM_4X8_FN dct, dct
    678 INV_TXFM_4X8_FN dct, identity
    679 INV_TXFM_4X8_FN dct, adst
    680 INV_TXFM_4X8_FN dct, flipadst
    681 
        ; 4x8 idct pass 1: scale coefficients by 2896/4096 (rectangular 4x8
        ; transform pre-scale), run two packed 4-pt idcts in one ymm pair,
        ; transpose, and jump to the 2nd transform's .pass2.
    682 cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    683    vpermq               m0, [cq+32*0], q3120
    684    vpermq               m1, [cq+32*1], q3120
    685    vpbroadcastd         m2, [o(pw_2896x8)]
    686    pmulhrsw             m0, m2
    687    pmulhrsw             m1, m2
    688    IDCT4_1D_PACKED
    689    vbroadcasti32x4      m2, [o(deint_shuf)]
    690    shufps               m3, m0, m1, q1331
    691    shufps               m0, m0, m1, q0220
    692    pshufb               m0, m2
    693    pshufb               m1, m3, m2
    694    jmp                tx2q
    695 .pass2:
        ; split the ymm rows into two xmm halves for the 8-pt 1-D idct,
        ; then recombine and reuse the iadst_4x8 rounding/store tail
    696    vextracti32x4       xm2, m0, 1
    697    vextracti32x4       xm3, m1, 1
    698    call .main
    699    vpbroadcastd         m4, [o(pw_2048)]
    700    vinserti32x4         m0, m0, xm2, 1
    701    vinserti32x4         m1, m1, xm3, 1
    702    pshufd               m1, m1, q1032
    703    jmp m(iadst_4x8_internal_8bpc).end2
    704 ALIGN function_align
    705 .main:
    706    WRAP_XMM IDCT8_1D_PACKED
    707    ret
    708 
    709 INV_TXFM_4X8_FN adst, dct
    710 INV_TXFM_4X8_FN adst, adst
    711 INV_TXFM_4X8_FN adst, flipadst
    712 INV_TXFM_4X8_FN adst, identity
    713 
        ; 4x8 iadst. Pass 1 reuses iadst_8x4's 4-pt main (defined later in the
        ; file) after the rectangular 2896/4096 pre-scale. Pass 2 runs the
        ; 8-pt iadst in xmm halves; since its odd outputs are negated, the
        ; rounding constant is negated in the odd qword lanes (m4/m5 pair)
        ; so a single pmulhrsw rounds and fixes the signs at once.
        ; .end/.end2/.end3 are shared store tails (gather/scatter over
        ; stride-spaced dwords) also used by the other 4x8 functions.
    714 cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    715    vpermq               m0, [cq+32*0], q3120
    716    vpermq               m1, [cq+32*1], q3120
    717    vpbroadcastd         m2, [o(pw_2896x8)]
    718    pmulhrsw             m0, m2
    719    pmulhrsw             m1, m2
    720    call m(iadst_8x4_internal_8bpc).main
    721    punpckhwd            m3, m0, m1
    722    punpcklwd            m0, m1
    723    punpckhwd            m1, m0, m3
    724    punpcklwd            m0, m3
    725    jmp                tx2q
    726 .pass2:
    727    vextracti32x4       xm2, m0, 1
    728    vextracti32x4       xm3, m1, 1
    729    pshufd              xm4, xm0, q1032
    730    pshufd              xm5, xm1, q1032
    731    call .main_pass2
    732    vpbroadcastd         m4, [o(pw_2048)]
    733    vinserti32x4         m0, xm2, 1
    734    vinserti32x4         m1, xm3, 1
    735    pxor                 m5, m5
    736    psubw                m5, m4          ; m5 = -2048 (for negated outputs)
    737 .end:
    738    punpcklqdq           m4, m5          ; (+2048 | -2048) per qword lane
    739 .end2:
    740    pmulhrsw             m0, m4
    741    pmulhrsw             m1, m4
    742 .end3:
        ; build per-row byte offsets (row * stride) and gather the 4-byte
        ; dst rows in one vpgatherdd
    743    vpbroadcastd         m3, strided
    744    pmulld               m5, m3, [o(pd_0to15)]
    745    kxnorb               k1, k1, k1      ; k1 = all-ones gather mask
    746    kmovb                k2, k1          ; keep a copy for the scatter
    747    vpgatherdd       m3{k1}, [dstq+m5]
    748    pxor                 m4, m4
        ; NOTE(review): m4 maps to ymm20 under AVX512_MM_PERMUTATION, so
        ; zmm20 is its full-width (zeroed) view — one 64-byte store clears cq
    749    mova               [cq], zmm20
    750    punpcklbw            m2, m3, m4      ; u8 -> u16 dst pixels
    751    punpckhbw            m3, m4
    752    paddw                m0, m2          ; add residual
    753    paddw                m1, m3
    754    packuswb             m0, m1          ; clamp to u8
    755    vpscatterdd [dstq+m5]{k2}, m0
    756    RET
    757 ALIGN function_align
    758 .main_pass1:
    759    punpckhwd           xm0, xm4, xm3 ; 0 7
    760    punpckhwd           xm1, xm5, xm2 ; 2 5
    761    punpcklwd           xm2, xm5      ; 4 3
    762    punpcklwd           xm3, xm4      ; 6 1
    763    WRAP_XMM IADST8_1D_PACKED 1
    764    punpcklqdq          xm3, xm4, xm0 ; out6 -out7
    765    punpckhqdq          xm0, xm4      ; out0 -out1
    766    ret
    767 ALIGN function_align
    768 .main_pass2:
    769    WRAP_XMM IADST8_1D_PACKED 2
    770    ret
    771 
    ; 4x8 inverse flip-ADST (8 bpc): same kernels as iadst_4x8, with the
    ; output rows reversed and the rounding-factor signs swapped.
    772 INV_TXFM_4X8_FN flipadst, dct
    773 INV_TXFM_4X8_FN flipadst, adst
    774 INV_TXFM_4X8_FN flipadst, flipadst
    775 INV_TXFM_4X8_FN flipadst, identity
    776 
    ; Pass 1: identical load/scale/kernel sequence to iadst_4x8, but the
    ; word interleaves take operands in reversed order to flip the outputs.
    777 cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    778    vpermq               m0, [cq+32*0], q3120
    779    vpermq               m1, [cq+32*1], q3120
    780    vpbroadcastd         m2, [o(pw_2896x8)]
    781    pmulhrsw             m0, m2
    782    pmulhrsw             m1, m2
    783    call m(iadst_8x4_internal_8bpc).main
    784    punpcklwd            m3, m1, m0
    785    punpckhwd            m1, m0
    786    punpcklwd            m0, m1, m3
    787    punpckhwd            m1, m3
    788    jmp                tx2q
    ; Pass 2: reuse the iadst pass-2 kernel, then negate the rounding
    ; factors (m4 = -2048 here vs. m5 = -2048 in iadst) and swap/reverse
    ; the result halves before joining the shared iadst .end tail.
    789 .pass2:
    790    vextracti32x4       xm2, m0, 1
    791    vextracti32x4       xm3, m1, 1
    792    pshufd              xm4, xm0, q1032
    793    pshufd              xm5, xm1, q1032
    794    call m(iadst_4x8_internal_8bpc).main_pass2
    795    vpbroadcastd         m5, [o(pw_2048)]
    796    vinserti32x4         m3, xm1, 1
    797    vinserti32x4         m2, xm0, 1
    798    pxor                 m4, m4
    799    psubw                m4, m5        ; m4 = -2048 (sign-swapped vs. iadst)
    800    pshufd               m0, m3, q1032
    801    pshufd               m1, m2, q1032
    802    jmp m(iadst_4x8_internal_8bpc).end
    803 
    ; Switch to full 512-bit registers for the 4x8 identity transform.
    804 INIT_ZMM avx512icl
    805 INV_TXFM_4X8_FN identity, dct
    806 INV_TXFM_4X8_FN identity, adst
    807 INV_TXFM_4X8_FN identity, flipadst
    808 INV_TXFM_4X8_FN identity, identity
    809 
    ; 4x8 inverse identity (8 bpc). Pass 1: scale by 2896/32768, permute
    ; into the pass-2 layout with a byte permute (int8_permB), then apply
    ; the identity-8 scale: x += x*1697/32768 (pw_1697x8), i.e. x*2896/2048
    ; overall -- presumably matching the spec's round2 identity scaling.
    810 cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    811    vpbroadcastd         m0, [o(pw_2896x8)]
    812    pmulhrsw             m0, [cq]
    813    mova                 m1, [o(int8_permB)]
    814    vpbroadcastd         m2, [o(pw_1697x8)]
    815    vpermb               m0, m1, m0
    816    pmulhrsw             m2, m0
    817    paddsw               m0, m2
    818    vextracti32x8       ym1, m0, 1
    819    jmp                tx2q
    ; Pass 2 of identity is just a final scale by 4096/32768 (= >>3),
    ; reusing the shared iadst rounding + writeback tail.
    820 .pass2:
    821    vpbroadcastd        ym4, [o(pw_4096)]
    822    jmp m(iadst_4x8_internal_8bpc).end2
    823 
    ; Wrapper generator for the 4x16 transforms. For the dct_dct pair it
    ; emits a DC-only fast path: the single DC coefficient is scaled twice
    ; by 181 with rounding shifts (181 ~= 128*sqrt(2), so each round is a
    ; multiply by sqrt(2)/2 -- presumably the two rect-tx/1-D DC scales),
    ; broadcast to all output words, and written back via the shared tail.
    824 %macro INV_TXFM_4X16_FN 2 ; type1, type2
    825    INV_TXFM_FN          %1, %2, 4x16
    826 %ifidn %1_%2, dct_dct
    827    movsx               r6d, word [cq]
    ; NOTE(review): storing eobd clears the DC coefficient -- this path is
    ; presumably only reached with eob == 0; verify in INV_TXFM_FN.
    828    mov                [cq], eobd
    829    imul                r6d, 181
    830    add                 r6d, 128+256
    831    sar                 r6d, 8+1
    832    imul                r6d, 181
    833    add                 r6d, 128+2048
    834    sar                 r6d, 8+4
    835    vpbroadcastw         m0, r6d
    836    mova                 m1, m0
    837    jmp m(iadst_4x16_internal_8bpc).end3
    838 %endif
    839 %endmacro
    840 
    ; One packed 1-D 16-point inverse DCT pass over m0-m9 (two interleaved
    ; lanes per register). Entry at the macro top interleaves the 16 inputs
    ; into the dct4/dct8/dct16 butterfly pairs noted per line; .main2 is
    ; also reachable as a callable label (cglobal_label) for code that has
    ; already done the interleave. Scratch: m10-m13, mask k7.
    841 %macro IDCT16_1D_PACKED 0
    842    punpckhwd            m8, m7, m0 ; dct16 in15 in1
    843    punpcklwd            m9, m4, m0 ; dct4  in2  in0
    844    punpckhwd            m0, m3, m4 ; dct16 in7  in9
    845    punpcklwd            m7, m1     ; dct8  in7  in1
    846    punpckhwd            m1, m6     ; dct16 in3  in13
    847    punpcklwd            m3, m5     ; dct8  in3  in5
    848    punpckhwd            m5, m2     ; dct16 in11 in5
    849    punpcklwd            m6, m2     ; dct4  in3  in1
    850 cglobal_label .main2
    851    vpbroadcastd        m10, [o(pd_2048)]  ; rounding bias for the 12-bit shifts
    852 .main3:
    ; k7 gets the 0x33... byte pattern (via unsigned compare against
    ; int_mshift) used by ITX_MUL2X_PACK's masked-shift variant.
    853    vpbroadcastq        m13, [o(int_mshift)]
    854    vpcmpub              k7, m13, m10, 6 ; 0x33...
    ; Stage 1/2 rotations; the constant pairs are presumably the AV1
    ; cos/sin basis values in Q12 -- confirm against the AV1 spec tables.
    855    ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 5 ; t8a  t15a
    856    ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 5 ; t9a  t14a
    857    ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a
    858    ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a
    859    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 5 ; t4a  t7a
    860    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 5 ; t5a  t6a
    861 .main4:
    862    psubsw               m2, m8, m0 ; t9  t14
    863    paddsw               m8, m0     ; t8  t15
    864    psubsw               m4, m1, m5 ; t10 t13
    865    paddsw               m1, m5     ; t11 t12
    866    ITX_MUL2X_PACK        6, 0, 5, 10, 1567,  3784    ; t3   t2
    867    psubsw               m0, m8, m1 ; t11a t12a
    868    paddsw               m8, m1     ; t8a  t15a
    869    psubsw               m1, m7, m3 ; t5a  t6a
    870    paddsw               m7, m3     ; t4   t7
    871 .main5:
    872    ITX_MUL2X_PACK        2, 3, 5, 10, 1567,  3784, 5 ; t9a  t14a
    873    ITX_MUL2X_PACK        4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a
    ; deint_shuf needs the full register width; broadcast per 128-bit lane
    ; when running wider than XMM.
    874 %if mmsize > 16
    875    vbroadcasti32x4      m5, [o(deint_shuf)]
    876 %else
    877    mova                 m5, [o(deint_shuf)]
    878 %endif
    879    vpbroadcastd        m11, [o(pw_m2896_2896)]
    880    vpbroadcastd        m12, [o(pw_2896_2896)]
    881    paddsw               m3, m2, m4 ; t9   t14
    882    psubsw               m2, m4     ; t10  t13
    883    pshufb               m8, m5
    884    pshufb               m7, m5
    885    pshufb               m3, m5
    ; Final sqrt(2)/2 rotations plus butterfly recombination into the
    ; packed output order out0..out15 noted on each line below.
    886    ITX_MUL2X_PACK        9, 4,  5, 10, 11, 12    ; t0   t1
    887    ITX_MUL2X_PACK        1, 4,  5, 10, 12, 11    ; t5   t6
    888    ITX_MUL2X_PACK        0, 4,  5, 10, 11, 12, 8 ; t11  t12
    889    ITX_MUL2X_PACK        2, 0, 11, 10, 11, 12, 8 ; t10a t13a
    890    punpckhqdq           m2, m7, m1 ; t7 t6
    891    punpcklqdq           m7, m1     ; t4 t5
    892    psubsw               m1, m9, m6 ; dct4 out3 out2
    893    paddsw               m9, m6     ; dct4 out0 out1
    894    packssdw             m5, m11    ; t12  t13a
    895    packssdw             m4, m0     ; t11  t10a
    896    punpckhqdq           m0, m8, m3 ; t15a t14
    897    punpcklqdq           m8, m3     ; t8a  t9
    898    psubsw               m3, m9, m2 ; dct8 out7 out6
    899    paddsw               m9, m2     ; dct8 out0 out1
    900    psubsw               m2, m1, m7 ; dct8 out4 out5
    901    paddsw               m1, m7     ; dct8 out3 out2
    902    psubsw               m7, m9, m0 ; out15 out14
    903    paddsw               m0, m9     ; out0  out1
    904    psubsw               m6, m1, m5 ; out12 out13
    905    paddsw               m1, m5     ; out3  out2
    906    psubsw               m5, m2, m4 ; out11 out10
    907    paddsw               m2, m4     ; out4  out5
    908    psubsw               m4, m3, m8 ; out8  out9
    909    paddsw               m3, m8     ; out7  out6
    910 %endmacro
    911 
    ; 4x16 inverse DCT (8 bpc): dispatch wrappers + two-pass kernel.
    912 INV_TXFM_4X16_FN dct, dct
    913 INV_TXFM_4X16_FN dct, identity
    914 INV_TXFM_4X16_FN dct, adst
    915 INV_TXFM_4X16_FN dct, flipadst
    916 
    ; Pass 1 (4-point DCT across the width): byte-permute the coefficient
    ; halves into interleaved column pairs, do the two packed rotations of
    ; the 4-point DCT (2896/2896 and 1567/3784), scale by 16384/32768
    ; (= >>1), and hand the transposed data to pass 2 via tx2q.
    917 cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    918    mova                ym1, [cq+32*2]
    919    vinserti32x8         m1, [cq+32*0], 1
    920    mova                 m0, [o(int16_perm)]
    921    mova                ym2, [cq+32*3]
    922    vinserti32x8         m2, [cq+32*1], 1
    923    vpbroadcastd         m4, [o(pd_2048)]
    924    vpermb               m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3
    925    vpermb               m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3
    926    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896, 2
    927    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784, 2
    928    vpbroadcastd         m4, [o(pw_16384)]
    929    psubsw               m3, m1, m2
    930    paddsw               m1, m2     ; out0 out1
    931    vprord               m3, 16     ; out2 out3
    932    punpckldq            m0, m1, m3
    933    punpckhdq            m1, m3
    934    pmulhrsw             m0, m4
    935    pmulhrsw             m1, m4
    936    jmp                tx2q
    ; Pass 2 (16-point DCT down the height): split the four 128-bit lanes
    ; into xmm regs for the XMM-width IDCT16 kernel, then reassemble the
    ; results and finish through the shared iadst 4x16 rounding/writeback.
    937 .pass2:
    938    vextracti32x4       xm2, ym0, 1
    939    vextracti32x4       xm3, ym1, 1
    940    vextracti32x4       xm4, m0, 2
    941    vextracti32x4       xm5, m1, 2
    942    vextracti32x4       xm6, m0, 3
    943    vextracti32x4       xm7, m1, 3
    944    call .main
    945    vinserti32x4        ym0, xm2, 1
    946    vinserti32x4        ym1, xm3, 1
    947    vinserti32x4        ym4, xm6, 1
    948    vinserti32x4        ym5, xm7, 1
    949    vinserti32x8         m0, ym4, 1
    950    vinserti32x8         m1, ym5, 1
    951    vpbroadcastd         m5, [o(pw_2048)]
    952    pshufd               m1, m1, q1032
    953    jmp m(iadst_4x16_internal_8bpc).end2
    954 ALIGN function_align
    ; XMM-width 16-point inverse DCT (shared packed macro).
    955 .main:
    956    WRAP_XMM IDCT16_1D_PACKED
    957    ret
    958 
    ; 4x16 inverse ADST (8 bpc): dispatch wrappers + two-pass kernel.
    ; The .main routine below is also reused by iflipadst_4x16.
    959 INV_TXFM_4X16_FN adst, dct
    960 INV_TXFM_4X16_FN adst, adst
    961 INV_TXFM_4X16_FN adst, flipadst
    962 INV_TXFM_4X16_FN adst, identity
    963 
    ; Pass 1: permute coefficients (permB), run the shared 16x4 ADST main,
    ; scale by 16384/32768 and word-transpose for pass 2.
    964 cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    965    mova                 m1, [o(permB)]
    966    vpermq               m0, m1, [cq+64*0]
    967    vpermq               m1, m1, [cq+64*1]
    968    call m(iadst_16x4_internal_8bpc).main
    969    vpbroadcastd         m3, [o(pw_16384)]
    970    punpckhwd            m2, m0, m1
    971    punpcklwd            m0, m1
    972    pmulhrsw             m2, m3
    973    pmulhrsw             m0, m3
    974    punpckhwd            m1, m0, m2
    975    punpcklwd            m0, m2
    976    jmp                tx2q
    ; Pass 2: 16-point ADST. m10 holds permutation indices produced by
    ; .main; shifting it by 4 selects the adst (vs. flipadst) output order.
    ; m8 is zeroed inside .main, so m6 becomes -2048 for the signed rounding.
    977 .pass2:
    978    call .main
    979    vpbroadcastd         m5, [o(pw_2048)]
    980    psrlq               m10, 4
    981    psubw                m6, m8, m5
    ; Shared tail (also entered from iflipadst_4x16 with m5/m6 and the m10
    ; shift swapped): finish the last sqrt(2)/2 scale, permute the packed
    ; outputs into row order, apply +/-2048 rounding, then gather/scatter
    ; the destination rows and clear the coefficient buffer.
    982 .end:
    983    vpbroadcastd         m7, [o(pw_2896x8)]
    984    paddsw              ym1, ym2, ym4
    985    psubsw              ym2, ym4
    986    vinserti32x8         m1, ym2, 1
    987    pmulhrsw             m1, m7      ; -out7   out4   out6  -out5   out8  -out11 -out9   out10
    988    psrlq                m0, m10, 4
    989    vpermi2q             m0, m1, m3  ; 0 1   4 5   8 9   c d
    990    vpermt2q             m1, m10, m3 ; 2 3   6 7   a b   e f
    991    punpcklqdq           m5, m6
    992 .end2:
    993    pmulhrsw             m0, m5
    994    pmulhrsw             m1, m5
    995 .end3:
    996    vpbroadcastd         m3, strided
    997    pmulld               m5, m3, [o(pd_0to15)]
    998    kxnorw               k1, k1, k1   ; all-ones mask for the gather
    999    kmovw                k2, k1       ; gather clears k1; keep a copy for the scatter
   1000    vpgatherdd       m3{k1}, [dstq+m5]
   1001    pxor                 m4, m4
   1002    mova          [cq+64*0], m4       ; zero the coefficient buffer
   1003    mova          [cq+64*1], m4
   1004    punpcklbw            m2, m3, m4
   1005    punpckhbw            m3, m4
   1006    paddw                m0, m2
   1007    paddw                m1, m3
   1008    packuswb             m0, m1
   1009    vpscatterdd [dstq+m5]{k2}, m0
   1010    RET
   1011 ALIGN function_align
   ; 16-point ADST core. Works on packed pairs; note the temporary switch
   ; to YMM width for the middle stages and back to ZMM at the end.
   ; Returns outputs split across m0-m4 plus permutation indices in m10.
   1012 .main:
   1013    movu                 m3, [o(permB+1)]
   1014    psrlq               m10, m3, 4
   1015 .main2:
   1016    vpermi2q             m3, m0, m1  ; in15 in12 in13 in14 in11 in8  in9  in10
   1017    vpermt2q             m0, m10, m1 ; in0  in3  in2  in1  in4  in7  in6  in5
   1018    vpbroadcastd         m9, [o(pd_2048)]
   1019    vpbroadcastq       ym13, [o(int_mshift)]
   1020    kxnorb               k1, k1, k1
   1021    punpckhwd            m4, m3, m0  ; in12 in3  in14 in1
   1022    punpcklwd            m0, m3      ; in0  in15 in2  in13
   1023    kshiftrb             k1, k1, 4   ; k1 = mask selecting the upper nibble lanes
   1024    vextracti32x8       ym3, m4, 1   ; in8  in7  in10 in5
   1025    vextracti32x8       ym1, m0, 1   ; in4  in11 in6  in9
   1026 INIT_YMM avx512icl
   1027    vpcmpub              k7, m13, m9, 6 ; 0x33...
   1028    pxor                 m8, m8      ; zero reg, also used by callers for sign flips
   ; Stage-1 rotations; constant pairs are presumably the AV1 ADST basis
   ; values in Q12 -- confirm against the AV1 spec tables.
   1029    ITX_MUL4X_PACK        0, 2, 5, 6, 7, 9,  201, 4091,  995, 3973, 5
   1030    ITX_MUL4X_PACK        1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5
   1031    ITX_MUL4X_PACK        3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5
   1032    ITX_MUL4X_PACK        4, 2, 5, 6, 7, 9, 3857, 1380, 4052,  601, 5
   1033    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
   1034    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
   1035    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
   1036    paddsw               m4, m1     ; t5a  t4a  t7a  t6a
   1037    ITX_MUL4X_PACK        2, 1, 5, 6, 7, 9,  799, 4017, 3406, 2276, 5
   1038    psubw                m7, m8, m7
   1039    ITX_MUL2X_PACK        3, 1, 5, 9, 7, 6, 4
   ; Mixed-sign 3784/1567 rotation: k1 patches the negated constant into
   ; the lanes that need it.
   1040    vpbroadcastd         m6, [o(pw_3784_m1567)]
   1041    vpbroadcastd     m6{k1}, [o(pw_m3784_1567)]
   1042    psubsw               m1, m0, m4 ; t5   t4   t7   t6
   1043    paddsw               m0, m4     ; t1   t0   t3   t2
   1044    psubsw               m4, m2, m3 ; t13a t12a t15a t14a
   1045    paddsw               m2, m3     ; t9a  t8a  t11a t10a
   1046    ITX_MUL2X_PACK        1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a
   1047    ITX_MUL2X_PACK        4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14
   1048    vbroadcasti32x4      m5, [o(deint_shuf)]
   1049    pshufb               m0, m5
   1050    pshufb               m2, m5
   1051    vshufi32x4           m3, m0, m2, 0x03  ; t3   t2   t11a t10a
   1052    vinserti32x4         m0, xm2, 1        ; t1   t0   t9a  t8a
   1053    vshufi32x4           m2, m1, m4, 0x03  ; t7a  t6a  t15  t14
   1054    vinserti32x4         m1, xm4, 1        ; t4a  t5a  t12  t13
   1055    pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
   1056    psubsw               m4, m0, m3        ; t3a t2a t11 t10
   1057    paddsw               m0, m3            ; -out15  out0   out14 -out1
   1058    paddsw               m3, m1, m2        ;  out12 -out3  -out13  out2
   1059    psubsw               m1, m2            ; t7 t6 t15a t14a
   1060    punpckhqdq           m2, m4, m1        ; t2a t6  t10 t14a
   1061    punpcklqdq           m4, m1            ; t3a t7  t11 t15a
   1062 INIT_ZMM avx512icl
   1063    vinserti32x8         m3, ym0, 1        ; out12 -out3  -out13  out2  -out15  out0   out14 -out1
   1064    ret
   1065 
   ; 4x16 inverse flip-ADST (8 bpc): reuses the iadst_4x16 kernels with
   ; reversed interleave order (pass 1), a different m10 shift to select
   ; the flipped output permutation, and swapped rounding signs (pass 2).
   1066 INV_TXFM_4X16_FN flipadst, dct
   1067 INV_TXFM_4X16_FN flipadst, adst
   1068 INV_TXFM_4X16_FN flipadst, flipadst
   1069 INV_TXFM_4X16_FN flipadst, identity
   1070 
   1071 cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   1072    mova                 m1, [o(permB)]
   1073    vpermq               m0, m1, [cq+64*0]
   1074    vpermq               m1, m1, [cq+64*1]
   1075    call m(iadst_16x4_internal_8bpc).main
   1076    vpbroadcastd         m3, [o(pw_16384)]
   1077    punpcklwd            m2, m1, m0
   1078    punpckhwd            m1, m0
   1079    pmulhrsw             m2, m3
   1080    pmulhrsw             m1, m3
   1081    punpcklwd            m0, m1, m2
   1082    punpckhwd            m1, m2
   1083    jmp                tx2q
   ; Pass 2: m10 >> 12 (vs. >> 4 in iadst) picks the flipped permutation;
   ; m5 = -2048 / m6 = +2048 is the sign-swapped counterpart of iadst's
   ; setup (m8 is zeroed inside the shared .main).
   1084 .pass2:
   1085    call m(iadst_4x16_internal_8bpc).main
   1086    vpbroadcastd         m6, [o(pw_2048)]
   1087    psrlq               m10, 12
   1088    psubw                m5, m8, m6
   1089    jmp m(iadst_4x16_internal_8bpc).end
   1090 
   ; 4x16 inverse identity (8 bpc).
   1091 INV_TXFM_4X16_FN identity, dct
   1092 INV_TXFM_4X16_FN identity, adst
   1093 INV_TXFM_4X16_FN identity, flipadst
   1094 INV_TXFM_4X16_FN identity, identity
   1095 
   ; Pass 1: identity-4 scaling x + round(x*1697/32768)/2, implemented as
   ; pmulhrsw + pavgw. The mask/compare dance below guards the pavgw
   ; signedness corner case, as explained by the inline comments.
   1096 cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   1097    mova                 m2, [o(int16_perm)]
   1098    vpermb               m1, m2, [cq+64*0]
   1099    vpermb               m2, m2, [cq+64*1]
   1100    vpbroadcastd         m4, [o(pw_1697x8)]
   1101    vpbroadcastd         m0, [o(pd_m1)]
   1102    pmulhrsw             m3, m4, m1    ; we want to do a signed avg, but pavgw is
   1103    vpcmpw               k1, m1, m0, 4 ; unsigned. as long as both signs are equal
   1104    pmulhrsw             m4, m2        ; it still works, but if the input is -1 the
   1105    vpcmpw               k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes
   1106    vpavgw        m1{k1}{z}, m3        ; pavgw to output -32768 instead of 0 unless
   1107    vpavgw        m2{k2}{z}, m4        ; we explicitly deal with that case here.
   1108    punpckldq            m0, m1, m2
   1109    punpckhdq            m1, m2
   1110    jmp                tx2q
   ; Pass 2: identity-16 scaling 2*x + round(x*1697/16384) (pw_1697x16
   ; with the doubling done via paddsw), then the shared rounding/writeback.
   1111 .pass2:
   1112    vpbroadcastd         m3, [o(pw_1697x16)]
   1113    vpbroadcastd         m5, [o(pw_2048)]
   1114    pmulhrsw             m2, m3, m0
   1115    pmulhrsw             m3, m1
   1116    paddsw               m0, m0
   1117    paddsw               m1, m1
   1118    paddsw               m0, m2
   1119    paddsw               m1, m3
   1120    jmp m(iadst_4x16_internal_8bpc).end2
   1121 
   ; Add two registers of word residuals to four 8-pixel destination rows:
   ; load rows (dstq, dstq+%5, dstq+%6, dstq+%7), zero-extend to words,
   ; add coefs %1/%2 (register index or memory operand), pack with unsigned
   ; saturation and store back. Clobbers tmp regs %3/%4.
   1122 %macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3]
   1123    movq               xm%3, [dstq   ]
   1124    movhps             xm%3, [dstq+%5]
   1125    movq               xm%4, [dstq+%6]
   1126    movhps             xm%4, [dstq+%7]
   1127    pmovzxbw            m%3, xm%3
   1128    pmovzxbw            m%4, xm%4
   1129 %ifnum %1
   1130    paddw               m%3, m%1
   1131 %else
   1132    paddw               m%3, %1
   1133 %endif
   1134 %ifnum %2
   1135    paddw               m%4, m%2
   1136 %else
   1137    paddw               m%4, %2
   1138 %endif
   1139    packuswb            m%3, m%4
   1140    vextracti32x4      xm%4, m%3, 1
   1141    movq          [dstq   ], xm%3
   1142    movhps        [dstq+%6], xm%3
   1143    movq          [dstq+%5], xm%4
   1144    movhps        [dstq+%7], xm%4
   1145 %endmacro
   1146 
   ; Wrapper generator for the 8x4 transforms. dct_dct gets a DC-only path
   ; done entirely with pmulhrsw: three rounds of *2896/32768 scaling
   ; (presumably the rect-tx scale plus one per 1-D pass -- confirm against
   ; the AV1 spec), a final *2048/32768 rounding, then broadcast + writeback.
   1147 %macro INV_TXFM_8X4_FN 2 ; type1, type2
   1148    INV_TXFM_FN          %1, %2, 8x4
   1149 %ifidn %1_%2, dct_dct
   1150    movd                xm1, [o(pw_2896x8)]
   1151    pmulhrsw            xm0, xm1, [cq]
   1152    movd                xm2, [o(pw_2048)]
   1153    pmulhrsw            xm0, xm1
   1154    pmulhrsw            xm0, xm1
   1155    pmulhrsw            xm0, xm2
   1156    vpbroadcastw         m0, xm0
   1157    mova                 m1, m0
   1158    jmp m(iadst_8x4_internal_8bpc).end3
   1159 %endif
   1160 %endmacro
   1161 
   ; The 8x4 transforms fit in 256-bit registers.
   1162 INIT_YMM avx512icl
   1163 INV_TXFM_8X4_FN dct, dct
   1164 INV_TXFM_8X4_FN dct, adst
   1165 INV_TXFM_8X4_FN dct, flipadst
   1166 INV_TXFM_8X4_FN dct, identity
   1167 
   ; 8x4 inverse DCT (8 bpc). Pass 1: scale coefficients by 2896/32768,
   ; run the shared 4x8 DCT-8 kernel, then transpose with shufps + pshufb
   ; (deint_shuf) into the pass-2 row layout.
   1168 cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   1169    vpbroadcastd        xm3, [o(pw_2896x8)]
   1170    pmulhrsw            xm0, xm3, [cq+16*0]
   1171    pmulhrsw            xm1, xm3, [cq+16*1]
   1172    pmulhrsw            xm2, xm3, [cq+16*2]
   1173    pmulhrsw            xm3,      [cq+16*3]
   1174    call m(idct_4x8_internal_8bpc).main
   1175    vbroadcasti32x4      m4, [o(deint_shuf)]
   1176    vinserti32x4         m3, m1, xm3, 1
   1177    vinserti32x4         m1, m0, xm2, 1
   1178    shufps               m0, m1, m3, q0220
   1179    shufps               m1, m3, q1331
   1180    pshufb               m0, m4
   1181    pshufb               m1, m4
   1182    jmp                tx2q
   ; Pass 2: packed 4-point DCT, reorder the qword lanes, then the shared
   ; iadst 8x4 rounding + writeback tail.
   1183 .pass2:
   1184    IDCT4_1D_PACKED
   1185    vpermq               m0, m0, q3120
   1186    vpermq               m1, m1, q2031
   1187    jmp m(iadst_8x4_internal_8bpc).end2
   1188 
   ; 8x4 inverse ADST (8 bpc): wrappers + two-pass kernel. Its .main and
   ; .end* labels are shared by the other 8x4 transforms.
   1189 INV_TXFM_8X4_FN adst, dct
   1190 INV_TXFM_8X4_FN adst, adst
   1191 INV_TXFM_8X4_FN adst, flipadst
   1192 INV_TXFM_8X4_FN adst, identity
   ; Pass 1: scale by 2896/32768 (with a qword swap on rows 0/1 to feed the
   ; packed ADST-8 input order), call the 4x8 ADST pass-1 helper, then
   ; transpose; the psubsw negates the odd outputs (ADST emits -out words).
   1194 cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   1195    vpbroadcastd        xm0, [o(pw_2896x8)]
   1196    pshufd              xm4,      [cq+16*0], q1032
   1197    pmulhrsw            xm3, xm0, [cq+16*3]
   1198    pshufd              xm5,      [cq+16*1], q1032
   1199    pmulhrsw            xm2, xm0, [cq+16*2]
   1200    pmulhrsw            xm4, xm0
   1201    pmulhrsw            xm5, xm0
   1202    call m(iadst_4x8_internal_8bpc).main_pass1
   1203    vinserti32x4         m0, xm2, 1
   1204    vinserti32x4         m1, xm3, 1
   1205    pxor                 m3, m3
   1206    punpckhwd            m2, m0, m1
   1207    punpcklwd            m0, m1
   1208    psubsw               m3, m2
   1209    punpckhwd            m1, m0, m3
   1210    punpcklwd            m0, m3
   1211    jmp                tx2q
   1212 .pass2:
   1213    call .main
   ; Rounding + writeback tail, entered at various points by the other
   ; 8x4 transforms (.end2 skips the qword reorder, .end3 the rounding).
   1214 .end:
   1215    vpermq               m0, m0, q3120
   1216    vpermq               m1, m1, q3120
   1217 .end2:
   1218    vpbroadcastd         m2, [o(pw_2048)]
   1219    pmulhrsw             m0, m2
   1220    pmulhrsw             m1, m2
   1221 .end3:
   1222    pxor                 m2, m2
   ; NOTE(review): clears the whole 64-byte coefficient buffer. This relies
   ; on x86inc remapping m2 to (z)mm18 in this configuration, so the pxor
   ; above zeroed all of zmm18 -- confirm against ext/x86/x86inc.asm.
   1223    mova               [cq], zmm18
   1224    lea                  r6, [strideq*3]
   1225    WRITE_8X4             0, 1, 4, 5
   1226    RET
   1227 ALIGN function_align
   ; Packed 4-point ADST, shared with flipadst's pass 2.
   1228 .main:
   1229    IADST4_1D_PACKED
   1230    ret
   1231 
   ; 8x4 inverse flip-ADST (8 bpc): same kernels as iadst_8x4 with the
   ; outputs reversed/negated.
   1232 INV_TXFM_8X4_FN flipadst, dct
   1233 INV_TXFM_8X4_FN flipadst, adst
   1234 INV_TXFM_8X4_FN flipadst, flipadst
   1235 INV_TXFM_8X4_FN flipadst, identity
   ; Pass 1: identical load/scale/kernel to iadst_8x4, but the transpose
   ; interleaves in reversed operand order and negates the other half.
   1237 cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   1238    vpbroadcastd        xm0, [o(pw_2896x8)]
   1239    pshufd              xm4,      [cq+16*0], q1032
   1240    pmulhrsw            xm3, xm0, [cq+16*3]
   1241    pshufd              xm5,      [cq+16*1], q1032
   1242    pmulhrsw            xm2, xm0, [cq+16*2]
   1243    pmulhrsw            xm4, xm0
   1244    pmulhrsw            xm5, xm0
   1245    call m(iadst_4x8_internal_8bpc).main_pass1
   1246    vinserti32x4         m3, m3, xm1, 1
   1247    vinserti32x4         m2, m2, xm0, 1
   1248    punpckhwd            m1, m3, m2
   1249    punpcklwd            m3, m2
   1250    pxor                 m0, m0
   1251    psubsw               m0, m1
   1252    punpckhwd            m1, m0, m3
   1253    punpcklwd            m0, m3
   1254    jmp                tx2q
   ; Pass 2: shared 4-point ADST, then swap the two result registers and
   ; reverse the lane order (q2031 vs. q3120 in iadst) before writeback.
   1255 .pass2:
   1256    call m(iadst_8x4_internal_8bpc).main
   1257    mova                 m2, m1
   1258    vpermq               m1, m0, q2031
   1259    vpermq               m0, m2, q2031
   1260    jmp m(iadst_8x4_internal_8bpc).end2
   1261 
   ; 8x4 inverse identity (8 bpc).
   1262 INV_TXFM_8X4_FN identity, dct
   1263 INV_TXFM_8X4_FN identity, adst
   1264 INV_TXFM_8X4_FN identity, flipadst
   1265 INV_TXFM_8X4_FN identity, identity
   ; Pass 1: transpose while scaling by 2896/32768, then double (paddsw
   ; x,x) -- presumably the identity-8 *2 scale; confirm against the spec.
   1267 cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   1268    mova                xm2, [cq+16*0]
   1269    mova                xm0, [cq+16*1]
   1270    vinserti32x4         m2, [cq+16*2], 1
   1271    vinserti32x4         m0, [cq+16*3], 1
   1272    vpbroadcastd         m3, [o(pw_2896x8)]
   1273    punpcklwd            m1, m2, m0
   1274    punpckhwd            m2, m0
   1275    pmulhrsw             m1, m3
   1276    pmulhrsw             m2, m3
   1277    punpcklwd            m0, m1, m2
   1278    punpckhwd            m1, m2
   1279    paddsw               m0, m0
   1280    paddsw               m1, m1
   1281    jmp                tx2q
   ; Pass 2: identity-4 scale x + round(x*1697/4096) via pw_1697x8, then
   ; the shared iadst 8x4 rounding + writeback tail.
   1282 .pass2:
   1283    vpbroadcastd         m3, [o(pw_1697x8)]
   1284    pmulhrsw             m2, m3, m0
   1285    pmulhrsw             m3, m1
   1286    paddsw               m0, m2
   1287    paddsw               m1, m3
   1288    jmp m(iadst_8x4_internal_8bpc).end
   1289 
   ; Wrapper generator for the 8x8 transforms. The dct_dct DC-only path
   ; (.dconly/.dconly2/.dconly_loop) is also reused by taller 8xN blocks:
   ; it scales the DC coefficient by 181 with rounding shifts (181 ~=
   ; 128*sqrt(2)), then loops over the destination 8 rows at a time using
   ; qword gather/scatter with stride-scaled offsets; r3d holds the row
   ; count (presumably set up by INV_TXFM_FN / the caller -- verify).
   1290 %macro INV_TXFM_8X8_FN 2 ; type1, type2
   1291    INV_TXFM_FN          %1, %2, 8x8
   1292 %ifidn %1_%2, dct_dct
   1293 INIT_ZMM avx512icl
   1294    movsx               r6d, word [cq]
   ; NOTE(review): storing eobd clears the DC coefficient -- presumably
   ; eob == 0 on this path; verify in INV_TXFM_FN.
   1295    mov                [cq], eobd
   1296 .dconly:
   1297    imul                r6d, 181
   1298    add                 r6d, 128+256
   1299    sar                 r6d, 8+1
   1300 .dconly2:
   1301    vpbroadcastd        ym2, strided
   1302    imul                r6d, 181
   1303    pmulld              ym5, ym2, [o(pd_0to15)]
   1304    kxnorb               k1, k1, k1
   1305    add                 r6d, 128+2048
   1306    sar                 r6d, 8+4
   1307    pxor                 m3, m3
   1308    vpbroadcastw         m4, r6d    ; m4 = DC value in every word
   1309 .dconly_loop:
   1310    kmovb                k2, k1     ; gather consumes its mask; keep a copy
   1311    vpgatherdq       m2{k1}, [dstq+ym5]
   1312    punpcklbw            m0, m2, m3
   1313    punpckhbw            m1, m2, m3
   1314    paddw                m0, m4
   1315    paddw                m1, m4
   1316    packuswb             m0, m1
   1317    kmovb                k1, k2     ; re-arm the mask for the next iteration
   1318    vpscatterdq [dstq+ym5]{k2}, m0
   1319    lea                dstq, [dstq+strideq*8]
   1320    sub                 r3d, 8
   1321    jg .dconly_loop
   1322    RET
   1323 INIT_YMM avx512icl
   1324 %endif
   1325 %endmacro
   1326 
   ; 8x8 inverse DCT (8 bpc): wrappers + two-pass kernel.
   1327 INV_TXFM_8X8_FN dct, dct
   1328 INV_TXFM_8X8_FN dct, identity
   1329 INV_TXFM_8X8_FN dct, adst
   1330 INV_TXFM_8X8_FN dct, flipadst
   ; Pass 1: load row pairs in packed (q3120) order, run the packed 8-point
   ; DCT, then transpose with shufps/pshufb and scale by 16384/32768 (>>1).
   1332 cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   1333    vpermq               m0, [cq+32*0], q3120 ; 0 1
   1334    vpermq               m3, [cq+32*3], q3120 ; 6 7
   1335    vpermq               m2, [cq+32*2], q3120 ; 4 5
   1336    vpermq               m1, [cq+32*1], q3120 ; 2 3
   1337    call .main
   1338    shufps               m4, m0, m1, q0220
   1339    shufps               m5, m0, m1, q1331
   1340    shufps               m1, m2, m3, q0220
   1341    shufps               m3, m2, m3, q1331
   1342    vbroadcasti32x4      m0, [o(deint_shuf)]
   1343    vpbroadcastd         m2, [o(pw_16384)]
   1344    REPX   {pshufb   x, m0}, m4, m5, m1, m3
   1345    REPX   {pmulhrsw x, m2}, m4, m5, m1, m3
   1346    vinserti32x4         m0, m4, xm1, 1
   1347    vshufi32x4           m2, m4, m1, 0x03
   1348    vinserti32x4         m1, m5, xm3, 1
   1349    vshufi32x4           m3, m5, m3, 0x03
   1350    jmp                tx2q
   ; Pass 2: second packed 8-point DCT, reorder the qword lanes, then the
   ; shared iadst 8x8 rounding + writeback tail.
   1351 .pass2:
   1352    call .main
   1353    vpbroadcastd         m4, [o(pw_2048)]
   1354    vpermq               m0, m0, q3120
   1355    vpermq               m1, m1, q2031
   1356    vpermq               m2, m2, q3120
   1357    vpermq               m3, m3, q2031
   1358    jmp m(iadst_8x8_internal_8bpc).end2
   1359 ALIGN function_align
   ; Packed 8-point inverse DCT; externally callable (cglobal_label).
   1360 cglobal_label .main
   1361    IDCT8_1D_PACKED
   1362    ret
   1363 
   ; 8x8 inverse ADST (8 bpc): wrappers + two-pass kernel. Its pass-1/2
   ; helpers and .end* tails are shared by the other 8x8 transforms.
   1364 INV_TXFM_8X8_FN adst, dct
   1365 INV_TXFM_8X8_FN adst, adst
   1366 INV_TXFM_8X8_FN adst, flipadst
   1367 INV_TXFM_8X8_FN adst, identity
   ; Pass 1: load rows in the swapped (q1302/q3120) order .main_pass1
   ; expects, run the packed ADST-8, transpose, and apply the mixed-sign
   ; +/-16384 scale (ADST pass-1 outputs alternate in sign).
   1369 cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   1370    vpermq               m4, [cq+32*0], q1302 ; 1 0
   1371    vpermq               m3, [cq+32*3], q3120 ; 6 7
   1372    vpermq               m5, [cq+32*1], q1302 ; 3 2
   1373    vpermq               m2, [cq+32*2], q3120 ; 4 5
   1374    call .main_pass1
   1375    vpbroadcastd         m5, [o(pw_16384_m16384)]
   1376    punpcklwd            m4, m0, m1
   1377    punpckhwd            m0, m1
   1378    punpcklwd            m1, m2, m3
   1379    punpckhwd            m2, m3
   1380    punpcklwd            m3, m4, m0
   1381    punpckhwd            m4, m0
   1382    punpcklwd            m0, m1, m2
   1383    punpckhwd            m1, m2
   1384    REPX   {pmulhrsw x, m5}, m3, m4, m0, m1
   1385    vshufi32x4           m2, m3, m0, 0x03
   1386    vinserti32x4         m0, m3, xm0, 1
   1387    vshufi32x4           m3, m4, m1, 0x03
   1388    vinserti32x4         m1, m4, xm1, 1
   1389    jmp                tx2q
   ; Pass 2: packed ADST-8 again, then build mixed-sign +/-2048 rounding
   ; factors (lower half +2048, upper half -2048 via the pw_4096 subtract).
   1390 .pass2:
   1391    pshufd               m4, m0, q1032
   1392    pshufd               m5, m1, q1032
   1393    call .main_pass2
   1394    vpbroadcastd         m5, [o(pw_2048)]
   1395    vpbroadcastd        xm4, [o(pw_4096)]
   1396    psubw                m4, m5 ; lower half = 2048, upper half = -2048
   ; Rounding + writeback tail shared by the other 8x8 transforms; entry
   ; points skip progressively more of the reorder/rounding steps.
   1397 .end:
   1398    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
   1399 .end2:
   1400    pmulhrsw             m0, m4
   1401    pmulhrsw             m1, m4
   1402 .end3:
   1403    pmulhrsw             m2, m4
   1404    pmulhrsw             m3, m4
   1405 .end4:
   1406    pxor                 m4, m4
   1407    mova          [cq+32*0], m4       ; zero the coefficient buffer
   1408    mova          [cq+32*1], m4
   1409    mova          [cq+32*2], m4
   1410    mova          [cq+32*3], m4
   1411    lea                  r6, [strideq*3]
   1412    WRITE_8X4             0, 1, 4, 5
   1413    lea                dstq, [dstq+strideq*4]
   1414    WRITE_8X4             2, 3, 4, 5
   1415    RET
   1416 ALIGN function_align
   ; Pass-1 helper: interleave into (0 7)(2 5)(4 3)(6 1) pairs for the
   ; packed ADST-8 kernel, then fix up out0/out1 and out6/out7 ordering.
   1417 .main_pass1:
   1418    punpckhwd            m0, m4, m3 ; 0 7
   1419    punpckhwd            m1, m5, m2 ; 2 5
   1420    punpcklwd            m2, m5     ; 4 3
   1421    punpcklwd            m3, m4     ; 6 1
   1422    IADST8_1D_PACKED 1
   1423    punpcklqdq           m3, m4, m0        ; out6 -out7
   1424    punpckhqdq           m0, m4            ; out0 -out1
   1425    ret
   1426 ALIGN function_align
   ; Pass-2 helper: packed ADST-8 variant 2; externally callable.
   1427 cglobal_label .main_pass2
   1428    IADST8_1D_PACKED 2
   1429    ret
   1430 
   ; 8x8 inverse flip-ADST (8 bpc): reuses the iadst_8x8 kernels with
   ; reversed output order and the opposite rounding-factor signs.
   1431 INV_TXFM_8X8_FN flipadst, dct
   1432 INV_TXFM_8X8_FN flipadst, adst
   1433 INV_TXFM_8X8_FN flipadst, flipadst
   1434 INV_TXFM_8X8_FN flipadst, identity
   ; Pass 1: same loads/kernel as iadst_8x8; the transpose interleaves in
   ; reversed operand order and uses pw_m16384_16384 (signs swapped).
   1436 cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   1437    vpermq               m4, [cq+32*0], q1302 ; 1 0
   1438    vpermq               m3, [cq+32*3], q3120 ; 6 7
   1439    vpermq               m5, [cq+32*1], q1302 ; 3 2
   1440    vpermq               m2, [cq+32*2], q3120 ; 4 5
   1441    call m(iadst_8x8_internal_8bpc).main_pass1
   1442    vpbroadcastd         m5, [o(pw_m16384_16384)]
   1443    punpckhwd            m4, m3, m2
   1444    punpcklwd            m3, m2
   1445    punpckhwd            m2, m1, m0
   1446    punpcklwd            m1, m0
   1447    punpckhwd            m0, m4, m3
   1448    punpcklwd            m4, m3
   1449    punpckhwd            m3, m2, m1
   1450    punpcklwd            m2, m1
   1451    REPX   {pmulhrsw x, m5}, m0, m4, m3, m2
   1452    vinserti32x4         m1, m0, xm3, 1
   1453    vshufi32x4           m3, m0, m3, 0x03
   1454    vinserti32x4         m0, m4, xm2, 1
   1455    vshufi32x4           m2, m4, m2, 0x03
   1456    jmp                tx2q
   ; Pass 2: shared ADST-8 kernel, sign-swapped rounding factors, and a
   ; register rotation that reverses the row order before the shared tail.
   1457 .pass2:
   1458    pshufd               m4, m0, q1032
   1459    pshufd               m5, m1, q1032
   1460    call m(iadst_8x8_internal_8bpc).main_pass2
   1461    vpbroadcastd         m4, [o(pw_2048)]
   1462    vpbroadcastd        xm5, [o(pw_4096)]
   1463    psubw                m4, m5 ; lower half = -2048, upper half = 2048
   1464    vpermq               m5, m3, q2031
   1465    vpermq               m3, m0, q2031
   1466    vpermq               m0, m2, q2031
   1467    vpermq               m2, m1, q2031
   1468    pmulhrsw             m1, m0, m4
   1469    pmulhrsw             m0, m5, m4
   1470    jmp m(iadst_8x8_internal_8bpc).end3
   1471 
   ; 8x8 inverse identity (8 bpc). Pass 1 is a pure 8x8 word transpose
   ; (no scaling); pass 2 applies the combined *4096/32768 rounding scale
   ; through the shared iadst 8x8 writeback tail.
   1472 INV_TXFM_8X8_FN identity, dct
   1473 INV_TXFM_8X8_FN identity, adst
   1474 INV_TXFM_8X8_FN identity, flipadst
   1475 INV_TXFM_8X8_FN identity, identity
   1477 cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   1478    mova                xm3, [cq+16*0]
   1479    mova                xm2, [cq+16*1]
   1480    vinserti32x4         m3, [cq+16*4], 1
   1481    vinserti32x4         m2, [cq+16*5], 1
   1482    mova                xm4, [cq+16*2]
   1483    mova                xm0, [cq+16*3]
   1484    vinserti32x4         m4, [cq+16*6], 1
   1485    vinserti32x4         m0, [cq+16*7], 1
   1486    punpcklwd            m1, m3, m2
   1487    punpckhwd            m3, m2
   1488    punpcklwd            m2, m4, m0
   1489    punpckhwd            m4, m0
   1490    punpckldq            m0, m1, m2
   1491    punpckhdq            m1, m2
   1492    punpckldq            m2, m3, m4
   1493    punpckhdq            m3, m4
   1494    jmp                tx2q
   1495 .pass2:
   1496    vpbroadcastd         m4, [o(pw_4096)]
   1497    jmp m(iadst_8x8_internal_8bpc).end
   1498 
; Declare an 8x16 inverse-transform entry point for the given type pair.
; For dct_dct, emit the dc-only fast path: scale the dc coefficient by
; 181/256 (~1/sqrt(2), the rect2 adjustment) and fall through to the 8x8
; dconly tail with the row count set to 16.
%macro INV_TXFM_8X16_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 8x16
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd ; store eobd (0 on this path) to clear the dc coeff
    or                  r3d, 16   ; r3d = number of rows for the dconly loop
    imul                r6d, 181  ; *181/256 =~ 1/sqrt(2), with rounding
    add                 r6d, 128
    sar                 r6d, 8
    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
%endif
%endmacro
   1511 
; Load the 8 coefficient rows of an 8x16 transform into m0-m7, prescaling
; each by 2896/4096 (~1/sqrt(2), the rect2 scale for non-square blocks).
; cq is advanced to the middle so every access uses a short displacement.
%macro ITX_8X16_LOAD_COEFS 0
    vpbroadcastd         m4, [o(pw_2896x8)]
    pmulhrsw             m0, m4, [cq+32*0]
    add                  cq, 32*4
    pmulhrsw             m7, m4, [cq+32*3]
    pmulhrsw             m1, m4, [cq-32*3]
    pmulhrsw             m6, m4, [cq+32*2]
    pmulhrsw             m2, m4, [cq-32*2]
    pmulhrsw             m5, m4, [cq+32*1]
    pmulhrsw             m3, m4, [cq-32*1]
    pmulhrsw             m4,     [cq+32*0] ; m4 holds the constant, so load row 4 last
%endmacro
   1524 
INIT_ZMM avx512icl
INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst

; 8x16 inverse DCT. Pass 1 runs the 8-point transform on rows (reusing the
; 16x8 kernel on permuted input), scales by 1/2 and transposes; pass 2 runs
; the 16-point transform on columns and adds the result to dst using
; gather/scatter addressing. The .end*/.main* labels are shared entry
; points jumped into by the other 8x16 transform types.
cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m3, [o(permB)]
    vpermq               m0, m3, [cq+64*0]
    vpbroadcastd         m4, [o(pw_2896x8)] ; rect2 prescale, ~1/sqrt(2)
    vpermq               m1, m3, [cq+64*1]
    vpermq               m2, m3, [cq+64*2]
    vpermq               m3, m3, [cq+64*3]
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    call m(idct_16x8_internal_8bpc).main
    vpbroadcastd         m5, [o(pw_16384)] ; pmulhrsw by 16384 = rounding /2
    punpckhwd            m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3
    punpcklwd            m0, m2     ; a0 e0 a1 e1 a2 e2 a3 e3
    punpckhwd            m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3
    punpcklwd            m1, m3     ; d0 h0 d1 h1 d2 h2 d3 h3
    REPX   {pmulhrsw x, m5}, m4, m0, m2, m1
    punpckhwd            m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3
    punpcklwd            m0, m4     ; a0 b0 e0 f0 a1 b1 e1 f1
    punpckhwd            m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3
    punpcklwd            m2, m1     ; c0 d0 g0 h0 c1 d1 g1 h1
    punpckhdq            m1, m0, m2 ;  1  5  9 13
    punpckldq            m0, m2     ;  0  4  8 12
    punpckldq            m2, m3, m4 ;  2  6 10 14
    punpckhdq            m3, m4     ;  3  7 11 15
    jmp                tx2q
.pass2:
    ; reorder rows into the even/odd interleaving the 16-point kernel wants
    vprord               m5, [o(int16_perm)], 16
    vshufi32x4           m2, m2, q1320     ;  2 10 14  6
    vshufi32x4           m4, m1, m3, q2310 ;  1  5 15 11
    vshufi32x4           m1, m3, q0132     ;  9 13  7  3
    vpermb               m9, m5, m0
    vpermb               m7, m5, m2
    vpermb               m8, m5, m4
    vpermb               m0, m5, m1
    vextracti32x8       ym6, m9, 1
    vextracti32x8       ym3, m7, 1
    vextracti32x8       ym5, m8, 1
    vextracti32x8       ym1, m0, 1
    call .main2
    mova                ym8, [o(gather8a)]
    lea                  r3, [dstq+strideq*4]
    pmovzxdq             m9, ym8
    pshufd              ym8, ym8, q1230
    vpermt2q             m0, m9, m4
    vpermt2q             m1, m9, m5
    vpermt2q             m2, m9, m6
    vpermt2q             m3, m9, m7
.end:
    vpbroadcastd         m7, [o(pw_2048)] ; final rounding: pmulhrsw = (x+8)>>4
.end2:
    pmulhrsw             m0, m7
    pmulhrsw             m1, m7
.end3:
    pmulhrsw             m2, m7
    pmulhrsw             m3, m7
.end4:
    vpbroadcastd        ym6, strided ; replicate the byte stride
    kxnorb               k1, k1, k1  ; all 8 gather lanes enabled
    pxor                 m4, m4
    pmulld              ym8, ym6     ; per-row dst offsets = row index * stride
    kmovb                k2, k1      ; gathers clear their mask, so keep a copy
    vpgatherdq       m6{k1}, [dstq+ym8] ; load 8 dst rows (8 px each)
    kmovb                k1, k2
    vpgatherdq       m7{k2}, [r3+ym8]   ; rows 8-15
    mova          [cq+64*0], m4          ; clear the coefficient buffer
    mova          [cq+64*1], m4
    kmovb                k2, k1
    mova          [cq+64*2], m4
    mova          [cq+64*3], m4
    punpcklbw            m5, m6, m4  ; widen pixels to words
    punpckhbw            m6, m4
    paddw                m0, m5      ; add residual
    paddw                m1, m6
    packuswb             m0, m1      ; clamp back to u8
    vpscatterdq [dstq+ym8]{k1}, m0
    punpcklbw            m6, m7, m4
    punpckhbw            m7, m4
    paddw                m2, m6
    paddw                m3, m7
    packuswb             m2, m3
    vpscatterdq [r3+ym8]{k2}, m2
    RET
ALIGN function_align
; partial 16-point IDCT specializations for sparse coefficient blocks;
; the packed pw_*x8 constants are paired cos/sin twiddle factors
cglobal_label .main_fast2 ; bottom three-quarters are zero
    vpbroadcastd       ym10, [o(pd_2048)]
    vpbroadcastq       ym13, [o(int_mshift)]
    vpbroadcastd        ym3, [o(pw_401_4076x8)]
    vpbroadcastd        ym5, [o(pw_799_4017x8)]
    vpbroadcastd        ym4, [o(pw_m1189_3920x8)]
    pxor                ym6, ym6
    punpckhwd           ym2, ym0, ym0
    pmulhrsw            ym2, ym3      ; t8a  t15a
    punpcklwd           ym7, ym1, ym1
    pmulhrsw            ym7, ym5      ; t4a  t7a
    punpckhwd           ym1, ym1
    pmulhrsw            ym4, ym1      ; t11a t12a
    vpcmpub              k7, ym13, ym10, 6
    punpcklwd           ym9, ym6, ym0
    psubsw              ym0, ym2, ym4 ; t11a t12a
    paddsw              ym8, ym2, ym4 ; t8a  t15a
    mova                ym1, ym7
    jmp .main5
ALIGN function_align
cglobal_label .main_fast ; bottom half is zero
    vpbroadcastd       ym10, [o(pd_2048)]
    vpbroadcastq       ym13, [o(int_mshift)]
    pxor                ym6, ym6
    punpckhwd           ym8, ym0, ym0
    punpckhwd           ym4, ym3, ym3
    punpckhwd           ym5, ym2, ym2
    punpcklwd           ym7, ym1, ym1
    punpckhwd           ym1, ym1
    punpcklwd           ym3, ym3
    punpcklwd           ym9, ym6, ym0
    punpcklwd           ym6, ym2
    vpbroadcastd        ym2, [o(pw_401_4076x8)]
    vpbroadcastd        ym0, [o(pw_m2598_3166x8)]
    vpbroadcastd       ym11, [o(pw_1931_3612x8)]
    vpbroadcastd       ym12, [o(pw_m1189_3920x8)]
    pmulhrsw            ym8, ym2  ; t8a  t15a
    vpbroadcastd        ym2, [o(pw_799_4017x8)]
    pmulhrsw            ym0, ym4  ; t9a  t14a
    vpbroadcastd        ym4, [o(pw_m2276_3406x8)]
    pmulhrsw            ym5, ym11 ; t10a t13a
    pmulhrsw            ym1, ym12 ; t11a t12a
    pmulhrsw            ym7, ym2  ; t4a  t7a
    pmulhrsw            ym3, ym4  ; t5a  t6a
    vpcmpub              k7, ym13, ym10, 6
    jmp .main4
ALIGN function_align
cglobal_label .main
    ; full 16-point IDCT on packed rows, executed on ymm halves
    WRAP_YMM IDCT16_1D_PACKED
    ret
   1663 
INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity

; 8x16 inverse ADST. Pass 1 reuses the 16x8 ADST row kernel; pass 2 runs
; the packed 16-point ADST (.main2, also used by 16x8). The .pass1_end and
; .pass2_end labels are shared with iflipadst_8x16, which enters them with
; mirrored permutations/signs.
cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_16x8_internal_8bpc).main_pass1
    vbroadcasti32x4      m6, [o(int_shuf1)]
    vpbroadcastd         m7, [o(pw_16384_m16384)] ; /2 scale with per-half sign
    punpckhwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpcklwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
    pshufb               m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m2, m6     ; e0 f0 e1 f1 e2 f2 e3 f3
.pass1_end:
    REPX   {pmulhrsw x, m7}, m3, m5, m4, m2
    ; finish the transpose to row order
    punpckldq            m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m5     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckhdq            m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq            m2, m4     ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhqdq           m1, m0, m2
    punpcklqdq           m0, m2
    punpcklqdq           m2, m3, m5
    punpckhqdq           m3, m5
    jmp                tx2q
.pass2:
    call .main_pass2
    vpbroadcastd         m6, [o(pw_2048)]
    psrlq               m10, 4
    psubw                m7, m8, m6 ; m8 = 0 (zeroed in .main2), so m7 = -2048
.pass2_end:
    vpbroadcastd         m5, [o(pw_2896x8)]
    paddsw               m1, m2, m4
    psubsw               m2, m4
    pmulhrsw             m1, m5      ; -out7   out4   out6  -out5
    pmulhrsw             m5, m2      ;  out8  -out11 -out9   out10
    mova                ym8, [o(gather8c)]
    lea                  r3, [dstq+strideq]
    ; m10 holds a sliding 4-bit-per-lane permutation; successive shifts
    ; select the four output orderings below
    psrlq                m2, m10, 4
    vpermi2q             m2, m0, m3  ;  1  3 13 15
    vpermt2q             m0, m10, m3 ;  0  2 12 14
    psrlq                m3, m10, 8
    vpermi2q             m3, m1, m5  ;  5  7  9 11
    psrlq               m10, 12
    vpermt2q             m1, m10, m5 ;  4  6  8 10
    pmulhrsw             m0, m6
    pmulhrsw             m1, m6
    jmp m(idct_8x16_internal_8bpc).end3
ALIGN function_align
.main_pass1:
    ; load and prescale coefficients (rect2, ~1/sqrt(2)), then permute
    ; into the packed input order expected by .main
    vpbroadcastd         m2, [o(pw_2896x8)]
    pmulhrsw             m5, m2, [cq+64*0]
    pmulhrsw             m3, m2, [cq+64*3]
    pmulhrsw             m1, m2, [cq+64*1]
    pmulhrsw             m2,     [cq+64*2]
    movu                 m4, [o(permA+3)]
    psrlq               m10, m4, 4
    mova                 m6, m4
    vpermi2q             m4, m5, m3  ; in0  in12 in2  in14
    vpermt2q             m5, m10, m3 ; in15 in3  in13 in1
    vpermi2q             m6, m1, m2  ; in4  in8  in6  in10
    vpermt2q             m1, m10, m2 ; in11 in7  in9  in5
    jmp .main
ALIGN function_align
.main_pass2:
    ; pass-2 entry: inputs are already in registers, only permute
    mova                 m4, [o(permC)]
    psrlq                m5, m4, 4
    vpermi2q             m4, m0, m2  ; in0  in12 in2  in14
    psrlq                m6, m5, 4
    vpermi2q             m5, m1, m3  ; in15 in3  in13 in1
    psrlq               m10, m6, 4
    vpermi2q             m6, m0, m2  ; in4  in8  in6  in10
    vpermt2q             m1, m10, m3 ; in11 in7  in9  in5
.main:
    punpcklwd            m0, m4, m5  ; in0  in15 in2  in13
    punpckhwd            m4, m5      ; in12 in3  in14 in1
    punpcklwd            m5, m6, m1  ; in4  in11 in6  in9
    punpckhwd            m6, m1      ; in8  in7  in10 in5
; packed 16-point ADST; also called directly by the 16x8 transforms
cglobal_label .main2
    vpbroadcastd         m9, [o(pd_2048)]
    vpbroadcastq        m13, [o(int_mshift)]
    kxnorb               k1, k1, k1
    vpcmpub              k7, m13, m9, 6 ; 0x33...
    pxor                 m8, m8
    ITX_MUL4X_PACK        0, 1, 2, 3, 7, 9,  201, 4091,  995, 3973, 5
    ITX_MUL4X_PACK        6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5
    ITX_MUL4X_PACK        4, 1, 2, 3, 7, 9, 3857, 1380, 4052,  601, 5
    ITX_MUL4X_PACK        5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5
    psubsw               m2, m0, m6 ; t9a  t8a  t11a t10a
    paddsw               m0, m6     ; t1a  t0a  t3a  t2a
    psubsw               m3, m5, m4 ; t13a t12a t15a t14a
    paddsw               m5, m4     ; t5a  t4a  t7a  t6a
    ITX_MUL4X_PACK        2, 4, 1, 6, 7, 9,  799, 4017, 3406, 2276, 5
    psubw                m7, m8, m7
    ITX_MUL2X_PACK        3, 4, 1, 9, 7, 6, 4
    vpbroadcastd         m6, [o(pw_3784_m1567)]
    vpbroadcastd     m6{k1}, [o(pw_m3784_1567)]
    psubsw               m1, m0, m5 ; t5   t4   t7   t6
    paddsw               m0, m5     ; t1   t0   t3   t2
    psubsw               m4, m2, m3 ; t13a t12a t15a t14a
    paddsw               m2, m3     ; t9a  t8a  t11a t10a
    ITX_MUL2X_PACK        1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a
    ITX_MUL2X_PACK        4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15
    vbroadcasti32x4      m5, [o(deint_shuf)]
    pshufb               m0, m5
    pshufb               m2, m5
    vshufi32x4           m3, m0, m2, q3232 ; t3   t2   t11a t10a
    vinserti32x8         m0, ym2, 1        ; t1   t0   t9a  t8a
    vshufi32x4           m2, m1, m4, q3232 ; t6a  t7a  t14  t15
    vinserti32x8         m1, ym4, 1        ; t5a  t4a  t13  t12
    pshufd               m2, m2, q1032     ; t7a  t6a  t15  t14
    psubsw               m4, m0, m3        ; t3a t2a t11 t10
    paddsw               m0, m3            ; -out15  out0   out14 -out1
    paddsw               m3, m1, m2        ;  out12 -out3  -out13  out2
    psubsw               m1, m2            ; t7 t6 t15a t14a
    punpckhqdq           m2, m4, m1        ; t2a t6  t10 t14a
    punpcklqdq           m4, m1            ; t3a t7  t11 t15a
    ret
   1781 
INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity

; 8x16 inverse flipped ADST: identical math to iadst_8x16, entered into
; the shared iadst tails with a mirrored shuffle (int_shuf2), swapped
; sign constant, and a different shift of the m10 permutation selector.
cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_16x8_internal_8bpc).main_pass1
    vbroadcasti32x4      m6, [o(int_shuf2)]
    vpbroadcastd         m7, [o(pw_m16384_16384)] ; signs swapped vs iadst
    punpcklwd            m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd            m4, m0     ; g0 h0 g1 h1 g2 h2 g3 h3
    pshufb               m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
    jmp m(iadst_8x16_internal_8bpc).pass1_end
.pass2:
    call m(iadst_8x16_internal_8bpc).main_pass2
    vpbroadcastd         m7, [o(pw_2048)]
    psrlq               m10, 36    ; mirrored output permutation vs iadst (psrlq 4)
    psubw                m6, m8, m7 ; m8 = 0, so m6 = -2048; m6/m7 roles swapped
    jmp m(iadst_8x16_internal_8bpc).pass2_end
   1802 
INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

; 8x16 identity transform. Pass 1 transposes the coefficients and applies
; the rect2 prescale; pass 2 applies the identity16 scale and stores via
; the shared idct_8x16 tail.
cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [o(int16_perm)]
    vpermb               m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
    vpermb               m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
    vpermb               m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
    vpermb               m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
    vpbroadcastd         m5, [o(pw_2896x8)]
    punpckldq            m1, m3, m2        ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m2            ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m2, m4, m0        ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq            m4, m0            ; e2 f2 g2 h2 e3 f3 g3 h3
    REPX   {pmulhrsw x, m5}, m1, m2, m3, m4
    punpcklqdq           m0, m1, m2        ; a0 b0 c0 d0 e0 f0 g0 h0
    punpckhqdq           m1, m2            ; a1 b1 c1 d1 e1 f1 g1 h1
    punpcklqdq           m2, m3, m4        ; a2 b2 c2 d2 e2 f2 g2 h2
    punpckhqdq           m3, m4            ; a3 b3 c3 d3 e3 f3 g3 h3
    jmp                tx2q
.pass2:
    ; identity16: out = 2*x + (x*1697x16 via pmulhrsw) =~ 2*sqrt(2)*x
    vpbroadcastd         m7, [o(pw_1697x16)]
    mova                ym8, [o(gather8b)]
    lea                  r3, [dstq+strideq*2]
    pmulhrsw             m4, m7, m0
    pmulhrsw             m5, m7, m1
    pmulhrsw             m6, m7, m2
    pmulhrsw             m7, m3
    REPX      {paddsw x, x}, m0, m1, m2, m3
    paddsw               m0, m4
    paddsw               m1, m5
    paddsw               m2, m6
    paddsw               m3, m7
    jmp m(idct_8x16_internal_8bpc).end
   1839 
; Add two rows of 16 residual words (%1/%2, register numbers or memory
; operands) to the destination pixels at dstq+%5 and dstq+%6, clamp to
; 8 bits, and store. %3/%4 are scratch register numbers.
%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
    pmovzxbw            m%3, [dstq+%5]  ; first dst row, widened to words
%ifnum %1
    paddw               m%3, m%1
%else
    paddw               m%3, %1
%endif
    pmovzxbw            m%4, [dstq+%6]  ; second dst row
%ifnum %2
    paddw               m%4, m%2
%else
    paddw               m%4, %2
%endif
    packuswb            m%3, m%4        ; saturate to u8 (lane-interleaved)
    vpermq              m%3, m%3, q3120 ; undo packuswb's lane interleave
    mova          [dstq+%5], xm%3
    vextracti32x4 [dstq+%6], m%3, 1
%endmacro
   1858 
; Declare a 16x4 inverse-transform entry point for the given type pair.
; The dct_dct dc-only shortcut reuses the 16x8 dconly2 tail, which applies
; the remaining scaling and the add-to-dst loop.
%macro INV_TXFM_16X4_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x4
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd ; store eobd (0 on this path) to clear the dc coeff
    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2
%endif
%endmacro
   1867 
INIT_ZMM avx512icl
INV_TXFM_16X4_FN dct, dct
INV_TXFM_16X4_FN dct, adst
INV_TXFM_16X4_FN dct, flipadst
INV_TXFM_16X4_FN dct, identity

; 16x4 inverse DCT. Pass 1 reuses the 4x16 column kernel on the 16 rows of
; 4 coefficients, then scales by 1/2 and transposes; pass 2 is a packed
; 4-point DCT followed by the shared iadst_16x4 store tail.
cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                xm0, [cq+16*0]
    mova                xm1, [cq+16*1]
    mova                xm2, [cq+16*2]
    mova                xm3, [cq+16*3]
    mova                xm4, [cq+16*4]
    mova                xm5, [cq+16*5]
    mova                xm6, [cq+16*6]
    mova                xm7, [cq+16*7]
    call m(idct_4x16_internal_8bpc).main
    vpbroadcastd         m8, [o(pw_16384)] ; pmulhrsw by 16384 = rounding /2
    ; gather the 16 transformed rows into two zmm registers
    vinserti32x4        ym1, xm3, 1 ; 3 2   7 6
    vinserti32x4        ym5, xm7, 1 ; b a   f e
    vinserti32x4        ym0, xm2, 1 ; 0 1   4 5
    vinserti32x4        ym4, xm6, 1 ; 8 9   c d
    vinserti32x8         m1, ym5, 1 ; 3 2   7 6   b a   f e
    vinserti32x8         m0, ym4, 1 ; 0 1   4 5   8 9   c d
    pmulhrsw             m1, m8
    pmulhrsw             m0, m8
    pshufd               m1, m1, q1032
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
    jmp                tx2q
.pass2:
    IDCT4_1D_PACKED
    mova                 m2, [o(permA)]
    jmp m(iadst_16x4_internal_8bpc).end
   1903 
INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity

; 16x4 inverse ADST. Pass 1 reuses the 4x16 ADST kernel and finishes the
; middle butterfly with dword dot-products; .pass1_end and .end* are shared
; entry points (flipadst enters .pass1_end with different permutation
; shifts; dct/identity enter .end with their own output permutation in m2).
cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+64*0]
    mova                 m1, [cq+64*1]
    movshdup             m3, [o(permB)]
    psrlq               m10, m3, 4
    call m(iadst_4x16_internal_8bpc).main2
    vpbroadcastd         m6, [o(pw_16384_m16384)] ; /2 scale with per-half sign
    psrlq                m0, m10, 4
    psrlq               m10, 8
.pass1_end:
    punpcklwd           ym5, ym4, ym2
    punpckhwd           ym4, ym2
    vinserti32x8         m5, ym4, 1
    ; (a+/-b)*2896 >> 12 butterflies; m9 holds the dword rounding bias
    ; left by .main2 (presumably pd_2048 -- TODO confirm)
    mova                 m1, m9
    vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16}
    mova                 m4, m9
    vpdpwssd             m4, m5, [o(pw_2896_2896)] {1to16}
    psrad                m1, 12
    psrad                m4, 12
    packssdw             m1, m4 ;  out8  -out7  -out9   out6  -out11  out4   out10 -out5
    vpermi2q             m0, m1, m3  ; 0 1   4 5   8 9   c d
    vpermt2q             m1, m10, m3 ; 2 3   6 7   a b   e f
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
    pmulhrsw             m0, m6
    pmulhrsw             m1, m6
    jmp                tx2q
.pass2:
    call .main
    movu                 m2, [o(permA+1)]
.end:
    vpbroadcastd         m3, [o(pw_2048)] ; final rounding: (x+8)>>4
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
.end2:
    ; split the two result registers into the 4 output rows via m2/m3
    psrlq                m3, m2, 4
    vpermi2q             m2, m0, m1
    vpermi2q             m3, m0, m1
.end3:
    lea                  r3, [dstq+strideq*2]
    mova                xm1, [dstq+strideq*0]
    vinserti32x4        ym1, [dstq+strideq*1], 1
    vinserti32x4         m1, [r3  +strideq*0], 2
    vinserti32x4         m1, [r3  +strideq*1], 3
    pxor                 m4, m4
    mova          [cq+64*0], m4 ; clear the coefficient buffer
    mova          [cq+64*1], m4
    punpcklbw            m0, m1, m4 ; widen pixels, add residual, clamp
    punpckhbw            m1, m4
    paddw                m0, m2
    paddw                m1, m3
    packuswb             m0, m1
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [r3  +strideq*0], m0, 2
    vextracti32x4 [r3  +strideq*1], m0, 3
    RET
ALIGN function_align
.main:
    ; packed 4-point ADST, shared with flipadst
    IADST4_1D_PACKED
    ret
   1972 
INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity

; 16x4 inverse flipped ADST: same math as iadst_16x4, entered into the
; shared tails with a mirrored permutation (different m10 shifts) and a
; negated sign constant.
cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m0, [cq+64*0]
    mova                 m1, [cq+64*1]
    movshdup             m3, [o(permB)]
    psrlq               m10, m3, 4
    call m(iadst_4x16_internal_8bpc).main2
    vpbroadcastd         m6, [o(pw_m16384_16384)] ; signs swapped vs iadst
    psrlq                m0, m10, 12 ; mirrored permutation shifts vs iadst (4/8)
    psrlq               m10, 16
    jmp m(iadst_16x4_internal_8bpc).pass1_end
.pass2:
    call m(iadst_16x4_internal_8bpc).main
    movu                m2, [o(permA+2)] ; flipped output row permutation
    jmp m(iadst_16x4_internal_8bpc).end
   1992 
INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity

; 16x4 identity transform. Both passes are a pure scale:
; out = x + x*1697/4096 =~ sqrt(2)*x (the 4-point identity scale),
; applied via pmulhrsw with pw_1697x16 (+16384 post-scale) in pass 1
; and pw_1697x8 in pass 2, plus a transpose in pass 1.
cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova                 m1, [cq+64*0]
    mova                 m2, [cq+64*1]
    vpbroadcastd         m3, [o(pw_1697x16)]
    vpbroadcastd         m4, [o(pw_16384)]
    mova                 m5, [o(idtx_16x4p)]
    shufps               m0, m1, m2, q2020 ; even rows
    shufps               m1, m2, q3131     ; odd rows
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    pmulhrsw             m2, m4 ; halve the correction term with rounding
    pmulhrsw             m3, m4
    paddsw               m0, m2
    paddsw               m1, m3
    vpermb               m0, m5, m0 ; transpose via byte permute
    vpermb               m1, m5, m1
    jmp                tx2q
.pass2:
    vpbroadcastd         m3, [o(pw_1697x8)]
    pmulhrsw             m2, m3, m0
    pmulhrsw             m3, m1
    paddsw               m0, m2
    paddsw               m1, m3
    movu                 m2, [o(permA+1)]
    jmp m(iadst_16x4_internal_8bpc).end
   2023 
; Declare a 16x8 inverse-transform entry point for the given type pair.
; The dct_dct dc-only path has three cascaded entry points, each applying
; one more 181/256 (~1/sqrt(2)) scaling; they are also jumped to by other
; block sizes (e.g. 8x16 uses .dconly, 16x4 uses .dconly2) so the number
; of scalings matches each shape. .dconly3 folds in the final >>4 output
; shift, then the loop adds the broadcast dc value to 4 rows of 16 pixels
; per iteration (r3d = remaining row count).
%macro INV_TXFM_16X8_FN 2 ; type1, type2
    INV_TXFM_FN          %1, %2, 16x8
%ifidn %1_%2, dct_dct
    movsx               r6d, word [cq]
    mov                [cq], eobd ; store eobd (0 on this path) to clear the dc coeff
    or                  r3d, 8    ; row count
.dconly:
    imul                r6d, 181  ; *181/256 with rounding
    add                 r6d, 128
    sar                 r6d, 8
.dconly2:
    imul                r6d, 181  ; *181/512: second 1/sqrt(2) plus /2
    add                 r6d, 128+256
    sar                 r6d, 8+1
.dconly3:
    imul                r6d, 181  ; *181/256 combined with the final >>4
    lea                  r2, [strideq*3]
    add                 r6d, 128+2048
    sar                 r6d, 8+4
    pxor                 m2, m2
    vpbroadcastw         m3, r6d  ; replicated dc value
.dconly_loop:
    mova                xm1, [dstq+strideq*0]
    vinserti32x4        ym1, [dstq+strideq*1], 1
    vinserti32x4         m1, [dstq+strideq*2], 2
    vinserti32x4         m1, [dstq+r2       ], 3
    punpcklbw            m0, m1, m2 ; widen pixels to words
    punpckhbw            m1, m2
    paddw                m0, m3
    paddw                m1, m3
    packuswb             m0, m1     ; clamp back to u8
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2       ], m0, 3
    lea                dstq, [dstq+strideq*4]
    sub                 r3d, 4
    jg .dconly_loop
    RET
%endif
%endmacro
   2065 
; Load the 8 coefficient rows of a 16x8 transform into m0-m7, prescaling
; by 2896/4096 (~1/sqrt(2), rect2). The odd rows use the caller-supplied
; qword shuffle %1 so different transform types can pick their lane order.
%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
    vpbroadcastd         m8, [o(pw_2896x8)]
    vpermq               m0, [cq+32*0], q3120
    add                  cq, 32*4
    vpermq               m7, [cq+32*3], q%1
    vpermq               m1, [cq-32*3], q%1
    vpermq               m6, [cq+32*2], q3120
    vpermq               m2, [cq-32*2], q3120
    vpermq               m5, [cq+32*1], q%1
    vpermq               m3, [cq-32*1], q%1
    vpermq               m4, [cq+32*0], q3120
    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
%endmacro
   2079 
INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst

; 16x8 inverse DCT. Pass 1 reuses the 8x16 16-point kernel on the rows
; (split across zmm halves), scales by 1/2 and transposes; pass 2 runs the
; packed 8-point DCT and adds to dst. .end/.end2 and .main are shared
; entry points used by the other 16x8 transform types.
cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd         m1, [o(pw_2896x8)] ; rect2 prescale, ~1/sqrt(2)
    vpermq               m0, [cq+64*0], q3120
    vpermq               m2, [cq+64*1], q3120
    vpermq               m4, [cq+64*2], q3120
    vpermq               m6, [cq+64*3], q3120
    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6
    vextracti32x8       ym1, m0, 1
    vextracti32x8       ym3, m2, 1
    vextracti32x8       ym5, m4, 1
    vextracti32x8       ym7, m6, 1
    call m(idct_8x16_internal_8bpc).main
    vbroadcasti32x4      m8, [o(int_shuf1)]
    vbroadcasti32x4      m9, [o(int_shuf2)]
    vinserti32x8         m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3
    vinserti32x8         m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3
    vinserti32x8         m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3
    vinserti32x8         m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3
    vpbroadcastd         m2, [o(pw_16384)] ; pmulhrsw by 16384 = rounding /2
    pshufb               m0, m8     ; a0 b0 a1 b1 a2 b2 a3 b3
    pshufb               m1, m9     ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb               m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3
    pshufb               m7, m5, m9 ; k0 l0 k1 l1 k2 l2 k3 l3
    REPX   {pmulhrsw x, m2}, m0, m1, m6, m7
    punpckldq            m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq            m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
    jmp                tx2q
.pass2:
    vshufi32x4           m0, m2, m4, q2020 ; 0 1
    vshufi32x4           m2, m4, q3131     ; 4 5
    vshufi32x4           m1, m3, m5, q2020 ; 2 3
    vshufi32x4           m3, m5, q3131     ; 6 7
    call .main
    ; regroup the packed outputs into store order
    movshdup             m4, [o(permC)]
    psrlq                m6, m4, 4
    vpermq               m5, m4, q1032
    vpermi2q             m4, m0, m2 ; a2 a3   b2 b3   e2 e3   f2 f3
    vpermt2q             m0, m6, m2 ; a0 a1   b0 b1   e0 e1   f0 f1
    psrlq                m6, m5, 4
    vpermi2q             m5, m1, m3 ; c2 c3   d2 d3   g2 g3   h2 h3
    vpermt2q             m1, m6, m3 ; c0 c1   d0 d1   g0 g1   h0 h1
    vpbroadcastd         m6, [o(pw_2048)] ; final rounding: (x+8)>>4
.end:
    REPX   {pmulhrsw x, m6}, m0, m4, m1, m5
.end2:
    lea                  r3, [dstq+strideq*4]
    lea                  r4, [strideq*3]
    ; load all 8 dst rows, add the residual, clamp and store
    mova                xm3, [dstq+strideq*0]
    mova                xm6, [dstq+strideq*2]
    vinserti32x4        ym3, [dstq+strideq*1], 1
    vinserti32x4        ym6, [dstq+r4       ], 1
    vinserti32x4         m3, [r3  +strideq*0], 2
    vinserti32x4         m6, [r3  +strideq*2], 2
    vinserti32x4         m3, [r3  +strideq*1], 3
    vinserti32x4         m6, [r3  +r4       ], 3
    pxor                 m7, m7
    mova          [cq+64*0], m7 ; clear the coefficient buffer
    mova          [cq+64*1], m7
    mova          [cq+64*2], m7
    mova          [cq+64*3], m7
    punpcklbw            m2, m3, m7
    punpckhbw            m3, m7
    paddw                m0, m2
    paddw                m4, m3
    packuswb             m0, m4
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [r3  +strideq*0], m0, 2
    vextracti32x4 [r3  +strideq*1], m0, 3
    punpcklbw            m3, m6, m7
    punpckhbw            m6, m7
    paddw                m1, m3
    paddw                m5, m6
    packuswb             m1, m5
    mova          [dstq+strideq*2], xm1
    vextracti32x4 [dstq+r4       ], ym1, 1
    vextracti32x4 [r3  +strideq*2], m1, 2
    vextracti32x4 [r3  +r4       ], m1, 3
    RET
ALIGN function_align
cglobal_label .main
    ; packed 8-point IDCT, shared with other 16x8/8x16 paths
    IDCT8_1D_PACKED
    ret
   2170 
   2171 INV_TXFM_16X8_FN adst, dct
   2172 INV_TXFM_16X8_FN adst, adst
   2173 INV_TXFM_16X8_FN adst, flipadst
   2174 INV_TXFM_16X8_FN adst, identity
   2175 
; 16x8 inverse ADST, 8 bpc, AVX-512.
; Pass 1 runs the shared 8x16 ADST column kernel, finishes the half-pel
; 2896/4096 butterflies with vpdpwssd, transposes via punpck/vperm, and
; scales by pw_16384_m16384 (sign pattern absorbs the odd-output negation).
; Pass 2 runs the packed row ADST and tails into the idct_16x8 store path.
cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   call m(iadst_8x16_internal_8bpc).main_pass1
   vpbroadcastd         m7, [o(pw_16384_m16384)]
   psrlq               m10, 4     ; shift the qword permute control for this layout
.pass1_end:
   ; shared with iflipadst_16x8 (which enters with different m7/m10)
   punpcklwd            m5, m4, m2
   punpckhwd            m4, m2
   ; m9 holds the pd_2048 rounder from main_pass1; vpdpwssd accumulates
   ; the +-2896 dot products on top of it, then >>12 completes the rounding
   mova                 m1, m9
   vpdpwssd             m1, m5, [o(pw_m2896_2896)] {1to16}
   mova                 m6, m9
   vpdpwssd             m6, m5, [o(pw_2896_2896)] {1to16}
   mova                 m2, m9
   vpdpwssd             m2, m4, [o(pw_m2896_2896)] {1to16}
   vpdpwssd             m9, m4, [o(pw_2896_2896)] {1to16}
   psrad                m1, 12
   psrad                m6, 12
   packssdw             m1, m6 ;  out8  -out7  -out9   out6
   psrad                m2, 12
   psrad                m9, 12
   packssdw             m2, m9 ; -out11  out4   out10 -out5
   ; interleave rows across registers (transpose) using qword permutes
   psrlq                m4, m10, 4
   vpermi2q             m4, m0, m2
   vpermt2q             m0, m10, m2
   psrlq                m5, m10, 8
   vpermi2q             m5, m1, m3
   psrlq               m10, 12
   vpermt2q             m1, m10, m3
   punpcklwd            m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3
   punpckhwd            m4, m5     ; b0 d0 b1 d1 b2 d2 b3 d3
   punpcklwd            m5, m1, m0 ; i0 k0 i1 k1 i2 k2 i3 k3
   punpckhwd            m1, m0     ; j0 l0 j1 l1 j2 l2 j3 l3
   punpcklwd            m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1
   punpckhwd            m3, m4     ; a2 b2 c2 d2 a3 b3 c3 d3
   punpcklwd            m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1
   punpckhwd            m5, m1     ; i2 j2 k2 l2 i3 j3 k3 l3
   REPX   {pmulhrsw x, m7}, m2, m3, m4, m5
   jmp                tx2q
.pass2:
   ; gather the four 16-wide rows into packed pairs for the row transform
   vshufi32x4           m0, m2, m4, q2020
   vshufi32x4           m2, m4, q3131     ; 4 5
   vshufi32x4           m1, m3, m5, q2020
   vshufi32x4           m3, m5, q3131     ; 6 7
   pshufd               m4, m0, q1032     ; 1 0
   pshufd               m5, m1, q1032     ; 3 2
   call .main_pass2
   movshdup             m4, [o(permC)]
   pmulhrsw             m0, m6
   pmulhrsw             m1, m6
   psrlq                m6, m4, 4
   mova                 m5, m4
   vpermi2q             m4, m0, m2
   vpermt2q             m0, m6, m2
   vpermi2q             m5, m1, m3
   vpermt2q             m1, m6, m3
   jmp m(idct_16x8_internal_8bpc).end2
ALIGN function_align
.main_pass1:
   ; prescale coefficients by 2896/4096 and pair rows (0,7)(6,1)(2,5)(4,3)
   ; via blend/merge before the packed 8-point ADST
   vpbroadcastd         m4, [o(pw_2896x8)]
   pmulhrsw             m3, m4, [cq+64*0]
   pmulhrsw             m1, m4, [cq+64*3]
   pmulhrsw             m2, m4, [cq+64*1]
   pmulhrsw             m4, [cq+64*2]
   mova                 m5, [o(int16_perm)]
   kxnorb               k1, k1, k1     ; k1 = all-ones blend mask
   vpblendmd        m0{k1}, m1, m3 ; 0 7
   vmovdqa32        m3{k1}, m1     ; 6 1
   vpblendmd        m1{k1}, m4, m2 ; 2 5
   vmovdqa32        m2{k1}, m4     ; 4 3
   REPX  {vpermb x, m5, x}, m0, m1, m2, m3
   IADST8_1D_PACKED 1
   ret
ALIGN function_align
cglobal_label .main_pass2
   ; row ADST; m6 becomes a packed +/- rounding multiplier applied to m2/m3
   ; (callers apply it to m0/m1 themselves, with per-type sign handling)
   IADST8_1D_PACKED 2
   pxor                 m5, m5
   psubd                m5, m6
   packssdw             m6, m5
   pmulhrsw             m2, m6
   pmulhrsw             m3, m6
   ret
   2256 
; Instantiate the iflipadst 16x8 entry points, one per second-pass
; transform type.
INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity
   2261 
; 16x8 inverse flipped-ADST, 8 bpc, AVX-512.
; Same column kernel as iadst_16x8; the flip is achieved by the negated
; scale constant (pw_m16384_16384) and a different shift of the qword
; permute control (20 instead of 4), then pass 1 is shared verbatim.
cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   call m(iadst_8x16_internal_8bpc).main_pass1
   vpbroadcastd         m7, [o(pw_m16384_16384)]
   psrlq               m10, 20
   jmp m(iadst_16x8_internal_8bpc).pass1_end
.pass2:
   vshufi32x4           m0, m2, m4, q2020
   vshufi32x4           m2, m4, q3131     ; 4 5
   vshufi32x4           m1, m3, m5, q2020
   vshufi32x4           m3, m5, q3131     ; 6 7
   pshufd               m4, m0, q1032     ; 1 0
   pshufd               m5, m1, q1032     ; 3 2
   call m(iadst_16x8_internal_8bpc).main_pass2
   ; output permutation is reversed relative to iadst to realize the flip
   movshdup             m4, [o(permC)]
   pmulhrsw             m5, m6, m0
   pmulhrsw             m0, m6, m1
   psrlq                m1, m4, 12
   psrlq                m4, 8
   mova                 m7, m4
   vpermi2q             m4, m0, m3
   vpermt2q             m0, m1, m3
   vpermi2q             m1, m5, m2
   vpermt2q             m5, m7, m2
   jmp m(idct_16x8_internal_8bpc).end2
   2286 
; Instantiate the iidentity 16x8 entry points, one per second-pass
; transform type.
INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity
   2291 
; 16x8 inverse identity transform, 8 bpc, AVX-512.
; Pass 1: scale by 2896/4096, then apply the identity-16 gain
; (x + x*1697/2048, via pmulhrsw with pw_1697x16 and pw_16384 correction)
; while transposing with shufps/vpermb.
; Pass 2: permute to store order, scale by pw_4096, reuse idct_16x8's
; add-to-dst store path.
cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   vpbroadcastd         m0, [o(pw_2896x8)]
   pmulhrsw             m3, m0, [cq+64*0]
   pmulhrsw             m4, m0, [cq+64*1]
   pmulhrsw             m5, m0, [cq+64*2]
   pmulhrsw             m0,     [cq+64*3]
   vpbroadcastd         m7, [o(pw_1697x16)]
   vpbroadcastd         m8, [o(pw_16384)]
   shufps               m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5
   shufps               m3, m4, q3131     ; a2 a3 a6 a7 e2 e3 e6 e7
   shufps               m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5
   shufps               m5, m0, q3131     ; i2 i3 i6 i7 m2 m3 m6 m7
   mova                 m9, [o(int8_permA)]
   pmulhrsw             m0, m7, m2
   pmulhrsw             m1, m7, m3
   pmulhrsw             m6, m7, m4
   pmulhrsw             m7, m5
   REPX   {pmulhrsw x, m8}, m0, m1, m6, m7
   paddsw               m2, m0
   paddsw               m3, m1
   paddsw               m4, m6
   paddsw               m5, m7
   REPX  {vpermb x, m9, x}, m2, m3, m4, m5
   jmp                tx2q
.pass2:
   mova                 m7, [o(permB)]
   vpbroadcastd         m6, [o(pw_4096)]
   vpermq               m0, m7, m2
   vpermq               m4, m7, m4
   vpermq               m1, m7, m3
   vpermq               m5, m7, m5
   jmp m(idct_16x8_internal_8bpc).end
   2324 
; Emit a 16x16 inverse-transform wrapper for the given type pair.
; For dct_dct, also emit a DC-only fast path: load the DC coefficient,
; clear it (eob is written over it), scale by 181/128 (~sqrt(2), since
; 181 = round(128*sqrt(2))) with combined rounding, and tail-call the
; shared 16-wide dconly store loop with a 16-row count in r3d.
%macro INV_TXFM_16X16_FN 2 ; type1, type2
   INV_TXFM_FN          %1, %2, 16x16
%ifidn %1_%2, dct_dct
   movsx               r6d, word [cq]
   mov                [cq], eobd
   or                  r3d, 16
   imul                r6d, 181
   add                 r6d, 128+512
   sar                 r6d, 8+2
   jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
%endif
%endmacro
   2337 
; Instantiate the idct 16x16 entry points, one per second-pass
; transform type.
INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, identity
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
   2342 
; 16x16 inverse DCT, 8 bpc, AVX-512.
; Pass 1: permute coefficient rows into packed pairs, run the 16-point
; column DCT (.main), transpose with pshufb/punpck, scale by pw_8192.
; Pass 2: regroup into row order, run .main again, permute to store
; order and fall into the shared .end add-to-dst path.
; .main_fast2/.main_fast handle partially-zero coefficient blocks and
; are also entered from other transform sizes (cglobal_label).
cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   mova                 m7, [o(permB)]
   vpermq               m0, m7, [cq+64*0]
   vpermq               m1, m7, [cq+64*1]
   vpermq               m2, m7, [cq+64*2]
   vpermq               m3, m7, [cq+64*3]
   vpermq               m4, m7, [cq+64*4]
   vpermq               m5, m7, [cq+64*5]
   vpermq               m6, m7, [cq+64*6]
   vpermq               m7, m7, [cq+64*7]
   call .main
   ; in-register transpose: byte shuffles pair 16-bit lanes, then
   ; dword unpacks complete the 4x4 sub-block transpose
   vbroadcasti32x4     m12, [o(int_shuf1)]
   vbroadcasti32x4     m11, [o(int_shuf2)]
   vpbroadcastd        m13, [o(pw_8192)]
   pshufb               m0, m12
   pshufb               m8, m1, m11
   pshufb               m2, m12
   pshufb               m9, m3, m11
   pshufb               m4, m12
   pshufb              m10, m5, m11
   pshufb               m6, m12
   pshufb              m11, m7, m11
   REPX  {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11
   punpckhdq            m1, m0, m8
   punpckldq            m0, m8
   punpckhdq            m3, m2, m9
   punpckldq            m2, m9
   punpckhdq            m5, m4, m10
   punpckldq            m4, m10
   punpckhdq            m7, m6, m11
   punpckldq            m6, m11
   jmp                tx2q
.pass2:
   ; regroup 128-bit lanes so each register holds two adjacent rows
   vshufi32x4           m8, m4, m6, q3232 ; i8 ic m8 mc
   vinserti32x8         m4, ym6, 1        ; i0 i4 m0 m4
   vshufi32x4           m6, m0, m2, q3232 ; a8 ac e8 ec
   vinserti32x8         m0, ym2, 1        ; a0 a4 e0 e4
   vshufi32x4           m9, m5, m7, q3232 ; ia ie ma me
   vinserti32x8         m5, ym7, 1        ; i2 i6 m2 m6
   vshufi32x4           m7, m1, m3, q3232 ; aa ae ea ee
   vinserti32x8         m1, ym3, 1        ; a2 a6 e2 e6
   vshufi32x4           m2, m0, m4, q3131 ;  4  5
   vshufi32x4           m0, m4, q2020     ;  0  1
   vshufi32x4           m4, m6, m8, q2020 ;  8  9
   vshufi32x4           m6, m8, q3131     ; 12 13
   vshufi32x4           m3, m1, m5, q3131 ;  6  7
   vshufi32x4           m1, m5, q2020     ;  2  3
   vshufi32x4           m5, m7, m9, q2020 ; 10 11
   vshufi32x4           m7, m9, q3131     ; 14 15
   call .main
   ; permute outputs into row-store order (four 4-row groups)
   mova                  m8, [o(permD)]
   psrlq                m12, m8, 4
   psrlq                 m9, m8, 8
   psrlq                m13, m8, 12
   mova                 m10, m8
   vpermi2q              m8, m0, m2 ;  0  1  4  5
   vpermt2q              m0, m12, m2
   mova                 m11, m9
   vpermi2q              m9, m1, m3 ;  2  3  6  7
   vpermt2q              m1, m13, m3
   vpermi2q             m10, m4, m6 ;  8  9 12 13
   vpermt2q              m4, m12, m6
   vpermi2q             m11, m5, m7 ; 10 11 14 15
   vpermt2q              m5, m13, m7
.end:
   ; final rounding (pw_2048 = 1/16 with pmulhrsw), then add to dst;
   ; other transforms jump here with their own m12 at .end2/.end3
   vpbroadcastd        m12, [o(pw_2048)]
.end2:
   REPX  {pmulhrsw x, m12}, m0, m1, m4, m5
.end3:
   REPX  {pmulhrsw x, m12}, m8, m9, m10, m11
   lea                  r3, [strideq*3]
   lea                  r4, [dstq+strideq*4]
   lea                  r5, [dstq+strideq*8]
   lea                  r6, [r4  +strideq*8]
   ; gather 16 dst rows as four zmm registers of 4 rows each
   mova                xm3, [dstq+strideq*0]
   mova                xm6, [dstq+strideq*2]
   vinserti32x4        ym3, [dstq+strideq*1], 1
   vinserti32x4        ym6, [dstq+r3       ], 1
   vinserti32x4         m3, [r4+strideq*0], 2
   vinserti32x4         m6, [r4+strideq*2], 2
   vinserti32x4         m3, [r4+strideq*1], 3
   vinserti32x4         m6, [r4+r3       ], 3
   mova               xm12, [r5+strideq*0]
   mova               xm13, [r5+strideq*2]
   vinserti32x4       ym12, [r5+strideq*1], 1
   vinserti32x4       ym13, [r5+r3       ], 1
   vinserti32x4        m12, [r6+strideq*0], 2
   vinserti32x4        m13, [r6+strideq*2], 2
   vinserti32x4        m12, [r6+strideq*1], 3
   vinserti32x4        m13, [r6+r3       ], 3
   pxor                 m7, m7
   REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7  ; clear coefficients
   ; widen dst pixels, add residual, saturate back to bytes
   punpcklbw            m2, m3, m7
   punpckhbw            m3, m7
   paddw                m0, m2
   paddw                m8, m3
   packuswb             m0, m8
   punpcklbw            m2, m6, m7
   punpckhbw            m6, m7
   paddw                m1, m2
   paddw                m9, m6
   packuswb             m1, m9
   punpcklbw            m2, m12, m7
   punpckhbw           m12, m7
   paddw                m2, m4
   paddw               m10, m12
   packuswb             m2, m10
   punpcklbw            m3, m13, m7
   punpckhbw           m13, m7
   paddw                m3, m5
   paddw               m11, m13
   packuswb             m3, m11
   mova          [dstq+strideq*0], xm0
   vextracti32x4 [dstq+strideq*1], ym0, 1
   mova          [dstq+strideq*2], xm1
   vextracti32x4 [dstq+r3       ], ym1, 1
   vextracti32x4 [r4+strideq*0], m0, 2
   vextracti32x4 [r4+strideq*1], m0, 3
   vextracti32x4 [r4+strideq*2], m1, 2
   vextracti32x4 [r4+r3       ], m1, 3
   mova          [r5+strideq*0], xm2
   vextracti32x4 [r5+strideq*1], ym2, 1
   mova          [r5+strideq*2], xm3
   vextracti32x4 [r5+r3       ], ym3, 1
   vextracti32x4 [r6+strideq*0], m2, 2
   vextracti32x4 [r6+strideq*1], m2, 3
   vextracti32x4 [r6+strideq*2], m3, 2
   vextracti32x4 [r6+r3       ], m3, 3
   RET
ALIGN function_align
cglobal_label .main_fast2 ; bottom three-quarters are zero
   vpbroadcastd        m10, [o(pd_2048)]
   vpbroadcastq        m13, [o(int_mshift)]
   vpcmpub              k7, m13, m10, 6    ; build alternating-lane mask for ITX_MUL2X_PACK
.main_fast4:
   ; with only rows 0-3 nonzero the odd-stage rotations collapse to
   ; single pmulhrsw multiplies by packed cos/sin constants
   vpbroadcastd         m2, [o(pw_401_4076x8)]
   vpbroadcastd         m4, [o(pw_m1189_3920x8)]
   vpbroadcastd         m3, [o(pw_799_4017x8)]
   pmulhrsw             m2, m8     ; t8a  t15a
   pmulhrsw             m4, m1     ; t11a t12a
   pmulhrsw             m7, m3     ; t4a  t7a
   pxor                 m6, m6
   psubsw               m0, m2, m4 ; t11a t12a
   paddsw               m8, m2, m4 ; t8a  t15a
   mova                 m1, m7
   jmp .main5
ALIGN function_align
cglobal_label .main_fast ; bottom half is zero
   vpbroadcastd        m10, [o(pd_2048)]
.main_fast3:
   vpbroadcastq        m13, [o(int_mshift)]
   vpcmpub              k7, m13, m10, 6
.main_fast5:
   vpbroadcastd         m2, [o(pw_401_4076x8)]
   vpbroadcastd         m4, [o(pw_m2598_3166x8)]
   vpbroadcastd        m11, [o(pw_1931_3612x8)]
   vpbroadcastd        m12, [o(pw_m1189_3920x8)]
   pmulhrsw             m8, m2  ; t8a  t15a
   vpbroadcastd         m2, [o(pw_799_4017x8)]
   pmulhrsw             m0, m4  ; t9a  t14a
   vpbroadcastd         m4, [o(pw_m2276_3406x8)]
   pmulhrsw             m5, m11 ; t10a t13a
   pmulhrsw             m1, m12 ; t11a t12a
   pmulhrsw             m7, m2  ; t4a  t7a
   pmulhrsw             m3, m4  ; t5a  t6a
   jmp .main4
ALIGN function_align
cglobal_label .main
   ; full 16-point packed DCT butterfly network (macro defined elsewhere
   ; in this file; .main4/.main5 entered above live inside it)
   IDCT16_1D_PACKED
   ret
   2513 
; Instantiate the iadst 16x16 entry points (no identity pairing for
; 16x16 adst).
INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst
   2517 
; 16x16 inverse ADST, 8 bpc, AVX-512.
; Pass 1: .main_pass1 computes the 16-point column ADST (word-pair
; butterflies via ITX_MUL2X_PACK plus vpdpwssd 2896-scaling tail), then
; the results are transposed with punpck and scaled by pw_8192_m8192
; (alternating sign absorbs negated outputs).
; Pass 2: .main_pass2 regroups rows and reruns .main, then the result is
; permuted to store order; odd rows get negated pw_2048 via a masked
; vpsubw before tailing into the idct_16x16 store path.
cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   call .main_pass1
   vpbroadcastd        m10, [o(pw_8192_m8192)]
   punpcklwd            m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3
   punpckhwd            m0, m1     ; a0 c0 a1 c1 a2 c2 a3 c3
   punpckhwd            m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3
   punpcklwd            m0, m8     ; a0 b0 c0 d0 a1 b1 c1 d1
   punpcklwd            m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3
   punpckhwd            m2, m3     ; e0 g0 e1 g1 e2 g2 e3 g3
   punpckhwd            m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3
   punpcklwd            m2, m8     ; e0 f0 g0 h0 e1 f1 g1 h1
   punpckhwd            m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3
   punpcklwd            m4, m5     ; j0 l0 j1 l1 j2 l2 j3 l3
   punpckhwd            m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
   punpcklwd            m4, m8     ; i0 j0 k0 l0 i1 j1 k1 l1
   punpckhwd            m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3
   punpcklwd            m6, m7     ; n0 p0 n1 p1 n2 p2 n3 p3
   punpckhwd            m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
   punpcklwd            m6, m8     ; m0 n0 o0 p0 m1 n1 o1 p1
.pass1_end:
   ; shared with iflipadst_16x16 (enters with pw_m8192_8192 in m10)
   REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
   jmp                tx2q
.pass2:
   call .main_pass2
   mova                m10, [o(permD)]
   psrlq                m8, m10, 8
   psrlq               m12, m10, 12
   psrlq               m13, m10, 4
   mova                 m9, m8
   vpermi2q             m8, m0, m2 ;  0  1  4  5
   vpermt2q             m0, m12, m2
   vpermi2q             m9, m1, m3 ;  2  3  6  7
   vpermt2q             m1, m12, m3
   vpbroadcastd        m12, [o(pw_2048)]
   mov                 r3d, 0xff00ff00    ; lanes whose rounding constant is negated
   mova                m11, m10
   vpermi2q            m10, m4, m6 ;  8  9 12 13
   vpermt2q             m4, m13, m6
   kmovd                k1, r3d
   vpermi2q            m11, m5, m7 ; 10 11 14 15
   vpermt2q             m5, m13, m7
   pxor                 m7, m7
   vpsubw          m12{k1}, m7, m12      ; m12 = +-2048 per masked word
   jmp m(idct_16x16_internal_8bpc).end2
ALIGN function_align
.main_pass1:
   ; load coefficient rows with paired forward/reverse permutes, run the
   ; shared .main, then finish out4-out11 with +-2896 dot products
   ; accumulated onto the pd_2048 rounder held in m10
   mova                 m4, [o(permB)]
   psrlq                m3, m4, 4
   vpermq               m0, m4, [cq+64*0]
   vpermq               m7, m3, [cq+64*7]
   vpermq               m6, m4, [cq+64*6]
   vpermq               m1, m3, [cq+64*1]
   vpermq               m2, m4, [cq+64*2]
   vpermq               m5, m3, [cq+64*5]
   vpermq               m4, m4, [cq+64*4]
   vpermq               m3, m3, [cq+64*3]
   call .main
   vpbroadcastd        m13, [o(pw_2896_2896)]
   vpbroadcastd        m12, [o(pw_m2896_2896)]
   mova                 m2, m10
   vpdpwssd             m2, m5, m13       ; -out5
   mova                 m8, m10
   vpdpwssd             m8, m11, m13      ;  out4
   mova                 m9, m10
   vpdpwssd             m9, m5, m12       ;  out10
   mova                 m5, m10
   vpdpwssd             m5, m11, m12      ; -out11
   mova                m11, m10
   vpdpwssd            m11, m3, m13       ; -out7
   mova                m14, m10
   vpdpwssd            m14, m4, m13       ;  out6
   mova                m13, m10
   vpdpwssd            m13, m3, m12       ;  out8
   vpdpwssd            m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9
   REPX      {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10
   packssdw             m2, m8            ; -out5   out4
   packssdw             m5, m9, m5        ;  out10 -out11
   packssdw             m3, m11, m14      ; -out7   out6
   packssdw             m4, m13, m10      ;  out8  -out9
   ret
ALIGN function_align
.main_pass2:
   ; same 128-bit-lane regroup as idct_16x16 .pass2
   vshufi32x4           m8, m4, m6, q3232 ; i8 ic m8 mc
   vinserti32x8         m4, ym6, 1        ; i0 i4 m0 m4
   vshufi32x4           m6, m0, m2, q3232 ; a8 ac e8 ec
   vinserti32x8         m0, ym2, 1        ; a0 a4 e0 e4
   vshufi32x4           m9, m5, m7, q3232 ; ia ie ma me
   vinserti32x8         m5, ym7, 1        ; i2 i6 m2 m6
   vshufi32x4           m7, m1, m3, q3232 ; aa ae ea ee
   vinserti32x8         m1, ym3, 1        ; a2 a6 e2 e6
   vshufi32x4           m2, m0, m4, q3131 ;  4  5
   vshufi32x4           m0, m4, q2020     ;  0  1
   vshufi32x4           m4, m6, m8, q2020 ;  8  9
   vshufi32x4           m6, m8, q3131     ; 12 13
   vshufi32x4           m3, m1, m5, q3131 ;  6  7
   vshufi32x4           m1, m5, q2020     ;  2  3
   vshufi32x4           m5, m7, m9, q2020 ; 10 11
   vshufi32x4           m7, m9, q3131     ; 14 15
cglobal_label .main_pass2b
   ; externally reachable entry: rows already regrouped by the caller
   REPX {pshufd x, x, q1032}, m1, m3, m5, m7
   call .main
   vpbroadcastd         m8, [o(pw_2896x8)]
   pshufb               m2, m11, m12
   pshufb               m5, m12
   pshufb               m3, m12
   pshufb               m4, m12
   punpcklqdq           m9, m5, m2        ;  t15a   t7
   punpckhqdq           m5, m2            ;  t14a   t6
   shufps               m2, m3, m4, q1032 ;  t2a    t10
   shufps               m3, m4, q3210     ;  t3a    t11
   psubsw               m4, m2, m3        ;  out8  -out9
   paddsw               m3, m2            ; -out7   out6
   paddsw               m2, m5, m9        ; -out5   out4
   psubsw               m5, m9            ;  out10 -out11
   REPX   {pmulhrsw x, m8}, m2, m3, m4, m5
   ret
ALIGN function_align
.main:
   ; packed 16-point ADST: interleave inputs as (even,15-odd) word pairs,
   ; then three rounds of paired rotations via ITX_MUL2X_PACK
   vpbroadcastd        m10, [o(pd_2048)]
   vpbroadcastq        m13, [o(int_mshift)]
   punpckhwd            m8, m7, m0 ; in14 in1
   punpcklwd            m0, m7     ; in0  in15
   punpcklwd            m7, m6, m1 ; in12 in3
   punpckhwd            m1, m6     ; in2  in13
   punpckhwd            m6, m5, m2 ; in10 in5
   punpcklwd            m2, m5     ; in4  in11
   punpcklwd            m5, m4, m3 ; in8  in7
   punpckhwd            m3, m4     ; in6  in9
   vpcmpub              k7, m13, m10, 6 ; 0x33...
   ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 5 ; t0  t1
   ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 5 ; t2  t3
   ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 5 ; t4  t5
   ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 5 ; t6  t7
   ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 5 ; t8  t9
   ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 5 ; t10 t11
   ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 5 ; t12 t13
   ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 5 ; t14 t15
   psubsw               m4, m0, m5 ; t9a  t8a
   paddsw               m0, m5     ; t1a  t0a
   psubsw               m5, m1, m6 ; t11a t10a
   paddsw               m1, m6     ; t3a  t2a
   psubsw               m6, m2, m7 ; t13a t12a
   paddsw               m2, m7     ; t5a  t4a
   psubsw               m7, m3, m8 ; t15a t14a
   paddsw               m3, m8     ; t7a  t6a
   ITX_MUL2X_PACK        4, 8, 9, 10, 799,       4017,        4 ; t8  t9
   ITX_MUL2X_PACK        6, 8, 9, 10, 799_4017,  4017_m799,  52 ; t12 t13
   ITX_MUL2X_PACK        5, 8, 9, 10, 3406,      2276,        4 ; t10 t11
   ITX_MUL2X_PACK        7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15
   psubsw               m8, m1, m3 ; t7   t6
   paddsw               m1, m3     ; t3   t2
   psubsw               m3, m0, m2 ; t5   t4
   paddsw               m0, m2     ; t1   t0
   psubsw               m2, m5, m7 ; t14a t15a
   paddsw               m7, m5     ; t10a t11a
   psubsw               m5, m4, m6 ; t12a t13a
   paddsw               m4, m6     ; t8a  t9a
   ITX_MUL2X_PACK        3, 6, 9, 10, 1567,       3784,        5 ; t5a t4a
   ITX_MUL2X_PACK        8, 6, 9, 10, 3784_m1567, 1567_3784,  52 ; t7a t6a
   ITX_MUL2X_PACK        2, 6, 9, 10, 3784,       1567,        4 ; t15 t14
   ITX_MUL2X_PACK        5, 6, 9, 10, 3784_1567,  1567_m3784, 52 ; t13 t12
   vbroadcasti32x4     m12, [o(deint_shuf)]
   paddsw               m6, m4, m7        ; -out1  out14
   psubsw               m4, m7            ;  t10    t11
   psubsw              m11, m3, m8        ;  t7     t6
   paddsw               m8, m3            ;  out12 -out3
   psubsw               m3, m0, m1        ;  t3a    t2a
   paddsw               m0, m1            ; -out15  out0
   paddsw               m1, m2, m5        ; -out13  out2
   psubsw               m5, m2            ;  t15a   t14a
   pshufb               m0, m12
   pshufb               m6, m12
   pshufb               m8, m12
   pshufb               m1, m12
   shufps               m7, m6, m0, q1032 ;  out14 -out15
   shufps               m0, m6, m0, q3210 ; -out1   out0
   punpcklqdq           m6, m8, m1        ;  out12 -out13
   punpckhqdq           m1, m8, m1        ; -out3   out2
   ret
   2697 
; Instantiate the iflipadst 16x16 entry points.
INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst
   2701 
; 16x16 inverse flipped-ADST, 8 bpc, AVX-512.
; Shares the iadst_16x16 kernels; the flip is realized by reversing the
; register/lane assignment during the pass-1 transpose, by the negated
; scale (pw_m8192_8192), and by the inverted mask (0x00ff00ff vs
; 0xff00ff00) used for the +-2048 rounding in pass 2.
cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   call m(iadst_16x16_internal_8bpc).main_pass1
   vpbroadcastd        m10, [o(pw_m8192_8192)]
   punpcklwd            m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3
   punpckhwd            m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3
   punpckhwd            m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3
   punpcklwd            m7, m6     ; b0 d0 b1 d1 b2 d2 b3 d3
   punpcklwd            m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1
   punpckhwd            m1, m7     ; a2 b2 c2 d2 a3 b3 c3 d3
   punpcklwd            m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1
   punpckhwd            m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3
   punpcklwd            m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3
   punpckhwd            m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3
   punpckhwd            m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3
   punpcklwd            m5, m4     ; f0 h0 f1 h1 f2 h2 f3 h3
   punpcklwd            m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1
   punpckhwd            m3, m5     ; e2 f2 g2 h2 e3 f3 g3 h3
   punpcklwd            m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1
   punpckhwd            m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3
   jmp m(iadst_16x16_internal_8bpc).pass1_end
.pass2:
   call m(iadst_16x16_internal_8bpc).main_pass2
   mova                m10, [o(permD)]
   psrlq                m8, m10, 8
   psrlq               m12, m10, 12
   psrlq               m13, m10, 4
   mova                 m9, m8
   ; note: sources fed in reverse order (m7..m0) to flip the output rows
   vpermi2q             m8, m7, m5 ;  0  1  4  5
   vpermt2q             m7, m12, m5
   vpermi2q             m9, m6, m4 ;  2  3  6  7
   vpermt2q             m6, m12, m4
   vpbroadcastd        m12, [o(pw_2048)]
   mov                 r3d, 0x00ff00ff
   mova                m11, m10
   vpermi2q            m10, m3, m1 ;  8  9 12 13
   vpermt2q             m3, m13, m1
   kmovd                k1, r3d
   vpermi2q            m11, m2, m0 ; 10 11 14 15
   vpermt2q             m2, m13, m0
   pxor                 m0, m0
   vpsubw          m12{k1}, m0, m12      ; m12 = +-2048 per masked word
   pmulhrsw             m0, m7, m12
   pmulhrsw             m1, m6, m12
   pmulhrsw             m4, m3, m12
   pmulhrsw             m5, m2, m12
   jmp m(idct_16x16_internal_8bpc).end3
   2748 
; Instantiate the iidentity 16x16 entry points (only the dct and
; identity pairings exist for this size).
INV_TXFM_16X16_FN identity, dct
INV_TXFM_16X16_FN identity, identity
   2751 
; 16x16 inverse identity transform, 8 bpc, AVX-512.
; Pass 1: vpermb-interleave coefficient pairs, apply the identity-16
; gain as x/2 + round(x*1697/4096) using psraw+pavgw (pavgw supplies the
; +1 rounding of the halving), then dword-unpack to finish the transpose.
; Pass 2: apply the gain again in saturating form (2*x + x*1697/2048),
; permute to store order and reuse the idct_16x16 store path.
cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
   mova                 m8, [o(int16_perm)]
   vpermb               m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
   vpermb               m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
   vpbroadcastd         m0, [o(pw_1697x16)]
   vpermb               m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
   vpermb               m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
   vpermb               m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3
   vpermb               m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3
   vpermb               m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3
   vpermb               m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3
   pmulhrsw             m9, m0, m1
   pmulhrsw            m10, m0, m2
   pmulhrsw            m11, m0, m3
   pmulhrsw            m12, m0, m4
   pmulhrsw            m13, m0, m5
   pmulhrsw            m14, m0, m6
   pmulhrsw            m15, m0, m7
   pmulhrsw             m0, m8
   REPX       {psraw x, 1}, m9, m10, m11, m12
   pavgw                m1, m9
   pavgw                m2, m10
   pavgw                m3, m11
   pavgw                m4, m12
   REPX       {psraw x, 1}, m13, m14, m15, m0
   pavgw                m5, m13
   pavgw                m6, m14
   pavgw                m7, m15
   pavgw                m8, m0
   punpckldq            m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
   punpckhdq            m1, m2     ; a2 b2 c2 d2 a3 b3 c3 d3
   punpckldq            m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
   punpckhdq            m3, m4     ; e2 f2 g2 h2 e3 f3 g3 h3
   punpckldq            m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1
   punpckhdq            m5, m6     ; i2 j2 k2 l2 i3 j3 k3 l3
   punpckldq            m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
   punpckhdq            m7, m8     ; m2 n2 o2 p2 m3 n3 o3 p3
   jmp                tx2q
ALIGN function_align
.pass2:
   vpbroadcastd        m11, [o(pw_1697x16)]
   pmulhrsw            m12, m11, m0
   pmulhrsw            m13, m11, m1
   pmulhrsw            m14, m11, m2
   pmulhrsw            m15, m11, m3
   pmulhrsw             m8, m11, m4
   pmulhrsw             m9, m11, m5
   pmulhrsw            m10, m11, m6
   pmulhrsw            m11, m7
   REPX      {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
   paddsw               m0, m12
   paddsw               m1, m13
   paddsw               m2, m14
   paddsw               m3, m15
   paddsw               m8, m4
   movu                 m4, [o(permD+2)]
   paddsw               m9, m5
   paddsw               m6, m10
   paddsw               m7, m11
   psrlq               m12, m4, 4
   mova                 m5, m4
   mova                m10, m4
   mova                m11, m4
   vpermi2q             m4, m0, m2  ;  8  9 12 13
   vpermt2q             m0, m12, m2 ;  0  1  4  5
   vpermi2q             m5, m1, m3  ; 10 11 14 15
   vpermt2q             m1, m12, m3 ;  2  3  6  7
   vpermi2q            m10, m8, m6
   vpermt2q             m8, m12, m6
   vpermi2q            m11, m9, m7
   vpermt2q             m9, m12, m7
   jmp m(idct_16x16_internal_8bpc).end
   2824 
; Duplicate each word of src into dword pairs (low half -> dst1, high
; half -> dst2) and multiply by two broadcast packed-coefficient pairs.
; Used to fold the first butterfly stage into the coefficient load when
; many inputs are zero.
%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4]
   vpbroadcastd        m%4, [o(pw_%5_%6x8)]
   punpcklwd           m%1, m%3, m%3
   pmulhrsw            m%1, m%4
   vpbroadcastd        m%4, [o(pw_%7_%8x8)]
   punpckhwd           m%2, m%3, m%3
   pmulhrsw            m%2, m%4
%endmacro
   2833 
   2834 cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
;------------------------------------------------------------------------------
; 8x32 inverse DCT+DCT transform with add to destination, 8 bits/pixel.
; In:  dstq = dst, strideq = stride, cq = coefficient buffer (cleared before
;      return), eobd = end-of-block index.
; Dispatch on eob: 0 -> DC-only path; < 107 -> .fast (right half of the
; coefficient rows is known zero); otherwise the full path.
;------------------------------------------------------------------------------
   2835 %undef cmp
   2836    lea                  r5, [o_base]  ; r5 = base pointer for o() constants
   2837    test               eobd, eobd
   2838    jz .dconly                         ; only the DC coefficient is nonzero
   2839    cmp                eobd, 107
   2840    jb .fast
; full path: load all eight 64-byte coefficient rows into m0-m7
   2841    mova                 m5, [cq+64*5]
   2842    mova                 m3, [cq+64*3]
   2843    mova                 m1, [cq+64*1]
   2844    mova                 m7, [cq+64*7]
   2845    mova                 m2, [cq+64*2]
   2846    mova                 m6, [cq+64*6]
   2847    mova                 m0, [cq+64*0]
   2848    mova                 m4, [cq+64*4]
   2849    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
; transpose to column order, scaling by pw_8192 (pmulhrsw -> x/4)
   2850    mova                 m8, [o(idct_8x32p)]
   2851    vpbroadcastd         m9, [o(pw_8192)]
   2852    REPX  {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7
   2853    punpckldq            m8, m0, m1 ; ab
   2854    punpckhdq            m0, m1
   2855    punpckldq            m1, m2, m3 ; cd
   2856    punpckhdq            m2, m3
   2857    punpckldq            m3, m4, m5 ; ef
   2858    punpckhdq            m4, m5
   2859    punpckldq            m5, m6, m7 ; gh
   2860    punpckhdq            m6, m7
   2861    REPX   {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6
   2862    punpcklqdq          m18, m8, m1 ; 30  2    6 26   31  1   23  9
   2863    punpckhqdq          m14, m8, m1 ; 16  0   12 20    3 29   11 21
   2864    punpcklqdq          m21, m0, m2 ; 14 18   22 10   27  5   19 13
   2865    punpckhqdq          m15, m0, m2 ; 18  4   24  8    7 25   15 17
   2866    punpcklqdq          m20, m3, m5
   2867    punpckhqdq          m16, m3, m5
   2868    punpcklqdq          m19, m4, m6
   2869    punpckhqdq          m17, m4, m6
; split the even-index rows into ymm halves for the 16-point even part
   2870    vinserti32x4        ym8, ym18, xm20, 1
   2871    vshufi32x4          ym1, ym18, ym20, 0x03
   2872    vinserti32x4        ym9, ym14, xm16, 1
   2873    vshufi32x4          ym3, ym14, ym16, 0x03
   2874    vinserti32x4        ym0, ym21, xm19, 1
   2875    vshufi32x4          ym5, ym21, ym19, 0x03
   2876    vinserti32x4        ym7, ym15, xm17, 1
   2877    vshufi32x4          ym6, ym15, ym17, 0x03
   2878    call m(idct_8x16_internal_8bpc).main2
; gather the odd-index rows into m14-m21 for the 32-point odd half
   2879    psrlq               m12, [o(permB)], 60
   2880    vpermt2q            m14, m12, m16
   2881    vpermt2q            m21, m12, m19
   2882    vpermt2q            m15, m12, m17
   2883    vpermi2q            m12, m18, m20
   2884    vextracti32x8      ym16, m14, 1
   2885    vextracti32x8      ym19, m21, 1
   2886    vextracti32x8      ym17, m15, 1
   2887    vextracti32x8      ym20, m12, 1
   2888    call .main2 ; 32-point odd-half butterflies
   2889    jmp .end
   2890 .fast: ; right half is zero
; only the left 8x16 quadrant of coefficients is nonzero
   2891    mova                 m0, [o(int16_perm)]
   2892    mova                ym2, [cq+64*4]
   2893    vinserti32x8         m2, [cq+64*0], 1
   2894    mova                ym3, [cq+64*6]
   2895    vinserti32x8         m3, [cq+64*2], 1
   2896    mova                ym4, [cq+64*3]
   2897    vinserti32x8         m4, [cq+64*5], 1
   2898    mova                ym5, [cq+64*7]
   2899    vinserti32x8         m5, [cq+64*1], 1
   2900    REPX  {vpermb x, m0, x}, m2, m3, m4, m5
   2901    call m(idct_16x8_internal_8bpc).main2
   2902    vbroadcasti32x4      m4, [o(int_shuf3)]
   2903    vbroadcasti32x4      m5, [o(int_shuf4)]
   2904    pshufb               m2, m4     ; e0 f0 e2 f2 e1 f1 e3 f3
   2905    pshufb               m3, m5     ; g0 h0 g2 h2 g1 h1 g3 h3
   2906    pshufb               m0, m4     ; a0 b0 a2 b2 a1 b1 a3 b3
   2907    pshufb               m1, m5     ; c0 d0 c2 d2 c1 d1 c3 d3
   2908    vpbroadcastd         m4, [o(pw_8192)]
   2909    psrlq                m5, [o(permB)], 60
   2910    punpckldq            m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2
   2911    punpckhdq           m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3
   2912    punpckldq            m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2
   2913    punpckhdq           m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3
   2914    REPX   {pmulhrsw x, m4}, m6, m17, m2, m16
   2915    vinserti32x4        ym0, ym2, xm6, 1      ;  0  2
   2916    vshufi32x4          ym1, ym2, ym6, 0x03   ;  4  6
   2917    vinserti32x4       ym14, ym16, xm17, 1    ;  1  3
   2918    vshufi32x4         ym15, ym16, ym17, 0x03 ;  5  7
   2919    vpermt2q             m2, m5, m6           ;  8 10
   2920    vpermt2q            m16, m5, m17          ;  9 11
   2921    vextracti32x8       ym3, m2, 1            ; 12 14
   2922    vextracti32x8      ym17, m16, 1           ; 13 15
   2923    call m(idct_8x16_internal_8bpc).main_fast
   2924    call .main_fast
   2925 .end:
; add the 32 output rows (8 pixels each) to dst using qword gather/scatter;
; ym8 = byte offsets = stride * [gather8d]
   2926    vpbroadcastd        ym8, strided
   2927    pmulld              ym8, [o(gather8d)]
   2928    call .main_end ; -> out0-out31 packed 4 rows/reg in m0-m7, m10 = pw_2048
   2929    lea                  r3, [dstq+strideq*4]
   2930    kxnorb               k1, k1, k1 ; all-ones gather/scatter mask
   2931    lea                  r4, [dstq+strideq*8]
   2932    pxor                 m9, m9
   2933    lea                  r1, [r3+strideq*8]
; gathers/scatters clobber their mask, so k1/k2 ping-pong fresh copies
   2934    kmovb                k2, k1
   2935    vpgatherdq      m12{k1}, [r0+ym8]  ; 8 dst rows, 8 pixels each
   2936    kmovb                k1, k2
   2937    vpgatherdq      m13{k2}, [r3+ym8]
   2938    kmovb                k2, k1
   2939    vpgatherdq      m14{k1}, [r4+ym8]
   2940    kmovb                k1, k2
   2941    vpgatherdq      m15{k2}, [r1+ym8]
   2942    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
   2943    REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7 ; clear coefficients
   2944    punpcklbw           m11, m12, m9 ; widen gathered pixels to words
   2945    punpckhbw           m12, m9
   2946    paddw                m0, m11
   2947    paddw                m1, m12
   2948    packuswb             m0, m1
   2949    kmovb                k2, k1
   2950    vpscatterdq [r0+ym8]{k1}, m0
   2951    punpcklbw           m12, m13, m9
   2952    punpckhbw           m13, m9
   2953    paddw                m2, m12
   2954    paddw                m3, m13
   2955    packuswb             m2, m3
   2956    kmovb                k1, k2
   2957    vpscatterdq [r3+ym8]{k2}, m2
   2958    punpcklbw           m13, m14, m9
   2959    punpckhbw           m14, m9
   2960    paddw                m4, m13
   2961    paddw                m5, m14
   2962    packuswb             m4, m5
   2963    kmovb                k2, k1
   2964    vpscatterdq [r4+ym8]{k1}, m4
   2965    punpcklbw           m14, m15, m9
   2966    punpckhbw           m15, m9
   2967    paddw                m6, m14
   2968    paddw                m7, m15
   2969    packuswb             m6, m7
   2970    vpscatterdq [r1+ym8]{k2}, m6
   2971    RET
   2972 .dconly:
; DC-only: scale the single coefficient and defer to the shared 8x8 tail
   2973    movsx               r6d, word [cq]
   2974    mov                [cq], eobd     ; eobd == 0 here; clears the DC coef
   2975    or                  r3d, 32       ; r3d (eob reg) == 0 -> r3d = 32 rows
   2976    imul                r6d, 181      ; *= sqrt(2) in .8 fixed point
   2977    add                 r6d, 128+512  ; rounding for the two shifts below
   2978    sar                 r6d, 8+2
   2979    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
   2980 INIT_YMM avx512icl ; odd-half helpers below run at 256-bit width
   2981 ALIGN function_align
   2982 cglobal_label .main_fast2 ; bottom three-quarters are zero
   2983    ITX_UNPACK_MULHRSW   12, 14, 14, 8,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
   2984    ITX_UNPACK_MULHRSW   21, 20, 15, 8,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
; with so few nonzero inputs, the remaining stage inputs are plain copies
   2985    mova                m11, m12
   2986    mova                m17, m20
   2987    mova                m15, m21
   2988    mova                m16, m14
   2989    jmp .main4
   2990 ALIGN function_align
   2991 cglobal_label .main_fast ; bottom half is zero
   2992    ITX_UNPACK_MULHRSW   12, 14, 14, 8,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
   2993    ITX_UNPACK_MULHRSW   21, 15, 15, 8,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
   2994    ITX_UNPACK_MULHRSW   20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
   2995    ITX_UNPACK_MULHRSW   19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
   2996    jmp .main3
   2997 ALIGN function_align
   2998 cglobal_label .main
; full odd half of the 32-point IDCT: interleave input pairs, then rotate
   2999    punpcklwd           m12, m21, m14 ; in31 in1
   3000    punpckhwd           m14, m21      ; in3  in29
   3001    punpcklwd           m21, m20, m15 ; in27 in5
   3002    punpckhwd           m15, m20      ; in7  in25
   3003    punpcklwd           m20, m19, m16 ; in23 in9
   3004    punpckhwd           m16, m19      ; in11 in21
   3005    punpcklwd           m19, m18, m17 ; in19 in13
   3006    punpckhwd           m17, m18      ; in15 in17
   3007 .main2: ; entry with the pairs already interleaved
   3008    ITX_MUL2X_PACK       12, 8, 9, 10,  201, 4091, 5 ; t16a, t31a
   3009    ITX_MUL2X_PACK       14, 8, 9, 10, 4052,  601, 5 ; t23a, t24a
   3010    ITX_MUL2X_PACK       21, 8, 9, 10,  995, 3973, 5 ; t20a, t27a
   3011    ITX_MUL2X_PACK       15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
   3012    ITX_MUL2X_PACK       20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
   3013    ITX_MUL2X_PACK       16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
   3014    ITX_MUL2X_PACK       19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
   3015    ITX_MUL2X_PACK       17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
   3016 .main3: ; stage-2 butterflies
   3017    psubsw              m11, m12, m17 ; t17 t30
   3018    paddsw              m12, m17      ; t16 t31
   3019    psubsw              m17, m15, m20 ; t18 t29
   3020    paddsw              m20, m15      ; t19 t28
   3021    psubsw              m15, m21, m16 ; t21 t26
   3022    paddsw              m21, m16      ; t20 t27
   3023    psubsw              m16, m14, m19 ; t22 t25
   3024    paddsw              m14, m19      ; t23 t24
   3025 .main4: ; stage-3 rotations and onwards
   3026    ITX_MUL2X_PACK       11, 18, 19, 10,   799, 4017, 5 ; t17a t30a
   3027    ITX_MUL2X_PACK       17, 18, 19, 10, m4017,  799, 5 ; t18a t29a
   3028    ITX_MUL2X_PACK       15, 18, 19, 10,  3406, 2276, 5 ; t21a t26a
   3029    ITX_MUL2X_PACK       16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a
   3030    vpbroadcastd         m8, [o(pw_m3784_1567)]
   3031    psubsw              m19, m12, m20 ; t19a t28a
   3032    paddsw              m20, m12      ; t16a t31a
   3033    psubsw              m12, m14, m21 ; t20a t27a
   3034    paddsw              m14, m21      ; t23a t24a
   3035    psubsw              m21, m11, m17 ; t18  t29
   3036    paddsw              m11, m17      ; t17  t30
   3037    psubsw              m17, m16, m15 ; t21  t26
   3038    paddsw              m16, m15      ; t22  t25
   3039    ITX_MUL2X_PACK       21, 18, 15, 10, 1567_3784, 8,   20 ; t18a t29a
   3040    ITX_MUL2X_PACK       19, 18, 15, 10, 1567_3784, 8,   20 ; t19  t28
   3041    ITX_MUL2X_PACK       12, 18, 15, 10, 8, m1567_m3784, 36 ; t20  t27
   3042    ITX_MUL2X_PACK       17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a
   3043    vbroadcasti32x4     m18, [o(deint_shuf)]
   3044    vpbroadcastd         m8, [o(pw_m2896_2896)]
   3045    vpbroadcastd         m9, [o(pw_2896_2896)]
   3046    psubsw              m15, m20, m14 ; t23  t24
   3047    paddsw              m20, m14      ; t16  t31
   3048    psubsw              m14, m11, m16 ; t22a t25a
   3049    paddsw              m11, m16      ; t17a t30a
   3050    psubsw              m16, m21, m17 ; t21  t26
   3051    paddsw              m21, m17      ; t18  t29
   3052    psubsw              m17, m19, m12 ; t20a t27a
   3053    paddsw              m19, m12      ; t19a t28a
   3054    REPX    {pshufb x, m18}, m20, m11, m21, m19
; final sqrt(1/2) rotations of the middle terms (pw_2896 = sqrt(2)/2 in .15)
   3055    ITX_MUL2X_PACK       15, 18, 12, 10, 8, 9, 8 ; t23a t22a
   3056    ITX_MUL2X_PACK       14, 13, 15, 10, 8, 9, 8 ; t22  t25
   3057    packssdw            m18, m13      ; t23a t22
   3058    packssdw            m12, m15      ; t24a t25
   3059    ITX_MUL2X_PACK       16, 13, 15, 10, 8, 9, 8 ; t21a t26a
   3060    ITX_MUL2X_PACK       17, 16, 14, 10, 8, 9, 8 ; t20  t27
   3061    packssdw            m16, m13      ; t20  t21a
   3062    packssdw            m14, m15      ; t27  t26a
   3063    punpcklqdq          m13, m19, m21 ; t19a t18
   3064    punpckhqdq          m19, m21      ; t28a t29
   3065    punpcklqdq          m21, m20, m11 ; t16  t17a
   3066    punpckhqdq          m20, m11      ; t31  t30a
   3067 INIT_ZMM avx512icl ; back to 512-bit width for the recombination
   3068    mova                m15, [o(permA)]
   3069    ret
   3070 cglobal_label .main_end
; in: m0-m7 = even-half rows (t0-t15), m12-m21 = odd-half rows (t16-t31),
; m15 = permA; out: out0-out31 packed four rows per register in m0-m7,
; m10 = pw_2048 final scale for the caller
   3071    vpbroadcastd        m10, [o(pw_2048)]
   3072    vpermt2q             m0, m15, m1  ; t0   t1   t2   t3
   3073    vpermt2q            m20, m15, m19 ; t31  t30a t29  t28a
   3074    vpermt2q             m2, m15, m3  ; t4   t5   t6   t7
   3075    vpermt2q            m14, m15, m12 ; t27  t26a t25  t24a
   3076    vpermt2q             m4, m15, m5  ; t8   t9   t10  t11
   3077    vpermt2q            m18, m15, m16 ; t23a t22  t21a t20
   3078    vpermt2q             m6, m15, m7  ; t12  t13  t14  t15
   3079    vpermt2q            m13, m15, m21 ; t19a t18  t17a t16
   3080    psubsw               m7, m0, m20  ; out31 out30 out29 out28
   3081    paddsw               m0, m20      ; out0  out1  out2  out3
   3082    psubsw               m5, m2, m14  ; out27 out26 out25 out24
   3083    paddsw               m2, m14      ; out4  out5  out6  out7
   3084    psubsw               m3, m4, m18  ; out23 out22 out21 out20
   3085    paddsw               m4, m18      ; out8  out9  out10 out11
   3086    psubsw               m1, m6, m13  ; out19 out18 out17 out16
   3087    paddsw               m6, m13      ; out12 out13 out14 out15
   3088    vzeroupper
   3089    ret
   3090 
   3091 %macro LOAD_PACKED_16X2 3 ; dst, row[1-2]
; Load two 16-byte coefficient rows from cq and interleave them at qword
; granularity into one ymm: {rowA.q0, rowB.q0, rowA.q1, rowB.q1}.
; Clobbers ym8 as scratch.
   3092    vbroadcasti32x4    ym%1, [cq+16*%2]
   3093    vbroadcasti32x4     ym8, [cq+16*%3]
   3094    shufpd             ym%1, ym8, 0x0c ; q0,q2 <- row %2; q1,q3 <- row %3
   3095 %endmacro
   3096 
   3097 cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
;------------------------------------------------------------------------------
; 32x8 inverse DCT+DCT transform with add to destination, 8 bits/pixel.
; In:  dstq = dst, strideq = stride, cq = coefficient buffer (cleared before
;      return), eobd = end-of-block index.
; eob == 0 -> DC-only; eob < 107 -> .fast (bottom half of coefficients zero).
; .dconly2/.dconly3 are shared DC-only tails (also reachable from other sizes).
; .main is the 8-point inverse DCT core used by other transform sizes too.
;------------------------------------------------------------------------------
   3098 %undef cmp
   3099    test               eobd, eobd
   3100    jz .dconly                        ; only the DC coefficient is nonzero
   3101    lea                  r5, [o_base] ; r5 = base pointer for o() constants
   3102    LOAD_PACKED_16X2      0,  0,  2 ; in0  in2
   3103    LOAD_PACKED_16X2      1,  4,  6 ; in4  in6
   3104    LOAD_PACKED_16X2      2,  8, 10 ; in8  in10
   3105    LOAD_PACKED_16X2      3, 12, 14 ; in12 in14
   3106    LOAD_PACKED_16X2     14,  1,  3 ; in1  in3
   3107    LOAD_PACKED_16X2     15,  5,  7 ; in5  in7
   3108    LOAD_PACKED_16X2     16,  9, 11 ; in9  in11
   3109    LOAD_PACKED_16X2     17, 13, 15 ; in13 in15
   3110    pxor                 m4, m4
   3111    REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 ; clear consumed coefficients
   3112    cmp                eobd, 107
   3113    jb .fast
   3114    LOAD_PACKED_16X2      4, 16, 18 ; in16 in18
   3115    LOAD_PACKED_16X2      5, 20, 22 ; in20 in22
   3116    LOAD_PACKED_16X2      6, 24, 26 ; in24 in26
   3117    LOAD_PACKED_16X2      7, 28, 30 ; in28 in30
   3118    call m(idct_8x16_internal_8bpc).main
   3119    LOAD_PACKED_16X2     18, 19, 17 ; in19 in17
   3120    LOAD_PACKED_16X2     19, 23, 21 ; in23 in21
   3121    LOAD_PACKED_16X2     20, 27, 25 ; in27 in25
   3122    LOAD_PACKED_16X2     21, 31, 29 ; in31 in29
   3123    pxor                 m8, m8
   3124    REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 ; clear remaining coefficients
   3125    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
   3126    jmp .pass2
   3127 .fast: ; bottom half is zero
   3128    mova                ym5, ym4 ; m4 is zero here: in8..in15 = 0
   3129    mova                ym6, ym4
   3130    mova                ym7, ym4
   3131    call m(idct_8x16_internal_8bpc).main
   3132    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
   3133 .pass2:
; recombine even/odd halves (scaled by pw_8192 -> x/4), transpose, then run
; the 8-point row transform and add the result to dst
   3134    vpbroadcastd        m10, [o(pw_8192)]
   3135    vpermt2q             m0, m15, m4       ; t0   t1   t9   t8
   3136    vpermt2q            m20, m15, m18      ; t31  t30a t23a t22
   3137    vpermt2q             m3, m15, m7       ; t7   t6   t14  t15
   3138    vpermt2q            m12, m15, m21      ; t25  t24a t17a t16
   3139    vpermt2q             m2, m15, m6       ; t4   t5   t13  t12
   3140    vpermt2q            m14, m15, m13      ; t23a t22  t21a t20
   3141    vpermt2q             m1, m15, m5       ; t3   t2   t10  t11
   3142    vpermt2q            m19, m15, m16      ; t27  t26a t19a t18
   3143    psubsw               m8, m0, m20       ; out31 out30 out22 out23
   3144    paddsw               m0, m20           ; out0  out1  out9  out8
   3145    paddsw               m6, m3, m12       ; out7  out6  out14 out15
   3146    psubsw               m3, m12           ; out24 out25 out17 out16
   3147    psubsw               m5, m2, m14       ; out27 out26 out18 out19
   3148    paddsw               m4, m2, m14       ; out4  out5  out13 out12
   3149    psubsw               m7, m1, m19       ; out28 out29 out21 out20
   3150    paddsw               m2, m1, m19       ; out3  out2  out10 out11
   3151    vzeroupper
   3152    vshufi32x4           m1, m0, m3, q1221 ; out1  out9  out17 out25
   3153    vshufi32x4           m0, m3, q0330     ; out0  out8  out16 out24
   3154    vshufi32x4           m3, m2, m5, q0330 ; out3  out11 out19 out27
   3155    vshufi32x4           m2, m5, q1221     ; out2  out10 out18 out26
   3156    vshufi32x4           m5, m4, m7, q1221 ; out5  out13 out21 out29
   3157    vshufi32x4           m4, m7, q0330     ; out4  out12 out20 out28
   3158    vshufi32x4           m7, m6, m8, q0330 ; out7  out15 out23 out31
   3159    vshufi32x4           m6, m8, q1221     ; out6  out14 out22 out30
   3160    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
   3161    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
   3162    call .main
   3163    vpbroadcastd         m8, [o(pw_2048)]
   3164    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
; add the eight 32-pixel rows to dst (4 rows per half, via r3)
   3165    lea                  r2, [strideq*3]
   3166    lea                  r3, [dstq+strideq*4]
   3167    movshdup            m12, [o(permD)]
   3168    pmovzxbw             m8, [dstq+strideq*0]
   3169    pmovzxbw             m9, [dstq+strideq*1]
   3170    pmovzxbw            m10, [dstq+strideq*2]
   3171    pmovzxbw            m11, [dstq+r2       ]
   3172    paddw                m0, m8
   3173    paddw                m1, m9
   3174    paddw                m2, m10
   3175    paddw                m3, m11
   3176    pmovzxbw             m8, [r3+strideq*0]
   3177    pmovzxbw             m9, [r3+strideq*1]
   3178    pmovzxbw            m10, [r3+strideq*2]
   3179    pmovzxbw            m11, [r3+r2       ]
   3180    paddw                m4, m8
   3181    paddw                m5, m9
   3182    paddw                m6, m10
   3183    paddw                m7, m11
   3184    packuswb             m0, m1
   3185    packuswb             m2, m3
   3186    vpermq               m0, m12, m0
   3187    vpermq               m2, m12, m2
   3188    mova          [dstq+strideq*0], ym0
   3189    vextracti32x8 [dstq+strideq*1], m0, 1
   3190    mova          [dstq+strideq*2], ym2
   3191    vextracti32x8 [dstq+r2       ], m2, 1
   3192    packuswb             m4, m5
   3193    packuswb             m6, m7
   3194    vpermq               m4, m12, m4
   3195    vpermq               m6, m12, m6
   3196    mova          [r3+strideq*0], ym4
   3197    vextracti32x8 [r3+strideq*1], m4, 1
   3198    mova          [r3+strideq*2], ym6
   3199    vextracti32x8 [r3+r2       ], m6, 1
   3200    RET
   3201 .dconly:
   3202    movsx               r6d, word [cq] ; r6d = DC coefficient
   3203    mov                [cq], eobd      ; eobd == 0 here; clears the DC coef
   3204    or                  r3d, 8         ; r3d (eob reg) == 0 -> r3d = 8 rows
   3205 .dconly2: ; shared tail: r6d = dc, r3d = row count
   3206    imul                r6d, 181       ; *= sqrt(2) in .8 fixed point
   3207    add                 r6d, 128+512
   3208    sar                 r6d, 8+2
   3209 .dconly3: ; r6d already scaled once; apply the final scale + rounding
   3210    imul                r6d, 181
   3211    add                 r6d, 128+2048
   3212    sar                 r6d, 8+4
   3213    pxor                 m2, m2
   3214    vpbroadcastw         m3, r6d       ; m3 = DC value splat to all words
   3215 .dconly_loop:
; add the DC value to two 32-pixel rows per iteration
   3216    mova                ym1, [dstq+strideq*0]
   3217    vinserti32x8         m1, [dstq+strideq*1], 1
   3218    punpcklbw            m0, m1, m2
   3219    punpckhbw            m1, m2
   3220    paddw                m0, m3
   3221    paddw                m1, m3
   3222    packuswb             m0, m1
   3223    mova          [dstq+strideq*0], ym0
   3224    vextracti32x8 [dstq+strideq*1], m0, 1
   3225    lea                dstq, [dstq+strideq*2]
   3226    sub                 r3d, 2
   3227    jg .dconly_loop
   3228    RET
   3229 ALIGN function_align
   3230 cglobal_label .main
; 8-point inverse DCT on the word lanes of m0-m7 -> out0-out7 in m0-m7
   3231    vpbroadcastd       m10, [o(pd_2048)]
   3232 .main2: ; entry with m10 already holding the pd_2048 rounding constant
   3233    ITX_MULSUB_2W        5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
   3234    ITX_MULSUB_2W        1, 7, 8, 9, 10,  799, 4017 ; t4a, t7a
   3235    ITX_MULSUB_2W        2, 6, 8, 9, 10, 1567, 3784 ; t2, t3
   3236    vpbroadcastd       m11, [o(pw_2896_2896)]
   3237    vpbroadcastd       m12, [o(pw_m2896_2896)]
   3238    ITX_MULSUB_2W        0, 4, 8, 9, 10, 11, 12 ; t1, t0
   3239 .main3: ; butterflies; expects m10-m12 constants already loaded
   3240    paddsw              m8, m1, m5 ; t4
   3241    psubsw              m1, m5     ; t5a
   3242    paddsw              m9, m7, m3 ; t7
   3243    psubsw              m7, m3     ; t6a
   3244    ITX_MULSUB_2W        7, 1, 3, 5, 10, 11, 12 ; t5, t6
   3245    psubsw              m5, m0, m2 ; dct4 out2
   3246    paddsw              m2, m0     ; dct4 out1
   3247    paddsw              m0, m4, m6 ; dct4 out0
   3248    psubsw              m4, m6     ; dct4 out3
   3249    psubsw              m6, m2, m1 ; out6
   3250    paddsw              m1, m2     ; out1
   3251    paddsw              m2, m5, m7 ; out2
   3252    psubsw              m5, m7     ; out5
   3253    psubsw              m7, m0, m9 ; out7
   3254    paddsw              m0, m9     ; out0
   3255    paddsw              m3, m4, m8 ; out3
   3256    psubsw              m4, m8     ; out4
   3257    ret
   3258 
   3259 cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c
;------------------------------------------------------------------------------
; 8x32 identity+identity transform with add to destination, 8 bits/pixel.
; out = (c + 5) >> 3 (bias then arithmetic shift implements the combined
; identity scaling/rounding), applied after an 8x8 zmm transpose. The dst rows
; are accessed four apart via qword gather/scatter: ym14*4 = stride*4*k for
; k = 0..7 from each of four base pointers (r0, r3, r4, r1 = dst+stride*0..3).
; Clears the coefficient buffer before returning.
;------------------------------------------------------------------------------
   3260    vpbroadcastd         m7, [pw_5]
   3261    paddsw               m0, m7, [cq+64*0]
   3262    paddsw               m1, m7, [cq+64*1]
   3263    vpbroadcastd        ym9, strided
   3264    paddsw               m2, m7, [cq+64*2]
   3265    paddsw               m3, m7, [cq+64*3]
   3266    paddsw               m4, m7, [cq+64*4]
   3267    paddsw               m5, m7, [cq+64*5]
   3268    paddsw               m6, m7, [cq+64*6]
   3269    paddsw               m7,     [cq+64*7]
   3270    pmulld             ym14, ym9, [pd_0to15] ; ym14 = stride * k, k = 0..7
   3271    lea                  r3, [dstq+strideq*1]
   3272    lea                  r4, [dstq+strideq*2]
   3273    kxnorb               k1, k1, k1 ; all-ones gather/scatter mask
   3274    pxor                m13, m13
   3275    add                  r1, r4 ; dstq+strideq*3
; gathers/scatters clobber their mask, so k1/k2 ping-pong fresh copies
   3276    kmovb                k2, k1
   3277    vpgatherdq       m9{k1}, [r0+ym14*4]
   3278    kmovb                k1, k2
   3279    vpgatherdq      m10{k2}, [r3+ym14*4]
   3280    kmovb                k2, k1
   3281    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
   3282    REPX       {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 ; (c+5)>>3
   3283    vpgatherdq      m11{k1}, [r4+ym14*4]
   3284    kmovb                k1, k2
   3285    vpgatherdq      m12{k2}, [r1+ym14*4]
   3286    REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 ; clear coefficients
   3287    punpcklbw            m8, m9, m13  ;  0  8 16 24
   3288    punpckhbw            m9, m13      ;  4 12 20 28
   3289    paddw                m0, m8
   3290    paddw                m4, m9
   3291    packuswb             m0, m4
   3292    kmovb                k2, k1
   3293    vpscatterdq [r0+ym14*4]{k1}, m0
   3294    punpcklbw            m8, m10, m13 ;  1  9 17 25
   3295    punpckhbw           m10, m13      ;  5 13 21 29
   3296    paddw                m1, m8
   3297    paddw                m5, m10
   3298    packuswb             m1, m5
   3299    kmovb                k1, k2
   3300    vpscatterdq [r3+ym14*4]{k2}, m1
   3301    punpcklbw            m8, m11, m13 ;  2 10 18 26
   3302    punpckhbw           m11, m13      ;  6 14 22 30
   3303    paddw                m2, m8
   3304    paddw                m6, m11
   3305    packuswb             m2, m6
   3306    kmovb                k2, k1
   3307    vpscatterdq [r4+ym14*4]{k1}, m2
   3308    punpcklbw            m8, m12, m13 ;  3 11 19 27
   3309    punpckhbw           m12, m13      ;  7 15 23 31
   3310    paddw                m3, m8
   3311    paddw                m7, m12
   3312    packuswb             m3, m7
   3313    vpscatterdq [r1+ym14*4]{k2}, m3
   3314    RET
   3315 
   3316 cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c
;------------------------------------------------------------------------------
; 32x8 identity+identity transform with add to destination, 8 bits/pixel.
; Scales coefficients by pw_4096 via pmulhrsw (i.e. round(x/8)), transposes
; via dword/qword interleaves plus a byte permute (int8_permA), adds the
; result to eight 32-pixel destination rows, and clears the coefficients.
;------------------------------------------------------------------------------
   3317    vpbroadcastd         m0, [pw_4096]
   3318    pmulhrsw             m3, m0, [cq+64*0]
   3319    pmulhrsw             m4, m0, [cq+64*4]
   3320    pmulhrsw             m6, m0, [cq+64*1]
   3321    pmulhrsw             m5, m0, [cq+64*5]
   3322    pmulhrsw             m7, m0, [cq+64*2]
   3323    pmulhrsw             m2, m0, [cq+64*6]
   3324    pmulhrsw             m8, m0, [cq+64*3]
   3325    pmulhrsw             m0,     [cq+64*7]
   3326    mova                m13, [int8_permA]
   3327    lea                  r3, [strideq*3]
   3328    lea                  r4, [dstq+strideq*4]
   3329    punpckldq            m1, m3, m4
   3330    punpckhdq            m3, m4
   3331    punpckldq            m4, m6, m5
   3332    punpckhdq            m6, m5
   3333    punpckldq            m5, m7, m2
   3334    punpckhdq            m7, m2
   3335    punpckldq            m2, m8, m0
   3336    punpckhdq            m8, m0
; load the eight destination rows (two per zmm) while transposing
   3337    mova                ym9, [dstq+strideq*0]
   3338    vinserti32x8         m9, [dstq+strideq*2], 1
   3339    mova               ym10, [dstq+strideq*1]
   3340    vinserti32x8        m10, [dstq+r3       ], 1
   3341    mova               ym11, [r4+strideq*0]
   3342    vinserti32x8        m11, [r4+strideq*2], 1
   3343    mova               ym12, [r4+strideq*1]
   3344    vinserti32x8        m12, [r4+r3       ], 1
   3345    REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8
   3346    pxor                m13, m13
   3347    REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 ; clear coefficients
   3348    punpcklqdq           m0, m1, m4 ; a0 a2   c0 c2
   3349    punpckhqdq           m1, m4     ; b0 b2   d0 d2
   3350    punpcklqdq           m4, m5, m2 ; a1 a3   c1 c3
   3351    punpckhqdq           m5, m2     ; b1 b3   d1 d3
   3352    punpcklqdq           m2, m3, m6 ; e0 e2   g0 g2
   3353    punpckhqdq           m3, m6     ; f0 f2   h0 h2
   3354    punpcklqdq           m6, m7, m8 ; e1 e3   g1 g3
   3355    punpckhqdq           m7, m8     ; f1 f3   h1 h3
; widen dst pixels to words, add, and store the packed results back
   3356    punpcklbw            m8, m9, m13
   3357    punpckhbw            m9, m13
   3358    paddw                m0, m8
   3359    paddw                m4, m9
   3360    packuswb             m0, m4
   3361    mova          [dstq+strideq*0], ym0
   3362    vextracti32x8 [dstq+strideq*2], m0, 1
   3363    punpcklbw            m8, m10, m13
   3364    punpckhbw           m10, m13
   3365    paddw                m1, m8
   3366    paddw                m5, m10
   3367    packuswb             m1, m5
   3368    mova          [dstq+strideq*1], ym1
   3369    vextracti32x8 [dstq+r3       ], m1, 1
   3370    punpcklbw            m8, m11, m13
   3371    punpckhbw           m11, m13
   3372    paddw                m2, m8
   3373    paddw                m6, m11
   3374    packuswb             m2, m6
   3375    mova          [r4+strideq*0], ym2
   3376    vextracti32x8 [r4+strideq*2], m2, 1
   3377    punpcklbw            m8, m12, m13
   3378    punpckhbw           m12, m13
   3379    paddw                m3, m8
   3380    paddw                m7, m12
   3381    packuswb             m3, m7
   3382    mova          [r4+strideq*1], ym3
   3383    vextracti32x8 [r4+r3       ], m3, 1
   3384    RET
   3385 
   3386 %macro IDCT_16x32_END 3 ; src[1-2], row
; Scale two output registers by m10 (pmulhrsw), add them to four 16-pixel
; destination rows, store, and clear coefficient rows 2*%3 and 2*%3+1 by
; writing m13 (expected zero — set up by the callers; confirm at call sites).
; m11 is a byte-permute control applied to the loaded dst rows and m12 a
; dword-permute control applied before storing (both caller-provided).
; Advances dstq by four rows unless src1 == 20, which the callers use to
; mark the final invocation.
   3387    mova                xm8, [dstq+strideq*0]
   3388    vinserti32x4        ym8, [dstq+strideq*1], 1
   3389    mova                xm9, [dstq+r3       ]
   3390    vinserti32x4        ym9, [dstq+strideq*2], 1
   3391    pmulhrsw            m%1, m10
   3392    pmulhrsw            m%2, m10
   3393    vpermb               m8, m11, m8
   3394    vpermb               m9, m11, m9
   3395    mova   [cq+64*(%3*2+0)], m13
   3396    mova   [cq+64*(%3*2+1)], m13
   3397    paddw                m8, m%1
   3398    paddw                m9, m%2
   3399    packuswb             m8, m9
   3400    vpermd               m8, m12, m8
   3401    mova          [dstq+strideq*0], xm8
   3402    vextracti32x4 [dstq+strideq*1], ym8, 1
   3403    vextracti32x4 [dstq+strideq*2], m8, 2
   3404    vextracti32x4 [dstq+r3       ], m8, 3
   3405 %if %1 != 20 ; skip the pointer advance on the final invocation
   3406    lea                dstq, [dstq+strideq*4]
   3407 %endif
   3408 %endmacro
   3409 
   3410 cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob
   3411 %undef cmp
   3412    lea                  r5, [o_base]
   3413    test               eobd, eobd
   3414    jz .dconly
   3415    vpbroadcastd        m15, [o(pw_2896x8)]
   3416    cmp                eobd, 151
   3417    jb .fast
   3418    pmulhrsw             m5, m15, [cq+64*10]
   3419    pmulhrsw             m3, m15, [cq+64* 6]
   3420    pmulhrsw             m1, m15, [cq+64* 2]
   3421    pmulhrsw             m7, m15, [cq+64*14]
   3422    pmulhrsw             m2, m15, [cq+64* 4]
   3423    pmulhrsw             m6, m15, [cq+64*12]
   3424    pmulhrsw             m0, m15, [cq+64* 0]
   3425    pmulhrsw             m4, m15, [cq+64* 8]
   3426    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
   3427    pmulhrsw            m14, m15, [cq+64* 1]
   3428    pmulhrsw            m21, m15, [cq+64*15]
   3429    pmulhrsw            m18, m15, [cq+64* 9]
   3430    pmulhrsw            m17, m15, [cq+64* 7]
   3431    pmulhrsw            m16, m15, [cq+64* 5]
   3432    pmulhrsw            m19, m15, [cq+64*11]
   3433    pmulhrsw            m20, m15, [cq+64*13]
   3434    pmulhrsw            m15,      [cq+64* 3]
   3435    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
   3436    mova                 m8, [o(idct_16x32p)]
   3437    vpbroadcastd         m9, [o(pw_16384)]
   3438    REPX {vpermb x, m8, x}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   3439                            m14, m15, m16, m17, m18, m19, m20, m21
   3440    punpckldq            m8, m0, m1
   3441    punpckhdq            m0, m1
   3442    punpckldq            m1, m2, m3
   3443    punpckhdq            m2, m3
   3444    REPX   {pmulhrsw x, m9}, m8, m0, m1, m2
   3445    punpckldq            m3, m4, m5
   3446    punpckhdq            m4, m5
   3447    punpckldq            m5, m6, m7
   3448    punpckhdq            m6, m7
   3449    REPX   {pmulhrsw x, m9}, m3, m4, m5, m6
   3450    punpckldq            m7, m14, m15
   3451    punpckhdq           m14, m15
   3452    punpckldq           m15, m16, m17
   3453    punpckhdq           m16, m17
   3454    REPX   {pmulhrsw x, m9}, m7, m14, m15, m16
   3455    punpckldq           m17, m18, m19
   3456    punpckhdq           m18, m19
   3457    punpckldq           m19, m20, m21
   3458    punpckhdq           m20, m21
   3459    REPX   {pmulhrsw x, m9}, m17, m18, m19, m20
   3460    punpcklqdq          m21, m8, m1
   3461    punpckhqdq           m8, m1
   3462    punpcklqdq           m1, m0, m2
   3463    punpckhqdq           m0, m2
   3464    punpcklqdq           m2, m3, m5
   3465    punpckhqdq           m3, m5
   3466    punpcklqdq           m5, m4, m6
   3467    punpckhqdq           m4, m6
   3468    punpcklqdq           m6, m7, m15
   3469    punpckhqdq           m7, m15
   3470    punpcklqdq          m15, m14, m16
   3471    punpckhqdq          m14, m16
   3472    punpcklqdq          m16, m17, m19
   3473    punpckhqdq          m17, m19
   3474    punpcklqdq          m19, m18, m20
   3475    punpckhqdq          m18, m20
   3476    vinserti32x8        m20, m21, ym2, 1
   3477    vshufi32x4          m21, m2, q3232
   3478    vinserti32x8         m2, m8, ym3, 1
   3479    vshufi32x4           m8, m3, q3232
   3480    vinserti32x8         m3, m1, ym5, 1
   3481    vshufi32x4           m1, m5, q3232
   3482    vinserti32x8         m5, m0, ym4, 1
   3483    vshufi32x4           m0, m4, q3232
   3484    vinserti32x8         m4, m6, ym16, 1
   3485    vshufi32x4           m6, m16, q3232
   3486    vinserti32x8        m16, m7, ym17, 1
   3487    vshufi32x4           m7, m17, q3232
   3488    vinserti32x8        m17, m15, ym19, 1
   3489    vshufi32x4          m15, m19, q3232
   3490    vinserti32x8        m19, m14, ym18, 1
   3491    vshufi32x4          m14, m18, q3232
   3492    vshufi32x4          m18, m21, m6, q3131 ; 27  5
   3493    vshufi32x4          m21, m6, q2020      ; 31  1
   3494    vshufi32x4           m6, m8, m7, q2020  ; 24  8
   3495    vshufi32x4           m8, m7, q3131      ; 30  2
   3496    vshufi32x4           m7, m1, m15, q2020 ; 28  4
   3497    vshufi32x4           m1, m15, q3131     ;  6 26
   3498    vshufi32x4          m15, m0, m14, q2020 ;  7 25
   3499    vshufi32x4           m0, m14, q3131     ; 14 18
   3500    vshufi32x4          m14, m20, m4, q2020 ;  3 29
   3501    vshufi32x4          m20, m4, q3131      ; 23  9
   3502    vshufi32x4           m9, m3, m17, q2020 ; 16  0
   3503    vshufi32x4           m3, m17, q3131     ; 12 20
   3504    vshufi32x4          m17, m5, m19, q2020 ; 15 17
   3505    vshufi32x4           m5, m19, q3131     ; 22 10
   3506    vshufi32x4          m19, m2, m16, q2020 ; 19 13
   3507    vshufi32x4          m16, m2, m16, q3131 ; 11 21
   3508    call m(idct_16x16_internal_8bpc).main3
   3509    call .main_oddhalf
   3510    jmp .pass2
   3511 .fast: ; right half is zero
        ; Fast path (taken when eob is small): only the left coefficient half is
        ; nonzero, so two 16-sample input rows are packed per zmm register
        ; (ym load + vinserti32x8), permuted to word order via int16_perm, and
        ; scaled by m15 (presumably the dequant/rounding factor set up before
        ; this label — TODO confirm against the function prologue).
   3512 	mova                ym8, [cq+64*15]
   3513 	vinserti32x8         m8, [cq+64* 1], 1
   3514 	mova                 m2, [o(int16_perm)]
   3515 	mova                ym9, [cq+64* 8]
   3516 	vinserti32x8         m9, [cq+64* 0], 1
   3517 	mova                ym0, [cq+64* 7]
   3518 	vinserti32x8         m0, [cq+64* 9], 1
   3519 	mova                ym7, [cq+64*14]
   3520 	vinserti32x8         m7, [cq+64* 2], 1
   3521 	mova                ym1, [cq+64* 3]
   3522 	vinserti32x8         m1, [cq+64*13], 1
   3523 	mova                ym3, [cq+64* 6]
   3524 	vinserti32x8         m3, [cq+64*10], 1
   3525 	mova                ym5, [cq+64*11]
   3526 	vinserti32x8         m5, [cq+64* 5], 1
   3527 	mova                ym6, [cq+64*12]
   3528 	vinserti32x8         m6, [cq+64* 4], 1
   3529 	REPX  {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6
   3530 	REPX  {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
        ; First pass: 16-point row IDCT on the packed rows.
   3531 	call m(idct_16x16_internal_8bpc).main2
        ; Transpose the 16x16 result: byte shuffles + dword/qword unpacks +
        ; 128-bit lane shuffles, with a pw_16384 (1/2) rounding multiply folded
        ; in between passes.
   3532 	vbroadcasti32x4      m8, [o(int_shuf3)]
   3533 	vbroadcasti32x4      m9, [o(int_shuf4)]
   3534 	vpbroadcastd        m11, [o(pw_16384)]
   3535 	pshufb               m0, m8
   3536 	pshufb               m1, m9
   3537 	pshufb               m2, m8
   3538 	pshufb               m3, m9
   3539 	REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
   3540 	pshufb               m4, m8
   3541 	pshufb               m5, m9
   3542 	pshufb               m6, m8
   3543 	pshufb               m7, m9
   3544 	REPX  {pmulhrsw x, m11}, m4, m5, m6, m7
   3545 	punpckhdq           m17, m0, m1
   3546 	punpckldq            m0, m1
   3547 	punpckhdq           m16, m2, m3
   3548 	punpckldq            m2, m3
   3549 	punpckhdq           m18, m4, m5
   3550 	punpckldq            m4, m5
   3551 	punpckhdq            m5, m6, m7
   3552 	punpckldq            m6, m7
   3553 	vinserti32x8         m1, m0, ym2, 1
   3554 	vshufi32x4           m3, m0, m2, q3232
   3555 	vinserti32x8         m2, m4, ym6, 1
   3556 	vshufi32x4           m4, m6, q3232
   3557 	vinserti32x8        m15, m17, ym16, 1
   3558 	vshufi32x4          m17, m16, q3232
   3559 	vinserti32x8        m16, m18, ym5, 1
   3560 	vshufi32x4          m18, m5, q3232
        ; After the lane shuffles each register holds the row pairs noted below
        ; (even rows in m0-m3, odd rows in m14-m17), ready for the column pass.
   3561 	vshufi32x4           m0, m1, m2, q2020   ;  0  2
   3562 	vshufi32x4           m1, m2, q3131       ;  4  6
   3563 	vshufi32x4           m2, m3, m4, q2020   ;  8 10
   3564 	vshufi32x4           m3, m4, q3131       ; 12 14
   3565 	vshufi32x4          m14, m15, m16, q2020 ;  1  3
   3566 	vshufi32x4          m15, m16, q3131      ;  5  7
   3567 	vshufi32x4          m16, m17, m18, q2020 ;  9 11
   3568 	vshufi32x4          m17, m18, q3131      ; 13 15
        ; Duplicate/interleave words (partner halves of the 32-point butterflies
        ; are zero here) and run the reduced even-half and odd-half transforms.
   3569 	pxor                 m6, m6
   3570 	punpckhwd            m8, m0, m0
   3571 	punpcklwd            m9, m6, m0
   3572 	punpckhwd            m0, m3, m3
   3573 	punpckhwd            m5, m2, m2
   3574 	punpcklwd            m7, m1, m1
   3575 	punpckhwd            m1, m1
   3576 	punpcklwd            m3, m3
   3577 	punpcklwd            m6, m2
   3578 	call m(idct_16x16_internal_8bpc).main_fast5
   3579 	punpcklwd           m21, m14, m14
   3580 	punpckhwd           m14, m14
   3581 	punpcklwd           m18, m15, m15
   3582 	punpckhwd           m15, m15
   3583 	punpcklwd           m20, m16, m16
   3584 	punpckhwd           m16, m16
   3585 	punpcklwd           m19, m17, m17
   3586 	punpckhwd           m17, m17
   3587 	call .main_oddhalf_fast
   3588 .pass2:
        ; Second pass output: round by pw_2048 and add to dst via the
        ; IDCT_16x32_END macro (defined earlier in the file); m11/m12 hold the
        ; store permutations, m13 is zero for the byte unpacks.
   3589 	vpbroadcastd        m10, [o(pw_2048)]
   3590 	mova                m11, [o(end_16x32p)]
   3591 	lea                  r3, [strideq*3]
   3592 	pxor                m13, m13
   3593 	psrld               m12, m11, 8
   3594 	IDCT_16x32_END        0,  1,  0
   3595 	IDCT_16x32_END        2,  3,  1
   3596 	IDCT_16x32_END        4,  5,  2
   3597 	IDCT_16x32_END        6,  7,  3
   3598 	IDCT_16x32_END       14, 15,  4
   3599 	IDCT_16x32_END       16, 17,  5
   3600 	IDCT_16x32_END       18, 19,  6
   3601 	IDCT_16x32_END       20, 21,  7
   3602 	RET
   3603 ALIGN function_align
        ; DC-only path: the block reduces to its DC coefficient. Load DC,
        ; overwrite it with eob (marks the coeff buffer consumed), set the row
        ; count to 32 and tail-call the shared 16-wide dconly handler.
   3604 .dconly:
   3605 	movsx               r6d, word [cq]
   3606 	mov                [cq], eobd
   3607 	or                  r3d, 32
   3608 	jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly
   3609 ALIGN function_align
   3610 cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
        ; Only four odd-half inputs are nonzero, so each t16..t23 pair is a
        ; single pmulhrsw with a packed cos/sin *x8 constant; the butterfly
        ; partners would be zero, so the paired registers are plain copies.
        ; Falls into .main3 to finish the odd-half butterflies.
   3611 	vpbroadcastd         m8, [o(pw_201_4091x8)]
   3612 	vpbroadcastd        m20, [o(pw_m1380_3857x8)]
   3613 	vpbroadcastd         m9, [o(pw_995_3973x8)]
   3614 	vpbroadcastd        m16, [o(pw_m601_4052x8)]
   3615 	pmulhrsw            m21, m8  ; t16a, t31a
   3616 	pmulhrsw            m20, m15 ; t19a, t28a
   3617 	pmulhrsw            m18, m9  ; t20a, t27a
   3618 	pmulhrsw            m14, m16 ; t23a, t24a
   3619 	mova                 m8, m21
   3620 	mova                m17, m20
   3621 	mova                m15, m18
   3622 	mova                m16, m14
   3623 	jmp .main3
   3624 ALIGN function_align
   3625 cglobal_label .main_oddhalf_fast ; bottom half is zero
        ; Eight nonzero odd-half inputs: each initial rotation degenerates to a
        ; single pmulhrsw with a packed *x8 constant (one operand of each
        ; ITX_MUL2X_PACK would be zero). Joins the full path at .main2.
   3626 	vpbroadcastd         m8, [o(pw_201_4091x8)]
   3627 	vpbroadcastd         m9, [o(pw_m2751_3035x8)]
   3628 	vpbroadcastd        m11, [o(pw_1751_3703x8)]
   3629 	vpbroadcastd        m12, [o(pw_m1380_3857x8)]
   3630 	pmulhrsw            m21, m8  ; t16a, t31a
   3631 	vpbroadcastd         m8, [o(pw_995_3973x8)]
   3632 	pmulhrsw            m17, m9  ; t17a, t30a
   3633 	vpbroadcastd         m9, [o(pw_m2106_3513x8)]
   3634 	pmulhrsw            m20, m11 ; t18a, t29a
   3635 	vpbroadcastd        m11, [o(pw_2440_3290x8)]
   3636 	pmulhrsw            m15, m12 ; t19a, t28a
   3637 	vpbroadcastd        m12, [o(pw_m601_4052x8)]
   3638 	pmulhrsw            m18, m8  ; t20a, t27a
   3639 	pmulhrsw            m16, m9  ; t21a, t26a
   3640 	pmulhrsw            m19, m11 ; t22a, t25a
   3641 	pmulhrsw            m14, m12 ; t23a, t24a
   3642 	jmp .main2
   3643 ALIGN function_align
        ; Odd half of the 32-point inverse DCT on packed word pairs.
        ; Inputs: m14-m21 = odd coefficient rows, m0-m7 = even-half results,
        ; m10 = pd_2048 rounding constant (set by the caller — TODO confirm).
        ; Outputs: out0..out31 spread across m0-m8/m14-m21 as commented below.
        ; ITX_MUL2X_PACK (macro defined earlier in the file) performs the
        ; paired 12-bit fixed-point rotations.
   3644 cglobal_label .main_oddhalf
   3645 	ITX_MUL2X_PACK       21, 8, 9, 10,  201, 4091, 5 ; t16a, t31a
   3646 	ITX_MUL2X_PACK       17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
   3647 	ITX_MUL2X_PACK       20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
   3648 	ITX_MUL2X_PACK       15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
   3649 	ITX_MUL2X_PACK       18, 8, 9, 10,  995, 3973, 5 ; t20a, t27a
   3650 	ITX_MUL2X_PACK       16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
   3651 	ITX_MUL2X_PACK       19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
   3652 	ITX_MUL2X_PACK       14, 8, 9, 10, 4052,  601, 5 ; t23a, t24a
   3653 .main2:
        ; First butterfly stage (saturating adds/subs on word pairs).
   3654 	psubsw               m8, m21, m17 ; t17 t30
   3655 	paddsw              m21, m17      ; t16 t31
   3656 	psubsw              m17, m15, m20 ; t18 t29
   3657 	paddsw              m20, m15      ; t19 t28
   3658 	psubsw              m15, m18, m16 ; t21 t26
   3659 	paddsw              m18, m16      ; t20 t27
   3660 	psubsw              m16, m14, m19 ; t22 t25
   3661 	paddsw              m14, m19      ; t23 t24
   3662 .main3:
        ; Second rotation stage (799/4017 and 3406/2276 angle pairs).
   3663 	ITX_MUL2X_PACK        8, 9, 19, 10,   799, 4017, 5 ; t17a t30a
   3664 	ITX_MUL2X_PACK       17, 9, 19, 10, m4017,  799, 5 ; t18a t29a
   3665 	ITX_MUL2X_PACK       15, 9, 19, 10,  3406, 2276, 5 ; t21a t26a
   3666 	ITX_MUL2X_PACK       16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a
   3667 	vpbroadcastd        m11, [o(pw_m3784_1567)]
   3668 	psubsw              m19, m21, m20 ; t19a t28a
   3669 	paddsw              m21, m20      ; t16a t31a
   3670 	psubsw              m20, m14, m18 ; t20a t27a
   3671 	paddsw              m14, m18      ; t23a t24a
   3672 	psubsw              m18, m8, m17  ; t18  t29
   3673 	paddsw               m8, m17      ; t17  t30
   3674 	psubsw              m17, m16, m15 ; t21  t26
   3675 	paddsw              m15, m16      ; t22  t25
        ; 1567/3784 rotations for the middle terms.
   3676 	ITX_MUL2X_PACK       18, 9, 16, 10, 1567_3784, 11,   20 ; t18a t29a
   3677 	ITX_MUL2X_PACK       19, 9, 16, 10, 1567_3784, 11,   20 ; t19  t28
   3678 	ITX_MUL2X_PACK       20, 9, 16, 10, 11, m1567_m3784, 36 ; t20  t27
   3679 	ITX_MUL2X_PACK       17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a
   3680 	vbroadcasti32x4      m9, [o(deint_shuf)]
   3681 	psubsw              m16, m21, m14 ; t23  t24
   3682 	paddsw              m14, m21      ; t16  t31
   3683 	psubsw              m21, m8, m15  ; t22a t25a
   3684 	paddsw              m15, m8       ; t17a t30a
   3685 	psubsw               m8, m18, m17 ; t21  t26
   3686 	paddsw              m18, m17      ; t18  t29
   3687 	paddsw              m17, m19, m20 ; t19a t28a
   3688 	psubsw              m19, m20      ; t20a t27a
        ; Final sqrt(1/2) rotations done explicitly with vpdpwssd (dot product
        ; against ±2896 pairs), accumulating into the pd_2048 rounder m10.
   3689 	vpbroadcastd        m11, [o(pw_m2896_2896)]
   3690 	vpbroadcastd        m12, [o(pw_2896_2896)]
   3691 	REPX     {pshufb x, m9}, m14, m15, m18, m17
   3692 	mova                 m9, m10
   3693 	vpdpwssd             m9, m16, m11
   3694 	mova                m20, m10
   3695 	vpdpwssd            m20, m21, m11
   3696 	psrad                m9, 12
   3697 	psrad               m20, 12
   3698 	packssdw             m9, m20      ; t23a t22
   3699 	mova                m20, m10
   3700 	vpdpwssd            m20, m16, m12
   3701 	mova                m16, m10
   3702 	vpdpwssd            m16, m21, m12
   3703 	psrad               m20, 12
   3704 	psrad               m16, 12
   3705 	packssdw            m16, m20, m16 ; t24a t25
   3706 	ITX_MUL2X_PACK        8, 21, 20, 10, 11, 12, 8 ; t21a t26a
   3707 	ITX_MUL2X_PACK       19,  8, 11, 10, 11, 12, 8 ; t20  t27
   3708 	packssdw            m11, m20      ; t27  t26a
   3709 	packssdw             m8, m21      ; t20  t21a
   3710 	punpcklqdq          m20, m14, m15 ; t16  t17a
   3711 	punpckhqdq          m14, m15      ; t31  t30a
   3712 	punpckhqdq          m15, m17, m18 ; t28a t29
   3713 	punpcklqdq          m17, m18      ; t19a t18
        ; Combine odd-half terms with the even-half results (m0-m7) to produce
        ; the 32 outputs, paired per register as commented.
   3714 	psubsw              m21, m0, m14  ; out31 out30
   3715 	paddsw               m0, m14      ; out0  out1
   3716 	psubsw              m14, m7, m20  ; out16 out17
   3717 	paddsw               m7, m20      ; out15 out14
   3718 	psubsw              m20, m1, m15  ; out28 out29
   3719 	paddsw               m1, m15      ; out3  out2
   3720 	psubsw              m15, m6, m17  ; out19 out18
   3721 	paddsw               m6, m17      ; out12 out13
   3722 	psubsw              m17, m4, m9   ; out23 out22
   3723 	paddsw               m4, m9       ; out8  out9
   3724 	psubsw              m18, m3, m16  ; out24 out25
   3725 	paddsw               m3, m16      ; out7  out6
   3726 	psubsw              m16, m5, m8   ; out20 out21
   3727 	paddsw               m5, m8       ; out11 out10
   3728 	psubsw              m19, m2, m11  ; out27 out26
   3729 	paddsw               m2, m11      ; out4  out5
   3730 	ret
   3731 
        ; 32x16 inverse DCT-DCT + add for 8bpc.
        ; Args: dst, stride, c (coeff buffer), eob. Empty blocks (eob == 0)
        ; take the DC-only path; small eob (< 151) takes the half-empty fast
        ; path; otherwise the full two-pass transform runs. Coefficients are
        ; pre-scaled by pw_2896x8 and the coeff buffer is zeroed as it is read.
   3732 cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob
   3733 %undef cmp
   3734 	lea                  r5, [o_base]
   3735 	test               eobd, eobd
   3736 	jz .dconly
        ; Load rows 0-15 as packed row pairs via the permB permute.
   3737 	mova                m21, [o(permB)]
   3738 	vpermq               m1, m21, [cq+64* 0] ;  0  1
   3739 	vpermq              m14, m21, [cq+64* 1] ;  2  3
   3740 	vpermq              m20, m21, [cq+64* 2] ;  4  5
   3741 	vpermq              m15, m21, [cq+64* 3] ;  6  7
   3742 	vpbroadcastd         m8, [o(pw_2896x8)]
   3743 	vpermq               m2, m21, [cq+64* 4] ;  8  9
   3744 	vpermq              m16, m21, [cq+64* 5] ; 10 11
   3745 	vpermq               m3, m21, [cq+64* 6] ; 12 13
   3746 	vpermq              m17, m21, [cq+64* 7] ; 14 15
   3747 	REPX   {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17
   3748 	pxor                m12, m12
   3749 	REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7
   3750 	cmp                eobd, 151
   3751 	jb .fast
        ; Full path: also load rows 16-31 and interleave words so each register
        ; holds the row pair noted in the trailing comments, as expected by the
        ; 16-point main + 32-point odd-half routines.
   3752 	vpermq               m9, m21, [cq+64* 8] ; 16 17
   3753 	vpermq              m19, m21, [cq+64* 9] ; 18 19
   3754 	vpermq               m4, m21, [cq+64*10] ; 20 21
   3755 	vpermq               m5, m21, [cq+64*11] ; 22 23
   3756 	vpermq               m6, m21, [cq+64*12] ; 24 25
   3757 	vpermq              m18, m21, [cq+64*13] ; 26 27
   3758 	vpermq               m7, m21, [cq+64*14] ; 28 29
   3759 	vpermq              m21, m21, [cq+64*15] ; 30 31
   3760 	REPX   {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21
   3761 	REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15
   3762 	punpcklwd            m8, m21, m14 ; 30  2
   3763 	punpckhwd           m21, m1       ; 31  1
   3764 	punpcklwd            m0, m17, m19 ; 14 18
   3765 	punpckhwd           m17, m9       ; 15 17
   3766 	punpcklwd            m9, m1       ; 16  0
   3767 	punpckhwd           m14, m7       ;  3 29
   3768 	punpcklwd            m1, m15, m18 ;  6 26
   3769 	punpckhwd           m15, m6       ;  7 25
   3770 	punpcklwd            m6, m2       ; 24  8
   3771 	punpckhwd           m19, m3       ; 19 13
   3772 	punpcklwd            m3, m4       ; 12 20
   3773 	punpckhwd           m18, m20      ; 27  5
   3774 	punpcklwd            m7, m20      ; 28  4
   3775 	punpckhwd           m20, m5, m2   ; 23  9
   3776 	punpcklwd            m5, m16      ; 22 10
   3777 	punpckhwd           m16, m4       ; 11 21
   3778 	call m(idct_16x16_internal_8bpc).main2
   3779 	call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
   3780 	jmp .pass2
   3781 .fast: ; bottom half zero
        ; Reduced first pass: the butterfly partners from the zero bottom half
        ; vanish, so words are duplicated (or paired with zero, m12) instead of
        ; interleaved with a second row.
   3782 	punpcklwd            m8, m14, m14 ;  2
   3783 	punpcklwd            m0, m17, m17 ; 14
   3784 	punpcklwd            m5, m16, m16 ; 10
   3785 	punpcklwd            m9, m12, m1  ; __  0
   3786 	punpckhwd           m21, m1, m1   ;  1
   3787 	punpcklwd            m1, m15, m15 ;  6
   3788 	punpcklwd            m7, m20, m20 ;  4
   3789 	punpckhwd           m19, m3, m3   ; 13
   3790 	punpcklwd            m3, m3       ; 12
   3791 	punpcklwd            m6, m12, m2  ; __  8
   3792 	punpckhwd           m18, m20, m20 ;  5
   3793 	punpckhwd           m20, m2, m2   ;  9
   3794 	call m(idct_16x16_internal_8bpc).main_fast
   3795 	punpckhwd           m15, m15      ;  7
   3796 	punpckhwd           m14, m14      ;  3
   3797 	punpckhwd           m16, m16      ; 11
   3798 	punpckhwd           m17, m17      ; 15
   3799 	call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   3800 .pass2:
        ; Transpose (with pw_16384 rounding folded in) and reorder the 32
        ; columns into the register layout the 32-point column pass expects.
   3801 	vpbroadcastd         m9, [o(pw_16384)]
   3802 	call .transpose_round
   3803 	vshufi32x4          m16, m14, m2, q3131 ;  5
   3804 	vshufi32x4          m14, m2, q2020      ;  1
   3805 	vshufi32x4           m2, m0, m3, q3131  ;  4
   3806 	vshufi32x4           m0, m3, q2020      ;  0
   3807 	vshufi32x4           m3, m1, m18, q3131 ;  6
   3808 	vshufi32x4           m1, m18, q2020     ;  2
   3809 	vshufi32x4          m18, m20, m6, q2020 ;  9
   3810 	vshufi32x4          m20, m6, q3131      ; 13
   3811 	vshufi32x4           m6, m21, m4, q3131 ; 12
   3812 	vshufi32x4           m4, m21, m4, q2020 ;  8
   3813 	vshufi32x4          m21, m19, m7, q3131 ; 15
   3814 	vshufi32x4          m19, m7, q2020      ; 11
   3815 	vshufi32x4           m7, m5, m15, q3131 ; 14
   3816 	vshufi32x4           m5, m15, q2020     ; 10
   3817 	vshufi32x4          m15, m17, m9, q2020 ;  3
   3818 	vshufi32x4          m17, m9, q3131      ;  7
   3819 	call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
   3820 	call .main_oddhalf
        ; Output: round by pw_2048, add to dst (pmovzxbw widens the existing
        ; pixels to words), pack back to bytes and store four rows per group.
   3821 	vpbroadcastd        m12, [o(pw_2048)]
   3822 	movshdup            m13, [o(permD)]
   3823 	lea                  r2, [strideq*3]
   3824 	pmovzxbw             m8, [dstq+strideq*0]
   3825 	pmovzxbw             m9, [dstq+strideq*1]
   3826 	pmovzxbw            m10, [dstq+strideq*2]
   3827 	pmovzxbw            m11, [dstq+r2       ]
   3828 	REPX  {pmulhrsw x, m12}, m0, m1, m2, m3
   3829 	lea                  r3, [dstq+strideq*4]
   3830 	paddw                m0, m8
   3831 	paddw                m1, m9
   3832 	paddw                m2, m10
   3833 	paddw                m3, m11
   3834 	pmovzxbw             m8, [r3+strideq*0]
   3835 	pmovzxbw             m9, [r3+strideq*1]
   3836 	pmovzxbw            m10, [r3+strideq*2]
   3837 	pmovzxbw            m11, [r3+r2       ]
   3838 	REPX  {pmulhrsw x, m12}, m4, m5, m6, m7
   3839 	lea                  r4, [dstq+strideq*8]
   3840 	packuswb             m0, m1
   3841 	paddw                m4, m8
   3842 	paddw                m5, m9
   3843 	packuswb             m2, m3
   3844 	paddw                m6, m10
   3845 	paddw                m7, m11
   3846 	pmovzxbw             m8, [r4+strideq*0]
   3847 	pmovzxbw             m9, [r4+strideq*1]
   3848 	pmovzxbw            m10, [r4+strideq*2]
   3849 	pmovzxbw            m11, [r4+r2       ]
   3850 	REPX  {pmulhrsw x, m12}, m14, m15, m16, m17
   3851 	lea                  r5, [r3+strideq*8]
   3852 	packuswb             m4, m5
   3853 	paddw               m14, m8
   3854 	paddw               m15, m9
   3855 	packuswb             m6, m7
   3856 	paddw               m16, m10
   3857 	paddw               m17, m11
   3858 	pmovzxbw             m8, [r5+strideq*0]
   3859 	pmovzxbw             m9, [r5+strideq*1]
   3860 	pmovzxbw            m10, [r5+strideq*2]
   3861 	pmovzxbw            m11, [r5+r2       ]
   3862 	REPX  {pmulhrsw x, m12}, m18, m19, m20, m21
   3863 	packuswb            m14, m15
   3864 	paddw               m18, m8
   3865 	paddw               m19, m9
   3866 	packuswb            m16, m17
   3867 	paddw               m20, m10
   3868 	paddw               m21, m11
   3869 	packuswb            m18, m19
   3870 	packuswb            m20, m21
        ; permD fixes up the lane order after packuswb before the final stores.
   3871 	REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20
   3872 	mova          [dstq+strideq*0], ym0
   3873 	vextracti32x8 [dstq+strideq*1], m0, 1
   3874 	mova          [dstq+strideq*2], ym2
   3875 	vextracti32x8 [dstq+r2       ], m2, 1
   3876 	mova          [r3+strideq*0], ym4
   3877 	vextracti32x8 [r3+strideq*1], m4, 1
   3878 	mova          [r3+strideq*2], ym6
   3879 	vextracti32x8 [r3+r2       ], m6, 1
   3880 	mova          [r4+strideq*0], ym14
   3881 	vextracti32x8 [r4+strideq*1], m14, 1
   3882 	mova          [r4+strideq*2], ym16
   3883 	vextracti32x8 [r4+r2       ], m16, 1
   3884 	mova          [r5+strideq*0], ym18
   3885 	vextracti32x8 [r5+strideq*1], m18, 1
   3886 	mova          [r5+strideq*2], ym20
   3887 	vextracti32x8 [r5+r2       ], m20, 1
   3888 	RET
   3889 ALIGN function_align
        ; DC-only path for 32x16: scale DC twice by 181 with rounding
        ; (181/256 per step, i.e. the 2896/4096 ≈ 1/sqrt(2) factor applied as
        ; an integer multiply), fold in the final +1 rounding bit via the
        ; 8+1 shift, then tail-call the shared 32-wide store loop with
        ; r3d = 16 rows.
   3890 .dconly:
   3891 	movsx               r6d, word [cq]
   3892 	mov                [cq], eobd
   3893 	or                  r3d, 16
   3894 	imul                r6d, 181
   3895 	add                 r6d, 128
   3896 	sar                 r6d, 8
   3897 	imul                r6d, 181
   3898 	add                 r6d, 128+256
   3899 	sar                 r6d, 8+1
   3900 	jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
   3901 ALIGN function_align
   3902 cglobal_label .main_oddhalf_fast3 ; bottom seven-eights are zero
        ; Extreme fast path: only m0 (DC term) and m14 contribute. t0 comes
        ; from a single pw_2896x8 scale of m0; t8a/t15a from m14. The remaining
        ; butterflies collapse, so every output is m8 (t0) plus/minus one
        ; rotated term. Rotations use vpdpwssd with broadcast constant pairs,
        ; accumulating into the pd_2048 rounder m10.
   3903 	vpbroadcastd         m8, [o(pw_2896x8)]
   3904 	vpbroadcastd         m4, [o(pw_4076x8)]
   3905 	vpbroadcastd         m3, [o(pw_401x8)]
   3906 	pmulhrsw             m8, m0  ; t0
   3907 	pmulhrsw             m4, m14 ; t15a
   3908 	pmulhrsw             m3, m14 ; t8a
   3909 	punpcklwd            m9, m3, m4
   3910 	punpckhwd            m5, m3, m4
   3911 	mova                 m2, m10
   3912 	vpdpwssd             m2, m9, [o(pw_m3784_1567)] {bcstd}
   3913 	mova                 m1, m10
   3914 	vpdpwssd             m1, m5, [o(pw_m3784_1567)] {bcstd}
   3915 	mova                 m6, m10
   3916 	vpdpwssd             m6, m5, [o(pw_1567_3784)] {bcstd}
   3917 	mova                 m5, m10
   3918 	vpdpwssd             m5, m9, [o(pw_1567_3784)] {bcstd}
   3919 	vpbroadcastd        m11, [o(pw_2896_2896)]
   3920 	vpbroadcastd        m12, [o(pw_m2896_2896)]
   3921 	psubsw              m21, m8, m4 ; out15
   3922 	paddsw               m0, m8, m4 ; out0
   3923 	psubsw              m14, m8, m3 ; out8
   3924 	paddsw               m7, m8, m3 ; out7
   3925 	REPX      {psrad x, 12}, m2, m1, m6, m5
   3926 	packssdw             m2, m1     ; t9a
   3927 	packssdw             m5, m6     ; t14a
   3928 	ITX_MULSUB_2W         4, 3, 16, 17, 10, 11, 12 ; t11,  t12
   3929 	psubsw              m20, m8, m5 ; out14
   3930 	paddsw               m1, m8, m5 ; out1
   3931 	psubsw              m15, m8, m2 ; out9
   3932 	paddsw               m6, m8, m2 ; out6
   3933 	ITX_MULSUB_2W         5, 2, 16, 17, 10, 11, 12 ; t10a, t13a
   3934 	psubsw              m18, m8, m3 ; out12
   3935 	paddsw               m3, m8     ; out3
   3936 	psubsw              m17, m8, m4 ; out11
   3937 	paddsw               m4, m8     ; out4
   3938 	psubsw              m19, m8, m2 ; out13
   3939 	paddsw               m2, m8     ; out2
   3940 	psubsw              m16, m8, m5 ; out10
   3941 	paddsw               m5, m8     ; out5
   3942 	ret
   3943 cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
        ; Only m0, m1, m14, m15 contribute: each initial rotation is a single
        ; pmulhrsw with a packed *x8 constant, the idct8 even part is computed
        ; inline, and the odd terms' butterfly partners (which would be zero)
        ; are satisfied with plain register copies before joining .main3.
   3944 	vpbroadcastd         m9, [o(pw_2896x8)]
   3945 	vpbroadcastd         m2, [o(pw_4017x8)]
   3946 	vpbroadcastd         m3, [o(pw_799x8)]
   3947 	vpbroadcastd        m18, [o(pw_4076x8)]
   3948 	vpbroadcastd        m19, [o(pw_401x8)]
   3949 	vpbroadcastd        m20, [o(pw_m1189x8)]
   3950 	vpbroadcastd        m16, [o(pw_3920x8)]
   3951 	pmulhrsw             m9, m0  ; t0
   3952 	pmulhrsw             m2, m1  ; t7a
   3953 	pmulhrsw             m1, m3  ; t4a
   3954 	pmulhrsw            m18, m14 ; t15a
   3955 	pmulhrsw            m14, m19 ; t8a
   3956 	pmulhrsw            m20, m15 ; t11a
   3957 	pmulhrsw            m15, m16 ; t12a
   3958 	psubsw               m7, m9, m2 ; idct8 out7
   3959 	paddsw               m0, m9, m2 ; idct8 out0
   3960 	psubsw               m4, m9, m1 ; idct8 out4
   3961 	paddsw               m3, m9, m1 ; idct8 out3
   3962 	ITX_MULSUB_2W         2, 1, 5, 6, 10, 2896, 2896 ; t5, t6
   3963 	mova                m21, m18
   3964 	mova                m19, m14
   3965 	mova                m16, m15
   3966 	mova                 m8, m20
   3967 	psubsw               m6, m9, m1 ; idct8 out6
   3968 	paddsw               m1, m9     ; idct8 out1
   3969 	psubsw               m5, m9, m2 ; idct8 out5
   3970 	paddsw               m2, m9     ; idct8 out2
   3971 	jmp .main3
   3972 ALIGN function_align
   3973 cglobal_label .main_oddhalf_fast ; bottom half is zero
        ; Half the inputs are zero: the idct8 even part degenerates to single
        ; pmulhrsw scalings (computed here, finished by the shared 32x8 .main3),
        ; and the eight t8a..t15a odd terms likewise reduce to one pmulhrsw
        ; each before joining the full path at .main2.
   3974 	vpbroadcastd         m5, [o(pw_m2276x8)]
   3975 	vpbroadcastd        m11, [o(pw_3406x8)]
   3976 	vpbroadcastd         m7, [o(pw_4017x8)]
   3977 	vpbroadcastd        m12, [o(pw_799x8)]
   3978 	vpbroadcastd         m6, [o(pw_3784x8)]
   3979 	vpbroadcastd        m10, [o(pw_1567x8)]
   3980 	vpbroadcastd         m4, [o(pw_2896x8)]
   3981 	pmulhrsw             m5, m3  ; t5a
   3982 	pmulhrsw             m3, m11 ; t6a
   3983 	pmulhrsw             m7, m1  ; t7a
   3984 	pmulhrsw             m1, m12 ; t4a
   3985 	pmulhrsw             m6, m2  ; t3
   3986 	pmulhrsw             m2, m10 ; t2
   3987 	pmulhrsw             m4, m0  ; t0
   3988 	vpbroadcastd        m11, [o(pw_2896_2896)]
   3989 	vpbroadcastd        m12, [o(pw_m2896_2896)]
   3990 	vpbroadcastd        m10, [o(pd_2048)]
   3991 	mova                 m0, m4  ; t1
   3992 	call m(inv_txfm_add_dct_dct_32x8_8bpc).main3
   3993 	vpbroadcastd        m21, [o(pw_4076x8)]
   3994 	vpbroadcastd         m8, [o(pw_401x8)]
   3995 	vpbroadcastd        m18, [o(pw_m2598x8)]
   3996 	vpbroadcastd         m9, [o(pw_3166x8)]
   3997 	vpbroadcastd        m19, [o(pw_3612x8)]
   3998 	vpbroadcastd        m11, [o(pw_1931x8)]
   3999 	vpbroadcastd        m20, [o(pw_m1189x8)]
   4000 	vpbroadcastd        m12, [o(pw_3920x8)]
   4001 	pmulhrsw            m21, m14 ; t15a
   4002 	pmulhrsw            m14, m8  ; t8a
   4003 	pmulhrsw            m18, m17 ; t9a
   4004 	pmulhrsw            m17, m9  ; t14a
   4005 	pmulhrsw            m19, m16 ; t13a
   4006 	pmulhrsw            m16, m11 ; t10a
   4007 	pmulhrsw            m20, m15 ; t11a
   4008 	pmulhrsw            m15, m12 ; t12a
   4009 	jmp .main2
   4010 ALIGN function_align
        ; Full odd half of the 16-point column IDCT. Inputs: m14-m21 = odd
        ; coefficient rows, m0-m7 = even-half results, m10 = pd_2048 rounder.
        ; Outputs out0..out15 in m0-m7/m14-m21 as commented. ITX_MULSUB_2W
        ; (macro defined earlier in the file) performs the two-word rotations.
   4011 cglobal_label .main_oddhalf
   4012 	ITX_MULSUB_2W        14, 21, 8, 9, 10,  401, 4076 ; t8a,  t15a
   4013 	ITX_MULSUB_2W        18, 17, 8, 9, 10, 3166, 2598 ; t9a,  t14a
   4014 	ITX_MULSUB_2W        16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a
   4015 	ITX_MULSUB_2W        20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a
   4016 .main2:
        ; First butterfly stage.
   4017 	paddsw               m8, m20, m16 ; t11
   4018 	psubsw              m20, m16      ; t10
   4019 	paddsw              m16, m15, m19 ; t12
   4020 	psubsw              m15, m19      ; t13
   4021 	psubsw              m19, m14, m18 ; t9
   4022 	paddsw              m14, m18      ; t8
   4023 	psubsw              m18, m21, m17 ; t14
   4024 	paddsw              m21, m17      ; t15
   4025 .main3:
        ; 1567/3784 rotations, second butterflies, then the final sqrt(1/2)
        ; (2896) rotations before combining with the even half.
   4026 	vpbroadcastd        m11, [o(pw_1567_3784)]
   4027 	vpbroadcastd        m12, [o(pw_m3784_1567)]
   4028 	ITX_MULSUB_2W        18, 19, 9, 17, 10, 11, 12 ; t9a,  t14a
   4029 	vpbroadcastd        m11, [o(pw_m1567_m3784)]
   4030 	ITX_MULSUB_2W        15, 20, 9, 17, 10, 12, 11 ; t10a, t13a
   4031 	vpbroadcastd        m11, [o(pw_2896_2896)]
   4032 	vpbroadcastd        m12, [o(pw_m2896_2896)]
   4033 	psubsw              m17, m14, m8  ; t11a
   4034 	paddsw               m8, m14      ; t8a
   4035 	paddsw              m14, m18, m15 ; t9
   4036 	psubsw              m18, m15      ; t10
   4037 	psubsw              m15, m19, m20 ; t13
   4038 	paddsw              m19, m20      ; t14
   4039 	paddsw              m20, m21, m16 ; t15a
   4040 	psubsw              m16, m21, m16 ; t12a
   4041 	ITX_MULSUB_2W        15, 18, 9, 21, 10, 11, 12 ; t10a, t13a
   4042 	ITX_MULSUB_2W        16, 17, 9, 21, 10, 11, 12 ; t11,  t12
        ; Combine with the even-half results m0-m7 to produce the 16 outputs.
   4043 	psubsw              m21, m0, m20 ; out15
   4044 	paddsw               m0, m20     ; out0
   4045 	psubsw              m20, m1, m19 ; out14
   4046 	paddsw               m1, m19     ; out1
   4047 	psubsw              m19, m2, m18 ; out13
   4048 	paddsw               m2, m18     ; out2
   4049 	psubsw              m18, m3, m17 ; out12
   4050 	paddsw               m3, m17     ; out3
   4051 	psubsw              m17, m4, m16 ; out11
   4052 	paddsw               m4, m16     ; out4
   4053 	psubsw              m16, m5, m15 ; out10
   4054 	paddsw               m5, m15     ; out5
   4055 	psubsw              m15, m6, m14 ; out9
   4056 	paddsw               m6, m14     ; out6
   4057 	psubsw              m14, m7, m8  ; out8
   4058 	paddsw               m7, m8      ; out7
   4059 	ret
        ; Word-level transpose of the 32x16 intermediate held in 16 registers,
        ; with a rounding multiply by m9 (pw_16384 at the call site) folded in
        ; between unpack stages. The a/b/c/d comments label the four 8x16
        ; quadrants; the final lane shuffles gather them into the layout the
        ; column pass reorders from.
   4060 .transpose_round:
   4061 	punpcklwd            m8, m0, m2
   4062 	punpckhwd            m0, m2
   4063 	punpcklwd            m2, m1, m3
   4064 	punpckhwd            m1, m3
   4065 	punpcklwd            m3, m4, m6
   4066 	punpckhwd            m4, m6
   4067 	punpcklwd            m6, m5, m7
   4068 	punpckhwd            m5, m7
   4069 	punpcklwd            m7, m14, m16
   4070 	punpckhwd           m14, m16
   4071 	punpcklwd           m16, m15, m17
   4072 	punpckhwd           m15, m17
   4073 	punpcklwd           m17, m19, m21
   4074 	punpckhwd           m19, m21
   4075 	punpckhwd           m21, m18, m20
   4076 	punpcklwd           m18, m20
   4077 	punpcklwd           m20, m8, m1
   4078 	punpckhwd            m8, m1
   4079 	punpcklwd            m1, m0, m2
   4080 	punpckhwd            m0, m2
   4081 	punpcklwd            m2, m3, m5
   4082 	punpckhwd            m3, m5
   4083 	punpcklwd            m5, m4, m6
   4084 	punpckhwd            m4, m6
   4085 	REPX   {pmulhrsw x, m9}, m20, m8, m1, m0
   4086 	punpcklwd            m6, m7, m15
   4087 	punpckhwd            m7, m15
   4088 	punpcklwd           m15, m14, m16
   4089 	punpckhwd           m14, m16
   4090 	REPX   {pmulhrsw x, m9}, m2, m3, m5, m4
   4091 	punpckhwd           m16, m18, m19
   4092 	punpcklwd           m18, m19
   4093 	punpcklwd           m19, m21, m17
   4094 	punpckhwd           m21, m17
   4095 	REPX   {pmulhrsw x, m9}, m6, m7, m15, m14
   4096 	punpcklwd           m17, m8, m0         ; a2   a6   aa   ae
   4097 	punpckhwd            m8, m0             ; a3   a7   ab   af
   4098 	punpcklwd            m0, m20, m1        ; a0   a4   a8   ac
   4099 	punpckhwd           m20, m1             ; a1   a5   a9   ad
   4100 	REPX   {pmulhrsw x, m9}, m16, m18, m19, m21
   4101 	punpcklwd            m1, m2, m5         ; b0   b4   b8   bc
   4102 	punpckhwd            m2, m5             ; b1   b5   b9   bd
   4103 	punpcklwd            m5, m3, m4         ; b2   b6   ba   be
   4104 	punpckhwd            m3, m4             ; b3   b7   bb   bf
   4105 	punpcklwd            m4, m6, m15        ; c0   c4   c8   cc
   4106 	punpckhwd            m6, m15            ; c1   c5   c9   cd
   4107 	punpcklwd           m15, m7, m14        ; c2   c6   ca   ce
   4108 	punpckhwd            m7, m14            ; c3   c7   cb   cf
   4109 	punpcklwd           m14, m18, m19       ; d0   d4   d8   dc
   4110 	punpckhwd           m18, m19            ; d1   d5   d9   dd
   4111 	punpcklwd            m9, m16, m21       ; d2   d6   da   de
   4112 	punpckhwd           m16, m21            ; d3   d7   db   df
   4113 	vshufi32x4          m21, m0, m1, q3232  ; a8   ac   b8   bc
   4114 	vinserti32x8         m0, ym1, 1         ; a0   a4   b0   b4
   4115 	vinserti32x8         m1, m17, ym5, 1    ; a2   a6   b2   b6
   4116 	vshufi32x4           m5, m17, m5, q3232 ; aa   ae   ba   be
   4117 	vinserti32x8        m17, m8, ym3, 1     ; a3   a7   b3   b7
   4118 	vshufi32x4          m19, m8, m3, q3232  ; ab   af   bb   bf
   4119 	vinserti32x8         m3, m4, ym14, 1    ; c0   c4   d0   d4
   4120 	vshufi32x4           m4, m14, q3232     ; c8   cc   d8   dc
   4121 	vinserti32x8        m14, m20, ym2, 1    ; a1   a5   b1   b5
   4122 	vshufi32x4          m20, m2, q3232      ; a9   ad   b9   bd
   4123 	vinserti32x8         m2, m6, ym18, 1    ; c1   c5   d1   d5
   4124 	vshufi32x4           m6, m18, q3232     ; c9   cd   d9   dd
   4125 	vinserti32x8        m18, m15, ym9, 1    ; c2   c6   d2   d6
   4126 	vshufi32x4          m15, m9, q3232      ; ca   ce   da   de
   4127 	vinserti32x8         m9, m7, ym16, 1    ; c3   c7   d3   d7
   4128 	vshufi32x4           m7, m16, q3232     ; cb   cf   db   df
   4129 	ret
   4130 
        ; Identity-transform scaling for four coefficient registers at once:
        ;   t   = coef * m15   (pw_2896x8 at the call site)
        ;   out = t + ((t * m16) * m17)   (pw_1697x16 and pw_16384 there)
        ; i.e. the dav1d identity scaling applied via pmulhrsw chains with a
        ; saturating add. Clobbers m18-m21 as temporaries.
   4131 %macro IDTX_16x32 4 ; src/dst[1-4]
   4132 	pmulhrsw            m%1, m15, [cq+64*%1]
   4133 	pmulhrsw            m%2, m15, [cq+64*%2]
   4134 	pmulhrsw            m%3, m15, [cq+64*%3]
   4135 	pmulhrsw            m%4, m15, [cq+64*%4]
   4136 	pmulhrsw            m18, m16, m%1
   4137 	pmulhrsw            m19, m16, m%2
   4138 	pmulhrsw            m20, m16, m%3
   4139 	pmulhrsw            m21, m16, m%4
   4140 	REPX  {pmulhrsw x, m17}, m18, m19, m20, m21
   4141 	paddsw              m%1, m18
   4142 	paddsw              m%2, m19
   4143 	paddsw              m%3, m20
   4144 	paddsw              m%4, m21
   4145 %endmacro
   4146 
; Adds one pair of transformed rows (m%1, m%2) to four 16-pixel destination
; rows and writes the clipped result back. Expects r3 = stride*2,
; r4 = stride*3 and m18 = 0 (used both for byte->word unpacking and for
; clearing the consumed coefficient rows in cq).
%macro IDTX_16x32_STORE 2 ; src[1-2]
    ; gather 4 destination rows (dst, dst+8*stride, +16, +24) into one zmm
    mova               xm17, [dstq+r3*0]
    vinserti128        ym17, [dstq+r3*4], 1
    vinserti32x4        m17, [dstq+r3*8], 2
    vinserti32x4        m17, [dstq+r4*8], 3
    ; zero out the coefficient rows we just consumed (m18 == 0)
    mova   [cq+64*(%1*2+0)], m18
    mova   [cq+64*(%1*2+1)], m18
    ; zero-extend dst bytes to words by interleaving with zero
    punpcklbw           m16, m17, m18
    punpckhbw           m17, m18
    paddw               m16, m%1
    paddw               m17, m%2
    ; clip to [0,255] and scatter the four rows back out
    packuswb            m16, m17
    mova          [dstq+r3*0], xm16
    vextracti128  [dstq+r3*4], ym16, 1
    vextracti32x4 [dstq+r3*8], m16, 2
    vextracti32x4 [dstq+r4*8], m16, 3
%if %1 != 7
    add                dstq, strideq
%endif
%endmacro
   4167 
; void inv_txfm_add_identity_identity_16x32_8bpc(dst, stride, coeff, ...)
; Identity/identity 16x32 inverse transform + add for 8-bit content.
; Scales all 16 coefficient rows, transposes, then accumulates into dst.
cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c
    ; constants consumed by IDTX_16x32 (see macro header)
    vpbroadcastd        m15, [pw_2896x8]
    vpbroadcastd        m16, [pw_1697x16]
    vpbroadcastd        m17, [pw_16384]
    ; transform all 16 rows, four at a time, into m0-m15
    IDTX_16x32            0,  1,  2,  3
    IDTX_16x32            4,  5,  6,  7
    IDTX_16x32            8,  9, 10, 11
    IDTX_16x32           12, 13, 14, 15
    vpbroadcastd        m16, [pw_8192] ; rounding factor applied mid-transpose
    call .transpose_2x8x8_round
    lea                  r3, [strideq*2]
    lea                  r4, [strideq*3]
    pxor                m18, m18       ; zero reg for IDTX_16x32_STORE
    IDTX_16x32_STORE      0,  8
    IDTX_16x32_STORE      1,  9
    IDTX_16x32_STORE      2, 10
    IDTX_16x32_STORE      3, 11
    IDTX_16x32_STORE      4, 12
    IDTX_16x32_STORE      5, 13
    IDTX_16x32_STORE      6, 14
    IDTX_16x32_STORE      7, 15
    RET
ALIGN function_align
; Transposes two 8x8 word blocks (m0-m7 and m8-m15) in place via the usual
; word->dword->qword interleave ladder, multiplying every lane by m16
; (pmulhrsw rounding constant) between the dword and qword stages.
; Clobbers m17; everything else returns transposed in the same registers.
.transpose_2x8x8_round:
    punpckhwd           m17, m4, m5
    punpcklwd            m4, m5
    punpckhwd            m5, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m6, m7
    punpcklwd            m6, m7
    punpckhwd            m7, m2, m3
    punpcklwd            m2, m3
    punpckhdq            m3, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m4, m6
    punpckhdq            m4, m6
    punpckhdq            m6, m5, m7
    punpckldq            m5, m7
    punpckldq            m7, m17, m1
    punpckhdq           m17, m1
    REPX  {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17
    punpckhqdq           m1, m0, m2
    punpcklqdq           m0, m2
    punpcklqdq           m2, m3, m4
    punpckhqdq           m3, m4
    punpcklqdq           m4, m5, m7
    punpckhqdq           m5, m7
    punpckhqdq           m7, m6, m17
    punpcklqdq           m6, m17
    ; second 8x8 block, same pattern on m8-m15
    punpckhwd           m17, m12, m13
    punpcklwd           m12, m13
    punpckhwd           m13, m8, m9
    punpcklwd            m8, m9
    punpckhwd            m9, m14, m15
    punpcklwd           m14, m15
    punpckhwd           m15, m10, m11
    punpcklwd           m10, m11
    punpckhdq           m11, m8, m10
    punpckldq            m8, m10
    punpckldq           m10, m12, m14
    punpckhdq           m12, m14
    punpckhdq           m14, m13, m15
    punpckldq           m13, m15
    punpckldq           m15, m17, m9
    punpckhdq           m17, m9
    REPX  {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17
    punpckhqdq           m9, m8, m10
    punpcklqdq           m8, m10
    punpcklqdq          m10, m11, m12
    punpckhqdq          m11, m12
    punpcklqdq          m12, m13, m15
    punpckhqdq          m13, m15
    punpckhqdq          m15, m14, m17
    punpcklqdq          m14, m17
    ret
   4243 
; 32x16 identity transform: loads two 32-byte half-rows per output register,
; doubles them (identity-32 row scaling), permutes the halves into place with
; vpermi2q/vpermt2q (index vectors in m14/m16, set up by the caller), then
; applies out = 2*x + round(x * 1697/16384) with m12 = pw_2896x8 and
; m17 = pw_1697x16 (caller-provided).
%macro IDTX_32x16 4 ; dst[1-4]
    ; load + pre-scale by 2896/32768
    pmulhrsw            m%2, m12, [cq+32*(%1+ 0)]
    pmulhrsw            m18, m12, [cq+32*(%1+16)]
    pmulhrsw            m%4, m12, [cq+32*(%3+ 0)]
    pmulhrsw            m19, m12, [cq+32*(%3+16)]
    REPX      {paddsw x, x}, m%2, m18, m%4, m19 ; x *= 2 (saturating)
    ; interleave the two half-rows into full rows (m14/m16 = qword permutes)
    mova                m%1, m14
    vpermi2q            m%1, m%2, m18
    vpermt2q            m%2, m16, m18
%if %3 != 14 ; on the last invocation %3 aliases m14; it already holds the permute
    mova                m%3, m14
%endif
    vpermi2q            m%3, m%4, m19
    vpermt2q            m%4, m16, m19
    ; t = x * 1697/16384
    pmulhrsw            m18, m17, m%1
    pmulhrsw            m19, m17, m%2
    pmulhrsw            m20, m17, m%3
    pmulhrsw            m21, m17, m%4
    REPX      {paddsw x, x}, m%1, m%2, m%3, m%4 ; x *= 2 again
    ; out = x + t
    paddsw              m%1, m18
    paddsw              m%2, m19
    paddsw              m%3, m20
    paddsw              m%4, m21
%endmacro
   4268 
; Adds a pair of transformed rows (m%1, m%2) to two 32-pixel destination rows
; (dst and dst+8*stride) and stores the clipped result. Expects m20 = 0.
; The optional third argument selects 32x32 mode: skips clearing cq (the
; 32x32 caller manages cq itself) and always advances dstq.
%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32
    mova               ym19, [dstq+strideq*0]
    vinserti32x8        m19, [dstq+strideq*8], 1
%if %3 == 0
    ; clear the consumed coefficient rows (m20 == 0)
    mova   [cq+64*(%1*2+0)], m20
    mova   [cq+64*(%1*2+1)], m20
%endif
    ; zero-extend dst bytes to words, accumulate, clip back to bytes
    punpcklbw           m18, m19, m20
    punpckhbw           m19, m20
    paddw               m18, m%1
    paddw               m19, m%2
    packuswb            m18, m19
    mova          [dstq+strideq*0], ym18
    vextracti32x8 [dstq+strideq*8], m18, 1
%if %3 || %1 != 7
    add                dstq, strideq
%endif
%endmacro
   4287 
; void inv_txfm_add_identity_identity_32x16_8bpc(dst, stride, coeff, ...)
; Identity/identity 32x16 inverse transform + add for 8-bit content.
; Reuses the 16x32 transpose helper; rounding constant here is pw_2048.
cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c
    vpbroadcastd        m12, [pw_2896x8]
    movu                m14, [permB+7]     ; qword interleave pattern for IDTX_32x16
    vpbroadcastd        m17, [pw_1697x16]
    psrlq               m16, m14, 4        ; companion permute (other half-lanes)
    IDTX_32x16            0,  1,  2,  3
    IDTX_32x16            4,  5,  6,  7
    IDTX_32x16            8,  9, 10, 11
    IDTX_32x16           12, 13, 14, 15
    vpbroadcastd        m16, [pw_2048]     ; transpose-stage rounding
    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
    pxor                m20, m20           ; zero reg for IDTX_32x16_STORE
    IDTX_32x16_STORE      0,  8
    IDTX_32x16_STORE      1,  9
    IDTX_32x16_STORE      2, 10
    IDTX_32x16_STORE      3, 11
    IDTX_32x16_STORE      4, 12
    IDTX_32x16_STORE      5, 13
    IDTX_32x16_STORE      6, 14
    IDTX_32x16_STORE      7, 15
    RET
   4309 
; Final butterfly + store for one row pair of the 32x32 IDCT.
; %1 = register holding the odd-half term, %2 = even-half term: in a register
; for %2 < 8, spilled to cq for %2 >= 8 (reloaded, then the slot is zeroed).
; Writes row %2 to dstq+%3 and the mirrored row (31-%2) to r3+%4.
; Expects m12 = pw_2048 rounding constant and m13 = output lane permute.
%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
    pmovzxbw            m10, [dstq+%3]        ; dst row (top half)
    pmovzxbw            m11, [r3  +%4]        ; mirrored dst row (bottom half)
%if %2 < 8
    paddsw               m8, m%2, m%1         ; even + odd -> row %2
    psubsw               m9, m%2, m%1         ; even - odd -> row 31-%2
%else
    mova                 m9, [cq+64*(%2*2-16)] ; reload spilled even term
    paddsw               m8, m9, m%1
    psubsw               m9, m%1
%endif
    ; round (pw_2048 => >>4 with rounding)
    pmulhrsw             m8, m12
    pmulhrsw             m9, m12
%if %2 >= 8
%if %2 == 8 ; m0 is free from here on; use it as the zero register
    pxor                 m0, m0
%endif
    ; clear the spilled coefficient slot we just consumed
    mova  [cq+64*(%2*2-16)], m0
    mova  [cq+64*(%2*2-15)], m0
%endif
    paddw                m8, m10
    paddw                m9, m11
    packuswb             m8, m9               ; clip to [0,255]
    vpermq               m8, m13, m8          ; undo pack interleave
    mova          [dstq+%3], ym8
    vextracti32x8 [r3  +%4], m8, 1
%if %2 == 3 || %2 == 7 || %2 == 11
    ; move to the next group of 4 rows (r5 = stride*4); r3 mirrors downward
    add                dstq, r5
    sub                  r3, r5
%endif
%endmacro
   4341 
; void inv_txfm_add_dct_dct_32x32_8bpc(dst, stride, coeff, eob, ...)
; 32x32 DCT/DCT inverse transform + add for 8-bit content.
; eob == 0        -> DC-only path (.dconly)
; eob <  136      -> .fast: only the top-left quarter of coefficients can be
;                    nonzero, so halved row/column transforms suffice
; otherwise       -> full path: row transforms on four 16x32 quadrants
;                    (labelled a/b/c/d in the lane comments), a register/
;                    memory transpose, then full column transforms.
; Coefficient memory (cq) doubles as spill space and is zeroed as consumed.
cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
%undef cmp
    lea                  r5, [o_base]
    test               eobd, eobd
    jz .dconly
    WIN64_SPILL_XMM      30
    cmp                eobd, 136
    jb .fast
    ; -- full path: even rows (0,4,...,28) through the 32x8 main idct --
    mova                 m5, [cq+64*20]
    mova                 m3, [cq+64*12]
    mova                 m1, [cq+64* 4]
    mova                 m7, [cq+64*28]
    mova                 m2, [cq+64* 8]
    mova                 m6, [cq+64*24]
    mova                 m0, [cq+64* 0]
    mova                 m4, [cq+64*16]
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    ; rows 2,6,...,30 through the 16-point odd-half
    mova                m14, [cq+64* 2]
    mova                m21, [cq+64*30]
    mova                m18, [cq+64*18]
    mova                m17, [cq+64*14]
    mova                m16, [cq+64*10]
    mova                m19, [cq+64*22]
    mova                m20, [cq+64*26]
    mova                m15, [cq+64* 6]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
    ; spill the 16 even-half outputs so the registers can run the odd half
    mova         [cq+64* 0], m14
    mova         [cq+64* 2], m15
    mova         [cq+64* 4], m16
    mova         [cq+64* 6], m17
    mova         [cq+64* 8], m18
    mova         [cq+64*10], m19
    mova         [cq+64*12], m20
    mova         [cq+64*14], m21
    ; odd rows (1,3,...,31) through the 32-point odd-half
    mova                m22, [cq+64* 1]
    mova                m21, [cq+64*31]
    mova                m14, [cq+64*17]
    mova                m29, [cq+64*15]
    mova                m26, [cq+64* 9]
    mova                m17, [cq+64*23]
    mova                m18, [cq+64*25]
    mova                m25, [cq+64* 7]
    mova                m24, [cq+64* 5]
    mova                m19, [cq+64*27]
    mova                m16, [cq+64*21]
    mova                m27, [cq+64*11]
    mova                m28, [cq+64*13]
    mova                m15, [cq+64*19]
    mova                m20, [cq+64*29]
    mova                m23, [cq+64* 3]
    call .main_oddhalf
    ; combine even/odd halves into rows 0-7 / 24-31, then transpose
    vpbroadcastd        m10, [o(pw_8192)]
    psubsw              m13, m0, m29 ; 31
    paddsw               m0, m29     ;  0
    psubsw              m29, m1, m28 ; 30
    paddsw               m1, m28     ;  1
    psubsw              m28, m2, m27 ; 29
    paddsw               m2, m27     ;  2
    psubsw              m27, m3, m26 ; 28
    paddsw               m3, m26     ;  3
    psubsw              m26, m4, m25 ; 27
    paddsw               m4, m25     ;  4
    psubsw              m25, m5, m24 ; 26
    paddsw               m5, m24     ;  5
    psubsw              m24, m6, m23 ; 25
    paddsw               m6, m23     ;  6
    psubsw              m23, m7, m22 ; 24
    paddsw               m7, m22     ;  7
    pxor                 m9, m9
    punpckhwd            m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
    punpcklwd            m0, m1     ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd            m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
    punpcklwd            m2, m3     ; c0 d0 c1 d1 c2 d2 c3 d3
    REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 ; zero consumed coeff rows
    punpckhwd           m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
    punpcklwd            m4, m5     ; e0 f0 e1 f1 e2 f2 e3 f3
    punpckhwd            m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
    punpcklwd            m6, m7     ; g0 h0 g1 h1 g2 h2 g3 h3
    REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
    punpckhwd            m3, m23, m24
    punpcklwd           m23, m24
    punpckhwd           m24, m25, m26
    punpcklwd           m25, m26
    REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
    punpckhwd           m26, m27, m28
    punpcklwd           m27, m28
    punpckhwd           m28, m29, m13
    punpcklwd           m29, m13
    REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
    punpckhdq            m7, m0, m2  ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq            m0, m2      ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq            m2, m4, m6  ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq            m4, m6      ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq            m6, m8, m1  ; a6 b6 c6 d6 a7 b7 c7 d7
    punpckldq            m8, m1      ; a4 b4 c4 d4 a5 b5 c5 d5
    punpckhdq            m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
    punpckldq           m22, m5      ; e4 f4 g4 h4 e5 f5 g5 h5
    REPX  {pmulhrsw x, m10}, m0, m4, m8, m22
    punpckhdq           m13, m23, m25
    punpckldq           m23, m25
    punpckhdq           m25, m27, m29
    punpckldq           m27, m29
    REPX  {pmulhrsw x, m10}, m13, m23, m25, m27
    punpckhdq            m9, m3, m24
    punpckldq            m3, m24
    punpckhdq           m24, m26, m28
    punpckldq           m26, m28
    punpcklqdq           m5, m23, m27 ; d00 d08 d16 d24
    punpckhqdq          m23, m27      ; d01 d09 d17 d25
    punpckhqdq          m27, m13, m25 ; d03 d11 d19 d27
    punpcklqdq          m13, m25      ; d02 d10 d18 d26
    punpckhqdq          m25, m3, m26  ; d05 d13 d21 d29
    punpcklqdq           m3, m26      ; d04 d12 d20 d28
    punpckhqdq          m26, m9, m24  ; d07 d15 d23 d31
    punpcklqdq           m9, m24      ; d06 d14 d22 d30
    REPX  {pmulhrsw x, m10}, m25, m3, m26
    ; spill the d-quadrant odd columns
    mova         [cq+64* 9], m23
    mova         [cq+64*11], m27
    mova         [cq+64*13], m25
    mova         [cq+64*15], m26
    punpckhqdq          m24, m8, m22  ; a05 a13 a21 a29
    punpcklqdq           m8, m22      ; a04 a12 a20 a28
    punpckhqdq          m22, m0, m4   ; a01 a09 a17 a25
    punpcklqdq           m0, m4       ; a00 a08 a16 a24
    punpckhqdq          m23, m7, m2   ; a03 a11 a19 a27
    punpcklqdq           m7, m2       ; a02 a10 a18 a26
    punpckhqdq          m25, m6, m1   ; a07 a15 a23 a31
    punpcklqdq           m6, m1       ; a06 a14 a22 a30
    ; rows 8-23: reload even-half spills and combine with odd-half outputs
    mova                 m2, [cq+64* 0]
    mova                m11, [cq+64* 2]
    mova                m12, [cq+64* 4]
    mova                m29, [cq+64* 6]
    mova                m27, [cq+64* 8]
    mova                m26, [cq+64*10]
    mova                 m4, [cq+64*12]
    mova                m28, [cq+64*14]
    psubsw               m1, m2, m21  ; 23
    paddsw               m2, m21      ;  8
    psubsw              m21, m11, m20 ; 22
    paddsw              m11, m20      ;  9
    psubsw              m20, m12, m19 ; 21
    paddsw              m12, m19      ; 10
    psubsw              m19, m29, m18 ; 20
    paddsw              m29, m18      ; 11
    psubsw              m18, m27, m17 ; 19
    paddsw              m27, m17      ; 12
    psubsw              m17, m26, m16 ; 18
    paddsw              m26, m16      ; 13
    paddsw              m16, m4, m15  ; 14
    psubsw               m4, m15      ; 17
    pmulhrsw            m15, m6, m10
    psubsw               m6, m28, m14 ; 16
    paddsw              m28, m14      ; 15
    pmulhrsw            m14, m7, m10
    punpcklwd            m7, m6, m4
    punpckhwd            m6, m4
    punpckhwd            m4, m17, m18
    punpcklwd           m17, m18
    punpckhwd           m18, m19, m20
    punpcklwd           m19, m20
    punpckhwd           m20, m21, m1
    punpcklwd           m21, m1
    punpckhwd            m1, m2, m11  ; i4 j4 i5 j5 i6 j6 i7 j7
    punpcklwd            m2, m11      ; i0 j0 i1 j1 i2 j2 i3 j3
    punpckhwd           m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
    punpcklwd           m12, m29      ; k0 l0 k1 l1 k2 l2 k3 l3
    punpckhwd           m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
    punpcklwd           m27, m26      ; m0 n0 m1 n1 m2 n2 m3 n3
    punpckhwd           m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
    punpcklwd           m16, m28      ; o0 p0 o1 p1 o2 p2 o3 p3
    pmulhrsw            m23, m10
    pmulhrsw            m25, m10
    punpckhdq           m28, m2, m12  ; i2 j2 k2 l2 i3 j3 k3 l3
    punpckldq            m2, m12      ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq           m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3
    punpckldq           m27, m16      ; m0 n0 o0 p0 m1 n1 o1 p1
    REPX  {pmulhrsw x, m10}, m28, m2, m12, m27
    punpckhdq           m16, m1, m11  ; i6 j6 k6 l6 i7 j7 k7 l7
    punpckldq            m1, m11      ; i4 j4 k4 l4 i5 j5 k5 l5
    punpckhdq           m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
    punpckldq           m29, m26      ; m4 n4 o4 p4 m5 n5 o5 p5
    REPX  {pmulhrsw x, m10}, m16, m1, m11, m29
    punpckhdq           m26, m19, m21
    punpckldq           m19, m21
    punpckhdq           m21, m6, m4
    punpckldq            m6, m4
    REPX  {pmulhrsw x, m10}, m26, m19, m21, m6
    punpckhdq            m4, m18, m20
    punpckldq           m18, m20
    punpckhdq           m20, m7, m17
    punpckldq            m7, m17
    REPX  {pmulhrsw x, m10}, m4, m18, m20, m7
    punpcklqdq          m17, m28, m12 ; b02 b10 b18 b26
    punpckhqdq          m28, m12      ; b03 b11 b19 b27
    punpckhqdq          m12, m2, m27  ; b01 b09 b17 b25
    punpcklqdq           m2, m27      ; b00 b08 b16 b24
    punpckhqdq          m27, m1, m29  ; b05 b13 b21 b29
    punpcklqdq           m1, m29      ; b04 b12 b20 b28
    punpckhqdq          m29, m16, m11 ; b07 b15 b23 b31
    punpcklqdq          m16, m11      ; b06 b14 b22 b30
    ; spill the b-quadrant odd columns
    mova         [cq+64* 1], m12
    mova         [cq+64* 3], m28
    mova         [cq+64* 5], m27
    mova         [cq+64* 7], m29
    punpckhqdq          m27, m20, m26 ; c03 c11 c19 c27
    punpcklqdq          m20, m26      ; c02 c10 c18 c26
    punpckhqdq          m26, m7, m19  ; c01 c09 c17 c25
    punpcklqdq           m7, m19      ; c00 c08 c16 c24
    punpckhqdq          m28, m6, m18  ; c05 c13 c21 c29
    punpcklqdq           m6, m18      ; c04 c12 c20 c28
    punpckhqdq          m29, m21, m4  ; c07 c15 c23 c31
    punpcklqdq          m21, m4       ; c06 c14 c22 c30
    pmulhrsw            m19, m9, m10
    ; gather the even columns (0,4,...,28) and run the column even half
    vshufi32x4           m4, m0, m2, q3232   ; a16 a24 b16 b24
    vinserti32x8         m0, ym2, 1          ; a00 a08 b00 b08
    vshufi32x4           m2, m7, m5, q3232   ; c16 c24 d16 d24
    vinserti32x8         m7, ym5, 1          ; c00 c08 d00 d08
    vshufi32x4           m5, m8, m1, q3232   ; a20 a28 b20 b28
    vinserti32x8         m1, m8, ym1, 1      ; a04 a12 b04 b12
    vshufi32x4           m8, m6, m3, q3232   ; c20 c28 d20 d28
    vinserti32x8         m6, ym3, 1          ; c04 c12 d04 d12
    vshufi32x4           m3, m1, m6, q3131   ; 12
    vshufi32x4           m1, m6, q2020       ;  4
    vshufi32x4           m6, m4, m2, q3131   ; 24
    vshufi32x4           m4, m2, q2020       ; 16
    vshufi32x4           m2, m0, m7, q3131   ;  8
    vshufi32x4           m0, m7, q2020       ;  0
    vshufi32x4           m7, m5, m8, q3131   ; 28
    vshufi32x4           m5, m8, q2020       ; 20
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    ; columns 2,6,...,30 through the 16-point odd half
    vshufi32x4          m18, m14, m17, q3232 ; a18 a26 b18 b26
    vinserti32x8        m14, ym17, 1         ; a02 a10 b02 b10
    vshufi32x4          m17, m20, m13, q3232 ; c18 c26 d18 d26
    vinserti32x8        m20, ym13, 1         ; c02 c10 d02 d10
    vshufi32x4          m13, m21, m19, q3232 ; c22 c30 d22 d30
    vinserti32x8        m21, ym19, 1         ; c06 c14 d06 d14
    vshufi32x4          m19, m15, m16, q3232 ; a22 a30 b22 b30
    vinserti32x8        m15, ym16, 1         ; a06 a14 b06 b14
    vshufi32x4          m16, m14, m20, q3131 ; 10
    vshufi32x4          m14, m20, q2020      ;  2
    vshufi32x4          m20, m18, m17, q3131 ; 26
    vshufi32x4          m18, m17, q2020      ; 18
    vshufi32x4          m17, m15, m21, q3131 ; 14
    vshufi32x4          m15, m21, q2020      ;  6
    vshufi32x4          m21, m19, m13, q3131 ; 30
    vshufi32x4          m19, m13, q2020      ; 22
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
    ; spill even-half column outputs, reload odd-column spills
    mova         [cq+64* 0], m14
    mova         [cq+64* 2], m15
    mova         [cq+64* 4], m16
    mova         [cq+64* 6], m17
    mova         [cq+64* 8], m18
    mova         [cq+64*10], m19
    mova         [cq+64*12], m20
    mova         [cq+64*14], m21
    mova                m15, [cq+64* 1]
    mova                m16, [cq+64* 3]
    mova                m17, [cq+64* 5]
    mova                m19, [cq+64* 7]
    mova                m20, [cq+64* 9]
    mova                m21, [cq+64*11]
    mova                m13, [cq+64*13]
    mova                m18, [cq+64*15]
    vshufi32x4          m14, m22, m15, q3232 ; a17 a25 b17 b25
    vinserti32x8        m22, ym15, 1         ; a01 a09 b01 b09
    vshufi32x4          m15, m23, m16, q3232 ; a19 a27 b19 b27
    vinserti32x8        m23, ym16, 1         ; a03 a11 b03 b11
    vshufi32x4          m16, m24, m17, q3232 ; a21 a29 b21 b29
    vinserti32x8        m24, ym17, 1         ; a05 a13 b05 b13
    vshufi32x4          m17, m25, m19, q3232 ; a23 a31 b23 b31
    vinserti32x8        m25, ym19, 1         ; a07 a15 b07 b15
    vinserti32x8         m8, m26, ym20, 1    ; c01 c09 d01 d09
    vshufi32x4          m26, m20, q3232      ; c17 c25 d17 d25
    vinserti32x8         m9, m27, ym21, 1    ; c03 c11 d03 d11
    vshufi32x4          m27, m21, q3232      ; c19 c27 d19 d27
    vinserti32x8        m11, m28, ym13, 1    ; c05 c13 d05 d13
    vshufi32x4          m28, m13, q3232      ; c21 c29 d21 d29
    vinserti32x8        m12, m29, ym18, 1    ; c07 c15 d07 d15
    vshufi32x4          m29, m18, q3232      ; c23 c31 d23 d31
    vshufi32x4          m18, m14, m26, q3131 ; 25
    vshufi32x4          m14, m26, q2020      ; 17
    vshufi32x4          m19, m15, m27, q3131 ; 27
    vshufi32x4          m15, m27, q2020      ; 19
    vshufi32x4          m20, m16, m28, q3131 ; 29
    vshufi32x4          m16, m28, q2020      ; 21
    vshufi32x4          m21, m17, m29, q3131 ; 31
    vshufi32x4          m17, m29, q2020      ; 23
    vshufi32x4          m26, m22, m8, q3131  ;  9
    vshufi32x4          m22, m8, q2020       ;  1
    vshufi32x4          m27, m23, m9, q3131  ; 11
    vshufi32x4          m23, m9, q2020       ;  3
    vshufi32x4          m28, m24, m11, q3131 ; 13
    vshufi32x4          m24, m11, q2020      ;  5
    vshufi32x4          m29, m25, m12, q3131 ; 15
    vshufi32x4          m25, m12, q2020      ;  7
    call .main_oddhalf
    jmp .end
.fast: ; bottom/right halves are zero
    ; duplicate the 16 nonzero coefficient columns to fill full 32-lane rows
    mova                m14, [o(dup16_perm)]
    pmovzxwd             m9,       [cq+64* 0]
    pmovzxwd             m6,       [cq+64* 8]
    vpermb               m8, m14,  [cq+64* 2]
    vpermb              ym0, ym14, [cq+64*14]
    vpermb              ym5, ym14, [cq+64*10]
    vpermb               m1, m14,  [cq+64* 6]
    vpermb               m7, m14,  [cq+64* 4]
    vpermb              ym3, ym14, [cq+64*12]
    pslld                m9, 16
    pslld                m6, 16
    call m(idct_16x16_internal_8bpc).main_fast
    vpermb              m21, m14,  [cq+64* 1]
    vpermb             ym17, ym14, [cq+64*15]
    vpermb             ym20, ym14, [cq+64* 9]
    vpermb              m15, m14,  [cq+64* 7]
    vpermb              m18, m14,  [cq+64* 5]
    vpermb             ym16, ym14, [cq+64*11]
    vpermb             ym19, ym14, [cq+64*13]
    vpermb              m14, m14,  [cq+64* 3]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    vpbroadcastd         m9, [o(pw_8192)]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
    ; distribute transposed rows into column-transform input registers
    vshufi32x4          m22, m14, m2, q2020 ;  1
    vshufi32x4          m24, m14, m2, q3131 ;  5
    vshufi32x4          m23, m17, m9, q2020 ;  3
    vshufi32x4          m25, m17, m9, q3131 ;  7
    vshufi32x4          m16, m5, m15, q2020 ; 10
    vshufi32x4          m17, m5, m15, q3131 ; 14
    vshufi32x4          m14, m1, m18, q2020 ;  2
    vshufi32x4          m15, m1, m18, q3131 ;  6
    vshufi32x4           m1, m0, m3, q3131  ;  4
    vshufi32x4           m0, m3, q2020      ;  0
    vshufi32x4           m3, m21, m4, q3131 ; 12
    vshufi32x4           m2, m21, m4, q2020 ;  8
    vshufi32x4          m26, m20, m6, q2020 ;  9
    vshufi32x4          m28, m20, m6, q3131 ; 13
    vshufi32x4          m27, m19, m7, q2020 ; 11
    vshufi32x4          m29, m19, m7, q3131 ; 15
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    mova         [cq+64* 0], m14
    mova         [cq+64* 2], m15
    mova         [cq+64* 4], m16
    mova         [cq+64* 6], m17
    mova         [cq+64* 8], m18
    mova         [cq+64*10], m19
    mova         [cq+64*12], m20
    mova         [cq+64*14], m21
    call .main_oddhalf_fast
.end:
    ; final butterflies + add to dst; rows emitted in mirrored pairs
    lea                  r4, [strideq*3]
    vpbroadcastd        m12, [o(pw_2048)]
    movshdup            m13, [o(permD)]
    lea                  r3, [dstq+r4*8]
    lea                  r5, [strideq+r4] ; stride*4
    add                  r3, r5           ; dst+stride*28
    IDCT_32x32_END       29,  0, strideq*0, r4
    IDCT_32x32_END       28,  1, strideq*1, strideq*2
    IDCT_32x32_END       27,  2, strideq*2, strideq*1
    IDCT_32x32_END       26,  3, r4       , strideq*0
    IDCT_32x32_END       25,  4, strideq*0, r4
    IDCT_32x32_END       24,  5, strideq*1, strideq*2
    IDCT_32x32_END       23,  6, strideq*2, strideq*1
    IDCT_32x32_END       22,  7, r4       , strideq*0
    IDCT_32x32_END       21,  8, strideq*0, r4
    IDCT_32x32_END       20,  9, strideq*1, strideq*2
    IDCT_32x32_END       19, 10, strideq*2, strideq*1
    IDCT_32x32_END       18, 11, r4       , strideq*0
    IDCT_32x32_END       17, 12, strideq*0, r4
    IDCT_32x32_END       16, 13, strideq*1, strideq*2
    IDCT_32x32_END       15, 14, strideq*2, strideq*1
    IDCT_32x32_END       14, 15, r4       , strideq*0
    RET
.dconly: ; only the DC coefficient is nonzero
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 32
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
ALIGN function_align
; 32-point IDCT odd half when only the top eighth of the input is nonzero:
; only m22 (row 1) and m23 (row 3) carry data, so each initial butterfly
; collapses to a single pmulhrsw by the cosine constant, and the first
; rotation stage is done directly with vpdpwssd dot products (m10 holds the
; dword rounding bias). Ends by duplicating results into the registers the
; shared tail expects, then falls into .main4.
cglobal_label .main_oddhalf_fast3 ; bottom seven-eights are zero
    vpbroadcastd        m21, [o(pw_4091x8)]
    vpbroadcastd         m8, [o(pw_201x8)]
    vpbroadcastd        m24, [o(pw_m601x8)]
    vpbroadcastd        m12, [o(pw_4052x8)]
    pmulhrsw            m21, m22 ; t31a
    pmulhrsw            m22, m8  ; t16a
    pmulhrsw            m24, m23 ; t23a
    pmulhrsw            m23, m12 ; t24a

    ; rotate (t16a, t31a) by the 799/4017 pair: interleave words, dot-product
    ; against packed coefficient pairs, then shift/pack back to words
    punpcklwd            m9, m22, m21
    punpckhwd            m8, m22, m21
    mova                m15, m10
    vpdpwssd            m15, m9, [o(pw_m4017_799)] {bcstd}
    mova                m17, m10
    vpdpwssd            m17, m8, [o(pw_m4017_799)] {bcstd}
    REPX      {psrad x, 12}, m15, m17
    packssdw            m15, m17
    mova                m17, m10
    vpdpwssd            m17, m8, [o(pw_799_4017)] {bcstd}
    mova                 m8, m10
    vpdpwssd             m8, m9, [o(pw_799_4017)] {bcstd}
    REPX      {psrad x, 12}, m17, m8
    packssdw             m8, m17

    ; rotate (t23a, t24a) by the 3406/2276 pair, same technique
    punpcklwd            m9, m24, m23
    punpckhwd           m16, m24, m23
    mova                m20, m10
    vpdpwssd            m20, m9, [o(pw_m3406_m2276)] {bcstd}
    mova                m17, m10
    vpdpwssd            m17, m16, [o(pw_m3406_m2276)] {bcstd}
    REPX      {psrad x, 12}, m20, m17
    packssdw            m20, m17
    mova                m17, m10
    vpdpwssd            m17, m16, [o(pw_m2276_3406)] {bcstd}
    mova                m16, m10
    vpdpwssd            m16, m9, [o(pw_m2276_3406)] {bcstd}
    REPX      {psrad x, 12}, m17, m16
    packssdw            m16, m17

    ; with the other inputs zero, the next butterfly stage degenerates to
    ; plain copies into the register layout .main4 expects
    mova                m17, m21
    mova                m27, m15
    mova                m25, m20
    mova                m29, m8
    mova                m18, m22
    mova                m14, m24
    mova                m28, m16
    mova                m26, m23
    jmp .main4
; 32-point IDCT odd half when only the top quarter of the input is nonzero:
; rows 1,3,5,7 (m22/m23/m24/m25) carry data. Each initial butterfly collapses
; to one pmulhrsw per output; the following stage degenerates to copies
; (the other operand of every add/sub is zero), then joins the shared .main3.
cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
    vpbroadcastd        m21, [o(pw_4091x8)]
    vpbroadcastd         m8, [o(pw_201x8)]
    vpbroadcastd        m18, [o(pw_m1380x8)]
    vpbroadcastd         m9, [o(pw_3857x8)]
    vpbroadcastd        m19, [o(pw_3973x8)]
    vpbroadcastd        m11, [o(pw_995x8)]
    vpbroadcastd        m28, [o(pw_m601x8)]
    vpbroadcastd        m12, [o(pw_4052x8)]
    pmulhrsw            m21, m22 ; t31a
    pmulhrsw            m22, m8  ; t16a
    pmulhrsw            m18, m25 ; t19a
    pmulhrsw            m25, m9 ; t28a
    pmulhrsw            m19, m24 ; t27a
    pmulhrsw            m24, m11 ; t20a
    pmulhrsw            m28, m23 ; t23a
    pmulhrsw            m23, m12 ; t24a
    ; zero partners => butterfly stage reduces to register copies
    mova                m15, m21
    mova                 m8, m22
    mova                m14, m18
    mova                m27, m25
    mova                m29, m19
    mova                m26, m24
    mova                m16, m28
    mova                m20, m23
    jmp .main3
ALIGN function_align
; 32-point IDCT odd half when the bottom half of the input is zero:
; rows 1,3,...,15 (m22-m29) carry data. Every first-stage ITX_MULSUB_2W
; butterfly collapses to a pair of pmulhrsw multiplies by the +/- cosine
; constants (the zero input contributes nothing), then joins the full-input
; path at .main2. Constant loads are interleaved with the multiplies to hide
; latency.
cglobal_label .main_oddhalf_fast ; bottom half is zero
    vpbroadcastd        m21, [o(pw_4091x8)]
    vpbroadcastd         m8, [o(pw_201x8)]
    vpbroadcastd        m14, [o(pw_m2751x8)]
    vpbroadcastd         m9, [o(pw_3035x8)]
    vpbroadcastd        m17, [o(pw_3703x8)]
    vpbroadcastd        m11, [o(pw_1751x8)]
    vpbroadcastd        m18, [o(pw_m1380x8)]
    vpbroadcastd        m12, [o(pw_3857x8)]
    pmulhrsw            m21, m22 ; t31a
    vpbroadcastd        m19, [o(pw_3973x8)]
    pmulhrsw            m22, m8  ; t16a
    vpbroadcastd         m8, [o(pw_995x8)]
    pmulhrsw            m14, m29 ; t30a
    vpbroadcastd        m16, [o(pw_m2106x8)]
    pmulhrsw            m29, m9  ; t17a
    vpbroadcastd         m9, [o(pw_3513x8)]
    pmulhrsw            m17, m26 ; t29a
    vpbroadcastd        m15, [o(pw_3290x8)]
    pmulhrsw            m26, m11 ; t18a
    vpbroadcastd        m11, [o(pw_2440x8)]
    pmulhrsw            m18, m25 ; t19a
    vpbroadcastd        m20, [o(pw_m601x8)]
    pmulhrsw            m25, m12 ; t28a
    vpbroadcastd        m12, [o(pw_4052x8)]
    pmulhrsw            m19, m24 ; t27a
    pmulhrsw            m24, m8  ; t20a
    pmulhrsw            m16, m27 ; t21a
    pmulhrsw            m27, m9  ; t26a
    pmulhrsw            m15, m28 ; t25a
    pmulhrsw            m28, m11 ; t22a
    pmulhrsw            m20, m23 ; t23a
    pmulhrsw            m23, m12 ; t24a
    jmp .main2
ALIGN function_align
cglobal_label .main_oddhalf
    ; Full 32-point odd-half (t16..t31 path of a 32-point inverse DCT).
    ; In:  m14-m29 = the 16 odd-indexed input rows; m10 is a rounding/temp
    ;      constant consumed by ITX_MULSUB_2W (set up by the caller).
    ; Out: t16..t31 results left in m14-m29 for the caller to combine.
    ; Clobbers m8/m9/m11/m12/m17/m19 as scratch/coefficient registers.
    ; Stage 1: paired rotations of each (even, odd) input with the 32-point
    ; cos/sin coefficient pairs.
    ITX_MULSUB_2W        22, 21,  8,  9, 10,  201, 4091 ; t16a, t31a
    ITX_MULSUB_2W        14, 29,  8,  9, 10, 3035, 2751 ; t17a, t30a
    ITX_MULSUB_2W        26, 17,  8,  9, 10, 1751, 3703 ; t18a, t29a
    ITX_MULSUB_2W        18, 25,  8,  9, 10, 3857, 1380 ; t19a, t28a
    ITX_MULSUB_2W        24, 19,  8,  9, 10,  995, 3973 ; t20a, t27a
    ITX_MULSUB_2W        16, 27,  8,  9, 10, 3513, 2106 ; t21a, t26a
    ITX_MULSUB_2W        28, 15,  8,  9, 10, 2440, 3290 ; t22a, t25a
    ITX_MULSUB_2W        20, 23,  8,  9, 10, 4052,  601 ; t23a, t24a
.main2:
    ; Stage 2: pairwise add/sub butterflies of the sixteen t16a..t31a terms.
    psubsw               m8, m22, m14 ; t17
    paddsw              m22, m14      ; t16
    paddsw              m14, m18, m26 ; t19
    psubsw              m18, m26      ; t18
    psubsw              m26, m24, m16 ; t21
    paddsw              m24, m16      ; t20
    psubsw              m16, m20, m28 ; t22
    paddsw              m28, m20      ; t23
    psubsw              m20, m23, m15 ; t25
    paddsw              m23, m15      ; t24
    psubsw              m15, m21, m29 ; t30
    paddsw              m21, m29      ; t31
    psubsw              m29, m19, m27 ; t26
    paddsw              m19, m27      ; t27
    paddsw              m27, m25, m17 ; t28
    psubsw              m25, m17      ; t29
.main3:
    ; Stage 3: rotations with the 799/4017 and 3406/2276 coefficient pairs,
    ; followed by another round of butterflies.
    ITX_MULSUB_2W        15,  8,  9, 17, 10,   799, 4017 ; t17a, t30a
    ITX_MULSUB_2W        25, 18,  9, 17, 10, m4017,  799 ; t18a, t29a
    ITX_MULSUB_2W        29, 26,  9, 17, 10,  3406, 2276 ; t21a, t26a
    ITX_MULSUB_2W        20, 16,  9, 17, 10, m2276, 3406 ; t22a, t25a
    psubsw              m17, m21, m27 ; t28a
    paddsw              m21, m27      ; t31a
    psubsw              m27, m15, m25 ; t18
    paddsw              m15, m25      ; t17
    psubsw              m25, m20, m29 ; t21
    paddsw              m20, m29      ; t22
    psubsw              m29, m8, m18  ; t29
    paddsw               m8, m18      ; t30
    psubsw              m18, m22, m14 ; t19a
    paddsw              m22, m14      ; t16a
    psubsw              m14, m28, m24 ; t20a
    paddsw              m24, m28      ; t23a
    paddsw              m28, m16, m26 ; t25
    psubsw              m16, m26      ; t26
    psubsw              m26, m23, m19 ; t27a
    paddsw              m23, m19      ; t24a
.main4:
    ; Stage 4: 1567/3784 rotations, final butterflies, then 2896-based
    ; (sqrt(1/2) in Q12, presumably) rotations on the middle terms.
    vpbroadcastd        m12, [o(pw_m3784_1567)]
    vpbroadcastd        m11, [o(pw_1567_3784)]
    ITX_MULSUB_2W        29, 27,  9, 19, 10, 11, 12 ; t18a, t29a
    ITX_MULSUB_2W        17, 18,  9, 19, 10, 11, 12 ; t19,  t28
    vpbroadcastd        m11, [o(pw_m1567_m3784)]
    ITX_MULSUB_2W        16, 25,  9, 19, 10, 12, 11 ; t21a, t26a
    ITX_MULSUB_2W        26, 14,  9, 19, 10, 12, 11 ; t20,  t27
    vpbroadcastd        m12, [o(pw_m2896_2896)]
    vpbroadcastd        m11, [o(pw_2896_2896)]
    psubsw              m19, m27, m25 ; t26
    paddsw              m27, m25      ; t29
    psubsw              m25, m17, m26 ; t20a
    paddsw              m17, m26      ; t19a
    paddsw              m26, m18, m14 ; t28a
    psubsw              m18, m14      ; t27a
    paddsw              m14, m22, m24 ; t16
    psubsw              m22, m24      ; t23
    psubsw              m24, m29, m16 ; t21
    paddsw              m16, m29      ; t18
    paddsw              m29, m21, m23 ; t31
    psubsw              m21, m23      ; t24
    psubsw              m23, m15, m20 ; t22a
    paddsw              m15, m20      ; t17a
    psubsw              m20, m8, m28  ; t25a
    paddsw              m28, m8       ; t30a
    ITX_MULSUB_2W        18, 25,  8,  9, 10, 11, 12 ; t20,  t27
    ITX_MULSUB_2W        19, 24,  8,  9, 10, 11, 12 ; t21a, t26a
    ITX_MULSUB_2W        21, 22,  8,  9, 10, 11, 12 ; t23a, t24a
    ITX_MULSUB_2W        20, 23,  8,  9, 10, 11, 12 ; t22,  t25
    ret
   4908 
; Load two 32x16-bit coefficient rows: the 256-bit top half from cq+64*n and
; the bottom half from cq+64*(n+16), then merge each pair of ymm halves into
; one full zmm with vpermt2q.  Expects m21 to hold the qword merge permute
; (the caller loads it from [permB+7]); clobbers m17/m18 as temporaries.
%macro IDTX_32x32 2 ; dst[1-2]
    vmovdqa32           ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
    vmovdqa32           ym17, [cq+64*(%1+16)] ; reduces code size due to
    vmovdqa32           ym%2, [cq+64*(%2+ 0)] ; compressed displacements
    vmovdqa32           ym18, [cq+64*(%2+16)]
    vpermt2q             m%1, m21, m17
    vpermt2q             m%2, m21, m18
%endmacro
   4917 
; 32x32 identity-identity transform: no butterflies, just a rounded scale of
; the coefficients (pw_8192 via the shared transpose/round helper) added to
; the destination.  Processes the block in two passes of 16 rows, then zeroes
; the whole coefficient buffer.
cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
    movu                 m21, [permB+7]  ; qword merge permute for IDTX_32x32
    vpbroadcastd         m16, [pw_8192]  ; rounding factor used by the callee
    pxor                 m20, m20        ; zero register for clearing cq
.loop:
    IDTX_32x32            0,  1
    IDTX_32x32            2,  3
    IDTX_32x32            4,  5
    IDTX_32x32            6,  7
    IDTX_32x32            8,  9
    IDTX_32x32           10, 11
    IDTX_32x32           12, 13
    IDTX_32x32           14, 15
    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
    IDTX_32x16_STORE      0,  8, 1
    IDTX_32x16_STORE      1,  9, 1
    IDTX_32x16_STORE      2, 10, 1
    IDTX_32x16_STORE      3, 11, 1
    IDTX_32x16_STORE      4, 12, 1
    IDTX_32x16_STORE      5, 13, 1
    IDTX_32x16_STORE      6, 14, 1
    IDTX_32x16_STORE      7, 15, 1
    lea                dstq, [dstq+strideq*8]
    ; Toggle bit 5 of the coefficient pointer (+-32 bytes) to address the
    ; other 16-row half; cq is 64-byte aligned (mova), so CF is 0 after the
    ; first pass and 1 after the second, ending the loop with cq restored.
    btc                  cq, 5
    jnc .loop
    mov                 r0d, 8
.zero_loop:
    ; Clear all 32x32 int16 coefficients: 8 iterations x 256 bytes = 2048.
    mova          [cq+64*0], m20
    mova          [cq+64*1], m20
    mova          [cq+64*2], m20
    mova          [cq+64*3], m20
    add                  cq, 64*4
    dec                 r0d
    jg .zero_loop
    RET
   4953 
; 16x64 inverse DCT-DCT with destination add (8 bpc).  Three paths:
; .dconly for an all-zero eob, .fast when eob < 151 (the right half of the
; coefficient block is known zero), and the full path below.
cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea                  r5, [o_base]
    test               eobd, eobd
    jz .dconly
    WIN64_SPILL_XMM      30
    cmp                eobd, 151
    jb .fast
    ; Full path: even coefficient rows feed the shared 32x8 even-half idct,
    ; odd rows feed the 32x16 odd-half.
    mova                 m5, [cq+64*10]
    mova                 m3, [cq+64* 6]
    mova                 m1, [cq+64* 2]
    mova                 m7, [cq+64*14]
    mova                 m2, [cq+64* 4]
    mova                 m6, [cq+64*12]
    mova                 m0, [cq+64* 0]
    mova                 m4, [cq+64* 8]
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    mova                m14, [cq+64* 1]
    mova                m21, [cq+64*15]
    mova                m18, [cq+64* 9]
    mova                m17, [cq+64* 7]
    mova                m16, [cq+64* 5]
    mova                m19, [cq+64*11]
    mova                m20, [cq+64*13]
    mova                m15, [cq+64* 3]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
    vpbroadcastd         m9, [o(pw_8192)]
; Transpose four rows in 8-word groups and apply the inter-pass rounding.
; Requires m9 = pw_8192; clobbers m8.
%macro TRANSPOSE_8x4_ROUND 4
    punpckhwd            m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7
    punpcklwd           m%3, m%4      ; c0 d0 c1 d1 c2 d2 c3 d3
    punpckhwd           m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7
    punpcklwd           m%1, m%2      ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhdq           m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq           m%1, m%3      ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckldq           m%3, m%4, m8  ; a4 b4 c4 d4 a5 b5 c5 d5
    punpckhdq           m%4, m8       ; a6 b6 c6 d6 a7 b7 c7 d7
    REPX   {pmulhrsw x, m9}, m%2, m%1, m%3, m%4
%endmacro
    TRANSPOSE_8x4_ROUND   0,  1,  2,  3
    TRANSPOSE_8x4_ROUND   4,  5,  6,  7
    TRANSPOSE_8x4_ROUND  14, 15, 16, 17
    TRANSPOSE_8x4_ROUND  18, 19, 20, 21
    ; Finish the transpose at 128-bit lane granularity, gathering the 32
    ; second-pass input rows in row-pair registers.
    vinserti32x8        m26, m0, ym4, 1     ; a0  a4  b0  b4
    vshufi32x4           m0, m4, q3232      ; a8  a12 b8  b12
    vinserti32x8        m27, m1, ym5, 1     ; a1  a5  b1  b5
    vshufi32x4           m1, m5, q3232      ; a9  a13 b9  b13
    vinserti32x8        m28, m2, ym6, 1     ; a2  a6  b2  b6
    vshufi32x4           m2, m6, q3232      ; a10 a14 b10 b14
    vinserti32x8        m29, m3, ym7, 1     ; a3  a7  b3  b7
    vshufi32x4           m8, m3, m7, q3232  ; a11 a15 b11 b15
    vinserti32x8         m4, m14, ym18, 1   ; c0  c4  d0  d4
    vshufi32x4          m14, m18, q3232     ; c8  c12 d8  d12
    vinserti32x8         m5, m15, ym19, 1   ; c1  c5  d1  d5
    vshufi32x4          m15, m19, q3232     ; c9  c13 d9  d13
    vinserti32x8         m6, m16, ym20, 1   ; c2  c6  d2  d6
    vshufi32x4          m16, m20, q3232     ; c10 c14 d10 d14
    vinserti32x8         m7, m17, ym21, 1   ; c3  c7  d3  d7
    vshufi32x4          m17, m21, q3232     ; c11 c15 d11 d15
    vshufi32x4          m22, m26, m4, q2020 ;  0  1
    vshufi32x4          m26, m4, q3131      ;  8  9
    vshufi32x4          m23, m27, m5, q2020 ;  2  3
    vshufi32x4          m27, m5, q3131      ; 10 11
    vshufi32x4          m24, m28, m6, q2020 ;  4  5
    vshufi32x4          m28, m6, q3131      ; 12 13
    vshufi32x4          m25, m29, m7, q2020 ;  6  7
    vshufi32x4          m29, m7, q3131      ; 14 15
    vshufi32x4           m4, m0, m14, q2020 ; 16 17
    vshufi32x4           m3, m0, m14, q3131 ; 24 25
    vshufi32x4          m20, m1, m15, q2020 ; 18 19
    vshufi32x4          m19, m1, m15, q3131 ; 26 27
    vshufi32x4           m5, m2, m16, q2020 ; 20 21
    vshufi32x4           m0, m2, m16, q3131 ; 28 29
    vshufi32x4          m16, m8, m17, q2020 ; 22 23
    vshufi32x4          m17, m8, m17, q3131 ; 30 31
    ; Spill rows 16-31 to cq and run the three 64-point column stages; the
    ; punpcklwd self-interleaves duplicate the even words as expected by the
    ; *_fast callees (trailing comments give the row index fed in).
    pxor                 m6, m6
    mova         [cq+64* 0], m4
    mova         [cq+64* 2], m5
    mova         [cq+64* 4], m3
    mova         [cq+64* 6], m0
    punpcklwd            m8, m24, m24 ;  4
    punpcklwd            m0, m0       ; 28
    punpcklwd            m5, m5       ; 20
    punpcklwd            m1, m28, m28 ; 12
    punpcklwd            m7, m26, m26 ;  8
    punpcklwd            m3, m3       ; 24
    punpcklwd            m9, m6, m22  ; __  0
    punpcklwd            m6, m4       ; __ 16
    call m(idct_16x16_internal_8bpc).main_fast3
    mova         [cq+64* 1], m20
    mova         [cq+64* 3], m16
    mova         [cq+64* 5], m19
    mova         [cq+64* 7], m17
    punpcklwd           m21, m23, m23 ;  2
    punpcklwd           m17, m17      ; 30
    punpcklwd           m20, m20      ; 18
    punpcklwd           m15, m29, m29 ; 14
    punpcklwd           m18, m27, m27 ; 10
    punpcklwd           m16, m16      ; 22
    punpcklwd           m19, m19      ; 26
    punpcklwd           m14, m25, m25 ;  6
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    mova         [cq+64* 8], m14
    mova         [cq+64* 9], m15
    mova         [cq+64*10], m16
    mova         [cq+64*11], m17
    mova         [cq+64*12], m18
    mova         [cq+64*13], m19
    mova         [cq+64*14], m20
    mova         [cq+64*15], m21
    mova                m21, [cq+64* 7]
    mova                m14, [cq+64* 0]
    mova                m17, [cq+64* 3]
    mova                m18, [cq+64* 4]
    mova                m19, [cq+64* 5]
    mova                m16, [cq+64* 2]
    mova                m15, [cq+64* 1]
    mova                m20, [cq+64* 6]
    ; Odd words (the 32 odd rows) feed the 64-point odd-half below.
    REPX   {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
                             m24, m19, m16, m27, m28, m15, m20, m23
    call .main_oddhalf
    jmp .end
.fast: ; right half is zero
    ; eob < 151: only the left half of the coefficient block is nonzero, so
    ; pack two 16-coefficient rows per zmm (ym low/high halves) and use the
    ; reduced *_fast2/_fast4 column routines.
    mova                ym8, [cq+64*15]
    vinserti32x8         m8, [cq+64* 1], 1
    mova                 m2, [o(int16_perm)]
    mova                ym9, [cq+64* 8]
    vinserti32x8         m9, [cq+64* 0], 1
    mova                ym0, [cq+64* 7]
    vinserti32x8         m0, [cq+64* 9], 1
    mova                ym7, [cq+64*14]
    vinserti32x8         m7, [cq+64* 2], 1
    mova                ym1, [cq+64* 3]
    vinserti32x8         m1, [cq+64*13], 1
    mova                ym3, [cq+64* 6]
    vinserti32x8         m3, [cq+64*10], 1
    mova                ym5, [cq+64*11]
    vinserti32x8         m5, [cq+64* 5], 1
    mova                ym6, [cq+64*12]
    vinserti32x8         m6, [cq+64* 4], 1
    REPX  {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
    call m(idct_16x16_internal_8bpc).main2
    ; Inter-pass transpose + pw_8192 rounding, done with byte shuffles and
    ; dword interleaves instead of a full word transpose.
    vbroadcasti32x4      m8, [o(int_shuf3)]
    vbroadcasti32x4      m9, [o(int_shuf4)]
    vpbroadcastd        m11, [o(pw_8192)]
    pshufb               m0, m8
    pshufb               m1, m9
    pshufb               m2, m8
    pshufb               m3, m9
    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
    pshufb               m4, m8
    pshufb               m5, m9
    pshufb               m6, m8
    pshufb               m7, m9
    REPX  {pmulhrsw x, m11}, m4, m5, m6, m7
    punpckhdq           m28, m0, m1
    punpckldq            m0, m1
    punpckhdq           m27, m2, m3
    punpckldq            m2, m3
    punpckhdq           m22, m4, m5
    punpckldq            m4, m5
    punpckhdq           m23, m6, m7
    punpckldq            m6, m7
    vinserti32x8        m14, m0, ym2, 1
    vshufi32x4          m15, m0, m2, q3232
    vinserti32x8         m2, m4, ym6, 1
    vshufi32x4           m4, m6, q3232
    vshufi32x4          m21, m14, m2, q2020 ;  0  2
    vshufi32x4          m14, m2, q3131      ;  4  6
    vshufi32x4          m18, m15, m4, q2020 ;  8 10
    vshufi32x4          m15, m4, q3131      ; 12 14
    ; Even rows -> reduced 16-point even half, then reduced 32-point odd half
    ; (trailing comments give the row index fed in).
    pxor                 m9, m9
    punpcklwd            m8, m14, m14 ;  4
    punpcklwd            m1, m15, m15 ; 12
    punpcklwd            m7, m18, m18 ;  8
    punpcklwd            m9, m21      ; __  0
    call m(idct_16x16_internal_8bpc).main_fast4
    punpckhwd           m21, m21      ;  2
    punpckhwd           m15, m15      ; 14
    punpckhwd           m18, m18      ; 10
    punpckhwd           m14, m14      ;  6
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    ; Gather the odd rows (1,3,...,15) for the 64-point odd half.
    vinserti32x8        m24, m28, ym27, 1
    vshufi32x4          m28, m27, q3232
    vinserti32x8        m27, m22, ym23, 1
    vshufi32x4          m22, m23, q3232
    vshufi32x4          m23, m24, m27, q2020 ;  1  3
    vshufi32x4          m24, m27, q3131      ;  5  7
    vshufi32x4          m27, m28, m22, q2020 ;  9 11
    vshufi32x4          m28, m22, q3131      ; 13 15
    punpcklwd           m22, m23, m23 ;  1
    punpckhwd           m29, m28, m28 ; 15
    punpcklwd           m26, m27, m27 ;  9
    punpckhwd           m25, m24, m24 ;  7
    ; Spill the 32-point odd-half results so .end can recombine them.
    mova         [cq+64* 8], m14
    mova         [cq+64* 9], m15
    mova         [cq+64*10], m16
    mova         [cq+64*11], m17
    punpcklwd           m24, m24      ;  5
    punpckhwd           m27, m27      ; 11
    punpcklwd           m28, m28      ; 13
    punpckhwd           m23, m23      ;  3
    mova         [cq+64*12], m18
    mova         [cq+64*13], m19
    mova         [cq+64*14], m20
    mova         [cq+64*15], m21
    call .main_oddhalf_fast
.end:
    ; Final stage: combine the idct32 half (registers / cq spills) with the
    ; idct64 odd half (m14-m29), round by pw_2048, add to dst.  dstq walks
    ; down from the top, r6 walks up from row 60 (the mirrored bottom rows).
    imul                 r6, strideq, 60
    mova                m10, [o(end_16x32p)]
    vpbroadcastd        m11, [o(pw_2048)]
    lea                  r3, [strideq*3]
    pxor                m12, m12          ; zero register for clearing cq
    add                  r6, dstq         ; dst+stride*60
    psrldq              m13, m10, 1
    lea                  r4, [strideq+r3] ; stride*4
; Store one group of four output rows (two from the top of the block, two
; mirrored from the bottom).  %1 < 8: m%1/m%2 already hold final sums and
; only need rounding; %1 >= 8: the idct32 term was spilled to [cq+64*%1] and
; is add/sub-combined with the idct64 term m%2 to form the out pairs.  The
; visited cq slot is zeroed.  Clobbers m8/m9/m29 and m%3 as temporaries;
; odd %1 reverses the row order within the group, even %1 (except 0)
; advances dstq/r6 by four rows first.
%macro IDCT_16x64_END 3 ; idct32, idct64, tmp
%if %1 & 1
    %define %%s0 r3
    %define %%s1 strideq*2
    %define %%s2 strideq*1
    %define %%s3 strideq*0
%else
    %define %%s0 strideq*0
    %define %%s1 strideq*1
    %define %%s2 strideq*2
    %define %%s3 r3
%if %1
    add                dstq, r4
    sub                  r6, r4
%endif
%endif
%if %1 < 8
    pmulhrsw             m8, m11, m%1
    pmulhrsw             m9, m11, m%2
%else
    mova                 m9, [cq+64*%1]
    paddsw               m8, m9, m%2 ; out  0+n,  1+n
    psubsw               m9, m%2     ; out 63-n, 62-n
    pmulhrsw             m8, m11
    pmulhrsw             m9, m11
%endif
    mova               xm29, [dstq+%%s0]
    vinserti128        ym29, [dstq+%%s1], 1
    mova               xm%3, [r6  +%%s3]
    vinserti128        ym%3, [r6  +%%s2], 1
    vpermb              m29, m10, m29
    vpermb              m%3, m10, m%3
    mova         [cq+64*%1], m12
    paddw               m29, m8
    paddw               m%3, m9
    packuswb            m29, m%3
    vpermd              m29, m13, m29
    mova          [dstq+%%s0], xm29
    vextracti128  [dstq+%%s1], ym29, 1
    vextracti32x4 [r6  +%%s2], m29, 2
    vextracti32x4 [r6  +%%s3], m29, 3
%endmacro
    IDCT_16x64_END        0, 29,  0
    IDCT_16x64_END        1, 28, 28
    IDCT_16x64_END        2, 27, 28
    IDCT_16x64_END        3, 26, 28
    IDCT_16x64_END        4, 25, 28
    IDCT_16x64_END        5, 24, 28
    IDCT_16x64_END        6, 23, 28
    IDCT_16x64_END        7, 22, 28
    IDCT_16x64_END        8, 21, 28
    IDCT_16x64_END        9, 20, 28
    IDCT_16x64_END       10, 19, 28
    IDCT_16x64_END       11, 18, 28
    IDCT_16x64_END       12, 17, 28
    IDCT_16x64_END       13, 16, 28
    IDCT_16x64_END       14, 15, 28
    IDCT_16x64_END       15, 14, 28
    RET
.dconly:
    ; DC-only path (eobd == 0 on entry, so the store below clears the DC
    ; coefficient).  Scale the DC value by 181/2^(8+2) — 181 ~= 128*sqrt(2),
    ; presumably the 1D DCT DC gain with rounding — then tail-jump to the
    ; shared broadcast/store loop with a 64-row count in r3d.
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 64    ; row count for the dconly store loop
    imul                r6d, 181
    add                 r6d, 128+512
    sar                 r6d, 8+2
    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
ALIGN function_align
cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero
    ; Reduced first stage of the 64-point odd half: with three quarters of
    ; the inputs zero, each t32..t63 pair comes from a single pmulhrsw with
    ; a packed pair of constants (pw_A_Bx8 = two coefficients interleaved,
    ; presumably pre-scaled for pmulhrsw — confirm against the tables).
    vpbroadcastd         m8, [o(pw_101_4095x8)]
    vpbroadcastd        m21, [o(pw_m1474_3822x8)]
    vpbroadcastd        m14, [o(pw_897_3996x8)]
    vpbroadcastd        m17, [o(pw_m700_4036x8)]
    vpbroadcastd        m18, [o(pw_501_4065x8)]
    vpbroadcastd        m19, [o(pw_m1092_3948x8)]
    vpbroadcastd        m16, [o(pw_1285_3889x8)]
    vpbroadcastd        m15, [o(pw_m301_4085x8)]
    pmulhrsw             m8, m22 ; t32a t63a
    pmulhrsw            m21, m29 ; t35a t60a
    pmulhrsw            m14, m26 ; t36a t59a
    pmulhrsw            m17, m25 ; t39a t56a
    pmulhrsw            m18, m24 ; t40a t55a
    pmulhrsw            m19, m27 ; t43a t52a
    pmulhrsw            m16, m28 ; t44a t51a
    pmulhrsw            m15, m23 ; t47a t48a
    ; Duplicate each result into its zero-partner's register (sum equals
    ; difference when one butterfly input is zero) so the full-path tail
    ; at .main_oddhalf2 can be shared.
    mova                m22, m8
    mova                m29, m21
    mova                m26, m14
    mova                m25, m17
    mova                m24, m18
    mova                m27, m19
    mova                m28, m16
    mova                m20, m15
    jmp .main_oddhalf2
ALIGN function_align
cglobal_label .main_oddhalf
    ; Full 64-point odd half, operating on packed "tNa tMa" register pairs.
    ; In:  m14-m29 = the 16 odd input rows (each word-duplicated by the
    ;      caller), m0-m7 = the even-half outputs to combine at the end,
    ;      m10 = rounding constant for ITX_MUL2X_PACK.
    ; Out: out0..out15 pairs in m0-m7, out48..out63 pairs in m22-m29.
    ; Stage 1: one packed multiply per input produces both halves of each
    ; t32..t63 pair at once.
    vpbroadcastd         m8, [o(pw_101_4095x8)]
    vpbroadcastd         m9, [o(pw_m2824_2967x8)]
    vpbroadcastd        m11, [o(pw_1660_3745x8)]
    vpbroadcastd        m12, [o(pw_m1474_3822x8)]
    pmulhrsw            m22, m8       ; t32a t63a
    vpbroadcastd         m8, [o(pw_897_3996x8)]
    pmulhrsw            m21, m9       ; t33a t62a
    vpbroadcastd         m9, [o(pw_m2191_3461x8)]
    pmulhrsw            m14, m11      ; t34a t61a
    vpbroadcastd        m11, [o(pw_2359_3349x8)]
    pmulhrsw            m29, m12      ; t35a t60a
    vpbroadcastd        m12, [o(pw_m700_4036x8)]
    pmulhrsw            m26, m8       ; t36a t59a
    vpbroadcastd         m8, [o(pw_501_4065x8)]
    pmulhrsw            m17, m9       ; t37a t58a
    vpbroadcastd         m9, [o(pw_m2520_3229x8)]
    pmulhrsw            m18, m11      ; t38a t57a
    vpbroadcastd        m11, [o(pw_2019_3564x8)]
    pmulhrsw            m25, m12      ; t39a t56a
    vpbroadcastd        m12, [o(pw_m1092_3948x8)]
    pmulhrsw            m24, m8       ; t40a t55a
    vpbroadcastd         m8, [o(pw_1285_3889x8)]
    pmulhrsw            m19, m9       ; t41a t54a
    vpbroadcastd         m9, [o(pw_m1842_3659x8)]
    pmulhrsw            m16, m11      ; t42a t53a
    vpbroadcastd        m11, [o(pw_2675_3102x8)]
    pmulhrsw            m27, m12      ; t43a t52a
    vpbroadcastd        m12, [o(pw_m301_4085x8)]
    pmulhrsw            m28, m8       ; t44a t51a
    pmulhrsw            m15, m9       ; t45a t50a
    pmulhrsw            m20, m11      ; t46a t49a
    pmulhrsw            m23, m12      ; t47a t48a
    ; Stage 2: pairwise add/sub butterflies on the packed pairs.
    psubsw               m8, m22, m21 ; t33  t62
    paddsw              m22, m21      ; t32  t63
    psubsw              m21, m29, m14 ; t34  t61
    paddsw              m29, m14      ; t35  t60
    psubsw              m14, m26, m17 ; t37  t58
    paddsw              m26, m17      ; t36  t59
    psubsw              m17, m25, m18 ; t38  t57
    paddsw              m25, m18      ; t39  t56
    psubsw              m18, m24, m19 ; t41  t54
    paddsw              m24, m19      ; t40  t55
    psubsw              m19, m27, m16 ; t42  t53
    paddsw              m27, m16      ; t43  t52
    psubsw              m16, m28, m15 ; t45  t50
    paddsw              m28, m15      ; t44  t51
    psubsw              m15, m23, m20 ; t46  t49
    paddsw              m20, m23      ; t47  t48
.main_oddhalf2:
    ; Stage 3: packed rotations (401/4076, 3166/2598, 1931/3612, 3920/1189),
    ; then butterflies and 799/4017 + 3406/2276 rotations.
    ITX_MUL2X_PACK        8, 9, 23, 10,   401, 4076, 5 ; t33a t62a
    ITX_MUL2X_PACK       21, 9, 23, 10, m4076,  401, 5 ; t34a t61a
    ITX_MUL2X_PACK       14, 9, 23, 10,  3166, 2598, 5 ; t37a t58a
    ITX_MUL2X_PACK       17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a
    ITX_MUL2X_PACK       18, 9, 23, 10,  1931, 3612, 5 ; t41a t54a
    ITX_MUL2X_PACK       19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a
    ITX_MUL2X_PACK       16, 9, 23, 10,  3920, 1189, 5 ; t45a t50a
    ITX_MUL2X_PACK       15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a
    vpbroadcastd        m11, [o(pw_m4017_799)]
    psubsw              m23, m25, m26 ; t36a t59a
    paddsw              m25, m26      ; t39a t56a
    psubsw              m26, m24, m27 ; t43a t52a
    paddsw              m27, m24      ; t40a t55a
    psubsw              m24, m20, m28 ; t44a t51a
    paddsw              m20, m28      ; t47a t48a
    psubsw              m28, m8, m21  ; t34  t61
    paddsw               m8, m21      ; t33  t62
    psubsw              m21, m17, m14 ; t37  t58
    paddsw              m17, m14      ; t38  t57
    psubsw              m14, m18, m19 ; t42  t53
    paddsw              m18, m19      ; t41  t54
    psubsw              m19, m15, m16 ; t45  t50
    paddsw              m15, m16      ; t46  t49
    psubsw              m16, m22, m29 ; t35a t60a
    paddsw              m22, m29      ; t32a t63a
    ITX_MUL2X_PACK       16, 9, 29, 10, 799_4017, 11,    20 ; t35  t60
    ITX_MUL2X_PACK       28, 9, 29, 10, 799_4017, 11,    20 ; t34a t61a
    ITX_MUL2X_PACK       23, 9, 29, 10, 11, m799_m4017,  36 ; t36  t59
    ITX_MUL2X_PACK       21, 9, 29, 10, 11, m799_m4017,  36 ; t37a t58a
    vpbroadcastd        m11, [o(pw_m2276_3406)]
    ITX_MUL2X_PACK       26, 9, 29, 10, 3406_2276, 11,   20 ; t43  t52
    ITX_MUL2X_PACK       14, 9, 29, 10, 3406_2276, 11,   20 ; t42a t53a
    ITX_MUL2X_PACK       24, 9, 29, 10, 11, m3406_m2276, 36 ; t44  t51
    ITX_MUL2X_PACK       19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a
    ; Stage 4: butterflies, then 1567/3784 rotations.
    vpbroadcastd        m11, [o(pw_1567_3784)]
    vpbroadcastd        m12, [o(pw_m3784_1567)]
    psubsw              m29, m22, m25 ; t39  t56
    paddsw              m22, m25      ; t32  t63
    psubsw              m25, m20, m27 ; t40  t55
    paddsw              m20, m27      ; t47  t48
    psubsw              m27, m8, m17  ; t38a t57a
    paddsw               m8, m17      ; t33a t62a
    psubsw              m17, m15, m18 ; t41a t54a
    paddsw              m15, m18      ; t46a t49a
    paddsw              m18, m16, m23 ; t35a t60a
    psubsw              m16, m23      ; t36a t59a
    psubsw              m23, m24, m26 ; t43a t52a
    paddsw              m24, m26      ; t44a t51a
    paddsw              m26, m28, m21 ; t34  t61
    psubsw              m28, m21      ; t37  t58
    psubsw              m21, m19, m14 ; t42  t53
    paddsw              m19, m14      ; t45  t50
    ITX_MUL2X_PACK       29, 9, 14, 10, 11, 12, 4 ; t39a t56a
    ITX_MUL2X_PACK       27, 9, 14, 10, 11, 12, 4 ; t38  t57
    ITX_MUL2X_PACK       16, 9, 14, 10, 11, 12, 4 ; t36  t59
    ITX_MUL2X_PACK       28, 9, 14, 10, 11, 12, 4 ; t37a t58a
    vpbroadcastd        m11, [o(pw_m1567_m3784)]
    ITX_MUL2X_PACK       25, 9, 14, 10, 12, 11, 4 ; t40a t55a
    ITX_MUL2X_PACK       17, 9, 14, 10, 12, 11, 4 ; t41  t54
    ITX_MUL2X_PACK       23, 9, 14, 10, 12, 11, 4 ; t43  t52
    ITX_MUL2X_PACK       21, 9, 14, 10, 12, 11, 4 ; t42a t53a
    ; Stage 5: final butterflies and 2896-based rotations; deint_shuf and
    ; the packssdw/punpck*qdq ops below re-sort the packed pair halves.
    vbroadcasti32x4     m13, [o(deint_shuf)]
    vpbroadcastd        m11, [o(pw_2896_2896)]
    vpbroadcastd        m12, [o(pw_m2896_2896)]
    paddsw              m14, m22, m20 ; t32a t63a
    psubsw              m22, m20      ; t47a t48a
    psubsw              m20, m8, m15  ; t46  t49
    paddsw               m8, m15      ; t33  t62
    paddsw              m15, m18, m24 ; t35  t60
    psubsw              m18, m24      ; t44  t51
    psubsw              m24, m26, m19 ; t45a t50a
    paddsw              m26, m19      ; t34a t61a
    REPX    {pshufb x, m13}, m14, m8, m15, m26
    psubsw              m19, m29, m25 ; t40  t55
    paddsw              m25, m29      ; t39  t56
    psubsw              m29, m27, m17 ; t41a t54a
    paddsw              m27, m17      ; t38a t57a
    psubsw              m17, m16, m23 ; t43a t52a
    paddsw              m16, m23      ; t36a t59a
    psubsw               m9, m28, m21 ; t42  t53
    paddsw              m28, m21      ; t37  t58
    REPX    {pshufb x, m13}, m25, m27, m16, m28
    ITX_MUL2X_PACK       22, 13, 21, 10, 11, 12, 8 ; t47  t48
    ITX_MUL2X_PACK       20, 23, 22, 10, 11, 12, 8 ; t46a t49a
    packssdw            m21, m22      ; t47  t46a
    packssdw            m13, m23      ; t48  t49a
    ITX_MUL2X_PACK       18, 22, 20, 10, 11, 12, 8 ; t44a t51a
    ITX_MUL2X_PACK       24, 23, 18, 10, 11, 12, 8 ; t45  t50
    packssdw            m20, m18      ; t44a t45
    packssdw            m22, m23      ; t51a t50
    ITX_MUL2X_PACK       19, 24, 18, 10, 11, 12, 8 ; t40a t55a
    ITX_MUL2X_PACK       29, 23, 19, 10, 11, 12, 8 ; t41  t54
    packssdw            m18, m19      ; t40a t41
    packssdw            m24, m23      ; t55a t54
    ITX_MUL2X_PACK       17, 23, 19, 10, 11, 12, 8 ; t43  t52
    ITX_MUL2X_PACK        9, 29, 17, 10, 11, 12, 8 ; t42a t53a
    packssdw            m19, m17      ; t43  t42a
    packssdw            m23, m29      ; t52  t53a
    punpcklqdq          m17, m25, m27 ; t39  t38a
    punpckhqdq          m25, m27      ; t56  t57a
    punpckhqdq          m27, m15, m26 ; t60  t61a
    punpcklqdq          m15, m26      ; t35  t34a
    punpckhqdq          m26, m16, m28 ; t59a t58
    punpcklqdq          m16, m28      ; t36a t37
    punpckhqdq          m28, m14, m8  ; t63a t62
    punpcklqdq          m14, m8       ; t32a t33
    ; Combine with the even-half outputs in m0-m7 to form the final
    ; out0..out15 / out48..out63 pairs.
    psubsw              m29, m0, m28  ; out63 out62
    paddsw               m0, m28      ; out0  out1
    psubsw              m28, m1, m27  ; out60 out61
    paddsw               m1, m27      ; out3  out2
    psubsw              m27, m2, m26  ; out59 out58
    paddsw               m2, m26      ; out4  out5
    psubsw              m26, m3, m25  ; out56 out57
    paddsw               m3, m25      ; out7  out6
    psubsw              m25, m4, m24  ; out55 out54
    paddsw               m4, m24      ; out8  out9
    psubsw              m24, m5, m23  ; out52 out53
    paddsw               m5, m23      ; out11 out10
    psubsw              m23, m6, m22  ; out51 out50
    paddsw               m6, m22      ; out12 out13
    psubsw              m22, m7, m13  ; out48 out49
    paddsw               m7, m13      ; out15 out14
    ret
   5437 
;-----------------------------------------------------------------------------
; void inv_txfm_add_dct_dct_64x16_8bpc(pixel *dst, ptrdiff_t stride,
;                                      coef *c, int eob)
; 64x16 inverse DCT+DCT transform-and-add, 8 bpc, AVX-512 (ZMM registers).
; eob == 0 takes the DC-only shortcut; otherwise two transform passes are
; run, with a reduced workload when eob < 151 (see the ".fast" path, whose
; own comment notes the bottom half of the coefficients is zero).
;-----------------------------------------------------------------------------
   5438 cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
   5439 %undef cmp
   5440    lea                  r5, [o_base]
   5441    test               eobd, eobd
   5442    jnz .normal
; DC-only path: load the single DC coefficient and clear it in memory.
   5443    movsx               r6d, word [cq]
   5444    mov                [cq], eobd
; eobd is known to be zero here (test/jnz above), so OR sets r3d = 16,
; the number of 64-pixel rows to process in .dconly_loop.
   5445    or                  r3d, 16
; Scale the DC value with two rounded multiplies by 181 (181 ~= 128*sqrt(2)),
; one per transform dimension; the add/sar pairs fold in the intermediate
; and final pixel-domain rounding. NOTE(review): offsets assumed to match
; the reference C dc-only path -- confirm there before changing them.
   5446 .dconly:
   5447    imul                r6d, 181
   5448    add                 r6d, 128+512
   5449    sar                 r6d, 8+2
   5450 .dconly2:
   5451    imul                r6d, 181
   5452    add                 r6d, 128+2048
   5453    sar                 r6d, 8+4
   5454    pxor                 m2, m2
; Broadcast the rounded DC delta to every word lane.
   5455    vpbroadcastw         m3, r6d
; Per iteration: add the DC delta to one 64-pixel dst row -- widen the
; bytes to words against zero (m2), add, then saturate-pack back to bytes.
   5456 .dconly_loop:
   5457    mova                 m1, [dstq]
   5458    punpcklbw            m0, m1, m2
   5459    punpckhbw            m1, m2
   5460    paddw                m0, m3
   5461    paddw                m1, m3
   5462    packuswb             m0, m1
   5463    mova             [dstq], m0
   5464    add                dstq, strideq
   5465    dec                 r3d
   5466    jg .dconly_loop
   5467    RET
; Full transform path.
   5468 .normal:
   5469    WIN64_SPILL_XMM      31
; dup16_perm (see rodata at top of file: db 0,1,0,1,2,3,2,3,...) duplicates
; each 16-bit coefficient, producing the packed word-pair layout that the
; shared idct helper routines operate on.
   5470    mova                m19, [o(dup16_perm)]
   5471    mova                m24, [cq+64* 2]
   5472    mova                m28, [cq+64* 6]
   5473    mova                m26, [cq+64* 4]
   5474    mova                m22, [cq+64* 0]
   5475    mova                m23, [cq+64* 1]
   5476    mova                m29, [cq+64* 7]
   5477    mova                m27, [cq+64* 5]
   5478    mova                m25, [cq+64* 3]
; Trailing ";  N" comments give the transform input index each register
; holds after the word-duplicating permute.
   5479    vpermb               m8, m19, m24        ;  4
   5480    vpermb               m1, m19, m28        ; 12
   5481    vpermb               m7, m19, m26        ;  8
   5482    vpermb               m9, m19, m22        ; __  0
   5483    vpermb              m21, m19, m23        ;  2
   5484    vpermb              m15, m19, m29        ; 14
   5485    vpermb              m18, m19, m27        ; 10
   5486    vpermb              m14, m19, m25        ;  6
   5487    pslld                m9, 16
; m30 = dup16_perm | 32: the base permute indices are all < 32, so OR'ing
; in 32 makes the same word-duplication read from the upper 32 bytes of
; each source register instead.
   5488    vpord               m30, m19, [o(pb_32)] {1to16}
   5489    REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23
; eob < 151 => the reduced path (bottom half of coefficients is zero,
; per the .fast label's comment below).
   5490    cmp                eobd, 151
   5491    jb .fast
   5492    vpermb               m0, m19, [cq+64*14] ; 28
   5493    vpermb               m5, m19, [cq+64*10] ; 20
   5494    vpermb               m3, m19, [cq+64*12] ; 24
   5495    vpermb               m6, m19, [cq+64* 8] ; __ 16
   5496    pslld                m6, 16
   5497    call m(idct_16x16_internal_8bpc).main_fast
   5498    vpermb              m17, m19, [cq+64*15] ; 30
   5499    vpermb              m20, m19, [cq+64* 9] ; 18
   5500    vpermb              m16, m19, [cq+64*11] ; 22
   5501    vpermb              m19, m19, [cq+64*13] ; 26
   5502    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
; Spill the first batch of intermediate results; cq doubles as scratch
; space for the remainder of the function.
   5503    mova         [cq+64* 0], m14
   5504    mova         [cq+64* 1], m15
   5505    mova         [cq+64* 2], m16
   5506    mova         [cq+64* 3], m17
   5507    mova         [cq+64* 4], m18
   5508    mova         [cq+64* 5], m19
   5509    mova         [cq+64* 6], m20
   5510    mova         [cq+64* 7], m21
   5511    vpermb              m21, m30, [cq+64*15]
   5512    vpermb              m14, m30, [cq+64* 8]
   5513    vpermb              m17, m30, [cq+64*11]
   5514    vpermb              m18, m30, [cq+64*12]
   5515    vpermb              m19, m30, [cq+64*13]
   5516    vpermb              m16, m30, [cq+64*10]
   5517    vpermb              m15, m30, [cq+64* 9]
   5518    vpermb              m20, m30, [cq+64*14]
   5519    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
   5520    jmp .end
   5521 .fast: ; bottom half is zero
   5522    call m(idct_16x16_internal_8bpc).main_fast2
   5523    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
   5524    mova         [cq+64* 0], m14
   5525    mova         [cq+64* 1], m15
   5526    mova         [cq+64* 2], m16
   5527    mova         [cq+64* 3], m17
   5528    mova         [cq+64* 4], m18
   5529    mova         [cq+64* 5], m19
   5530    mova         [cq+64* 6], m20
   5531    mova         [cq+64* 7], m21
   5532    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
; Both paths converge here with intermediates spilled to cq.
   5533 .end:
   5534    mova         [cq+64* 8], m4
   5535    mova         [cq+64* 9], m5
   5536    mova         [cq+64*10], m6
   5537    mova         [cq+64*11], m7
   5538    mova         [cq+64*12], m26
   5539    mova         [cq+64*13], m27
   5540    mova         [cq+64*14], m28
   5541    mova         [cq+64*15], m29
; m13 = pw_8192: pmulhrsw by 8192 is a rounded >>2, the inter-pass rescale
; expected by .pass1_end below.
   5542    vpbroadcastd        m13, [o(pw_8192)]
   5543    call .pass1_end
; Pass 2 is run twice -- once here, once after the second transpose below;
; each call transforms one half of the transposed data.
   5544    call .pass2
   5545    mova         [cq+64* 0], m0
   5546    mova         [cq+64* 1], m1
   5547    mova         [cq+64* 2], m2
   5548    mova         [cq+64* 3], m3
   5549    mova         [cq+64* 4], m4
   5550    mova         [cq+64* 5], m5
   5551    mova         [cq+64* 6], m6
   5552    mova         [cq+64* 7], m7
   5553    pmulhrsw             m0, m13, [cq+64* 8]
   5554    pmulhrsw             m1, m13, [cq+64* 9]
   5555    pmulhrsw             m2, m13, [cq+64*10]
   5556    pmulhrsw             m3, m13, [cq+64*11]
; m30 = pw_2048: pmulhrsw by 2048 is a rounded >>4, the final output
; rescale (applied here and inside IDCT_64x16_END).
   5557    vpbroadcastd        m30, [o(pw_2048)]
   5558    pmulhrsw             m4, m13, m22
   5559    pmulhrsw             m5, m13, m23
   5560    pmulhrsw             m6, m13, m24
   5561    pmulhrsw             m7, m13, m25
   5562    pmulhrsw            m22, m30, m14
   5563    pmulhrsw            m14, m13, m26
   5564    pmulhrsw            m23, m30, m15
   5565    pmulhrsw            m15, m13, m27
   5566    pmulhrsw            m24, m30, m16
   5567    pmulhrsw            m16, m13, m28
   5568    pmulhrsw            m25, m30, m17
   5569    pmulhrsw            m17, m13, m29
   5570    pmulhrsw            m26, m30, m18
   5571    pmulhrsw            m18, m13, [cq+64*12]
   5572    pmulhrsw            m27, m30, m19
   5573    pmulhrsw            m19, m13, [cq+64*13]
   5574    pmulhrsw            m28, m30, m20
   5575    pmulhrsw            m20, m13, [cq+64*14]
   5576    pmulhrsw            m29, m30, m21
   5577    pmulhrsw            m21, m13, [cq+64*15]
   5578    call .transpose_round
   5579    call .pass2
   5580    pxor                m10, m10
   5581    lea                  r3, [strideq*3]
; IDCT_64x16_END coefslot, srcreg, tmpreg, dstoff
; Final add/store for one 64-pixel dst row:
;   %1  cq slot -- zeroed after use (m10 = 0); for %1 < 8 it is first
;       loaded and scaled by pw_2048 (m30) into m%3
;   %2  register with the row's second half-result, scaled by pw_2048
;   %3  register holding the pre-scaled first half-result
;   %4  dst offset for this row
; The dst bytes are widened to words (against m10), the scaled results
; added, then saturate-packed and stored back. Every 4th row (slots
; 3/7/11) advances dstq by 4 strides.
   5582 %macro IDCT_64x16_END 4
   5583    mova                 m9, [dstq+%4]
   5584 %if %1 < 8
   5585    pmulhrsw            m%3, m30, [cq+64*%1]
   5586 %endif
   5587    pmulhrsw            m%2, m30
   5588    mova         [cq+64*%1], m10
   5589    punpcklbw            m8, m9, m10
   5590    punpckhbw            m9, m10
   5591    paddw                m8, m%3
   5592    paddw                m9, m%2
   5593    packuswb             m8, m9
   5594    mova          [dstq+%4], m8
   5595 %if %1 == 3 || %1 == 7 || %1 == 11
   5596    lea                dstq, [dstq+strideq*4]
   5597 %endif
   5598 %endmacro
   5599    IDCT_64x16_END        0,  0, 11, strideq*0
   5600    IDCT_64x16_END        1,  1, 11, strideq*1
   5601    IDCT_64x16_END        2,  2, 11, strideq*2
   5602    IDCT_64x16_END        3,  3, 11, r3
   5603    IDCT_64x16_END        4,  4, 11, strideq*0
   5604    IDCT_64x16_END        5,  5, 11, strideq*1
   5605    IDCT_64x16_END        6,  6, 11, strideq*2
   5606    IDCT_64x16_END        7,  7, 11, r3
   5607    IDCT_64x16_END        8, 14, 22, strideq*0
   5608    IDCT_64x16_END        9, 15, 23, strideq*1
   5609    IDCT_64x16_END       10, 16, 24, strideq*2
   5610    IDCT_64x16_END       11, 17, 25, r3
   5611    IDCT_64x16_END       12, 18, 26, strideq*0
   5612    IDCT_64x16_END       13, 19, 27, strideq*1
   5613    IDCT_64x16_END       14, 20, 28, strideq*2
   5614    IDCT_64x16_END       15, 21, 29, r3
   5615    RET
   5616 ALIGN function_align
; .pass1_end: combine saved even-half results (reloaded from cq slots 0-7)
; with the odd-half results left in registers, producing out16-out47 of the
; pass-1 transform (see the per-line "outNN" comments), and rescale with
; pmulhrsw by m13 (the rounding constant set up by the caller, e.g. pw_8192
; for a rounded >>2). Falls through into .transpose_round.
   5617 .pass1_end:
   5618    mova                 m4, [cq+64* 0]
   5619    mova                 m5, [cq+64* 1]
   5620    mova                 m6, [cq+64* 2]
   5621    mova                 m7, [cq+64* 3]
   5622    mova                 m8, [cq+64* 4]
   5623    mova                 m9, [cq+64* 5]
   5624    mova                m11, [cq+64* 6]
   5625    mova                m12, [cq+64* 7]
; Saturating butterflies: sum -> low outputs, difference -> high outputs.
   5626    psubsw              m29, m4, m21  ; out47 out46
   5627    paddsw               m4, m21      ; out16 out17
   5628    psubsw              m28, m5, m20  ; out44 out45
   5629    paddsw               m5, m20      ; out19 out18
   5630    REPX  {pmulhrsw x, m13}, m0, m1, m2, m3
   5631    psubsw              m27, m6, m19  ; out43 out42
   5632    paddsw               m6, m19      ; out20 out21
   5633    psubsw              m26, m7, m18  ; out40 out41
   5634    paddsw               m7, m18      ; out23 out22
   5635    pmulhrsw            m18, m13, m22
   5636    pmulhrsw            m19, m13, m23
   5637    pmulhrsw            m20, m13, m24
   5638    pmulhrsw            m21, m13, m25
   5639    paddsw              m25, m12, m14 ; out31 out30
   5640    psubsw              m14, m12, m14 ; out32 out33
   5641    paddsw              m24, m11, m15 ; out28 out29
   5642    psubsw              m15, m11, m15 ; out35 out34
   5643    REPX  {pmulhrsw x, m13}, m4, m5, m6, m7
   5644    paddsw              m23, m9, m16  ; out27 out26
   5645    psubsw              m16, m9, m16  ; out36 out37
   5646    paddsw              m22, m8, m17  ; out24 out25
   5647    psubsw              m17, m8, m17  ; out39 out38
   5648    REPX  {pmulhrsw x, m13}, m14, m15, m16, m17
; .transpose_round: transpose the packed rows for pass 2. Also callable
; directly (the caller re-enters here after rescaling the second batch).
   5649 .transpose_round:
; Word-interleave transpose of four packed registers (two rows each);
; the a-h / 0-3 comments track element positions. m8 is scratch and ends
; up holding an intermediate that the lane shuffles below consume.
   5650 %macro TRANSPOSE_8x4_PACKED 4
   5651    punpckhwd            m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3
   5652    punpcklwd           m%1, m%3      ; a0 e0 a1 e1 a2 e2 a3 e3
   5653    punpcklwd           m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3
   5654    punpckhwd           m%2, m%4      ; c0 g0 c1 g1 c2 g2 c3 g3
   5655    punpckhwd           m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3
   5656    punpcklwd           m%1, m%2      ; a0 c0 e0 g0 a1 c1 e1 g1
   5657    punpckhwd           m%2, m8, m%3  ; b2 d2 f2 h2 b3 d3 f3 h3
   5658    punpcklwd            m8, m%3      ; b0 d0 f0 h0 b1 d1 f1 h1
   5659    punpcklwd           m%3, m%4, m%2 ; 2
   5660    punpckhwd           m%4, m%2      ; 3
   5661    punpckhwd           m%2, m%1, m8  ; 1
   5662    punpcklwd           m%1, m8       ; 0
   5663 %endmacro
   5664    TRANSPOSE_8x4_PACKED  0,  1,  2,  3
   5665    TRANSPOSE_8x4_PACKED 18, 19, 20, 21
   5666    TRANSPOSE_8x4_PACKED  4,  5,  6,  7
   5667    TRANSPOSE_8x4_PACKED 14, 15, 16, 17
; Gather 128-bit lanes across register pairs into the a/b and c/d groups
; annotated below (aRC = group a, row R, 128-bit chunk C).
   5668    vshufi32x4           m8, m0, m4, q3232   ; a02 a03 b02 b03
   5669    vinserti32x8         m0, ym4, 1          ; a00 a01 b00 b01
   5670    vshufi32x4           m4, m1, m5, q3232   ; a12 a13 b12 b13
   5671    vinserti32x8         m9, m1, ym5, 1      ; a10 a11 b10 b11
   5672    vshufi32x4           m5, m2, m6, q3232   ; a22 a23 b22 b23
   5673    vinserti32x8         m1, m2, ym6, 1      ; a20 a21 b20 b21
   5674    vshufi32x4           m6, m3, m7, q3232   ; a32 a33 b32 b33
   5675    vinserti32x8        m11, m3, ym7, 1      ; a30 a31 b30 b31
   5676    vshufi32x4           m2, m14, m18, q3232 ; c02 c03 d02 d03
   5677    vinserti32x8         m3, m14, ym18, 1    ; c00 c01 d00 d01
   5678    vshufi32x4          m18, m15, m19, q3232 ; c12 c13 d12 d13
   5679    vinserti32x8        m15, ym19, 1         ; c10 c11 d10 d11
   5680    vshufi32x4          m19, m16, m20, q3232 ; c22 c23 d22 d23
   5681    vinserti32x8        m16, ym20, 1         ; c20 c21 d20 d21
   5682    vshufi32x4          m20, m17, m21, q3232 ; c32 c33 d32 d33
   5683    vinserti32x8        m17, ym21, 1         ; c30 c31 d30 d31
   5684    ret
; .pass2: second transform pass over one half of the transposed data
; (the caller invokes this twice with different register contents).
; The vshufi32x4 shuffles collect even (q2020) / odd (q3131) 128-bit
; lanes from the .transpose_round output into transform input order; the
; trailing "; N" comments give the resulting input index per register.
; Tail-jump: the final helper's ret returns directly to .pass2's caller.
   5685 .pass2:
   5686    vshufi32x4           m7, m5, m19, q3131  ; 14
   5687    vshufi32x4           m5, m19, q2020      ; 10
   5688    vshufi32x4          m21, m6, m20, q3131  ; 15
   5689    vshufi32x4          m19, m6, m20, q2020  ; 11
   5690    vshufi32x4          m20, m4, m18, q3131  ; 13
   5691    vshufi32x4          m18, m4, m18, q2020  ;  9
   5692    vshufi32x4           m6, m8, m2, q3131   ; 12
   5693    vshufi32x4           m4, m8, m2, q2020   ;  8
   5694    vshufi32x4           m2, m0, m3, q3131   ;  4
   5695    vshufi32x4           m0, m3, q2020       ;  0
   5696    vshufi32x4           m3, m1, m16, q3131  ;  6
   5697    vshufi32x4           m1, m16, q2020      ;  2
   5698    vshufi32x4          m16, m9, m15, q3131  ;  5
   5699    vshufi32x4          m14, m9, m15, q2020  ;  1
   5700    vshufi32x4          m15, m11, m17, q2020 ;  3
   5701    vshufi32x4          m17, m11, m17, q3131 ;  7
; Even inputs through the shared 32x8 main idct, odd inputs through the
; 32x16 oddhalf, which returns to our caller.
   5702    call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
   5703    jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
   5704 
   5705 cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
   5706    lea                  r5, [o_base]
   5707    test               eobd, eobd
   5708    jz .dconly
   5709    PROLOGUE              0, 9, 30, 64*32, dst, stride, c, eob
   5710    vpbroadcastd        m23, [o(pw_2896x8)]
   5711 %undef cmp
   5712    cmp                eobd, 136
   5713    jb .fast
   5714    pmulhrsw             m5, m23, [cq+64*20]
   5715    pmulhrsw             m3, m23, [cq+64*12]
   5716    pmulhrsw             m1, m23, [cq+64* 4]
   5717    pmulhrsw             m7, m23, [cq+64*28]
   5718    pmulhrsw             m2, m23, [cq+64* 8]
   5719    pmulhrsw             m6, m23, [cq+64*24]
   5720    pmulhrsw             m0, m23, [cq+64* 0]
   5721    pmulhrsw             m4, m23, [cq+64*16]
   5722    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
   5723    pmulhrsw            m14, m23, [cq+64* 2]
   5724    pmulhrsw            m21, m23, [cq+64*30]
   5725    pmulhrsw            m18, m23, [cq+64*18]
   5726    pmulhrsw            m17, m23, [cq+64*14]
   5727    pmulhrsw            m16, m23, [cq+64*10]
   5728    pmulhrsw            m19, m23, [cq+64*22]
   5729    pmulhrsw            m20, m23, [cq+64*26]
   5730    pmulhrsw            m15, m23, [cq+64* 6]
   5731    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
   5732    mova         [cq+64* 0], m14
   5733    mova         [cq+64* 2], m15
   5734    mova         [cq+64* 4], m16
   5735    mova         [cq+64* 6], m17
   5736    mova         [cq+64* 8], m18
   5737    mova         [cq+64*10], m19
   5738    mova         [cq+64*12], m20
   5739    mova         [cq+64*14], m21
   5740    pmulhrsw            m22, m23, [cq+64* 1]
   5741    pmulhrsw            m21, m23, [cq+64*31]
   5742    pmulhrsw            m14, m23, [cq+64*17]
   5743    pmulhrsw            m29, m23, [cq+64*15]
   5744    pmulhrsw            m26, m23, [cq+64* 9]
   5745    pmulhrsw            m17, m23, [cq+64*23]
   5746    pmulhrsw            m18, m23, [cq+64*25]
   5747    pmulhrsw            m25, m23, [cq+64* 7]
   5748    pmulhrsw            m24, m23, [cq+64* 5]
   5749    pmulhrsw            m19, m23, [cq+64*27]
   5750    pmulhrsw            m16, m23, [cq+64*21]
   5751    pmulhrsw            m27, m23, [cq+64*11]
   5752    pmulhrsw            m28, m23, [cq+64*13]
   5753    pmulhrsw            m15, m23, [cq+64*19]
   5754    pmulhrsw            m20, m23, [cq+64*29]
   5755    pmulhrsw            m23,      [cq+64* 3]
   5756    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
   5757    vpbroadcastd        m12, [o(pw_16384)]
   5758    psubsw              m13, m0, m29 ; 31
   5759    paddsw               m0, m29     ;  0
   5760    psubsw              m29, m1, m28 ; 30
   5761    paddsw               m1, m28     ;  1
   5762    psubsw              m28, m2, m27 ; 29
   5763    paddsw               m2, m27     ;  2
   5764    psubsw              m27, m3, m26 ; 28
   5765    paddsw               m3, m26     ;  3
   5766    psubsw              m26, m4, m25 ; 27
   5767    paddsw               m4, m25     ;  4
   5768    psubsw              m25, m5, m24 ; 26
   5769    paddsw               m5, m24     ;  5
   5770    psubsw              m24, m6, m23 ; 25
   5771    paddsw               m6, m23     ;  6
   5772    psubsw              m23, m7, m22 ; 24
   5773    paddsw               m7, m22     ;  7
   5774    pxor                 m9, m9
   5775    punpckhwd            m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
   5776    punpcklwd            m0, m1     ; a0 b0 a1 b1 a2 b2 a3 b3
   5777    punpckhwd            m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
   5778    punpcklwd            m2, m3     ; c0 d0 c1 d1 c2 d2 c3 d3
   5779    REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
   5780    punpckhwd           m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
   5781    punpcklwd            m4, m5     ; e0 f0 e1 f1 e2 f2 e3 f3
   5782    punpckhwd            m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
   5783    punpcklwd            m6, m7     ; g0 h0 g1 h1 g2 h2 g3 h3
   5784    REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
   5785    punpckhwd            m3, m23, m24
   5786    punpcklwd           m23, m24
   5787    punpckhwd           m24, m25, m26
   5788    punpcklwd           m25, m26
   5789    REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
   5790    punpckhwd           m26, m27, m28
   5791    punpcklwd           m27, m28
   5792    punpckhwd           m28, m29, m13
   5793    punpcklwd           m29, m13
   5794    REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
   5795    punpckhdq            m7, m0, m2  ; a2 b2 c2 d2 a3 b3 c3 d3
   5796    punpckldq            m0, m2      ; a0 b0 c0 d0 a1 b1 c1 d1
   5797    punpckhdq            m2, m4, m6  ; e2 f2 g2 h2 e3 f3 g3 h3
   5798    punpckldq            m4, m6      ; e0 f0 g0 h0 e1 f1 g1 h1
   5799    REPX  {pmulhrsw x, m12}, m7, m0, m2, m4
   5800    punpckhdq            m6, m8, m1  ; a6 b6 c6 d6 a7 b7 c7 d7
   5801    punpckldq            m8, m1      ; a4 b4 c4 d4 a5 b5 c5 d5
   5802    punpckhdq            m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
   5803    punpckldq           m22, m5      ; e4 f4 g4 h5 e5 f5 g5 h5
   5804    REPX  {pmulhrsw x, m12}, m6, m8, m1, m22
   5805    punpckhdq           m13, m23, m25
   5806    punpckldq           m23, m25
   5807    punpckhdq           m25, m27, m29
   5808    punpckldq           m27, m29
   5809    REPX  {pmulhrsw x, m12}, m13, m23, m25, m27
   5810    punpckhdq            m9, m3, m24
   5811    punpckldq            m3, m24
   5812    punpckhdq           m24, m26, m28
   5813    punpckldq           m26, m28
   5814    REPX  {pmulhrsw x, m12}, m9, m3, m24, m26
   5815    punpckhqdq           m5, m23, m27 ; d01 d09 d17 d25
   5816    punpcklqdq          m23, m27      ; d00 d08 d16 d24
   5817    punpcklqdq          m27, m13, m25 ; d02 d10 d18 d26
   5818    punpckhqdq          m13, m25      ; d03 d11 d19 d27
   5819    punpcklqdq          m25, m3, m26  ; d04 d12 d20 d28
   5820    punpckhqdq           m3, m26      ; d05 d13 d21 d29
   5821    punpcklqdq          m26, m9, m24  ; d06 d14 d22 d30
   5822    punpckhqdq           m9, m24      ; d07 d15 d23 d31
   5823    mova         [cq+64* 3], m23
   5824    mova         [cq+64*13], m27
   5825    mova         [cq+64* 7], m25
   5826    mova         [cq+64*15], m26
   5827    punpckhqdq          m24, m8, m22  ; a05 a13 a21 a29
   5828    punpcklqdq           m8, m22      ; a04 a12 a20 a28
   5829    punpckhqdq          m22, m0, m4   ; a01 a09 a17 a25
   5830    punpcklqdq           m0, m4       ; a00 a08 a16 a24
   5831    punpckhqdq          m23, m7, m2   ; a03 a11 a19 a27
   5832    punpcklqdq           m7, m2       ; a02 a10 a18 a26
   5833    punpckhqdq          m25, m6, m1   ; a07 a15 a23 a31
   5834    punpcklqdq           m6, m1       ; a06 a14 a22 a30
   5835    mova         [cq+64* 1], m0
   5836    mova         [cq+64* 9], m7
   5837    mova         [cq+64* 5], m8
   5838    mova         [cq+64*11], m6
   5839    mova                 m2, [cq+64* 0]
   5840    mova                m11, [cq+64* 2]
   5841    mova                 m8, [cq+64* 4]
   5842    mova                m29, [cq+64* 6]
   5843    mova                m27, [cq+64* 8]
   5844    mova                m26, [cq+64*10]
   5845    mova                 m4, [cq+64*12]
   5846    mova                m28, [cq+64*14]
   5847    psubsw               m1, m2, m21  ; 23
   5848    paddsw               m2, m21      ;  8
   5849    psubsw              m21, m11, m20 ; 22
   5850    paddsw              m11, m20      ;  9
   5851    psubsw              m20, m8, m19  ; 21
   5852    paddsw               m8, m19      ; 10
   5853    psubsw              m19, m29, m18 ; 20
   5854    paddsw              m29, m18      ; 11
   5855    psubsw              m18, m27, m17 ; 19
   5856    paddsw              m27, m17      ; 12
   5857    psubsw              m17, m26, m16 ; 18
   5858    paddsw              m26, m16      ; 13
   5859    psubsw              m16, m4, m15  ; 17
   5860    paddsw               m4, m15      ; 14
   5861    psubsw              m15, m28, m14 ; 16
   5862    paddsw              m28, m14      ; 15
   5863    punpcklwd           m14, m15, m16
   5864    punpckhwd           m15, m16
   5865    punpckhwd           m16, m17, m18
   5866    punpcklwd           m17, m18
   5867    punpckhwd           m18, m19, m20
   5868    punpcklwd           m19, m20
   5869    punpckhwd           m20, m21, m1
   5870    punpcklwd           m21, m1
   5871    punpckhwd            m1, m2, m11  ; i4 j4 i5 j5 i6 j6 i7 j7
   5872    punpcklwd            m2, m11      ; i0 j1 i1 j1 i2 j2 i3 j3
   5873    punpckhwd           m11, m8, m29  ; k4 l4 k5 l5 k6 l6 k7 l7
   5874    punpcklwd            m8, m29      ; k0 l0 k1 l1 k2 l2 k3 l3
   5875    punpckhwd           m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
   5876    punpcklwd           m27, m26      ; m0 n0 m1 n1 m2 n2 m3 n3
   5877    punpckhwd           m26, m4, m28  ; o4 p4 o5 p5 o6 p6 o7 p7
   5878    punpcklwd            m4, m28      ; o0 p0 o1 p1 o2 p2 o3 p3
   5879    punpckhdq           m28, m2, m8   ; i2 j2 k2 l2 i3 j3 k3 l3
   5880    punpckldq            m2, m8       ; i0 j0 k0 l0 i1 j1 k1 l1
   5881    punpckhdq            m8, m27, m4  ; m2 n2 o2 p2 m3 n3 o3 p3
   5882    punpckldq           m27, m4       ; m0 n0 o0 p0 m1 n1 o1 p1
   5883    REPX  {pmulhrsw x, m12}, m28, m2, m8, m27
   5884    punpckhdq            m4, m1, m11  ; i6 j6 k6 l6 i7 j7 k7 l7
   5885    punpckldq            m1, m11      ; i4 j4 k4 l4 i5 j5 k5 l5
   5886    punpckhdq           m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
   5887    punpckldq           m29, m26      ; m4 n4 o4 p4 m5 n5 o5 p5
   5888    REPX  {pmulhrsw x, m12}, m4, m1, m11, m29
   5889    punpckhdq           m26, m19, m21
   5890    punpckldq           m19, m21
   5891    punpckhdq           m21, m15, m16
   5892    punpckldq           m15, m16
   5893    REPX  {pmulhrsw x, m12}, m26, m19, m21, m15
   5894    punpckhdq           m16, m18, m20
   5895    punpckldq           m18, m20
   5896    punpckhdq           m20, m14, m17
   5897    punpckldq           m14, m17
   5898    REPX  {pmulhrsw x, m12}, m16, m18, m20, m14
   5899    punpckhqdq          m17, m28, m8  ; b03 b11 b19 b27
   5900    punpcklqdq          m28, m8       ; b02 b10 b18 b26
   5901    punpckhqdq           m8, m2, m27  ; b01 b09 b17 b25
   5902    punpcklqdq           m2, m27      ; b00 b08 b16 b24
   5903    punpcklqdq          m27, m1, m29  ; b04 b12 b20 b28
   5904    punpckhqdq           m1, m29      ; b05 b13 b21 b29
   5905    punpcklqdq          m29, m4, m11  ; b06 b14 b22 b30
   5906    punpckhqdq           m4, m11      ; b07 b15 b23 b31
   5907    mova         [cq+64* 0], m2
   5908    mova         [cq+64* 8], m28
   5909    mova         [cq+64* 4], m27
   5910    mova         [cq+64*10], m29
   5911    punpckhqdq          m27, m20, m26 ; c03 c11 c19 c27
   5912    punpcklqdq          m20, m26      ; c02 c10 c18 c26
   5913    punpckhqdq          m26, m14, m19 ; c01 c09 c17 c25
   5914    punpcklqdq          m14, m19      ; c00 c08 c16 c24
   5915    punpckhqdq          m28, m15, m18 ; c05 c13 c21 c29
   5916    punpcklqdq          m15, m18      ; c04 c12 c20 c28
   5917    punpckhqdq          m29, m21, m16 ; c07 c15 c23 c31
   5918    punpcklqdq          m21, m16      ; c06 c14 c22 c30
   5919    mova         [cq+64* 2], m14
   5920    mova         [cq+64*12], m20
   5921    mova         [cq+64* 6], m15
   5922    mova         [cq+64*14], m21
   5923    vshufi32x4          m14, m22, m8, q3232  ; a17 a25 b17 b25
   5924    vinserti32x8        m22, ym8, 1          ; a01 a09 b01 b09
   5925    vshufi32x4          m15, m23, m17, q3232 ; a19 a27 b19 b27
   5926    vinserti32x8        m23, ym17, 1         ; a03 a11 b03 b11
   5927    vshufi32x4          m16, m24, m1, q3232  ; a21 a29 b21 b29
   5928    vinserti32x8        m24, ym1, 1          ; a05 a13 b05 b13
   5929    vshufi32x4          m17, m25, m4, q3232  ; a23 a31 b23 b31
   5930    vinserti32x8        m25, ym4, 1          ; a07 a15 b07 b15
   5931    vinserti32x8        m19, m26, ym5, 1     ; c01 c09 d01 d09
   5932    vshufi32x4          m26, m5, q3232       ; c17 c25 d17 d25
   5933    vinserti32x8        m20, m27, ym13, 1    ; c03 c11 d03 d11
   5934    vshufi32x4          m27, m13, q3232      ; c19 c27 d19 d27
   5935    vinserti32x8        m21, m28, ym3, 1     ; c05 c13 d05 d13
   5936    vshufi32x4          m28, m3, q3232       ; c21 c29 d21 d29
   5937    vinserti32x8        m18, m29, ym9, 1     ; c07 c15 d07 d15
   5938    vshufi32x4          m29, m9, q3232       ; c23 c31 d23 d31
   5939    mov                  r4, rsp
   5940    vshufi32x4           m0, m22, m19, q2020 ;  1
   5941    vshufi32x4           m1, m17, m29, q3131 ; 31
   5942    vshufi32x4           m2, m14, m26, q2020 ; 17
   5943    vshufi32x4           m3, m25, m18, q3131 ; 15
   5944    call .main_part1
   5945    vshufi32x4           m0, m25, m18, q2020 ;  7
   5946    vshufi32x4           m1, m14, m26, q3131 ; 25
   5947    vshufi32x4           m2, m17, m29, q2020 ; 23
   5948    vshufi32x4           m3, m22, m19, q3131 ;  9
   5949    call .main_part1
   5950    vshufi32x4           m0, m24, m21, q2020 ;  5
   5951    vshufi32x4           m1, m15, m27, q3131 ; 27
   5952    vshufi32x4           m2, m16, m28, q2020 ; 21
   5953    vshufi32x4           m3, m23, m20, q3131 ; 11
   5954    call .main_part1
   5955    vshufi32x4           m0, m23, m20, q2020 ;  3
   5956    vshufi32x4           m1, m16, m28, q3131 ; 29
   5957    vshufi32x4           m2, m15, m27, q2020 ; 19
   5958    vshufi32x4           m3, m24, m21, q3131 ; 13
   5959    call .main_part1
   5960    call .main_part2
   5961    mova                 m0, [cq+64* 1] ; a0
   5962    mova                m15, [cq+64* 0] ; b0
   5963    mova                 m3, [cq+64* 2] ; c0
   5964    mova                m16, [cq+64* 3] ; d0
   5965    mova                m14, [cq+64* 5] ; a4
   5966    mova                 m8, [cq+64* 4] ; b4
   5967    mova                m17, [cq+64* 6] ; c4
   5968    mova                 m1, [cq+64* 7] ; d4
   5969    vshufi32x4           m2, m0, m15, q3232  ; a16 a24 b16 b24
   5970    vinserti32x8         m0, ym15, 1         ; a00 a08 b00 b08
   5971    vshufi32x4          m15, m3, m16, q3232  ; c16 c24 d16 d24
   5972    vinserti32x8         m3, ym16, 1         ; c00 c08 d00 d08
   5973    vshufi32x4          m16, m14, m8, q3232  ; a20 a28 b20 b28
   5974    vinserti32x8        m14, ym8, 1          ; a04 a12 b04 b12
   5975    vshufi32x4           m8, m17, m1, q3232  ; c20 c28 d20 d28
   5976    vinserti32x8        m17, ym1, 1          ; c04 c12 d04 d12
   5977    vshufi32x4           m1, m0, m3, q3131   ;  8
   5978    vshufi32x4           m0, m3, q2020       ;  0
   5979    vshufi32x4           m3, m2, m15, q3131  ; 24
   5980    vshufi32x4           m2, m15, q2020      ; 16
   5981    vshufi32x4          m15, m14, m17, q3131 ; 12
   5982    vshufi32x4          m14, m17, q2020      ;  4
   5983    vshufi32x4          m17, m16, m8, q3131  ; 28
   5984    vshufi32x4          m16, m8, q2020       ; 20
   5985    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
   5986    mova                 m8, [cq+64* 8]
   5987    mova                 m9, [cq+64*12]
   5988    mova                m11, [cq+64*10]
   5989    mova                m12, [cq+64*14]
   5990    mova         [cq+64* 0], m14
   5991    mova         [cq+64* 2], m15
   5992    mova         [cq+64* 4], m16
   5993    mova         [cq+64* 6], m17
   5994    mova         [cq+64* 8], m18
   5995    mova         [cq+64*10], m19
   5996    mova         [cq+64*12], m20
   5997    mova         [cq+64*14], m21
   5998    mova                m22, [cq+64* 9]
   5999    mova                m27, [cq+64*13]
   6000    mova                m23, [cq+64*11]
   6001    mova                m24, [cq+64*15]
   6002    vshufi32x4          m26, m22, m8, q3232  ; a18 a26 b18 b26
   6003    vinserti32x8        m22, ym8, 1          ; a02 a10 b02 b10
   6004    vshufi32x4           m8, m9, m27, q3232  ; c18 c26 d18 d26
   6005    vinserti32x8         m9, ym27, 1         ; c02 c10 d02 d10
   6006    vshufi32x4          m27, m23, m11, q3232 ; a22 a30 b22 b30
   6007    vinserti32x8        m23, ym11, 1         ; a06 a14 b06 b14
   6008    vshufi32x4          m11, m12, m24, q3232 ; c22 c30 d22 d30
   6009    vinserti32x8        m12, ym24, 1         ; c06 c14 d06 d14
   6010    vshufi32x4          m28, m26, m8, q3131  ; 26
   6011    vshufi32x4          m26, m8, q2020       ; 18
   6012    vshufi32x4          m24, m22, m9, q3131  ; 10
   6013    vshufi32x4          m22, m9, q2020       ;  2
   6014    vshufi32x4          m29, m27, m11, q3131 ; 30
   6015    vshufi32x4          m27, m11, q2020      ; 22
   6016    vshufi32x4          m25, m23, m12, q3131 ; 14
   6017    vshufi32x4          m23, m12, q2020      ;  6
   6018    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
   6019    jmp .end
   6020 .fast: ; bottom/right halves are zero
   ; .fast path continuation: only the top-left coefficients are nonzero
   ; on this path, so each 16-bit coefficient row is duplicated word-wise
   ; into full 32-lane vectors (via dup16_perm / pmovzxwd+pslld /
   ; punpcklwd) before running the halved transforms.
   ; m23 holds the coefficient pre-scale constant loaded before this view
   ; (presumably pw_2896x8, as in the sibling 64x32 function -- confirm).
   ; even-index input rows -> 16-point even-half idct:
   6021    pmulhrsw            ym9, ym23, [cq+64* 0]
   6022    pmulhrsw            ym6, ym23, [cq+64* 8]
   6023    mova                m14, [o(dup16_perm)]
   6024    pmulhrsw            ym8, ym23, [cq+64* 2]
   6025    pmulhrsw            xm0, xm23, [cq+64*14]
   6026    pmulhrsw            xm5, xm23, [cq+64*10]
   6027    pmulhrsw            ym1, ym23, [cq+64* 6]
   6028    pmulhrsw            ym7, ym23, [cq+64* 4]
   6029    pmulhrsw            xm3, xm23, [cq+64*12]
   6030    pmovzxwd             m9, ym9
   6031    pmovzxwd             m6, ym6
   6032    vpermb               m8, m14, m8
   6033    punpcklwd           xm0, xm0
   6034    vpermb              ym5, ym14, ym5
   6035    vpermb               m1, m14, m1
   6036    vpermb               m7, m14, m7
   6037    punpcklwd           xm3, xm3
   ; pmovzxwd+pslld places each word in the high half of a dword lane,
   ; matching the duplicated-word layout produced by dup16_perm
   6038    pslld                m9, 16
   6039    pslld                m6, 16
   6040    call m(idct_16x16_internal_8bpc).main_fast
   ; odd-index input rows -> 16x32 odd-half idct:
   6041          vpmulhrsw    ym21, ym23, [cq+64* 1]
   6042    {evex}vpmulhrsw    xm17, xm23, [cq+64*15] ; force EVEX encoding, which
   6043    {evex}vpmulhrsw    xm20, xm23, [cq+64* 9] ; reduces code size due to
   6044    {evex}vpmulhrsw    ym15, ym23, [cq+64* 7] ; compressed displacements
   6045    {evex}vpmulhrsw    ym18, ym23, [cq+64* 5]
   6046    {evex}vpmulhrsw    xm16, xm23, [cq+64*11]
   6047    {evex}vpmulhrsw    xm19, xm23, [cq+64*13]
   6048    {evex}vpmulhrsw    ym23,       [cq+64* 3]
   6049    vpermb              m21, m14, m21
   6050    punpcklwd          xm17, xm17
   6051    vpermb             ym20, ym14, ym20
   6052    vpermb              m15, m14, m15
   6053    vpermb              m18, m14, m18
   6054    vpermb             ym16, ym14, ym16
   6055    punpcklwd          xm19, xm19
   6056    vpermb              m14, m14, m23
   6057    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   6058    vpbroadcastd         m9, [o(pw_16384)]
   6059    call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
   ; deinterleave the transposed halves into per-row vectors; the trailing
   ; comments give the row index each destination register receives
   6060    vshufi32x4          m16, m0, m3, q2020  ;  0
   6061    vshufi32x4          m26, m0, m3, q3131  ;  4
   6062    vshufi32x4           m0, m14, m2, q2020 ;  1
   6063    vshufi32x4          m14, m2, q3131      ;  5
   6064    vshufi32x4           m3, m19, m7, q3131 ; 15
   6065    vshufi32x4          m19, m7, q2020      ; 11
   6066    vshufi32x4          m27, m17, m9, q2020 ;  3
   6067    vshufi32x4          m17, m9, q3131      ;  7
   6068    vshufi32x4          m28, m20, m6, q2020 ;  9
   6069    vshufi32x4          m20, m6, q3131      ; 13
   6070    vshufi32x4          m22, m1, m18, q2020 ;  2
   6071    vshufi32x4          m23, m1, m18, q3131 ;  6
   6072    vshufi32x4          m24, m5, m15, q2020 ; 10
   6073    vshufi32x4          m25, m5, m15, q3131 ; 14
   6074    vshufi32x4          m15, m21, m4, q3131 ; 12
   6075    vshufi32x4          m21, m21, m4, q2020 ;  8
   ; r4 = stack scratch base; four .main_part1_fast calls (inputs set up
   ; in m0/m3 before each) build the idct64 odd half, then .main_part2
   ; finishes it in place on the stack
   6076    mov                  r4, rsp
   6077    call .main_part1_fast
   6078    mova                 m0, m17
   6079    mova                 m3, m28
   6080    call .main_part1_fast
   6081    mova                 m0, m14
   6082    mova                 m3, m19
   6083    call .main_part1_fast
   6084    mova                 m0, m27
   6085    mova                 m3, m20
   6086    call .main_part1_fast
   6087    call .main_part2
   ; even 32-point half; spill half its outputs to cq so the .end pass
   ; can reload them (macro slots [cq+64*(%2*2-16)])
   6088    mova                 m0, m16
   6089    mova                 m1, m21
   6090    mova                m14, m26
   6091    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
   6092    mova         [cq+64*14], m21
   6093    mova         [cq+64* 0], m14
   6094    mova         [cq+64* 6], m17
   6095    mova         [cq+64* 8], m18
   6096    mova         [cq+64*10], m19
   6097    mova         [cq+64* 4], m16
   6098    mova         [cq+64* 2], m15
   6099    mova         [cq+64*12], m20
   6100    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
   ; output pass: fold the idct64 odd half (saved on the stack) into the
   ; 32-point even half (registers m14-m29 plus cq spills), round with
   ; pw_2048 (pmulhrsw => (x+8)>>4), add to the destination pixels and
   ; pack back to 8-bit. Rows are written symmetrically: dstq walks the
   ; top of the 64-row block forward while r3 walks a mirrored region.
   6101 .end:
   6102    lea                  r4, [strideq*3]
   6103    vpbroadcastd        m12, [o(pw_2048)]
   6104    movshdup            m13, [o(permD)]
   6105    lea                  r5, [r4+strideq]   ; stride*4
   6106    lea                  r3, [dstq+r4*8]
   6107    lea                  r6, [strideq+r5*8] ; stride*33
   6108    lea                  r8, [r4+r5*8]      ; stride*35
   6109    add                  r3, r5             ; dst+stride*28
   6110    lea                  r7, [r6+strideq]   ; stride*34
   ; %1 = register holding one even-half row; %2 = row slot index
   ; (%2 < 8: partner row is still live in m0-m7; %2 >= 8: partner is
   ; reloaded from the cq spill area); %3-%6 = the four dst offsets for
   ; the two mirrored row pairs produced per invocation
   6111 %macro IDCT_32x64_END 6 ; src, mem, stride[1-4]
   6112 %if %2 < 8
   6113    paddsw              m10, m%2, m%1
   6114    psubsw              m11, m%2, m%1
   6115 %else
   6116    mova                m11, [cq+64*(%2*2-16)]
   6117    paddsw              m10, m11, m%1
   6118    psubsw              m11, m%1
   6119 %endif
   ; m9 / m%1 = idct64 odd-half values from the stack scratch buffer
   6120    mova                 m9, [rsp+64*(31-%2)]
   6121    mova                m%1, [rsp+64*%2]
   6122    paddsw               m8, m10, m9
   6123    psubsw              m10, m9
   6124    paddsw               m9, m11, m%1
   6125    pmovzxbw             m0, [dstq+%3]
   6126    psubsw              m11, m%1
   6127    pmovzxbw            m%1, [r3  +%4]
   6128    REPX  {pmulhrsw x, m12}, m8, m10, m9, m11
   6129    paddw                m8, m0
   6130    pmovzxbw             m0, [r3  +%5]
   6131    paddw               m10, m%1
   6132    pmovzxbw            m%1, [dstq+%6]
   6133    paddw                m9, m0
   6134    paddw               m11, m%1
   ; zero the consumed cq spill slots (m1 is set to zero once, at the
   ; first %2 >= 8 invocation, and reused for all later ones)
   6135 %if %2 >= 8
   6136 %if %2 == 8
   6137    pxor                 m1, m1
   6138 %endif
   6139    mova  [cq+64*(%2*2-16)], m1
   6140    mova  [cq+64*(%2*2-15)], m1
   6141 %endif
   6142    packuswb             m8, m10
   6143    packuswb             m9, m11
   6144    vpermq               m8, m13, m8
   6145    vpermq               m9, m13, m9
   6146    mova          [dstq+%3], ym8
   6147    vextracti32x8 [r3  +%4], m8, 1
   6148    mova          [r3  +%5], ym9
   6149    vextracti32x8 [dstq+%6], m9, 1
   ; advance dstq / retreat r3 after each group of four row pairs
   6150 %if %2 == 3 || %2 == 7 || %2 == 11
   6151    add                dstq, r5
   6152    sub                  r3, r5
   6153 %endif
   6154 %endmacro
   6155    IDCT_32x64_END       29,  0, strideq*0, r8,   r4       , r5*8
   6156    IDCT_32x64_END       28,  1, strideq*1, r7,   strideq*2, r6
   6157    IDCT_32x64_END       27,  2, strideq*2, r6,   strideq*1, r7
   6158    IDCT_32x64_END       26,  3, r4       , r5*8, strideq*0, r8
   6159    IDCT_32x64_END       25,  4, strideq*0, r8,   r4       , r5*8
   6160    IDCT_32x64_END       24,  5, strideq*1, r7,   strideq*2, r6
   6161    IDCT_32x64_END       23,  6, strideq*2, r6,   strideq*1, r7
   6162    IDCT_32x64_END       22,  7, r4       , r5*8, strideq*0, r8
   6163    IDCT_32x64_END       21,  8, strideq*0, r8,   r4       , r5*8
   6164    IDCT_32x64_END       20,  9, strideq*1, r7,   strideq*2, r6
   6165    IDCT_32x64_END       19, 10, strideq*2, r6,   strideq*1, r7
   6166    IDCT_32x64_END       18, 11, r4       , r5*8, strideq*0, r8
   6167    IDCT_32x64_END       17, 12, strideq*0, r8,   r4       , r5*8
   6168    IDCT_32x64_END       16, 13, strideq*1, r7,   strideq*2, r6
   6169    IDCT_32x64_END       15, 14, strideq*2, r6,   strideq*1, r7
   6170    IDCT_32x64_END       14, 15, r4       , r5*8, strideq*0, r8
   6171    RET
   ; dc-only shortcut: when only the DC coefficient is present, the whole
   ; output block is a single constant; scale it and tail-call the shared
   ; dc-only store loop.
   6172 .dconly:
   6173    movsx               r6d, word [cq]         ; dc coefficient
   6174    mov                [cq], eobd              ; clear dc (eobd is presumably 0 on this path -- confirm at the jump site, outside this view)
   6175    or                  r3d, 64                ; row count for the shared dconly tail -- TODO(review): confirm
   ; two rounded multiplies by 181/256 (~1/sqrt(2)): the rectangular-
   ; transform rescale plus the normal dc scaling
   6176    imul                r6d, 181
   6177    add                 r6d, 128
   6178    sar                 r6d, 8
   6179    imul                r6d, 181
   6180    add                 r6d, 128+256           ; second rounding also folds in the final >>1
   6181    sar                 r6d, 8+1
   6182    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
   6183 ALIGN function_align ; bottom three-quarters are zero
   ; idct64 odd-half helper, variant for when only the first of the four
   ; inputs (m0) is nonzero; the other three butterflies collapse to
   ; straight copies/multiplies. in: m0, m10 = dword rounding constant
   ; (pd_2048, per the vpdpwssd accumulators below -- loaded by callers).
   ; out: eight vectors at [r4+64*0..7] via .main_part1c.
   6184 cglobal_label .main_part1_fast2
   6185    vpbroadcastd         m7, [o(idct64_mul+4*0)]
   6186    vpbroadcastd         m8, [o(idct64_mul+4*1)]
   6187    pmulhrsw             m7, m0     ; t63a
   6188    pmulhrsw             m0, m8     ; t32a
   6189 
   ; interleave t32a/t63a and rotate with the idct64_mul coefficient pair,
   ; rounding via m10 (vpdpwssd starts from the rounding accumulator)
   6190    punpcklwd            m4, m0, m7
   6191    punpckhwd            m6, m0, m7
   6192    mova                 m1, m10
   6193    vpdpwssd             m1, m4, [o(idct64_mul+4*9)] {bcstd}
   6194    mova                 m9, m10
   6195    vpdpwssd             m9, m6, [o(idct64_mul+4*9)] {bcstd}
   6196    REPX      {psrad x, 12}, m1, m9
   6197    packssdw             m1, m9
   6198    mova                 m9, m10
   6199    vpdpwssd             m9, m6, [o(idct64_mul+4*8)] {bcstd}
   6200    mova                 m6, m10
   6201    vpdpwssd             m6, m4, [o(idct64_mul+4*8)] {bcstd}
   6202    REPX      {psrad x, 12}, m9, m6
   6203    packssdw             m6, m9
   6204 
   ; zero inputs make the remaining butterflies pure copies
   6205    mova                 m4, m0
   6206    mova                 m3, m7
   6207    mova                 m5, m1
   6208    mova                 m2, m6
   6209    jmp .main_part1c
   ; variant for inputs m0 and m3 nonzero (m1/m2 zero)
   6210 cglobal_label .main_part1_fast
   6211    vpbroadcastd         m1, [o(idct64_mul+4*0)]
   6212    vpbroadcastd         m8, [o(idct64_mul+4*1)]
   6213    vpbroadcastd         m2, [o(idct64_mul+4*6)]
   6214    vpbroadcastd         m9, [o(idct64_mul+4*7)]
   6215    pmulhrsw             m1, m0     ; t63a
   6216    pmulhrsw             m0, m8     ; t32a
   6217    pmulhrsw             m2, m3     ; t60a
   6218    pmulhrsw             m3, m9     ; t35a
   ; with m1/m2 zero, the step-2 add/sub pairs degenerate to copies
   6219    mova                 m8, m0
   6220    mova                 m7, m1
   6221    mova                 m6, m3
   6222    mova                 m5, m2
   6223    jmp .main_part1b
   ; full variant: all four inputs (m0-m3) nonzero
   6224 cglobal_label .main_part1
   6225    ; idct64 steps 1-5:
   6226    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
   6227    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
   6228    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
   6229    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
   6230    vpbroadcastd         m7, [o(idct64_mul+4*0)]
   6231    vpbroadcastd         m8, [o(idct64_mul+4*1)]
   6232    vpbroadcastd         m6, [o(idct64_mul+4*2)]
   6233    vpbroadcastd         m9, [o(idct64_mul+4*3)]
   6234    pmulhrsw             m7, m0     ; t63a
   6235    vpbroadcastd         m5, [o(idct64_mul+4*4)]
   6236    pmulhrsw             m0, m8     ; t32a
   6237    vpbroadcastd         m8, [o(idct64_mul+4*5)]
   6238    pmulhrsw             m6, m1     ; t62a
   6239    vpbroadcastd         m4, [o(idct64_mul+4*6)]
   6240    pmulhrsw             m1, m9     ; t33a
   6241    vpbroadcastd         m9, [o(idct64_mul+4*7)]
   6242    pmulhrsw             m5, m2     ; t61a
   6243    pmulhrsw             m2, m8     ; t34a
   6244    pmulhrsw             m4, m3     ; t60a
   6245    pmulhrsw             m3, m9     ; t35a
   6246    psubsw               m8, m0, m1 ; t33
   6247    paddsw               m0, m1     ; t32
   6248    psubsw               m1, m7, m6 ; t62
   6249    paddsw               m7, m6     ; t63
   6250    psubsw               m6, m3, m2 ; t34
   6251    paddsw               m3, m2     ; t35
   6252    psubsw               m2, m4, m5 ; t61
   6253    paddsw               m5, m4     ; t60
   6254 .main_part1b:
   6255    vpbroadcastd        m11, [o(idct64_mul+4*8)]
   6256    vpbroadcastd        m12, [o(idct64_mul+4*9)]
   6257    ITX_MULSUB_2W         1, 8, 4, 9, 10, 11, 12 ; t33a, t62a
   6258    vpbroadcastd        m11, [o(idct64_mul+4*10)]
   6259    ITX_MULSUB_2W         2, 6, 4, 9, 10, 12, 11 ; t34a, t61a
   6260    psubsw               m4, m0, m3 ; t35a
   6261    paddsw               m0, m3     ; t32a
   6262    psubsw               m3, m7, m5 ; t60a
   6263    paddsw               m7, m5     ; t63a
   6264    psubsw               m5, m1, m2 ; t34
   6265    paddsw               m1, m2     ; t33
   6266    psubsw               m2, m8, m6 ; t61
   6267    paddsw               m6, m8     ; t62
   6268 .main_part1c:
   6269    vpbroadcastd        m11, [o(idct64_mul+4*11)]
   6270    vpbroadcastd        m12, [o(idct64_mul+4*12)]
   ; advance the o() constant base so the next call reads the next column
   ; of idct64_mul (.main_part2 later undoes all four advances at once)
   6271    add                  r5, 4*13
   6272    ITX_MULSUB_2W         3, 4, 8, 9, 10, 11, 12 ; t35,  t60
   6273    ITX_MULSUB_2W         2, 5, 8, 9, 10, 11, 12 ; t34a, t61a
   ; store this group's eight outputs and bump r4 to the next group slot
   6274    mova          [r4+64*0], m0
   6275    mova          [r4+64*7], m7
   6276    mova          [r4+64*1], m1
   6277    mova          [r4+64*6], m6
   6278    mova          [r4+64*3], m3
   6279    mova          [r4+64*4], m4
   6280    mova          [r4+64*2], m2
   6281    mova          [r4+64*5], m5
   6282    add                  r4, 64*8
   6283    ret
   ; idct64 odd-half, remaining steps: combines the four 8-vector groups
   ; written by .main_part1* (r4 now points past the last group; r6 scans
   ; the mirrored positions), updating all t32..t63 values in place.
   ; r4 and r6 walk toward each other, so the loop runs 4 iterations.
   6284 cglobal_label .main_part2
   ; the -16*13 displacements compensate for the four "add r5, 4*13"
   ; advances done in .main_part1c; "sub r5, 16*13" then restores r5
   6285    vpbroadcastd        m11, [o(pw_1567_3784  -16*13)]
   6286    vpbroadcastd        m12, [o(pw_m3784_1567 -16*13)]
   6287    lea                  r6, [r4+64*7]
   6288    vpbroadcastd        m17, [o(pw_m1567_m3784-16*13)]
   6289    vpbroadcastd        m18, [o(pw_2896_2896  -16*13)]
   6290    vpbroadcastd        m19, [o(pw_m2896_2896 -16*13)]
   6291    sub                  r5, 16*13
   6292 .main_part2_loop:
   6293    mova                 m0, [r4-64*32] ; t32a
   6294    mova                 m1, [r6-64*24] ; t39a
   6295    mova                 m2, [r6-64*32] ; t63a
   6296    mova                 m3, [r4-64*24] ; t56a
   6297    mova                 m4, [r4-64*16] ; t40a
   6298    mova                 m5, [r6-64* 8] ; t47a
   6299    mova                 m6, [r6-64*16] ; t55a
   6300    mova                 m7, [r4-64* 8] ; t48a
   6301    psubsw               m8, m0, m1 ; t39
   6302    paddsw               m0, m1     ; t32
   6303    psubsw               m1, m2, m3 ; t56
   6304    paddsw               m2, m3     ; t63
   6305    psubsw               m3, m5, m4 ; t40
   6306    paddsw               m5, m4     ; t47
   6307    psubsw               m4, m7, m6 ; t55
   6308    paddsw               m7, m6     ; t48
   6309    ITX_MULSUB_2W         1, 8, 6, 9, 10, 11, 12 ; t39a, t56a
   6310    ITX_MULSUB_2W         4, 3, 6, 9, 10, 12, 17 ; t40a, t55a
   6311    psubsw               m6, m2, m7 ; t48a
   6312    paddsw               m2, m7     ; t63a
   6313    psubsw               m7, m0, m5 ; t47a
   6314    paddsw               m0, m5     ; t32a
   6315    psubsw               m5, m8, m3 ; t55
   6316    paddsw               m8, m3     ; t56
   6317    psubsw               m3, m1, m4 ; t40
   6318    paddsw               m1, m4     ; t39
   ; final 2896-rotations (pw_2896_2896 / pw_m2896_2896)
   6319    ITX_MULSUB_2W         6, 7, 4, 9, 10, 18, 19 ; t47,  t48
   6320    ITX_MULSUB_2W         5, 3, 4, 9, 10, 18, 19 ; t40a, t55a
   ; write results back to the same stack slots they were read from
   6321    mova         [r6-64* 8], m2
   6322    mova         [r4-64*32], m0
   6323    mova         [r4-64* 8], m8
   6324    mova         [r6-64*32], m1
   6325    mova         [r6-64*24], m6
   6326    mova         [r4-64*16], m7
   6327    mova         [r4-64*24], m5
   6328    mova         [r6-64*16], m3
   6329    add                  r4, 64
   6330    sub                  r6, 64
   6331    cmp                  r4, r6
   6332    jb .main_part2_loop
   6333    ret
   6334 
   6335 cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob
   6336    lea                  r5, [o_base]
   6337    test               eobd, eobd
   6338    jz .dconly
   6339    PROLOGUE              0, 7, 30, 64*32, dst, stride, c, eob
   6340    vpbroadcastd        m23, [o(pw_2896x8)]
   6341 %undef cmp
   6342    cmp                eobd, 136
   6343    jb .fast
   6344    pmulhrsw             m0, m23, [cq+64* 1]
   6345    pmulhrsw             m1, m23, [cq+64*31]
   6346    pmulhrsw             m2, m23, [cq+64*17]
   6347    pmulhrsw             m3, m23, [cq+64*15]
   6348    vpbroadcastd        m10, [o(pd_2048)]
   6349    mov                  r4, rsp
   6350    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   6351    pmulhrsw             m0, m23, [cq+64* 7]
   6352    pmulhrsw             m1, m23, [cq+64*25]
   6353    pmulhrsw             m2, m23, [cq+64*23]
   6354    pmulhrsw             m3, m23, [cq+64* 9]
   6355    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   6356    pmulhrsw             m0, m23, [cq+64* 5]
   6357    pmulhrsw             m1, m23, [cq+64*27]
   6358    pmulhrsw             m2, m23, [cq+64*21]
   6359    pmulhrsw             m3, m23, [cq+64*11]
   6360    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   6361    pmulhrsw             m0, m23, [cq+64* 3]
   6362    pmulhrsw             m1, m23, [cq+64*29]
   6363    pmulhrsw             m2, m23, [cq+64*19]
   6364    pmulhrsw             m3, m23, [cq+64*13]
   6365    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   6366    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
   6367    pmulhrsw             m3, m23, [cq+64*24]
   6368    pmulhrsw             m1, m23, [cq+64* 8]
   6369    pmulhrsw             m2, m23, [cq+64*16]
   6370    pmulhrsw             m0, m23, [cq+64* 0]
   6371    pmulhrsw            m14, m23, [cq+64* 4]
   6372    pmulhrsw            m17, m23, [cq+64*28]
   6373    pmulhrsw            m16, m23, [cq+64*20]
   6374    pmulhrsw            m15, m23, [cq+64*12]
   6375    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
   6376    pmulhrsw            m22, m23, [cq+64* 2]
   6377    pmulhrsw            m29, m23, [cq+64*30]
   6378    pmulhrsw            m26, m23, [cq+64*18]
   6379    pmulhrsw            m25, m23, [cq+64*14]
   6380    pmulhrsw            m24, m23, [cq+64*10]
   6381    pmulhrsw            m27, m23, [cq+64*22]
   6382    pmulhrsw            m28, m23, [cq+64*26]
   6383    pmulhrsw            m23,      [cq+64* 6]
   6384    mova         [cq+64* 0], m14
   6385    mova         [cq+64* 1], m15
   6386    mova         [cq+64* 2], m16
   6387    mova         [cq+64* 3], m17
   6388    mova         [cq+64* 4], m18
   6389    mova         [cq+64* 5], m19
   6390    mova         [cq+64* 6], m20
   6391    mova         [cq+64* 7], m21
   6392    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
   6393    vpbroadcastd        m13, [o(pw_16384)]
   6394    call .pass1_end_part1
   6395    mova         [cq+64*16], m1
   6396    mova         [cq+64*17], m3
   6397    mova         [cq+64*18], m5
   6398    mova         [cq+64*19], m7
   6399    mova         [cq+64*24], m23
   6400    mova         [cq+64*25], m25
   6401    mova         [cq+64*26], m27
   6402    mova         [cq+64*27], m29
   6403    pmulhrsw            m23, m13, m0 ; a0
   6404    pmulhrsw            m25, m13, m2 ; a2
   6405    pmulhrsw            m27, m13, m4 ; a4
   6406    pmulhrsw            m29, m13, m6 ; a6
   6407    REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6
   6408    call .pass1_end_part2
   6409    mova         [cq+64*20], m15
   6410    mova         [cq+64*21], m17
   6411    mova         [cq+64*22], m19
   6412    mova         [cq+64*23], m21
   6413    mova         [cq+64*28], m1
   6414    mova         [cq+64*29], m3
   6415    mova         [cq+64*30], m5
   6416    mova         [cq+64*31], m7
   6417    REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6
   6418    REPX {pmulhrsw x, m13}, m0, m2, m4, m6     ; g0 g2 g4 g6
   6419    vinserti32x8        m3, m23, ym14, 1 ; a00 a01 c00 c01
   6420    vshufi32x4         m23, m14, q3232   ; a02 a03 c02 c03
   6421    vinserti32x8       m15, m22, ym0, 1  ; e00 e01 g00 g01
   6422    vshufi32x4         m22, m0, q3232    ; e02 e03 g02 g03
   6423    vinserti32x8        m1, m27, ym18, 1 ; a40 a41 c40 c41
   6424    vshufi32x4         m27, m18, q3232   ; a42 a43 c42 c43
   6425    vinserti32x8       m18, m26, ym4, 1  ; e40 e41 g40 g41
   6426    vshufi32x4         m26, m4, q3232    ; e42 e43 g42 g43
   6427    vinserti32x8       m14, m25, ym16, 1 ; a20 a21 c20 c21
   6428    vshufi32x4         m25, m16, q3232   ; a22 a23 c22 c23
   6429    vinserti32x8       m17, m24, ym2, 1  ; e20 e21 g20 g21
   6430    vshufi32x4         m24, m2, q3232    ; e22 e23 g22 g23
   6431    vinserti32x8       m19, m29, ym20, 1 ; a60 a61 c60 c61
   6432    vshufi32x4         m29, m20, q3232   ; a62 a63 c62 c63
   6433    vinserti32x8       m20, m28, ym6, 1  ; e60 e61 g60 g61
   6434    vshufi32x4         m28, m6, q3232    ; e62 e63 g62 g63
   6435    vshufi32x4          m2, m3, m15, q3131  ;  8
   6436    vshufi32x4          m0, m3, m15, q2020  ;  0
   6437    vshufi32x4          m6, m23, m22, q3131 ; 24
   6438    vshufi32x4          m4, m23, m22, q2020 ; 16
   6439    vshufi32x4          m3, m1, m18, q3131  ; 12
   6440    vshufi32x4          m1, m18, q2020      ;  4
   6441    vshufi32x4          m7, m27, m26, q3131 ; 28
   6442    vshufi32x4          m5, m27, m26, q2020 ; 20
   6443    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
   6444    vshufi32x4         m16, m14, m17, q3131 ; 10
   6445    vshufi32x4         m14, m17, q2020      ;  2
   6446    vshufi32x4         m17, m19, m20, q3131 ; 14
   6447    vshufi32x4         m15, m19, m20, q2020 ;  6
   6448    vshufi32x4         m20, m25, m24, q3131 ; 26
   6449    vshufi32x4         m18, m25, m24, q2020 ; 18
   6450    vshufi32x4         m21, m29, m28, q3131 ; 30
   6451    vshufi32x4         m19, m29, m28, q2020 ; 22
   6452    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
   6453    pmulhrsw           m22, m13, [cq+64*16] ; a1
   6454    pmulhrsw           m23, m13, [cq+64*20] ; c1
   6455    pmulhrsw           m24, m13, [cq+64*24] ; e1
   6456    pmulhrsw           m25, m13, [cq+64*28] ; g1
   6457    pmulhrsw           m26, m13, [cq+64*17] ; a3
   6458    pmulhrsw           m27, m13, [cq+64*21] ; c3
   6459    pmulhrsw           m28, m13, [cq+64*25] ; e3
   6460    pmulhrsw           m29, m13, [cq+64*29] ; g3
   6461    mova        [cq+64* 8], m14
   6462    mova        [cq+64* 9], m15
   6463    mova        [cq+64*10], m16
   6464    mova        [cq+64*11], m17
   6465    mova        [cq+64*12], m18
   6466    mova        [cq+64*13], m19
   6467    mova        [cq+64*14], m20
   6468    mova        [cq+64*15], m21
   6469    pmulhrsw           m14, m13, [cq+64*18] ; a5
   6470    pmulhrsw           m15, m13, [cq+64*22] ; c5
   6471    pmulhrsw           m16, m13, [cq+64*26] ; e5
   6472    pmulhrsw           m17, m13, [cq+64*30] ; g5
   6473    pmulhrsw           m18, m13, [cq+64*19] ; a7
   6474    pmulhrsw           m19, m13, [cq+64*23] ; c7
   6475    pmulhrsw           m20, m13, [cq+64*27] ; e7
   6476    pmulhrsw           m21, m13, [cq+64*31] ; g7
   6477    vinserti32x8        m8, m22, ym23, 1 ; a10 a11 c10 c11
   6478    vshufi32x4         m22, m23, q3232   ; a12 a13 c12 c13
   6479    vinserti32x8        m9, m24, ym25, 1 ; e10 e11 g10 g11
   6480    vshufi32x4         m24, m25, q3232   ; e12 e13 g12 g13
   6481    vinserti32x8       m23, m26, ym27, 1 ; a30 a31 c30 c31
   6482    vshufi32x4         m26, m27, q3232   ; a32 a33 c32 c33
   6483    vinserti32x8       m11, m28, ym29, 1 ; e30 e31 g30 g31
   6484    vshufi32x4         m28, m29, q3232   ; e32 e33 g32 g33
   6485    mova        [cq+64* 0], m0
   6486    mova        [cq+64* 1], m1
   6487    mova        [cq+64* 2], m2
   6488    mova        [cq+64* 3], m3
   6489    mova        [cq+64* 4], m4
   6490    mova        [cq+64* 5], m5
   6491    mova        [cq+64* 6], m6
   6492    mova        [cq+64* 7], m7
   6493    vinserti32x8       m12, m14, ym15, 1 ; a50 a51 c50 c51
   6494    vshufi32x4         m14, m15, q3232   ; a52 a53 c52 c53
   6495    vinserti32x8       m13, m16, ym17, 1 ; e50 e51 g50 g51
   6496    vshufi32x4         m16, m17, q3232   ; e52 e53 g52 g53
   6497    vinserti32x8       m25, m18, ym19, 1 ; a70 a71 c70 c71
   6498    vshufi32x4         m18, m19, q3232   ; a72 a73 c72 c73
   6499    vinserti32x8       m17, m20, ym21, 1 ; e70 e71 g70 g71
   6500    vshufi32x4         m20, m21, q3232   ; e72 e73 g72 g73
   6501    vshufi32x4         m27, m23, m11, q3131 ; 11 m27
   6502    vshufi32x4         m23, m11, q2020      ;  3 m23
   6503    vshufi32x4         m19, m26, m28, q3131 ; 27 m19
   6504    vshufi32x4         m15, m26, m28, q2020 ; 19 m15
   6505    vshufi32x4         m29, m25, m17, q3131 ; 15 m29
   6506    vshufi32x4         m25, m17, q2020      ;  7 m25
   6507    vshufi32x4         m21, m18, m20, q3131 ; 31 m21
   6508    vshufi32x4         m17, m18, m20, q2020 ; 23 m17
   6509    vshufi32x4         m20, m14, m16, q3131 ; 29 m20
   6510    vshufi32x4         m16, m14, m16, q2020 ; 21 m16
   6511    vshufi32x4         m18, m22, m24, q3131 ; 25 m18
   6512    vshufi32x4         m14, m22, m24, q2020 ; 17 m14
   6513    vshufi32x4         m26, m8, m9, q3131   ;  9 m26
   6514    vshufi32x4         m22, m8, m9, q2020   ;  1 m22
   6515    vshufi32x4         m28, m12, m13, q3131 ; 13 m28
   6516    vshufi32x4         m24, m12, m13, q2020 ;  5 m24
   6517    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
   6518    vpbroadcastd       m13, [o(pw_16384)]
   6519    pmulhrsw            m0, m13, [r4-64*21]
   6520    pmulhrsw            m1, m13, [r4-64*22]
   6521    pmulhrsw            m2, m13, [r4-64*23]
   6522    pmulhrsw            m3, m13, [r4-64*24]
   6523    pmulhrsw            m4, m13, [r4-64*25]
   6524    pmulhrsw            m5, m13, [r4-64*26]
   6525    pmulhrsw            m6, m13, [r4-64*27]
   6526    pmulhrsw            m7, m13, [r4-64*28]
   6527    mova        [cq+64*16], m14
   6528    mova        [cq+64*17], m15
   6529    mova        [cq+64*18], m16
   6530    mova        [cq+64*19], m17
   6531    mova        [cq+64*20], m18
   6532    mova        [cq+64*21], m19
   6533    mova        [cq+64*22], m20
   6534    mova        [cq+64*23], m21
   6535    pmulhrsw           m14, m13, [r4-64*12]
   6536    pmulhrsw           m15, m13, [r4-64*11]
   6537    pmulhrsw           m16, m13, [r4-64*10]
   6538    pmulhrsw           m17, m13, [r4-64* 9]
   6539    pmulhrsw           m18, m13, [r4-64* 8]
   6540    pmulhrsw           m19, m13, [r4-64* 7]
   6541    pmulhrsw           m20, m13, [r4-64* 6]
   6542    pmulhrsw           m21, m13, [r4-64* 5]
   6543    mova        [cq+64*24], m22
   6544    mova        [cq+64*25], m23
   6545    mova        [cq+64*26], m24
   6546    mova        [cq+64*27], m25
   6547    mova        [cq+64*28], m26
   6548    mova        [cq+64*29], m27
   6549    mova        [cq+64*30], m28
   6550    mova        [cq+64*31], m29
   6551    call .transpose_2x8x8_lo
   6552    mova        [r4-64*12], m1
   6553    mova        [r4-64*11], m3
   6554    mova        [r4-64*10], m5
   6555    mova        [r4-64* 9], m7
   6556    mova        [r4-64* 8], m15
   6557    mova        [r4-64* 7], m17
   6558    mova        [r4-64* 6], m19
   6559    mova        [r4-64* 5], m21
   6560    vinserti32x8       m22, m0, ym14, 1     ; f00 f01 h00 h01
   6561    vshufi32x4         m23, m0, m14, q3232  ; f02 f03 h02 h03
   6562    vinserti32x8       m24, m2, ym16, 1     ; f20 f21 h20 h21
   6563    vshufi32x4         m25, m2, m16, q3232  ; f22 f23 h22 h23
   6564    vinserti32x8       m26, m4, ym18, 1     ; f40 f41 h40 h41
   6565    vshufi32x4         m27, m4, m18, q3232  ; f42 f43 h42 h43
   6566    vinserti32x8       m28, m6, ym20, 1     ; f60 f61 h60 h61
   6567    vshufi32x4         m29, m6, m20, q3232  ; f62 f63 h62 h63
   6568    pmulhrsw            m0, m13, [r4-64*20]
   6569    pmulhrsw            m1, m13, [r4-64*19]
   6570    pmulhrsw            m2, m13, [r4-64*18]
   6571    pmulhrsw            m3, m13, [r4-64*17]
   6572    pmulhrsw            m4, m13, [r4-64*16]
   6573    pmulhrsw            m5, m13, [r4-64*15]
   6574    pmulhrsw            m6, m13, [r4-64*14]
   6575    pmulhrsw            m7, m13, [r4-64*13]
   6576    pmulhrsw           m14, m13, [r4-64*29]
   6577    pmulhrsw           m15, m13, [r4-64*30]
   6578    pmulhrsw           m16, m13, [r4-64*31]
   6579    pmulhrsw           m17, m13, [r4-64*32]
   6580    pmulhrsw           m18, m13, [r4-64*33]
   6581    pmulhrsw           m19, m13, [r4-64*34]
   6582    pmulhrsw           m20, m13, [r4-64*35]
   6583    pmulhrsw           m21, m13, [r4-64*36]
   6584    call .transpose_2x8x8_lo
   6585    mova       [r4-64*20], m1
   6586    mova       [r4-64*19], m3
   6587    mova       [r4-64*18], m5
   6588    mova       [r4-64*17], m7
   6589    mova       [r4-64*16], m15
   6590    mova       [r4-64*15], m17
   6591    mova       [r4-64*14], m19
   6592    mova       [r4-64*13], m21
   6593    vinserti32x8        m1, m4, ym18, 1     ; b40 b41 d40 d41
   6594    vshufi32x4          m5, m4, m18, q3232  ; b42 b43 d42 d43
   6595    vshufi32x4          m4, m0, m14, q3232  ; b02 b03 d02 d03
   6596    vinserti32x8        m0, ym14, 1         ; b00 b01 d00 d01
   6597    vinserti32x8       m14, m2, ym16, 1     ; b20 b21 d20 d21
   6598    vshufi32x4         m18, m2, m16, q3232  ; b22 b23 d22 d23
   6599    vinserti32x8       m15, m6, ym20, 1     ; b60 b61 d60 d61
   6600    vshufi32x4         m19, m6, m20, q3232  ; b62 b63 d62 d63
   6601    vshufi32x4          m2, m0, m22, q3131  ;  8
   6602    vshufi32x4          m0, m22, q2020      ;  0
   6603    vshufi32x4          m3, m1, m26, q3131  ; 12
   6604    vshufi32x4          m1, m26, q2020      ;  4
   6605    vshufi32x4          m6, m4, m23, q3131  ; 24
   6606    vshufi32x4          m4, m23, q2020      ; 16
   6607    vshufi32x4          m7, m5, m27, q3131  ; 28
   6608    vshufi32x4          m5, m27, q2020      ; 20
   6609    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
   6610    vshufi32x4         m16, m14, m24, q3131 ; 10
   6611    vshufi32x4         m14, m24, q2020      ;  2
   6612    vshufi32x4         m17, m15, m28, q3131 ; 14
   6613    vshufi32x4         m15, m28, q2020      ;  6
   6614    vshufi32x4         m20, m18, m25, q3131 ; 26
   6615    vshufi32x4         m18, m25, q2020      ; 18
   6616    vshufi32x4         m21, m19, m29, q3131 ; 30
   6617    vshufi32x4         m19, m29, q2020      ; 22
   6618    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
   6619    mova               m22, [r4-64*20]
   6620    mova               m26, [r4-64*16]
   6621    mova               m23, [r4-64*19]
   6622    mova               m27, [r4-64*15]
   6623    mova               m24, [r4-64*18]
   6624    mova               m28, [r4-64*14]
   6625    mova               m25, [r4-64*17]
   6626    mova               m29, [r4-64*13]
   6627    mova        [r4-64*20], m14
   6628    mova        [r4-64*19], m15
   6629    mova        [r4-64*18], m16
   6630    mova        [r4-64*17], m17
   6631    mova        [r4-64*16], m18
   6632    mova        [r4-64*15], m19
   6633    mova        [r4-64*14], m20
   6634    mova        [r4-64*13], m21
   6635    mova               m19, [r4-64*12]
   6636    mova               m11, [r4-64* 8]
   6637    mova               m20, [r4-64*11]
   6638    mova               m12, [r4-64* 7]
   6639    mova               m21, [r4-64*10]
   6640    mova                m8, [r4-64* 6]
   6641    mova                m9, [r4-64* 9]
   6642    mova               m18, [r4-64* 5]
   6643    vshufi32x4         m14, m22, m26, q3232 ; b12 b13 d12 d13
   6644    vinserti32x8       m22, ym26, 1         ; b10 b11 d10 d11
   6645    vshufi32x4         m15, m23, m27, q3232 ; b32 b33 d32 d33
   6646    vinserti32x8       m23, ym27, 1         ; b30 b31 d30 d31
   6647    vshufi32x4         m16, m24, m28, q3232 ; b52 b53 d52 d53
   6648    vinserti32x8       m24, ym28, 1         ; b50 b51 d50 d51
   6649    vshufi32x4         m17, m25, m29, q3232 ; b72 b73 d72 d73
   6650    vinserti32x8       m25, ym29, 1         ; b70 b71 d70 d71
   6651    vinserti32x8       m27, m19, ym11, 1    ; f10 f11 h10 h11
   6652    vshufi32x4         m19, m11, q3232      ; f12 f13 h12 h13
   6653    vinserti32x8       m28, m20, ym12, 1    ; f30 f31 h30 h31
   6654    vshufi32x4         m20, m12, q3232      ; f32 f33 h32 h33
   6655    vinserti32x8       m29, m21, ym8, 1     ; f50 f51 h50 h51
   6656    vshufi32x4         m21, m8, q3232       ; f52 f53 h52 h53
   6657    vinserti32x8        m8, m9, ym18, 1     ; f70 f71 h70 h71
   6658    vshufi32x4          m9, m18, q3232      ; f72 f73 h72 h73
   6659    vshufi32x4         m26, m22, m27, q3131 ;  9
   6660    vshufi32x4         m22, m27, q2020      ;  1
   6661    vshufi32x4         m27, m23, m28, q3131 ; 11
   6662    vshufi32x4         m23, m28, q2020      ;  3
   6663    vshufi32x4         m28, m24, m29, q3131 ; 13
   6664    vshufi32x4         m24, m29, q2020      ;  5
   6665    vshufi32x4         m29, m25, m8, q3131  ; 15
   6666    vshufi32x4         m25, m8, q2020       ;  7
   6667    vshufi32x4         m18, m14, m19, q3131 ; 25
   6668    vshufi32x4         m14, m19, q2020      ; 17
   6669    vshufi32x4         m19, m15, m20, q3131 ; 27
   6670    vshufi32x4         m15, m20, q2020      ; 19
   6671    vshufi32x4         m20, m16, m21, q3131 ; 29
   6672    vshufi32x4         m16, m21, q2020      ; 21
   6673    vshufi32x4         m21, m17, m9, q3131  ; 31
   6674    vshufi32x4         m17, m9, q2020       ; 23
   6675    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
   6676    jmp .end
.fast: ; bottom/right halves are zero
    ; Reduced pass 1 for small eob (threshold set at function entry, outside
    ; this chunk): only the top-left 16x16 coefficient block is non-zero, so
    ; the 64-point column transform collapses to 16-point "fast" kernels.
    ; NOTE(review): ym23 holds a pre-scale constant loaded before the branch
    ; (not visible here) — each coefficient load is pre-multiplied via
    ; vpmulhrsw. dup16_perm duplicates every input word in place, widening
    ; 16 words to 32 lanes for the half-width transforms.
    {evex}vpmulhrsw     ym8, ym23, [cq+64* 4]
    {evex}vpmulhrsw     xm1, xm23, [cq+64*12]
    mova                m28, [o(dup16_perm)]
    {evex}vpmulhrsw     ym7, ym23, [cq+64* 8]
          vpmulhrsw    ym22, ym23, [cq+64* 0]
    vpermb               m8, m28, m8
    vpermb              ym1, ym28, ym1
    vpermb               m7, m28, m7
    pmovzxwd             m9, ym22
    pslld                m9, 16          ; words of row 0 into high halves of dwords
    call m(idct_16x16_internal_8bpc).main_fast2
    ; even-odd rows 2/6/10/14 feed the idct32 odd half
    {evex}vpmulhrsw    ym21, ym23, [cq+64* 2]
    {evex}vpmulhrsw    xm15, xm23, [cq+64*14]
    {evex}vpmulhrsw    xm18, xm23, [cq+64*10]
    {evex}vpmulhrsw    ym14, ym23, [cq+64* 6]
    vpermb              m21, m28, m21
    punpcklwd          xm15, xm15
    vpermb             ym18, ym28, ym18
    vpermb              m14, m28, m14
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    ; odd rows 1/3/.../15 feed the idct64 odd half
          vpmulhrsw    ym22, ym23, [cq+64* 1]
    {evex}vpmulhrsw    xm29, xm23, [cq+64*15]
    {evex}vpmulhrsw    xm26, xm23, [cq+64* 9]
    {evex}vpmulhrsw    ym25, ym23, [cq+64* 7]
    {evex}vpmulhrsw    ym24, ym23, [cq+64* 5]
    {evex}vpmulhrsw    xm27, xm23, [cq+64*11]
    {evex}vpmulhrsw     xm8, xm23, [cq+64*13]
    {evex}vpmulhrsw    ym23,       [cq+64* 3]
    vpermb              m22, m28, m22
    punpcklwd          xm29, xm29
    vpermb             ym26, ym28, ym26
    vpermb              m25, m28, m25
    ; spill idct32 odd-half results; cq doubles as scratch from here on
    mova         [cq+64* 0], m14
    mova         [cq+64* 1], m15
    mova         [cq+64* 2], m16
    mova         [cq+64* 3], m17
    REPX {vpermb x, m28, x}, m24, m27, m23
    punpcklwd          xm28, xm8, xm8
    mova         [cq+64* 4], m18
    mova         [cq+64* 5], m19
    mova         [cq+64* 6], m20
    mova         [cq+64* 7], m21
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
    mov                  r4, rsp
    vpbroadcastd        m13, [o(pw_16384)]  ; pass-1 rounding constant (>>1 via pmulhrsw)
    mova         [r4+64*16], m4
    mova         [r4+64*17], m5
    mova         [r4+64*18], m6
    mova         [r4+64*19], m7
    mova         [r4+64*28], m26
    mova         [r4+64*29], m27
    mova         [r4+64*30], m28
    mova         [r4+64*31], m29
    call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
    mova         [r4+64*20], m22
    mova         [r4+64*21], m23
    mova         [r4+64*22], m24
    mova         [r4+64*23], m25
    mova         [r4+64*24], m26
    mova         [r4+64*25], m27
    mova         [r4+64*26], m28
    mova         [r4+64*27], m29
    ; pass 2 (rows), left half: .pass2_fast tail-jumps into the 32x16
    ; odd-half kernel, leaving the even half in m0-m7/m14-m21
    call .pass2_fast
    mova         [cq+64* 8], m14
    mova         [cq+64* 9], m15
    mova         [cq+64*10], m16
    mova         [cq+64*11], m17
    mova         [cq+64*12], m18
    mova         [cq+64*13], m19
    mova         [cq+64*14], m20
    mova         [cq+64*15], m21
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
    mova         [cq+64* 0], m0
    mova         [cq+64* 1], m1
    mova         [cq+64* 2], m2
    mova         [cq+64* 3], m3
    mova         [cq+64* 4], m4
    mova         [cq+64* 5], m5
    mova         [cq+64* 6], m6
    mova         [cq+64* 7], m7
    ; reload the spilled pass-1 rows for the right half, rescaling by m13
    pmulhrsw             m0, m13, [r4+64*16]
    pmulhrsw             m1, m13, [r4+64*17]
    pmulhrsw             m2, m13, [r4+64*18]
    pmulhrsw             m3, m13, [r4+64*19]
    pmulhrsw             m4, m13, [r4+64*20]
    pmulhrsw             m5, m13, [r4+64*21]
    pmulhrsw             m6, m13, [r4+64*22]
    pmulhrsw             m7, m13, [r4+64*23]
    mova         [cq+64*16], m14
    mova         [cq+64*17], m15
    mova         [cq+64*18], m16
    mova         [cq+64*19], m17
    mova         [cq+64*20], m18
    mova         [cq+64*21], m19
    mova         [cq+64*22], m20
    mova         [cq+64*23], m21
    pmulhrsw            m14, m13, [r4+64*24]
    pmulhrsw            m15, m13, [r4+64*25]
    pmulhrsw            m16, m13, [r4+64*26]
    pmulhrsw            m17, m13, [r4+64*27]
    pmulhrsw            m18, m13, [r4+64*28]
    pmulhrsw            m19, m13, [r4+64*29]
    pmulhrsw            m20, m13, [r4+64*30]
    pmulhrsw            m21, m13, [r4+64*31]
    mova         [cq+64*24], m22
    mova         [cq+64*25], m23
    mova         [cq+64*26], m24
    mova         [cq+64*27], m25
    mova         [cq+64*28], m26
    mova         [cq+64*29], m27
    mova         [cq+64*30], m28
    mova         [cq+64*31], m29
    call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
    ; pass 2, right half — falls through into .end below
    call .pass2_fast
    mova         [r4+64*16], m14
    mova         [r4+64*17], m15
    mova         [r4+64*18], m16
    mova         [r4+64*19], m17
    mova         [r4+64*20], m18
    mova         [r4+64*21], m19
    mova         [r4+64*22], m20
    mova         [r4+64*23], m21
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
.end:
    ; Final stage: butterfly the idct16 results (registers / [rsp]) against
    ; the idct32 odd-half results (in cq), round by pmulhrsw with pw_2048
    ; (i.e. >>4 with rounding), add to the destination pixels and store.
    ; m12 stays zero throughout and is also written back to cq to clear the
    ; coefficient buffer, as required after an inverse transform.
    vpbroadcastd        m13, [o(pw_2048)]
    lea                  r5, [strideq*3]
    pxor                m12, m12
    lea                  r3, [dstq+r5*8]
    lea                  r6, [strideq+r5] ; stride*4
    add                  r3, r6           ; dst+stride*28
%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi
    ; Produces output rows %3 (at dstq+%4) and 31-%3 (at r3+%5, walking
    ; upwards), i.e. rows are written symmetrically from both ends.
    mova                m11, [cq+64*(   %3)] ;  0
    mova                 m9, [cq+64*(31-%3)] ; 31
%if %3 >= 8
    ; second batch of idct16 rows was spilled to the stack in pass 2
    mova                m%1, [rsp+64*(%1+16)]
%endif
    mova                m10, [dstq+%4]
    paddsw               m8, m11, m9
    psubsw              m11, m9
    paddsw               m9, m%1, m%2
    psubsw              m%1, m%2
    punpcklbw           m%2, m10, m12       ; widen dst bytes to words
    punpckhbw           m10, m12
    pmulhrsw             m8, m13
    pmulhrsw             m9, m13
    paddw                m8, m%2
    paddw                m9, m10
    mova                m10, [r3+%5]
    pmulhrsw            m11, m13
    pmulhrsw            m%1, m13
    mova    [cq+64*(   %3)], m12            ; clear consumed coefficients
    mova    [cq+64*(31-%3)], m12
    punpcklbw           m%2, m10, m12
    punpckhbw           m10, m12
    packuswb             m8, m9             ; clamp back to 8-bit pixels
    paddw               m11, m%2
    paddw               m%1, m10
    packuswb            m11, m%1
    mova          [dstq+%4], m8
    mova          [r3  +%5], m11
%if %3 == 3 || %3 == 7 || %3 == 11
    ; advance by 4 rows from the top, retreat by 4 rows from the bottom
    add                dstq, r6
    sub                  r3, r6
%endif
%endmacro
    IDCT_64x32_END        0, 29,  0, strideq*0, r5
    IDCT_64x32_END        1, 28,  1, strideq*1, strideq*2
    IDCT_64x32_END        2, 27,  2, strideq*2, strideq*1
    IDCT_64x32_END        3, 26,  3, r5       , strideq*0
    IDCT_64x32_END        4, 25,  4, strideq*0, r5
    IDCT_64x32_END        5, 24,  5, strideq*1, strideq*2
    IDCT_64x32_END        6, 23,  6, strideq*2, strideq*1
    IDCT_64x32_END        7, 22,  7, r5       , strideq*0
    IDCT_64x32_END        0, 21,  8, strideq*0, r5
    IDCT_64x32_END        1, 20,  9, strideq*1, strideq*2
    IDCT_64x32_END        2, 19, 10, strideq*2, strideq*1
    IDCT_64x32_END        3, 18, 11, r5       , strideq*0
    IDCT_64x32_END        4, 17, 12, strideq*0, r5
    IDCT_64x32_END        5, 16, 13, strideq*1, strideq*2
    IDCT_64x32_END        6, 15, 14, strideq*2, strideq*1
    IDCT_64x32_END        7, 14, 15, r5       , strideq*0
    RET
ALIGN function_align
.dconly:
    ; DC-only shortcut (eob == 0): scale the DC coefficient by
    ; (181/256)^2 with rounding — 181/256 ~= 1/sqrt(2), applied once per
    ; rectangular dimension — then hand off to the shared 64-wide
    ; dconly2 helper with r3d = 32 output rows. eobd is stored over the
    ; DC word to clear the coefficient buffer.
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 32
    imul                r6d, 181
    add                 r6d, 128
    sar                 r6d, 8
    imul                r6d, 181
    add                 r6d, 128+256    ; +256 folds in the final +0.5 before >>1
    sar                 r6d, 8+1
    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2
ALIGN function_align
.pass1_end_part1:
%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64
    ; Combines one idct16/idct32 pair (m%1/m%2) with the matching idct64
    ; tail stored relative to r4, producing four of the eight symmetric
    ; idct64 outputs (a/d/e/h when %1 == %3, b/c/f/g otherwise); the two
    ; outputs not kept in registers are written back over the r4 slots.
%if %1 != %3
    mova                m%1, [cq+64*%1]
%endif
    mova                 m9, [r4+64*(%3-36)] ; idct64 32+n
    mova                m11, [r4+64*(-5-%3)] ; idct64 63-n
    psubsw               m8, m%1, m%2        ; idct32 31-n
    paddsw              m%1, m%2             ; idct32  0+n
%if %1 == %3
    psubsw              m%2, m8, m9   ; out 32+n e
    paddsw               m8, m9       ; out 31-n d
    psubsw               m9, m%1, m11 ; out 63-n h
    paddsw              m%1, m11      ; out  0+n a
%else
    paddsw              m%2, m8, m9   ; out 23-n c
    psubsw               m8, m9       ; out 40+n f
    paddsw               m9, m%1, m11 ; out  8+n b
    psubsw              m%1, m11      ; out 55-n g
%endif
    mova   [r4+64*(%3-36)], m8
    mova   [r4+64*(-5-%3)], m9
%endmacro
    IDCT_64x32_PASS1_END  0, 29,  0
    IDCT_64x32_PASS1_END  1, 28,  1
    IDCT_64x32_PASS1_END  2, 27,  2
    IDCT_64x32_PASS1_END  3, 26,  3
    IDCT_64x32_PASS1_END  4, 25,  4
    IDCT_64x32_PASS1_END  5, 24,  5
    IDCT_64x32_PASS1_END  6, 23,  6
    IDCT_64x32_PASS1_END  7, 22,  7
.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted)
    ; Word/dword/qword interleave ladder: 8x8 transpose of the high
    ; register group (m22-m29, stored in reverse order a..h).
    punpcklwd            m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3
    punpckhwd           m25, m24      ; e4 f4 e5 f5 e6 f6 e7 f7
    punpcklwd           m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3
    punpckhwd           m23, m22      ; g4 h4 g5 h5 g6 h6 g7 h7
    punpcklwd           m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd           m29, m28      ; a4 b4 a5 b5 a6 b6 a7 b7
    punpcklwd           m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3
    punpckhwd           m27, m26      ; c4 d4 c5 d5 c6 d6 c7 d7
    punpckldq           m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5
    punpckhdq           m29, m27      ; a6 b6 c6 d6 a7 b7 c7 d7
    punpckldq           m27, m8, m24  ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq            m8, m24      ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckhdq           m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq           m22, m28      ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckldq           m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5
    punpckhdq           m25, m23      ; e6 f6 g6 h6 e7 f7 g7 h7
    punpckhqdq          m23, m22, m27 ;  1 23
    punpcklqdq          m22, m27      ;  0 22
    punpckhqdq          m27, m26, m28 ;  5 27
    punpcklqdq          m26, m28      ;  4 26
    punpcklqdq          m28, m29, m25 ;  6 28
    punpckhqdq          m29, m25      ;  7 29
    punpckhqdq          m25, m24, m8  ;  3 25
    punpcklqdq          m24, m8       ;  2 24
.transpose_8x8:
    ; Standard in-register 8x8 word transpose of m0-m7 (m8 as scratch);
    ; falls through from .transpose_2x8x8_hi so both halves share one ret.
    punpckhwd            m8, m4, m5
    punpcklwd            m4, m5
    punpckhwd            m5, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m6, m7
    punpcklwd            m6, m7
    punpckhwd            m7, m2, m3
    punpcklwd            m2, m3
    punpckhdq            m3, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m4, m6
    punpckhdq            m4, m6
    punpckhdq            m6, m5, m7
    punpckldq            m5, m7
    punpckldq            m7, m8, m1
    punpckhdq            m8, m1
    punpckhqdq           m1, m0, m2
    punpcklqdq           m0, m2
    punpcklqdq           m2, m3, m4
    punpckhqdq           m3, m4
    punpcklqdq           m4, m5, m7
    punpckhqdq           m5, m7
    punpckhqdq           m7, m6, m8
    punpcklqdq           m6, m8
    ret
.pass1_end_part2:
    ; Second half of the pass-1 output combine: same butterfly as part 1
    ; but for coefficient slots 8-15 (idct32 partners m14-m21).
    IDCT_64x32_PASS1_END  0, 21,  8
    IDCT_64x32_PASS1_END  1, 20,  9
    IDCT_64x32_PASS1_END  2, 19, 10
    IDCT_64x32_PASS1_END  3, 18, 11
    IDCT_64x32_PASS1_END  4, 17, 12
    IDCT_64x32_PASS1_END  5, 16, 13
    IDCT_64x32_PASS1_END  6, 15, 14
    IDCT_64x32_PASS1_END  7, 14, 15
.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21
    ; Two 8x8 word transposes: m0-m7 (rows stored in reverse) first,
    ; then m14-m21, both via the interleave ladder with m8 as scratch.
    punpcklwd            m8, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m1, m0
    punpckhwd            m1, m0
    punpcklwd            m0, m7, m6
    punpckhwd            m7, m6
    punpcklwd            m6, m5, m4
    punpckhwd            m5, m4
    punpckldq            m4, m7, m5
    punpckhdq            m7, m5
    punpckldq            m5, m8, m2
    punpckhdq            m8, m2
    punpckhdq            m2, m0, m6
    punpckldq            m0, m6
    punpckldq            m6, m3, m1
    punpckhdq            m3, m1
    punpckhqdq           m1, m0, m5
    punpcklqdq           m0, m5
    punpckhqdq           m5, m4, m6
    punpcklqdq           m4, m6
    punpcklqdq           m6, m7, m3
    punpckhqdq           m7, m3
    punpckhqdq           m3, m2, m8
    punpcklqdq           m2, m8
    punpckhwd            m8, m18, m19
    punpcklwd           m18, m19
    punpckhwd           m19, m14, m15
    punpcklwd           m14, m15
    punpckhwd           m15, m20, m21
    punpcklwd           m20, m21
    punpckhwd           m21, m16, m17
    punpcklwd           m16, m17
    punpckhdq           m17, m14, m16
    punpckldq           m14, m16
    punpckldq           m16, m18, m20
    punpckhdq           m18, m20
    punpckhdq           m20, m19, m21
    punpckldq           m19, m21
    punpckldq           m21, m8, m15
    punpckhdq            m8, m15
    punpckhqdq          m15, m14, m16
    punpcklqdq          m14, m16
    punpcklqdq          m16, m17, m18
    punpckhqdq          m17, m18
    punpcklqdq          m18, m19, m21
    punpckhqdq          m19, m21
    punpckhqdq          m21, m20, m8
    punpcklqdq          m20, m8
    ret
.pass2_fast:
    ; Deinterleave the transposed pass-1 rows: vshufi32x4 q2020/q3131
    ; picks the even/odd 128-bit lanes of each register pair, sorting the
    ; rows into transform-input order (labels on the right are the row
    ; numbers). Tail-jumps into the shared 32x16 odd-half kernel, which
    ; supplies the ret.
    vshufi32x4          m24, m9, m15, q3131  ;  5
    vshufi32x4          m22, m9, m15, q2020  ;  1
    vshufi32x4          m15, m1, m16, q3131  ;  6
    vshufi32x4          m14, m1, m16, q2020  ;  2
    vshufi32x4           m1, m0, m3, q3131   ;  4
    vshufi32x4           m0, m3, q2020       ;  0
    vshufi32x4           m3, m8, m2, q3131   ; 12
    vshufi32x4           m2, m8, m2, q2020   ;  8
    vshufi32x4          m25, m11, m17, q3131 ;  7
    vshufi32x4          m23, m11, m17, q2020 ;  3
    vshufi32x4          m17, m5, m19, q3131  ; 14
    vshufi32x4          m16, m5, m19, q2020  ; 10
    vshufi32x4          m29, m6, m20, q3131  ; 15
    vshufi32x4          m27, m6, m20, q2020  ; 11
    vshufi32x4          m28, m4, m18, q3131  ; 13
    vshufi32x4          m26, m4, m18, q2020  ;  9
    jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
   7032 
; void inv_txfm_add_dct_dct_64x64_8bpc(pixel *dst, ptrdiff_t stride,
;                                      coef *c, int eob)
; 64x64 inverse DCT-DCT + add, 8-bit. eob == 0 takes the DC-only path;
; eob < 136 takes .fast (only the top-left 16x16 coefficient block is
; non-zero); otherwise the full two-pass transform below runs, using a
; 64*96-byte-per-slot stack scratch area (r4 = rsp).
cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob
    lea                  r5, [o_base]
    test               eobd, eobd
    jz .dconly
    PROLOGUE              0, 7, 30, 64*96, dst, stride, c, eob
%undef cmp
    cmp                eobd, 136
    jb .fast
    ; pass 1, idct64 odd part: four calls, each consuming one group of
    ; four odd coefficient rows (1/31/17/15, 7/25/23/9, 5/27/21/11,
    ; 3/29/19/13), accumulated into the r4 scratch by main_part1/2
    mova                 m0, [cq+64* 1]
    mova                 m1, [cq+64*31]
    mova                 m2, [cq+64*17]
    mova                 m3, [cq+64*15]
    vpbroadcastd        m10, [o(pd_2048)]
    mov                  r4, rsp
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    mova                 m0, [cq+64* 7]
    mova                 m1, [cq+64*25]
    mova                 m2, [cq+64*23]
    mova                 m3, [cq+64* 9]
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    mova                 m0, [cq+64* 5]
    mova                 m1, [cq+64*27]
    mova                 m2, [cq+64*21]
    mova                 m3, [cq+64*11]
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    mova                 m0, [cq+64* 3]
    mova                 m1, [cq+64*29]
    mova                 m2, [cq+64*19]
    mova                 m3, [cq+64*13]
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
    ; pass 1, idct16 even part on rows 0/8/16/24 + 4/12/20/28
    mova                 m0, [cq+64* 0]
    mova                 m1, [cq+64* 8]
    mova                 m2, [cq+64*16]
    mova                 m3, [cq+64*24]
    mova                m14, [cq+64* 4]
    mova                m15, [cq+64*12]
    mova                m16, [cq+64*20]
    mova                m17, [cq+64*28]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    ; pass 1, idct32 odd part on rows 2/6/.../30
    mova                m22, [cq+64* 2]
    mova                m29, [cq+64*30]
    mova                m26, [cq+64*18]
    mova                m25, [cq+64*14]
    mova                m24, [cq+64*10]
    mova                m27, [cq+64*22]
    mova                m28, [cq+64*26]
    mova                m23, [cq+64* 6]
    mova         [cq+64* 0], m14
    mova         [cq+64* 1], m15
    mova         [cq+64* 2], m16
    mova         [cq+64* 3], m17
    mova         [cq+64* 4], m18
    mova         [cq+64* 5], m19
    mova         [cq+64* 6], m20
    mova         [cq+64* 7], m21
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
    vpbroadcastd        m13, [o(pw_8192)]   ; pass-1 rounding (>>2 via pmulhrsw)
    call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1
    mova         [r4+64*36], m1
    mova         [r4+64*37], m3
    mova         [r4+64*38], m5
    mova         [r4+64*39], m7
    mova         [r4+64*44], m23
    mova         [r4+64*45], m25
    mova         [r4+64*46], m27
    mova         [r4+64*47], m29
    pmulhrsw            m23, m13, m0 ; a0
    pmulhrsw            m25, m13, m2 ; a2
    pmulhrsw            m27, m13, m4 ; a4
    pmulhrsw            m29, m13, m6 ; a6
    call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2
    ; pass 2, upper 32 output rows: results land at r6 (scratch), with
    ; r4 advanced past the pass-1 spill area
    lea                  r6, [r4-64*4]
    add                  r4, 64*28
    call .pass2_end
    ; pass 2, lower 32 output rows: reload the remaining pass-1 halves,
    ; transpose, and run .pass2_end again writing into cq (r6 = cq)
    mov                  r4, rsp
    mova                 m0, [r4+64*23]
    mova                 m1, [r4+64*22]
    mova                 m2, [r4+64*21]
    mova                 m3, [r4+64*20]
    mova                 m4, [r4+64*19]
    mova                 m5, [r4+64*18]
    mova                 m6, [r4+64*17]
    mova                 m7, [r4+64*16]
    mova                m22, [r4+64*15]
    mova                m23, [r4+64*14]
    mova                m24, [r4+64*13]
    mova                m25, [r4+64*12]
    mova                m26, [r4+64*11]
    mova                m27, [r4+64*10]
    mova                m28, [r4+64* 9]
    mova                m29, [r4+64* 8]
    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi
    vpbroadcastd        m13, [o(pw_8192)]
    mova         [r4+64* 8], m1
    mova         [r4+64* 9], m3
    mova         [r4+64*10], m5
    mova         [r4+64*11], m7
    mova         [r4+64*16], m23
    mova         [r4+64*17], m25
    mova         [r4+64*18], m27
    mova         [r4+64*19], m29
    pmulhrsw            m23, m13, m0 ; b0
    pmulhrsw            m25, m13, m2 ; b2
    pmulhrsw            m27, m13, m4 ; b4
    pmulhrsw            m29, m13, m6 ; b6
    mova                 m0, [r4+64*31]
    mova                 m1, [r4+64*30]
    mova                 m2, [r4+64*29]
    mova                 m3, [r4+64*28]
    mova                 m4, [r4+64*27]
    mova                 m5, [r4+64*26]
    mova                 m6, [r4+64*25]
    mova                 m7, [r4+64*24]
    mova                m14, [r4+64* 7]
    mova                m15, [r4+64* 6]
    mova                m16, [r4+64* 5]
    mova                m17, [r4+64* 4]
    mova                m18, [r4+64* 3]
    mova                m19, [r4+64* 2]
    mova                m20, [r4+64* 1]
    mova                m21, [r4+64* 0]
    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo
    mov                  r6, cq
    call .pass2_end
    jmp .end
.fast: ; bottom/right halves are zero
    ; eob < 136: only the top-left 16x16 coefficient block is non-zero.
    ; dup16_perm duplicates each input word so the 16-point fast kernels
    ; can stand in for the full 64-point column transform.
    mova                m28, [o(dup16_perm)]
    pmovzxwd             m9,       [cq+64* 0]
    vpermb               m8, m28,  [cq+64* 4]
    vpermb              ym1, ym28, [cq+64*12]
    vpermb               m7, m28,  [cq+64* 8]
    pslld                m9, 16          ; row-0 words into high dword halves
    call m(idct_16x16_internal_8bpc).main_fast2
    vpermb              m21, m28,  [cq+64* 2]
    vpermb             ym15, ym28, [cq+64*14]
    vpermb             ym18, ym28, [cq+64*10]
    vpermb              m14, m28,  [cq+64* 6]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    ; odd rows 1..15 feed the idct64 odd half
    vpermb              m22, m28,  [cq+64* 1]
    vpermb             ym29, ym28, [cq+64*15]
    vpermb             ym26, ym28, [cq+64* 9]
    vpermb              m25, m28,  [cq+64* 7]
    vpermb              m24, m28,  [cq+64* 5]
    vpermb             ym27, ym28, [cq+64*11]
    vpermb              m23, m28,  [cq+64* 3]
    vpermb             ym28, ym28, [cq+64*13]
    mova         [cq+64* 0], m14
    mova         [cq+64* 1], m15
    mova         [cq+64* 2], m16
    mova         [cq+64* 3], m17
    mova         [cq+64* 4], m18
    mova         [cq+64* 5], m19
    mova         [cq+64* 6], m20
    mova         [cq+64* 7], m21
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
    vpbroadcastd        m13, [o(pw_8192)]   ; pass-1 rounding constant
    mova         [cq+64*16], m4
    mova         [cq+64*17], m5
    mova         [cq+64*18], m6
    mova         [cq+64*19], m7
    mova         [cq+64*28], m26
    mova         [cq+64*29], m27
    mova         [cq+64*30], m28
    mova         [cq+64*31], m29
    call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
    mova         [cq+64*20], m22
    mova         [cq+64*21], m23
    mova         [cq+64*22], m24
    mova         [cq+64*23], m25
    mova         [cq+64*24], m26
    mova         [cq+64*25], m27
    mova         [cq+64*26], m28
    mova         [cq+64*27], m29
    ; pass 2, left half: results go to the stack scratch (r4/r3)
    lea                  r4, [rsp+64*64]
    lea                  r3, [rsp+64*32]
    call .pass2_fast
    ; pass 2, right half: rescale the spilled pass-1 rows and rerun,
    ; this time writing results over cq (r3 = cq)
    pmulhrsw             m0, m13, [cq+64*16]
    pmulhrsw             m1, m13, [cq+64*17]
    pmulhrsw             m2, m13, [cq+64*18]
    pmulhrsw             m3, m13, [cq+64*19]
    pmulhrsw             m4, m13, [cq+64*20]
    pmulhrsw             m5, m13, [cq+64*21]
    pmulhrsw             m6, m13, [cq+64*22]
    pmulhrsw             m7, m13, [cq+64*23]
    pmulhrsw            m14, m13, [cq+64*24]
    pmulhrsw            m15, m13, [cq+64*25]
    pmulhrsw            m16, m13, [cq+64*26]
    pmulhrsw            m17, m13, [cq+64*27]
    pmulhrsw            m18, m13, [cq+64*28]
    pmulhrsw            m19, m13, [cq+64*29]
    pmulhrsw            m20, m13, [cq+64*30]
    pmulhrsw            m21, m13, [cq+64*31]
    call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
    mov                  r4, rsp
    mov                  r3, cq
    call .pass2_fast
.end:
    ; Final combine/store loop. Each iteration folds the idct16, idct32
    ; and idct64 partial results (split into lo/hi register halves spread
    ; across rsp scratch and cq) into four symmetric output rows
    ; (0+n, 31-n, 32+n, 63-n), rounds by pmulhrsw pw_2048 (>>4 with
    ; rounding), adds to dst and stores. m16 stays zero and is written
    ; back to cq to clear the coefficient buffer as rows are consumed.
    vpbroadcastd        m17, [o(pw_2048)]
    lea                  r5, [strideq*8]
    mov                  r3, dstq
    pxor                m16, m16
    sub                  r4, 64*5 ; rsp+64*31
    mov                  r6, rsp
.end_loop:
    mova                 m2, [r6+64*32] ; idct16 0+n  lo
    mova                 m7, [r6+64*48] ; idct32 31-n lo
    mova                 m6, [cq+64* 0] ; idct16 0+n  hi
    mova                 m0, [cq+64*16] ; idct32 31-n hi
    mova                 m4, [r4+64*64] ; idct64 63-n lo
    mova                 m1, [r4+64* 0] ; idct64 63-n hi
    mova                 m5, [r6+64*64] ; idct64 32+n lo
    mova                 m8, [r6+64* 0] ; idct64 32+n hi
    sub                  r3, strideq    ; r3 walks upwards from the bottom rows
    paddsw               m3, m2, m7     ; idct32  0+n lo
    mova                m12, [dstq+r5*0]
    psubsw               m2, m7         ; idct32 31-n lo
    mova                m15, [r3  +r5*8]
    paddsw               m7, m6, m0     ; idct32  0+n hi
    mova                m13, [r3  +r5*4]
    psubsw               m6, m0         ; idct32 31-n hi
    mova                m14, [dstq+r5*4]
    paddsw               m0, m3, m4     ; out  0+n lo
    add                  r6, 64
    psubsw               m3, m4         ; out 63-n lo
    sub                  r4, 64
    paddsw               m4, m7, m1     ; out  0+n hi
    mova         [cq+64* 0], m16
    psubsw               m7, m1         ; out 63-n hi
    mova         [cq+64*16], m16
    paddsw               m1, m2, m5     ; out 31-n lo
    add                  cq, 64
    psubsw               m2, m5         ; out 32+n lo
    paddsw               m5, m6, m8     ; out 31-n hi
    psubsw               m6, m8         ; out 32+n hi
    ; round each output row and widen the dst pixels for the add
    pmulhrsw             m0, m17
    punpcklbw            m8, m12, m16
    pmulhrsw             m4, m17
    punpckhbw           m12, m16
    pmulhrsw             m3, m17
    punpcklbw           m11, m15, m16
    pmulhrsw             m7, m17
    punpckhbw           m15, m16
    pmulhrsw             m1, m17
    punpcklbw            m9, m13, m16
    pmulhrsw             m5, m17
    punpckhbw           m13, m16
    pmulhrsw             m2, m17
    punpcklbw           m10, m14, m16
    pmulhrsw             m6, m17
    punpckhbw           m14, m16
    paddw                m0, m8
    paddw                m4, m12
    packuswb             m0, m4
    paddw                m3, m11
    paddw                m7, m15
    packuswb             m3, m7
    paddw                m1, m9
    paddw                m5, m13
    packuswb             m1, m5
    paddw                m2, m10
    paddw                m6, m14
    packuswb             m2, m6
    mova        [dstq+r5*0], m0
    mova        [r3  +r5*8], m3
    mova        [r3  +r5*4], m1
    mova        [dstq+r5*4], m2
    add                dstq, strideq
    cmp                  r6, r4         ; stop when lo/hi scratch cursors meet
    jb .end_loop
    RET
.dconly:
    ; DC-only shortcut (eob == 0): square transform needs no 1/sqrt(2)
    ; prescale, so just set r3d = 64 output rows and reuse the shared
    ; 64-wide dconly helper. eobd overwrites the DC word to clear cq.
    movsx               r6d, word [cq]
    mov                [cq], eobd
    or                  r3d, 64
    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
   7309 ALIGN function_align
   7310 .pass2_end:
   7311    REPX  {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6
   7312    mova         [r4+64*20], m1
   7313    mova         [r4+64*21], m3
   7314    mova         [r4+64*22], m5
   7315    mova         [r4+64*23], m7
   7316    vinserti32x8         m1, m23, ym14, 1    ; a00 a01 c00 c01
   7317    vshufi32x4           m3, m23, m14, q3232 ; a02 a03 c02 c03
   7318    vinserti32x8         m5, m22, ym0, 1     ; e00 e01 g00 g01
   7319    vshufi32x4          m14, m22, m0, q3232  ; e02 e03 g02 g03
   7320    mova         [r4+64*12], m15
   7321    mova         [r4+64*13], m17
   7322    mova         [r4+64*14], m19
   7323    mova         [r4+64*15], m21
   7324    vinserti32x8        m15, m27, ym18, 1    ; a40 a41 c40 c41
   7325    vshufi32x4          m17, m27, m18, q3232 ; a42 a43 c42 c43
   7326    vinserti32x8        m18, m26, ym4, 1     ; e40 e41 g40 g41
   7327    vshufi32x4          m19, m26, m4, q3232  ; e42 e43 g42 g43
   7328    vinserti32x8        m22, m25, ym16, 1    ; a20 a21 c20 c21
   7329    vshufi32x4          m26, m25, m16, q3232 ; a22 a23 c22 c23
   7330    vinserti32x8        m25, m24, ym2, 1     ; e20 e21 g20 g21
   7331    vshufi32x4          m27, m24, m2, q3232  ; e22 e23 g22 g23
   7332    vinserti32x8        m23, m29, ym20, 1    ; a60 a61 c60 c61
   7333    vshufi32x4          m29, m20, q3232      ; a62 a63 c62 c63
   7334    vshufi32x4          m13, m28, m6, q3232  ; e62 e63 g62 g63
   7335    vinserti32x8        m28, ym6, 1          ; e60 e61 g60 g61
   7336    vshufi32x4           m0, m1, m5, q2020   ;  0
   7337    vshufi32x4           m1, m5, q3131       ;  8
   7338    vshufi32x4           m2, m3, m14, q2020  ; 16
   7339    vshufi32x4           m3, m14, q3131      ; 24
   7340    vshufi32x4          m14, m15, m18, q2020 ;  4
   7341    vshufi32x4          m15, m18, q3131      ; 12
   7342    vshufi32x4          m16, m17, m19, q2020 ; 20
   7343    vshufi32x4          m17, m19, q3131      ; 28
   7344    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
   7345    vshufi32x4          m24, m22, m25, q3131 ; 10
   7346    vshufi32x4          m22, m25, q2020      ;  2
   7347    vshufi32x4          m25, m23, m28, q3131 ; 14
   7348    vshufi32x4          m23, m28, q2020      ;  6
   7349    vshufi32x4          m28, m26, m27, q3131 ; 26
   7350    vshufi32x4          m26, m27, q2020      ; 18
   7351    vshufi32x4          m27, m29, m13, q2020 ; 22
   7352    vshufi32x4          m29, m13, q3131      ; 30
   7353    mova         [r6+64* 0], m0
   7354    mova         [r6+64* 1], m1
   7355    mova         [r6+64* 2], m2
   7356    mova         [r6+64* 3], m3
   7357    mova         [r6+64* 4], m4
   7358    mova         [r6+64* 5], m5
   7359    mova         [r6+64* 6], m6
   7360    mova         [r6+64* 7], m7
   7361    mova         [r6+64* 8], m14
   7362    mova         [r6+64* 9], m15
   7363    mova         [r6+64*10], m16
   7364    mova         [r6+64*11], m17
   7365    mova         [r6+64*12], m18
   7366    mova         [r6+64*13], m19
   7367    mova         [r6+64*14], m20
   7368    mova         [r6+64*15], m21
   7369    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
   7370    vpbroadcastd        m13, [o(pw_8192)]
   7371    mova         [r6+64*16], m29
   7372    mova         [r6+64*17], m28
   7373    mova         [r6+64*18], m27
   7374    mova         [r6+64*19], m26
   7375    mova         [r6+64*20], m25
   7376    mova         [r6+64*21], m24
   7377    mova         [r6+64*22], m23
   7378    mova         [r6+64*23], m22
   7379    mova         [r6+64*24], m21
   7380    mova         [r6+64*25], m20
   7381    mova         [r6+64*26], m19
   7382    mova         [r6+64*27], m18
   7383    mova         [r6+64*28], m17
   7384    mova         [r6+64*29], m16
   7385    mova         [r6+64*30], m15
   7386    mova         [r6+64*31], m14
   7387    pmulhrsw            m15, m13, [r4+64* 8] ;  1  9 17 25
   7388    pmulhrsw            m16, m13, [r4+64*12]
   7389    pmulhrsw            m17, m13, [r4+64*16]
   7390    pmulhrsw            m18, m13, [r4+64*20]
   7391    pmulhrsw            m19, m13, [r4+64*11] ;  7 15 23 31
   7392    pmulhrsw            m20, m13, [r4+64*15]
   7393    pmulhrsw            m21, m13, [r4+64*19]
   7394    pmulhrsw            m22, m13, [r4+64*23]
   7395    vinserti32x8        m14, m15, ym16, 1 ; a1  a9  c1  c9
   7396    vshufi32x4          m15, m16, q3232   ; a17 a25 c17 c25
   7397    vinserti32x8        m16, m17, ym18, 1 ; e1  e9  g1  g9
   7398    vshufi32x4          m17, m18, q3232   ; e17 e25 g17 g25
   7399    pmulhrsw            m23, m13, [r4+64*10] ;  5 13 21 29
   7400    pmulhrsw            m24, m13, [r4+64*14]
   7401    pmulhrsw            m25, m13, [r4+64*18]
   7402    pmulhrsw            m26, m13, [r4+64*22]
   7403    vinserti32x8        m18, m19, ym20, 1 ; a7  a15 c7  c15
   7404    vshufi32x4          m19, m20, q3232   ; a23 a31 c23 c31
   7405    vinserti32x8        m20, m21, ym22, 1 ; e7  e15 g7  g15
   7406    vshufi32x4          m21, m22, q3232   ; e23 e31 g23 g31
   7407    pmulhrsw            m27, m13, [r4+64* 9] ;  3 11 19 27
   7408    pmulhrsw            m28, m13, [r4+64*13]
   7409    pmulhrsw            m29, m13, [r4+64*17]
   7410    pmulhrsw            m13,      [r4+64*21]
   7411    vshufi32x4           m0, m14, m16, q2020 ;  1
   7412    vshufi32x4           m1, m19, m21, q3131 ; 31
   7413    vshufi32x4           m2, m15, m17, q2020 ; 17
   7414    vshufi32x4           m3, m18, m20, q3131 ; 15
   7415    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   7416    vshufi32x4           m0, m18, m20, q2020 ;  7
   7417    vshufi32x4           m1, m15, m17, q3131 ; 25
   7418    vshufi32x4           m2, m19, m21, q2020 ; 23
   7419    vshufi32x4           m3, m14, m16, q3131 ;  9
   7420    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   7421    vinserti32x8        m22, m23, ym24, 1 ; a5  a13 c5  c13
   7422    vshufi32x4          m23, m24, q3232   ; a21 a29 c21 c29
   7423    vinserti32x8        m24, m25, ym26, 1 ; e5  e13 g5  g13
   7424    vshufi32x4          m25, m26, q3232   ; e21 e29 g21 g29
   7425    vinserti32x8        m26, m27, ym28, 1 ; a3  a11 c3  c11
   7426    vshufi32x4          m27, m28, q3232   ; a19 a27 c19 c27
   7427    vinserti32x8        m28, m29, ym13, 1 ; e3  e11 g3  g11
    vshufi32x4          m29, m13, q3232   ; e19 e27 g19 g27
   7429    vshufi32x4           m0, m22, m24, q2020 ;  5
   7430    vshufi32x4           m1, m27, m29, q3131 ; 27
   7431    vshufi32x4           m2, m23, m25, q2020 ; 21
   7432    vshufi32x4           m3, m26, m28, q3131 ; 11
   7433    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   7434    vshufi32x4           m0, m26, m28, q2020 ;  3
   7435    vshufi32x4           m1, m23, m25, q3131 ; 29
   7436    vshufi32x4           m2, m27, m29, q2020 ; 19
   7437    vshufi32x4           m3, m22, m24, q3131 ; 13
   7438    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   7439    jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
   7440 ALIGN function_align
.pass2_fast:
    ; Fast second pass over the low-frequency coefficient subset.
    ; NOTE(review): appears to be the shared column-transform tail built from
    ; the 32x64 / 32x16 / 32x32 helper routines -- confirm against the caller,
    ; which must provide r3 (scratch buffer) and the transposed rows in
    ; m0-m20 as consumed below.
    ;
    ; Deinterleave the transposed input: each source register holds two rows
    ; split across alternating 128-bit lanes; vshufi32x4 with q2020 (even
    ; lanes) / q3131 (odd lanes) extracts one full row per register.  The
    ; resulting row index is noted in the trailing comment of each line.
    vshufi32x4          m23, m1, m16, q3131  ;  6
    vshufi32x4          m22, m1, m16, q2020  ;  2
    vshufi32x4          m14, m0, m3, q3131   ;  4
    vshufi32x4          m26, m0, m3, q2020   ;  0
    vshufi32x4          m28, m9, m15, q3131  ;  5
    vshufi32x4           m0, m9, m15, q2020  ;  1
    vshufi32x4          m16, m11, m17, q3131 ;  7
    vshufi32x4          m29, m11, m17, q2020 ;  3
    vshufi32x4          m15, m8, m2, q3131   ; 12
    vshufi32x4          m27, m8, m2, q2020   ;  8
    vshufi32x4          m25, m5, m19, q3131  ; 14
    vshufi32x4          m24, m5, m19, q2020  ; 10
    vshufi32x4           m3, m6, m20, q3131  ; 15
    vshufi32x4          m19, m6, m20, q2020  ; 11
    vshufi32x4          m17, m4, m18, q3131  ; 13
    vshufi32x4          m18, m4, m18, q2020  ;  9
    ; Odd rows, processed in pairs passed via m0/m3:
    ; (1,15), then (7,9), (5,11), (3,13).
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova                 m0, m16             ; row  7
    mova                 m3, m18             ; row  9
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova                 m0, m28             ; row  5
    mova                 m3, m19             ; row 11
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    mova                 m0, m29             ; row  3
    mova                 m3, m17             ; row 13
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
    ; Even rows: feed rows 0 and 8 into the 32-point even-half helpers.
    mova                 m0, m26             ; row  0
    mova                 m1, m27             ; row  8
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
    ; First half of the outputs (0-15) to the scratch buffer at r3, in
    ; ascending register order (m0-m7, then m14-m21).
    mova         [r3+64* 0], m0
    mova         [r3+64* 1], m1
    mova         [r3+64* 2], m2
    mova         [r3+64* 3], m3
    mova         [r3+64* 4], m4
    mova         [r3+64* 5], m5
    mova         [r3+64* 6], m6
    mova         [r3+64* 7], m7
    mova         [r3+64* 8], m14
    mova         [r3+64* 9], m15
    mova         [r3+64*10], m16
    mova         [r3+64*11], m17
    mova         [r3+64*12], m18
    mova         [r3+64*13], m19
    mova         [r3+64*14], m20
    mova         [r3+64*15], m21
    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
    ; Second half (16-31): the helper returns results in descending register
    ; order (m29 down to m14), so the stores mirror the first half.
    mova         [r3+64*16], m29
    mova         [r3+64*17], m28
    mova         [r3+64*18], m27
    mova         [r3+64*19], m26
    mova         [r3+64*20], m25
    mova         [r3+64*21], m24
    mova         [r3+64*22], m23
    mova         [r3+64*23], m22
    mova         [r3+64*24], m21
    mova         [r3+64*25], m20
    mova         [r3+64*26], m19
    mova         [r3+64*27], m18
    mova         [r3+64*28], m17
    mova         [r3+64*29], m16
    mova         [r3+64*30], m15
    mova         [r3+64*31], m14
    ret
   7506 
   7507 %endif ; ARCH_X86_64