tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

itx16_avx512.asm (233313B)


      1 ; Copyright © 2022-2023, VideoLAN and dav1d authors
      2 ; Copyright © 2022-2023, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 SECTION_RODATA 64
     32 
     ; 64-byte permutation index tables (one zmm register each). The load sites
     ; visible in this part of the file use them as the index operand of
     ; vpermb / vpermd / vpermq-family instructions.
     ;
     ; idct8x8p: vpermb indices applied to the packed pass-1 result of
     ; idct_8x8_internal_10bpc.
     33 idct8x8p:      db  0,  1,  4,  5,  2,  3,  6,  7, 16, 17, 20, 21, 18, 19, 22, 23
     34               db  8,  9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31
     35               db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55
     36               db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63
     ; idtx8x8p: vpermb indices used by pass 1 of iidentity_8x8_internal_10bpc.
     37 idtx8x8p:      db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
     38               db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
     39               db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
     40               db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
     ; idct8x16p: vpermb indices applied to m0-m3 at the start of
     ; idct_8x16_internal_10bpc.pass2.
     41 idct8x16p:     db 54, 55,  2,  3, 22, 23, 34, 35, 38, 39, 18, 19,  6,  7, 50, 51
     42               db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59
     43               db 52, 53,  4,  5, 20, 21, 36, 37, 32, 33,  0,  1, 48, 49, 16, 17
     44               db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41,  8,  9, 56, 57, 24, 25
     ; iadst8x16p and permA are not referenced in the part of the file reviewed
     ; here.
     45 iadst8x16p:    db  0,  1, 54, 55, 48, 49,  6,  7, 16, 17, 38, 39, 32, 33, 22, 23
     46               db  8,  9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31
     47               db  4,  5, 50, 51, 52, 53,  2,  3, 20, 21, 34, 35, 36, 37, 18, 19
     48               db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27
     49 permA:         db  0,  1,  0,  8,  4,  5,  1,  9,  8,  9,  4, 12, 12, 13,  5, 13
     50               db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29
     51               db  2,  3,  2, 10,  6,  7,  3, 11, 10, 11,  6, 14, 14, 15,  7, 15
     52               db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31
     ; permB: loaded whole by idct_8x8_internal_10bpc.load as qword-permute
     ; indices, and also consumed after psrlq by 8 or 32 bits and after movshdup.
     ; Several index sets are therefore packed into this one table; check every
     ; consumer before changing a byte.
     53 permB:         db  4,  2,  1,  8,  0,  0,  1,  0, 12,  3,  3, 10,  8,  1,  3,  2
     54               db  5, 10,  5, 12,  1,  8,  5,  4, 13, 11,  7, 14,  9,  9,  7,  6
     55               db  6,  6, 13,  4,  2,  4,  4,  5, 14,  7, 15,  6, 10,  5,  6,  7
     56               db  7, 14,  9,  0,  3, 12,  0,  1, 15, 15, 11,  2, 11, 13,  2,  3
     ; permC: index source for vpermt2q / vpermi2q / vpermd. It is read at byte
     ; offsets +0, +1, +2 and +3; the offset loads are 64 bytes wide and so run
     ; up to 3 bytes into the table that follows (idct8x32p).
     57 permC:         db  0,  9,  0,  0,  0,  1,  4,  4,  2, 11,  2,  2,  2,  3,  6,  6
     58               db  1,  8,  1,  8,  4,  5,  5, 12,  3, 10,  3, 10,  6,  7,  7, 14
     59               db  9,  1,  8,  1,  1,  0, 12,  5, 11,  3, 10,  3,  3,  2, 14,  7
     60               db  8,  0,  9,  9,  5,  4, 13, 13, 10,  2, 11, 11,  7,  6, 15, 15
     ; idct8x32p, idct32x8p and idtx32x8p are not referenced in the part of the
     ; file reviewed here.
     61 idct8x32p:     db  0,  1,  4,  5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
     62               db  8,  9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
     63               db  2,  3,  6,  7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
     64               db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
     65 idct32x8p:     db  2, 18,  0, 16,  3, 19,  1, 17, 10, 26,  8, 24, 11, 27,  9, 25
     66               db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57
     67               db  6, 22,  4, 20,  7, 23,  5, 21, 14, 30, 12, 28, 15, 31, 13, 29
     68               db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61
     69 idtx32x8p:     db  0,  8, 16, 24,  4, 12, 20, 28,  2, 10, 18, 26,  6, 14, 22, 30
     70               db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62
     71               db  1,  9, 17, 25,  5, 13, 21, 29,  3, 11, 19, 27,  7, 15, 23, 31
     72               db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63
     73 
     ; Three back-to-back 32-byte runs of words: +2048, -2048, +2048.
     ; The order is load-bearing: a 32-byte broadcast from pw_2048_m2048+16
     ; yields 8 words of +2048 followed by 8 words of -2048, and one from
     ; pw_m2048_2048+16 yields 8 words of -2048 followed by 8 words of +2048.
     ; pw_2048 is also the anchor from which o_base is derived.
     74 pw_2048_m2048: times 16 dw  2048
     75 pw_m2048_2048: times 16 dw -2048
     76 pw_2048:       times 16 dw  2048
     77 
     78 ; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-, 4 = --
     ; COEF_PAIR a, b[, flags]
     ; Emits a 16-byte constant holding each coefficient twice (x, x, y, y) and
     ; %defines aliases for its two halves:
     ;   flags 0:  a,  b -> pd_a_b     with pd_a  at +0 and pd_b  at +8
     ;   flags 1:  a, -b -> pd_a_mb    with pd_a  at +0 and pd_mb at +8
     ;   flags 2: -a,  b -> pd_ma_b    with pd_ma at +0 and pd_b  at +8
     ;   flags 4: -a, -b -> pd_ma_mb   with pd_ma at +0 and pd_mb at +8
     ;   flags 3: as flags 0, then two extra dwords of -b are appended so that
     ;            pd_b_mb (an alias of pd_b) reads as b, b, -b, -b.
     ; The pd_* single-value names are preprocessor aliases, not labels.
     79 %macro COEF_PAIR 2-3 0 ; a, b, flags
     80 %if %3 == 1
     81 pd_%1_m%2: dd %1, %1, -%2, -%2
     82 %define pd_%1  (pd_%1_m%2 + 4*0)
     83 %define pd_m%2 (pd_%1_m%2 + 4*2)
     84 %elif %3 == 2
     85 pd_m%1_%2: dd -%1, -%1, %2, %2
     86 %define pd_m%1 (pd_m%1_%2 + 4*0)
     87 %define pd_%2  (pd_m%1_%2 + 4*2)
     88 %elif %3 == 4
     89 pd_m%1_m%2: dd -%1, -%1, -%2, -%2
     90 %define pd_m%1 (pd_m%1_m%2 + 4*0)
     91 %define pd_m%2 (pd_m%1_m%2 + 4*2)
     92 %else
     ; flags 0 and 3 share the positive/positive layout
     93 pd_%1_%2: dd %1, %1, %2, %2
     94 %define pd_%1  (pd_%1_%2 + 4*0)
     95 %define pd_%2  (pd_%1_%2 + 4*2)
     96 %if %3 == 3
     ; the -b pair must directly follow the b pair emitted above
     97 %define pd_%2_m%2 pd_%2
     98 dd -%2, -%2
     99 %endif
    100 %endif
    101 %endmacro
    102 
    ; Coefficient-pair constants (see COEF_PAIR for the layout and the aliases
    ; each line creates). Many values occur in more than one pair; every
    ; invocation re-%defines the single-value alias pd_N, so pd_N ends up
    ; pointing into the last pair listed here that contains N. The stored value
    ; is the same whichever pair is used.
    103 COEF_PAIR  101,  501
    104 COEF_PAIR  201,  601, 1
    105 COEF_PAIR  201,  995
    106 COEF_PAIR  401, 1189, 1
    107 COEF_PAIR  401, 1931
    108 COEF_PAIR  401, 3920
    109 COEF_PAIR  401, 4076
    110 COEF_PAIR  700,  301, 4
    111 COEF_PAIR  799, 2276, 1
    112 COEF_PAIR  799, 3406
    113 COEF_PAIR  799, 4017
    114 COEF_PAIR 1380,  601
    115 COEF_PAIR 1751, 2440
    116 COEF_PAIR 2598, 1189
    117 COEF_PAIR 2598, 1931, 2
    118 COEF_PAIR 2598, 3612
    119 COEF_PAIR 2751, 2106
    120 COEF_PAIR 2896, 1567, 3 ; also provides pd_1567_m1567
    121 COEF_PAIR 2896, 3784, 3 ; also provides pd_3784_m3784
    122 COEF_PAIR 3035, 3513
    123 COEF_PAIR 3166, 1931
    124 COEF_PAIR 3166, 3612
    125 COEF_PAIR 3166, 3920
    126 COEF_PAIR 3703, 3290
    127 COEF_PAIR 3857, 4052
    128 COEF_PAIR 4017, 2276
    129 COEF_PAIR 4017, 3406
    130 COEF_PAIR 4036, 4085
    131 COEF_PAIR 4076, 1189
    132 COEF_PAIR 4076, 3612
    133 COEF_PAIR 4076, 3920
    134 COEF_PAIR 4091, 3973
    135 COEF_PAIR 4091, 4052
    136 COEF_PAIR 4095, 4065
    137 
    ; Small constants, each 4 bytes wide so that a dword broadcast
    ; (vpbroadcastd) can fill a vector register with them.
    138 pb_32:           times 4 db 32
    139 pw_5:            times 2 dw 5
    140 pw_4096:         times 2 dw 4096
    141 pw_8192:         times 2 dw 8192
    142 pw_1697x16:      times 2 dw 1697*16
    143 pw_2896x8:       times 2 dw 2896*8
    144 pixel_10bpc_max: times 2 dw 0x03ff ; 1023, upper clamp applied to output pixels
    145 dconly_10bpc:    times 2 dw 0x7c00 ; 0x7fff - 0x03ff, bias for the saturating clamp in .dconly_loop
    146 clip_18b_min:    dd -0x20000 ; -(1 << 17)
    147 clip_18b_max:    dd  0x1ffff ; (1 << 17) - 1
    148 pd_1:            dd 1
    149 pd_2:            dd 2
    150 pd_1448:         dd 1448
    151 pd_2048:         dd 2048
    152 pd_3071:         dd 3071 ; 1024 + 2048 - 1
    153 pd_3072:         dd 3072 ; 1024 + 2048
    154 pd_5119:         dd 5119 ; 1024 + 4096 - 1
    155 pd_5120:         dd 5120 ; 1024 + 4096
    156 pd_5793:         dd 5793
    157 
    ; Symbols defined outside this file.
    ; int8_permA anchors o_base_8bpc. The *_8bpc_avx512icl.<label> entries are
    ; labels inside the 8 bpc transform functions; the second passes in this
    ; file call or jump into them once the data has been packed to 16 bits
    ; (for example idct_8x8_internal_10bpc.pass2 calls
    ; idct_8x8_internal_8bpc.main).
    158 cextern dup16_perm
    159 cextern int8_permA
    160 cextern idct64_mul_16bpc
    161 cextern idct_8x8_internal_8bpc_avx512icl.main
    162 cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2
    163 cextern idct_8x16_internal_8bpc_avx512icl.main
    164 cextern idct_8x16_internal_8bpc_avx512icl.main2
    165 cextern idct_8x16_internal_8bpc_avx512icl.main_fast
    166 cextern idct_8x16_internal_8bpc_avx512icl.main_fast2
    167 cextern iadst_8x16_internal_8bpc_avx512icl.main2
    168 cextern idct_16x8_internal_8bpc_avx512icl.main
    169 cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2
    170 cextern idct_16x16_internal_8bpc_avx512icl.main
    171 cextern idct_16x16_internal_8bpc_avx512icl.main2
    172 cextern idct_16x16_internal_8bpc_avx512icl.main_fast
    173 cextern idct_16x16_internal_8bpc_avx512icl.main_fast2
    174 cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b
    175 cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main
    176 cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast
    177 cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2
    178 cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end
    179 cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf
    180 cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast
    181 cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast2
    182 cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main
    183 cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf
    184 cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast
    185 cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2
    186 cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast3
    187 cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf
    188 cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast
    189 cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2
    190 cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast3
    191 cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf
    192 cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast
    193 cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1
    194 cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast
    195 cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast2
    196 cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part2
    197 
    198 SECTION .text
    199 
    ; Constant addressing helpers.
    ; o_base is the address INV_TXFM_FN loads into r5; o(x) then addresses the
    ; constant x relative to r5. The anchor sits 4*128 bytes past pw_2048
    ; (NOTE(review): presumably chosen so that more constants are reachable
    ; with short displacements - not verifiable from this file alone).
    200 %define o_base (pw_2048+4*128)
    ; Base loaded into r5 by the .pass2 routines before they enter 8 bpc code.
    ; o() must not be used while r5 holds this value.
    201 %define o_base_8bpc (int8_permA+64*18)
    202 %define o(x) (r5 - o_base + (x))
    ; m(x): mangled symbol name for x, built from private_prefix and the
    ; current SUFFIX.
    203 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
    204 
    ; x86inc: everything below is assembled for the avx512icl target with
    ; mN referring to 512-bit registers (ymN / xmN are the narrower views).
    205 INIT_ZMM avx512icl
    206 
    207 ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
    208 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
    209 ; flags: 1 = inv_dst1, 2 = inv_dst2
    210 ; skip round/shift if rnd is not a number
    ;
    ; All operands except the coefficients are register numbers (used as m%N).
    ;  - coef1 / coef2 below 32 are register numbers that already hold the
    ;    coefficient. Larger values name a pd_* constant: below 4096 it is
    ;    broadcast as a single dword, otherwise as a 128-bit group (pair names
    ;    such as 799_3406 are read by NASM as one large number and therefore
    ;    take this path).
    ;  - rnd is either the number of a register holding the rounding term or a
    ;    non-number placeholder such as `_`, which skips both the rounding add
    ;    and the >> 12.
    ;  - tmp1 and tmp2 are always clobbered; tmp3 only when coef1 is a constant.
    ;  - flags & 1: dst1 = src2*coef2 - (src1*coef1 + rnd)
    ;  - flags & 2: dst2 = rnd - src1*coef2 - src2*coef1; this branch reads the
    ;    rnd register unconditionally, so rnd must be a register number.
    211 %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
    ; tmp2 = src1 * coef2, tmp1 = src2 * coef2
    212 %if %8 < 32
    213    pmulld              m%4, m%1, m%8
    214    pmulld              m%3, m%2, m%8
    215 %else
    216 %if %8 < 4096
    217    vpbroadcastd        m%3, [o(pd_%8)]
    218 %else
    219    vbroadcasti32x4     m%3, [o(pd_%8)]
    220 %endif
    221    pmulld              m%4, m%1, m%3
    222    pmulld              m%3, m%2
    223 %endif
    ; src1 *= coef1, src2 *= coef1 (in place)
    224 %if %7 < 32
    225    pmulld              m%1, m%7
    226    pmulld              m%2, m%7
    227 %else
    228 %if %7 < 4096
    229    vpbroadcastd        m%5, [o(pd_%7)]
    230 %else
    231    vbroadcasti32x4     m%5, [o(pd_%7)]
    232 %endif
    233    pmulld              m%1, m%5
    234    pmulld              m%2, m%5
    235 %endif
    ; combine into dst2 (left in the src2 register)
    236 %if %9 & 2
    237    psubd               m%4, m%6, m%4
    238    psubd               m%2, m%4, m%2
    239 %else
    240 %ifnum %6
    241    paddd               m%4, m%6
    242 %endif
    243    paddd               m%2, m%4
    244 %endif
    ; combine into dst1 (left in the src1 register)
    245 %ifnum %6
    246    paddd               m%1, m%6
    247 %endif
    248 %if %9 & 1
    249    psubd               m%1, m%3, m%1
    250 %else
    251    psubd               m%1, m%3
    252 %endif
    ; the final shift is applied only when a rounding register was supplied
    253 %ifnum %6
    254    psrad               m%2, 12
    255    psrad               m%1, 12
    256 %endif
    257 %endmacro
    258 
    ; INV_TXFM_FN type1, type2, eob_offset, size
    ; Emits the public entry point inv_txfm_add_<type1>_<type2>_<size>_10bpc
    ; with the named arguments dst, stride, c, eob and the scratch name tx2.
    ; It loads r5 with o_base, points tx2q at the .pass2 label of the <type2>
    ; internal function and hands control to the <type1> internal function
    ; (first pass).
    ;  - dct_dct: the first pass is entered only when eob != 0. Otherwise
    ;    execution continues with whatever the invoking macro places after
    ;    this one.
    ;  - any other pair: eob_offset (if non-zero) is added to eobd, then the
    ;    first pass is reached by jmp or, when that function starts exactly at
    ;    %%end, by falling through. The order of macro invocations and function
    ;    definitions in the file therefore matters.
    259 %macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size
    260 cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2
    261    %define %%p1 m(i%1_%4_internal_10bpc)
    262    lea                  r5, [o_base]
    263    ; Jump to the 1st txfm function if we're not taking the fast path, which
    264    ; in turn performs an indirect jump to the 2nd txfm function.
    265    lea tx2q, [m(i%2_%4_internal_10bpc).pass2]
    266 %ifidn %1_%2, dct_dct
    267    test               eobd, eobd
    268    jnz %%p1
    269 %else
    270 %if %3
    271    add                eobd, %3
    272 %endif
    273    ; jump to the 1st txfm function unless it's located directly after this
    ; The repeat count is 1 when (%%end - %%p1) is negative, i.e. when %%p1
    ; lies beyond %%end, and 0 otherwise.
    274    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
    275 ALIGN function_align
    276 %%end:
    277 %endif
    278 %endmacro
    279 
    ; INV_TXFM_8X8_FN type1, type2[, eob_offset]
    ; 8x8 wrapper around INV_TXFM_FN. For dct_dct it also emits the DC-only
    ; path that INV_TXFM_FN falls into when eob == 0; its .dconly label is
    ; reused by the 8x16 wrapper further down.
    280 %macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
    281    INV_TXFM_FN          %1, %2, %3, 8x8
    282 %ifidn %1_%2, dct_dct
    283    imul                r6d, [cq], 181
    ; eobd is zero on this path, so this store clears the DC coefficient
    284    mov                [cq], eobd ; 0
    ; r3d is eobd (fourth named argument) and is zero, so this sets the row
    ; counter used by .dconly_loop to 8
    285    or                  r3d, 8
    ; Shared entry: expects r6d = scaled DC value and r3d = number of rows.
    286 .dconly:
    287    add                 r6d, 384
    288    sar                 r6d, 9
    289 .dconly2:
    290    vpbroadcastd        ym2, [o(dconly_10bpc)]
    291    imul                r6d, 181
    292    add                 r6d, 2176
    293    sar                 r6d, 12
    294    vpbroadcastw        ym1, r6d
    ; ym1 = dc + bias, with signed saturation
    295    paddsw              ym1, ym2
    ; Each iteration handles two rows of 8 pixels (16 bytes per row).
    296 .dconly_loop:
    297    mova                xm0, [dstq+strideq*0]
    298    vinserti32x4        ym0, [dstq+strideq*1], 1
    ; paddsw saturates the top end at 0x7fff; psubusw then removes the bias
    ; and saturates the bottom end at 0. Since bias + 0x03ff == 0x7fff this
    ; caps the result at the 10-bit pixel maximum.
    299    paddsw              ym0, ym1
    300    psubusw             ym0, ym2
    301    mova          [dstq+strideq*0], xm0
    302    vextracti32x4 [dstq+strideq*1], ym0, 1
    303    lea                dstq, [dstq+strideq*2]
    304    sub                 r3d, 2
    305    jg .dconly_loop
    306    RET
    307 %endif
    308 %endmacro
    309 
    ; Entry points whose first pass is the DCT; idct_8x8_internal_10bpc
    ; follows these invocations directly.
    310 INV_TXFM_8X8_FN dct, dct
    311 INV_TXFM_8X8_FN dct, adst
    312 INV_TXFM_8X8_FN dct, flipadst
    313 INV_TXFM_8X8_FN dct, identity
    314 
    ; 8x8 inverse DCT, 10 bpc.
    ; Reached by jump from the entry points generated by INV_TXFM_FN, which
    ; set r5 = o_base and tx2q = second-pass address. The function entry is
    ; pass 1: it transforms the 32-bit coefficients at cq, packs them to 16
    ; bits and jumps to tx2q. .pass2 is the second-pass entry used when the
    ; second transform type is dct. The remaining labels are helpers that are
    ; also called from the other 8x8 / 8x16 functions in this file.
    315 cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    316    call .load
    ; reorder the loaded rows with the permB-derived qword indices held in
    ; m1 / m3 (whole table) and m5 (table >> 32)
    317    vpermi2q             m1, m0, m2 ; 1 5
    318    vpermi2q             m3, m6, m4 ; 7 3
    319    vpermt2q             m0, m5, m4 ; 0 2
    320    vpermt2q             m2, m5, m6 ; 4 6
    321    call .main
    322    call .main_end
    ; narrow to 16 bits and reorder for the second pass
    323    mova                 m4, [o(idct8x8p)]
    324    packssdw             m0, m2     ; 0 1 4 5
    325    packssdw             m1, m3     ; 3 2 7 6
    326    vpermb               m0, m4, m0
    327    vprolq               m1, 32
    328    vpermb               m2, m4, m1
    329    punpckhdq            m1, m0, m2
    330    punpckldq            m0, m2
    331    jmp                tx2q
    ; Second pass. Input: 16-bit data in m0 / m1. The work is done by the
    ; 8 bpc routine on ymm halves.
    332 .pass2:
    ; from here on r5 no longer holds o_base, so constants are addressed
    ; directly instead of through o()
    333    lea                  r5, [o_base_8bpc]
    334    vextracti32x8       ym2, m0, 1
    335    vextracti32x8       ym3, m1, 1
    336    call m(idct_8x8_internal_8bpc).main
    337    mova                m10, [permC]
    338    vpbroadcastd        m12, [pw_2048]
    ; Shared tail. Expects m10 = qword permute indices and m12 = per-word
    ; pmulhrsw multiplier (2048 gives (x + 8) >> 4).
    339 .end:
    340    vpermt2q             m0, m10, m1
    341    vpermt2q             m2, m10, m3
    ; Shared tail for callers that have already arranged m0 / m2 (four rows
    ; of 8 words each).
    342 .end2:
    343    vpbroadcastd        m11, [pixel_10bpc_max]
    344    lea                  r6, [strideq*3]
    345    pxor                m10, m10
    346    pmulhrsw             m8, m12, m0
    347    call .write_8x4_start
    348    pmulhrsw             m8, m12, m2
    ; falls through: the second group of rows is written by the code below and
    ; its ret returns to the caller of this function
    ; .write_8x4: advance dst by four rows and cq by 128 bytes, then write.
    349 .write_8x4:
    350    lea                dstq, [dstq+strideq*4]
    351    add                  cq, 64*2
    ; .write_8x4_start: add the residual in m8 to four rows of 8 pixels at
    ; dstq, clamp to [0, m11] and store. Also zeroes 128 bytes of the
    ; coefficient buffer at cq. Expects m10 = 0, m11 = pixel maximum and
    ; r6 = stride * 3. Clobbers m9.
    352 .write_8x4_start:
    353    mova                xm9, [dstq+strideq*0]
    354    vinserti32x4        ym9, [dstq+strideq*1], 1
    355    vinserti32x4         m9, [dstq+strideq*2], 2
    356    vinserti32x4         m9, [dstq+r6       ], 3
    357    mova          [cq+64*0], m10
    358    mova          [cq+64*1], m10
    359    paddw                m9, m8
    360    pmaxsw               m9, m10
    361    pminsw               m9, m11
    362    mova          [dstq+strideq*0], xm9
    363    vextracti32x4 [dstq+strideq*1], ym9, 1
    364    vextracti32x4 [dstq+strideq*2], m9, 2
    365    vextracti32x4 [dstq+r6       ], m9, 3
    366    ret
    367 ALIGN function_align
    ; .load: read the 256 bytes of coefficients at cq into m0 / m4 / m2 / m6
    ; and set up the registers shared by the 8x8 first passes:
    ;   m1 = m3 = permB, m5 = permB >> 32 (qword permute indices)
    ;   m11 = 1, m12 = 2896, m13 = 2048 (rounding term)
    ;   m14 / m15 = 18-bit clip minimum / maximum
    368 .load:
    369    mova                 m0, [cq+64*0] ; 0 1
    370    mova                 m4, [cq+64*1] ; 2 3
    371    mova                 m1, [o(permB)]
    372    mova                 m2, [cq+64*2] ; 4 5
    373    mova                 m6, [cq+64*3] ; 6 7
    374    vpbroadcastd        m13, [o(pd_2048)]
    375    vpbroadcastd        m14, [o(clip_18b_min)]
    376    vpbroadcastd        m15, [o(clip_18b_max)]
    377    psrlq                m5, m1, 32
    378    vpbroadcastd        m12, [o(pd_2896)]
    379    mova                 m3, m1
    380    vpbroadcastd        m11, [o(pd_1)]
    381    ret
    382 ALIGN function_align
    ; .main_fast: variant of .main that takes its input in m0 / m1 only and
    ; uses plain multiplies instead of ITX_MULSUB_2D, then joins .main2.
    383 .main_fast: ; bottom half is zero
    384    vbroadcasti32x4      m3, [o(pd_4017_3406)]
    385    vbroadcasti32x4      m8, [o(pd_799_m2276)]
    386    vbroadcasti32x4      m2, [o(pd_2896_3784)]
    387    vbroadcasti32x4      m9, [o(pd_2896_1567)]
    388    pmulld               m3, m1     ; t4a  t5a
    389    pmulld               m1, m8     ; t7a  t6a
    390    pmulld               m2, m0     ; t0   t3
    391    pmulld               m0, m9     ; t1   t2
    392    jmp .main2
    ; .main: 8-point DCT core on m0-m3. Expects the constants set up by
    ; .load in m12-m15. Results are left in m0, m1, m2 and m8 for .main_end.
    ; Clobbers m3, m9 and m10.
    393 .main:
    ; rnd is `_`, so the macro leaves the products unrounded; rounding and
    ; the >> 12 are applied at .main2
    394    ITX_MULSUB_2D         1, 3, 8, 9, 10, _,  799_3406, 4017_2276
    395    ITX_MULSUB_2D         0, 2, 8, 9, 10, _, 2896_1567, 2896_3784
    396 .main2:
    397    REPX     {paddd x, m13}, m1, m3, m0, m2
    398    REPX     {psrad x, 12 }, m1, m3, m0, m2
    399    punpcklqdq           m8, m1, m3 ; t4a  t7a
    400    punpckhqdq           m1, m3     ; t5a  t6a
    401    psubd                m3, m8, m1 ; t5a  t6a
    402    paddd                m8, m1     ; t4   t7
    403    pmaxsd               m3, m14
    404    punpckhqdq           m1, m2, m0 ; t3   t2
    405    pminsd               m3, m15
    406    punpcklqdq           m2, m0     ; t0   t1
    407    pmulld               m3, m12
    408    paddd                m0, m2, m1 ; dct4 out0 out1
    409    psubd                m2, m1     ; dct4 out3 out2
    410    REPX    {pmaxsd x, m14}, m8, m0, m2
    411    REPX    {pminsd x, m15}, m8, m0, m2
    ; .main3: expects m3 already multiplied by m12 and m8 = t4 t7; forms the
    ; rounded sum and difference of the two qword halves of m3, each >> 12.
    412 .main3:
    413    pshufd               m1, m3, q1032
    414    paddd                m3, m13
    415    psubd                m9, m3, m1
    416    paddd                m3, m1
    417    psrad                m9, 12
    418    psrad                m3, 12
    419    punpckhqdq           m1, m8, m3   ; t7   t6
    420    shufpd               m8, m9, 0xaa ; t4   t5
    421    ret
    ; .main_end: final butterflies of pass 1. m11 serves both as the rounding
    ; term and as the per-element shift count; with the value 1 loaded by
    ; .load every output is (x + 1) >> 1.
    422 .main_end:
    423    paddd                m0, m11
    424    paddd                m2, m11
    425    psubd                m3, m0, m1 ; out7 out6
    426    paddd                m0, m1     ; out0 out1
    427    paddd                m1, m2, m8 ; out3 out2
    428    psubd                m2, m8     ; out4 out5
    429    REPX   {vpsravd x, m11}, m0, m2, m3, m1
    430    ret
    431 
    ; Entry points whose first pass is the ADST. The last one is placed
    ; directly in front of iadst_8x8_internal_10bpc.
    432 INV_TXFM_8X8_FN adst, dct
    433 INV_TXFM_8X8_FN adst, flipadst
    434 INV_TXFM_8X8_FN adst, identity
    435 INV_TXFM_8X8_FN adst, adst
    436 
    ; 8x8 inverse ADST, 10 bpc. Same calling arrangement as
    ; idct_8x8_internal_10bpc: the function entry is pass 1 and ends with
    ; jmp tx2q; .pass2 is the second-pass entry. .pass1_end, .main_pass2 and
    ; .main are shared with iflipadst_8x8_internal_10bpc.
    437 cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    438    call m(idct_8x8_internal_10bpc).load
    439    vpermi2q             m1, m6, m2 ; 7 5
    440    vpermi2q             m3, m4, m0 ; 3 1
    441    vpermt2q             m0, m5, m4 ; 0 2
    442    vpermt2q             m2, m5, m6 ; 4 6
    443    call .main
    444    punpckldq            m1, m2, m4 ;  out4  out6
    445    punpckhdq            m2, m0     ; -out5 -out7
    446    punpckldq            m0, m3     ;  out0  out2
    447    punpckhdq            m4, m3     ; -out1 -out3
    ; m11 = 1 (from .load): add the rounding term, and compute 1 - x for the
    ; negated outputs so that they come out with positive sign
    448    paddd                m1, m11
    449    psubd                m3, m11, m2
    450    paddd                m0, m11
    451    psubd                m4, m11, m4
    ; Shared pass-1 tail. Expects rounded 32-bit outputs in m0, m1, m3 and
    ; m4; shifts them right by 1, packs to 16 bits, reorders and jumps to the
    ; second pass.
    452 .pass1_end:
    453    REPX       {psrad x, 1}, m1, m0, m3, m4
    454    packssdw             m0, m1     ; 0 2 4 6
    455    packssdw             m4, m3     ; 1 3 5 7
    ; qword permute indices taken from permB shifted right by 8 bits (m1)
    ; and by a further 32 bits (m2)
    456    psrlq                m1, [o(permB)], 8
    457    punpckhwd            m3, m0, m4
    458    punpcklwd            m0, m4
    459    psrlq                m2, m1, 32
    460    vpermi2q             m1, m0, m3
    461    vpermt2q             m0, m2, m3
    462    jmp                tx2q
    463 .pass2:
    464    call .main_pass2
    ; r5 was rebased by .main_pass2, hence no o() here
    465    movu                m10, [permC+2]
    ; multiplier: 8 words of +2048 followed by 8 words of -2048, repeated
    ; (NOTE(review): presumably the negative lanes undo the sign of the
    ; negated ADST outputs - confirm against the 8 bpc routine)
    466    vbroadcasti32x8     m12, [pw_2048_m2048+16]
    467    jmp m(idct_8x8_internal_10bpc).end
    ; .main_pass2: split m0 / m1 into ymm halves, build the dword-swapped
    ; copies in ym4 / ym5, rebase r5 and tail-jump into the 8 bpc routine,
    ; whose ret returns to the caller of .main_pass2.
    468 .main_pass2:
    469    vextracti32x8       ym2, m0, 1
    470    vextracti32x8       ym3, m1, 1
    471    lea                  r5, [o_base_8bpc]
    472    pshufd              ym4, ym0, q1032
    473    pshufd              ym5, ym1, q1032
    474    jmp m(iadst_8x8_internal_8bpc).main_pass2
    475 ALIGN function_align
    ; .main: 8-point ADST core on m0-m3. Expects m12 = 2896, m13 = 2048 and
    ; m14 / m15 = clip bounds as set up by idct_8x8_internal_10bpc.load.
    ; Results are left in m0, m2, m3 and m4 (see the trailing comments).
    ; Clobbers m1, m5 and m6.
    476 .main:
    ; rnd = 13 selects m13, so these invocations round and shift by 12
    477    ITX_MULSUB_2D         1, 0, 4, 5, 6, 13,  401_1931, 4076_3612
    478    ITX_MULSUB_2D         3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189
    479    psubd                m4, m0, m2   ; t4  t6
    480    paddd                m0, m2       ; t0  t2
    481    psubd                m2, m1, m3   ; t5  t7
    482    paddd                m1, m3       ; t1  t3
    483    REPX    {pmaxsd x, m14}, m4, m2, m0, m1
    484    REPX    {pminsd x, m15}, m4, m2, m0, m1
    ; m5 = -m4, used to supply -t6 below
    485    pxor                 m5, m5
    486    psubd                m5, m4
    487    shufpd               m4, m2, 0xaa ; t4  t7
    488    shufpd               m2, m5, 0xaa ; t5 -t6
    489    ITX_MULSUB_2D         4, 2, 3, 5, 6, 13, 1567, 3784
    490    punpckhqdq           m3, m0, m1
    491    punpcklqdq           m0, m1
    492    psubd                m1, m0, m3   ; t2  t3
    493    paddd                m0, m3       ; out0 -out7
    494    punpckhqdq           m3, m4, m2   ; t7a t6a
    495    punpcklqdq           m4, m2       ; t5a t4a
    496    psubd                m2, m4, m3   ; t7  t6
    497    paddd                m4, m3       ; out6 -out1
    498    REPX    {pmaxsd x, m14}, m1, m2
    499    REPX    {pminsd x, m15}, m1, m2
    500    shufpd               m3, m1, m2, 0xaa
    501    shufpd               m1, m2, 0x55
    ; scale both operands by m12, then form the rounded sum and difference
    502    pmulld               m3, m12
    503    pmulld               m1, m12
    504    paddd                m3, m13
    505    psubd                m2, m3, m1
    506    paddd                m3, m1
    507    psrad                m2, 12       ; out4 -out5
    508    pshufd               m3, m3, q1032
    509    psrad                m3, 12       ; out2 -out3
    510    ret
    511 
    ; Entry points whose first pass is the flipped ADST. The last one is
    ; placed directly in front of iflipadst_8x8_internal_10bpc.
    512 INV_TXFM_8X8_FN flipadst, dct
    513 INV_TXFM_8X8_FN flipadst, adst
    514 INV_TXFM_8X8_FN flipadst, identity
    515 INV_TXFM_8X8_FN flipadst, flipadst
    516 
    ; 8x8 inverse flipped ADST, 10 bpc. Reuses the ADST core
    ; (iadst_8x8_internal_10bpc.main) and emits its outputs in reverse order.
    517 cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    518    call m(idct_8x8_internal_10bpc).load
    519    vpermi2q             m1, m6, m2 ; 7 5
    520    vpermi2q             m3, m4, m0 ; 3 1
    521    vpermt2q             m0, m5, m4 ; 0 2
    522    vpermt2q             m2, m5, m6 ; 4 6
    523    call m(iadst_8x8_internal_10bpc).main
    ; same interleave as the ADST pass 1 but with the register roles
    ; arranged for reversed output order
    524    punpckhdq            m1, m3, m4 ; -out3 -out1
    525    punpckldq            m3, m0     ;  out2  out0
    526    punpckhdq            m0, m2     ; -out7 -out5
    527    punpckldq            m4, m2     ;  out6  out4
    ; m11 = 1: add the rounding term, or compute 1 - x for negated outputs
    528    psubd                m1, m11, m1
    529    paddd                m3, m11
    530    psubd                m0, m11, m0
    531    paddd                m4, m11
    532    jmp m(iadst_8x8_internal_10bpc).pass1_end
    533 .pass2:
    534    call m(iadst_8x8_internal_10bpc).main_pass2
    ; r5 was rebased by .main_pass2, hence no o() below
    535    movu                m10, [permC+1]
    ; multiplier: 8 words of -2048 followed by 8 words of +2048, repeated
    536    vbroadcasti32x8     m12, [pw_m2048_2048+16]
    537    lea                  r6, [strideq*3]
    538    vpermt2q             m0, m10, m1 ; 7 6 5 4
    539    vpbroadcastd        m11, [pixel_10bpc_max]
    540    vpermt2q             m2, m10, m3 ; 3 2 1 0
    541    pxor                m10, m10
    ; m2 is written before m0, the reverse of idct_8x8_internal_10bpc.end2
    542    pmulhrsw             m8, m12, m2
    543    call m(idct_8x8_internal_10bpc).write_8x4_start
    544    pmulhrsw             m8, m12, m0
    ; tail jump: the ret in .write_8x4 returns to the caller of this function
    545    jmp m(idct_8x8_internal_10bpc).write_8x4
    546 
    ; Entry points whose first pass is the identity transform. The last one
    ; is placed directly in front of iidentity_8x8_internal_10bpc.
    547 INV_TXFM_8X8_FN identity, dct
    548 INV_TXFM_8X8_FN identity, adst
    549 INV_TXFM_8X8_FN identity, flipadst
    550 INV_TXFM_8X8_FN identity, identity
    551 
    ; 8x8 identity transform, 10 bpc. Pass 1 performs no arithmetic on the
    ; coefficients: it packs them to 16 bits with signed saturation and
    ; reorders them for the second pass.
    552 cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    553    mova                 m1, [cq+64*0]
    554    packssdw             m1, [cq+64*2] ; 0 4   1 5
    555    mova                 m2, [cq+64*1] ; 2 6   3 7
    556    packssdw             m2, [cq+64*3]
    557    mova                 m0, [o(idtx8x8p)]
    558    vpermb               m1, m0, m1
    559    vpermb               m2, m0, m2
    560    punpckldq            m0, m1, m2    ; 0 1   4 5
    561    punpckhdq            m1, m2        ; 2 3   6 7
    562    jmp                tx2q
    ; Second pass: only a reorder plus the scaling done by the shared tail.
    ; o() is still used here: none of the 8x8 first passes shown in this
    ; file modifies r5, so it still holds o_base on entry.
    563 .pass2:
    564    movu                 m3, [o(permC+2)]
    ; pmulhrsw by 4096 gives (x + 4) >> 3
    565    vpbroadcastd        m12, [o(pw_4096)]
    566    psrlq                m2, m3, 32
    567    vpermi2q             m2, m0, m1
    568    vpermt2q             m0, m3, m1
    ; enters at .end2 because m0 / m2 are already arranged
    569    jmp m(idct_8x8_internal_10bpc).end2
    570 
    ; INV_TXFM_8X16_FN type1, type2[, eob_offset]
    ; 8x16 wrapper around INV_TXFM_FN. For dct_dct it emits the DC-only path:
    ; the DC coefficient is scaled here and the 8x8 .dconly code is reused
    ; with a row count of 16.
    571 %macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
    572    INV_TXFM_FN          %1, %2, %3, 8x16
    573 %ifidn %1_%2, dct_dct
    574    imul                r6d, [cq], 181
    ; eobd is zero on this path, so this store clears the DC coefficient
    575    mov                [cq], eobd ; 0
    ; r3d (eobd) is zero, so this sets the row counter for .dconly_loop to 16
    576    or                  r3d, 16
    ; one more rounded 181 scaling step than the 8x8 path:
    ; r6d = ((dc*181 + 128) >> 8) * 181
    577    add                 r6d, 128
    578    sar                 r6d, 8
    579    imul                r6d, 181
    580    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
    581 %endif
    582 %endmacro
    583 
    ; Entry points whose first pass is the 8x16 DCT. For dct/identity, 35 is
    ; added to eobd before the first pass is entered.
    584 INV_TXFM_8X16_FN dct, dct
    585 INV_TXFM_8X16_FN dct, identity, 35
    586 INV_TXFM_8X16_FN dct, flipadst
    587 INV_TXFM_8X16_FN dct, adst
    588 
    589 cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    590 %undef cmp
    591    cmp                eobd, 43
    592    jl .fast
    593    call .load
    594    call .main
    595    call .main_end
    596 .pass1_end:
    597    packssdw             m0, m4
    598    packssdw             m1, m5
    599    packssdw             m2, m6
    600    packssdw             m3, m7
    601    jmp                tx2q
    602 .pass2:
    603    mova                 m8, [o(idct8x16p)]
    604    REPX  {vpermb x, m8, x}, m0, m1, m2, m3
    605    punpckhdq            m5, m0, m1
    606    punpckldq            m0, m1
    607    punpckhdq            m4, m2, m3
    608    punpckldq            m2, m3
    609    punpcklqdq           m8, m0, m2 ; 15  1
    610    punpckhqdq           m0, m2     ;  7  9
    611    punpckhqdq           m1, m5, m4 ;  3 13
    612    punpcklqdq           m5, m4     ; 11  5
    613    lea                  r5, [o_base_8bpc]
    614    vextracti32x8       ym7, m8, 1  ; 14  2
    615    vextracti32x8       ym3, m0, 1  ;  6 10
    616    vextracti32x8       ym6, m1, 1  ; 12  4
    617    vextracti32x8       ym9, m5, 1  ;  8  0
    618    call m(idct_8x16_internal_8bpc).main2
    619    mova                 m8, [permC]
    620    vpbroadcastd        m12, [pw_2048]
    621    vpermt2q             m0, m8, m1
    622    lea                  r6, [strideq*3]
    623    vpermt2q             m2, m8, m3
    624    vpbroadcastd        m11, [pixel_10bpc_max]
    625    vpermt2q             m4, m8, m5
    626    pxor                m10, m10
    627    vpermt2q             m6, m8, m7
    628    pmulhrsw             m8, m12, m0
    629    call m(idct_8x8_internal_10bpc).write_8x4_start
    630    pmulhrsw             m8, m12, m2
    631    call m(idct_8x8_internal_10bpc).write_8x4
    632    pmulhrsw             m8, m12, m4
    633    call m(idct_8x8_internal_10bpc).write_8x4
    634    pmulhrsw             m8, m12, m6
    635    jmp m(idct_8x8_internal_10bpc).write_8x4
    636 .fast:
    637    mova                ym0, [cq+64*0]
    638    mova                ym4, [cq+64*2]
    639    mova                ym1, [cq+64*1]
    640    mova                ym5, [cq+64*5]
    641    mova                ym2, [cq+64*4]
    642    mova                ym6, [cq+64*6]
    643    mova                ym3, [cq+64*7]
    644    mova                ym7, [cq+64*3]
    645    call .round_input_fast
    646    call m(idct_8x8_internal_10bpc).main
    647    call m(idct_8x8_internal_10bpc).main_end
    648    movu                 m6, [o(permC+3)]
    649    packssdw             m3, m1, m3
    650    packssdw             m1, m0, m2
    651    vprolq               m3, 32
    652    vpermd               m1, m6, m1
    653    vpermd               m3, m6, m3
    654    mova                ym0, ym1    ; 0 4
    655    vextracti32x8       ym1, m1, 1  ; 1 5
    656    mova                ym2, ym3    ; 2 6
    657    vextracti32x8       ym3, m3, 1  ; 3 7
    658    jmp                tx2q
    659 ALIGN function_align
    660 .round_input_fast:
    661    movshdup             m8, [o(permB)]
    662    vpbroadcastd        m12, [o(pd_2896)]
    663    vpermt2q             m0, m8, m4
    664    vpermt2q             m1, m8, m5
    665    vpermt2q             m2, m8, m6
    666    vpermt2q             m3, m8, m7
    667    vpbroadcastd        m13, [o(pd_2048)]
    668    REPX    {pmulld x, m12}, m0, m1, m2, m3
    669    vpbroadcastd        m14, [o(clip_18b_min)]
    670    vpbroadcastd        m15, [o(clip_18b_max)]
    671    REPX    {paddd  x, m13}, m0, m1, m2, m3
    672    vpbroadcastd        m11, [o(pd_1)]
    673    REPX    {psrad  x, 12 }, m0, m1, m2, m3
    674    ret
    675 ALIGN function_align
    676 .load:
    677    vpbroadcastd        m14, [o(clip_18b_min)]
    678    vpbroadcastd        m15, [o(clip_18b_max)]
    679 .load2:
    680    vpbroadcastd        m12, [o(pd_2896)]
    681    pmulld               m0, m12, [cq+64*0]
    682    pmulld               m1, m12, [cq+64*1]
    683    pmulld               m2, m12, [cq+64*2]
    684    pmulld               m3, m12, [cq+64*3]
    685    vpbroadcastd        m13, [o(pd_2048)]
    686    pmulld               m4, m12, [cq+64*4]
    687    pmulld               m5, m12, [cq+64*5]
    688    pmulld               m6, m12, [cq+64*6]
    689    pmulld               m7, m12, [cq+64*7]
    690 .round:
    691    REPX     {paddd x, m13}, m0, m1, m2, m3
    692    REPX     {psrad x, 12 }, m0, m1, m2, m3
    693    REPX     {paddd x, m13}, m4, m5, m6, m7
    694    REPX     {psrad x, 12 }, m4, m5, m6, m7
    695    ret
    696 ALIGN function_align
    697 .main_fast2_rect2:
    698    REPX     {paddd x, m13}, m0, m1
    699    REPX     {psrad x, 12 }, m0, m1
    700 .main_fast2:
    701    pmulld               m0, m12
    702    pmulld               m6, m1, [o(pd_4017)] {1to16} ; t7a
    703    pmulld               m8, m1, [o(pd_799)] {1to16}  ; t4a
    704    REPX    {paddd  x, m13}, m0, m6, m8
    705    REPX    {psrad  x, 12 }, m0, m6, m8
    706    pmulld               m5, m6, m12
    707    pmulld               m1, m8, m12
    708    paddd                m5, m13
    709    psubd                m4, m5, m1
    710    paddd                m5, m1
    711    REPX    {psrad  x, 12 }, m4, m5
    712    REPX    {mova   x, m0 }, m1, m2, m3
    713    ret
    714 .main_fast_rect2:
        ; Transform core with several entry points that all end in the shared
        ; tail .main2 and its ret.
        ;   .main_fast_rect2 - applies (x + m13) >> 12 to m0-m3, then falls
        ;                      through to .main_fast.
        ;   .main_fast       - reads only m0-m3 as data inputs; every rotation
        ;                      collapses to a single multiply by a broadcast
        ;                      constant.
        ;   .main_rect2      - calls .round (defined elsewhere in this function)
        ;                      and falls through to .main.
        ;   .main            - full version reading m0-m7.
        ; Expected on entry: m12 = multiplier, m13 = rounding bias,
        ; m14 / m15 = lower / upper clip bounds.
        ; On return (per the register comments below): m0-m3 = dct4 out0-out3
        ; clamped to [m14, m15]; m8 = t4 and m6 = t7, both clamped; m4 and m5 =
        ; ((t6a -/+ t5a)*m12 + m13) >> 12.
        ; NOTE(review): ITX_MULSUB_2D is defined outside this view; its `_` rnd
        ; argument presumably suppresses internal rounding, since the bias and
        ; shift are applied explicitly further down - confirm against the macro.
    715    REPX     {paddd x, m13}, m0, m1, m2, m3
    716    REPX     {psrad x, 12 }, m0, m1, m2, m3
    717 .main_fast:
    718    pmulld               m0, m12
    719    pmulld               m5, m3, [o(pd_2276)] {1to16} ; t5a
    720    pmulld               m3, [o(pd_3406)] {1to16}     ; t6a
    721    pmulld               m7, m1, [o(pd_4017)] {1to16} ; t7a
    722    pmulld               m1, [o(pd_799)] {1to16}      ; t4a
    723    pmulld               m6, m2, [o(pd_3784)] {1to16} ; t3
    724    pmulld               m2, [o(pd_1567)] {1to16}     ; t2
    725    paddd                m0, m13
           ; m5 = m13 - m5: negates the product and adds the rounding bias in
           ; one instruction (.main2 does not add the bias to m5)
    726    psubd                m5, m13, m5
    727    psrad                m0, 12                       ; t0
    728    mova                 m9, m0                       ; t1
    729    jmp .main2
    730 .main_rect2:
    731    call .round
    732 .main:
    733    pmulld               m0, m12
    734    ITX_MULSUB_2D         5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
    735    ITX_MULSUB_2D         1, 7, 8, 9, 10, _,  799, 4017 ; t4a t7a
    736    ITX_MULSUB_2D         2, 6, 8, 9, 10, _, 1567, 3784 ; t2  t3
    737    pmulld               m4, m12
    738    paddd                m0, m13
           ; m5 receives its bias here because .main2 only biases m3, m1, m7
    739    paddd                m5, m13
    740    psubd                m9, m0, m4 ; t1
    741    paddd                m0, m4     ; t0
    742    psrad                m9, 12
    743    psrad                m0, 12
    744 .main2:
    745    REPX    {paddd  x, m13}, m3, m1, m7
    746    REPX    {psrad  x, 12 }, m5, m1, m3, m7
    747    paddd                m8, m1, m5 ; t4
    748    psubd                m1, m5     ; t5a
    749    psubd                m5, m7, m3 ; t6a
    750    paddd                m7, m3     ; t7
    751    pmaxsd               m5, m14
    752    pmaxsd               m1, m14
           ; bias for m2 / m6 (t2 / t3); their shift is part of the REPX below
    753    paddd                m2, m13
    754    paddd                m6, m13
    755    pminsd               m5, m15
    756    pminsd               m1, m15
    757    pmulld               m5, m12
    758    pmulld               m1, m12
    759    pmaxsd               m8, m14
    760    pmaxsd               m7, m14
    761    pminsd               m8, m15
    762    paddd                m5, m13
    763    psubd                m4, m5, m1
    764    paddd                m5, m1
    765    REPX    {psrad  x, 12 }, m2, m6, m5, m4
    766    paddd                m1, m9, m2 ; dct4 out1
    767    psubd                m2, m9, m2 ; dct4 out2
    768    psubd                m3, m0, m6 ; dct4 out3
    769    paddd                m0, m6     ; dct4 out0
           ; m6 = min(m7, m15): finishes the clamp of t7 (the max against m14
           ; was applied above) and moves it into m6
    770    pminsd               m6, m15, m7
    771    REPX    {pmaxsd x, m14}, m0, m1, m2, m3
    772    REPX    {pminsd x, m15}, m0, m1, m2, m3
    773    ret
    774 .main_end:
        ; Final butterfly stage.
        ;   .main_end  - loads pd_1 into m11 and falls through.
        ;   .main_end2 - same, but expects the caller to have set m11 already.
        ; Inputs: m0-m3 (even half) and m8, m4, m5, m6 (odd half).
        ; m11 is added to m0-m3 as a rounding bias and afterwards reused as the
        ; per-element shift count of vpsravd, so on return
        ;   m0-m7 = out0-out7, each (sum-or-difference + m11) >> m11.
    775    vpbroadcastd        m11, [o(pd_1)]
    776 .main_end2:
    777    REPX     {paddd x, m11}, m0, m1, m2, m3
    778    psubd                m7, m0, m6 ; out7
    779    paddd                m0, m6     ; out0
    780    psubd                m6, m1, m5 ; out6
    781    paddd                m1, m5     ; out1
    782    psubd                m5, m2, m4 ; out5
    783    paddd                m2, m4     ; out2
    784    psubd                m4, m3, m8 ; out4
    785    paddd                m3, m8     ; out3
    786    REPX   {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    787    ret
    788 
    789 INV_TXFM_8X16_FN adst, dct
    790 INV_TXFM_8X16_FN adst, identity, 35
    791 INV_TXFM_8X16_FN adst, flipadst
    792 INV_TXFM_8X16_FN adst, adst
        ; NOTE(review): INV_TXFM_8X16_FN is defined outside this view; the lines
        ; above presumably emit the public adst_<type2> entry points, with the
        ; optional third argument being an eob offset - confirm at the macro.
    793 
    794 cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
        ; Pass 1 starts at the function entry and ends with a jump to tx2q (or
        ; to the shared pass1_end of idct_8x16_internal_10bpc); .pass2 is a
        ; separate entry point.
        ;   eob >= 43: load all eight coefficient rows (.load) and run .main.
        ;              .main returns its results with the rounding biases
        ;              already added, so only the final shifts (1 or 12 bits)
        ;              remain here.
        ;   eob <  43: .fast path, which loads only the ymm-sized part of each
        ;              row (see .fast_main).
    795 %undef cmp
    796    cmp                eobd, 43
    797    jl .fast
    798    call m(idct_8x16_internal_10bpc).load
    799    call .main
           ; m0, m1, m10, m11 carry a +1 bias -> shift by 1;
           ; m2, m3, m8, m9 carry the 12-bit rounding bias -> shift by 12.
           ; The results are gathered into m0-m7 for pass1_end.
    800    psrad                m0, 1
    801    psrad                m1, 1
    802    psrad                m6, m10, 1
    803    psrad                m7, m11, 1
    804    psrad                m2, 12
    805    psrad                m3, 12
    806    psrad                m4, m8, 12
    807    psrad                m5, m9, 12
    808    jmp m(idct_8x16_internal_10bpc).pass1_end
    809 .fast:
    810    call .fast_main
           ; Regroup the 64-bit halves so positive and negated outputs sit in
           ; separate registers, then add the bias in m11 (x + m11) or negate
           ; and bias in one step (m11 - x).
           ; NOTE(review): m11 is not written in this block; it is expected to
           ; hold the rounding bias on return from .fast_main - confirm.
    811    punpcklqdq           m1, m2, m4 ;  out4  out6
    812    punpckhqdq           m2, m0     ; -out5 -out7
    813    punpcklqdq           m0, m3     ;  out0  out2
    814    punpckhqdq           m4, m3     ; -out1 -out3
    815    paddd                m1, m11
    816    psubd                m3, m11, m2
    817    paddd                m0, m11
    818    psubd                m4, m11, m4
    819 .fast_end:
           ; Shared with iflipadst_8x16_internal_10bpc.fast: shift by 1, pack to
           ; 16-bit, permute dwords with indices loaded from permC+3, and split
           ; each 512-bit result into two ymm halves (ym0/ym2 and ym1/ym3).
    820    movu                 m5, [o(permC+3)]
    821    REPX       {psrad x, 1}, m1, m0, m3, m4
    822    packssdw             m2, m0, m1 ; 0 2 4 6
    823    packssdw             m3, m4, m3 ; 1 3 5 7
    824    vpermd               m2, m5, m2
    825    vpermd               m3, m5, m3
    826    mova                ym0, ym2
    827    vextracti32x8       ym2, m2, 1
    828    mova                ym1, ym3
    829    vextracti32x8       ym3, m3, 1
    830    jmp                tx2q
    831 .pass2:
    832    call .pass2_main
           ; Constants are addressed without o() from here on: .pass2_main
           ; re-points r5 at o_base_8bpc.
    833    movu                 m4, [permB+2]
    834    vbroadcasti32x8     m12, [pw_2048_m2048+16]
           ; m4, m7, m5, m6 become qword permutation indices derived from the
           ; same table by successive shifts. vpermi2q overwrites its index
           ; register with the result.
    835    psrlq                m7, m4, 8
    836    vpermi2q             m4, m0, m3 ;  0  1  2  3
    837    psrlq                m5, m7, 24
    838    vpermi2q             m7, m0, m3 ; 12 13 14 15
    839    psrlq                m6, m5, 8
    840    vpermq               m5, m5, m1 ;  4  5  6  7
    841    vpermq               m6, m6, m2 ;  8  9 10 11
    842 .pass2_end:
           ; Shared with iflipadst_8x16_internal_10bpc.pass2. Expects the four
           ; row groups in m4-m7 and the pmulhrsw multiplier in m12. Sets
           ; m11 = pixel_10bpc_max, m10 = 0 and r6 = stride*3, then passes each
           ; scaled group in m8 to the write_8x4 helpers of
           ; idct_8x8_internal_10bpc. The last call is a tail jump, so that
           ; helper's ret ends pass 2.
    843    vpbroadcastd        m11, [pixel_10bpc_max]
    844    pxor                m10, m10
    845    lea                  r6, [strideq*3]
    846    pmulhrsw             m8, m12, m4
    847    call m(idct_8x8_internal_10bpc).write_8x4_start
    848    pmulhrsw             m8, m12, m5
    849    call m(idct_8x8_internal_10bpc).write_8x4
    850    pmulhrsw             m8, m12, m6
    851    call m(idct_8x8_internal_10bpc).write_8x4
    852    pmulhrsw             m8, m12, m7
    853    jmp m(idct_8x8_internal_10bpc).write_8x4
    854 ALIGN function_align
        ; 8-point ADST core for pass 1 (shared with the flipadst variant).
        ; Inputs: m0-m7 = rounded coefficients, m13 = rounding bias passed to
        ; ITX_MULSUB_2D, m14 / m15 = clip bounds.
        ; Outputs are left unshifted but with their rounding terms folded in;
        ; callers apply the final shift. Following the arithmetic below, and
        ; assuming each pd_N constant holds the dword value N:
        ;   m0  =  out0 + 1                   m1  = 1 - (-out1)
        ;   m10 =  out6 + 1                   m11 = 1 - (-out7)
        ;   m2  = (t6 + t7)*1448 + 3072       m8  = (t2 - t3)*1448 + 3072
        ;   m3  = 3071 - (t2 + t3)*1448       m9  = 3071 - (t6 - t7)*1448
        ; Clobbers m4-m7 and m12 (left holding pd_1448).
    855 .main:
    856    ITX_MULSUB_2D         7, 0, 8, 9, 10, 13,  401, 4076 ; t1a, t0a
    857    ITX_MULSUB_2D         1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a
    858    ITX_MULSUB_2D         5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a
    859    ITX_MULSUB_2D         3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a
    860    psubd                m8, m2, m6 ; t6
    861    paddd                m2, m6     ; t2
    862    psubd                m6, m0, m4 ; t4
    863    paddd                m0, m4     ; t0
    864    psubd                m4, m5, m1 ; t7
    865    paddd                m5, m1     ; t3
    866    psubd                m1, m7, m3 ; t5
    867    paddd                m7, m3     ; t1
    868    REPX    {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7
    869    REPX    {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7
           ; the two rotations below take their coefficients from registers
           ; (m10 / m11) instead of immediates, with the roles swapped between
           ; the two invocations
    870    vpbroadcastd        m10, [o(pd_1567)]
    871    vpbroadcastd        m11, [o(pd_3784)]
    872    ITX_MULSUB_2D         6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a
    873    ITX_MULSUB_2D         4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a
    874    vpbroadcastd        m12, [o(pd_1448)]
    875    psubd                m9, m6, m8 ;  t7
    876    paddd                m6, m8     ;  out6
    877    psubd                m3, m7, m5 ;  t3
    878    paddd                m7, m5     ; -out7
    879    psubd                m5, m0, m2 ;  t2
    880    paddd                m0, m2     ;  out0
    881    psubd                m2, m1, m4 ;  t6
    882    paddd                m1, m4     ; -out1
    883    REPX    {pmaxsd x, m14}, m5, m3, m2, m9
    884    REPX    {pminsd x, m15}, m5, m3, m2, m9
    885    REPX    {pmulld x, m12}, m5, m3, m2, m9
    886    vpbroadcastd         m4, [o(pd_1)]
    887    psubd                m8, m5, m3 ; (t2 - t3) * 1448
    888    paddd                m3, m5     ; (t2 + t3) * 1448
    889    psubd                m5, m2, m9 ; (t6 - t7) * 1448
    890    paddd                m2, m9     ; (t6 + t7) * 1448
    891    vpbroadcastd         m9, [o(pd_3072)]
           ; fold the rounding terms in; "m4 - x" / "m9 - x" also undo the
           ; negation of the outputs that were produced with inverted sign
    892    paddd                m0, m4
    893    psubd                m1, m4, m1
    894    paddd               m10, m6, m4
    895    psubd               m11, m4, m7
    896    paddd                m2, m9
    897    paddd                m8, m9
    898    vpbroadcastd         m9, [o(pd_3071)]
    899    psubd                m3, m9, m3
    900    psubd                m9, m5
    901    ret
    902 ALIGN function_align
        ; Low-eob input stage (shared with the flipadst variant). Loads only the
        ; ymm-sized (32-byte) part of each of the eight 64-byte coefficient
        ; rows, in the row order 0, 2, 7, 5, 4, 6, 3, 1 into ym0, ym4, ym1, ym5,
        ; ym2, ym6, ym3, ym7. The rows are then passed to round_input_fast of
        ; idct_8x16_internal_10bpc, followed by a tail jump into the 8x8 ADST
        ; core, whose ret returns to this routine's caller.
    903 .fast_main:
    904    mova                ym0, [cq+64*0]
    905    mova                ym4, [cq+64*2]
    906    mova                ym1, [cq+64*7]
    907    mova                ym5, [cq+64*5]
    908    mova                ym2, [cq+64*4]
    909    mova                ym6, [cq+64*6]
    910    mova                ym3, [cq+64*3]
    911    mova                ym7, [cq+64*1]
    912    call m(idct_8x16_internal_10bpc).round_input_fast
    913    jmp m(iadst_8x8_internal_10bpc).main
    914 ALIGN function_align
        ; Pass-2 core (shared with the flipadst variant). Reorders the packed
        ; pass-1 output in m0-m3 and reuses the 8bpc implementation for the
        ; transform itself.
        ; Side effect: r5 is re-pointed at o_base_8bpc, so o()-relative
        ; addressing of this file's own constants is presumably no longer valid
        ; afterwards (callers use absolute addressing after this call).
    915 .pass2_main:
    916    mova                 m8, [o(iadst8x16p)]
    917    REPX  {vpermb x, m8, x}, m0, m1, m2, m3
           ; loaded through o() while r5 still holds this file's base
    918    vpbroadcastd        m10, [o(pw_2896x8)]
    919    punpckhdq            m5, m0, m1
    920    punpckldq            m0, m1
    921    punpckhdq            m1, m2, m3
    922    punpckldq            m2, m3
    923    lea                  r5, [o_base_8bpc]
    924    punpckhqdq           m4, m0, m2 ; 12  3   14  1
    925    punpcklqdq           m0, m2     ;  0 15    2 13
    926    punpckhqdq           m6, m5, m1 ;  8  7   10  5
    927    punpcklqdq           m5, m1     ;  4 11    6  9
    928    call m(iadst_8x16_internal_8bpc).main2
           ; final stage on the returned m2 / m4: saturating sum and difference,
           ; each scaled by m10 with pmulhrsw
    929    paddsw               m1, m2, m4
    930    psubsw               m2, m4
    931    pmulhrsw             m1, m10    ; -out7   out4   out6  -out5
    932    pmulhrsw             m2, m10    ;  out8  -out11 -out9   out10
    933    ret
    934 
    935 INV_TXFM_8X16_FN flipadst, dct
    936 INV_TXFM_8X16_FN flipadst, identity, 35
    937 INV_TXFM_8X16_FN flipadst, adst
    938 INV_TXFM_8X16_FN flipadst, flipadst
        ; NOTE(review): INV_TXFM_8X16_FN is defined outside this view; the lines
        ; above presumably emit the public flipadst_<type2> entry points.
    939 
    940 cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
        ; Flipped ADST: reuses every helper of iadst_8x16_internal_10bpc and
        ; differs only in how the results are assigned to registers (reverse
        ; output order) and, in pass 2, in the sign pattern of the multiplier
        ; (pw_m2048_2048 instead of pw_2048_m2048).
    941 %undef cmp
    942    cmp                eobd, 43
    943    jl .fast
    944    call m(idct_8x16_internal_10bpc).load
    945    call m(iadst_8x16_internal_10bpc).main
           ; Same final shifts as the adst variant, but the result of slot k is
           ; stored in register 7-k. Each source is read before the instruction
           ; that overwrites it, so the statement order matters.
    946    psrad                m7, m0, 1
    947    psrad                m0, m11, 1
    948    psrad                m6, m1, 1
    949    psrad                m1, m10, 1
    950    psrad                m5, m2, 12
    951    psrad                m2, m9, 12
    952    psrad                m4, m3, 12
    953    psrad                m3, m8, 12
    954    jmp m(idct_8x16_internal_10bpc).pass1_end
    955 .fast:
    956    call m(iadst_8x16_internal_10bpc).fast_main
           ; Regroup into the register layout .fast_end expects (m0, m1, m4, m3)
           ; with the outputs in reverse order; "m11 - x" negates and biases the
           ; outputs that were produced with inverted sign.
           ; NOTE(review): m11 is expected to hold the rounding bias on return
           ; from fast_main - it is not written in this block; confirm.
    957    punpckhqdq           m1, m3, m4 ; -out3 -out1
    958    punpcklqdq           m3, m0     ;  out2  out0
    959    punpckhqdq           m0, m2     ; -out7 -out5
    960    punpcklqdq           m4, m2     ;  out6  out4
    961    psubd                m1, m11, m1
    962    paddd                m3, m11
    963    psubd                m0, m11, m0
    964    paddd                m4, m11
    965    jmp m(iadst_8x16_internal_10bpc).fast_end
    966 .pass2:
    967    call m(iadst_8x16_internal_10bpc).pass2_main
           ; absolute addressing: pass2_main re-points r5 at o_base_8bpc
    968    movu                 m7, [permB+2]
    969    vbroadcasti32x8     m12, [pw_m2048_2048+16]
           ; Same index derivation as the adst variant, but with the source
           ; registers swapped (m3/m0, m2, m1) and the groups assigned to
           ; m7, m4, m5, m6, so that pass2_end (which writes m4, m5, m6, m7 in
           ; that order) emits the row groups in reverse.
    970    psrlq                m4, m7, 8
    971    vpermi2q             m7, m3, m0 ;  3  2  1  0
    972    psrlq                m5, m4, 24
    973    vpermi2q             m4, m3, m0 ; 15 14 13 12
    974    psrlq                m6, m5, 8
    975    vpermq               m5, m5, m2 ; 11 10  9  8
    976    vpermq               m6, m6, m1 ;  7  6  5  4
    977    jmp m(iadst_8x16_internal_10bpc).pass2_end
    978 
    979 INV_TXFM_8X16_FN identity, dct
    980 INV_TXFM_8X16_FN identity, adst
    981 INV_TXFM_8X16_FN identity, flipadst
    982 INV_TXFM_8X16_FN identity, identity
        ; NOTE(review): INV_TXFM_8X16_FN is defined outside this view; the lines
        ; above presumably emit the public identity_<type2> entry points.
    983 
    984 cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
        ; Pass 1 needs no transform math of its own: .load2 multiplies the eight
        ; coefficient rows by pd_2896 and rounds them (>> 12), and the shared
        ; pass1_end of idct_8x16_internal_10bpc finishes the pass.
    985    call m(idct_8x16_internal_10bpc).load2
    986    jmp m(idct_8x16_internal_10bpc).pass1_end
    987 .pass2:
           ; Scale m0-m3 (16-bit lanes): x = 2*x + pmulhrsw(x, pw_1697x16), with
           ; saturating adds. m4-m7 are temporaries for the pmulhrsw products.
    988    vpbroadcastd         m8, [o(pw_1697x16)]
    989    pmulhrsw             m4, m8, m0
    990    pmulhrsw             m5, m8, m1
    991    pmulhrsw             m6, m8, m2
    992    pmulhrsw             m7, m8, m3
    993    REPX      {paddsw x, x}, m0, m1, m2, m3
    994    paddsw               m0, m4
    995    paddsw               m1, m5
    996    paddsw               m2, m6
    997    paddsw               m3, m7
           ; m7 = final pmulhrsw multiplier, m6 = pixel maximum and m5 = zero
           ; are set up in between the word/dword/qword interleaves that
           ; transpose the data into the row groups named in the comments below
    998    vpbroadcastd         m7, [o(pw_2048)]
    999    punpckhwd            m4, m0, m1
   1000    punpcklwd            m0, m1
   1001    punpckhwd            m1, m2, m3
   1002    punpcklwd            m2, m3
   1003    vpbroadcastd         m6, [o(pixel_10bpc_max)]
   1004    punpckhdq            m3, m0, m2
   1005    punpckldq            m0, m2
   1006    punpckldq            m2, m4, m1
   1007    punpckhdq            m4, m1
   1008    pxor                 m5, m5
   1009    punpckhqdq           m1, m0, m2 ;  1  5  9 13
   1010    punpcklqdq           m0, m2     ;  0  4  8 12
   1011    punpcklqdq           m2, m3, m4 ;  2  6 10 14
   1012    punpckhqdq           m3, m4     ;  3  7 11 15
   1013    lea                  r6, [strideq*3]
           ; each group is scaled into m0 and written; the fourth group falls
           ; through into .write_8x4, whose ret ends pass 2
   1014    pmulhrsw             m0, m7
   1015    call .write_8x4_start
   1016    pmulhrsw             m0, m7, m1
   1017    call .write_8x4
   1018    pmulhrsw             m0, m7, m2
   1019    call .write_8x4
   1020    pmulhrsw             m0, m7, m3
   1021 .write_8x4:
           ; advance one destination row and two 64-byte coefficient rows
   1022    add                dstq, strideq
   1023    add                  cq, 64*2
   1024 .write_8x4_start:
           ; Adds the four 128-bit groups of m0 to the destination rows at
           ; dstq + stride*0, *4, *8 and *12 (r6*4 with r6 = stride*3), clamps
           ; the sums to [m5, m6] = [0, pixel max] and stores them back. Also
           ; zeroes two 64-byte coefficient rows at cq per call, so the four
           ; calls clear eight rows in total.
   1025    mova                xm4, [dstq+strideq*0]
   1026    vinserti32x4        ym4, [dstq+strideq*4], 1
   1027    vinserti32x4         m4, [dstq+strideq*8], 2
   1028    vinserti32x4         m4, [dstq+r6*4     ], 3
   1029    mova          [cq+64*0], m5
   1030    mova          [cq+64*1], m5
   1031    paddw                m4, m0
   1032    pmaxsw               m4, m5
   1033    pminsw               m4, m6
   1034    mova          [dstq+strideq*0], xm4
   1035    vextracti32x4 [dstq+strideq*4], ym4, 1
   1036    vextracti32x4 [dstq+strideq*8], m4, 2
   1037    vextracti32x4 [dstq+r6*4     ], m4, 3
   1038    ret
   1039 
   1040 %macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
        ; Declares one 16x8 inverse-transform entry point by forwarding its
        ; arguments to INV_TXFM_FN (defined outside this view). For the dct_dct
        ; combination only, the macro also emits the DC-only path below.
        ;
        ; DC-only path: the DC coefficient is scaled three times by 181, with a
        ; rounding shift after each multiply. The last two shifts use the
        ; biases 384 (>> 9) and 2176 (>> 12). 181/256 is approximately
        ; 1/sqrt(2). The result is broadcast and added to every pixel.
        ; .dconly and .dconly2 are labels that skip the earlier scaling steps.
        ; NOTE(review): r3d is presumably the eob register, which the existing
        ; comment on the store marks as 0 on this path; "or r3d, 8" then makes
        ; it the row counter for .dconly_loop - confirm against INV_TXFM_FN.
   1041    INV_TXFM_FN          %1, %2, %3, 16x8
   1042 %ifidn %1_%2, dct_dct
   1043    imul                r6d, [cq], 181
   1044    mov                [cq], eobd ; 0
   1045    or                  r3d, 8
   1046 .dconly:
   1047    add                 r6d, 128
   1048    sar                 r6d, 8
   1049    imul                r6d, 181
   1050    add                 r6d, 384
   1051    sar                 r6d, 9
   1052 .dconly2:
   1053    vpbroadcastd         m2, [o(dconly_10bpc)]
   1054    imul                r6d, 181
   1055    add                 r6d, 2176
   1056    sar                 r6d, 12
   1057    vpbroadcastw         m1, r6d
           ; m1 = dc + bias (signed saturation). Together with the unsigned
           ; saturating subtract of the same bias in the loop this clamps the
           ; pixel sums; the bias value itself is defined outside this view.
   1058    paddsw               m1, m2
   1059 .dconly_loop:
           ; two 32-byte rows per iteration, one in each half of m0
   1060    mova                ym0, [dstq+strideq*0]
   1061    vinserti32x8         m0, [dstq+strideq*1], 1
   1062    paddsw               m0, m1
   1063    psubusw              m0, m2
   1064    mova          [dstq+strideq*0], ym0
   1065    vextracti32x8 [dstq+strideq*1], m0, 1
   1066    lea                dstq, [dstq+strideq*2]
   1067    sub                 r3d, 2
   1068    jg .dconly_loop
   1069    RET
   1070 %endif
   1071 %endmacro
   1072 
   1073 INV_TXFM_16X8_FN dct, dct
   1074 INV_TXFM_16X8_FN dct, identity, -21
   1075 INV_TXFM_16X8_FN dct, flipadst
   1076 INV_TXFM_16X8_FN dct, adst
        ; entry points for dct as first transform; the third argument is the
        ; macro's eob_offset parameter
   1077 
   1078 cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
        ; Pass 1 (function entry .. jmp tx2q):
        ;   The 64-byte coefficient rows are multiplied by pd_2896, rounded
        ;   (+m13, >> 12) and cleared in memory. Each row holds two coefficient
        ;   indices (see the "0 1", "2 3" ... comments), which the vpermi2q /
        ;   vpermt2q shuffles regroup into the pairs the transform cores expect.
        ;   eob >= 43: all eight rows are used; idct_8x8 .main followed by the
        ;              local .main.
        ;   eob <  43: only rows 0-3 are read (.fast); the *_fast cores are used.
        ;   .pass1_end / .pass1_end2 finish with the final butterfly, the
        ;   transpose and the jump to the second transform (tx2q).
        ; Pass 2 (.pass2): runs the 8bpc implementation on the packed data and
        ;   adds the result to the destination through .end / .write_16x4.
   1079 %undef cmp
   1080    vpbroadcastd        m12, [o(pd_2896)]
   1081    pmulld               m4, m12, [cq+64*0] ;  0  1
   1082    pmulld               m9, m12, [cq+64*1] ;  2  3
   1083    pmulld               m8, m12, [cq+64*2] ;  4  5
   1084    pmulld               m7, m12, [cq+64*3] ;  6  7
   1085    vpbroadcastd        m13, [o(pd_2048)]
   1086    pxor                 m2, m2
           ; m15 temporarily holds the permB shuffle indices; it is reloaded
           ; with clip_18b_max before the transform cores are called
   1087    mova                m15, [o(permB)]
   1088    REPX {mova [cq+64*x], m2}, 0, 1, 2, 3
           ; m0 = second index vector, derived from permB by a 32-bit shift
   1089    psrlq                m0, m15, 32
   1090    REPX     {paddd x, m13}, m4, m9, m8, m7
   1091    vpbroadcastd        m14, [o(clip_18b_min)]
   1092    REPX     {psrad x, 12 }, m4, m8, m9, m7
           ; vpermi2q overwrites its index register with the result, so the
           ; index is copied (m1, later m2 / m3) before each use
   1093    mova                 m1, m0
   1094    vpermi2q             m0, m4, m8   ;  0  4
   1095    cmp                eobd, 43
   1096    jl .fast
   1097    pmulld               m5, m12, [cq+64*4] ;  8  9
   1098    pmulld              m10, m12, [cq+64*5] ; 10 11
   1099    pmulld              m11, m12, [cq+64*6] ; 12 13
   1100    pmulld               m6, m12, [cq+64*7] ; 14 15
           ; m2 is still zero here (the index copy into m2 happens below)
   1101    REPX {mova [cq+64*x], m2}, 4, 5, 6, 7
   1102    REPX     {paddd x, m13}, m5, m10, m11, m6
   1103    REPX     {psrad x, 12 }, m10, m5, m11, m6
   1104    mova                 m2, m1
   1105    vpermi2q             m1, m9, m10  ;  2 10
   1106    mova                 m3, m2
   1107    vpermi2q             m2, m5, m11  ;  8 12
   1108    vpermi2q             m3, m6, m7   ; 14  6
   1109    vpermt2q             m4, m15, m11 ;  1 13
   1110    vpermt2q             m6, m15, m9  ; 15  3
   1111    vpermt2q             m5, m15, m8  ;  9  5
   1112    vpermt2q             m7, m15, m10 ;  7 11
   1113    vpbroadcastd        m15, [o(clip_18b_max)]
   1114    call m(idct_8x8_internal_10bpc).main
   1115    call .main
   1116    jmp .pass1_end
   1117 .fast:
   1118    vpermi2q             m1, m9, m7   ;  2  6
   1119    vpermt2q             m4, m15, m9  ;  1  3
   1120    vpermt2q             m7, m15, m8  ;  7  5
   1121    vpbroadcastd        m15, [o(clip_18b_max)]
   1122    call m(idct_8x8_internal_10bpc).main_fast
   1123    call .main_fast
   1124 .pass1_end:
   1125    call m(idct_8x16_internal_10bpc).main_end
           ; m8 = permA, m9 = permA >> 8: index vectors for the transpose
   1126    mova                 m8, [o(permA)]
   1127    psrlq                m9, m8, 8
   1128 .pass1_end2:
           ; Entry point for the adst / flipadst variants, which arrive with
           ; their own m8 / m9. m10 / m11 receive copies because the vpermi2d
           ; instructions in .transpose_16x8 overwrite their index registers.
   1129    mova                m10, m9
   1130    mova                m11, m8
   1131    call .transpose_16x8
   1132    jmp                tx2q
   1133 .pass2:
           ; r5 is re-pointed at the 8bpc constant base for the 8bpc core, so
           ; the constants below are addressed without o()
   1134    lea                  r5, [o_base_8bpc]
   1135    call m(idct_16x8_internal_8bpc).main
   1136    movshdup             m4, [permC]
   1137    vpbroadcastd        m11, [pw_2048]
   1138    psrlq                m5, m4, 8
   1139 .end:
           ; Shared output stage (the identity variant also jumps here).
           ; Expects: m0-m3 = results, m4 / m5 = qword permutation indices,
           ; m11 = pmulhrsw multiplier. Writes two groups of four rows; the
           ; second group falls through into .write_16x4 and returns from there.
   1140    vpbroadcastd        m13, [pixel_10bpc_max]
   1141    pxor                m12, m12
   1142    vpermq               m8, m4, m0
   1143    vpermq               m9, m5, m1
   1144    lea                  r6, [strideq*3]
   1145    call .write_16x4
   1146    vpermq               m8, m4, m2
   1147    vpermq               m9, m5, m3
   1148 .write_16x4:
   1149    pmulhrsw             m8, m11
   1150    pmulhrsw             m9, m11
   1151 .write_16x4_noround:
           ; Adds m8 (rows 0-1) and m9 (rows 2-3) to four destination rows of
           ; 32 bytes each, clamps to [m12, m13] = [0, pixel max], stores the
           ; rows and advances dstq by four rows. Expects r6 = stride*3.
   1152    mova               ym10, [dstq+strideq*0]
   1153    vinserti32x8        m10, [dstq+strideq*1], 1
   1154    paddw                m8, m10
   1155    mova               ym10, [dstq+strideq*2]
   1156    vinserti32x8        m10, [dstq+r6       ], 1
   1157    paddw                m9, m10
   1158    pmaxsw               m8, m12
   1159    pmaxsw               m9, m12
   1160    pminsw               m8, m13
   1161    pminsw               m9, m13
   1162    mova          [dstq+strideq*0], ym8
   1163    vextracti32x8 [dstq+strideq*1], m8, 1
   1164    mova          [dstq+strideq*2], ym9
   1165    vextracti32x8 [dstq+r6       ], m9, 1
   1166    lea                dstq, [dstq+strideq*4]
   1167    ret
   1168 ALIGN function_align
        ; Odd half of the 16-point DCT, processing two coefficient indices per
        ; register (one in each 64-bit half of every 128-bit lane, as the paired
        ; register comments show).
        ;   .main_fast - only m4 and m7 carry input; each rotation becomes a
        ;                single multiply by a paired constant.
        ;   .main      - full version, inputs in m4-m7.
        ;   .main2 / .main3 - shared tail; .main3 also combines the even-half
        ;                results already in m0, m1, m2, m8 into "dct8 out" pairs.
        ; Expects m12 = multiplier, m13 = rounding bias, m14 / m15 = clip bounds.
        ; On return (per the comments below): m0 = dct8 out0 out1,
        ; m1 = dct8 out3 out2, m2 = dct8 out4 out5, m3 = dct8 out7 out6,
        ; m8 = t8a t9, m6 = t15a t14, m4 = t11 t10a, m5 = t12 t13a.
   1169 .main_fast: ; bottom half is zero
   1170    vbroadcasti32x4      m6, [o(pd_4076_3920)]
   1171    vbroadcasti32x4      m3, [o(pd_401_m1189)]
   1172    vbroadcasti32x4      m5, [o(pd_m2598_1931)]
   1173    vbroadcasti32x4      m9, [o(pd_3166_3612)]
   1174    pmulld               m6, m4    ; t15a t12a
   1175    pmulld               m4, m3    ; t8a  t11a
   1176    pmulld               m5, m7    ; t9a  t10a
   1177    pmulld               m7, m9    ; t14a t13a
   1178    jmp .main2
   1179 .main:
   1180    ITX_MULSUB_2D         4, 6, 3, 9, 10, _,  401_3920, 4076_1189
   1181    ITX_MULSUB_2D         5, 7, 3, 9, 10, _, 3166_1931, 2598_3612
   1182 .main2:
           ; rounding for both entry paths is applied here
   1183    REPX     {paddd x, m13}, m4, m6, m5, m7
   1184    REPX     {psrad x, 12 }, m4, m5, m6, m7
   1185    paddd                m9, m4, m5 ; t8   t11
   1186    psubd                m4, m5     ; t9   t10
   1187    psubd                m5, m6, m7 ; t14  t13
   1188    paddd                m6, m7     ; t15  t12
   1189    REPX    {pmaxsd x, m14}, m5, m4, m9, m6
   1190    REPX    {pminsd x, m15}, m5, m4, m9, m6
   1191 .main3:
           ; the even-half butterflies (m0-m3, m8) are interleaved with the
           ; rotation of m5 / m4 below
   1192    psubd                m3, m0, m1 ; dct8 out7 out6
   1193    paddd                m0, m1     ; dct8 out0 out1
           ; m7 = m5*(3784, -3784) + m4*(1567, -1567) and
           ; m5 = m5*1567 - m4*3784, each biased with m13 before the shift
   1194    vbroadcasti32x4      m7, [o(pd_3784_m3784)]
   1195    pmulld               m7, m5
   1196    vpmulld              m5, [o(pd_1567)] {1to16}
   1197    paddd                m1, m2, m8 ; dct8 out3 out2
   1198    psubd                m2, m8     ; dct8 out4 out5
   1199    vbroadcasti32x4      m8, [o(pd_1567_m1567)]
   1200    pmulld               m8, m4
   1201    vpmulld              m4, [o(pd_3784)] {1to16}
   1202    REPX    {pmaxsd x, m14}, m0, m1
   1203    REPX    {pminsd x, m15}, m0, m1
   1204    paddd                m7, m13
   1205    paddd                m5, m13
   1206    paddd                m7, m8
   1207    psubd                m5, m4
   1208    psrad                m7, 12     ; t14a t10a
   1209    psrad                m5, 12     ; t9a  t13a
           ; regroup the 64-bit halves so matching terms line up for the next
           ; butterfly
   1210    punpckhqdq           m4, m9, m7
   1211    punpcklqdq           m8, m9, m5
   1212    punpckhqdq           m5, m6, m5
   1213    punpcklqdq           m6, m7
   1214    psubd                m7, m8, m4 ; t11a t10
   1215    paddd                m8, m4     ; t8a  t9
   1216    psubd                m4, m6, m5 ; t12a t13
   1217    paddd                m6, m5     ; t15a t14
   1218    REPX    {pmaxsd x, m14}, m4, m7
   1219    REPX    {pminsd x, m15}, m4, m7
   1220    pmulld               m4, m12
   1221    pmulld               m7, m12
   1222    REPX    {pmaxsd x, m14}, m2, m3, m6, m8
   1223    REPX    {pminsd x, m15}, m2, m3, m6, m8
           ; bias added once to m4 so that both the sum and the difference
           ; below include it
   1224    paddd                m4, m13
   1225    paddd                m5, m4, m7
   1226    psubd                m4, m7
   1227    psrad                m4, 12     ; t11 t10a
   1228    psrad                m5, 12     ; t12 t13a
   1229    ret
   1230 ALIGN function_align
        ; Packs the 32-bit results in m0-m7 to 16-bit with signed saturation
        ; (pairs m0/m4, m1/m5, m2/m6, m3/m7) and reorders them with dword
        ; permutes followed by word and dword interleaves.
        ; Inputs: m0-m7 = data; m8, m9, m10, m11 = dword permutation index
        ; vectors supplied by the caller. m8, m10 and m11 are consumed, since
        ; vpermi2d overwrites its index register.
        ; Output: m0-m3. Clobbers m4, m8, m10, m11.
   1231 .transpose_16x8:
   1232    packssdw             m0, m4
   1233    packssdw             m1, m5
   1234    packssdw             m2, m6
   1235    packssdw             m3, m7
   1236    vpermi2d             m8, m0, m2
   1237    vpermt2d             m0, m9, m2
   1238    vpermi2d            m10, m1, m3
   1239    vpermi2d            m11, m1, m3
   1240    punpckhwd            m3, m8, m0
   1241    punpcklwd            m1, m8, m0
   1242    punpckhwd            m4, m10, m11
   1243    punpcklwd            m2, m10, m11
   1244    punpckldq            m0, m1, m2
   1245    punpckhdq            m1, m2
   1246    punpckldq            m2, m3, m4
   1247    punpckhdq            m3, m4
   1248    ret
   1249 
   1250 INV_TXFM_16X8_FN adst, dct
   1251 INV_TXFM_16X8_FN adst, identity, -21
   1252 INV_TXFM_16X8_FN adst, flipadst
   1253 INV_TXFM_16X8_FN adst, adst
        ; entry points for adst as first transform; the third argument is the
        ; macro's eob_offset parameter
   1254 
   1255 cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
        ; Pass 1: .main_pass1 returns eight registers (m0-m3, m5-m8) in which
        ; every second result carries the wrong sign. Adding the bias
        ; (x + m9) or subtracting from it (m9 - x, which also negates) yields
        ; biased values of uniform sign in m0-m7. .pass1_end then shifts them
        ; by 1 and hands over to the shared transpose in
        ; idct_16x8_internal_10bpc.
        ; Pass 2 (.pass2): .main_pass2 does the transform and sets up the write
        ; constants; the results are written without further rounding.
   1256 %undef cmp
   1257    call .main_pass1
   1258    vpbroadcastd         m9, [o(pd_1)]
           ; statement order matters from m4 onwards: each source register is
           ; read before the following instruction overwrites it
   1259    paddd                m0, m9
   1260    psubd                m1, m9, m1
   1261    paddd                m2, m9
   1262    psubd                m3, m9, m3
   1263    paddd                m4, m9, m5
   1264    psubd                m5, m9, m6
   1265    paddd                m6, m9, m7
   1266    psubd                m7, m9, m8
   1267 .pass1_end:
           ; Shared with the flipadst variant. The index vectors are assigned
           ; the other way round than in idct_16x8_internal_10bpc.pass1_end:
           ; here m9 = permA and m8 = permA >> 8.
   1268    mova                 m9, [o(permA)]
   1269    psrlq                m8, m9, 8
   1270    REPX       {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7
   1271    jmp m(idct_16x8_internal_10bpc).pass1_end2
   1272 .pass2:
   1273    call .main_pass2
           ; m11 = qword permutation indices set by .main_pass2. The second
           ; write is a tail jump, so the helper's ret ends pass 2.
   1274    vpermq               m8, m11, m0
   1275    vpermq               m9, m11, m1
   1276    call m(idct_16x8_internal_10bpc).write_16x4_noround
   1277    vpermq               m8, m11, m2
   1278    vpermq               m9, m11, m3
   1279    jmp m(idct_16x8_internal_10bpc).write_16x4_noround
   1280 ALIGN function_align
        ; 16-point ADST core for pass 1 (shared with the flipadst variant).
        ;   .main_pass1 - loads the coefficient rows multiplied by pd_2896,
        ;                 rounds them, clears them in memory, and shuffles them
        ;                 into the index pairs named in the comments. With
        ;                 eob < 43 only rows 0-3 are read and .main_fast is used.
        ;   .main       - first rotation stage with all inputs (m0-m7).
        ;   .main_fast  - first rotation stage when only m0, m2, m5, m7 carry
        ;                 input: each rotation becomes two plain multiplies.
        ;   .main2      - remaining stages, shared by both paths.
        ; Sets m12 = pd_2896, m13 = pd_2048, m14 / m15 = clip bounds; uses r6d
        ; and k1.
        ; Output: eight registers m0-m3 and m5-m8 holding the 16 results as
        ; pairs. At the point of computation one element of each pair is
        ; negated (see the register comments); the masked negations near the
        ; end flip the sign of the elements selected by k1.
   1281 .main_pass1:
   1282    vpbroadcastd        m12, [o(pd_2896)]
   1283    pmulld               m2, m12, [cq+64*0]
   1284    pmulld               m7, m12, [cq+64*1]
   1285    pmulld               m1, m12, [cq+64*2]
   1286    pmulld               m5, m12, [cq+64*3]
   1287    vpbroadcastd        m13, [o(pd_2048)]
   1288    pxor                 m4, m4
           ; m10 = permB and m6 = permB >> 32 are the two shuffle index vectors
   1289    mova                m10, [o(permB)]
   1290    REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
   1291    REPX     {paddd x, m13}, m2, m7, m1, m5
   1292    psrlq                m6, m10, 32
   1293    REPX     {psrad x, 12 }, m2, m7, m1, m5
           ; vpermi2q overwrites its index register, so m6 is copied before
           ; each use; vpermt2q overwrites its first data operand instead
   1294    mova                 m0, m6
   1295    vpermi2q             m0, m2, m7  ;  0  2
   1296    vpermt2q             m7, m10, m2 ;  3  1
   1297    mova                 m2, m6
   1298    vpermi2q             m2, m1, m5  ;  4  6
   1299    vpermt2q             m5, m10, m1 ;  7  5
   1300    cmp                eobd, 43
   1301    jl .main_fast
   1302    pmulld               m8, m12, [cq+64*4]
   1303    pmulld               m3, m12, [cq+64*5]
   1304    pmulld               m9, m12, [cq+64*6]
   1305    pmulld               m1, m12, [cq+64*7]
           ; m4 is still zero here (the index copy into m4 happens below)
   1306    REPX {mova [cq+64*x], m4}, 4, 5, 6, 7
   1307    REPX     {paddd x, m13}, m8, m3, m9, m1
   1308    REPX     {psrad x, 12 }, m8, m3, m9, m1
   1309    mova                 m4, m6
   1310    vpermi2q             m4, m8, m3  ;  8 10
   1311    vpermt2q             m3, m10, m8 ; 11  9
           ; last use of the index in m6, so no copy is needed
   1312    vpermi2q             m6, m9, m1  ; 12 14
   1313    vpermt2q             m1, m10, m9 ; 15 13
   1314 .main:
   1315    ITX_MULSUB_2D         1, 0, 8, 9, 10, _,  201_995,  4091_3973, 1
   1316    ITX_MULSUB_2D         3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1
   1317    ITX_MULSUB_2D         5, 4, 8, 9, 10, _, 3035_3513, 2751_2106
   1318    ITX_MULSUB_2D         7, 6, 8, 9, 10, _, 3857_4052, 1380_601
   1319    jmp .main2
   1320 .main_fast:
   1321    vbroadcasti32x4      m1, [o(pd_4091_3973)]
   1322    vbroadcasti32x4      m8, [o(pd_201_995)]
   1323    vbroadcasti32x4      m3, [o(pd_3703_3290)]
   1324    vbroadcasti32x4      m9, [o(pd_1751_2440)]
   1325    vbroadcasti32x4      m4, [o(pd_2751_2106)]
   1326    vbroadcasti32x4     m10, [o(pd_3035_3513)]
   1327    vbroadcasti32x4      m6, [o(pd_1380_601)]
   1328    vbroadcasti32x4     m11, [o(pd_3857_4052)]
   1329    pmulld               m1, m0
   1330    pmulld               m0, m8
   1331    pmulld               m3, m2
   1332    pmulld               m2, m9
   1333    pmulld               m4, m5
   1334    pmulld               m5, m10
   1335    pmulld               m6, m7
   1336    pmulld               m7, m11
   1337 .main2:
   1338    vpbroadcastd        m14, [o(clip_18b_min)]
   1339    vpbroadcastd        m15, [o(clip_18b_max)]
           ; m1 and m3 are negated while the bias is added (m13 - x); the other
           ; six registers get a plain bias; all eight are then shifted
   1340    REPX  {psubd x, m13, x}, m1, m3
   1341    REPX  {paddd x, m13   }, m0, m2, m4, m5, m6, m7
   1342    REPX  {psrad x, 12    }, m0, m4, m1, m5, m2, m6, m3, m7
   1343    psubd                m8, m0, m4 ; t8a  t10a
   1344    paddd                m0, m4     ; t0a  t2a
   1345    psubd                m4, m1, m5 ; t9a  t11a
   1346    paddd                m1, m5     ; t1a  t3a
   1347    psubd                m5, m2, m6 ; t12a t14a
   1348    paddd                m2, m6     ; t4a  t6a
   1349    psubd                m6, m3, m7 ; t13a t15a
   1350    paddd                m3, m7     ; t5a  t7a
   1351    REPX    {pmaxsd x, m14}, m8, m4, m5, m6
   1352    REPX    {pminsd x, m15}, m8, m4, m5, m6
           ; rotation coefficients are passed in registers (m10 / m11), with
           ; the roles swapped between the two invocations
   1353    vbroadcasti32x4     m11, [o(pd_4017_2276)]
   1354    vbroadcasti32x4     m10, [o(pd_799_3406)]
   1355    ITX_MULSUB_2D         8, 4, 7, 9, _, 13, 10, 11
   1356    ITX_MULSUB_2D         6, 5, 7, 9, _, 13, 11, 10
   1357    REPX    {pmaxsd x, m14}, m0, m2, m1, m3
   1358    REPX    {pminsd x, m15}, m0, m2, m1, m3
   1359    psubd                m7, m0, m2 ; t4   t6
   1360    paddd                m0, m2     ; t0   t2
   1361    psubd                m2, m1, m3 ; t5   t7
   1362    paddd                m1, m3     ; t1   t3
   1363    psubd                m3, m4, m6 ; t12a t14a
   1364    paddd                m4, m6     ; t8a  t10a
   1365    psubd                m6, m8, m5 ; t13a t15a
   1366    paddd                m8, m5     ; t9a  t11a
   1367    REPX    {pmaxsd x, m14}, m7, m3, m2, m6
   1368    REPX    {pminsd x, m15}, m7, m3, m2, m6
   1369    punpcklqdq           m5, m3, m7 ; t12a t4
   1370    punpckhqdq           m3, m7     ; t14a t6
   1371    punpckhqdq           m7, m6, m2 ; t15a t7
   1372    punpcklqdq           m6, m2     ; t13a t5
   1373    vpbroadcastd        m11, [o(pd_1567)]
   1374    vpbroadcastd        m10, [o(pd_3784)]
   1375    ITX_MULSUB_2D         7, 3, 2, 9, 10, 13, 10, 11
   1376    ITX_MULSUB_2D         5, 6, 2, 9, 10, 13, 11, 10
   1377    REPX    {pmaxsd x, m14}, m0, m4, m1, m8
   1378    REPX    {pminsd x, m15}, m0, m4, m1, m8
   1379    punpckhqdq           m2, m4, m0 ; t10a t2
   1380    punpcklqdq           m4, m0     ; t8a  t0
   1381    punpckhqdq           m0, m8, m1 ; t11a t3
   1382    punpcklqdq           m8, m1     ; t9a  t1
   1383    paddd                m1, m6, m7 ;  out2  -out3
   1384    psubd                m6, m7     ; t14a t6
   1385    paddd                m7, m5, m3 ; -out13  out12
   1386    psubd                m5, m3     ; t15a t7
   1387    psubd                m3, m8, m0 ; t11  t3a
   1388    paddd                m8, m0     ;  out14 -out15
   1389    paddd                m0, m4, m2 ; -out1   out0
   1390    psubd                m4, m2     ; t10  t2a
   1391    REPX    {pmaxsd x, m14}, m6, m5, m3, m4
           ; k1 = 0x3333 selects dword elements 0, 1, 4, 5, 8, 9, 12, 13, i.e.
           ; the low 64 bits of every 128-bit lane. The mask setup is
           ; interleaved with the clamp and multiply.
   1392    mov                 r6d, 0x3333
   1393    REPX    {pminsd x, m15}, m6, m5, m3, m4
   1394    kmovw                k1, r6d
   1395    REPX    {pmulld x, m12}, m6, m5, m3, m4
   1396    pxor                 m9, m9
           ; masked x = 0 - x: negates only the elements selected by k1; the
           ; remaining elements keep their value (merge masking)
   1397    REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8
   1398    paddd                m6, m13
   1399    paddd                m4, m13
   1400    paddd                m2, m6, m5 ; -out5   out4
   1401    psubd                m6, m5     ;  out10 -out11
   1402    psubd                m5, m4, m3 ; -out9   out8
   1403    paddd                m3, m4     ;  out6  -out7
   1404    REPX     {psrad  x, 12}, m2, m3, m5, m6
   1405    REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6
   1406    ret
   1407 ALIGN function_align
        ; Pass-2 core (shared with the flipadst variant): prepares m4 / m5 as
        ; dword-swapped copies of m0 / m1, runs the 8bpc implementation, and
        ; sets up the registers the write helper needs:
        ;   m11 = movshdup of permC (qword permutation indices for the callers),
        ;   m12 = 0, m13 = pixel_10bpc_max, r6 = stride*3.
        ; r5 is re-pointed at o_base_8bpc, so the constants after that point are
        ; addressed without o().
        ; NOTE(review): m6 is not written in this block; it is presumably the
        ; scaling multiplier left behind by the 8bpc main_pass2 - confirm there.
   1408 .main_pass2:
   1409    lea                  r5, [o_base_8bpc]
   1410    pshufd               m4, m0, q1032
   1411    pshufd               m5, m1, q1032
   1412    call m(iadst_16x8_internal_8bpc).main_pass2
   1413    movshdup            m11, [permC]
   1414    pmulhrsw             m0, m6
   1415    pmulhrsw             m1, m6
   1416    vpbroadcastd        m13, [pixel_10bpc_max]
   1417    pxor                m12, m12
   1418    lea                  r6, [strideq*3]
   1419    ret
   1420 
   1421 INV_TXFM_16X8_FN flipadst, dct
   1422 INV_TXFM_16X8_FN flipadst, identity, -21
   1423 INV_TXFM_16X8_FN flipadst, adst
   1424 INV_TXFM_16X8_FN flipadst, flipadst
        ; entry points for flipadst as first transform; the third argument is
        ; the macro's eob_offset parameter
   1425 
   1426 cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
        ; Flipped ADST: reuses the helpers of iadst_16x8_internal_10bpc and only
        ; reverses the order in which the results are assigned to m0-m7
        ; (pass 1) or written out (pass 2).
   1427    call m(iadst_16x8_internal_10bpc).main_pass1
   1428    vpbroadcastd         m9, [o(pd_1)]
           ; Same bias / sign fix-up as the adst variant (x + m9 or m9 - x), but
           ; the results go to m0-m7 in reverse order. Every source register is
           ; read before it is overwritten, so the statement order matters.
   1429    psubd                m4, m9, m3
   1430    paddd                m3, m9, m5
   1431    paddd                m5, m9, m2
   1432    psubd                m2, m9, m6
   1433    psubd                m6, m9, m1
   1434    paddd                m1, m9, m7
   1435    paddd                m7, m9, m0
   1436    psubd                m0, m9, m8
   1437    jmp m(iadst_16x8_internal_10bpc).pass1_end
   1438 .pass2:
   1439    call m(iadst_16x8_internal_10bpc).main_pass2
           ; the permutation indices in m11 are shifted by 8 bits relative to
           ; the adst variant, and the result registers are consumed in reverse
           ; (m3, m2, then m1, m0)
   1440    psrlq               m11, 8
   1441    vpermq               m8, m11, m3
   1442    vpermq               m9, m11, m2
   1443    call m(idct_16x8_internal_10bpc).write_16x4_noround
   1444    vpermq               m8, m11, m1
   1445    vpermq               m9, m11, m0
   1446    jmp m(idct_16x8_internal_10bpc).write_16x4_noround
   1447 
   1448 INV_TXFM_16X8_FN identity, dct
   1449 INV_TXFM_16X8_FN identity, adst
   1450 INV_TXFM_16X8_FN identity, flipadst
   1451 INV_TXFM_16X8_FN identity, identity
   1452 
        ; 16x8 identity transform. Pass 1 loads the coefficients through the 8x16
        ; helper, multiplies them by pd_5793, clears the first eight 64-byte rows of
        ; the coefficient buffer, rounds via the 8x16 helper and transposes.
        ; Pass 2 only sets up m4/m5/m11 and jumps to the shared idct_16x8 tail.
   1453 cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
   1454    call m(idct_8x16_internal_10bpc).load2
   1455    vpbroadcastd         m8, [o(pd_5793)]
   1456    vpbroadcastd        m13, [o(pd_3072)] ; presumably the rounding bias consumed by .round - confirm
   1457    pxor                m10, m10 ; m10 = 0, used to clear the coefficient buffer
   1458    REPX     {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
   1459    REPX {mova [cq+64*x], m10}, 0, 1, 2, 3, 4, 5, 6, 7
   1460    call m(idct_8x16_internal_10bpc).round
            ; Permutation controls for the transpose: m8 = permA >> 16 and
            ; m9 = m8 >> 8 (per-qword shifts); m10/m11 are copies of m8/m9.
   1461    psrlq                m8, [o(permA)], 16
   1462    psrlq                m9, m8, 8
   1463    mova                m10, m8
   1464    mova                m11, m9
   1465    call m(idct_16x8_internal_10bpc).transpose_16x8
   1466    jmp                tx2q ; continue with the second-pass routine
   1467 .pass2:
   1468    movshdup             m4, [o(permC)] ; odd dwords of permC, duplicated
   1469    vpbroadcastd        m11, [o(pw_4096)]
   1470    mova                 m5, m4
   1471    jmp m(idct_16x8_internal_10bpc).end
   1472 
   1473 %macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
        ; Instantiates a 16x16 entry point through INV_TXFM_FN. For the dct_dct
        ; combination it additionally emits a DC-only sequence that scales the first
        ; coefficient and jumps to the dconly2 tail of the 16x8 dct_dct function.
        ; NOTE(review): this sequence is presumably reached only when eob is 0,
        ; as the existing comment on the store suggests - confirm in INV_TXFM_FN.
   1474    INV_TXFM_FN          %1, %2, %3, 16x16
   1475 %ifidn %1_%2, dct_dct
   1476    imul                r6d, [cq], 181 ; r6d = first coefficient * 181
   1477    mov                [cq], eobd ; 0
   1478    or                  r3d, 16 ; presumably the row count expected by dconly2 - confirm
   1479    add                 r6d, 640
   1480    sar                 r6d, 10 ; r6d = (r6d + 640) >> 10
   1481    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
   1482 %endif
   1483 %endmacro
   1484 
   1485 INV_TXFM_16X16_FN dct, dct
   1486 INV_TXFM_16X16_FN dct, identity, 28
   1487 INV_TXFM_16X16_FN dct, flipadst
   1488 INV_TXFM_16X16_FN dct, adst
   1489 
        ; 16x16 inverse DCT.
        ; Pass 1: eob < 36 takes the .fast path, which reads only the low 256 bits
        ; of rows 0-7. Otherwise the even rows go through the 8x16 core and the odd
        ; rows through .main; .main_end combines and packs both halves to words.
        ; The labels .pass1_end, .pass1_end2 and .pass1_end3 are shared tails that
        ; other functions in this file jump to.
        ; Pass 2: delegates to the 8bpc implementation and stores four groups of
        ; rows through write_16x4_noround.
   1490 cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
   1491 %undef cmp
   1492    vpbroadcastd        m13, [o(pd_2048)] ; rounding bias used before the >>12 shifts
   1493    vpbroadcastd        m12, [o(pd_2896)]
   1494    vpbroadcastd        m14, [o(clip_18b_min)] ; m14/m15 = clamp bounds for intermediates
   1495    vpbroadcastd        m15, [o(clip_18b_max)]
   1496    cmp                eobd, 36
   1497    jl .fast
            ; Even-numbered rows -> m0-m7.
   1498    mova                 m0, [cq+64* 0]
   1499    mova                 m1, [cq+64* 2]
   1500    mova                 m2, [cq+64* 4]
   1501    mova                 m3, [cq+64* 6]
   1502    mova                 m4, [cq+64* 8]
   1503    mova                 m5, [cq+64*10]
   1504    mova                 m6, [cq+64*12]
   1505    mova                 m7, [cq+64*14]
            ; xmm6/xmm7 are callee-saved in the Windows x64 ABI. They are parked in
            ; the first 32 bytes of the coefficient buffer, whose row 0 is already
            ; in m0, and restored at .pass1_end.
            ; NOTE(review): presumably needed because m22/m23 alias these physical
            ; registers under x86inc's AVX-512 register permutation - confirm.
   1506 %if WIN64
   1507    movaps        [cq+16*0], xmm6
   1508    movaps        [cq+16*1], xmm7
   1509 %endif
   1510    call m(idct_8x16_internal_10bpc).main
            ; Odd-numbered rows -> m16-m23.
   1511    mova                m16, [cq+64* 1]
   1512    mova                m17, [cq+64* 3]
   1513    mova                m18, [cq+64* 5]
   1514    mova                m19, [cq+64* 7]
   1515    mova                m20, [cq+64* 9]
   1516    mova                m21, [cq+64*11]
   1517    mova                m22, [cq+64*13]
   1518    mova                m23, [cq+64*15]
   1519    call .main
   1520    call .main_end
   1521 .pass1_end:
   1522 %if WIN64
   1523    movaps             xmm6, [cq+16*0]
   1524    movaps             xmm7, [cq+16*1]
   1525 %endif
   1526    vzeroupper
   1527 .pass1_end2:
   1528    call .main_end3
   1529 .pass1_end3:
            ; Clear the whole 16 x 64-byte coefficient buffer, four rows per
            ; iteration, with r6 stepping 64*12, 64*8, 64*4, 0.
   1530    mov                 r6d, 64*12
   1531    pxor                 m8, m8
   1532 .zero_loop:
   1533    mova       [cq+r6+64*3], m8
   1534    mova       [cq+r6+64*2], m8
   1535    mova       [cq+r6+64*1], m8
   1536    mova       [cq+r6+64*0], m8
   1537    sub                 r6d, 64*4
   1538    jge .zero_loop
   1539    jmp                tx2q ; continue with the second-pass routine
   1540 .pass2:
   1541    lea                  r5, [o_base_8bpc] ; switch the constant base to the 8bpc table
   1542    call m(idct_16x16_internal_8bpc).main
   1543    movshdup            m12, [permC] ; m12 = odd dwords of permC, duplicated
   1544    vpbroadcastd        m11, [pw_2048] ; pmulhrsw multiplier applied at .pass2_end
   1545    psrlq               m13, m12, 8 ; m13 = second permutation control (m12 >> 8 per qword)
            ; Permute the qwords of every result register and move the registers
            ; into the slots .pass2_end reads (m8, m7, m6, m5, m3, m2, m1, m0).
   1546    vpermq               m8, m12, m0
   1547    vpermq               m0, m13, m7
   1548    vpermq               m7, m13, m1
   1549    vpermq               m1, m12, m6
   1550    vpermq               m6, m12, m2
   1551    vpermq               m2, m13, m5
   1552    vpermq               m5, m13, m3
   1553    vpermq               m3, m12, m4
   1554 .pass2_end:
            ; Shared store tail: m11 holds the pmulhrsw multiplier on entry.
            ; m12 and m13 are overwritten here with 0 and pixel_10bpc_max.
   1555    lea                  r6, [strideq*3]
   1556    vpbroadcastd        m13, [pixel_10bpc_max]
   1557    pxor                m12, m12
   1558    pmulhrsw             m8, m11, m8
   1559    pmulhrsw             m9, m11, m7
   1560    call m(idct_16x8_internal_10bpc).write_16x4_noround
   1561    pmulhrsw             m8, m11, m6
   1562    pmulhrsw             m9, m11, m5
   1563    call m(idct_16x8_internal_10bpc).write_16x4_noround
   1564    pmulhrsw             m8, m11, m3
   1565    pmulhrsw             m9, m11, m2
   1566    call m(idct_16x8_internal_10bpc).write_16x4_noround
   1567    pmulhrsw             m8, m11, m1
   1568    pmulhrsw             m9, m11, m0
   1569    jmp m(idct_16x8_internal_10bpc).write_16x4_noround ; tail call
   1570 .fast:
            ; Low-eob path: load the low 256 bits of rows 0-7 and merge row pairs
            ; into single registers with vpermt2q (pairs named in the comments).
   1571    mova                ym0, [cq+64*0]
   1572    mova                ym2, [cq+64*4]
   1573    movshdup             m8, [o(permB)]
   1574    mova                ym1, [cq+64*2]
   1575    mova                ym3, [cq+64*6]
   1576    mova                ym4, [cq+64*1]
   1577    mova                ym5, [cq+64*3]
   1578    mova                ym6, [cq+64*5]
   1579    mova                ym7, [cq+64*7]
   1580    vpermt2q             m0, m8, m2 ; 0 4
   1581    vpermt2q             m1, m8, m3 ; 2 6
   1582    vpermt2q             m4, m8, m5 ; 1 3
   1583    vpermt2q             m7, m8, m6 ; 7 5
   1584    call m(idct_8x8_internal_10bpc).main_fast
   1585    call m(idct_16x8_internal_10bpc).main_fast
   1586    vpbroadcastd        m11, [o(pd_2)]
   1587    call m(idct_8x16_internal_10bpc).main_end2
            ; m8 = permA, m9 = permA >> 8: the permutation controls that
            ; .pass1_fast_end2 copies into m11/m10 before transposing.
   1588    mova                 m8, [o(permA)]
   1589    psrlq                m9, m8, 8
   1590    jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
   1591 ALIGN function_align
            ; Odd-half core of the 16-point inverse DCT, with several entry points
            ; that all reach the single ret at the end:
            ;   .main             all eight inputs in m16-m23
            ;   .main_fast        inputs in m16-m19 only
            ;   .main_fast2       inputs in m16-m17 only
            ;   .main*_rect2      same, after an extra (x + m13) >> 12 on the inputs
            ; m12-m15 are expected to hold the constants loaded at the function entry
            ; (pd_2896, pd_2048, clip_18b_min, clip_18b_max). m0-m6 and m8 are expected
            ; to hold the even-half (dct8) intermediates; .main3 finishes their
            ; butterflies.
            ; On return: m0-m7 = clamped dct8 outputs, and m9, m16-m22 = t8a, t9,
            ; t10a, t11, t12, t13a, t14, t15a, as consumed by .main_end.
   1592 .main_fast2_rect2:
   1593    REPX     {paddd x, m13}, m16, m17
   1594    REPX     {psrad x, 12 }, m16, m17
   1595 .main_fast2:
   1596    pmulld              m22, m16, [o(pd_4076)] {1to16} ; t15a
   1597    pmulld               m9, m16, [o(pd_401)] {1to16}  ; t8a
   1598    pmulld              m18, m17, [o(pd_1189)] {1to16} ; t11a
   1599    pmulld              m17, [o(pd_3920)] {1to16}      ; t12a
   1600    psubd               m18, m13, m18 ; negate t11a and add the rounding bias in one step
   1601    REPX    {paddd  x, m13}, m22, m9, m17
   1602    REPX    {psrad  x, 12 }, m18, m22, m9, m17
   1603 
            ; Copy the four values into the register slots .main3 reads
            ; (m20, m16, m23, m19) and skip the first butterfly stage.
   1604    mova                m20, m9
   1605    mova                m16, m18
   1606    mova                m23, m22
   1607    mova                m19, m17
   1608    jmp .main3
   1609 .main_fast_rect2:
   1610    REPX     {paddd x, m13}, m16, m17, m18, m19
   1611    REPX     {psrad x, 12 }, m16, m17, m18, m19
   1612 .main_fast:
   1613    pmulld              m23, m16, [o(pd_4076)] {1to16} ; t15a
   1614    pmulld              m16, [o(pd_401)] {1to16}       ; t8a
   1615    pmulld              m20, m19, [o(pd_2598)] {1to16} ; t9a
   1616    pmulld              m19, [o(pd_3166)] {1to16}      ; t14a
   1617    pmulld              m22, m17, [o(pd_1189)] {1to16} ; t11a
   1618    pmulld              m17, [o(pd_3920)] {1to16}      ; t12a
   1619    pmulld              m21, m18, [o(pd_3612)] {1to16} ; t13a
   1620    pmulld              m18, [o(pd_1931)] {1to16}      ; t10a
   1621    psubd               m20, m13, m20 ; negate t9a, rounding bias included
   1622    psubd               m22, m13, m22 ; negate t11a, rounding bias included
   1623    call .round2 ; .round2 skips the bias for m20/m22, which already have it
   1624    jmp .main2
   1625 .main_rect2:
   1626    call .round
   1627 .main:
   1628    ITX_MULSUB_2D        16, 23, 7, 9, 10, _,  401, 4076 ; t8a,  t15a
   1629    ITX_MULSUB_2D        20, 19, 7, 9, 10, _, 3166, 2598 ; t9a,  t14a
   1630    ITX_MULSUB_2D        22, 17, 7, 9, 10, _, 3920, 1189 ; t11a, t12a
   1631    ITX_MULSUB_2D        18, 21, 7, 9, 10, _, 1931, 3612 ; t10a, t13a
   1632    call .round
   1633 .main2:
   1634    paddd                m9, m20, m16 ; t8
   1635    psubd               m20, m16, m20 ; t9
   1636    psubd               m16, m22, m18 ; t10
   1637    paddd               m18, m22      ; t11
   1638    paddd               m22, m23, m19 ; t15
   1639    psubd               m23, m19      ; t14
   1640    psubd               m19, m17, m21 ; t13
   1641    paddd               m17, m21      ; t12
            ; Clamp all eight intermediates to [m14, m15].
   1642    REPX    {pmaxsd x, m14}, m20, m23, m16, m19
   1643    REPX    {pminsd x, m15}, m20, m23, m16, m19
   1644    REPX    {pmaxsd x, m14}, m9, m18, m22, m17
   1645    REPX    {pminsd x, m15}, m9, m18, m22, m17
   1646 .main3:
   1647    vpbroadcastd        m11, [o(pd_3784)]
   1648    vpbroadcastd        m10, [o(pd_1567)]
   1649    ITX_MULSUB_2D        23, 20, 21, 7, _, 13, 10, 11
   1650    ITX_MULSUB_2D        19, 16, 21, 7, _, 13, 10, 11, 2
   1651    paddd               m21, m20, m19 ; t14
   1652    psubd               m20, m19      ; t13
   1653    psubd               m19, m9, m18  ; t11a
   1654    paddd                m9, m18      ; t8a
   1655    psubd               m18, m23, m16 ; t10
   1656    paddd               m16, m23      ; t9
   1657    psubd               m23, m22, m17 ; t12a
   1658    paddd               m22, m17      ; t15a
   1659    REPX    {pmaxsd x, m14}, m20, m23, m18, m19
   1660    REPX    {pminsd x, m15}, m20, m23, m18, m19
   1661    REPX    {pmulld x, m12}, m20, m23, m18, m19
            ; From here the final dct8 butterflies (m0-m8) are interleaved with the
            ; remaining odd-half work.
   1662    psubd                m7, m0, m6   ; dct8 out7
   1663    paddd                m0, m6       ; dct8 out0
   1664    psubd                m6, m1, m5   ; dct8 out6
   1665    paddd                m1, m5       ; dct8 out1
   1666    REPX    {pmaxsd x, m14}, m7, m0, m6, m1
   1667    psubd                m5, m2, m4   ; dct8 out5
   1668    paddd                m2, m4       ; dct8 out2
   1669    REPX    {pminsd x, m15}, m7, m0, m6, m1
   1670    psubd                m4, m3, m8   ; dct8 out4
   1671    paddd                m3, m8       ; dct8 out3
   1672    REPX    {pmaxsd x, m14}, m5, m2, m4, m3
            ; The rounding bias is added once to m20/m23; the sum and difference
            ; below carry it into all four results before the >>12.
   1673    paddd               m20, m13
   1674    paddd               m23, m13
   1675    REPX    {pminsd x, m15}, m5, m2, m4, m3
   1676    psubd               m17, m20, m18 ; t10a
   1677    paddd               m20, m18      ; t13a
   1678    REPX    {pmaxsd x, m14}, m22, m21, m16, m9
   1679    psubd               m18, m23, m19 ; t11
   1680    paddd               m19, m23      ; t12
   1681    REPX    {pminsd x, m15}, m22, m21, m16, m9
   1682    REPX    {psrad  x, 12 }, m20, m19, m18, m17
   1683    ret
   1684 .main_end:
            ; Final stage of pass 1: combines the dct8 outputs in m0-m7 with the
            ; odd-half values in m9, m16-m22 into out0-out15, shifts them right and
            ; packs the dwords to words with signed saturation.
            ; m11 is both the value added before the shift and the per-element shift
            ; count of vpsravd. .main_end loads pd_2 into it (presumably the dword
            ; value 2); .main_end2 is the entry for callers that preload m11.
            ; Result: m0-m7 hold out0-out7 packed with out8-out15.
   1685    vpbroadcastd        m11, [o(pd_2)]
   1686 .main_end2:
   1687    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   1688    psubd               m23, m0, m22 ; out15
   1689    paddd                m0, m22     ; out0
   1690    psubd               m22, m1, m21 ; out14
   1691    paddd                m1, m21     ; out1
   1692    psubd               m21, m2, m20 ; out13
   1693    paddd                m2, m20     ; out2
   1694    psubd               m20, m3, m19 ; out12
   1695    paddd                m3, m19     ; out3
   1696    psubd               m19, m4, m18 ; out11
   1697    paddd                m4, m18     ; out4
   1698    psubd               m18, m5, m17 ; out10
   1699    paddd                m5, m17     ; out5
   1700    psubd               m17, m6, m16 ; out9
   1701    paddd                m6, m16     ; out6
   1702    psubd               m16, m7, m9  ; out8
   1703    paddd                m7, m9      ; out7
   1704    REPX   {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \
   1705                             m4, m20, m5, m21, m6, m22, m7, m23
            ; Pair each low output with a high one: the register that held out(i)
            ; is packed with the one holding out(8+i), which gets those values
            ; because the differences above were written in descending order.
   1706    packssdw             m0, m16
   1707    packssdw             m1, m17
   1708    packssdw             m2, m18
   1709    packssdw             m3, m19
   1710    packssdw             m4, m20
   1711    packssdw             m5, m21
   1712    packssdw             m6, m22
   1713    packssdw             m7, m23
   1714    ret
   1715 .main_end3:
            ; Rearranges the eight packed word registers m0-m7 for the second pass:
            ; word interleave, then dword interleave, then 128/256-bit lane shuffles.
            ; The trailing comments on the last eight instructions name the row pair
            ; left in each register. m8 is clobbered as a temporary.
            ; NOTE(review): presumably this completes the transpose between the two
            ; passes - confirm against the second-pass input layout.
   1716    punpckhwd            m8, m0, m1
   1717    punpcklwd            m0, m1
   1718    punpckhwd            m1, m2, m3
   1719    punpcklwd            m2, m3
   1720    punpckhwd            m3, m4, m5
   1721    punpcklwd            m4, m5
   1722    punpcklwd            m5, m6, m7
   1723    punpckhwd            m6, m7
            ; Dword interleave of the word-interleaved pairs.
   1724    punpckhdq            m7, m0, m2
   1725    punpckldq            m0, m2
   1726    punpckhdq            m2, m8, m1
   1727    punpckldq            m8, m1
   1728    punpckhdq            m1, m4, m5
   1729    punpckldq            m4, m5
   1730    punpckhdq            m5, m3, m6
   1731    punpckldq            m3, m6
            ; Combine 256-bit halves of register pairs.
   1732    vshufi32x4           m6, m0, m4, q3232
   1733    vinserti32x8         m0, ym4, 1
   1734    vinserti32x8         m4, m8, ym3, 1
   1735    vshufi32x4           m8, m3, q3232
   1736    vinserti32x8         m3, m7, ym1, 1
   1737    vshufi32x4           m7, m1, q3232
   1738    vshufi32x4           m1, m2, m5, q3232
   1739    vinserti32x8         m2, ym5, 1
            ; Final 128-bit lane selection.
   1740    vshufi32x4           m5, m7, m1, q2020 ; 10 11
   1741    vshufi32x4           m7, m1, q3131     ; 14 15
   1742    vshufi32x4           m1, m3, m2, q2020 ;  2  3
   1743    vshufi32x4           m3, m2, q3131     ;  6  7
   1744    vshufi32x4           m2, m0, m4, q3131 ;  4  5
   1745    vshufi32x4           m0, m4, q2020     ;  0  1
   1746    vshufi32x4           m4, m6, m8, q2020 ;  8  9
   1747    vshufi32x4           m6, m8, q3131     ; 12 13
   1748    ret
   1749 ALIGN function_align
            ; Rounding helper for m16-m23: add m13, then arithmetic shift right by 12.
            ; The three entry points differ only in which registers still get the bias:
            ;   .round   adds m13 to all eight registers
            ;   .round2  skips the bias for m20 and m22
            ;   .round3  skips the bias for m16, m18, m20 and m22
            ; All eight registers are shifted in every case.
   1750 .round:
   1751    paddd               m20, m13
   1752    paddd               m22, m13
   1753 .round2:
   1754    paddd               m16, m13
   1755    paddd               m18, m13
   1756 .round3:
   1757    REPX     {psrad x, 12 }, m16, m18, m20, m22
   1758    REPX     {paddd x, m13}, m17, m19, m21, m23
   1759    REPX     {psrad x, 12 }, m17, m19, m21, m23
   1760    ret
   1761 
   1762 INV_TXFM_16X16_FN adst, dct
   1763 INV_TXFM_16X16_FN adst, flipadst
   1764 INV_TXFM_16X16_FN adst, adst
   1765 
        ; 16x16 inverse ADST.
        ; Pass 1: eob < 36 takes the .fast path. Otherwise .main_pass1 produces 16
        ; dword result registers, which are packed to words here before joining the
        ; shared idct_16x16 pass-1 tail.
        ; The labels .pass1_fast_end and .pass1_fast_end2 are shared tails for the
        ; fast paths of other 16x16 functions in this file.
        ; Pass 2: delegates to the 8bpc implementation and joins the shared
        ; idct_16x16 store tail.
   1766 cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
   1767 %undef cmp
   1768    cmp                eobd, 36
   1769    jl .fast
   1770    call .main_pass1
            ; Pack dwords to words with signed saturation. The second source of
            ; each pack is m16-m23 in ascending order.
   1771    packssdw             m0, m16
   1772    packssdw             m1, m17
   1773    packssdw             m2, m18
   1774    packssdw             m3, m19
   1775    packssdw             m4, m5, m20
   1776    packssdw             m5, m6, m21
   1777    packssdw             m6, m7, m22
   1778    packssdw             m7, m8, m23
   1779    jmp m(idct_16x16_internal_10bpc).pass1_end
   1780 .fast:
   1781    call .main_pass1_fast
            ; Add the bias m9 = pd_2 and compact the eight results into m0-m7.
            ; Every second result is negated (m9 - x).
   1782    vpbroadcastd         m9, [o(pd_2)]
   1783    paddd                m0, m9
   1784    psubd                m1, m9, m1
   1785    paddd                m2, m9
   1786    psubd                m3, m9, m3
   1787    paddd                m4, m9, m5
   1788    psubd                m5, m9, m6
   1789    paddd                m6, m9, m7
   1790    psubd                m7, m9, m8
   1791 .pass1_fast_end:
   1792    mova                 m9, [o(permA)]
   1793    psrlq                m8, m9, 8 ; m8 = permA >> 8 (per qword)
   1794    REPX       {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
   1795 .pass1_fast_end2:
            ; Entry for callers that have already shifted their results and set
            ; up m8/m9 themselves.
   1796    mova                m10, m9
   1797    mova                m11, m8
   1798    call m(idct_16x8_internal_10bpc).transpose_16x8
            ; Zero m4-m7, then clear the low 32 bytes of coefficient rows 0-7,
            ; which is the region the fast paths loaded.
   1799    pxor                 m4, m4
   1800    REPX       {mova x, m4}, m5, m6, m7
   1801    REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7
   1802    jmp                tx2q ; continue with the second-pass routine
   1803 .pass2:
   1804    lea                  r5, [o_base_8bpc] ; switch the constant base to the 8bpc table
   1805    call m(iadst_16x16_internal_8bpc).main_pass2b
   1806    movshdup            m12, [permC] ; m12 = odd dwords of permC, duplicated
   1807    mova                m11, [pw_2048_m2048] ; pmulhrsw multiplier applied at .pass2_end
   1808    psrlq               m13, m12, 8 ; m13 = second permutation control (m12 >> 8 per qword)
            ; Permute and reorder the results into the slots .pass2_end reads.
   1809    vpermq               m8, m13, m0
   1810    vpermq               m0, m12, m7
   1811    vpermq               m7, m13, m1
   1812    vpermq               m1, m12, m6
   1813    vpermq               m6, m13, m2
   1814    vpermq               m2, m12, m5
   1815    vpermq               m5, m13, m3
   1816    vpermq               m3, m12, m4
   1817    jmp m(idct_16x16_internal_10bpc).pass2_end
   1818 ALIGN function_align
            ; Full first-pass ADST core. Reads all sixteen 64-byte coefficient rows
            ; from cq and leaves sixteen dword result registers:
            ;   m0, m2, m20, m22   (x + pd_2) >> 2
            ;   m1, m3, m21, m23   (pd_2 - x) >> 2, i.e. the negated outputs
            ;   m5, m7, m16, m18   (x*pd_1448 + pd_5120) >> 13
            ;   m6, m8, m17, m19   (pd_5119 - x*pd_1448) >> 13
            ; The existing per-instruction comments name the intermediate (tN) and
            ; output (outN, -outN) held in each register.
            ; Loads m13 = pd_2048 and m14/m15 = clip_18b_min/max itself.
            ; m4, m9, m10, m11 and m12 are clobbered.
            ; On WIN64, xmm6/xmm7 are saved into the first 32 bytes of cq once row 0
            ; has been loaded; they are not restored here. The non-fast callers
            ; visible in this file continue at idct_16x16's .pass1_end, which
            ; reloads them.
   1819 .main_pass1:
   1820    mova                 m0, [cq+64* 0]
   1821 %if WIN64
   1822    movaps        [cq+16*0], xmm6
   1823    movaps        [cq+16*1], xmm7
   1824 %endif
   1825    mova                m23, [cq+64*15]
   1826    vpbroadcastd        m13, [o(pd_2048)]
   1827    ITX_MULSUB_2D        23,  0, 8, 9, 10, 13,  201, 4091 ; t1  t0
   1828    mova                 m7, [cq+64* 7]
   1829    mova                m16, [cq+64* 8]
   1830    ITX_MULSUB_2D         7, 16, 8, 9, 10, 13, 3035, 2751 ; t9  t8
   1831    mova                 m2, [cq+64* 2]
   1832    mova                m21, [cq+64*13]
   1833    ITX_MULSUB_2D        21,  2, 8, 9, 10, 13,  995, 3973 ; t3  t2
   1834    mova                 m5, [cq+64* 5]
   1835    mova                m18, [cq+64*10]
   1836    ITX_MULSUB_2D         5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10
   1837    mova                 m4, [cq+64* 4]
   1838    mova                m19, [cq+64*11]
   1839    ITX_MULSUB_2D        19,  4, 8, 9, 10, 13, 1751, 3703 ; t5  t4
   1840    mova                 m3, [cq+64* 3]
   1841    mova                m20, [cq+64*12]
   1842    ITX_MULSUB_2D         3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12
   1843    mova                 m6, [cq+64* 6]
   1844    mova                m17, [cq+64* 9]
   1845    ITX_MULSUB_2D        17,  6, 8, 9, 10, 13, 2440, 3290 ; t7  t6
   1846    mova                 m1, [cq+64* 1]
   1847    mova                m22, [cq+64*14]
   1848    ITX_MULSUB_2D         1, 22, 8, 9, 10, 13, 4052,  601 ; t15 t14
   1849    vpbroadcastd        m14, [o(clip_18b_min)]
   1850    vpbroadcastd        m15, [o(clip_18b_max)]
            ; First butterfly stage. Each group of four results is clamped to
            ; [m14, m15], with the clamps interleaved between the add/sub pairs.
   1851    psubd                m9, m23, m7  ; t9a
   1852    paddd               m23, m7       ; t1a
   1853    psubd                m7, m2, m18  ; t10a
   1854    paddd               m18, m2       ; t2a
   1855    REPX    {pmaxsd x, m14}, m9, m23, m7, m18
   1856    psubd                m2, m17, m1  ; t15a
   1857    paddd               m17, m1       ; t7a
   1858    REPX    {pminsd x, m15}, m9, m23, m7, m18
   1859    psubd                m1, m21, m5  ; t11a
   1860    paddd               m21, m5       ; t3a
   1861    REPX    {pmaxsd x, m14}, m2, m17, m1, m21
   1862    psubd                m5, m4, m20  ; t12a
   1863    paddd                m4, m20      ; t4a
   1864    REPX    {pminsd x, m15}, m2, m17, m1, m21
   1865    psubd               m20, m19, m3  ; t13a
   1866    paddd               m19, m3       ; t5a
   1867    REPX    {pmaxsd x, m14}, m5, m4, m20, m19
   1868    psubd                m8, m6, m22  ; t14a
   1869    paddd                m6, m22      ; t6a
   1870    REPX    {pminsd x, m15}, m5, m4, m20, m19
   1871    psubd               m22, m0, m16  ; t8a
   1872    paddd               m16, m0       ; t0a
   1873    REPX    {pmaxsd x, m14}, m8, m6, m22, m16
   1874    vpbroadcastd        m11, [o(pd_4017)]
   1875    vpbroadcastd        m10, [o(pd_799)]
   1876    REPX    {pminsd x, m15}, m8, m6, m22, m16
   1877    ITX_MULSUB_2D        22,  9, 0, 3, _, 13, 10, 11 ; t9  t8
   1878    ITX_MULSUB_2D        20,  5, 0, 3, _, 13, 11, 10 ; t12 t13
   1879    vpbroadcastd        m11, [o(pd_2276)]
   1880    vpbroadcastd        m10, [o(pd_3406)]
   1881    ITX_MULSUB_2D         7,  1, 0, 3, _, 13, 10, 11 ; t11 t10
   1882    ITX_MULSUB_2D         2,  8, 0, 3, _, 13, 11, 10 ; t14 t15
            ; Second butterfly stage, clamped in the same way.
   1883    paddd                m0, m16, m4  ; t0
   1884    psubd               m16, m4       ; t4
   1885    psubd                m3, m23, m19 ; t5
   1886    paddd               m23, m19      ; t1
   1887    REPX    {pmaxsd x, m14}, m0, m16, m3, m23
   1888    psubd               m19, m18, m6  ; t6
   1889    paddd               m18, m6       ; t2
   1890    REPX    {pminsd x, m15}, m0, m16, m3, m23
   1891    psubd                m6, m21, m17 ; t7
   1892    paddd               m21, m17      ; t3
   1893    REPX    {pmaxsd x, m14}, m19, m18, m6, m21
   1894    paddd               m17, m9, m20  ; t8a
   1895    psubd                m9, m20      ; t12a
   1896    REPX    {pminsd x, m15}, m19, m18, m6, m21
   1897    psubd               m20, m22, m5  ; t13a
   1898    paddd               m22, m5       ; t9a
   1899    REPX    {pmaxsd x, m14}, m17, m9, m20, m22
   1900    psubd                m5, m1, m2   ; t14a
   1901    paddd                m1, m2       ; t10a
   1902    REPX    {pminsd x, m15}, m17, m9, m20, m22
   1903    psubd                m2, m7, m8   ; t15a
   1904    paddd                m7, m8       ; t11a
   1905    REPX    {pmaxsd x, m14}, m5, m1, m2, m7
   1906    vpbroadcastd        m11, [o(pd_3784)]
   1907    vpbroadcastd        m10, [o(pd_1567)]
   1908    REPX    {pminsd x, m15}, m5, m1, m2, m7
   1909    ITX_MULSUB_2D        16,  3, 4, 8, _, 13, 10, 11 ; t5a t4a
   1910    ITX_MULSUB_2D         6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a
   1911    ITX_MULSUB_2D         9, 20, 4, 8, _, 13, 10, 11 ; t13 t12
   1912    ITX_MULSUB_2D         2,  5, 4, 8, _, 13, 11, 10 ; t14 t15
            ; Third stage: the sums are final outputs (some negated, as marked),
            ; while the differences are clamped and feed the last stage.
   1913    psubd                m8, m0, m18  ; t2a
   1914    paddd                m0, m18      ;  out0
   1915    psubd               m18, m23, m21 ; t3a
   1916    paddd               m23, m21      ; -out15
   1917    paddd               m21, m9, m5   ; -out13
   1918    psubd                m9, m5       ; t15a
   1919    psubd                m5, m3, m6   ; t6
   1920    paddd                m3, m6       ; -out3
   1921    REPX    {pmaxsd x, m14}, m8, m18, m9, m5
   1922    psubd                m6, m20, m2  ; t14a
   1923    paddd                m2, m20      ;  out2
   1924    paddd               m20, m16, m19 ;  out12
   1925    psubd               m16, m19      ; t7
   1926    REPX    {pminsd x, m15}, m8, m18, m9, m5
   1927    psubd               m19, m22, m7  ; t11
   1928    paddd               m22, m7       ;  out14
   1929    psubd                m7, m17, m1  ; t10
   1930    paddd                m1, m17      ; -out1
   1931    REPX    {pmaxsd x, m14}, m6, m16, m19, m7
            ; Constants for the last stage: m12 = multiplier for the middle
            ; outputs, m4 = bias for the >>2 group, m10/m11 = biases for the
            ; >>13 group (positive and negated outputs respectively).
   1932    vpbroadcastd        m12, [o(pd_1448)]
   1933    vpbroadcastd         m4, [o(pd_2)]
   1934    vpbroadcastd        m10, [o(pd_5120)]
   1935    vpbroadcastd        m11, [o(pd_5119)]
   1936    REPX    {pminsd x, m15}, m6, m16, m19, m7
   1937    psubd               m17, m7, m19  ; -out9
   1938    paddd                m7, m19      ;  out6
   1939    psubd               m19, m5, m16  ; -out11
   1940    paddd                m5, m16      ;  out4
   1941    REPX    {pmulld x, m12}, m17, m7, m19, m5
   1942    psubd               m16, m8, m18  ;  out8
   1943    paddd                m8, m18      ; -out7
   1944    psubd               m18, m6, m9   ;  out10
   1945    paddd                m6, m9       ; -out5
   1946    REPX    {pmulld x, m12}, m16, m8, m18, m6
            ; Add the bias to the positive outputs and subtract the negated
            ; outputs from it, then shift.
   1947    REPX  {paddd x, m4    }, m0, m2, m20, m22
   1948    REPX  {psubd x, m4,  x}, m1, m3, m21, m23
   1949    REPX  {paddd x, m10   }, m7, m5, m16, m18
   1950    REPX  {psubd x, m11, x}, m17, m19, m8, m6
   1951    REPX      {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3
   1952    REPX      {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8
   1953    ret
   1954 ALIGN function_align
            ; Low-eob first-pass core: loads the low 256 bits of coefficient rows 0-7,
            ; merges row pairs into single registers with vpermt2q (pairs named in the
            ; comments), loads m13 = pd_2048 and m12 = pd_2896, and tail-jumps into the
            ; 16x8 ADST .main_fast, whose ret returns to our caller.
   1955 .main_pass1_fast:
   1956    mova                ym0, [cq+64*0]
   1957    mova                ym1, [cq+64*2]
   1958    movshdup             m8, [o(permB)] ; vpermt2q control: odd dwords of permB, duplicated
   1959    mova                ym6, [cq+64*1]
   1960    mova                ym7, [cq+64*3]
   1961    mova                ym2, [cq+64*4]
   1962    mova                ym3, [cq+64*6]
   1963    mova                ym4, [cq+64*5]
   1964    mova                ym5, [cq+64*7]
   1965    vpermt2q             m0, m8, m1 ; 0 2
   1966    vpermt2q             m7, m8, m6 ; 3 1
   1967    vpermt2q             m2, m8, m3 ; 4 6
   1968    vpermt2q             m5, m8, m4 ; 7 5
   1969    vpbroadcastd        m13, [o(pd_2048)]
   1970    vpbroadcastd        m12, [o(pd_2896)]
   1971    jmp m(iadst_16x8_internal_10bpc).main_fast
   1972 
   1973 INV_TXFM_16X16_FN flipadst, dct
   1974 INV_TXFM_16X16_FN flipadst, adst
   1975 INV_TXFM_16X16_FN flipadst, flipadst
   1976 
        ; 16x16 flipped ADST. Reuses the ADST cores and differs only in how the
        ; results are arranged: in pass 1 the result registers are packed or
        ; compacted in reverse order, and in pass 2 a different permutation and the
        ; pw_m2048_2048 multiplier are used before the shared store tail.
   1977 cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
   1978 %undef cmp
   1979    cmp                eobd, 36
   1980    jl .fast
   1981    call m(iadst_16x16_internal_10bpc).main_pass1
            ; Pack dwords to words. Compared with the plain ADST, the first source
            ; of each pack now comes from m16-m23 and the destinations are assigned
            ; in reverse order.
   1982    packssdw             m4, m19, m3
   1983    packssdw             m3, m20, m5
   1984    packssdw             m5, m18, m2
   1985    packssdw             m2, m21, m6
   1986    packssdw             m6, m17, m1
   1987    packssdw             m1, m22, m7
   1988    packssdw             m7, m16, m0
   1989    packssdw             m0, m23, m8
   1990    jmp m(idct_16x16_internal_10bpc).pass1_end
   1991 .fast:
   1992    call m(iadst_16x16_internal_10bpc).main_pass1_fast
            ; Same bias and per-source signs as the ADST .fast path (m9 = pd_2,
            ; with m1, m3, m6, m8 negated), but the destinations are written in
            ; reverse order: sources m0-m3, m5-m8 go to m7..m0.
   1993    vpbroadcastd         m9, [o(pd_2)]
   1994    psubd                m4, m9, m3
   1995    paddd                m3, m9, m5
   1996    paddd                m5, m9, m2
   1997    psubd                m2, m9, m6
   1998    psubd                m6, m9, m1
   1999    paddd                m1, m9, m7
   2000    paddd                m7, m9, m0
   2001    psubd                m0, m9, m8
   2002    jmp m(iadst_16x16_internal_10bpc).pass1_fast_end
   2003 .pass2:
   2004    lea                  r5, [o_base_8bpc] ; switch the constant base to the 8bpc table
   2005    call m(iadst_16x16_internal_8bpc).main_pass2b
   2006    movshdup            m12, [permC] ; m12 = odd dwords of permC, duplicated
   2007    movu                m11, [pw_m2048_2048] ; unaligned load of the pmulhrsw multiplier
   2008    psrlq               m13, m12, 8 ; m13 = second permutation control (m12 >> 8 per qword)
            ; m4-m7 are permuted with m13 and shifted up one register (into
            ; m5-m8); m0-m3 are permuted in place with m12.
   2009    vpermq               m8, m13, m7
   2010    vpermq               m7, m13, m6
   2011    vpermq               m6, m13, m5
   2012    vpermq               m5, m13, m4
   2013    vpermq               m3, m12, m3
   2014    vpermq               m2, m12, m2
   2015    vpermq               m1, m12, m1
   2016    vpermq               m0, m12, m0
   2017    jmp m(idct_16x16_internal_10bpc).pass2_end
   2018 
   2019 INV_TXFM_16X16_FN identity, dct, -92
   2020 INV_TXFM_16X16_FN identity, identity
   2021 
        ; 16x16 identity transform.
        ; Pass 1 scales every coefficient as (x*pd_5793 + pd_5120) >> 13 and packs
        ; the results to words. r6 walks the coefficient buffer and is advanced by
        ; the .pass1_main* helpers. eob < 36 takes the .fast path, which reads only
        ; the low 256 bits of rows 0, 4, 8, 12 and the three rows that follow each.
        ; Pass 2 computes 2*x + pmulhrsw(x, pw_1697x16) with saturating adds and
        ; stores four groups of rows through write_16x4.
   2022 cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
   2023 %undef cmp
   2024    vpbroadcastd        m10, [o(pd_5793)] ; multiplier
   2025    vpbroadcastd        m11, [o(pd_5120)] ; bias added before the >>13
   2026    mov                  r6, cq ; running read pointer for the helpers
   2027    cmp                eobd, 36
   2028    jl .fast
            ; Four helper calls, each returning four scaled rows in m6-m9, which
            ; are packed pairwise to words (m6 with m8, m7 with m9).
   2029    call .pass1_main
   2030    packssdw             m0, m6, m8
   2031    packssdw             m1, m7, m9
   2032    call .pass1_main
   2033    packssdw             m2, m6, m8
   2034    packssdw             m3, m7, m9
   2035    call .pass1_main
   2036    packssdw             m4, m6, m8
   2037    packssdw             m5, m7, m9
   2038    call .pass1_main
   2039    packssdw             m6, m8
   2040    packssdw             m7, m9
   2041    jmp m(idct_16x16_internal_10bpc).pass1_end2
   2042 .fast:
   2043    call .pass1_main_fast
   2044    packssdw             m0, m6, m7
   2045    call .pass1_main_fast
   2046    packssdw             m1, m6, m7
   2047    call .pass1_main_fast
   2048    packssdw             m2, m6, m7
   2049    call .pass1_main_fast
   2050    packssdw             m3, m6, m7
            ; Interleave words, then dwords, then 128-bit lanes of m0-m3.
   2051    punpckhwd            m4, m0, m1
   2052    punpcklwd            m0, m1
   2053    punpckhwd            m1, m2, m3
   2054    punpcklwd            m2, m3
   2055    punpckldq            m3, m4, m1
   2056    punpckhdq            m4, m1
   2057    punpckhdq            m1, m0, m2
   2058    punpckldq            m0, m2
   2059    pxor                 m7, m7
   2060    vshufi32x4           m2, m0, m3, q3131
   2061    vshufi32x4           m0, m3, q2020
   2062    vshufi32x4           m3, m1, m4, q3131
   2063    vshufi32x4           m1, m4, q2020
   2064    REPX       {mova x, m7}, m4, m5, m6 ; m4-m7 = 0 for the second pass
   2065    jmp m(idct_16x16_internal_10bpc).pass1_end3
   2066 .pass2:
   2067    movshdup            m14, [o(permC)] ; vpermq control: odd dwords of permC, duplicated
   2068    vpbroadcastd        m15, [o(pw_1697x16)]
   2069    lea                  r6, [strideq*3]
   2070    vpbroadcastd        m11, [o(pw_2048)]
   2071    pxor                m12, m12
   2072    vpbroadcastd        m13, [pixel_10bpc_max]
            ; Process register pairs (m0,m1), (m2,m3), (m4,m5) by call. The last
            ; pair (m6,m7) falls through into .pass2_main, whose tail jump returns
            ; to our caller. .pass2_main overwrites m0/m1, which are consumed by
            ; the first call.
   2073    vpermq               m8, m14, m0
   2074    vpermq               m9, m14, m1
   2075    call .pass2_main
   2076    vpermq               m8, m14, m2
   2077    vpermq               m9, m14, m3
   2078    call .pass2_main
   2079    vpermq               m8, m14, m4
   2080    vpermq               m9, m14, m5
   2081    call .pass2_main
   2082    vpermq               m8, m14, m6
   2083    vpermq               m9, m14, m7
   2084 .pass2_main:
            ; m8/m9 = 2*x + pmulhrsw(x, m15), using saturating word adds.
   2085    pmulhrsw             m0, m15, m8
   2086    pmulhrsw             m1, m15, m9
   2087    paddsw               m8, m8
   2088    paddsw               m9, m9
   2089    paddsw               m8, m0
   2090    paddsw               m9, m1
   2091    jmp m(idct_16x8_internal_10bpc).write_16x4
   2092 ALIGN function_align
            ; Scales four 64-byte coefficient rows at r6 + 64*{0, 1, 8, 9}:
            ;   m6..m9 = (row * m10 + m11) >> 13
            ; and advances r6 by two rows. Expects m10 = multiplier and m11 = bias.
   2093 .pass1_main:
   2094    pmulld               m6, m10, [r6+64*0]
   2095    pmulld               m7, m10, [r6+64*1]
   2096    pmulld               m8, m10, [r6+64*8]
   2097    pmulld               m9, m10, [r6+64*9]
   2098    add                  r6, 64*2
   2099    REPX    {paddd  x, m11}, m6, m7, m8, m9
   2100    REPX    {psrad  x, 13 }, m6, m8, m7, m9
   2101    ret
   2102 ALIGN function_align
            ; Low-eob variant: gathers the low 256 bits of rows r6 + 64*{0, 4} into m6
            ; and of rows r6 + 64*{8, 12} into m7, scales both as
            ; (x * m10 + m11) >> 13 and advances r6 by one row.
   2103 .pass1_main_fast:
   2104    mova                ym6, [r6+64* 0]
   2105    vinserti32x8         m6, [r6+64* 4], 1 ; upper 256 bits of m6 = row +4
   2106    mova                ym7, [r6+64* 8]
   2107    vinserti32x8         m7, [r6+64*12], 1 ; upper 256 bits of m7 = row +12
   2108    add                  r6, 64
   2109    REPX    {pmulld x, m10}, m6, m7
   2110    REPX    {paddd  x, m11}, m6, m7
   2111    REPX    {psrad  x, 13 }, m6, m7
   2112    ret
   2113 
   ; inv_txfm_add_dct_dct_8x32_10bpc(dst, stride, c, eob)
   ; Inverse DCT/DCT for an 8x32 block: reads coefficients from c (storing
   ; zeros back over what it reads) and adds the result to dst, clamping each
   ; pixel to [0, pixel_10bpc_max].
   ; eob only selects how much work is done:
   ;   eob == 0  -> .dconly
   ;   eob <  43 -> .fast  (only 32 bytes of each of 8 rows are read)
   ;   eob < 107 -> a single .pass1_main call
   ;   otherwise -> .full  (two .pass1_main calls)
   ; The second pass is delegated to the 8bpc idct_8x16 / dct_dct_8x32 helpers.
   2114 cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob
   2115 %undef cmp
   2116    lea                  r5, [o_base]
   2117    test               eobd, eobd
   2118    jz .dconly
   2119    vpbroadcastd        m12, [o(pd_2896)]
   2120    vpbroadcastd        m13, [o(pd_2048)]
   2121    vpbroadcastd        m14, [o(clip_18b_min)]
   2122    vpbroadcastd        m15, [o(clip_18b_max)]
   2123    vpbroadcastd        m11, [o(pd_2)]
   2124    mova                m20, [o(idct8x32p)] ; byte-permutation table for the vpermb steps
   2125    pxor                m21, m21 ; zero, stored over consumed coefficients
   2126    cmp                eobd, 43
   2127    jl .fast
   2128    call .pass1_main
   2129    punpcklwd           m16, m0, m1 ; interleave the packed 16-bit pass-1 outputs
   2130    punpcklwd           m17, m2, m3
   2131    punpckhwd           m18, m0, m1
   2132    punpckhwd           m19, m2, m3
   2133    cmp                eobd, 107
   2134    jge .full
   2135    punpckldq            m0, m16, m17 ;  0  2
   2136    punpckhdq            m1, m16, m17 ;  4  6
   2137    punpckldq            m2, m18, m19 ;  8 10
   2138    punpckhdq            m3, m18, m19 ; 12 14
   2139    lea                  r5, [o_base_8bpc] ; presumably the constant base the 8bpc helpers expect -- confirm
   2140    vextracti32x8      ym14, m0, 1 ; upper 256 bits of m0..m3 -> ym14..ym17
   2141    vextracti32x8      ym15, m1, 1
   2142    vextracti32x8      ym16, m2, 1
   2143    vextracti32x8      ym17, m3, 1
   2144    call m(idct_8x16_internal_8bpc).main_fast
   2145    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
   2146    jmp .end
   2147 .full: ; eob >= 107: run pass 1 again on the other 64 bytes of each 128-byte row
   2148    add                  cq, 64
   2149    call .pass1_main
   2150    punpcklwd            m5, m0, m1
   2151    punpcklwd            m6, m2, m3
   2152    punpckhwd            m7, m0, m1
   2153    punpckhwd            m8, m2, m3
   2154    punpckldq            m0, m16, m17 ;  0  2
   2155    punpckhdq            m1, m16, m17 ;  4  6
   2156    punpckldq            m2, m18, m19 ;  8 10
   2157    punpckhdq            m3, m18, m19 ; 12 14
   2158    punpckldq            m4, m5, m6   ; 16 18
   2159    punpckhdq            m5, m6       ; 20 22
   2160    punpckldq            m6, m7, m8   ; 24 26
   2161    punpckhdq            m7, m8       ; 28 30
   2162    lea                  r5, [o_base_8bpc]
   2163    vextracti32x8      ym14, m0, 1 ; upper 256 bits of m0..m7 -> ym14..ym21
   2164    vextracti32x8      ym15, m1, 1
   2165    vextracti32x8      ym16, m2, 1
   2166    vextracti32x8      ym17, m3, 1
   2167    vextracti32x8      ym18, m4, 1
   2168    vextracti32x8      ym19, m5, 1
   2169    vextracti32x8      ym20, m6, 1
   2170    vextracti32x8      ym21, m7, 1
   2171    call m(idct_8x16_internal_8bpc).main
   2172    REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21 ; swap the 64-bit halves of each 128-bit lane
   2173    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
   2174    jmp .end
   2175 .fast: ; eob < 43: only the low 32 bytes of rows 0..7 are read
   2176    movshdup             m8, [o(permB)] ; odd dwords of permB duplicated: index vector for vpermt2q below
   2177    mova                ym1, [cq+128*1]
   2178    mova                ym5, [cq+128*5]
   2179    mova                ym7, [cq+128*3]
   2180    mova                ym3, [cq+128*7]
   2181    mova                ym0, [cq+128*0]
   2182    mova                ym4, [cq+128*2]
   2183    mova                ym2, [cq+128*4]
   2184    mova                ym6, [cq+128*6]
   2185    vpermt2q             m1, m8, m5 ; 1 5
   2186    vpermt2q             m3, m8, m7 ; 7 3
   2187    vpermt2q             m0, m8, m4 ; 0 2
   2188    vpermt2q             m2, m8, m6 ; 4 6
   2189    mova         [cq+128*0], ym21 ; zero the 32 bytes just read from each of rows 0..7
   2190    REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7
   2191    call m(idct_8x8_internal_10bpc).main
   2192    call m(idct_8x8_internal_10bpc).main_end
   2193    packssdw             m0, m2 ; dwords -> words with signed saturation
   2194    packssdw             m1, m3
   2195    vpermb               m0, m20, m0
   2196    vprold              m20, 16 ; rotate each dword of the table by 16 bits for the second permute
   2197    vpermb               m2, m20, m1
   2198    punpckhdq            m1, m0, m2
   2199    punpckldq            m0, m2
   2200    lea                  r5, [o_base_8bpc]
   2201    vextracti32x8      ym14, m0, 1
   2202    vextracti32x8      ym15, m1, 1
   2203    call m(idct_8x16_internal_8bpc).main_fast2
   2204    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2
   2205 .end: ; common tail: scale m0..m7 and add them to the 32 destination rows
   2206    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper
   2207    lea                  r3, [strideq*2]
   2208    vpbroadcastd        m12, [pixel_10bpc_max] ; upper clamp bound
   2209    lea                  r6, [strideq*3]
   2210    pxor                m11, m11 ; lower clamp bound (0)
   2211    lea                  r3, [dstq+r3*8] ; r3 = dst + 16*stride (start of the lower half)
   2212    pmulhrsw             m0, m10 ; NOTE(review): m10 is not set in this function; presumably left by the 8bpc main_end -- confirm
   2213    pmulhrsw             m1, m10
   2214    call .write_8x4x2
   2215    pmulhrsw             m0, m10, m2
   2216    pmulhrsw             m1, m10, m3
   2217    call .write_8x4x2
   2218    pmulhrsw             m0, m10, m4
   2219    pmulhrsw             m1, m10, m5
   2220    call .write_8x4x2
   2221    pmulhrsw             m0, m10, m6
   2222    pmulhrsw             m1, m10, m7 ; fourth group falls through; its ret ends the function
   2223 .write_8x4x2: ; add m0 to 4 rows at dstq and m1 to 4 rows at r3, clamp, store, advance both by 4 rows
   2224    mova                xm8, [dstq+strideq*0] ; m8 lanes 0..3 = dst rows 0..3
   2225    vinserti32x4        ym8, [dstq+strideq*1], 1
   2226    vinserti32x4         m8, [dstq+strideq*2], 2
   2227    vinserti32x4         m8, [dstq+r6       ], 3
   2228    mova                xm9, [r3  +r6       ] ; m9 lanes 0..3 = r3 rows 3..0 (reverse order)
   2229    vinserti32x4        ym9, [r3  +strideq*2], 1
   2230    vinserti32x4         m9, [r3  +strideq*1], 2
   2231    vinserti32x4         m9, [r3  +strideq*0], 3
   2232    paddw                m8, m0
   2233    paddw                m9, m1
   2234    pmaxsw               m8, m11 ; clamp to [0, pixel_10bpc_max]
   2235    pmaxsw               m9, m11
   2236    pminsw               m8, m12
   2237    pminsw               m9, m12
   2238    mova          [dstq+strideq*0], xm8
   2239    vextracti32x4 [dstq+strideq*1], ym8, 1
   2240    vextracti32x4 [dstq+strideq*2], m8, 2
   2241    vextracti32x4 [dstq+r6       ], m8, 3
   2242    lea                dstq, [dstq+strideq*4]
   2243    vextracti32x4 [r3  +strideq*0], m9, 3 ; store back using the same reversed lane order
   2244    vextracti32x4 [r3  +strideq*1], m9, 2
   2245    vextracti32x4 [r3  +strideq*2], ym9, 1
   2246    mova          [r3  +r6       ], xm9
   2247    lea                  r3, [r3+strideq*4]
   2248    ret
   2249 .dconly: ; eob == 0: only the first coefficient is used
   2250    imul                r6d, [cq], 181
   2251    mov                [cq], eobd ; eobd is 0 on this path, so this clears the coefficient
   2252    or                  r3d, 32 ; NOTE(review): r3 should be the eob register (4th cglobal arg, 0 here), giving r3d = 32; presumably the row count for .dconly2 -- confirm
   2253    add                 r6d, 640
   2254    sar                 r6d, 10 ; r6d = (coef*181 + 640) >> 10
   2255    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
   ; .pass1_main: first pass over one 64-byte column of eight 128-byte rows.
   ; Loads the rows at cq into m0..m7, zeroes them in memory, runs the
   ; idct_8x16_internal_10bpc main/main_end2 helpers, then packs the results
   ; to 16 bits and byte-permutes them. Expects m20 = permutation table and
   ; m21 = 0; returns packed data in m0..m3.
   2256 ALIGN function_align
   2257 .pass1_main:
   2258    mova                 m0, [cq+128*0]
   2259    mova                 m1, [cq+128*1]
   2260    mova                 m2, [cq+128*2]
   2261    mova                 m3, [cq+128*3]
   2262    mova                 m4, [cq+128*4]
   2263    mova                 m5, [cq+128*5]
   2264    mova                 m6, [cq+128*6]
   2265    mova                 m7, [cq+128*7]
   2266    REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7 ; clear the coefficients just loaded
   2267    call m(idct_8x16_internal_10bpc).main
   2268    call m(idct_8x16_internal_10bpc).main_end2
   2269    packssdw             m0, m4 ; dwords -> words with signed saturation
   2270    packssdw             m1, m5
   2271    packssdw             m2, m6
   2272    packssdw             m3, m7
   2273    REPX {vpermb x, m20, x}, m0, m1, m2, m3 ; reorder bytes using the table in m20
   2274    ret
   2275 
   ; inv_txfm_add_identity_identity_8x32_10bpc(dst, stride, c, eob)
   ; Each loop iteration handles 16 destination rows of 8 pixels: it loads
   ; eight 64-byte coefficient rows (128-byte pitch), packs them to 16 bits,
   ; computes (x + 5) >> 3, transposes, adds to dst, and clamps to
   ; [0, pixel_10bpc_max]. The coefficients are zeroed after being read.
   ; The loop runs once when eob < 107 and twice otherwise.
   2276 cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob
   2277    vpbroadcastd         m9, [pw_5] ; rounding bias added before the >> 3
   2278    lea                  r4, [strideq*3]
   2279    pxor                m10, m10 ; zero: coefficient clear value and lower clamp
   2280    lea                  r5, [strideq*5]
   2281    vpbroadcastd        m11, [pixel_10bpc_max] ; upper clamp
   2282    sub                eobd, 107 ; sign bit of eobd now tells whether eob < 107
   2283    lea                  r6, [strideq+r4*2] ; r6 = stride*7
   2284 .loop:
   2285    mova                 m0, [cq+128*0]
   2286    packssdw             m0, [cq+128*1] ; dwords -> words with signed saturation
   2287    mova                 m1, [cq+128*2]
   2288    packssdw             m1, [cq+128*3]
   2289    mova                 m2, [cq+128*4]
   2290    packssdw             m2, [cq+128*5]
   2291    mova                 m3, [cq+128*6]
   2292    packssdw             m3, [cq+128*7]
   2293    lea                  r7, [dstq+strideq*8] ; r7 = row 8 of this 16-row group
   2294    REPX {mova [cq+128*x], m10}, 0, 1, 2, 3
   2295    REPX     {paddsw x, m9}, m0, m1, m2, m3
   2296    REPX {mova [cq+128*x], m10}, 4, 5, 6, 7
   2297    REPX     {psraw  x, 3 }, m0, m1, m2, m3 ; (x + 5) >> 3
   2298    add                  cq, 64 ; next iteration uses the other 64 bytes of each row
   2299    mova                xm4, [dstq+strideq*0] ; m4 lanes = dst rows 0, 4, 8, 12
   2300    mova                xm5, [dstq+strideq*1] ; m5 lanes = dst rows 1, 5, 9, 13
   2301    mova                xm6, [dstq+strideq*2] ; m6 lanes = dst rows 2, 6, 10, 14
   2302    mova                xm7, [dstq+r4     *1] ; m7 lanes = dst rows 3, 7, 11, 15
   2303    punpckhwd            m8, m0, m1 ; transpose steps are interleaved with the dst loads
   2304    vinserti32x4        ym4, [dstq+strideq*4], 1
   2305    punpcklwd            m0, m1
   2306    vinserti32x4        ym5, [dstq+r5     *1], 1
   2307    punpckhwd            m1, m2, m3
   2308    vinserti32x4        ym6, [dstq+r4     *2], 1
   2309    punpcklwd            m2, m3
   2310    vinserti32x4        ym7, [dstq+r6     *1], 1
   2311    punpckhwd            m3, m0, m8
   2312    vinserti32x4         m4, [r7  +strideq*0], 2
   2313    punpcklwd            m0, m8
   2314    vinserti32x4         m5, [r7  +strideq*1], 2
   2315    punpckhwd            m8, m2, m1
   2316    vinserti32x4         m6, [r7  +strideq*2], 2
   2317    punpcklwd            m2, m1
   2318    vinserti32x4         m7, [r7  +r4     *1], 2
   2319    punpckhqdq           m1, m0, m2
   2320    vinserti32x4         m4, [r7  +strideq*4], 3
   2321    punpcklqdq           m0, m2
   2322    vinserti32x4         m5, [r7  +r5     *1], 3
   2323    punpcklqdq           m2, m3, m8
   2324    vinserti32x4         m6, [r7  +r4     *2], 3
   2325    punpckhqdq           m3, m8
   2326    vinserti32x4         m7, [r7  +r6     *1], 3
   2327    paddw                m0, m4
   2328    paddw                m1, m5
   2329    paddw                m2, m6
   2330    paddw                m3, m7
   2331    REPX    {pmaxsw x, m10}, m0, m1, m2, m3 ; clamp to [0, pixel_10bpc_max]
   2332    REPX    {pminsw x, m11}, m0, m1, m2, m3
   2333    mova          [dstq+strideq*0], xm0
   2334    mova          [dstq+strideq*1], xm1
   2335    mova          [dstq+strideq*2], xm2
   2336    mova          [dstq+r4     *1], xm3
   2337    vextracti32x4 [dstq+strideq*4], ym0, 1
   2338    vextracti32x4 [dstq+r5     *1], ym1, 1
   2339    vextracti32x4 [dstq+r4     *2], ym2, 1
   2340    vextracti32x4 [dstq+r6     *1], ym3, 1
   2341    lea                dstq, [r7+strideq*8] ; advance dst by 16 rows
   2342    vextracti32x4 [r7  +strideq*0], m0, 2
   2343    vextracti32x4 [r7  +strideq*1], m1, 2
   2344    vextracti32x4 [r7  +strideq*2], m2, 2
   2345    vextracti32x4 [r7  +r4     *1], m3, 2
   2346    vextracti32x4 [r7  +strideq*4], m0, 3
   2347    vextracti32x4 [r7  +r5     *1], m1, 3
   2348    vextracti32x4 [r7  +r4     *2], m2, 3
   2349    vextracti32x4 [r7  +r6     *1], m3, 3
   2350    add                eobd, 0x80000000 ; carry out iff bit 31 was set: immediately if eob < 107, else on the second pass
   2351    jnc .loop
   2352    RET
   2353 
   ; inv_txfm_add_dct_dct_32x8_10bpc(dst, stride, c, eob)
   ; Inverse DCT/DCT for a 32x8 block: reads 64-byte coefficient rows from c,
   ; runs the first pass with the 10bpc helpers, transposes, clears the
   ; coefficient buffer, runs the 8bpc 32x8 .main for the second pass, and
   ; adds the result to dst with clamping to [0, pixel_10bpc_max].
   ; eob only selects the path; r6d records how much of c must be cleared:
   ;   eob == 0  -> .dconly
   ;   eob <  43 -> .fast   (rows 0..3 read,  r6d = 0)
   ;   eob < 107 -> middle  (rows 0..7 read,  r6d = 64)
   ;   otherwise -> .full   (rows 0..15 read, r6d = 64*3)
   2354 cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
   2355 %undef cmp
   2356    lea                  r5, [o_base]
   2357    test               eobd, eobd
   2358    jz .dconly
   2359    mova                m11, [o(permB)] ; qword permutation indices
   2360    mova                 m0, [cq+64* 0] ;  0  1
   2361    mova                 m4, [cq+64* 1] ;  2  3
   2362    mova                 m1, [cq+64* 2] ;  4  5
   2363    mova                 m8, [cq+64* 3] ;  6  7
   2364    vpbroadcastd        m12, [o(pd_2896)]
   2365    vpbroadcastd        m13, [o(pd_2048)]
   2366    vpbroadcastd        m14, [o(clip_18b_min)]
   2367    vpbroadcastd        m15, [o(clip_18b_max)]
   2368    psrlq               m10, m11, 32 ; second index set: upper dword of each permB qword
   2369 %if WIN64
   2370    movaps        [cq+16*0], xmm6 ; xmm6/xmm7 are callee-saved on Win64 and cglobal declares 0 vector regs,
   2371    movaps        [cq+16*1], xmm7 ; so stash them in coefficient row 0, which is already loaded into m0
   2372 %endif
   2373    mova                m16, m11
   2374    vpermi2q            m16, m0, m1     ;  1  5
   2375    mova                m17, m11
   2376    vpermi2q            m17, m8, m4     ;  7  3
   2377    cmp                eobd, 43
   2378    jl .fast
   2379    mova                m18, [cq+64* 4] ;  8  9
   2380    mova                m20, [cq+64* 5] ; 10 11
   2381    mova                 m6, [cq+64* 6] ; 12 13
   2382    mova                 m7, [cq+64* 7] ; 14 15
   2383    vpermt2q             m0, m10, m18   ;  0  8
   2384    vpermt2q            m18, m11, m6    ;  9 13
   2385    mova                m19, m11
   2386    vpermi2q            m19, m7, m20    ; 15 11
   2387    cmp                eobd, 107
   2388    jge .full
   2389    vpermt2q             m1, m10, m6    ;  4 12
   2390    vpermt2q             m4, m10, m8    ;  2  6
   2391    vpermt2q             m7, m10, m20   ; 14 10
   2392    mov                 r6d, 64*1 ; .zero_loop will clear 512 bytes of c
   2393    call m(idct_8x8_internal_10bpc).main_fast
   2394    call m(idct_16x8_internal_10bpc).main_fast
   2395    call .main_fast
   2396    call m(idct_16x16_internal_10bpc).main_end
   2397    jmp .end
   2398 .full: ; eob >= 107: all 16 coefficient rows are used
   2399    mova                 m2, [cq+64* 8] ; 16 17
   2400    mova                 m5, [cq+64* 9] ; 18 19
   2401    mova                 m9, [cq+64*10] ; 20 21
   2402    mova                m21, [cq+64*11] ; 22 23
   2403    vpermt2q             m1, m10, m9    ;  4 20
   2404    vpermt2q             m7, m10, m21   ; 14 22
   2405    vpermt2q            m21, m11, m5    ; 23 19
   2406    vpermt2q             m5, m10, m20   ; 18 10
   2407    mova                m20, m11
   2408    vpermi2q            m20, m2, m9     ; 17 21
   2409    mova                m22, [cq+64*12] ; 24 25
   2410    mova                 m9, [cq+64*13] ; 26 27
   2411    mova                 m3, [cq+64*14] ; 28 29
   2412    mova                m23, [cq+64*15] ; 30 31
   2413    vpermt2q             m2, m10, m22   ; 16 24
   2414    vpermt2q            m22, m11, m3    ; 25 29
   2415    vpermt2q             m3, m10, m6    ; 28 12
   2416    vpermt2q             m4, m10, m9    ;  2 26
   2417    mova                 m6, m10
   2418    vpermi2q             m6, m23, m8    ; 30  6
   2419    vpermt2q            m23, m11, m9    ; 31 27
   2420    mov                 r6d, 64*3 ; .zero_loop will clear 1024 bytes of c
   2421    call m(idct_8x8_internal_10bpc).main
   2422    call m(idct_16x8_internal_10bpc).main
   2423    call .main
   2424    call m(idct_16x16_internal_10bpc).main_end
   2425    jmp .end
   2426 .fast: ; eob < 43: only rows 0..3 were loaded
   2427    vpermq               m0, m10, m0    ;  0  0
   2428    vpermq               m1, m10, m1    ;  4  4
   2429    vpermt2q             m4, m10, m8    ;  2  6
   2430    xor                 r6d, r6d ; .zero_loop will clear 256 bytes of c
   2431    call .main_fast2
   2432    call m(idct_16x16_internal_10bpc).main_end
   2433 .end:
   2434 %if WIN64
   2435    movaps             xmm6, [cq+16*0] ; restore before .zero_loop overwrites the stash
   2436    movaps             xmm7, [cq+16*1] ; NOTE(review): literal xmm6/xmm7, presumably the physical regs behind high m-aliases under x86inc's AVX-512 permutation -- confirm in x86inc.asm
   2437 %endif
   2438    vzeroupper
   2439    call .transpose_8x32
   2440    pxor                m14, m14
   2441 .zero_loop: ; clear 256 bytes at cq + r6*4 per iteration, walking r6 down to 0
   2442    mova     [cq+r6*4+64*3], m14
   2443    mova     [cq+r6*4+64*2], m14
   2444    mova     [cq+r6*4+64*1], m14
   2445    mova     [cq+r6*4+64*0], m14
   2446    sub                 r6d, 64
   2447    jge .zero_loop
   2448    lea                  r5, [o_base_8bpc] ; presumably the constant base the 8bpc helper expects -- confirm
   2449    punpckhqdq           m1, m0, m2 ; finish the transpose: qword interleave into m0..m7
   2450    punpcklqdq           m0, m2
   2451    punpcklqdq           m2, m3, m4
   2452    punpckhqdq           m3, m4
   2453    punpcklqdq           m4, m5, m7
   2454    punpckhqdq           m5, m7
   2455    punpckhqdq           m7, m6, m8
   2456    punpcklqdq           m6, m8
   2457    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
   2458    pxor                m12, m12 ; lower clamp bound (0)
   2459 .write_32x8_start: ; expects m0..m7 = 8 rows of results, m12 = 0
   2460    vpbroadcastd        m11, [pw_2048] ; pmulhrsw multiplier
   2461    vpbroadcastd        m13, [pixel_10bpc_max] ; upper clamp bound
   2462    lea                  r3, [strideq*3]
   2463 .write_32x8: ; scale and write rows 0..3, then rows 4..7 via fall-through
   2464    pmulhrsw             m0, m11
   2465    pmulhrsw             m1, m11
   2466    pmulhrsw             m2, m11
   2467    pmulhrsw             m3, m11
   2468    call .write_32x4
   2469    pmulhrsw             m0, m11, m4
   2470    pmulhrsw             m1, m11, m5
   2471    pmulhrsw             m2, m11, m6
   2472    pmulhrsw             m3, m11, m7
   2473 .write_32x4: ; add m0..m3 to four 64-byte dst rows, clamp to [m12, m13], store, dst += 4 rows
   2474    paddw                m0, [dstq+strideq*0]
   2475    paddw                m1, [dstq+strideq*1]
   2476    paddw                m2, [dstq+strideq*2]
   2477    paddw                m3, [dstq+r3       ]
   2478    REPX    {pmaxsw x, m12}, m0, m1, m2, m3
   2479    REPX    {pminsw x, m13}, m0, m1, m2, m3
   2480    mova   [dstq+strideq*0], m0
   2481    mova   [dstq+strideq*1], m1
   2482    mova   [dstq+strideq*2], m2
   2483    mova   [dstq+r3       ], m3
   2484    lea                dstq, [dstq+strideq*4]
   2485    ret
   2486 .dconly: ; eob == 0: only the first coefficient is used
   2487    imul                r6d, [cq], 181
   2488    mov                [cq], eobd ; eobd is 0 on this path, so this clears the coefficient
   2489    or                  r3d, 8 ; NOTE(review): r3 should be the eob register (4th cglobal arg, 0 here), giving r3d = 8; presumably the row count for .dconly2 -- confirm
   2490    add                 r6d, 640
   2491    sar                 r6d, 10 ; r6d = (coef*181 + 640) >> 10
   2492    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
   ; First-pass butterfly stages of the 32-point transform, with several
   ; entry labels for inputs that are partly known to be zero.
   ; Control flow: .main_fast3 -> .main4; .main_fast2 -> .main3;
   ; .main_fast -> .main2; .main falls through .main2 -> .main3 -> .main4,
   ; which holds the only ret.
   ; m12..m15 are not set here; the entry point of this function loads
   ; pd_2896, pd_2048, clip_18b_min and clip_18b_max into them.
   2493 ALIGN function_align
   2494 .main_fast3:
   2495    ; assuming m0=in0 in0, m4=in2 in2, and m16=in1 in3
   2496    vbroadcasti32x4      m5, [o(pd_401_4076)]
   2497    pmulld               m3, m0, m12
   2498    pmulld               m4, m5
   2499    REPX    {paddd  x, m13}, m3, m4
   2500    REPX    {psrad  x, 12 }, m3, m4     ; m3=idct8:t0-7, m4=t8a t15a
   2501 
   2502    ; t8a t15a -> t8/9 t14/15
   2503 
   2504    vbroadcasti32x4      m5, [o(pd_3784_m3784)]
   2505    pshufd               m7, m4, q1032
   2506    pmulld               m6, m4, [o(pd_1567)]{bcstd}
   2507    pmulld               m5, m7
   2508    paddd                m6, m13
   2509    paddd                m5, m6
   2510    psrad                m5, 12         ; m5=t9a t14a
   2511 
   2512    ; t14a t9a -> t13/14 t9/10 [m5] & t8 15 -> t8/11a t12/15a [m4]
   2513 
   2514    shufps               m6, m4, m5, q1032     ; t12  t13
   2515    shufps               m8, m4, m5, q3210     ; t11a t10
   2516    pmulld               m9, m6, m12
   2517    pmulld               m7, m8, m12
   2518    paddd                m9, m13
   2519    paddd                m5, m9, m7     ; t12 t13a
   2520    psubd                m4, m9, m7     ; t11 t10a
   2521    REPX    {psrad  x, 12 }, m5, m4
   2522 
   2523    psubd                m7, m3, m6   ; dct16 out15 out14
   2524    paddd                m0, m3, m6   ; dct16 out0  out1
   2525    psubd                m6, m3, m5   ; dct16 out12 out13
   2526    paddd                m1, m3, m5   ; dct16 out3  out2
   2527    psubd                m5, m3, m4   ; dct16 out11 out10
   2528    paddd                m2, m3, m4   ; dct16 out4  out5
   2529    psubd                m4, m3, m8   ; dct16 out8  out9
   2530    paddd                m3, m8       ; dct16 out7  out6
   2531    REPX    {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
   2532    REPX    {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
   2533 
   2534    ; idct32_bottomhalf
   2535    vbroadcasti32x4     m18, [o(pd_201_m601)]
   2536    vbroadcasti32x4     m19, [o(pd_4091_4052)]
   2537    pmulld              m17, m16, m19
   2538    pmulld              m16, m18
   2539    REPX    {paddd  x, m13}, m17, m16
   2540    REPX    {psrad  x, 12 }, m17, m16
   2541 
   2542    ; m17: t31a t24a -> t30/31 t24/25, m16: t16a t23a -> t16/17 t22/23 [step2]
   2543 
   2544    vbroadcasti32x4     m10, [o(pd_799_m2276)]
   2545    vbroadcasti32x4     m11, [o(pd_4017_3406)]
   2546    pmulld              m18, m17, m10
   2547    pmulld              m19, m17, m11
   2548    pmulld               m8, m16, m11
   2549    pmulld               m9, m16, m10
   2550    REPX    {paddd  x, m13}, m18, m19
   2551    psubd               m18, m8
   2552    paddd               m19, m9
   2553    REPX    {psrad  x, 12 }, m18, m19
   2554 
   2555    ; m17=t31  t24  -> t28/31a t24/27a, m16=t16  t23  -> t16/19a t20/23a
   2556    ; m18=t17a t22a -> t17/18  t21/22,  m19=t30a t25a -> t29/30  t25/26
   2557 
   2558    punpckhqdq          m23, m17, m19   ; t24a t25 [or t27a t26]
   2559    punpcklqdq          m20, m16, m18   ; t16a t17 [or t19a t18]
   2560    punpckhqdq          m22, m16, m18   ; t23a t22 [or t20a t21]
   2561    punpcklqdq          m16, m17, m19   ; t28a t29 [or t31a t30]
   2562    mova                m21, m23 ; each value is used in two roles (see the [or ...] notes), so duplicate it
   2563    mova                m18, m20
   2564    mova                m17, m22
   2565    mova                m19, m16
   2566 
   2567    jmp .main4
   2568 .main_fast2: ; bottom three-quarters are zero
   2569    vbroadcasti32x4      m8, [o(pd_799_4017)]
   2570    pmulld               m8, m1     ; t4  t7
   2571    vpmulld              m0, [o(pd_2896)] {1to16} ; t0 t1
   2572    REPX     {paddd x, m13}, m8, m0
   2573    REPX     {psrad x, 12 }, m8, m0
   2574    pmulld               m3, m8, m12
   2575    mova                 m2, m0       ;  t3   t2
   2576    call m(idct_8x8_internal_10bpc).main3
   2577    vbroadcasti32x4      m6, [o(pd_4076_3920)]
   2578    vbroadcasti32x4      m3, [o(pd_401_m1189)]
   2579    pmulld               m6, m4       ;  t15  t12
   2580    pmulld               m4, m3       ;  t9   t10
   2581    REPX     {paddd x, m13}, m6, m4
   2582    REPX     {psrad x, 12 }, m6, m4
   2583    mova                 m5, m6       ;  t14  t13
   2584    mova                 m9, m4       ;  t8   t11
   2585    call m(idct_16x8_internal_10bpc).main3
   2586    vbroadcasti32x4     m23, [o(pd_4091_3973)]
   2587    vbroadcasti32x4      m7, [o(pd_201_995)]
   2588    vbroadcasti32x4     m22, [o(pd_1380_601)]
   2589    vbroadcasti32x4      m9, [o(pd_3857_4052)]
   2590    pmulld              m23, m16      ;  t16  t20
   2591    pmulld              m16, m7       ;  t31  t27
   2592    pmulld              m22, m17      ; -t19 -t25
   2593    pmulld              m17, m9       ;  t28  t24
   2594    REPX    {paddd  x, m13}, m23, m16, m17
   2595    psubd               m22, m13, m22 ; m22 = m13 - m22: negate and add the rounding term in one step
   2596    REPX    {psrad  x, 12 }, m23, m16, m22, m17
   2597    mova                m20, m23      ;  t30  t26
   2598    mova                 m9, m16      ;  t17  t21
   2599    mova                m19, m22      ;  t18  t22
   2600    mova                m18, m17      ;  t29  t25
   2601    jmp .main3
   2602 .main_fast: ; bottom half is zero
   2603    vbroadcasti32x4     m23, [o(pd_4091_3973)]
   2604    vbroadcasti32x4      m7, [o(pd_201_995)]
   2605    vbroadcasti32x4     m20, [o(pd_2751_2106)]
   2606    vbroadcasti32x4      m9, [o(pd_3035_3513)]
   2607    vbroadcasti32x4     m21, [o(pd_3703_3290)]
   2608    vbroadcasti32x4     m10, [o(pd_1751_2440)]
   2609    vbroadcasti32x4     m22, [o(pd_1380_601)]
   2610    vbroadcasti32x4     m11, [o(pd_3857_4052)]
   2611    pmulld              m23, m16      ;  t16a  t20a
   2612    pmulld              m16, m7       ;  t31a  t27a
   2613    pmulld              m20, m19      ; -t17a -t21a
   2614    pmulld              m19, m9       ;  t30a  t26a
   2615    pmulld              m21, m18      ;  t18a  t22a
   2616    pmulld              m18, m10      ;  t29a  t25a
   2617    pmulld              m22, m17      ; -t19a -t25a
   2618    pmulld              m17, m11      ;  t28a  t24a
   2619    psubd               m20, m13, m20 ; negate and pre-add rounding; .main2 only shifts m20/m22
   2620    psubd               m22, m13, m22
   2621    jmp .main2
   2622 .main: ; full input; ITX_MULSUB_2D is a macro defined elsewhere in this file
   2623    ITX_MULSUB_2D        16, 23, 7, 9, 10, _,  201_995,  4091_3973
   2624    ITX_MULSUB_2D        20, 19, 7, 9, 10, _, 3035_3513, 2751_2106
   2625    ITX_MULSUB_2D        18, 21, 7, 9, 10, _, 1751_2440, 3703_3290
   2626    ITX_MULSUB_2D        22, 17, 7, 9, 10, _, 3857_4052, 1380_601
   2627    paddd               m20, m13 ; pre-add rounding to m20/m22; .main2 adds it to the others
   2628    paddd               m22, m13
   2629 .main2: ; round, shift by 12, then the first butterfly stage with 18-bit clamping
   2630    REPX    {paddd  x, m13}, m16, m23, m19
   2631    REPX    {psrad  x, 12 }, m16, m20, m23, m19
   2632    psubd                m9, m16, m20 ; t17  t21
   2633    paddd               m16, m20      ; t16  t20
   2634    psubd               m20, m23, m19 ; t30  t26
   2635    paddd               m23, m19      ; t31  t27
   2636    REPX    {pmaxsd x, m14}, m9, m16, m20, m23
   2637    REPX    {paddd  x, m13}, m21, m18, m17
   2638    REPX    {psrad  x, 12 }, m18, m22, m21, m17
   2639    psubd               m19, m22, m18 ; t18  t22
   2640    paddd               m22, m18      ; t19  t23
   2641    psubd               m18, m17, m21 ; t29  t25
   2642    paddd               m17, m21      ; t28  t24
   2643    REPX    {pmaxsd x, m14}, m19, m22, m18, m17
   2644    REPX    {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17
   2645 .main3: ; also finishes the dct16 outputs in m0..m7 from the values in m0..m6 and m8
   2646    vbroadcasti32x4     m11, [o(pd_4017_2276)]
   2647    vbroadcasti32x4     m10, [o(pd_799_3406)]
   2648    psubd                m7, m0, m6   ; dct16 out15 out14
   2649    paddd                m0, m6       ; dct16 out0  out1
   2650    psubd                m6, m1, m5   ; dct16 out12 out13
   2651    paddd                m1, m5       ; dct16 out3  out2
   2652    psubd                m5, m2, m4   ; dct16 out11 out10
   2653    paddd                m2, m4       ; dct16 out4  out5
   2654    psubd                m4, m3, m8   ; dct16 out8  out9
   2655    paddd                m3, m8       ; dct16 out7  out6
   2656    ITX_MULSUB_2D        20,  9, 8, 21, _, 13, 10, 11
   2657    ITX_MULSUB_2D        18, 19, 8, 21, _, 13, 10, 11, 2
   2658    REPX    {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
   2659    punpckhqdq          m21, m16, m20 ; t20  t21a
   2660    punpcklqdq          m16, m20      ; t16  t17a
   2661    punpcklqdq          m20, m22, m19 ; t19  t18a
   2662    punpckhqdq          m22, m19      ; t23  t22a
   2663    REPX    {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
   2664    punpcklqdq          m19, m23, m9  ; t31  t30a
   2665    punpckhqdq          m23, m9       ; t27  t26a
   2666    punpckhqdq           m9, m17, m18 ; t24  t25a
   2667    punpcklqdq          m17, m18      ; t28  t29a
   2668    psubd               m18, m16, m20 ; t19a t18
   2669    paddd               m20, m16      ; t16a t17
   2670    psubd               m16, m19, m17 ; t28a t29
   2671    paddd               m19, m17      ; t31a t30
   2672    psubd               m17, m22, m21 ; t20a t21
   2673    paddd               m22, m21      ; t23a t22
   2674    psubd               m21, m9, m23  ; t27a t26
   2675    paddd               m23, m9       ; t24a t25
   2676    REPX    {pmaxsd x, m14}, m18, m16, m17, m21
   2677    REPX    {pminsd x, m15}, m16, m18, m21, m17
   2678    REPX    {pmaxsd x, m14}, m20, m22, m19, m23
   2679    REPX    {pminsd x, m15}, m20, m22, m19, m23
   2680 .main4: ; last stages; returns with the dct16 outputs still in m0..m7
   2681    vpbroadcastd        m11, [o(pd_3784)]
   2682    vpbroadcastd        m10, [o(pd_1567)]
   2683    ITX_MULSUB_2D        16, 18, 8, 9, _, 13, 10, 11
   2684    ITX_MULSUB_2D        21, 17, 8, 9, _, 13, 10, 11, 2
   2685    paddd                m9, m20, m22 ; t16  t17a
   2686    psubd               m20, m22      ; t23  t22a
   2687    paddd               m22, m19, m23 ; t31  t30a
   2688    psubd               m19, m23      ; t24  t25a
   2689    psubd               m23, m16, m17 ; t20a t21
   2690    paddd               m16, m17      ; t19a t18
   2691    psubd               m17, m18, m21 ; t27a t26
   2692    paddd               m21, m18      ; t28a t29
   2693    REPX    {pmaxsd x, m14}, m20, m19, m23, m17
   2694    REPX    {pminsd x, m15}, m19, m20, m17, m23
   2695    REPX    {pmulld x, m12}, m19, m20, m17, m23
   2696    REPX    {pmaxsd x, m14}, m22, m21, m16, m9
   2697    paddd               m19, m13 ; rounding is added to one operand of each pair before the add/sub below
   2698    paddd               m17, m13
   2699    REPX    {pminsd x, m15}, m22, m21, m16, m9
   2700    psubd               m18, m19, m20 ; t23a t22
   2701    paddd               m19, m20      ; t24a t25
   2702    paddd               m20, m17, m23 ; t27  t26a
   2703    psubd               m17, m23      ; t20  t21a
   2704    REPX    {psrad  x, 12 }, m20, m19, m18, m17
   2705    ret
   2706 .transpose_8x32: ; rearranges m0..m7 via word permutes and dword interleaves; results in m0, m2..m8 (m1 is only an input to the last step)
   2707    mova                m10, [o(idct32x8p)] ; word-permutation index table (defined elsewhere)
   2708    psrlw                m8, m10, 8 ; second index set: high byte of each table word
   2709    mova                 m9, m8
   2710    vpermi2w             m8, m1, m5
   2711    vpermt2w             m1, m10, m5
   2712    vprold               m5, m9, 16 ; third index set: m9 with each dword rotated by 16 bits
   2713    vpermi2w             m9, m3, m7
   2714    vpermt2w             m3, m10, m7
   2715    vprold              m10, 16 ; fourth index set: the table with each dword rotated by 16 bits
   2716    mova                 m7, m5
   2717    vpermi2w             m5, m0, m4
   2718    vpermt2w             m0, m10, m4
   2719    vpermi2w             m7, m2, m6
   2720    vpermt2w             m2, m10, m6
   2721    punpckhdq            m6, m5, m8 ; dword interleave of the permuted pairs
   2722    punpckldq            m5, m8
   2723    punpckhdq            m8, m7, m9
   2724    punpckldq            m7, m9
   2725    punpckhdq            m4, m2, m3
   2726    punpckldq            m2, m3
   2727    punpckhdq            m3, m0, m1
   2728    punpckldq            m0, m1
   2729    ret
   2730 
   ; inv_txfm_add_identity_identity_32x8_10bpc(dst, stride, c, eob)
   ; Each loop iteration handles a strip of 8 destination rows x 32 bytes:
   ; it loads eight 64-byte coefficient rows, packs them to 16 bits, scales
   ; with pmulhrsw by 4096, permutes, adds to dst, and clamps to
   ; [0, pixel_10bpc_max]. The coefficients are zeroed after being read.
   ; The loop runs once when eob < 107 and twice otherwise.
   2731 cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob
   2732    vpbroadcastd         m5, [pw_4096] ; pmulhrsw by 4096 computes (x + 4) >> 3
   2733    lea                  r4, [strideq*3]
   2734    mova                 m6, [idtx32x8p] ; word-permutation index table (defined elsewhere)
   2735    lea                  r5, [strideq*5]
   2736    vpbroadcastd         m9, [pixel_10bpc_max] ; upper clamp
   2737    lea                  r6, [strideq+r4*2] ; r6 = stride*7
   2738    pxor                 m8, m8 ; zero: coefficient clear value and lower clamp
   2739    sub                eobd, 107 ; sign bit of eobd now tells whether eob < 107
   2740    psrlw                m7, m6, 8 ; second index set: high byte of each table word
   2741 .loop:
   2742    mova                 m0, [cq+64*0]
   2743    packssdw             m0, [cq+64*1] ; 02 13
   2744    mova                 m1, [cq+64*2]
   2745    packssdw             m1, [cq+64*3] ; 46 57
   2746    mova                 m2, [cq+64*4]
   2747    packssdw             m2, [cq+64*5] ; 8a 9b
   2748    mova                 m3, [cq+64*6]
   2749    packssdw             m3, [cq+64*7] ; ce df
   2750    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
   2751    REPX {mova [cq+64*x], m8}, 0, 1, 2, 3
   2752    mova                 m4, m6
   2753    vpermi2w             m4, m1, m3
   2754    vpermt2w             m1, m7, m3
   2755    REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
   2756    mova                 m3, m7
   2757    vpermi2w             m3, m0, m2
   2758    vpermt2w             m0, m6, m2
   2759    add                  cq, 64*8 ; next group of eight coefficient rows
   2760    punpcklqdq           m2, m3, m1 ; 4 5
   2761    punpckhqdq           m3, m1     ; 6 7
   2762    punpckhqdq           m1, m0, m4 ; 2 3
   2763    punpcklqdq           m0, m4     ; 0 1
   2764    mova                ym4, [dstq+strideq*0] ; m4 is reused as a temp: two 32-byte dst rows per zmm
   2765    vinserti32x8         m4, [dstq+strideq*1], 1
   2766    paddw                m0, m4
   2767    mova                ym4, [dstq+strideq*2]
   2768    vinserti32x8         m4, [dstq+r4     *1], 1
   2769    paddw                m1, m4
   2770    mova                ym4, [dstq+strideq*4]
   2771    vinserti32x8         m4, [dstq+r5     *1], 1
   2772    paddw                m2, m4
   2773    mova                ym4, [dstq+r4     *2]
   2774    vinserti32x8         m4, [dstq+r6     *1], 1
   2775    paddw                m3, m4
   2776    REPX     {pmaxsw x, m8}, m0, m1, m2, m3 ; clamp to [0, pixel_10bpc_max]
   2777    REPX     {pminsw x, m9}, m0, m1, m2, m3
   2778    mova          [dstq+strideq*0], ym0
   2779    vextracti32x8 [dstq+strideq*1], m0, 1
   2780    mova          [dstq+strideq*2], ym1
   2781    vextracti32x8 [dstq+r4     *1], m1, 1
   2782    mova          [dstq+strideq*4], ym2
   2783    vextracti32x8 [dstq+r5     *1], m2, 1
   2784    mova          [dstq+r4     *2], ym3
   2785    vextracti32x8 [dstq+r6     *1], m3, 1
   2786    add                dstq, 32 ; move right by 32 bytes to the next strip
   2787    add                eobd, 0x80000000 ; carry out iff bit 31 was set: immediately if eob < 107, else on the second pass
   2788    jnc .loop
   2789    RET
   2790 
   2791 cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
        ; Entry point. Picks a code path from eob:
        ;   eob == 0         -> .dconly
        ;   eob below 36     -> .fast
        ;   eob 151 or more  -> .full (after one .pass1 call)
        ;   otherwise        -> the code after "jge .full": one .pass1 call, then
        ;                       the 8bpc main_fast / main_oddhalf_fast helpers.
        ; Every path except .dconly finishes in .pass2_end.
   2792 %undef cmp
   2793    lea                  r5, [o_base]
   2794    test               eobd, eobd
   2795    jz .dconly
           ; m12 is the multiplier used by .pass1 and .fast; m13 is the rounding
           ; term added before the shift by 12 in .fast.
           ; NOTE(review): m14/m15 (clip_18b_min/max) are not read directly in
           ; this function; presumably the external 10bpc helpers clamp with
           ; them - confirm.
   2796    vpbroadcastd        m12, [o(pd_2896)]
   2797    vpbroadcastd        m13, [o(pd_2048)]
   2798    vpbroadcastd        m14, [o(clip_18b_min)]
   2799    vpbroadcastd        m15, [o(clip_18b_max)]
   2800 %if WIN64
           ; Manual spill of xmm6/xmm7; both are reloaded in .pass2_end.
           ; NOTE(review): presumably needed because the cglobal line declares 0
           ; vector registers, so nothing else preserves these Win64 callee-saved
           ; registers - confirm against x86inc.
   2801    movaps         [rsp+ 8], xmm6
   2802    movaps         [rsp+24], xmm7
   2803 %endif
   2804    cmp                eobd, 36
   2805    jl .fast
   2806    call .pass1
   2807    cmp                eobd, 151
   2808    jge .full
           ; Mid-size path. r5 is re-pointed to the 8bpc base before the 8bpc
           ; routines are called (presumably they address their constants
           ; relative to it).
   2809    lea                  r5, [o_base_8bpc]
           ; Re-pack m0-m7 (as left by .pass1) for the second pass. punpck[lh]wd
           ; of a register with itself duplicates every word; where the zeroed m9
           ; is the first source, the low word of each pair is 0 ("__").
           ; The trailing numbers are presumably coefficient row indices.
           ; m9 has to stay zero until the "__  0" interleave overwrites it.
   2810    pxor                 m9, m9
   2811    punpcklwd            m8, m1, m1 ;  2
   2812    punpckhwd           m14, m1, m1 ;  3
   2813    punpcklwd            m1, m3, m3 ;  6
   2814    punpckhwd           m15, m3, m3 ;  7
   2815    punpcklwd            m3, m6, m6 ; 12
   2816    punpckhwd           m19, m6, m6 ; 13
   2817    punpcklwd            m6, m9, m4 ; __  8
   2818    punpckhwd           m20, m4, m4 ;  9
   2819    punpckhwd           m16, m5, m5 ; 11
   2820    punpcklwd            m5, m5     ; 10
   2821    punpcklwd            m9, m0     ; __  0
   2822    punpckhwd           m21, m0, m0 ;  1
   2823    punpcklwd            m0, m7, m7 ; 14
   2824    punpckhwd           m17, m7, m7 ; 15
   2825    punpcklwd            m7, m2, m2 ;  4
   2826    punpckhwd           m18, m2, m2 ;  5
   2827    call m(idct_16x16_internal_8bpc).main_fast
   2828    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
           ; Clear the coefficient buffer: r6 steps 192, 128, 64, 0, so the REPX
           ; stores hit cq+128*k for k = 15..0 (one m8-wide store per 128 bytes).
   2829    mov                 r6d, 64*3
   2830    pxor                 m8, m8
   2831 .zero_loop:
   2832    REPX {mova [cq+r6*8+128*x], m8}, 3, 2, 1, 0
   2833    sub                 r6d, 64
   2834    jge .zero_loop
   2835    jmp .pass2_end
   2836 .full:
        ; Reached after the first .pass1 call when eob is 151 or more.
        ; The first result set (m0-m7) is parked in coefficient slots that
        ; .pass1 has already read, .pass1 is run again 64 bytes further into
        ; every row, and both result sets are merged for the full second pass.
   2837    mova         [cq+128*0], m0
   2838    mova         [cq+128*1], m1
   2839    mova         [cq+128*2], m2
   2840    mova         [cq+128*3], m3
   2841    mova         [cq+128*4], m4
   2842    mova         [cq+128*5], m5
   2843    mova         [cq+128*6], m6
   2844    mova         [cq+128*7], m7
   2845    add                  cq, 64
   2846    call .pass1
           ; Reload the eight parked vectors. Because cq moved by 64, the slot
           ; written from m0 is now at cq-64, the one from m1 at cq+64, and so
           ; on up to cq+64*13 for m7.
   2847    mova                 m9, [cq-64* 1] ;  0  1
   2848    mova                m14, [cq+64* 1] ;  2  3
   2849    mova                m18, [cq+64* 3] ;  4  5
   2850    mova                m15, [cq+64* 5] ;  6  7
   2851    mova                m20, [cq+64* 7] ;  8  9
   2852    mova                m16, [cq+64* 9] ; 10 11
   2853    mova                m22, [cq+64*11] ; 12 13
   2854    mova                m19, [cq+64*13] ; 14 15
           ; r5 is re-pointed to the 8bpc base before the 8bpc routines are
           ; called (presumably they address their constants relative to it).
   2855    lea                  r5, [o_base_8bpc]
           ; Word-interleave the reloaded vectors with the second .pass1 results
           ; in m0-m7. The trailing comments name the pair held by each
           ; destination (presumably coefficient row indices).
   2856    punpcklwd            m8, m7, m14   ; 30  2
   2857    punpckhwd           m21, m7, m9    ; 31  1
   2858    punpcklwd            m7, m6, m18   ; 28  4
   2859    punpckhwd           m14, m6        ;  3 29
   2860    punpcklwd            m9, m0, m9    ; 16  0
   2861    punpckhwd           m17, m19, m0   ; 15 17
   2862    punpcklwd            m0, m19, m1   ; 14 18
   2863    punpckhwd           m19, m1, m22   ; 19 13
   2864    punpcklwd            m1, m15, m5   ;  6 26
   2865    punpckhwd           m18, m5, m18   ; 27  5
   2866    punpcklwd            m6, m4, m20   ; 24  8
   2867    punpckhwd           m15, m4        ;  7 25
   2868    punpcklwd            m5, m3, m16   ; 22 10
   2869    punpckhwd           m20, m3, m20   ; 23  9
   2870    punpcklwd            m3, m22, m2   ; 12 20
   2871    punpckhwd           m16, m2        ; 11 21
   2872    call m(idct_16x16_internal_8bpc).main2
   2873    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
           ; Clear the coefficient buffer: r6 steps 224, 192, ..., 0 and x runs
           ; over 2, 1, 0, -1, so the 32 stores land at 64-byte steps from cq-64
           ; up to cq+1920 (cq is still advanced by 64 here).
   2874    mov                 r6d, 32*7
   2875    pxor                 m8, m8
   2876 .full_zero_loop:
   2877    REPX {mova [cq+r6*8+64*x], m8}, 2, 1, 0, -1
   2878    sub                 r6d, 32
   2879    jge .full_zero_loop
   2880    jmp .pass2_end
   2881 .fast:
        ; Taken when eob is below 36. Only the ym-width start of rows 0-7
        ; (128-byte row pitch) is loaded; .pass1 is not used on this path.
        ; Falls through into .pass2_end.
   2882    mova                ym0, [cq+128*0]
   2883    mova                ym2, [cq+128*4]
   2884    movshdup             m8, [o(permB)]
   2885    mova                ym1, [cq+128*2]
   2886    mova                ym3, [cq+128*6]
   2887    mova                ym4, [cq+128*1]
   2888    mova                ym5, [cq+128*3]
   2889    mova                ym6, [cq+128*5]
   2890    mova                ym7, [cq+128*7]
           ; Merge two half-width rows into each full register, using m8 as the
           ; qword index vector.
   2891    vpermt2q             m0, m8, m2 ; 0 4
   2892    vpermt2q             m1, m8, m3 ; 2 6
   2893    vpermt2q             m4, m8, m5 ; 1 3
   2894    vpermt2q             m7, m8, m6 ; 7 5
           ; Scale: x = (x * m12 + m13), arithmetic shift right by 12, with
           ; m12 = pd_2896 and m13 = pd_2048 as broadcast at function entry.
           ; The ym-width slots that were just read are zeroed in between.
   2895    REPX    {pmulld x, m12}, m0, m1, m4, m7
   2896    pxor               ym16, ym16
   2897    mova         [cq+128*0], ym16
   2898    REPX {vmovdqa32 [cq+128*x], ym16}, 1, 2, 3, 4, 5, 6, 7
   2899    REPX    {paddd  x, m13}, m0, m1, m4, m7
   2900    REPX    {psrad  x, 12 }, m0, m1, m4, m7
   2901    call m(idct_8x8_internal_10bpc).main_fast
   2902    call m(idct_16x8_internal_10bpc).main_fast
   2903    vpbroadcastd        m11, [o(pd_1)]
   2904    call m(idct_8x16_internal_10bpc).main_end2
           ; Narrow the dword results to saturated words, then shuffle bytes
           ; with idct8x32p (m0, m2) and with the same table rotated by 16 bits
           ; per dword (m1, m3).
   2905    mova                 m8, [o(idct8x32p)]
   2906    packssdw             m0, m4
   2907    packssdw             m1, m5
   2908    packssdw             m2, m6
   2909    packssdw             m3, m7
   2910    mova                 m6, [dup16_perm]
   2911    vpermb               m0, m8, m0
   2912    vpermb               m2, m8, m2
   2913    vprold               m8, 16
   2914    vpermb               m1, m8, m1
   2915    vpermb               m3, m8, m3
   2916    punpckldq            m4, m0, m2
   2917    punpckhdq            m0, m2
   2918    punpckldq            m2, m1, m3
   2919    punpckhdq            m1, m3
   2920    punpckldq           m21, m4, m2
   2921    punpckhdq           m14, m4, m2
   2922    punpckldq           m18, m0, m1
   2923    punpckhdq           m15, m0, m1
           ; Build the second-pass inputs with byte permutations: dup16_perm (m6)
           ; yields 2, 6 and 4; the same table OR-ed with the broadcast pb_32
           ; dword yields 1, 7, 5 and 3. Input 0 is instead widened from words
           ; to dwords and shifted left by 16, leaving each low word zero.
   2924    vpermb               m8, m6, m14 ; 2
   2925    vpermb               m1, m6, m15 ; 6
   2926    vpermb               m7, m6, m18 ; 4
   2927    pmovzxwd             m9, ym21    ; 0
           ; The o(pb_32) operand is the last r5-relative access before r5 is
           ; re-pointed to the 8bpc base for the 8bpc routines called below.
   2928    vpord                m6, [o(pb_32)] {1to16}
   2929    lea                  r5, [o_base_8bpc]
   2930    vpermb              m21, m6, m21 ; 1
   2931    vpermb              m15, m6, m15 ; 7
   2932    vpermb              m18, m6, m18 ; 5
   2933    vpermb              m14, m6, m14 ; 3
   2934    pslld                m9, 16
   2935    call m(idct_16x16_internal_8bpc).main_fast2
   2936    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
   2937 .pass2_end:
        ; Common tail of the .fast, mid-size and .full paths. Hands the results
        ; to write_16x4 eight times (seven calls plus a final tail jump), one
        ; register pair per invocation, in the order m0/m1, m2/m3, m4/m5,
        ; m6/m7, m14/m15, m16/m17, m18/m19, m20/m21.
        ; The first register of each pair is qword-permuted into m8 with index
        ; vector m22, the second into m9 with m23.
        ; NOTE(review): m11 (pw_2048), m12 (zero), m13 (pixel_10bpc_max) and
        ; r6 (stride*3) are presumably inputs of write_16x4, which is defined
        ; outside this view - confirm.
   2938    movshdup            m22, [permC]
   2939    vpbroadcastd        m11, [pw_2048]
   2940    vpbroadcastd        m13, [pixel_10bpc_max]
   2941    lea                  r6, [strideq*3]
   2942    pxor                m12, m12
           ; m23 = m22 with every qword shifted right by 8 bits.
   2943    psrlq               m23, m22, 8
   2944    vpermq               m8, m22, m0
   2945    vpermq               m9, m23, m1
   2946    call m(idct_16x8_internal_10bpc).write_16x4
   2947    vpermq               m8, m22, m2
   2948    vpermq               m9, m23, m3
   2949    call m(idct_16x8_internal_10bpc).write_16x4
   2950    vpermq               m8, m22, m4
   2951    vpermq               m9, m23, m5
   2952    call m(idct_16x8_internal_10bpc).write_16x4
   2953    vpermq               m8, m22, m6
   2954    vpermq               m9, m23, m7
   2955    call m(idct_16x8_internal_10bpc).write_16x4
   2956    vpermq               m8, m22, m14
   2957    vpermq               m9, m23, m15
   2958    call m(idct_16x8_internal_10bpc).write_16x4
   2959    vpermq               m8, m22, m16
   2960    vpermq               m9, m23, m17
   2961    call m(idct_16x8_internal_10bpc).write_16x4
   2962    vpermq               m8, m22, m18
   2963    vpermq               m9, m23, m19
   2964    call m(idct_16x8_internal_10bpc).write_16x4
   2965    vpermq               m8, m22, m20
   2966    vpermq               m9, m23, m21
           ; The last use of m22/m23 is above, so the xmm6/xmm7 values stashed
           ; at function entry can be reloaded before the final write.
   2967 %if WIN64
   2968    movaps             xmm6, [rsp+ 8]
   2969    movaps             xmm7, [rsp+24]
   2970 %endif
           ; Tail call: write_16x4's own return ends the whole function.
           ; NOTE(review): vzeroupper runs while m8/m9 still hold live data;
           ; this relies on m8/m9 not being mapped to zmm0-15 - confirm against
           ; x86inc's AVX-512 register mapping.
   2971    vzeroupper
   2972    jmp m(idct_16x8_internal_10bpc).write_16x4
   2973 .pass1:
        ; First-pass helper, called once on the mid-size path and twice on the
        ; .full path (the second time with cq advanced by 64 bytes).
        ; Reads 16 rows at a 128-byte pitch, each pre-multiplied by m12
        ; (pd_2896, broadcast at function entry): even rows go to m0-m7, odd
        ; rows to m16-m23. The transform itself is done by routines of other
        ; functions that are not visible here; results are returned by the
        ; tail-jumped main_end3.
   2974    pmulld               m0, m12, [cq+128* 0]
   2975    pmulld               m1, m12, [cq+128* 2]
   2976    pmulld               m2, m12, [cq+128* 4]
   2977    pmulld               m3, m12, [cq+128* 6]
   2978    pmulld               m4, m12, [cq+128* 8]
   2979    pmulld               m5, m12, [cq+128*10]
   2980    pmulld               m6, m12, [cq+128*12]
   2981    pmulld               m7, m12, [cq+128*14]
   2982    call m(idct_8x16_internal_10bpc).main_rect2
   2983    pmulld              m16, m12, [cq+128* 1]
   2984    pmulld              m17, m12, [cq+128* 3]
   2985    pmulld              m18, m12, [cq+128* 5]
   2986    pmulld              m19, m12, [cq+128* 7]
   2987    pmulld              m20, m12, [cq+128* 9]
   2988    pmulld              m21, m12, [cq+128*11]
   2989    pmulld              m22, m12, [cq+128*13]
   2990    pmulld              m23, m12, [cq+128*15]
   2991    call m(idct_16x16_internal_10bpc).main_rect2
           ; m11 = pd_1 is set up for main_end2 (presumably its rounding/shift
           ; operand - confirm in idct_16x16_internal_10bpc).
   2992    vpbroadcastd        m11, [o(pd_1)]
   2993    call m(idct_16x16_internal_10bpc).main_end2
           ; Tail call: main_end3 returns directly to the caller of .pass1.
   2994    jmp m(idct_16x16_internal_10bpc).main_end3
   2995 .dconly:
        ; DC-only path, taken from the function entry when eob == 0.
        ; r6d = DC coefficient * 181; the remaining work is done by the 16x8
        ; function's .dconly code, which is outside this view.
   2996    imul                r6d, [cq], 181
           ; On the path from this function's entry eobd is zero, so this store
           ; clears the DC coefficient in the buffer.
   2997    mov                [cq], eobd
           ; NOTE(review): r3 is presumably the register behind eob (4th
           ; argument), making r3d = 32 here - likely the row count for the
           ; shared dconly loop. Confirm against the 16x8 .dconly code.
   2998    or                  r3d, 32
   2999    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly
   3000 
   3001 cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 16, dst, stride, c, eob
        ; Entry point. Register setup used by the code below:
        ;   m10 = pw_2896x8, m11 = pw_1697x16, m13 = pw_8192,
        ;   m12 = m13 + m13, m14 = 0, m15 = pixel_10bpc_max, r6 = stride*9.
        ; .main handles 16 destination rows (four .write_16x2x2 invocations,
        ; each touching rows 0, 1, 8 and 9 relative to the moving dstq).
        ; When eob is 151 or more .main runs twice: once via call, then again
        ; by fall-through with cq moved 64 bytes past its entry value and dstq
        ; moved to row 16. Otherwise it runs once.
   3002 %undef cmp
   3003    vpbroadcastd        m10, [pw_2896x8]
   3004    vpbroadcastd        m11, [pw_1697x16]
   3005    vpbroadcastd        m13, [pw_8192]
   3006    vpbroadcastd        m15, [pixel_10bpc_max]
   3007    lea                  r6, [strideq*9]
   3008    pxor                m14, m14
   3009    paddw               m12, m13, m13 ; pw_16384
   3010    cmp                eobd, 151
   3011    jl .main
   3012    call .main
           ; .main has already advanced cq by 128*4, so the net offset from the
           ; entry value becomes +64. The four writes inside .main advanced dstq
           ; by 8 rows; another 8 rows are skipped here.
   3013    add                  cq, 64-128*4
   3014    lea                dstq, [dstq+strideq*8]
   3015 .main:
           ; First .main_internal call: rows 0-3 and 8-11 (128-byte pitch);
           ; results come back in m2, m4, m6, m8.
   3016    call .main_internal
   3017    add                  cq, 128*4
           ; Scale the first result set by m13 (pw_8192) into m1/m3/m5/m7 so
           ; that m2/m4/m6/m8 are free for the second call.
   3018    pmulhrsw             m1, m13, m2
   3019    pmulhrsw             m3, m13, m4
   3020    pmulhrsw             m5, m13, m6
   3021    pmulhrsw             m7, m13, m8
           ; Second call: rows 4-7 and 12-15 relative to the pre-advance cq.
   3022    call .main_internal
   3023 .main2:
           ; Scale the second result set in place, then pair both sets by
           ; qword interleave and add them to the destination.
   3024    pmulhrsw             m2, m13
   3025    pmulhrsw             m4, m13
   3026    pmulhrsw             m6, m13
   3027    pmulhrsw             m8, m13
   3028    punpcklqdq           m0, m1, m2 ;  0  8
   3029    punpckhqdq           m1, m2     ;  1  9
   3030    call .write_16x2x2
   3031    punpcklqdq           m0, m3, m4 ;  2 10
   3032    punpckhqdq           m1, m3, m4 ;  3 11
   3033    call .write_16x2x2
   3034    punpcklqdq           m0, m5, m6 ;  4 12
   3035    punpckhqdq           m1, m5, m6 ;  5 13
   3036    call .write_16x2x2
           ; The last pair falls through into .write_16x2x2, whose ret then
           ; returns from .main (or from the function when .main was entered by
           ; fall-through).
   3037    punpcklqdq           m0, m7, m8 ;  6 14
   3038    punpckhqdq           m1, m7, m8 ;  7 15
   3039 .write_16x2x2:
        ; Adds the residuals in m0/m1 to four destination rows and stores the
        ; clamped result back.
        ;   m0: low half -> row 0, high half -> row 8
        ;   m1: low half -> row 1, high half -> row 9 (r6 = stride*9)
        ; Clamp bounds are m14 (zeroed at function entry) and m15
        ; (pixel_10bpc_max). Clobbers m2 and m9; advances dstq by two rows.
   3040    mova                ym2, [dstq+strideq*0]
   3041    vinserti32x8         m2, [dstq+strideq*8], 1
   3042    mova                ym9, [dstq+strideq*1]
   3043    vinserti32x8         m9, [dstq+r6       ], 1
   3044    paddw                m0, m2
   3045    paddw                m1, m9
   3046    pmaxsw               m0, m14
   3047    pmaxsw               m1, m14
   3048    pminsw               m0, m15
   3049    pminsw               m1, m15
   3050    mova          [dstq+strideq*0], ym0
   3051    vextracti32x8 [dstq+strideq*8], m0, 1
   3052    mova          [dstq+strideq*1], ym1
   3053    vextracti32x8 [dstq+r6       ], m1, 1
   3054    lea                dstq, [dstq+strideq*2]
   3055    ret
   3056 .main_internal:
        ; Processes coefficient rows 0-3 and 8-11 (128-byte pitch) at cq:
        ; packs them to words, scales them, clears the rows that were read and
        ; transposes. Results are returned in m2, m4, m6 and m8; m0 and m9 are
        ; used as scratch. cq itself is not modified.
           ; Load row n and pack it with row n+8 into saturated signed words.
   3057    mova                 m8, [cq+128* 0]
   3058    packssdw             m8, [cq+128* 8]
   3059    mova                 m6, [cq+128* 1]
   3060    packssdw             m6, [cq+128* 9]
   3061    mova                 m0, [cq+128* 2]
   3062    packssdw             m0, [cq+128*10]
   3063    mova                 m2, [cq+128* 3]
   3064    packssdw             m2, [cq+128*11]
           ; Scale by m10 (pw_2896x8), then reorder the qwords (q3120) after
           ; the pack.
   3065    REPX  {pmulhrsw x, m10}, m8, m6, m0, m2
   3066    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
           ; For each vector x: x += pmulhrsw(pmulhrsw(x, m11), m12) with a
           ; saturating add, where m11 = pw_1697x16 and m12 = pw_16384.
           ; The stores of m14 (zero) clear the rows loaded above.
   3067    pmulhrsw             m4, m11, m8
   3068    pmulhrsw             m9, m11, m6
   3069    REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
   3070    pmulhrsw             m4, m12
   3071    pmulhrsw             m9, m12
   3072    paddsw               m8, m4
   3073    paddsw               m6, m9
   3074    pmulhrsw             m4, m11, m0
   3075    pmulhrsw             m9, m11, m2
   3076    REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
   3077    pmulhrsw             m4, m12
   3078    pmulhrsw             m9, m12
   3079    paddsw               m0, m4
   3080    paddsw               m2, m9
           ; Transpose via word and dword interleaves.
   3081    punpcklwd            m4, m8, m6
   3082    punpckhwd            m8, m6
   3083    punpcklwd            m6, m0, m2
   3084    punpckhwd            m0, m2
   3085    punpckldq            m2, m4, m6 ; 0 1
   3086    punpckhdq            m4, m6     ; 2 3
   3087    punpckldq            m6, m8, m0 ; 4 5
   3088    punpckhdq            m8, m0     ; 6 7
   3089    ret
   3090 
   3091 cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
        ; Entry point. Picks a code path from eob:
        ;   eob == 0         -> .dconly
        ;   eob below 36     -> .fast
        ;   eob 151 or more  -> .full
        ;   otherwise        -> the code after "jge .full" (the *_fast_rect2
        ;                       helpers), ending with a jump to .pass1_end.
        ; r6d carries the start index for the coefficient-clearing loop at
        ; .zero_loop: 8*12 here, raised by 8*16 in .full.
   3092 %undef cmp
   3093    lea                  r5, [o_base]
   3094    test               eobd, eobd
   3095    jz .dconly
           ; m12 = coefficient multiplier, m13 = rounding term, m14/m15 = clamp
           ; bounds used by .idct16_sumsub and .main2.
   3096    vpbroadcastd        m12, [o(pd_2896)]
   3097    vpbroadcastd        m13, [o(pd_2048)]
   3098    vpbroadcastd        m14, [o(clip_18b_min)]
   3099    vpbroadcastd        m15, [o(clip_18b_max)]
   3100 %if WIN64
           ; Manual spill of xmm6/xmm7; both are reloaded at .end.
           ; NOTE(review): presumably needed because the cglobal line declares 0
           ; vector registers, so nothing else preserves these Win64 callee-saved
           ; registers - confirm against x86inc.
   3101    movaps         [rsp+ 8], xmm6
   3102    movaps         [rsp+24], xmm7
   3103 %endif
   3104    mov                 r6d, 8*12
   3105    cmp                eobd, 36
   3106    jl .fast
           ; Loads shared by the mid-size and .full paths: 64-byte rows 0, 4, 8,
           ; 12 into m0-m3 and rows 2, 6, 10, 14 into m16-m19, each multiplied
           ; by m12.
   3107    pmulld               m0, m12, [cq+64* 0]
   3108    pmulld               m1, m12, [cq+64* 4]
   3109    pmulld               m2, m12, [cq+64* 8]
   3110    pmulld               m3, m12, [cq+64*12]
   3111    pmulld              m16, m12, [cq+64* 2]
   3112    pmulld              m17, m12, [cq+64* 6]
   3113    pmulld              m18, m12, [cq+64*10]
   3114    pmulld              m19, m12, [cq+64*14]
   3115    cmp                eobd, 151
   3116    jge .full
           ; Mid-size path: rows 16 and up are never loaded here, so the
           ; *_fast variants of the helpers are used.
   3117    call m(idct_8x16_internal_10bpc).main_fast_rect2
   3118    call m(idct_16x16_internal_10bpc).main_fast_rect2
   3119    call .idct16_sumsub
           ; Spill the idct16 result to cq and fetch odd rows 1-15 into m0-m7.
   3120    call .pass1_load_spill
   3121    call .main_fast_rect2
   3122    jmp .pass1_end
   3123 .full:
        ; Taken when eob is 151 or more; all 32 coefficient rows are used.
        ; Rows 0-14 (even) were already loaded into m0-m3 / m16-m19 before the
        ; branch; the remaining even rows are loaded here, each multiplied by
        ; m12. Falls through into .pass1_end.
   3124    pmulld               m4, m12, [cq+64*16]
   3125    pmulld               m5, m12, [cq+64*20]
   3126    pmulld               m6, m12, [cq+64*24]
   3127    pmulld               m7, m12, [cq+64*28]
   3128    pmulld              m20, m12, [cq+64*18]
   3129    pmulld              m21, m12, [cq+64*22]
   3130    pmulld              m22, m12, [cq+64*26]
   3131    pmulld              m23, m12, [cq+64*30]
           ; Extend the range cleared later by .zero_loop: r6d becomes
           ; 8*12 + 8*16.
   3132    add                 r6d, 8*16
   3133    call m(idct_8x16_internal_10bpc).main_rect2
   3134    call m(idct_16x16_internal_10bpc).main_rect2
   3135    call .idct16_sumsub
           ; Spill the idct16 result to cq and fetch odd rows 1-15 into m0-m7;
           ; odd rows 17-31 are then loaded into m16-m23.
   3136    call .pass1_load_spill
   3137    pmulld              m16, m12, [cq+64*17]
   3138    pmulld              m17, m12, [cq+64*19]
   3139    pmulld              m18, m12, [cq+64*21]
   3140    pmulld              m19, m12, [cq+64*23]
   3141    pmulld              m20, m12, [cq+64*25]
   3142    pmulld              m21, m12, [cq+64*27]
   3143    pmulld              m22, m12, [cq+64*29]
   3144    pmulld              m23, m12, [cq+64*31]
   3145    call .main_rect2
   3146 .pass1_end:
        ; End of the first pass for the mid-size and .full paths.
        ; .idct32_pass1_end combines the registers with the values spilled to
        ; cq by .pass1_load_spill and falls into .transpose_16x32. The qword
        ; interleaves below then arrange the result for the 8bpc second-pass
        ; routines.
           ; m11 = pd_1 serves as both rounding addend and shift count inside
           ; .idct32_pass1_end; r4 points at the odd 64-byte slots (cq+64).
   3147    vpbroadcastd        m11, [o(pd_1)]
   3148    lea                  r4, [cq+64]
   3149    call .idct32_pass1_end
           ; r5 is re-pointed to the 8bpc base before the 8bpc routines are
           ; called (presumably they address their constants relative to it).
   3150    lea                  r5, [o_base_8bpc]
   3151    punpckhqdq          m19, m5, m16  ; 11
   3152    punpcklqdq           m5, m16      ; 10
   3153    punpckhqdq          m16, m2, m1   ;  5
   3154    punpcklqdq           m2, m1       ;  4
   3155    punpcklqdq           m1, m15, m4  ;  2
   3156    punpckhqdq          m15, m4       ;  3
   3157    punpcklqdq           m4, m14, m18 ;  8
   3158    punpckhqdq          m18, m14, m18 ;  9
   3159    punpckhqdq          m14, m0, m20  ;  1
   3160    punpcklqdq           m0, m20      ;  0
   3161    punpckhqdq          m20, m6, m17  ; 13
   3162    punpcklqdq           m6, m17      ; 12
   3163    punpckhqdq          m17, m3, m21  ;  7
   3164    punpcklqdq           m3, m21      ;  6
   3165    punpckhqdq          m21, m7, m8   ; 15
   3166    punpcklqdq           m7, m8       ; 14
   3167    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
   3168    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
   3169    jmp .end
   3170 .fast:
        ; Taken when eob is below 36. Only the ym-width start of 64-byte rows
        ; 0-7 is loaded. Falls through into .end.
           ; Rows 0 and 4 are multiplied by m12 (pd_2896) as they are loaded.
   3171    pmulld              ym0, ym12, [cq+64*0]
   3172    pmulld              ym1, ym12, [cq+64*4]
   3173    movshdup             m7, [o(permB)]
   3174    mova                ym4, [cq+64*2]
   3175    mova                ym5, [cq+64*6]
   3176    mova               ym16, [cq+64*1]
   3177    mova                ym2, [cq+64*5]
   3178    mova                ym3, [cq+64*3]
   3179    mova               ym17, [cq+64*7]
           ; Merge two half-width rows into each full register, using m7 as the
           ; qword index vector.
   3180    vpermt2q             m4, m7, m5 ;  2  6
   3181    vpermt2q            m16, m7, m2 ;  1  5
   3182    vpermt2q            m17, m7, m3 ;  7  3
           ; Finish the scaling of rows 0 and 4: add m13 (pd_2048), arithmetic
           ; shift right by 12, then replicate the half-width data with vpermq.
   3183    paddd               ym0, ym13
   3184    paddd               ym1, ym13
   3185    psrad               ym0, 12
   3186    psrad               ym1, 12
   3187    vpermq               m0, m7, m0 ;  0  0
   3188    vpermq               m1, m7, m1 ;  4  4
           ; Same scaling for the merged registers: x = (x * m12 + m13),
           ; arithmetic shift right by 12.
   3189    REPX    {pmulld x, m12}, m4, m16, m17
   3190    REPX    {paddd  x, m13}, m4, m16, m17
   3191    REPX    {psrad  x, 12 }, m4, m16, m17
   3192    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
   3193    vpbroadcastd        m11, [o(pd_1)]
   3194    call m(idct_16x16_internal_10bpc).main_end2
   3195    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
           ; r5 is re-pointed to the 8bpc base before the 8bpc routine is
           ; called (presumably it addresses its constants relative to it).
   3196    lea                  r5, [o_base_8bpc]
   3197    punpckhqdq          m14, m0, m2 ; 1
   3198    punpcklqdq           m0, m2     ; 0
   3199    punpcklqdq           m1, m3, m4 ; 2
   3200    punpckhqdq          m15, m3, m4 ; 3
   3201    punpcklqdq           m2, m5, m7 ; 4
   3202    punpckhqdq          m16, m5, m7 ; 5
   3203    punpcklqdq           m3, m6, m8 ; 6
   3204    punpckhqdq          m17, m6, m8 ; 7
   3205    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
   3206 .end:
        ; Common tail of all non-DC paths: restore the Win64 spill, clear the
        ; coefficient buffer and write the result to the destination.
   3207 %if WIN64
           ; Reload the xmm6/xmm7 values stashed at function entry.
   3208    movaps             xmm6, [rsp+ 8]
   3209    movaps             xmm7, [rsp+24]
   3210 %endif
   3211    pxor                m12, m12
   3212 .zero_loop:
           ; r6 starts at 8*12 (or 8*12 + 8*16 after .full) and drops by 32 per
           ; iteration down to 0; each iteration clears the four 64-byte slots
           ; starting at cq+r6*8, so slots cq+64*k for k = 0..15 (or 0..31)
           ; end up zeroed.
   3213    mova     [cq+r6*8+64*3], m12
   3214    mova     [cq+r6*8+64*2], m12
   3215    mova     [cq+r6*8+64*1], m12
   3216    mova     [cq+r6*8+64*0], m12
   3217    sub                 r6d, 8*4
   3218    jge .zero_loop
   3219    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
           ; NOTE(review): m11 is used as the pmulhrsw multiplier below and is
           ; presumably set up by write_32x8_start, which is not visible here -
           ; confirm.
   3220    pmulhrsw             m0, m11, m14
   3221    pmulhrsw             m1, m11, m15
   3222    pmulhrsw             m2, m11, m16
   3223    pmulhrsw             m3, m11, m17
   3224    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
   3225    pmulhrsw             m0, m11, m18
   3226    pmulhrsw             m1, m11, m19
   3227    pmulhrsw             m2, m11, m20
   3228    pmulhrsw             m3, m11, m21
           ; Tail call: write_32x4's own return ends the whole function.
   3229    vzeroupper
   3230    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
   3231 .dconly:
        ; DC-only path, taken from the function entry when eob == 0.
        ; NOTE(review): .dconly3 and .dconly2 look like alternative entry
        ; points for other transform sizes; no such jump is visible in this
        ; part of the file - confirm.
   3232    imul                r6d, [cq], 181
           ; On the path from this function's entry eobd is zero, so this store
           ; clears the DC coefficient in the buffer.
   3233    mov                [cq], eobd
           ; r3d is the row counter of .dconly_loop (2 rows per iteration).
           ; NOTE(review): r3 is presumably the register behind eob, which makes
           ; the counter exactly 16 here - confirm.
   3234    or                  r3d, 16
   3235 .dconly3:
           ; r6d = (r6d + 128) asr 8, then r6d = (r6d*181 + 384) asr 9.
   3236    add                 r6d, 128
   3237    sar                 r6d, 8
   3238    imul                r6d, 181
   3239    add                 r6d, 384
   3240    sar                 r6d, 9
   3241 .dconly2:
           ; r6d = (r6d*181 + 2176) asr 12; the low word is broadcast into m2
           ; and the dconly_10bpc constant (m3) is added with signed saturation.
   3242    vpbroadcastd         m3, [o(dconly_10bpc)]
   3243    imul                r6d, 181
   3244    add                 r6d, 2176
   3245    sar                 r6d, 12
   3246    vpbroadcastw         m2, r6d
   3247    paddsw               m2, m3
   3248 .dconly_loop:
           ; Two rows per iteration: signed-saturating add of m2 to the pixels,
           ; then unsigned-saturating subtract of m3.
           ; NOTE(review): this add/subtract pair presumably implements the
           ; clamp to the valid pixel range; it depends on the value of
           ; dconly_10bpc, which is not visible here.
   3249    paddsw               m0, m2, [dstq+strideq*0]
   3250    paddsw               m1, m2, [dstq+strideq*1]
   3251    psubusw              m0, m3
   3252    psubusw              m1, m3
   3253    mova   [dstq+strideq*0], m0
   3254    mova   [dstq+strideq*1], m1
   3255    lea                dstq, [dstq+strideq*2]
   3256    sub                 r3d, 2
   3257    jg .dconly_loop
   3258    RET
   3259 ALIGN function_align
   3260 .idct16_sumsub:
        ; Final butterfly stage of the 16-point transform, on dword data.
        ; For n = 0..6, input m(n) is paired with m(22-n); m7 is paired with m9:
        ;   sum        -> m(n)     (t0..t7)
        ;   difference -> m(23-n)  (t15..t8)
        ; Every result is clamped to [m14, m15] (clip_18b_min/max, broadcast at
        ; function entry). The clamps of one group of four are interleaved
        ; with the arithmetic of the next group.
        ; Each difference overwrites a register whose old value was consumed by
        ; the preceding pair, so the statement order matters.
   3261    psubd               m23, m0, m22 ; t15
   3262    paddd                m0, m22     ; t0
   3263    psubd               m22, m1, m21 ; t14
   3264    paddd                m1, m21     ; t1
   3265    REPX    {pmaxsd x, m14}, m23, m0, m22, m1
   3266    psubd               m21, m2, m20 ; t13
   3267    paddd                m2, m20     ; t2
   3268    REPX    {pminsd x, m15}, m23, m0, m22, m1
   3269    psubd               m20, m3, m19 ; t12
   3270    paddd                m3, m19     ; t3
   3271    REPX    {pmaxsd x, m14}, m21, m2, m20, m3
   3272    psubd               m19, m4, m18 ; t11
   3273    paddd                m4, m18     ; t4
   3274    REPX    {pminsd x, m15}, m21, m2, m20, m3
   3275    psubd               m18, m5, m17 ; t10
   3276    paddd                m5, m17     ; t5
   3277    REPX    {pmaxsd x, m14}, m19, m4, m18, m5
   3278    psubd               m17, m6, m16 ; t9
   3279    paddd                m6, m16     ; t6
   3280    REPX    {pminsd x, m15}, m19, m4, m18, m5
   3281    psubd               m16, m7, m9  ; t8
   3282    paddd                m7, m9      ; t7
   3283    REPX    {pmaxsd x, m14}, m17, m6, m16, m7
   3284    REPX    {pminsd x, m15}, m17, m6, m16, m7
   3285    ret
   3286 .idct32_pass1_end:
        ; Last butterfly stage of the first pass, followed by fall-through
        ; into .transpose_16x32.
        ; Inputs as set up by the caller at .pass1_end: m11 = pd_1,
        ; r4 = cq+64; the values at [cq+128*n] and [r4+128*n] are the ones
        ; stored by .pass1_load_spill.
        ; m12/m13 are qword index vectors derived from permC; they are only
        ; consumed by the transpose that follows.
   3287    psrlq               m12, [o(permC)], 24 ;  0  2  8 10  1  3  9 11
   3288    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
   3289 %macro IDCT32_PASS1_END 2 ; low, high
           ; Both spilled values get m11 added as a rounding term, are combined
           ; by sum and difference with the "low" and "high" registers, shifted
           ; right arithmetically by the per-element count in m11, and packed
           ; to saturated words. Clobbers m8, m9 and m10.
   3290    paddd                m8, m11, [r4+128*%1]
   3291    paddd                m9, m11, [cq+128*%1]
   3292    psubd               m10, m8, m%1  ; out 16+n
   3293    paddd                m8, m%1      ; out 15-n
   3294    paddd               m%1, m9, m%2  ; out  0+n
   3295    psubd                m9, m%2      ; out 31-n
   3296    REPX   {vpsravd x, m11}, m10, m%1, m8, m9
   3297    packssdw            m%1, m10      ;  0+n 16+n
   3298    packssdw            m%2, m8, m9   ; 15-n 31-n
   3299 %endmacro
   3300    IDCT32_PASS1_END      0, 23       ;  0 16, 15 31
   3301    IDCT32_PASS1_END      7, 16       ;  7 23,  8 24
   3302    IDCT32_PASS1_END      1, 22       ;  1 17, 14 30
   3303    IDCT32_PASS1_END      6, 17       ;  6 22,  9 25
   3304    IDCT32_PASS1_END      2, 21       ;  2 18, 13 29
   3305    IDCT32_PASS1_END      5, 18       ;  5 21, 10 26
   3306    IDCT32_PASS1_END      3, 20       ;  3 19, 12 28
   3307    IDCT32_PASS1_END      4, 19       ;  4 20, 11 27
   3308 .transpose_16x32:
        ; Transposes the packed word data held in m0-m7 and m16-m23.
        ; Expects the qword index vectors m12 and m13 prepared at the top of
        ; .idct32_pass1_end. Results are left in m0-m8 and m14-m21 as listed
        ; by the trailing comments of the dword interleaves; the caller
        ; finishes with qword interleaves.
           ; Stage 1: two-source qword permutes of m(n) with m(n+16), n = 0..7.
           ; vpermi2q overwrites its index operand, so m13 is copied into the
           ; destination first; vpermt2q overwrites its first table operand
           ; m(n) and leaves the index vector m12 intact.
   3309    mova                m14, m13
   3310    vpermi2q            m14, m0, m16
   3311    vpermt2q             m0, m12, m16
   3312    mova                m15, m13
   3313    vpermi2q            m15, m1, m17
   3314    vpermt2q             m1, m12, m17
   3315    mova                m16, m13
   3316    vpermi2q            m16, m2, m18
   3317    vpermt2q             m2, m12, m18
   3318    mova                m17, m13
   3319    vpermi2q            m17, m3, m19
   3320    vpermt2q             m3, m12, m19
   3321    mova                m18, m13
   3322    vpermi2q            m18, m4, m20
   3323    vpermt2q             m4, m12, m20
   3324    mova                m19, m13
   3325    vpermi2q            m19, m5, m21
   3326    vpermt2q             m5, m12, m21
   3327    mova                m20, m13
   3328    vpermi2q            m20, m6, m22
   3329    vpermt2q             m6, m12, m22
   3330    mova                m21, m13
   3331    vpermi2q            m21, m7, m23
   3332    vpermt2q             m7, m12, m23
           ; Stage 2: word interleaves. In the trailing comments a letter names
           ; a source register (a-h) and the number an element position.
   3333    punpckhwd            m8, m2, m3   ; c04 d04 c05 d05 c06 d06 c07 d07
   3334    punpcklwd            m2, m3       ; c00 d00 c01 d01 c02 d02 c03 d03
   3335    punpckhwd            m3, m0, m1   ; a04 b04 a05 b05 a06 b06 a07 b07
   3336    punpcklwd            m0, m1       ; a00 b00 a01 b01 a02 b02 a03 b03
   3337    punpckhwd            m1, m4, m5   ; e04 f04 e05 f05 e06 f06 e07 f07
   3338    punpcklwd            m4, m5       ; e00 f00 e01 f01 e02 f02 e03 f03
   3339    punpckhwd            m5, m6, m7   ; g04 h04 g05 h05 g06 h06 g07 h07
   3340    punpcklwd            m6, m7       ; g00 h00 g01 h01 g02 h02 g03 h03
   3341    punpckhwd            m7, m14, m15 ; a12 b12 a13 b13 a14 b14 a15 b15
   3342    punpcklwd           m14, m15      ; a08 b08 a09 b09 a10 b10 a11 b11
   3343    punpckhwd           m15, m16, m17 ; c12 d12 c13 d13 c14 d14 c15 d15
   3344    punpcklwd           m16, m17      ; c08 d08 c09 d09 c10 d10 c11 d11
   3345    punpckhwd           m17, m18, m19 ; e12 f12 e13 f13 e14 f14 e15 f15
   3346    punpcklwd           m18, m19      ; e08 f08 e09 f09 e10 f10 e11 f11
   3347    punpckhwd           m19, m20, m21 ; g12 h12 g13 h13 g14 h14 g15 h15
   3348    punpcklwd           m20, m21      ; g08 h08 g09 h09 g10 h10 g11 h11
           ; Stage 3: dword interleaves. Destinations are chosen so that every
           ; register is overwritten only after its last read.
   3349    punpckhdq           m21, m1, m5   ; e06 f06 g06 h06 e07 f07 g07 h07
   3350    punpckldq            m1, m5       ; e04 f04 g04 h04 e05 f05 g05 h05
   3351    punpckhdq            m5, m14, m16 ; a10 b10 c10 d10 a11 b11 c11 d11
   3352    punpckldq           m14, m16      ; a08 b08 c08 d08 a09 b09 c09 d09
   3353    punpckhdq           m16, m18, m20 ; e10 f10 g10 h10 e11 f11 g11 h11
   3354    punpckldq           m18, m20      ; e08 f08 g08 h08 e09 f09 g09 h09
   3355    punpckldq           m20, m4, m6   ; e00 f00 g00 h00 e01 f01 g01 h01
   3356    punpckhdq            m4, m6       ; e02 f02 g02 h02 e03 f03 g03 h03
   3357    punpckldq            m6, m7, m15  ; a12 b12 c12 d12 a13 b13 c13 d13
   3358    punpckhdq            m7, m15      ; a14 b14 c14 d14 a15 b15 c15 d15
   3359    punpckhdq           m15, m0, m2   ; a02 b02 c02 d02 a03 b03 c03 d03
   3360    punpckldq            m0, m2       ; a00 b00 c00 d00 a01 b01 c01 d01
   3361    punpckldq            m2, m3, m8   ; a04 b04 c04 d04 a05 b05 c05 d05
   3362    punpckhdq            m3, m8       ; a06 b06 c06 d06 a07 b07 c07 d07
   3363    punpckhdq            m8, m17, m19 ; e14 f14 g14 h14 e15 f15 g15 h15
   3364    punpckldq           m17, m19      ; e12 f12 g12 h12 e13 f13 g13 h13
   3365    ret
   3366 .pass1_load_spill:
        ; Swaps register contents with the coefficient buffer between the two
        ; halves of the first pass:
        ;   1. m0-m7 are stored to the even 64-byte slots 0, 2, ..., 14;
        ;   2. the odd slots 1, 3, ..., 15 are loaded into m0-m7, multiplied
        ;      by m12;
        ;   3. m23 down to m16 are stored into those same odd slots.
        ; Step 2 has to precede step 3 because both use the same addresses.
        ; The stored values are read back later by .idct32_pass1_end.
   3367    mova         [cq+64* 0], m0
   3368    mova         [cq+64* 2], m1
   3369    mova         [cq+64* 4], m2
   3370    mova         [cq+64* 6], m3
   3371    mova         [cq+64* 8], m4
   3372    mova         [cq+64*10], m5
   3373    mova         [cq+64*12], m6
   3374    mova         [cq+64*14], m7
   3375    pmulld               m0, m12, [cq+64* 1]
   3376    pmulld               m1, m12, [cq+64* 3]
   3377    pmulld               m2, m12, [cq+64* 5]
   3378    pmulld               m3, m12, [cq+64* 7]
   3379    pmulld               m4, m12, [cq+64* 9]
   3380    pmulld               m5, m12, [cq+64*11]
   3381    pmulld               m6, m12, [cq+64*13]
   3382    pmulld               m7, m12, [cq+64*15]
   3383    mova         [cq+64* 1], m23
   3384    mova         [cq+64* 3], m22
   3385    mova         [cq+64* 5], m21
   3386    mova         [cq+64* 7], m20
   3387    mova         [cq+64* 9], m19
   3388    mova         [cq+64*11], m18
   3389    mova         [cq+64*13], m17
   3390    mova         [cq+64*15], m16
   3391    ret
   3392 .main_fast2_rect2:
   3393    REPX     {paddd x, m13}, m0, m1, m2, m3
   3394    REPX     {psrad x, 12 }, m0, m1, m2, m3
   3395 .main_fast2: ; bottom 3/4 is zero
   3396    pmulld              m23, m0, [o(pd_4091)] {1to16} ; t31a
   3397    pmulld               m0, [o(pd_201)] {1to16}      ; t16a
   3398    pmulld              m20, m3, [o(pd_1380)] {1to16} ; t19a
   3399    pmulld               m3, [o(pd_3857)] {1to16}     ; t28a
   3400    pmulld              m21, m2, [o(pd_3973)] {1to16} ; t27a
   3401    pmulld               m2, [o(pd_995)] {1to16}      ; t20a
   3402    pmulld               m6, m1, [o(pd_601)] {1to16}  ; t23a
   3403    pmulld              m17, m1, [o(pd_4052)] {1to16} ; t24a
   3404    REPX  {psubd x, m13, x}, m20, m6
   3405    REPX    {paddd  x, m13}, m23, m0, m3, m21, m2, m17
   3406    REPX    {psrad  x, 12 }, m20, m6, m23, m0, m3, m21, m2, m17
   3407    mova                 m8, m0
   3408    mova                m16, m23
   3409    mova                 m7, m20
   3410    mova                 m4, m3
   3411    mova                m19, m2
   3412    mova                m18, m21
   3413    mova                 m5, m6
   3414    mova                m22, m17
   3415    jmp .main3
   3416 .main_fast_rect2:
   3417    call m(idct_8x16_internal_10bpc).round
   3418 .main_fast: ; bottom half is zero
   3419    pmulld              m23, m0, [o(pd_4091)] {1to16} ; t31a
   3420    pmulld               m0, [o(pd_201)] {1to16}      ; t16a
   3421    pmulld              m16, m7, [o(pd_2751)] {1to16} ; t17a
   3422    pmulld               m7, [o(pd_3035)] {1to16}     ; t30a
   3423    pmulld              m19, m4, [o(pd_3703)] {1to16} ; t29a
   3424    pmulld               m4, [o(pd_1751)] {1to16}     ; t18a
   3425    pmulld              m20, m3, [o(pd_1380)] {1to16} ; t19a
   3426    pmulld               m3, [o(pd_3857)] {1to16}     ; t28a
   3427    pmulld              m21, m2, [o(pd_3973)] {1to16} ; t27a
   3428    pmulld               m2, [o(pd_995)] {1to16}      ; t20a
   3429    pmulld              m18, m5, [o(pd_2106)] {1to16} ; t21a
   3430    pmulld               m5, [o(pd_3513)] {1to16}     ; t26a
   3431    pmulld              m17, m6, [o(pd_3290)] {1to16} ; t25a
   3432    pmulld               m6, [o(pd_2440)] {1to16}     ; t22a
   3433    pmulld              m22, m1, [o(pd_601)] {1to16}  ; t23a
   3434    pmulld               m1, [o(pd_4052)] {1to16}     ; t24a
   3435    REPX  {psubd x, m13, x}, m16, m20, m18, m22
   3436    call m(idct_16x16_internal_10bpc).round3
   3437    jmp .main2
   3438 .main_rect2:
   3439    call m(idct_8x16_internal_10bpc).round
   3440    call m(idct_16x16_internal_10bpc).round
   3441 .main:
   3442    ITX_MULSUB_2D         0, 23,  8,  9, 10, _,  201, 4091 ; t16a, t31a
   3443    ITX_MULSUB_2D        16,  7,  8,  9, 10, _, 3035, 2751 ; t17a, t30a
   3444    ITX_MULSUB_2D         4, 19,  8,  9, 10, _, 1751, 3703 ; t18a, t29a
   3445    ITX_MULSUB_2D        20,  3,  8,  9, 10, _, 3857, 1380 ; t19a, t28a
   3446    ITX_MULSUB_2D         2, 21,  8,  9, 10, _,  995, 3973 ; t20a, t27a
   3447    ITX_MULSUB_2D        18,  5,  8,  9, 10, _, 3513, 2106 ; t21a, t26a
   3448    ITX_MULSUB_2D         6, 17,  8,  9, 10, _, 2440, 3290 ; t22a, t25a
   3449    ITX_MULSUB_2D        22,  1,  8,  9, 10, _, 4052,  601 ; t23a, t24a
   3450    call m(idct_16x16_internal_10bpc).round
   3451 .main2:
   3452    call m(idct_8x16_internal_10bpc).round
   3453    psubd                m8, m0, m16  ; t17
   3454    paddd                m0, m16      ; t16
   3455    psubd               m16, m23, m7  ; t30
   3456    paddd               m23, m7       ; t31
   3457    REPX    {pmaxsd x, m14}, m8, m0, m16, m23
   3458    paddd                m7, m20, m4  ; t19
   3459    psubd               m20, m4       ; t18
   3460    REPX    {pminsd x, m15}, m8, m0, m16, m23
   3461    paddd                m4, m3, m19  ; t28
   3462    psubd                m3, m19      ; t29
   3463    REPX    {pmaxsd x, m14}, m7, m20, m4, m3
   3464    psubd               m19, m2, m18  ; t21
   3465    paddd                m2, m18      ; t20
   3466    REPX    {pminsd x, m15}, m7, m20, m4, m3
   3467    psubd               m18, m21, m5  ; t26
   3468    paddd               m21, m5       ; t27
   3469    REPX    {pmaxsd x, m14}, m19, m2, m18, m21
   3470    psubd                m5, m22, m6  ; t22
   3471    paddd                m6, m22      ; t23
   3472    REPX    {pminsd x, m15}, m19, m2, m18, m21
   3473    psubd               m22, m1, m17  ; t25
   3474    paddd               m17, m1       ; t24
   3475    REPX    {pmaxsd x, m14}, m5, m6, m22, m17
   3476    REPX    {pminsd x, m15}, m5, m6, m22, m17
   3477 .main3:
   3478    vpbroadcastd        m11, [o(pd_4017)]
   3479    vpbroadcastd        m10, [o(pd_799)]
   3480    ITX_MULSUB_2D        16,  8, 9, 1, _, 13, 10, 11    ; t17a, t30a
   3481    ITX_MULSUB_2D         3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a
   3482    vpbroadcastd        m11, [o(pd_2276)]
   3483    vpbroadcastd        m10, [o(pd_3406)]
   3484    ITX_MULSUB_2D        18, 19, 9, 1, _, 13, 10, 11    ; t21a, t26a
   3485    ITX_MULSUB_2D        22,  5, 9, 1, _, 13, 10, 11, 2 ; t25a, t22a
   3486    paddd                m1, m6, m2   ; t23a
   3487    psubd                m6, m2       ; t20a
   3488    psubd                m2, m17, m21 ; t27a
   3489    paddd               m17, m21      ; t24a
   3490    REPX    {pmaxsd x, m14}, m1, m6, m2, m17
   3491    psubd               m21, m23, m4  ; t28a
   3492    paddd               m23, m4       ; t31a
   3493    REPX    {pminsd x, m15}, m1, m6, m2, m17
   3494    psubd                m4, m16, m20 ; t18
   3495    paddd               m16, m20      ; t17
   3496    REPX    {pmaxsd x, m14}, m21, m23, m4, m16
   3497    psubd               m20, m0, m7   ; t19a
   3498    paddd                m0, m7       ; t16a
   3499    REPX    {pminsd x, m15}, m21, m23, m4, m16
   3500    psubd                m7, m8, m3   ; t29
   3501    paddd                m3, m8       ; t30
   3502    REPX    {pmaxsd x, m14}, m20, m0, m7, m3
   3503    paddd                m8, m5, m18  ; t22
   3504    psubd                m5, m18      ; t21
   3505    REPX    {pminsd x, m15}, m20, m0, m7, m3
   3506    psubd               m18, m22, m19 ; t26
   3507    paddd               m22, m19      ; t25
   3508    REPX    {pmaxsd x, m14}, m8, m5, m18, m22
   3509    vpbroadcastd        m11, [o(pd_3784)]
   3510    vpbroadcastd        m10, [o(pd_1567)]
   3511    REPX    {pminsd x, m15}, m8, m5, m18, m22
   3512    ITX_MULSUB_2D        21, 20, 9, 19, _, 13, 10, 11    ; t19,  t28
   3513    ITX_MULSUB_2D         2,  6, 9, 19, _, 13, 10, 11, 2 ; t27,  t20
   3514    ITX_MULSUB_2D         7,  4, 9, 19, _, 13, 10, 11    ; t18a, t29a
   3515    ITX_MULSUB_2D        18,  5, 9, 19, _, 13, 10, 11, 2 ; t26a, t21a
   3516    psubd               m19, m0, m1   ; t23
   3517    paddd                m0, m1       ; t16
   3518    paddd                m1, m8, m16  ; t17a
   3519    psubd                m8, m16, m8  ; t22a
   3520    REPX    {pmaxsd x, m14}, m19, m0, m1, m8
   3521    psubd               m16, m23, m17 ; t24
   3522    paddd               m23, m17      ; t31
   3523    REPX    {pminsd x, m15}, m19, m0, m1, m8
   3524    psubd               m17, m3, m22  ; t25a
   3525    paddd               m22, m3       ; t30a
   3526    REPX    {pmaxsd x, m14}, m16, m23, m17, m22
   3527    paddd                m3, m6, m21  ; t19a
   3528    psubd                m6, m21, m6  ; t20a
   3529    REPX    {pminsd x, m15}, m16, m23, m17, m22
   3530    paddd               m21, m18, m4  ; t29
   3531    psubd               m18, m4, m18  ; t26
   3532    REPX    {pmaxsd x, m14}, m3, m6, m21, m18
   3533    psubd                m4, m20, m2  ; t27a
   3534    paddd               m20, m2       ; t28a
   3535    REPX    {pminsd x, m15}, m3, m6, m21, m18
   3536    paddd                m2, m7, m5   ; t18
   3537    psubd                m7, m5       ; t21
   3538    REPX    {pmaxsd x, m14}, m4, m20, m2, m7
   3539    REPX    {pminsd x, m15}, m4, m20, m2, m7
   3540    REPX    {pmulld x, m12}, m18, m16, m4, m17, m7, m19, m6, m8
   3541    REPX    {paddd  x, m13}, m18, m16, m4, m17
   3542    psubd                m5, m18, m7  ; t21a
   3543    paddd               m18, m7       ; t26a
   3544    psubd                m7, m16, m19 ; t23a
   3545    paddd               m16, m19      ; t24a
   3546    REPX    {psrad  x, 12 }, m5, m18, m7, m16
   3547    paddd               m19, m4, m6   ; t27
   3548    psubd                m4, m6       ; t20
   3549    psubd                m6, m17, m8  ; t22
   3550    paddd               m17, m8       ; t25
   3551    REPX    {psrad  x, 12 }, m19, m4, m6, m17
   3552    ret
   3553 
   3554 cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 16, dst, stride, c, eob
           ; Identity/identity inverse transform entry point (32x16, 10 bpc).
           ; Loads the broadcast constants, then runs .main either once
           ; (eob < 151) or twice (second run on a later part of the coefficient
           ; buffer, with dst = original dst + 32 bytes).
           ; Register roles established here:
           ;   m10 = pw_2896x8, m11 = pw_1697x16, m13 = pw_2048 (pmulhrsw factors)
           ;   m14 = all zero (used to clear coefficients once they are read)
           ;   m15 = pixel_10bpc_max, r6 = stride*9 (neither is read in this
           ;         routine; presumably consumed by the shared .main2 tail of
           ;         the 16x32 function - confirm there)
   3555 %undef cmp
   3556    vpbroadcastd        m10, [pw_2896x8]
   3557    vpbroadcastd        m11, [pw_1697x16]
   3558    vpbroadcastd        m13, [pw_2048]
   3559    vpbroadcastd        m15, [pixel_10bpc_max]
   3560    lea                  r6, [strideq*9]
   3561    pxor                m14, m14
   3562    cmp                eobd, 151
   3563    jl .main                           ; eob < 151: a single .main run is enough
   3564    mov                  r4, dstq      ; keep the original dst for the second run
   3565    call .main
           ; .main itself advanced cq by 64*4, so this brings the total to
           ; 64*16 (NOTE(review): assumes the external .main2 leaves cq alone).
   3566    add                  cq, 64*12
   3567    lea                dstq, [r4+32]   ; +32 bytes; presumably the next 16 pixels of 16 bits each
   3568 .main:
           ; Processes two batches of coefficients through .main_internal.
           ; The first batch is scaled by m13 (pw_2048) into m1/m3/m5/m7, the
           ; second batch stays unscaled in m2/m4/m6/m8, and control is then
           ; transferred (tail jump, so its ret returns to our caller) to
           ; .main2 of the 16x32 identity function, which is outside this view.
   3569    call .main_internal
   3570    add                  cq, 64*4
   3571    pmulhrsw             m1, m13, m2
   3572    pmulhrsw             m3, m13, m4
   3573    pmulhrsw             m5, m13, m6
   3574    pmulhrsw             m7, m13, m8
   3575    call .main_internal
   3576    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
   3577 .main_internal:
           ; Loads 64-byte rows cq+64*{0..3} and cq+64*{8..11}, narrows the
           ; 32-bit coefficients to 16 bits with signed saturation, applies the
           ; scaling below, zeroes the rows that were read, and returns the
           ; word/dword-interleaved result in m2, m4, m6, m8.
           ; Clobbers m0, m9; reads m10, m11, m14.
   3578    mova                 m8, [cq+64* 0]
   3579    packssdw             m8, [cq+64* 8]
   3580    mova                 m6, [cq+64* 1]
   3581    packssdw             m6, [cq+64* 9]
   3582    mova                 m0, [cq+64* 2]
   3583    packssdw             m0, [cq+64*10]
   3584    mova                 m2, [cq+64* 3]
   3585    packssdw             m2, [cq+64*11]
           ; y = 2 * pmulhrsw(x, m10), with pmulhrsw = (a*b + 0x4000) >> 15
   3586    REPX  {pmulhrsw x, m10}, m8, m6, m0, m2
   3587    REPX  {paddsw   x, x  }, m8, m6, m0, m2
           ; q3120 swaps the two middle qwords of each 256-bit half, undoing
           ; the per-128-bit-lane interleave left by packssdw
   3588    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
           ; result = 2*y + pmulhrsw(y, m11), using saturating word adds
   3589    pmulhrsw             m4, m11, m8
   3590    pmulhrsw             m9, m11, m6
   3591    paddsw               m8, m8
   3592    paddsw               m6, m6
           ; clear the first group of rows that was loaded above
   3593    REPX {mova [cq+64*x], m14}, 0, 1, 2, 3
   3594    paddsw               m8, m4
   3595    paddsw               m6, m9
   3596    pmulhrsw             m4, m11, m0
   3597    pmulhrsw             m9, m11, m2
   3598    paddsw               m0, m0
   3599    paddsw               m2, m2
           ; clear the second group of rows that was loaded above
   3600    REPX {mova [cq+64*x], m14}, 8, 9, 10, 11
   3601    paddsw               m0, m4
   3602    paddsw               m2, m9
           ; interleave words, then dwords, of the four vectors
   3603    punpcklwd            m4, m8, m6
   3604    punpckhwd            m8, m6
   3605    punpcklwd            m6, m0, m2
   3606    punpckhwd            m0, m2
   3607    punpckldq            m2, m4, m6 ; 0 1
   3608    punpckhdq            m4, m6     ; 2 3
   3609    punpckldq            m6, m8, m0 ; 4 5
   3610    punpckhdq            m8, m0     ; 6 7
   3611    ret
   3612 
   3613 cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
           ; DCT/DCT inverse transform entry point (32x32, 10 bpc).
           ; Dispatch on eob:
           ;   eob == 0         -> .dconly
           ;   eob <  36        -> .fast2 (via .fast)
           ;   36  <= eob < 136 -> .fast
           ;   136 <= eob < 543 -> .pass1_fast on the cq+64 half, then .pass1
           ;   eob >= 543       -> .pass1 on both halves
           ; r5 is the base pointer for the o() constant addressing; r6d is
           ; set to the start offset used later by the coefficient-clearing
           ; loops (scaled by 8 in their addressing).
   3614 %undef cmp
   3615    lea                  r5, [o_base]
   3616    test               eobd, eobd
   3617    jz .dconly
   3618    vpbroadcastd        m12, [o(pd_2896)]
   3619    vpbroadcastd        m13, [o(pd_2048)]
   3620    vpbroadcastd        m14, [o(clip_18b_min)]
   3621    vpbroadcastd        m15, [o(clip_18b_max)]
   3622    WIN64_SPILL_XMM      30
   3623    cmp                eobd, 136
   3624    jl .fast
   3625    add                  cq, 64        ; work on the upper 64 bytes of each 128-byte row first
   3626    cmp                eobd, 543
   3627    jge .full
   3628    call .pass1_fast ; bottomright 16x16 zero
   3629    mov                 r6d, 16*12
   3630    jmp .lefthalf
   3631 .full:
   3632    call .pass1
   3633    mov                 r6d, 16*28
   3634 .lefthalf:
           ; Spill the 16 result vectors of the first pass-1 call. cq is still
           ; offset by +64 here, so they land in the upper half of each
           ; 128-byte row, where .pass2_start reads them back from cq+64+128*n.
   3635    mova        [cq+128* 0], m0
   3636    mova        [cq+128* 1], m1
   3637    mova        [cq+128* 2], m2
   3638    mova        [cq+128* 3], m3
   3639    mova        [cq+128* 4], m14
   3640    mova        [cq+128* 5], m15
   3641    mova        [cq+128* 6], m16
   3642    mova        [cq+128* 7], m17
   3643    mova        [cq+128* 8], m22
   3644    mova        [cq+128* 9], m23
   3645    mova        [cq+128*10], m24
   3646    mova        [cq+128*11], m25
   3647    mova        [cq+128*12], m26
   3648    mova        [cq+128*13], m27
   3649    mova        [cq+128*14], m28
   3650    mova        [cq+128*15], m29
   3651    sub                  cq, 64        ; back to the start of the buffer
           ; m14/m15 held results above, so the constants are loaded again
           ; before the second pass-1 call
   3652    vpbroadcastd        m12, [o(pd_2896)]
   3653    vpbroadcastd        m13, [o(pd_2048)]
   3654    vpbroadcastd        m14, [o(clip_18b_min)]
   3655    vpbroadcastd        m15, [o(clip_18b_max)]
   3656    call .pass1
   3657    lea                  r5, [o_base_8bpc] ; the 8bpc helpers called from here on use this base
   3658    call .pass2_start
   3659    pxor                m12, m12
   3660 .right_zero_loop:
           ; Clears the cq+64 half of the rows, four rows per iteration,
           ; counting r6 down by 16*4 from the value chosen above until it
           ; goes negative.
   3661    mova [cq+r6*8+64+128*3], m12
   3662    mova [cq+r6*8+64+128*2], m12
   3663    mova [cq+r6*8+64+128*1], m12
   3664    mova [cq+r6*8+64+128*0], m12
   3665    sub                 r6d, 16*4
   3666    jge .right_zero_loop
   3667    mov                 r6d, 16*28     ; .zero_loop (after .end2) then clears all 32 rows at cq
   3668    jmp .end2
   3669 .pass2_start:
           ; Second-pass front end for the path where both halves were
           ; transformed. Loads the vectors spilled at cq+64+128*{0..7}, runs
           ; the 8bpc 32x8 .main and 32x16 .main_oddhalf helpers, spills
           ; m14-m21 to cq+128*{0..7} (read back by .pass2_end), loads
           ; cq+64+128*{8..15} into m14-m21 and tail-jumps to the 8bpc 32x32
           ; .main_oddhalf, whose ret returns to our caller.
           ; NOTE(review): the remaining helper inputs are presumably the
           ; registers left by the preceding .pass1 call - confirm against the
           ; 8bpc helpers, which are outside this view.
   3670    mova                 m4, [cq+64+128* 0]
   3671    mova                 m5, [cq+64+128* 1]
   3672    mova                 m6, [cq+64+128* 2]
   3673    mova                 m7, [cq+64+128* 3]
   3674    mova                m18, [cq+64+128* 4]
   3675    mova                m19, [cq+64+128* 5]
   3676    mova                m20, [cq+64+128* 6]
   3677    mova                m21, [cq+64+128* 7]
   3678    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
   3679    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
   3680    mova         [cq+128*0], m14
   3681    mova         [cq+128*1], m15
   3682    mova         [cq+128*2], m16
   3683    mova         [cq+128*3], m17
   3684    mova         [cq+128*4], m18
   3685    mova         [cq+128*5], m19
   3686    mova         [cq+128*6], m20
   3687    mova         [cq+128*7], m21
   3688    mova                m14, [cq+64+128* 8]
   3689    mova                m15, [cq+64+128* 9]
   3690    mova                m16, [cq+64+128*10]
   3691    mova                m17, [cq+64+128*11]
   3692    mova                m18, [cq+64+128*12]
   3693    mova                m19, [cq+64+128*13]
   3694    mova                m20, [cq+64+128*14]
   3695    mova                m21, [cq+64+128*15]
   3696    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
   3697 .fast: ; topleft 16x16 nonzero
           ; Reached from the entry code when eob < 136. Blocks with eob < 36
           ; go on to .fast2; otherwise a single reduced first pass is followed
           ; by the reduced second pass and the common .end tail.
   3698    cmp                eobd, 36
   3699    jl .fast2
   3700    call .pass1_fast                   ; also sets r6d = 16*12 for .zero_loop
   3701    lea                  r5, [o_base_8bpc] ; switch constant base for the 8bpc helpers
   3702    call .pass2_fast_start
   3703    jmp .end
   3704 .pass2_fast_start:
           ; Second-pass front end for the .fast path: runs the 8bpc 32x16
           ; .main_oddhalf_fast helper, spills m14-m21 to cq+128*{0..7} (read
           ; back by .pass2_end) and tail-jumps to the 8bpc 32x32
           ; .main_oddhalf_fast, whose ret returns to our caller.
   3705    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
   3706    mova         [cq+128*0], m14
   3707    mova         [cq+128*1], m15
   3708    mova         [cq+128*2], m16
   3709    mova         [cq+128*3], m17
   3710    mova         [cq+128*4], m18
   3711    mova         [cq+128*5], m19
   3712    mova         [cq+128*6], m20
   3713    mova         [cq+128*7], m21
   3714    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
   3715 .fast2: ; topleft 8x8 nonzero
           ; Reached from .fast when eob < 36. Only the first 32 bytes (ymm
           ; loads) of rows 0..7 are read. The rows are combined with the permB
           ; qword permutation into the layout noted on each line, then passed
           ; to the 10bpc 32x8 .main_fast2 and 16x16 .main_end helpers and the
           ; local .pass2_fast2_start. Falls through into .end.
   3716    movshdup             m7, [o(permB)]
   3717    mova                ym0, [cq+128*0]
   3718    mova                ym1, [cq+128*4]
   3719    mova                ym4, [cq+128*2]
   3720    mova                ym5, [cq+128*6]
   3721    mova               ym16, [cq+128*1]
   3722    mova                ym2, [cq+128*5]
   3723    mova                ym3, [cq+128*3]
   3724    mova               ym17, [cq+128*7]
   3725    mov                 r6d, 16*4      ; .zero_loop start: only rows 0..7 get cleared
   3726    vpermq               m0, m7, m0 ;  0  0
   3727    vpermq               m1, m7, m1 ;  4  4
   3728    vpermt2q             m4, m7, m5 ;  2  6
   3729    vpermt2q            m16, m7, m2 ;  1  5
   3730    vpermt2q            m17, m7, m3 ;  7  3
   3731    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
   3732    call m(idct_16x16_internal_10bpc).main_end
   3733    call .pass2_fast2_start
   3734 .end:
           ; Common tail of every non-DC-only path: finish pass 2 and write the
           ; output (.pass2_end), clear the coefficient rows that were used,
           ; then restore state and return to the caller.
   3735    pxor                m12, m12       ; zero vector for .zero_loop
   3736 .end2:
           ; Entry used when m12 has already been zeroed by the caller.
           ; NOTE(review): m12 must still be zero when .pass2_end returns;
           ; the write helpers it calls are outside this view - confirm.
   3737    call .pass2_end
   3738 .zero_loop:
           ; Clears four 128-byte rows per iteration starting at byte offset
           ; r6*8, stepping r6 down by 16*4 (= four rows) until it is negative.
   3739    mova    [cq+r6*8+128*3], m12
   3740    mova    [cq+r6*8+128*2], m12
   3741    mova    [cq+r6*8+128*1], m12
   3742    mova    [cq+r6*8+128*0], m12
   3743    sub                 r6d, 16*4
   3744    jge .zero_loop
   3745    WIN64_RESTORE_XMM
   3746    vzeroupper
   3747    ret
   3748 .pass2_fast2_start:
           ; Second-pass front end for the .fast2 path. After the 10bpc 32x8
           ; transpose helper, the qword unpacks below split the data into the
           ; row vectors numbered in the per-line comments, in the registers
           ; the 8bpc helpers take as input. m14-m21 are then spilled to
           ; cq+128*{0..7} (read back by .pass2_end) before the tail jump,
           ; whose ret returns to our caller.
   3749    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
   3750    lea                  r5, [o_base_8bpc] ; switch constant base for the 8bpc helpers
   3751    punpckhqdq          m22, m0, m2 ; 1
   3752    punpcklqdq           m0, m2     ; 0
   3753    punpcklqdq           m1, m5, m7 ; 4
   3754    punpckhqdq          m24, m5, m7 ; 5
   3755    punpcklqdq          m14, m3, m4 ; 2
   3756    punpckhqdq          m23, m3, m4 ; 3
   3757    punpcklqdq          m15, m6, m8 ; 6
   3758    punpckhqdq          m25, m6, m8 ; 7
   3759    mova                m10, m13       ; NOTE(review): presumably the 8bpc helper expects its rounding constant in m10 - confirm
   3760    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
   3761    mova         [cq+128*0], m14
   3762    mova         [cq+128*1], m15
   3763    mova         [cq+128*2], m16
   3764    mova         [cq+128*3], m17
   3765    mova         [cq+128*4], m18
   3766    mova         [cq+128*5], m19
   3767    mova         [cq+128*6], m20
   3768    mova         [cq+128*7], m21
   3769    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
   3770 .pass2_end:
           ; Final butterfly stage and output of the second pass. All sums and
           ; differences use saturating 16-bit arithmetic.
           ; Inputs: m0-m7 and m22-m29 in registers, m14-m21 in registers, and
           ; eight further vectors spilled at cq+128*{0..7} by the
           ; .pass2_*start routines.
           ; Output rows are handed to the write helpers of the 10bpc 32x8
           ; function, which are outside this view.
   3771    psubsw               m9, m0, m29 ; out31
   3772    paddsw               m0, m29     ; out0
   3773    psubsw              m29, m1, m28 ; out30
   3774    paddsw               m1, m28     ; out1
   3775    psubsw              m28, m2, m27 ; out29
   3776    paddsw               m2, m27     ; out2
   3777    psubsw              m27, m3, m26 ; out28
   3778    paddsw               m3, m26     ; out3
   3779    psubsw              m26, m4, m25 ; out27
   3780    paddsw               m4, m25     ; out4
   3781    psubsw              m25, m5, m24 ; out26
   3782    paddsw               m5, m24     ; out5
   3783    psubsw              m24, m6, m23 ; out25
   3784    paddsw               m6, m23     ; out6
   3785    psubsw              m23, m7, m22 ; out24
   3786    paddsw               m7, m22     ; out7
           ; out0-7 are in m0-m7; out24-31 stay parked in m23-m29 and m9
   3787    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
           ; reload the spilled vectors and pair them with m21..m14
   3788    mova                 m0, [cq+128*0]
   3789    mova                 m1, [cq+128*1]
   3790    mova                 m2, [cq+128*2]
   3791    mova                 m3, [cq+128*3]
   3792    mova                 m4, [cq+128*4]
   3793    mova                 m5, [cq+128*5]
   3794    mova                 m6, [cq+128*6]
   3795    mova                 m7, [cq+128*7]
   3796    psubsw              m22, m0, m21 ; out23
   3797    paddsw               m0, m21     ; out8
   3798    psubsw              m21, m1, m20 ; out22
   3799    paddsw               m1, m20     ; out9
   3800    psubsw              m20, m2, m19 ; out21
   3801    paddsw               m2, m19     ; out10
   3802    psubsw              m19, m3, m18 ; out20
   3803    paddsw               m3, m18     ; out11
   3804    psubsw              m18, m4, m17 ; out19
   3805    paddsw               m4, m17     ; out12
   3806    psubsw              m17, m5, m16 ; out18
   3807    paddsw               m5, m16     ; out13
   3808    psubsw              m16, m6, m15 ; out17
   3809    paddsw               m6, m15     ; out14
   3810    psubsw              m15, m7, m14 ; out16
   3811    paddsw               m7, m14     ; out15
   3812    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
           ; out16-31 (m15-m29, m9) are scaled by m11 here, four rows at a time.
           ; NOTE(review): m11 is not set in this routine; presumably
           ; write_32x8_start loads the rounding multiplier - confirm.
   3813    pmulhrsw             m0, m11, m15
   3814    pmulhrsw             m1, m11, m16
   3815    pmulhrsw             m2, m11, m17
   3816    pmulhrsw             m3, m11, m18
   3817    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
   3818    pmulhrsw             m0, m11, m19
   3819    pmulhrsw             m1, m11, m20
   3820    pmulhrsw             m2, m11, m21
   3821    pmulhrsw             m3, m11, m22
   3822    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
   3823    pmulhrsw             m0, m11, m23
   3824    pmulhrsw             m1, m11, m24
   3825    pmulhrsw             m2, m11, m25
   3826    pmulhrsw             m3, m11, m26
   3827    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
   3828    pmulhrsw             m0, m11, m27
   3829    pmulhrsw             m1, m11, m28
   3830    pmulhrsw             m2, m11, m29
   3831    pmulhrsw             m3, m11, m9
           ; tail jump: the helper's ret returns to the caller of .pass2_end
   3832    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
   3833 .dconly:
           ; DC-only shortcut, taken from the entry code when eob == 0.
           ; Computes r6d = (coef[0]*181 + 640) >> 10 and hands over to the
           ; shared .dconly2 tail of the 32x16 function (outside this view).
   3834    imul                r6d, [cq], 181
   3835    mov                [cq], eobd      ; eobd is 0 on the path from the entry check, so this clears the DC coefficient
           ; r3d is eobd (4th named cglobal argument), i.e. 0 on that path, so
           ; this leaves r3d = 32. NOTE(review): presumably the row count
           ; expected by .dconly2 - confirm.
   3836    or                  r3d, 32
   3837    add                 r6d, 640
   3838    sar                 r6d, 10
   3839    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
   3840 .pass1_fast:
           ; Reduced first pass: only rows 0..15 of the coefficient buffer are
           ; read (multiples of 4 here, rows 2/6/10/14 next, and the odd rows
           ; inside .pass1_load_spill). Uses the *_fast variants of the shared
           ; helpers and joins the full path at .pass1_end.
   3841    mova                 m0, [cq+128* 0]
   3842    mova                 m1, [cq+128* 4]
   3843    mova                 m2, [cq+128* 8]
   3844    mova                 m3, [cq+128*12]
   3845    mov                 r6d, 16*12     ; start offset for the coefficient-clearing loops (rows 0..15)
   3846    call m(idct_8x16_internal_10bpc).main_fast
   3847    mova                m16, [cq+128* 2]
   3848    mova                m17, [cq+128* 6]
   3849    mova                m18, [cq+128*10]
   3850    mova                m19, [cq+128*14]
   3851    call m(idct_16x16_internal_10bpc).main_fast
   3852    call .pass1_load_spill
   3853    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
   3854    jmp .pass1_end
   3855 .pass1:
           ; Full first pass over one 64-byte-wide half of the buffer.
           ; Rows are fed to the shared helpers in three groups:
           ;   rows 0,4,...,28  -> idct_8x16_internal_10bpc.main
           ;   rows 2,6,...,30  -> idct_16x16_internal_10bpc.main
           ;   odd rows 1..31   -> 32x16 .main (rows 1..15 are loaded inside
           ;                       .pass1_load_spill, rows 17..31 below)
           ; Returns 16 vectors in m0-m3, m14-m17 and m22-m29.
   3856    mova                 m0, [cq+128* 0]
   3857    mova                 m1, [cq+128* 4]
   3858    mova                 m2, [cq+128* 8]
   3859    mova                 m3, [cq+128*12]
   3860    mova                 m4, [cq+128*16]
   3861    mova                 m5, [cq+128*20]
   3862    mova                 m6, [cq+128*24]
   3863    mova                 m7, [cq+128*28]
   3864    call m(idct_8x16_internal_10bpc).main
   3865    mova                m16, [cq+128* 2]
   3866    mova                m17, [cq+128* 6]
   3867    mova                m18, [cq+128*10]
   3868    mova                m19, [cq+128*14]
   3869    mova                m20, [cq+128*18]
   3870    mova                m21, [cq+128*22]
   3871    mova                m22, [cq+128*26]
   3872    mova                m23, [cq+128*30]
   3873    call m(idct_16x16_internal_10bpc).main
   3874    call .pass1_load_spill
   3875    mova                m16, [cq+128*17]
   3876    mova                m17, [cq+128*19]
   3877    mova                m18, [cq+128*21]
   3878    mova                m19, [cq+128*23]
   3879    mova                m20, [cq+128*25]
   3880    mova                m21, [cq+128*27]
   3881    mova                m22, [cq+128*29]
   3882    mova                m23, [cq+128*31]
   3883    call m(inv_txfm_add_dct_dct_32x16_10bpc).main
   3884 .pass1_end:
           ; Shared by .pass1 and .pass1_fast. m11 = pd_2 and r4 = cq+128*8
           ; are set up for the external idct32_pass1_end helper; the qword
           ; unpacks afterwards produce the vectors numbered in the per-line
           ; comments.
   3885    vpbroadcastd        m11, [o(pd_2)]
   3886    lea                  r4, [cq+128*8]
   3887    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
   3888    punpckhqdq          m22, m0, m20  ;  1
   3889    punpcklqdq           m0, m20      ;  0
   3890    punpckhqdq          m24, m2, m1   ;  5
   3891    punpcklqdq           m1, m2, m1   ;  4
   3892    punpcklqdq           m2, m14, m18 ;  8
   3893    punpckhqdq          m26, m14, m18 ;  9
   3894    punpcklqdq          m14, m15, m4  ;  2
   3895    punpckhqdq          m23, m15, m4  ;  3
   3896    punpckhqdq          m25, m3, m21  ;  7
   3897    punpcklqdq          m15, m3, m21  ;  6
   3898    punpckhqdq          m28, m6, m17  ; 13
   3899    punpcklqdq           m3, m6, m17  ; 12
   3900    punpckhqdq          m27, m5, m16  ; 11
   3901    punpcklqdq          m16, m5, m16  ; 10
   3902    punpckhqdq          m29, m7, m8   ; 15
   3903    punpcklqdq          m17, m7, m8   ; 14
   3904    ret
   3905 .pass1_load_spill:
           ; After the external idct16_sumsub helper, spills its 16 result
           ; vectors into rows 0..15 of the coefficient buffer while loading
           ; the odd input rows 1,3,...,15 into m0-m7.
           ; The loads and stores are interleaved so that each odd row is read
           ; before the slot is overwritten - do not reorder.
           ; m23..m16 go to rows 8..15 in descending register order.
   3906    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
   3907    mova        [cq+128* 0], m0
   3908    mova                 m0, [cq+128* 1] ; read row 1 before it is overwritten on the next line
   3909    mova        [cq+128* 1], m1
   3910    mova        [cq+128* 2], m2
   3911    mova                 m1, [cq+128* 3]
   3912    mova                 m2, [cq+128* 5]
   3913    mova        [cq+128* 3], m3
   3914    mova        [cq+128* 4], m4
   3915    mova                 m3, [cq+128* 7]
   3916    mova                 m4, [cq+128* 9]
   3917    mova        [cq+128* 5], m5
   3918    mova        [cq+128* 6], m6
   3919    mova        [cq+128* 7], m7
   3920    mova                 m5, [cq+128*11]
   3921    mova                 m6, [cq+128*13]
   3922    mova                 m7, [cq+128*15]
   3923    mova        [cq+128* 8], m23
   3924    mova        [cq+128* 9], m22
   3925    mova        [cq+128*10], m21
   3926    mova        [cq+128*11], m20
   3927    mova        [cq+128*12], m19
   3928    mova        [cq+128*13], m18
   3929    mova        [cq+128*14], m17
   3930    mova        [cq+128*15], m16
   3931    ret
   3932 
   3933 cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eob
           ; Identity/identity inverse transform entry point (32x32, 10 bpc).
           ; Runs .main one, three or four times depending on eob:
           ;   eob < 136        -> one run
           ;   136 <= eob < 543 -> three runs
           ;   eob >= 543       -> four runs
           ; Between runs, cq and dst are stepped as shown below.
           ; Register roles established here:
           ;   m13 = pw_8192 (pmulhrsw factor), m14 = all zero (clears
           ;   coefficients once read)
           ;   m15 = pixel_10bpc_max, r6 = stride*9 (neither is read in this
           ;         routine; presumably consumed by the shared .main2 tail of
           ;         the 16x32 function - confirm there)
   3934 %undef cmp
   3935    vpbroadcastd        m13, [pw_8192]
   3936    vpbroadcastd        m15, [pixel_10bpc_max]
   3937    pxor                m14, m14
   3938    lea                  r6, [strideq*9]
   3939    cmp                eobd, 136
   3940    jl .main                           ; eob < 136: a single .main run
   3941    mov                  r4, dstq      ; keep the original dst for the later runs
   3942    call .main
           ; .main added 128*4 to cq; this nets to +64, i.e. the other 64-byte
           ; half of the same rows (NOTE(review): assumes the external .main2
           ; leaves cq alone)
   3943    add                  cq, 64-128*4
   3944    lea                dstq, [dstq+strideq*8]
   3945    call .main
           ; together with the +128*4 from .main this nets to 128*16 above the
           ; original cq (same assumption as above)
   3946    add                  cq, 128*12-64
   3947    lea                dstq, [r4+32]   ; +32 bytes; presumably the next 16 pixels of 16 bits each
   3948    cmp                eobd, 543
   3949    jl .main                           ; eob < 543: this third run is the last one
   3950    call .main
   3951    add                  cq, 64-128*4
   3952    lea                dstq, [dstq+strideq*8]
   3953 .main:
           ; Processes two batches of coefficients through .main_internal.
           ; The first batch is scaled by m13 (pw_8192) into m1/m3/m5/m7, the
           ; second batch stays unscaled in m2/m4/m6/m8, and control is then
           ; transferred (tail jump, so its ret returns to our caller) to
           ; .main2 of the 16x32 identity function, which is outside this view.
   3954    call .main_internal
   3955    add                  cq, 128*4
   3956    pmulhrsw             m1, m13, m2
   3957    pmulhrsw             m3, m13, m4
   3958    pmulhrsw             m5, m13, m6
   3959    pmulhrsw             m7, m13, m8
   3960    call .main_internal
   3961    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
   3962 .main_internal:
           ; Loads the first 64 bytes of rows cq+128*{0..3} and cq+128*{8..11},
           ; narrows the 32-bit coefficients to 16 bits with signed saturation,
           ; zeroes what was read, and returns the word/dword-interleaved
           ; result in m2, m4, m6, m8. No scaling is applied here.
           ; Clobbers m0; reads m14.
   3963    mova                 m8, [cq+128* 0]
   3964    packssdw             m8, [cq+128* 8]
   3965    mova                 m6, [cq+128* 1]
   3966    packssdw             m6, [cq+128* 9]
   3967    mova                 m0, [cq+128* 2]
   3968    packssdw             m0, [cq+128*10]
   3969    mova                 m2, [cq+128* 3]
   3970    packssdw             m2, [cq+128*11]
           ; q3120 swaps the two middle qwords of each 256-bit half, undoing
           ; the per-128-bit-lane interleave left by packssdw
   3971    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
           ; clear the first group of rows that was loaded above
   3972    REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
   3973    punpcklwd            m4, m8, m6
   3974    punpckhwd            m8, m6
   3975    punpcklwd            m6, m0, m2
   3976    punpckhwd            m0, m2
           ; clear the second group of rows that was loaded above
   3977    REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
   3978    punpckldq            m2, m4, m6 ; 0 1
   3979    punpckhdq            m4, m6     ; 2 3
   3980    punpckldq            m6, m8, m0 ; 4 5
   3981    punpckhdq            m8, m0     ; 6 7
   3982    ret
   3983 
   3984 cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
   3985    lea                  r5, [o_base]
   3986    test               eobd, eobd
   3987    jz .dconly
   3988 
   3989    PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob
   3990 %undef cmp
   3991    vpbroadcastd        m12, [o(pd_2896)]
   3992    vpbroadcastd        m13, [o(pd_2048)]
   3993    vpbroadcastd        m14, [o(clip_18b_min)]
   3994    vpbroadcastd        m15, [o(clip_18b_max)]
   3995    cmp                eobd, 36
   3996    jl .fast
   3997    call .pass1
   3998    cmp                eobd, 151
   3999    jge .full
   4000    lea                  r5, [o_base_8bpc]
   4001 
   4002    punpckhwd           m22, m0, m0
   4003    punpckhwd           m23, m1, m1
   4004    punpckhwd           m24, m2, m2
   4005    punpckhwd           m25, m3, m3
   4006    punpckhwd           m26, m4, m4
   4007    punpckhwd           m27, m5, m5
   4008    punpckhwd           m28, m6, m6
   4009    punpckhwd           m29, m7, m7
   4010    punpcklwd           m21, m1, m1
   4011    punpcklwd           m14, m3, m3
   4012    punpcklwd           m18, m5, m5
   4013    punpcklwd           m15, m7, m7
   4014    pxor                 m9, m9
   4015    punpcklwd            m9, m9, m0
   4016    punpcklwd            m8, m2, m2
   4017    punpcklwd            m7, m4, m4
   4018    punpcklwd            m1, m6, m6
   4019    call m(idct_16x16_internal_8bpc).main_fast2
   4020    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
   4021    mova     [rsp+mmsize*0], m14
   4022    mova     [rsp+mmsize*1], m15
   4023    mova     [rsp+mmsize*2], m16
   4024    mova     [rsp+mmsize*3], m17
   4025    mova     [rsp+mmsize*4], m18
   4026    mova     [rsp+mmsize*5], m19
   4027    mova     [rsp+mmsize*6], m20
   4028    mova     [rsp+mmsize*7], m21
   4029    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
   4030 
   4031    pxor                m12, m12
   4032    mov                 r3d, 64*3
   4033 .zero_loop:
   4034    REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3
   4035    sub                 r3d, 64
   4036    jge .zero_loop
   4037 
   4038    jmp .pass2_end
   4039 .full:
   4040    mova         [cq+128*0], m0
   4041    mova         [cq+128*1], m1
   4042    mova         [cq+128*2], m2
   4043    mova         [cq+128*3], m3
   4044    mova         [cq+128*4], m4
   4045    mova         [cq+128*5], m5
   4046    mova         [cq+128*6], m6
   4047    mova         [cq+128*7], m7
   4048    add                  cq, 64
   4049    call .pass1
   4050    sub                  cq, 64
   4051    mova                m22, [cq+128*0] ;  0  1
   4052    mova                m23, [cq+128*1] ;  2  3
   4053    mova                m24, [cq+128*2] ;  4  5
   4054    mova                m25, [cq+128*3] ;  6  7
   4055    mova                m26, [cq+128*4] ;  8  9
   4056    mova                m27, [cq+128*5] ; 10 11
   4057    mova                m28, [cq+128*6] ; 12 13
   4058    mova                m29, [cq+128*7] ; 14 15
   4059    mova         [cq+64* 8], m0
   4060    mova         [cq+64* 9], m1
   4061    mova         [cq+64*10], m2
   4062    mova         [cq+64*11], m3
   4063    mova         [cq+64*12], m4
   4064    mova         [cq+64*13], m5
   4065    mova         [cq+64*14], m6
   4066    mova         [cq+64*15], m7
   4067    lea                  r5, [o_base_8bpc]
   4068 
   4069    punpcklwd           m20, m1, m1
   4070    punpcklwd           m16, m3, m3
   4071    punpcklwd           m19, m5, m5
   4072    punpcklwd           m17, m7, m7
   4073    punpcklwd            m8, m24, m24 ;  4
   4074    punpcklwd            m5, m2, m2   ; 20
   4075    punpcklwd            m1, m28, m28 ; 12
   4076    punpcklwd            m7, m26, m26 ;  8
   4077    punpcklwd            m3, m4, m4   ; 24
   4078    punpcklwd            m4, m6, m6   ; 28
   4079    pxor                 m9, m9
   4080    punpcklwd            m6, m9, m0   ; __ 16
   4081    mova                 m0, m4
   4082    punpcklwd            m9, m9, m22  ; __  0
   4083    call m(idct_16x16_internal_8bpc).main_fast
   4084    punpcklwd           m21, m23, m23 ;  2
   4085    punpcklwd           m15, m29, m29 ; 14
   4086    punpcklwd           m18, m27, m27 ; 10
   4087    punpcklwd           m14, m25, m25 ;  6
   4088    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   4089    mova     [rsp+mmsize*0], m14
   4090    mova     [rsp+mmsize*1], m15
   4091    mova     [rsp+mmsize*2], m16
   4092    mova     [rsp+mmsize*3], m17
   4093    mova     [rsp+mmsize*4], m18
   4094    mova     [rsp+mmsize*5], m19
   4095    mova     [rsp+mmsize*6], m20
   4096    mova     [rsp+mmsize*7], m21
   4097    mova                m21, [cq+64*15]
   4098    mova                m14, [cq+64* 8]
   4099    mova                m17, [cq+64*11]
   4100    mova                m18, [cq+64*12]
   4101    mova                m19, [cq+64*13]
   4102    mova                m16, [cq+64*10]
   4103    mova                m15, [cq+64* 9]
   4104    mova                m20, [cq+64*14]
   4105    REPX   {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
   4106                             m24, m19, m16, m27, m28, m15, m20, m23
   4107    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
   4108 
   4109    pxor                m12, m12
   4110    mov                 r3d, 32*7
   4111 .full_zero_loop:
   4112    REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3
   4113    sub                 r3d, 32
   4114    jge .full_zero_loop
   4115 
   4116    jmp .pass2_end
   4117 .fast:
   4118    mova                ym0, [cq+128*0]
   4119    mova                ym2, [cq+128*4]
   4120    movshdup             m8, [o(permB)]
   4121    mova                ym1, [cq+128*2]
   4122    mova                ym3, [cq+128*6]
   4123    mova                ym4, [cq+128*1]
   4124    mova                ym5, [cq+128*3]
   4125    mova                ym6, [cq+128*5]
   4126    mova                ym7, [cq+128*7]
   4127    vpermt2q             m0, m8, m2 ; 0 4
   4128    vpermt2q             m1, m8, m3 ; 2 6
   4129    vpermt2q             m4, m8, m5 ; 1 3
   4130    vpermt2q             m7, m8, m6 ; 7 5
   4131    call m(idct_8x8_internal_10bpc).main_fast
   4132    call m(idct_16x8_internal_10bpc).main_fast
   4133    vpbroadcastd        m11, [o(pd_2)]
   4134    call m(idct_8x16_internal_10bpc).main_end2
   4135    mova                 m8, [o(idct8x32p)]
   4136    packssdw             m0, m4
   4137    packssdw             m1, m5
   4138    packssdw             m2, m6
   4139    packssdw             m3, m7
   4140    mova                 m6, [dup16_perm]
   4141    vpermb               m0, m8, m0
   4142    vpermb               m2, m8, m2
   4143    vprold               m8, 16
   4144    vpermb               m1, m8, m1
   4145    vpermb               m3, m8, m3
   4146    punpckldq            m4, m0, m2
   4147    punpckhdq            m0, m2
   4148    punpckldq            m2, m1, m3
   4149    punpckhdq            m1, m3
   4150    punpckldq           m21, m4, m2
   4151    punpckhdq           m14, m4, m2
   4152    punpckldq           m18, m0, m1
   4153    punpckhdq           m15, m0, m1
   4154    vpord                m7, m6, [o(pb_32)] {1to16}
   4155    vpermb              m22, m7, m21 ; 1
   4156    pmovzxwd             m9, ym21    ; 0
   4157    vpermb               m8, m6, m18 ; 4
   4158    vpermb              m24, m7, m18 ; 5
   4159    vpermb              m21, m6, m14 ; 2
   4160    vpermb              m23, m7, m14 ; 3
   4161    vpermb              m14, m6, m15 ; 6
   4162    vpermb              m25, m7, m15 ; 7
   4163    lea                  r5, [o_base_8bpc]
   4164    pslld                m9, 16
   4165 
   4166    pxor                 m7, m7
   4167    REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29
   4168 
   4169    call m(idct_16x16_internal_8bpc).main_fast2
   4170    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
   4171    mova     [rsp+mmsize*0], m14
   4172    mova     [rsp+mmsize*1], m15
   4173    mova     [rsp+mmsize*2], m16
   4174    mova     [rsp+mmsize*3], m17
   4175    mova     [rsp+mmsize*4], m18
   4176    mova     [rsp+mmsize*5], m19
   4177    mova     [rsp+mmsize*6], m20
   4178    mova     [rsp+mmsize*7], m21
   4179 
   4180    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
   4181 
   4182    pxor                m12, m12
   4183    REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
   4184 .pass2_end:
           ; Output stage of the enclosing transform (its cglobal header is
           ; outside this view). Register pairs are permuted with index vectors
           ; derived from permC and handed to write_16x4 in m8/m9, sixteen times.
           ; NOTE(review): write_16x4 is defined elsewhere; presumably it also
           ; consumes m11 (pw_2048), m13 (pixel_10bpc_max) and r6 (stride*3),
           ; which are prepared here -- confirm against its definition.
   4185    movshdup            m30, [permC]              ; odd dwords of permC, duplicated
   4186    vpbroadcastd        m11, [pw_2048]
   4187    vpbroadcastd        m13, [pixel_10bpc_max]
   4188    lea                  r6, [strideq*3]
   4189    psrlq               m31, m30, 8               ; second index vector: m30 shifted right 8 bits per qword
           ; First group: m0-m7 are written as they are on entry.
   4190    vpermq               m8, m30, m0
   4191    vpermq               m9, m31, m1
   4192    call m(idct_16x8_internal_10bpc).write_16x4
   4193    vpermq               m8, m30, m2
   4194    vpermq               m9, m31, m3
   4195    call m(idct_16x8_internal_10bpc).write_16x4
   4196    vpermq               m8, m30, m4
   4197    vpermq               m9, m31, m5
   4198    call m(idct_16x8_internal_10bpc).write_16x4
   4199    vpermq               m8, m30, m6
   4200    vpermq               m9, m31, m7
   4201    call m(idct_16x8_internal_10bpc).write_16x4
   4202 
           ; Reload the eight vectors kept in stack slots rsp+mmsize*0..7.
   4203    mova                 m1, [rsp+mmsize*0]
   4204    mova                 m2, [rsp+mmsize*1]
   4205    mova                 m3, [rsp+mmsize*2]
   4206    mova                 m4, [rsp+mmsize*3]
   4207    mova                 m5, [rsp+mmsize*4]
   4208    mova                 m6, [rsp+mmsize*5]
   4209    mova                 m7, [rsp+mmsize*6]
   4210    mova                 m8, [rsp+mmsize*7]
   4211 
           ; Saturating 16-bit butterflies between stack slot k (k = 0..7) and
           ; register m(21-k): sums land in m0..m7, differences overwrite
           ; m21..m14.
   4212    paddsw               m0, m1, m21
   4213    psubsw              m21, m1, m21
   4214    paddsw               m1, m2, m20
   4215    psubsw              m20, m2, m20
   4216    paddsw               m2, m3, m19
   4217    psubsw              m19, m3, m19
   4218    paddsw               m3, m4, m18
   4219    psubsw              m18, m4, m18
   4220    paddsw               m4, m5, m17
   4221    psubsw              m17, m5, m17
   4222    paddsw               m5, m6, m16
   4223    psubsw              m16, m6, m16
   4224    paddsw               m6, m7, m15
   4225    psubsw              m15, m7, m15
   4226    paddsw               m7, m8, m14
   4227    psubsw              m14, m8, m14
   4228 
           ; Second group: the butterfly sums m0-m7.
   4229    vpermq               m8, m30, m0
   4230    vpermq               m9, m31, m1
   4231    call m(idct_16x8_internal_10bpc).write_16x4
   4232    vpermq               m8, m30, m2
   4233    vpermq               m9, m31, m3
   4234    call m(idct_16x8_internal_10bpc).write_16x4
   4235    vpermq               m8, m30, m4
   4236    vpermq               m9, m31, m5
   4237    call m(idct_16x8_internal_10bpc).write_16x4
   4238    vpermq               m8, m30, m6
   4239    vpermq               m9, m31, m7
   4240    call m(idct_16x8_internal_10bpc).write_16x4
   4241 
           ; Third group: the butterfly differences m14-m21.
   4242    vpermq               m8, m30, m14
   4243    vpermq               m9, m31, m15
   4244    call m(idct_16x8_internal_10bpc).write_16x4
   4245    vpermq               m8, m30, m16
   4246    vpermq               m9, m31, m17
   4247    call m(idct_16x8_internal_10bpc).write_16x4
   4248    vpermq               m8, m30, m18
   4249    vpermq               m9, m31, m19
   4250    call m(idct_16x8_internal_10bpc).write_16x4
   4251    vpermq               m8, m30, m20
   4252    vpermq               m9, m31, m21
   4253    call m(idct_16x8_internal_10bpc).write_16x4
   4254 
           ; Fourth group: m22-m29 are written without further arithmetic here.
   4255    vpermq               m8, m30, m22
   4256    vpermq               m9, m31, m23
   4257    call m(idct_16x8_internal_10bpc).write_16x4
   4258    vpermq               m8, m30, m24
   4259    vpermq               m9, m31, m25
   4260    call m(idct_16x8_internal_10bpc).write_16x4
   4261    vpermq               m8, m30, m26
   4262    vpermq               m9, m31, m27
   4263    call m(idct_16x8_internal_10bpc).write_16x4
   4264    vpermq               m8, m30, m28
   4265    vpermq               m9, m31, m29
   4266    call m(idct_16x8_internal_10bpc).write_16x4
   4267    RET
   4268 .pass1:
           ; First-pass helper: loads sixteen 64-byte vectors from cq at a
           ; 128-byte pitch and feeds them to the shared 10bpc idct routines.
           ; Even-numbered rows (0,2,..,14) go to m0-m7 for idct_8x16 .main.
   4269    mova                 m0, [cq+128* 0]
   4270    mova                 m1, [cq+128* 2]
   4271    mova                 m2, [cq+128* 4]
   4272    mova                 m3, [cq+128* 6]
   4273    mova                 m4, [cq+128* 8]
   4274    mova                 m5, [cq+128*10]
   4275    mova                 m6, [cq+128*12]
   4276    mova                 m7, [cq+128*14]
   4277    call m(idct_8x16_internal_10bpc).main
           ; Odd-numbered rows (1,3,..,15) go to m16-m23 for idct_16x16 .main.
   4278    mova                m16, [cq+128* 1]
   4279    mova                m17, [cq+128* 3]
   4280    mova                m18, [cq+128* 5]
   4281    mova                m19, [cq+128* 7]
   4282    mova                m20, [cq+128* 9]
   4283    mova                m21, [cq+128*11]
   4284    mova                m22, [cq+128*13]
   4285    mova                m23, [cq+128*15]
   4286    call m(idct_16x16_internal_10bpc).main
   4287    call m(idct_16x16_internal_10bpc).main_end
           ; Tail jump: main_end3's own return goes back to the caller of .pass1.
   4288    jmp m(idct_16x16_internal_10bpc).main_end3
   4289 .dconly:
           ; DC-only shortcut: scales the first coefficient and continues in the
           ; 16x8 function's shared .dconly2 code.
           ; NOTE(review): the branch that reaches this label is outside this
           ; view; presumably eob == 0 here (and eob lives in r3), so the store
           ; below clears the coefficient and the OR leaves 64 in r3d -- confirm.
   4290    imul                r6d, [cq], 181
   4291    mov                [cq], eobd
   4292    or                  r3d, 64
   4293    add                 r6d, 640                  ; r6d = (dc*181 + 640) >> 10
   4294    sar                 r6d, 10
   4295    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
   4296 
   4297 cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
           ; Entry point. Arguments: dst, stride, c (coefficient buffer), eob.
           ; Dispatch on eob:
           ;   eob == 0   -> .dconly
           ;   eob <  136 -> .fast (which itself branches to .fast2 below 36)
           ;   eob <  543 -> .pass1_fast on the cq+64 half, then .lefthalf
           ;   otherwise  -> .full (.pass1 on the cq+64 half)
           ; The function is declared with 0 vector registers and PROLOGUE is
           ; re-issued after the eob test, presumably so the DC-only path skips
           ; the register/stack setup (see x86inc for the exact semantics).
   4298    lea                  r5, [o_base]
   4299    test               eobd, eobd
   4300    jz .dconly
   4301    PROLOGUE 4, 7, 32, -64*40, dst, stride, c, eob
   4302 %undef cmp
           ; Constants used by the first pass.
   4303    vpbroadcastd        m12, [o(pd_2896)]
   4304    vpbroadcastd        m13, [o(pd_2048)]
   4305    vpbroadcastd        m14, [o(clip_18b_min)]
   4306    vpbroadcastd        m15, [o(clip_18b_max)]
   4307    cmp                eobd, 136
   4308    jl .fast
   4309    add                  cq, 64                   ; address the second 64-byte half of each 128-byte row
   4310    cmp                eobd, 543
   4311    jge .full
   4312    call .pass1_fast ; bottomright 16x16 zero
   4313    jmp .lefthalf                                 ; r3d = 16*12 was set inside .pass1_fast
   4314 .full:
   4315    call .pass1
   4316    mov                 r3d, 16*28                ; start value for .right_zero_loop
   4317 .lefthalf:
           ; Park the sixteen result vectors of the first call in the coefficient
           ; buffer (cq is still offset by +64). .pass2_start reads them back
           ; from [cq+128*k+64] once cq has been restored.
   4318    mova        [cq+128* 0], m27
   4319    mova        [cq+128* 1], m14
   4320    mova        [cq+128* 2], m28
   4321    mova        [cq+128* 3], m15
   4322    mova        [cq+128* 4], m22
   4323    mova        [cq+128* 5], m23
   4324    mova        [cq+128* 6], m24
   4325    mova        [cq+128* 7], m25
   4326    mova        [cq+128* 8], m0
   4327    mova        [cq+128* 9], m26
   4328    mova        [cq+128*10], m20
   4329    mova        [cq+128*11], m21
   4330    mova        [cq+128*12], m18
   4331    mova        [cq+128*13], m16
   4332    mova        [cq+128*14], m17
   4333    mova        [cq+128*15], m3
   4334    sub                  cq, 64                   ; back to the first half
           ; m14/m15 held results that were just stored above, so the first-pass
           ; constants are loaded again before the second .pass1 call.
   4335    vpbroadcastd        m12, [o(pd_2896)]
   4336    vpbroadcastd        m13, [o(pd_2048)]
   4337    vpbroadcastd        m14, [o(clip_18b_min)]
   4338    vpbroadcastd        m15, [o(clip_18b_max)]
   4339    call .pass1
   4340    call .pass2_start
   4341 
   4342    pxor                m31, m31
   4343 .right_zero_loop:
           ; Zero four 64-byte vectors of the +64 half per iteration, starting at
           ; row r3d/16 (r3*8 == 128*(r3/16)) and stepping down by four rows
           ; until row 0 has been cleared.
   4344    REPX {mova [cq+r3*8+64+128*x], m31}, 0, 1, 2, 3
   4345    sub                 r3d, 16*4
   4346    jge .right_zero_loop
   4347    mov                 r3d, 16*28                ; the first half is cleared over all 32 rows
   4348    jmp .left_zero_loop                           ; m31 is still zero, so the pxor at .end is skipped
   4349 .pass2_start:
           ; Second-pass driver for the full path. Inputs are partly live in
           ; registers (left by the preceding .pass1 call) and partly reloaded
           ; from the vectors parked at [cq+128*k+64] by .lefthalf. All heavy
           ; lifting is done by 8bpc routines defined elsewhere.
   4350    vpbroadcastd        m10, [o(pd_2048)]         ; loaded via o() before r5 is repointed
   4351    lea                  r5, [o_base_8bpc]        ; constant base for the 8bpc routines called below
   4352 
   4353    lea                  r4, [rsp+gprsize]        ; stack area; presumably +gprsize skips this call's return address
           ; Four main_part1 calls, each taking m0-m3. For the first call m0 and
           ; m3 are used as left by .pass1.
   4354    mova                 m1, [cq+128*15+64]
   4355    mova                 m2, [cq+128* 8+64]
   4356    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   4357    mova                 m0, m21
   4358    mova                 m1, [cq+128*12+64]
   4359    mova                 m2, [cq+128*11+64]
   4360    mova                 m3, m18
   4361    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   4362    mova                 m0, m20
   4363    mova                 m1, [cq+128*13+64]
   4364    mova                 m2, [cq+128*10+64]
   4365    mova                 m3, m16
   4366    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   4367    mova                 m0, m26
   4368    mova                 m1, [cq+128*14+64]
   4369    mova                 m2, [cq+128* 9+64]
   4370    mova                 m3, m17
   4371    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
   4372    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
   4373 
           ; Inputs for the 32x16 odd-half routine: m0/m1 from registers, the
           ; rest from parked rows 0-3.
   4374    mova                 m0, m27
   4375    mova                 m1, m28
   4376    mova                 m2, [cq+128* 0+64]
   4377    mova                 m3, [cq+128* 2+64]
   4378    mova                m16, [cq+128* 1+64]
   4379    mova                m17, [cq+128* 3+64]
   4380    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
           ; Parked rows 4-7 are loaded into m26-m29 before the final tail jump.
   4381    mova                m26, [cq+128* 4+64]
   4382    mova                m27, [cq+128* 5+64]
   4383    mova                m28, [cq+128* 6+64]
   4384    mova                m29, [cq+128* 7+64]
           ; Spill m14-m21 to stack slots 32-39; .pass2_end reloads them as m0-m7.
   4385    mova        [rsp+64*32+gprsize], m14
   4386    mova        [rsp+64*33+gprsize], m15
   4387    mova        [rsp+64*34+gprsize], m16
   4388    mova        [rsp+64*35+gprsize], m17
   4389    mova        [rsp+64*36+gprsize], m18
   4390    mova        [rsp+64*37+gprsize], m19
   4391    mova        [rsp+64*38+gprsize], m20
   4392    mova        [rsp+64*39+gprsize], m21
           ; Tail jump: the callee's return goes back to the caller of .pass2_start.
   4393    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
   4394 .fast: ; topleft 16x16 nonzero
           ; Reached for eob < 136. Below 36 the even smaller .fast2 path is used.
   4395    cmp                eobd, 36
   4396    jl .fast2
   4397    call .pass1_fast                              ; also sets r3d = 16*12 for .left_zero_loop
   4398    vpbroadcastd        m10, [o(pd_2048)]
   4399    call .pass2_fast_start
   4400    jmp .end
   4401 .fast2: ; topleft 8x8 nonzero
           ; Only the low 32 bytes (ymm) of eight rows are loaded.
   4402    movshdup             m7, [o(permB)]
   4403    mova                ym0, [cq+128*0]
   4404    mova                ym1, [cq+128*4]
   4405    mova                ym4, [cq+128*2]
   4406    mova                ym5, [cq+128*6]
   4407    mova               ym16, [cq+128*1]
   4408    mova                ym2, [cq+128*5]
   4409    mova                ym3, [cq+128*3]
   4410    mova               ym17, [cq+128*7]
   4411    mov                 r3d, 16*4                 ; .left_zero_loop will clear rows 0-7
   4412    vpermq               m0, m7, m0 ;  0  0
   4413    vpermq               m1, m7, m1 ;  4  4
   4414    vpermt2q             m4, m7, m5 ;  2  6
   4415    vpermt2q            m16, m7, m2 ;  1  5
   4416    vpermt2q            m17, m7, m3 ;  7  3
           ; x = (x*2896 + 2048) >> 12, i.e. a scale by 2896/4096 (about 1/sqrt(2)).
   4417    REPX    {pmulld x, m12}, m0, m1, m4, m16, m17
   4418    REPX    {paddd  x, m13}, m0, m1, m4, m16, m17
   4419    REPX    {psrad  x, 12 }, m0, m1, m4, m16, m17
   4420    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
   4421    vpbroadcastd        m11, [o(pd_1)]
   4422    call m(idct_16x16_internal_10bpc).main_end2
   4423 
   4424    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
           ; Interleave qwords into the register layout .pass2_fast2_start
           ; consumes (m27, m0, m21, m20, m26 are read there).
   4425    punpcklqdq          m27, m0, m2 ; 0
   4426    punpckhqdq           m0, m2     ; 1
   4427    punpcklqdq          m22, m3, m4 ; 2
   4428    punpckhqdq          m26, m3, m4 ; 3
   4429    punpcklqdq          m14, m5, m7 ; 4
   4430    punpckhqdq          m20, m5, m7 ; 5
   4431    punpcklqdq          m23, m6, m8 ; 6
   4432    punpckhqdq          m21, m6, m8 ; 7
   4433 
   4434    mova                m10, m13                  ; m13 holds pd_2048 (set at function entry)
   4435    call .pass2_fast2_start
   4436 .end:
   4437 
   4438    pxor                m31, m31                  ; zero: used by the clearing loop and by .pass2_end as lower clamp
   4439 
   4440 .left_zero_loop:
           ; Zero four 64-byte vectors of the first half per iteration, from row
           ; r3d/16 downwards in steps of four rows until row 0 is cleared.
   4441    REPX {mova [cq+r3*8+128*x], m31}, 0, 1, 2, 3
   4442    sub                 r3d, 16*4
   4443    jge .left_zero_loop
   4444 
   4445    call .pass2_end
   4446    RET
   4447 .pass2_end:
           ; Final combine-and-store stage. Four output rows are produced per
           ; .end_sumsub_write call; sixteen calls cover 64 rows.
           ; Stack layout read here (relative to rsp+gprsize):
           ;   slots  0..31 : walked upwards by stklo and downwards by stkhi
           ;   slots 32..39 : eight vectors spilled by the .pass2_*start code
           ; m31 must be zero on entry (it is cleared before .left_zero_loop).
           ; From here on r2..r6 are renamed; 'c'/'eob' are no longer available.
   4448    DEFINE_ARGS dst, stride, _, dst2, stride32, stklo, stkhi
   4449    vpbroadcastd        m30, [pixel_10bpc_max]
   4450    vpbroadcastd        m13, [pw_2048]
   4451 
   4452    mov           stride32q, strideq
   4453    shl           stride32q, 5                    ; stride32 = 32*stride
   4454    lea              stkhiq, [rsp+31*mmsize+gprsize]
   4455    lea               dst2q, [dstq+stride32q]
   4456    lea              stkloq, [rsp+gprsize]
   4457    sub               dst2q, strideq    ; dst31
   4458 
           ; m8/m9 are the sum/difference pair passed to .end_sumsub_write.
   4459    paddsw               m8, m0, m29    ; t0[idct32]
   4460    psubsw               m9, m0, m29    ; t31[idct32]
   4461    call .end_sumsub_write
   4462    paddsw               m8, m1, m28    ; t1[idct32]
   4463    psubsw               m9, m1, m28    ; t30[idct32]
   4464    call .end_sumsub_write
   4465    paddsw               m8, m2, m27    ; t2[idct32]
   4466    psubsw               m9, m2, m27    ; t29[idct32]
   4467    call .end_sumsub_write
   4468    paddsw               m8, m3, m26    ; t3[idct32]
   4469    psubsw               m9, m3, m26    ; t28[idct32]
   4470    call .end_sumsub_write
   4471    paddsw               m8, m4, m25    ; t4[idct32]
   4472    psubsw               m9, m4, m25    ; t27[idct32]
   4473    call .end_sumsub_write
   4474    paddsw               m8, m5, m24    ; t5[idct32]
   4475    psubsw               m9, m5, m24    ; t26[idct32]
   4476    call .end_sumsub_write
   4477    paddsw               m8, m6, m23    ; t6[idct32]
   4478    psubsw               m9, m6, m23    ; t25[idct32]
   4479    call .end_sumsub_write
   4480    paddsw               m8, m7, m22    ; t7[idct32]
   4481    psubsw               m9, m7, m22    ; t24[idct32]
   4482    call .end_sumsub_write
           ; Reload the eight vectors spilled to slots 32-39.
   4483    mova                 m0, [rsp+64*32+gprsize]
   4484    mova                 m1, [rsp+64*33+gprsize]
   4485    mova                 m2, [rsp+64*34+gprsize]
   4486    mova                 m3, [rsp+64*35+gprsize]
   4487    mova                 m4, [rsp+64*36+gprsize]
   4488    mova                 m5, [rsp+64*37+gprsize]
   4489    mova                 m6, [rsp+64*38+gprsize]
   4490    mova                 m7, [rsp+64*39+gprsize]
   4491    paddsw               m8, m0, m21    ; t8[idct32]
   4492    psubsw               m9, m0, m21    ; t23[idct32]
   4493    call .end_sumsub_write
   4494    paddsw               m8, m1, m20    ; t9[idct32]
   4495    psubsw               m9, m1, m20    ; t22[idct32]
   4496    call .end_sumsub_write
   4497    paddsw               m8, m2, m19    ; t10[idct32]
   4498    psubsw               m9, m2, m19    ; t21[idct32]
   4499    call .end_sumsub_write
   4500    paddsw               m8, m3, m18    ; t11[idct32]
   4501    psubsw               m9, m3, m18    ; t20[idct32]
   4502    call .end_sumsub_write
   4503    paddsw               m8, m4, m17    ; t12[idct32]
   4504    psubsw               m9, m4, m17    ; t19[idct32]
   4505    call .end_sumsub_write
   4506    paddsw               m8, m5, m16    ; t13[idct32]
   4507    psubsw               m9, m5, m16    ; t18[idct32]
   4508    call .end_sumsub_write
   4509    paddsw               m8, m6, m15    ; t14[idct32]
   4510    psubsw               m9, m6, m15    ; t17[idct32]
   4511    call .end_sumsub_write
   4512    paddsw               m8, m7, m14    ; t15[idct32]
   4513    psubsw               m9, m7, m14    ; t16[idct32]
   4514    ; fall-through
           ; (the last pair is not called: the ret below returns straight to
           ; the caller of .pass2_end)
   4515 .end_sumsub_write:
           ; In:  m8 = tN, m9 = t(31-N) of the idct32 half; stklo/stkhi point at
           ;      the matching t(32+n)/t(63-n) vectors; dst = row n, dst2 = row 31-n.
           ; Out: rows n, 31-n, 32+n and 63-n are updated in place; the four
           ;      pointers advance one step towards each other.
           ; Clobbers m8-m12.
   4516    mova                m10, [stkhiq]   ; t63-n
   4517    mova                m12, [stkloq]   ; t32+n
   4518    psubsw              m11, m8, m10    ; out63-n
   4519    paddsw               m8, m10        ; out0 +n
   4520    psubsw              m10, m9, m12    ; out32+n
   4521    paddsw               m9, m12        ; out31-n
           ; pmulhrsw by 2048 is a rounding shift: (x + 8) >> 4.
   4522    REPX  {pmulhrsw x, m13}, m11, m8, m10, m9
           ; Add the residual to the destination pixels, then clamp to
           ; [0, pixel_10bpc_max] (m31 = 0, m30 = max).
   4523    paddw                m8, [dstq]
   4524    paddw                m9, [dst2q]
   4525    paddw               m10, [dstq+stride32q]
   4526    paddw               m11, [dst2q+stride32q]
   4527    REPX  {pminsw   x, m30}, m11, m8, m10, m9
   4528    REPX  {pmaxsw   x, m31}, m11, m8, m10, m9
   4529    mova  [dstq           ], m8
   4530    mova  [dst2q          ], m9
   4531    mova  [dstq +stride32q], m10
   4532    mova  [dst2q+stride32q], m11
   4533    add              stkloq, mmsize
   4534    sub              stkhiq, mmsize
   4535    add                dstq, strideq
   4536    sub               dst2q, strideq
   4537    ret
   4538 .pass2_fast_start:
           ; Second-pass driver used after .pass1_fast. Same shape as
           ; .pass2_start, but every input comes from registers and the *_fast /
           ; *_fast2 variants of the 8bpc routines are called (only m0 and m3 are
           ; set up per main_part1_fast call). m10 must already hold pd_2048.
   4539    lea                  r5, [o_base_8bpc]        ; constant base for the 8bpc routines called below
   4540    lea                  r4, [rsp+gprsize]
           ; First call uses m0/m3 as left by .pass1_fast.
   4541    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
   4542    mova                 m0, m21
   4543    mova                 m3, m18
   4544    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
   4545    mova                 m0, m20
   4546    mova                 m3, m16
   4547    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
   4548    mova                 m0, m26
   4549    mova                 m3, m17
   4550    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
   4551    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
   4552 
   4553    mova                 m0, m27
   4554    mova                 m1, m28
   4555    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
           ; Spill m14-m21 to stack slots 32-39; .pass2_end reloads them as m0-m7.
   4556    mova        [rsp+64*32+gprsize], m14
   4557    mova        [rsp+64*33+gprsize], m15
   4558    mova        [rsp+64*34+gprsize], m16
   4559    mova        [rsp+64*35+gprsize], m17
   4560    mova        [rsp+64*36+gprsize], m18
   4561    mova        [rsp+64*37+gprsize], m19
   4562    mova        [rsp+64*38+gprsize], m20
   4563    mova        [rsp+64*39+gprsize], m21
           ; Tail jump: the callee returns to the caller of .pass2_fast_start.
   4564    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
   4565 .pass2_fast2_start:
           ; Smallest second-pass driver, used by .fast2: one input register
           ; (m0) per main_part1_fast2 call and the *_fast3 odd-half variants.
   4566    lea                  r5, [o_base_8bpc]
   4567    lea                  r4, [rsp+gprsize]
           ; First call uses m0 as left by the punpckhqdq in .fast2.
   4568    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
   4569    mova                 m0, m21
   4570    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
   4571    mova                 m0, m20
   4572    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
   4573    mova                 m0, m26
   4574    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2
   4575    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
   4576 
   4577    mova                 m0, m27
   4578    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast3
           ; Spill m14-m21 to stack slots 32-39; .pass2_end reloads them as m0-m7.
   4579    mova        [rsp+64*32+gprsize], m14
   4580    mova        [rsp+64*33+gprsize], m15
   4581    mova        [rsp+64*34+gprsize], m16
   4582    mova        [rsp+64*35+gprsize], m17
   4583    mova        [rsp+64*36+gprsize], m18
   4584    mova        [rsp+64*37+gprsize], m19
   4585    mova        [rsp+64*38+gprsize], m20
   4586    mova        [rsp+64*39+gprsize], m21
   4587    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast3
   4588 .dconly:
           ; DC-only path, taken from the function entry when eob == 0 (before
           ; PROLOGUE has run). The argument names are re-declared because
           ; .pass2_end above renamed r2..r6 for the rest of the source text.
   4589    DEFINE_ARGS dst, stride, c, eob
   4590    imul                r6d, [cq], 181            ; r6d = dc * 181
   4591    mov                [cq], eobd                 ; eob is 0 here, so this clears the DC coefficient
   4592    or                  r3d, 64                   ; r3 is eob (== 0), so r3d becomes 64
           ; Continue in the shared DC-only code of the 32x16 function (not
           ; visible here; presumably r3d is its row count -- confirm).
   4593    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly3
   4594 .pass1_fast:
           ; Reduced first pass: only rows 0..15 of the current 64-byte half are
           ; read. Every input is pre-multiplied by m12 (pd_2896); the *_rect2
           ; helpers are expected to finish that scaling (defined elsewhere).
           ; Sets r3d = 16*12, the start value for the zeroing loops.
   4595    pmulld               m0, m12, [cq+128* 0]
   4596    pmulld               m1, m12, [cq+128* 4]
   4597    pmulld               m2, m12, [cq+128* 8]
   4598    pmulld               m3, m12, [cq+128*12]
   4599    mov                 r3d, 16*12
   4600    call m(idct_8x16_internal_10bpc).main_fast_rect2
   4601    pmulld              m16, m12, [cq+128* 2]
   4602    pmulld              m17, m12, [cq+128* 6]
   4603    pmulld              m18, m12, [cq+128*10]
   4604    pmulld              m19, m12, [cq+128*14]
   4605    call m(idct_16x16_internal_10bpc).main_fast_rect2
   4606    call .pass1_load_spill                        ; spills the idct16 result, loads odd rows 1..15 into m0-m7
   4607    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2
   4608    jmp .pass1_end
   4609 .pass1:
           ; Full first pass over all 32 rows of the current 64-byte half.
           ; Rows 0,4,..,28 -> m0-m7 (idct_8x16 stage).
   4610    pmulld               m0, m12, [cq+128* 0]
   4611    pmulld               m1, m12, [cq+128* 4]
   4612    pmulld               m2, m12, [cq+128* 8]
   4613    pmulld               m3, m12, [cq+128*12]
   4614    pmulld               m4, m12, [cq+128*16]
   4615    pmulld               m5, m12, [cq+128*20]
   4616    pmulld               m6, m12, [cq+128*24]
   4617    pmulld               m7, m12, [cq+128*28]
   4618    call m(idct_8x16_internal_10bpc).main_rect2
           ; Rows 2,6,..,30 -> m16-m23 (idct_16x16 stage).
   4619    pmulld              m16, m12, [cq+128* 2]
   4620    pmulld              m17, m12, [cq+128* 6]
   4621    pmulld              m18, m12, [cq+128*10]
   4622    pmulld              m19, m12, [cq+128*14]
   4623    pmulld              m20, m12, [cq+128*18]
   4624    pmulld              m21, m12, [cq+128*22]
   4625    pmulld              m22, m12, [cq+128*26]
   4626    pmulld              m23, m12, [cq+128*30]
   4627    call m(idct_16x16_internal_10bpc).main_rect2
   4628    call .pass1_load_spill                        ; odd rows 1..15 -> m0-m7
           ; Odd rows 17..31 -> m16-m23.
   4629    pmulld              m16, m12, [cq+128*17]
   4630    pmulld              m17, m12, [cq+128*19]
   4631    pmulld              m18, m12, [cq+128*21]
   4632    pmulld              m19, m12, [cq+128*23]
   4633    pmulld              m20, m12, [cq+128*25]
   4634    pmulld              m21, m12, [cq+128*27]
   4635    pmulld              m22, m12, [cq+128*29]
   4636    pmulld              m23, m12, [cq+128*31]
   4637    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_rect2
   4638 .pass1_end:
           ; Common tail of both first-pass variants.
   4639    vpbroadcastd        m11, [o(pd_1)]
   4640    lea                  r4, [cq+128*8]           ; presumably the spill area read by idct32_pass1_end -- confirm
   4641    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
           ; Interleave qwords into the final register layout; the trailing
           ; numbers name the result held by each destination register.
   4642    punpcklqdq          m27, m0, m20  ;  0
   4643    punpckhqdq           m0, m20      ;  1
   4644    punpcklqdq          m24, m5, m16  ; 10
   4645    punpckhqdq          m16, m5, m16  ; 11
   4646    punpcklqdq          m23, m3, m21  ;  6
   4647    punpckhqdq          m21, m3, m21  ;  7
   4648    punpcklqdq          m25, m7, m8   ; 14
   4649    punpckhqdq           m3, m7, m8   ; 15
   4650    punpcklqdq          m22, m15, m4  ;  2
   4651    punpckhqdq          m26, m15, m4  ;  3
   4652    punpcklqdq          m15, m6, m17  ; 12
   4653    punpckhqdq          m17, m6, m17  ; 13
   4654    punpcklqdq          m28, m14, m18 ;  8
   4655    punpckhqdq          m18, m14, m18 ;  9
   4656    punpcklqdq          m14, m2, m1   ;  4
   4657    punpckhqdq          m20, m2, m1   ;  5
   4658    ret
   4659 .pass1_load_spill:
           ; After idct16_sumsub, spills m0-m7 and m23..m16 to rows 0-15 of the
           ; coefficient buffer and, interleaved with that, loads the odd rows
           ; 1,3,..,15 (each multiplied by m12) into m0-m7.
           ; The instruction order matters: every odd row is read before the
           ; store that overwrites it, and each of m0-m7 is stored before it is
           ; reloaded.
   4660    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
   4661    mova        [cq+128* 0], m0
   4662    pmulld               m0, m12, [cq+128* 1]
   4663    mova        [cq+128* 1], m1
   4664    mova        [cq+128* 2], m2
   4665    pmulld               m1, m12, [cq+128* 3]
   4666    pmulld               m2, m12, [cq+128* 5]
   4667    mova        [cq+128* 3], m3
   4668    mova        [cq+128* 4], m4
   4669    pmulld               m3, m12, [cq+128* 7]
   4670    pmulld               m4, m12, [cq+128* 9]
   4671    mova        [cq+128* 5], m5
   4672    mova        [cq+128* 6], m6
   4673    mova        [cq+128* 7], m7
   4674    pmulld               m5, m12, [cq+128*11]
   4675    pmulld               m6, m12, [cq+128*13]
   4676    pmulld               m7, m12, [cq+128*15]
           ; Rows 8-15 receive m23 down to m16 (reverse register order).
   4677    mova        [cq+128* 8], m23
   4678    mova        [cq+128* 9], m22
   4679    mova        [cq+128*10], m21
   4680    mova        [cq+128*11], m20
   4681    mova        [cq+128*12], m19
   4682    mova        [cq+128*13], m18
   4683    mova        [cq+128*14], m17
   4684    mova        [cq+128*15], m16
   4685    ret
   4686 
   4687 cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
           ; Entry point. Arguments: dst, stride, c (coefficient buffer), eob.
           ; Dispatch on eob:
           ;   eob == 0   -> .dconly
           ;   eob <  36  -> .fast  (defined further down, outside this excerpt)
           ;   eob >= 151 -> .full
           ;   otherwise  -> the reduced path that follows the compares
           ; Coefficients are addressed here in 64-byte units ([cq+64*k]).
   4688 %undef cmp
   4689    lea                  r5, [o_base]
   4690    test               eobd, eobd
   4691    jz .dconly
   4692 
   4693    PROLOGUE 4, 7, 32, -64*32, dst, stride, c, eob
   4694 %undef cmp
   4695    vpbroadcastd        m12, [o(pd_2896)]
   4696    vpbroadcastd        m13, [o(pd_2048)]
   4697    vpbroadcastd        m14, [o(clip_18b_min)]
   4698    vpbroadcastd        m15, [o(clip_18b_max)]
   4699    cmp                eobd, 36
   4700    jl .fast ; 8x8
   4701    cmp                eobd, 151
   4702    jge .full ; 16x16
           ; Reduced path: two inputs per .main_part1_fast call.
   4703    lea                  r4, [idct64_mul_16bpc]   ; constant table; .main_part1* advances r4 by 4*12 per call
   4704    lea                  r6, [rsp+4*64]           ; .main_part1* stores to [r6-64*4 .. r6+64*3], then adds 64*8
   4705    mova                 m0, [cq+64* 1]
   4706    mova                 m3, [cq+64*15]
   4707    call .main_part1_fast
   4708    mova                 m0, [cq+64* 7]
   4709    mova                 m3, [cq+64* 9]
   4710    call .main_part1_fast
   4711    mova                 m0, [cq+64* 5]
   4712    mova                 m3, [cq+64*11]
   4713    call .main_part1_fast
   4714    mova                 m0, [cq+64* 3]
   4715    mova                 m3, [cq+64*13]
   4716    call .main_part1_fast
   4717    call .main_part2
           ; Remaining inputs go through the shared idct8/idct16/idct32 helpers.
   4718    mova                 m0, [cq+64* 0]
   4719    mova                 m1, [cq+64* 8]
   4720    mova                m16, [cq+64* 4]
   4721    mova                m17, [cq+64*12]
   4722    call m(idct_8x16_internal_10bpc).main_fast2
   4723    call m(idct_16x16_internal_10bpc).main_fast2
   4724    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
   4725    call .pass1_load_spill                        ; spills to slots 0-15, loads slots 2,6,10,14 into m0-m3
   4726    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
   4727    mov                 r6d, 12*8                 ; consumed at .idct64_end (not visible in this excerpt)
   4728    jmp .idct64_end
   4729 .full:
           ; Full path: four inputs per .main_part1 call.
   4730    lea                  r4, [idct64_mul_16bpc]
   4731    lea                  r6, [rsp+4*64]
   4732    mova                 m0, [cq+64* 1]
   4733    mova                 m1, [cq+64*31]
   4734    mova                 m2, [cq+64*17]
   4735    mova                 m3, [cq+64*15]
   4736    call .main_part1
   4737    mova                 m0, [cq+64* 7]
   4738    mova                 m1, [cq+64*25]
   4739    mova                 m2, [cq+64*23]
   4740    mova                 m3, [cq+64* 9]
   4741    call .main_part1
   4742    mova                 m0, [cq+64* 5]
   4743    mova                 m1, [cq+64*27]
   4744    mova                 m2, [cq+64*21]
   4745    mova                 m3, [cq+64*11]
   4746    call .main_part1
   4747    mova                 m0, [cq+64* 3]
   4748    mova                 m1, [cq+64*29]
   4749    mova                 m2, [cq+64*19]
   4750    mova                 m3, [cq+64*13]
   4751    call .main_part1
   4752    call .main_part2
   4753    mova                 m0, [cq+64* 0]
   4754    mova                 m1, [cq+64* 8]
   4755    mova                 m2, [cq+64*16]
   4756    mova                 m3, [cq+64*24]
   4757    mova                m16, [cq+64* 4]
   4758    mova                m17, [cq+64*12]
   4759    mova                m18, [cq+64*20]
   4760    mova                m19, [cq+64*28]
   4761    call m(idct_8x16_internal_10bpc).main_fast
   4762    call m(idct_16x16_internal_10bpc).main_fast
   4763    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
   4764    call .pass1_load_spill                        ; loads slots 2,6,10,14 into m0-m3
           ; m4-m7 take the remaining inputs, slots 18,22,26,30.
   4765    mova                 m4, [cq+64*18]
   4766    mova                 m5, [cq+64*22]
   4767    mova                 m6, [cq+64*26]
   4768    mova                 m7, [cq+64*30]
   4769    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
   4770    mov                 r6d, 28*8                 ; consumed at .idct64_end (not visible in this excerpt)
   4771    jmp .idct64_end
   4772 .dconly:
           ; DC-only path, taken from the function entry when eob == 0 (before
           ; PROLOGUE has run, hence the plain 'ret' at the end).
           ; .dconly1 and .dconly2 are additional entry points with r6d at the
           ; corresponding stage of the scaling and r3d holding the row count.
   4773    imul                r6d, [cq], 181            ; r6d = dc * 181
   4774    mov                [cq], eobd                 ; eob is 0 here, so this clears the DC coefficient
   4775    or                  r3d, 16                   ; r3 is eob (== 0): r3d = 16 rows
   4776 .dconly1:
   4777    add                 r6d, 640                  ; r6d = (r6d + 640) >> 10
   4778    sar                 r6d, 10
   4779 .dconly2:
   4780    vpbroadcastd         m3, [o(dconly_10bpc)]
   4781    imul                r6d, 181                  ; r6d = (r6d*181 + 2176) >> 12
   4782    add                 r6d, 2176
   4783    sar                 r6d, 12
   4784    vpbroadcastw         m2, r6d
           ; NOTE(review): dconly_10bpc is not visible here; presumably a bias
           ; chosen so that the signed saturating add clamps at the pixel maximum
           ; and the unsigned saturating subtract clamps at 0 -- confirm.
   4785    paddsw               m2, m3
   4786 .dconly_loop:
           ; One 128-byte row (two 64-byte vectors) per iteration, r3d rows.
   4787    paddsw               m0, m2, [dstq+64*0]
   4788    paddsw               m1, m2, [dstq+64*1]
   4789    psubusw              m0, m3
   4790    psubusw              m1, m3
   4791    mova        [dstq+64*0], m0
   4792    mova        [dstq+64*1], m1
   4793    add                dstq, strideq
   4794    dec                 r3d
   4795    jg .dconly_loop
   4796    ret
   4797 .pass1_load_spill:
           ; Spills m0-m7 to the even 64-byte slots 0,2,..,14 and m23..m16 to the
           ; odd slots 1,3,..,15 of the coefficient buffer. Interleaved with the
           ; stores, slots 2, 6, 10 and 14 are loaded into m0-m3.
           ; The order matters: each of those four slots is read before the
           ; store that overwrites it.
   4798    mova         [cq+64* 0], m0
   4799    mova                 m0, [cq+64* 2]
   4800    mova         [cq+64* 2], m1
   4801    mova                 m1, [cq+64* 6]
   4802    mova         [cq+64* 4], m2
   4803    mova         [cq+64* 6], m3
   4804    mova                 m2, [cq+64*10]
   4805    mova                 m3, [cq+64*14]
   4806    mova         [cq+64* 8], m4
   4807    mova         [cq+64*10], m5
   4808    mova         [cq+64*12], m6
   4809    mova         [cq+64*14], m7
           ; Odd slots receive m23 down to m16 (reverse register order).
   4810    mova         [cq+64* 1], m23
   4811    mova         [cq+64* 3], m22
   4812    mova         [cq+64* 5], m21
   4813    mova         [cq+64* 7], m20
   4814    mova         [cq+64* 9], m19
   4815    mova         [cq+64*11], m18
   4816    mova         [cq+64*13], m17
   4817    mova         [cq+64*15], m16
   4818    ret
   4819 ALIGN function_align
           ; idct64 first part, four entry points sharing one tail:
           ;   .main_part1_fast_rect2 : rounds m0/m3 by (x + m13) >> 12, then
           ;                            falls into .main_part1_fast
           ;   .main_part1_fast       : two inputs  (m0, m3)
           ;   .main_part1_rect2      : rounds m0-m3 by (x + m13) >> 12, then
           ;                            falls into .main_part1
           ;   .main_part1            : four inputs (m0-m3)
           ; In:  r4 = pointer to 12 dword constants, r6 = output pointer,
           ;      m13 = rounding constant, m14/m15 = clip min/max.
           ; Out: eight vectors stored at [r6-64*4 .. r6+64*3]; r4 advanced by
           ;      4*12 bytes and r6 by 64*8 bytes, ready for the next call.
           ; Clobbers m0-m11.
   4820 .main_part1_fast_rect2:
   4821    REPX     {paddd x, m13}, m0, m3
   4822    REPX     {psrad x, 12 }, m0, m3
   4823 .main_part1_fast:
   4824    pmulld               m7, m0, [r4+4*0]{bcstd}    ; t63a
   4825    pmulld               m0, [r4+4*1]{bcstd}        ; t32a
   4826    pmulld               m4, m3, [r4+4*6]{bcstd}    ; t60a
   4827    pmulld               m3, [r4+4*7]{bcstd}        ; t35a
   4828    vpbroadcastd        m10, [r4+4*8]
   4829    vpbroadcastd        m11, [r4+4*9]
   4830    REPX     {paddd x, m13}, m7, m0, m4, m3
   4831    REPX     {psrad x, 12 }, m7, m0, m4, m3
           ; Same as the first butterfly stage of .main_part1 with the m1/m2
           ; inputs equal to zero: the differences reduce to plain copies.
   4832    mova                 m8, m0
   4833    mova                 m1, m7
   4834    mova                 m6, m3
   4835    mova                 m2, m4
   4836    jmp .main_part1b
   4837 .main_part1_rect2:
   4838    REPX     {paddd x, m13}, m0, m1, m2, m3
   4839    REPX     {psrad x, 12 }, m0, m1, m2, m3
   4840 .main_part1: ; idct64 steps 1-5
   4841    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
   4842    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
   4843    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
   4844    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
           ; Each input is multiplied by two broadcast dword constants from the
           ; table at r4, then rounded with (x + m13) >> 12.
   4845    pmulld               m7, m0, [r4+4*0]{bcstd}    ; t63a
   4846    pmulld               m0, [r4+4*1]{bcstd}        ; t32a
   4847    pmulld               m6, m1, [r4+4*2]{bcstd}    ; t62a
   4848    pmulld               m1, [r4+4*3]{bcstd}        ; t33a
   4849    pmulld               m5, m2, [r4+4*4]{bcstd}    ; t61a
   4850    pmulld               m2, [r4+4*5]{bcstd}        ; t34a
   4851    pmulld               m4, m3, [r4+4*6]{bcstd}    ; t60a
   4852    pmulld               m3, [r4+4*7]{bcstd}        ; t35a
   4853    vpbroadcastd        m10, [r4+4*8]
   4854    vpbroadcastd        m11, [r4+4*9]
   4855    REPX     {paddd x, m13}, m7, m0, m6, m1, m5, m2, m4, m3
   4856    REPX     {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
   4857    psubd                m8, m0, m1 ; t33
   4858    paddd                m0, m1     ; t32
   4859    psubd                m1, m7, m6 ; t62
   4860    paddd                m7, m6     ; t63
   4861    psubd                m6, m3, m2 ; t34
   4862    paddd                m3, m2     ; t35
   4863    psubd                m2, m4, m5 ; t61
   4864    paddd                m4, m5     ; t60
   4865 .main_part1b:
           ; Shared tail: clamp to [m14, m15], rotate with the constants in
           ; m10/m11 (ITX_MULSUB_2D is a macro defined elsewhere), butterfly.
   4866    REPX    {pmaxsd x, m14}, m8, m1, m6, m2
   4867    REPX    {pminsd x, m15}, m8, m1, m6, m2
   4868    ITX_MULSUB_2D         1, 8, 5, 9, _, 13, 10, 11    ; t33a, t62a
   4869    ITX_MULSUB_2D         2, 6, 5, 9, _, 13, 10, 11, 2 ; t61a, t34a
   4870    REPX    {pmaxsd x, m14}, m0, m3, m7, m4
   4871    REPX    {pminsd x, m15}, m0, m3, m7, m4
   4872    vpbroadcastd        m10, [r4+4*10]
   4873    vpbroadcastd        m11, [r4+4*11]
   4874    psubd                m5, m0, m3 ; t35a
   4875    paddd                m0, m3     ; t32a
   4876    psubd                m3, m7, m4 ; t60a
   4877    paddd                m7, m4     ; t63a
   4878    psubd                m4, m1, m6 ; t34
   4879    paddd                m1, m6     ; t33
   4880    psubd                m6, m8, m2 ; t61
   4881    paddd                m8, m2     ; t62
   4882    REPX    {pmaxsd x, m14}, m5, m3, m4, m6
   4883    REPX    {pminsd x, m15}, m5, m3, m4, m6
   4884    ITX_MULSUB_2D         3, 5, 2, 9, _, 13, 10, 11 ; t35,  t60
   4885    ITX_MULSUB_2D         6, 4, 2, 9, _, 13, 10, 11 ; t34a, t61a
   4886    REPX    {pmaxsd x, m14}, m0, m7, m1, m8
   4887    REPX    {pminsd x, m15}, m0, m7, m1, m8
   4888    add                  r4, 4*12                  ; next group of 12 constants
           ; Store the eight results symmetrically around r6.
   4889    mova          [r6-64*4], m0
   4890    mova          [r6+64*3], m7
   4891    mova          [r6-64*3], m1
   4892    mova          [r6+64*2], m8
   4893    mova          [r6-64*2], m6
   4894    mova          [r6+64*1], m4
   4895    mova          [r6-64*1], m3
   4896    mova          [r6+64*0], m5
   4897    add                  r6, 64*8                  ; next group of 8 output slots
   4898    ret
   4899 .main_part2: ; idct64 steps 6-9
           ; Combines, in place, the vectors stored by the preceding
           ; .main_part1* calls. On entry r6 is the value left by the last of
           ; those calls; all accesses below are at negative offsets from r6/r4,
           ; i.e. into the area those calls filled.
           ; r6 walks upwards and r4 downwards by one 64-byte slot per
           ; iteration; the loop ends once r6 is no longer below r4.
           ; Uses m12/m13 (multiplier/rounding) and m14/m15 (clip min/max).
           ; Clobbers m0-m11, r4, r6.
   4900    lea                  r4, [r6+64*3]
   4901    sub                  r6, 64*4
   4902    vpbroadcastd        m10, [pd_1567]
   4903    vpbroadcastd        m11, [pd_3784]
   4904 .main_part2_loop:
   4905    mova                 m0, [r6-64*32] ; t32a
   4906    mova                 m1, [r4-64*24] ; t39a
   4907    mova                 m2, [r4-64*32] ; t63a
   4908    mova                 m3, [r6-64*24] ; t56a
   4909    mova                 m4, [r6-64*16] ; t40a
   4910    mova                 m5, [r4-64* 8] ; t47a
   4911    mova                 m6, [r4-64*16] ; t55a
   4912    mova                 m7, [r6-64* 8] ; t48a
   4913    psubd                m8, m0, m1 ; t39
   4914    paddd                m0, m1     ; t32
   4915    psubd                m1, m2, m3 ; t56
   4916    paddd                m2, m3     ; t63
   4917    psubd                m3, m5, m4 ; t40
   4918    paddd                m5, m4     ; t47
   4919    psubd                m4, m7, m6 ; t55
   4920    paddd                m7, m6     ; t48
   4921    REPX    {pmaxsd x, m14}, m8, m1, m3, m4
   4922    REPX    {pminsd x, m15}, m8, m1, m3, m4
           ; Rotations with the pd_1567/pd_3784 pair (ITX_MULSUB_2D is a macro
           ; defined elsewhere).
   4923    ITX_MULSUB_2D         1, 8, 6, 9, _, 13, 10, 11    ; t39a, t56a
   4924    ITX_MULSUB_2D         4, 3, 6, 9, _, 13, 10, 11, 2 ; t55a, t40a
   4925    REPX    {pmaxsd x, m14}, m0, m2, m5, m7
   4926    REPX    {pminsd x, m15}, m0, m5, m2, m7
   4927    psubd                m6, m2, m7 ; t48a
   4928    paddd                m2, m7     ; t63a
   4929    psubd                m7, m0, m5 ; t47a
   4930    paddd                m0, m5     ; t32a
   4931    psubd                m5, m8, m4 ; t55
   4932    paddd                m8, m4     ; t56
   4933    psubd                m4, m1, m3 ; t40
   4934    paddd                m1, m3     ; t39
   4935    REPX    {pmaxsd x, m14}, m6, m7, m5, m4
   4936    REPX    {pminsd x, m15}, m6, m7, m5, m4
           ; Multiply by m12, add the rounding constant once per pair, take
           ; sum and difference, then shift right by 12.
   4937    REPX    {pmulld x, m12}, m6, m7, m5, m4
   4938    REPX    {pmaxsd x, m14}, m2, m0, m8, m1
   4939    REPX    {pminsd x, m15}, m2, m0, m8, m1
   4940    paddd                m6, m13
   4941    paddd                m5, m13
   4942    psubd                m3, m6, m7 ; t47
   4943    paddd                m6, m7     ; t48
   4944    psubd                m7, m5, m4 ; t40a
   4945    paddd                m5, m4     ; t55a
   4946    REPX      {psrad x, 12}, m3, m6, m7, m5
           ; Write the eight results back over the slots that were read.
   4947    mova         [r4-64* 8], m2
   4948    mova         [r6-64*32], m0
   4949    mova         [r6-64* 8], m8
   4950    mova         [r4-64*32], m1
   4951    mova         [r4-64*24], m3
   4952    mova         [r6-64*16], m6
   4953    mova         [r6-64*24], m7
   4954    mova         [r4-64*16], m5
   4955    add                  r6, 64
   4956    sub                  r4, 64
   4957    cmp                  r6, r4
   4958    jl .main_part2_loop
   4959    ret
   4960 .idct64_main_end:
        ; Final idct64 pass-1 butterflies for all 64 outputs, followed by rounding,
        ; shifting and packing to 16 bits. Expands IDCT64_PASS1_ENDx4 for n = 0..3.
        ; In:  cq, r4 = bases for the 128-byte-pitch row loads, r3 = base of the
        ;      64-byte-pitch buffer holding t32..t63, m14/m15 = clamp bounds,
        ;      m11 = value used BOTH as rounding bias (paddd) and as per-element
        ;      shift count (vpsravd); .idct64_end loads pd_2 into it before calling.
        ;
        ; IDCT64_PASS1_END arguments:
        ;   %1    row index for the load from [%9 + %1*128]
        ;   %2    register number holding the term combined with that row; reused as scratch
        ;   %3/%4 row indices (64-byte pitch, base r3) of t32+n / t63-n
        ;   %5-%8 register numbers receiving out0+n, out31-n, out32+n, out63-n
        ;   %9    base register of the first load
   4961 %macro IDCT64_PASS1_END 9
   4962    mova                m%5, [%9+%1*128]    ; t0+n [idct32] + idct64 rounding
   4963    psubd               m%6, m%5, m%2       ; out31-n [idct32] = t31-n [idct64]
   4964    paddd               m%5, m%2            ; out0+n [idct32] = t0+n [idct64]
   4965    REPX    {pmaxsd x, m14}, m%6, m%5
   4966    REPX    {pminsd x, m15}, m%6, m%5
   4967    REPX    {paddd  x, m11}, m%6, m%5 ; add the rounding bias before the last butterfly
   4968    mova                m%2, [r3+%3*64]     ; t32+n [idct64]
   4969    mova                m%7, [r3+%4*64]     ; t63-n [idct64]
   4970    psubd               m%8, m%5, m%7       ; out63-n
   4971    paddd               m%5, m%7            ; out0+n
   4972    psubd               m%7, m%6, m%2       ; out32+n
   4973    paddd               m%6, m%2            ; out31-n
   4974    REPX   {vpsravd x, m11}, m%8, m%5, m%7, m%6 ; arithmetic right shift by the count held in m11
   4975 %endmacro
   4976
        ; IDCT64_PASS1_ENDx4 n: runs IDCT64_PASS1_END four times for index n and packs
        ; the sixteen 32-bit results into eight 16-bit vectors. Four of them stay in
        ; registers m%%r1-m%%r4, the other four are stored to the r3 buffer.
        ; Uses m24-m31 as temporaries.
   4977 %macro IDCT64_PASS1_ENDx4 1
   4978 %assign %%m1 %1         ; t32+n
   4979 %assign %%m2 (7-%1)     ; t39-n
   4980 %assign %%m3 (8+%1)     ; t40+n
   4981 %assign %%m4 (15-%1)    ; t47-n
   4982 %assign %%m5 (16+%1)    ; t48+n
   4983 %assign %%m6 (23-%1)    ; t55-n
   4984 %assign %%m7 (24+%1)    ; t56+n
   4985 %assign %%m8 (31-%1)    ; t63-n
   4986
   4987 %assign %%r1 %1         ; t16+n
   4988 %assign %%r2 (7-%1)     ; t23-n
   4989 %assign %%r3 (16+%1)    ; t24+n
   4990 %assign %%r4 (23-%1)    ; t31-n
   4991
   4992 %assign %%c1 (%1)       ; t0/8+n
   4993 %assign %%c2 (7-%1)     ; t7/15-n
   4994
   4995    IDCT64_PASS1_END   %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27, cq ; out0/31/32/63
   4996    IDCT64_PASS1_END   %%c1, %%r1, %%m4, %%m5, 28, 29, 30, 31, r4 ; out15/16/47/48
   4997    packssdw      m %+ %%r1, m24, m29 ; saturating 32->16-bit pack; result kept in a register
   4998    packssdw      m %+ %%r4, m28, m25
   4999    packssdw            m26, m31
   5000    packssdw            m30, m27
   5001    mova   [r3+%%m5*mmsize], m26 ; remaining two packed vectors go back to the r3 buffer
   5002    mova   [r3+%%m8*mmsize], m30
   5003    IDCT64_PASS1_END   %%c2, %%r3, %%m2, %%m7, 24, 25, 26, 27, cq ; out7/24/39/56
   5004    IDCT64_PASS1_END   %%c2, %%r2, %%m3, %%m6, 28, 29, 30, 31, r4 ; out8/23/40/55
   5005    packssdw      m %+ %%r2, m24, m29
   5006    packssdw      m %+ %%r3, m28, m25
   5007    packssdw            m26, m31
   5008    packssdw            m30, m27
   5009    mova   [r3+%%m6*mmsize], m26
   5010    mova   [r3+%%m7*mmsize], m30
   5011 %endmacro
   5012    IDCT64_PASS1_ENDx4    0
   5013    IDCT64_PASS1_ENDx4    1
   5014    IDCT64_PASS1_ENDx4    2
   5015    IDCT64_PASS1_ENDx4    3
   5016    ret
   5017 .idct64_end:
        ; Finishes pass 1 via .idct64_main_end, clears the coefficient buffer, then
        ; runs .pass2 twice: once on the registers left by pass 1 and once on the
        ; sixteen vectors reloaded from rsp+16..31*mmsize, with dst advanced by 64 bytes.
        ; r6 is expected to hold the starting index for the zeroing loop; it is set
        ; by code outside this view.
   5018    vpbroadcastd        m11, [o(pd_2)] ; rounding bias and shift count used by .idct64_main_end
   5019    lea                  r4, [cq+64]
   5020    mov                  r3, rsp
   5021    lea                  r5, [o_base_8bpc]
   5022    call .idct64_main_end
   5023
   5024    pxor                m12, m12
   5025 .zero_loop:
   5026    REPX {mova [cq+r6*8+64*x], m12}, 0, 1, 2, 3 ; clear four 64-byte rows per iteration
   5027    sub                 r6d, 8*4
   5028    jge .zero_loop ; continue while the index is still non-negative
   5029
   5030    lea                  r3, [strideq*3]
   5031    mov                  r4, dstq ; keep the original dst so the second half can be addressed
   5032    call .pass2
   5033    mova                 m0, [rsp+16*mmsize]
   5034    mova                 m1, [rsp+17*mmsize]
   5035    mova                 m2, [rsp+18*mmsize]
   5036    mova                 m3, [rsp+19*mmsize]
   5037    mova                 m4, [rsp+20*mmsize]
   5038    mova                 m5, [rsp+21*mmsize]
   5039    mova                 m6, [rsp+22*mmsize]
   5040    mova                 m7, [rsp+23*mmsize]
   5041    mova                m16, [rsp+24*mmsize]
   5042    mova                m17, [rsp+25*mmsize]
   5043    mova                m18, [rsp+26*mmsize]
   5044    mova                m19, [rsp+27*mmsize]
   5045    mova                m20, [rsp+28*mmsize]
   5046    mova                m21, [rsp+29*mmsize]
   5047    mova                m22, [rsp+30*mmsize]
   5048    mova                m23, [rsp+31*mmsize]
   5049    lea                dstq, [r4+64] ; second half starts 64 bytes to the right of the first
   5050    call .pass2
   5051    RET
   5052 .pass2:
        ; Second pass for one 64-byte-wide half of the block: transpose, regroup the
        ; rows with qword unpacks, run the 8bpc 32x8 `.main` and 32x16 `.main_oddhalf`
        ; routines on the 16-bit data, then fall through to .write.
        ; In: m0-m7, m16-m23 = pass-1 output for this half; dstq, r3 = stride*3.
   5053    psrlq               m12, [permC], 24    ;  0  2  8 10  1  3  9 11
   5054    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
   5055    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
   5056
   5057    punpckhqdq          m19, m5, m16  ; 11
   5058    punpcklqdq           m5, m16      ; 10
   5059    punpckhqdq          m16, m2, m1   ;  5
   5060    punpcklqdq           m2, m1       ;  4
   5061    punpcklqdq           m1, m15, m4  ;  2
   5062    punpckhqdq          m15, m4       ;  3
   5063    punpcklqdq           m4, m14, m18 ;  8
   5064    punpckhqdq          m18, m14, m18 ;  9
   5065    punpckhqdq          m14, m0, m20  ;  1
   5066    punpcklqdq           m0, m20      ;  0
   5067    punpckhqdq          m20, m6, m17  ; 13
   5068    punpcklqdq           m6, m17      ; 12
   5069    punpckhqdq          m17, m3, m21  ;  7
   5070    punpcklqdq           m3, m21      ;  6
   5071    punpckhqdq          m21, m7, m8   ; 15
   5072    punpcklqdq           m7, m8       ; 14
   5073
   5074    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
   5075    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
   5076 .write:
        ; Output stage shared with .pass2_fast: one write_32x8 call, then two
        ; write_32x4 calls fed with m14-m17 and m18-m21 scaled through pmulhrsw by m11.
   5077    vpbroadcastd        m11, [pw_2048] ; pmulhrsw factor (presumably 2048 per word, i.e. a rounded >>4)
   5078    pxor                m12, m12
   5079    vpbroadcastd        m13, [pixel_10bpc_max]
   5080    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
   5081    pmulhrsw             m0, m11, m14
   5082    pmulhrsw             m1, m11, m15
   5083    pmulhrsw             m2, m11, m16
   5084    pmulhrsw             m3, m11, m17
   5085    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
   5086    pmulhrsw             m0, m11, m18
   5087    pmulhrsw             m1, m11, m19
   5088    pmulhrsw             m2, m11, m20
   5089    pmulhrsw             m3, m11, m21
   5090    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 ; tail call: its ret returns to our caller
   5091 .fast: ; 8x8 packed
        ; Reduced path that reads only the low 256 bits of coefficient rows 0-7.
        ; Odd rows 1/5/7/3 feed .main_oddhalf_packed, whose sixteen result vectors are
        ; spilled to rsp+0..15*mmsize. Even rows 0/4/2/6 feed the 32x8 `.main_fast3`.
        ; .main_end then combines both, and .pass2_fast is run once per 64-byte half.
   5092    movshdup             m7, [o(permB)]
   5093    mova                ym0, [cq+64*1]
   5094    mova                ym2, [cq+64*5]
   5095    mova                ym3, [cq+64*3]
   5096    mova                ym1, [cq+64*7]
   5097    vpermt2q             m0, m7, m2 ;  1  5
   5098    vpermt2q             m1, m7, m3 ;  7  3
   5099    call .main_oddhalf_packed
   5100    mova    [rsp+ 0*mmsize], m0 ; spill t32..t63; .main_end reloads these rows
   5101    mova    [rsp+ 1*mmsize], m1
   5102    mova    [rsp+ 2*mmsize], m2
   5103    mova    [rsp+ 3*mmsize], m3
   5104    mova    [rsp+ 4*mmsize], m4
   5105    mova    [rsp+ 5*mmsize], m5
   5106    mova    [rsp+ 6*mmsize], m6
   5107    mova    [rsp+ 7*mmsize], m7
   5108    mova    [rsp+ 8*mmsize], m16
   5109    mova    [rsp+ 9*mmsize], m17
   5110    mova    [rsp+10*mmsize], m18
   5111    mova    [rsp+11*mmsize], m19
   5112    mova    [rsp+12*mmsize], m20
   5113    mova    [rsp+13*mmsize], m21
   5114    mova    [rsp+14*mmsize], m22
   5115    mova    [rsp+15*mmsize], m23
   5116
   5117    movshdup             m7, [o(permB)] ; m7 was overwritten above, so the permutation is reloaded
   5118    mova                ym0, [cq+64*0]
   5119    mova                ym4, [cq+64*4]
   5120    mova               ym16, [cq+64*2]
   5121    mova                ym5, [cq+64*6]
   5122    vpermt2q            m16, m7, m5 ;  2  6
   5123    vpermq               m0, m7, m0 ;  0  0
   5124    vpermq               m4, m7, m4 ;  4  4
   5125    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
   5126    ; m0-7,9,16-22 contain un-sumsub'ed dct32 output data
   5127
   5128    ; zero input coefs
   5129    pxor                m12, m12
   5130    REPX {mova [cq+x*64], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
   5131
   5132    vpbroadcastd        m11, [o(pd_2)] ; rounding bias and shift count for .main_end
   5133    call .main_end
   5134    lea                  r3, [strideq*3]
   5135    mov                  r4, dstq ; keep the original dst for the second half
   5136    call .pass2_fast
   5137    mova                 m0, m24 ; second half: .main_end left its packed results in m24-m31
   5138    mova                 m1, m25
   5139    mova                 m2, m26
   5140    mova                 m3, m27
   5141    mova                 m4, m28
   5142    mova                 m5, m29
   5143    mova                 m6, m30
   5144    mova                 m7, m31
   5145    lea                dstq, [r4+64]
   5146    lea                  r5, [o_base] ; .pass2_fast switched r5 to o_base_8bpc; restore it before the next call
   5147    call .pass2_fast
   5148    RET
   5149 .pass2_fast:
        ; Second pass for the reduced (.fast) path: transpose via the 32x8 10bpc helper,
        ; regroup rows 0-7 with qword unpacks, run the 8bpc 32x16 `.main_oddhalf_fast`
        ; routine and finish through the shared .write stage.
        ; In: m0-m7 = packed pass-1 output for this half; r5 = o_base on entry.
   5150    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
   5151    lea                  r5, [o_base_8bpc] ; the 8bpc routine called below uses the 8bpc constant base
   5152    punpckhqdq          m14, m0, m2 ; 1
   5153    punpcklqdq           m0, m2     ; 0
   5154    punpcklqdq           m1, m3, m4 ; 2
   5155    punpckhqdq          m15, m3, m4 ; 3
   5156    punpcklqdq           m2, m5, m7 ; 4
   5157    punpckhqdq          m16, m5, m7 ; 5
   5158    punpcklqdq           m3, m6, m8 ; 6
   5159    punpckhqdq          m17, m6, m8 ; 7
   5160    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
   5161    jmp .write ; tail jump; .write's final jmp/ret returns to our caller
   5162 .main_end:
        ; Final pass-1 butterflies for the packed (.fast / .fast2) paths.
        ; In:  m0-m7, m9, m16-m22 = dct32 data, m11 = rounding bias and shift count,
        ;      m14/m15 = clamp bounds, and the sixteen vectors the caller spilled to
        ;      its rsp+0..15*mmsize. The loads below add gprsize because this routine
        ;      is entered with `call`, which pushes a return address.
        ; Out: packed 16-bit results in m0-m7 and m24-m31.
   5163
        ; IDCT64_PASS1_PACKED_END arguments:
        ;   %1    register number: t0+n in, out0+n out
        ;   %2    register number: t31-n in, out31-n out
        ;   %3/%4 register numbers receiving out32+n / out63-n
        ;   %5    scratch register number
        ;   %6/%7 spill-row indices of t32+n / t63-n
   5164 %macro IDCT64_PASS1_PACKED_END 7
   5165    psubd               m%5, m%1, m%2       ; out31-n [idct32] = t31-n [idct64]
   5166    paddd               m%1, m%2            ; out0+n [idct32] = t0+n [idct64]
   5167    REPX    {pmaxsd x, m14}, m%5, m%1
   5168    REPX    {pminsd x, m15}, m%5, m%1
   5169    REPX    {paddd  x, m11}, m%5, m%1 ; rounding bias, added before the last butterfly
   5170    mova                m%2, [rsp+%6*64+gprsize]    ; t32+n [idct64]
   5171    mova                m%3, [rsp+%7*64+gprsize]    ; t63-n [idct64]
   5172    psubd               m%4, m%1, m%3       ; out63-n
   5173    paddd               m%1, m%3            ; out0+n
   5174    psubd               m%3, m%5, m%2       ; out32+n
   5175    paddd               m%2, m%5            ; out31-n
   5176    REPX   {vpsravd x, m11}, m%4, m%1, m%3, m%2 ; arithmetic right shift by the count held in m11
   5177 %endmacro
   5178
   5179    IDCT64_PASS1_PACKED_END  0, 22, 24, 10, 12, 0, 15   ; out0/1,31/30,32/33,63/62
   5180    IDCT64_PASS1_PACKED_END  7,  9, 31, 13, 12, 7,  8   ; out15/14,16/17,47/46,48/49
   5181    packssdw             m0, m9 ; saturating 32->16-bit pack of two result vectors
   5182    packssdw             m7, m22
   5183    packssdw            m24, m13
   5184    packssdw            m31, m10
   5185    IDCT64_PASS1_PACKED_END  1, 21, 25, 10, 12, 1, 14   ; out3/2,28/29,35/34,60/61
   5186    IDCT64_PASS1_PACKED_END  6, 16, 30, 13, 12, 6,  9   ; out12/13,19/18,44/45,51/50
   5187    packssdw             m1, m16
   5188    packssdw             m6, m21
   5189    packssdw            m25, m13
   5190    packssdw            m30, m10
   5191    IDCT64_PASS1_PACKED_END  2, 20, 26, 10, 12, 2, 13   ; out4/5,27/26,36/37,59/58
   5192    IDCT64_PASS1_PACKED_END  5, 17, 29, 13, 12, 5, 10   ; out11/10,20/21,43/42,52/53
   5193    packssdw             m2, m17
   5194    packssdw             m5, m20
   5195    packssdw            m26, m13
   5196    packssdw            m29, m10
   5197    IDCT64_PASS1_PACKED_END  3, 19, 27, 10, 12, 3, 12   ; out7/6,24/25,39/38,56/57
   5198    IDCT64_PASS1_PACKED_END  4, 18, 28, 13, 12, 4, 11   ; out8/9,23/22,40/41,55/54
   5199    packssdw             m3, m18
   5200    packssdw             m4, m19
   5201    packssdw            m27, m13
   5202    packssdw            m28, m10
   5203    ret
   5204 .main_oddhalf_packed_rect2:
        ; Alternate entry: the caller has already multiplied m0/m1 by m12; this adds
        ; the m13 rounding term and shifts right by 12 before falling through.
   5205    REPX    {paddd  x, m13}, m0, m1
   5206    REPX    {psrad  x, 12 }, m0, m1
   5207 .main_oddhalf_packed:
        ; Odd half (t32..t63) of the idct64 for the packed 8x8 case.
        ; In:  m0/m1 as described below, m12 = step-9 multiplier, m13 = rounding term,
        ;      m14/m15 = clamp bounds.
        ; Out: t32..t63 in m0-m7 and m16-m23 (see the register maps after steps 8 and 9).
        ; Also overwrites m8-m11.
   5208    ; m0=in1 in5, m1=in7 in3
   5209    vbroadcasti32x4      m2, [o(pd_101_501)]
   5210    vbroadcasti32x4      m3, [o(pd_m700_m301)]
   5211    vbroadcasti32x4      m4, [o(pd_4095_4065)]
   5212    vbroadcasti32x4      m5, [o(pd_4036_4085)]
   5213    pmulld               m2, m0
   5214    pmulld               m3, m1
   5215    pmulld               m0, m4
   5216    pmulld               m1, m5
   5217    REPX    {paddd  x, m13}, m2, m3, m0, m1
   5218    REPX    {psrad  x, 12 }, m2, m3, m0, m1
   5219
   5220    ; m2=t32a t40a -> t32/33 t40/41, m3=t39a t47a -> t38/39 t46/47
   5221    ; m0=t63a t55a -> t62/63 t54/55, m1=t56a t48a -> t56/57 t48/49
   5222    ; end of step 1-2
   5223
   5224    vbroadcasti32x4     m10, [o(pd_401_1931)]
   5225    vbroadcasti32x4     m11, [o(pd_4076_3612)]
   5226    mova                 m4, m0 ; work on copies; the originals are still needed after step 3-4
   5227    mova                 m5, m2
   5228    ITX_MULSUB_2D         4, 5, 8, 9, _, 13, 10, 11
   5229    vbroadcasti32x4     m10, [o(pd_3166_3920)]
   5230    vbroadcasti32x4     m11, [o(pd_2598_1189)]
   5231    mova                 m6, m3
   5232    mova                 m7, m1
   5233    ITX_MULSUB_2D         7, 6, 8, 9, _, 13, 10, 11, 2
   5234
   5235    ; m4=t33a t41a -> t41/42  t33/34,  m5=t63a t54a -> t61/62  t53/54
   5236    ; m6=t38a t46a -> t37/38  t45/46,  m7=t57a t49a -> t57/58  t49/50
   5237    ; and from earlier:
   5238    ; m0=t63  t55  -> t60/63a t52/55a, m1=t56  t48  -> t56/59a t48/51a
   5239    ; m2=t32  t40  -> t32/35a t40/43a, m3=t39  t47  -> t36/39a t44/47a
   5240    ; end of step 3-4
   5241
   5242    punpcklqdq          m22, m2, m4     ; t32a/33 or t35a/34
   5243    punpcklqdq          m21, m3, m6     ; t36a/37 or t39a/38
   5244    punpckhqdq          m18, m2, m4     ; t40a/41 or t43a/42
   5245    punpckhqdq          m17, m3, m6     ; t44a/45 or t47a/46
   5246    punpckhqdq           m6, m1, m7     ; t48a/49 or t51a/50
   5247    punpckhqdq          m19, m0, m5     ; t52a/53 or t55a/54
   5248    punpcklqdq           m8, m1, m7     ; t56a/57 or t59a/58
   5249    punpcklqdq          m23, m0, m5     ; t60a/61 or t63a/62
   5250    mova                 m0, m22 ; duplicate each vector: one copy goes through the
   5251    mova                 m7, m21 ; step-5 rotations, the other is used unmodified
   5252    mova                 m3, m18
   5253    mova                m16, m17
   5254    mova                 m5, m6
   5255    mova                 m4, m19
   5256    mova                 m2, m8
   5257    mova                 m1, m23
   5258    ; m0/22/7/21,18/3/17/16,6/5/19/4,2/8/1/23: t32-63[a]
   5259
   5260    ; step5
   5261    vpbroadcastd        m10, [o(pd_799)]
   5262    vpbroadcastd        m11, [o(pd_4017)]
   5263    ITX_MULSUB_2D         1, 22, 20, 9, _, 13, 10, 11    ; t35/34a, t60/61a
   5264    ITX_MULSUB_2D         8,  7, 20, 9, _, 13, 10, 11, 2 ; t59/58a, t36/37a
   5265    vpbroadcastd        m10, [o(pd_3406)]
   5266    vpbroadcastd        m11, [o(pd_2276)]
   5267    ITX_MULSUB_2D        19,  3, 20, 9, _, 13, 10, 11    ; t43/42a, t52/53a
   5268    ITX_MULSUB_2D         5, 17, 20, 9, _, 13, 10, 11, 2 ; t51/50a, t44/45a
   5269    ; m0-1/7/21: t32-39[a], m18-19/17-16: t40-47[a]
   5270    ; m6-5/3-4: t48-55[a], m2/8/22-23: t56-63[a]
   5271
   5272    ; step6
        ; (the clamps of each group are interleaved with the next group's add/sub)
   5273    psubd               m20, m0, m21    ; t39/38a
   5274    paddd                m0, m21        ; t32/33a
   5275    psubd               m21, m1, m7     ; t36a/37
   5276    paddd                m1, m7         ; t35a/34
   5277    REPX    {pmaxsd x, m14}, m20, m0, m21, m1
   5278    psubd                m7, m16, m18   ; t40/41a
   5279    paddd               m16, m18        ; t47/46a
   5280    REPX    {pminsd x, m15}, m20, m0, m21, m1
   5281    psubd               m18, m17, m19   ; t43a/42
   5282    paddd               m17, m19        ; t44a/45
   5283    REPX    {pmaxsd x, m14}, m7, m16, m18, m17
   5284    psubd               m19, m6, m4     ; t55/54a
   5285    paddd                m6, m4         ; t48/49a
   5286    REPX    {pminsd x, m15}, m7, m16, m18, m17
   5287    psubd                m4, m5, m3     ; t52a/53
   5288    paddd                m5, m3         ; t51a/50
   5289    REPX    {pmaxsd x, m14}, m19, m6, m4, m5
   5290    psubd                m3, m23, m2    ; t56/57a
   5291    paddd               m23, m2         ; t63/62a
   5292    REPX    {pminsd x, m15}, m19, m6, m4, m5
   5293    psubd                m2, m22, m8    ; t59a/58
   5294    paddd               m22, m8         ; t60a/61
   5295    REPX    {pmaxsd x, m14}, m3, m23, m2, m22
   5296    REPX    {pminsd x, m15}, m3, m23, m2, m22
   5297    ; m0-1: t32-35[a], m17-16: t44-47[a], m6-5: t48-51[a], m22-23: t60-63[a]
   5298    ; m21-20: t36-39[a], m7/18: t40-43[a], m4/19: t52-55[a], m3-2: t56-59[a]
   5299
   5300    ; step7
   5301    vpbroadcastd        m10, [o(pd_1567)]
   5302    vpbroadcastd        m11, [o(pd_3784)]
   5303    ITX_MULSUB_2D         2, 21, 8, 9, _, 13, 10, 11    ; t36/37a, t59/58a
   5304    ITX_MULSUB_2D         3, 20, 8, 9, _, 13, 10, 11    ; t39a/38, t56a/57
   5305    ITX_MULSUB_2D        19,  7, 8, 9, _, 13, 10, 11, 2 ; t55a/54, t40a/41
   5306    ITX_MULSUB_2D         4, 18, 8, 9, _, 13, 10, 11, 2 ; t52/53a, t43/42a
   5307    ; m0-3: t32-39[a], m7,18-16: t40-47[a], m6-4,19: t48-55[a], m20-23: t56-63[a]
   5308
   5309    ; step8
   5310    psubd                m8, m0, m16    ; t47a/46
   5311    paddd                m0, m16        ; t32a/33
   5312    psubd               m16, m1, m17    ; t44/45a
   5313    paddd                m1, m17        ; t35/34a
   5314    REPX    {pmaxsd x, m14}, m8, m0, m16, m1
   5315    psubd               m17, m2, m18    ; t43a/42
   5316    paddd                m2, m18        ; t36a/37
   5317    REPX    {pminsd x, m15}, m8, m0, m16, m1
   5318    psubd               m18, m3, m7     ; t40/41a
   5319    paddd                m3, m7         ; t39/38a
   5320    REPX    {pmaxsd x, m14}, m17, m2, m18, m3
   5321    psubd                m7, m23, m6    ; t48a/49
   5322    paddd               m23, m6         ; t63a/62
   5323    REPX    {pminsd x, m15}, m17, m2, m18, m3
   5324    psubd                m6, m22, m5    ; t51/50a
   5325    paddd               m22, m5         ; t60/61a
   5326    REPX    {pmaxsd x, m14}, m7, m23, m6, m22
   5327    psubd                m5, m21, m4    ; t52a/53
   5328    paddd               m21, m4         ; t59a/58
   5329    REPX    {pminsd x, m15}, m7, m23, m6, m22
   5330    psubd                m4, m20, m19   ; t55/54a
   5331    paddd               m20, m19        ; t56/57a
   5332    REPX    {pmaxsd x, m14}, m5, m21, m4, m20
   5333    REPX    {pminsd x, m15}, m5, m21, m4, m20
   5334    ; m0-3=t32-39[a], m18-16,8: t40-47[a], m7-4=t48-55[a], m20-23=t56-63[a]
   5335
   5336    ; step9
   5337    REPX    {pmulld x, m12}, m4, m18, m5, m17, m6, m16, m7, m8
   5338    REPX    {paddd  x, m13}, m4, m5, m6, m7 ; rounding goes into one operand per pair, so each sum/difference gets it once
   5339    paddd               m19, m4, m18    ; t55a/54
   5340    psubd                m4, m18        ; t40a/41
   5341    paddd               m18, m5, m17    ; t52/53a
   5342    psubd                m5, m17        ; t43/42a
   5343    paddd               m17, m6, m16    ; t51a/50
   5344    psubd                m6, m16        ; t44a/45
   5345    paddd               m16, m7, m8     ; t48/49a
   5346    psubd                m7, m8         ; t47/46a
   5347    REPX    {psrad  x, 12 }, m19, m4, m18, m5, m17, m6, m16, m7
   5348    ; m4-7=t40-47[a], m16-19=t48-55[a]
   5349    ret
   5350 
   5351 cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
        ; Entry point and dispatcher for the 64x32 10bpc DCT_DCT inverse transform + add.
        ; eob == 0 branches to .dconly before the full PROLOGUE below declares
        ; 8 GPRs, 32 vector registers and a 64*32-byte stack area.
        ; Otherwise eob selects the amount of work:
        ;   eob <  136          -> .fast (which itself forwards eob < 36 to .fast2)
        ;   136 <= eob < 543    -> .pass1_fast for the first half, then .lefthalf
        ;   eob >= 543          -> .pass1 for the first half, then .lefthalf
        ; r7d receives the start index later consumed by the coefficient-zeroing loops.
   5352    lea                  r5, [o_base]
   5353    test               eobd, eobd
   5354    jz .dconly
   5355
   5356    PROLOGUE 4, 8, 32, -64*32, dst, stride, c, eob
   5357 %undef cmp
   5358    vpbroadcastd        m12, [o(pd_2896)]
   5359    vpbroadcastd        m13, [o(pd_2048)]
   5360    vpbroadcastd        m14, [o(clip_18b_min)]
   5361    vpbroadcastd        m15, [o(clip_18b_max)]
   5362    cmp                eobd, 136
   5363    jl .fast
   5364    add                  cq, 64 ; first pass-1 call works on the coefficients at byte offset +64 of each row
   5365    cmp                eobd, 543
   5366    jge .full
   5367    call .pass1_fast ; bottomright 16x16 zero
   5368    mov                 r7d, 16*12
   5369    jmp .lefthalf
   5370 .full:
   5371    call .pass1
   5372    mov                 r7d, 16*28
   5373 .lefthalf:
        ; Reached with cq advanced by 64 and the first pass-1 result in registers.
        ; 1. Park sixteen result vectors in the coefficient buffer (128-byte pitch).
        ; 2. Restore cq and the constants, then run the full .pass1 on the other half
        ;    with rsp lowered by 16*mmsize for the duration of the call.
        ; 3. Run pass 2 / output for the four column groups, using the 32x32
        ;    pass2_start / pass2_end helpers and the local .transpose (defined
        ;    outside this view).
        ; 4. Clear the +64 halves of the coefficient rows, then let .end clear the rest.
   5374    mova        [cq+128* 0], m0
   5375    mova        [cq+128* 1], m1
   5376    mova        [cq+128* 2], m2
   5377    mova        [cq+128* 3], m3
   5378    mova        [cq+128* 4], m14
   5379    mova        [cq+128* 5], m15
   5380    mova        [cq+128* 6], m16
   5381    mova        [cq+128* 7], m17
   5382    mova        [cq+128* 8], m22
   5383    mova        [cq+128* 9], m23
   5384    mova        [cq+128*10], m24
   5385    mova        [cq+128*11], m25
   5386    mova        [cq+128*12], m26
   5387    mova        [cq+128*13], m27
   5388    mova        [cq+128*14], m28
   5389    mova        [cq+128*15], m29
   5390    sub                  cq, 64 ; undo the +64 applied by the dispatcher
   5391    vpbroadcastd        m12, [o(pd_2896)] ; m12-m15 held data after the first pass, so reload the constants
   5392    vpbroadcastd        m13, [o(pd_2048)]
   5393    vpbroadcastd        m14, [o(clip_18b_min)]
   5394    vpbroadcastd        m15, [o(clip_18b_max)]
   5395    sub                 rsp, 16*64 ; shift .pass1's rsp-relative accesses down by 16 rows
   5396    call .pass1
   5397    add                 rsp, 16*64
   5398    lea                  r5, [o_base_8bpc]
   5399    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
   5400    mov                  r4, dstq ; remember the current dst; the next group is addressed as r4+64
   5401    pxor                m12, m12
   5402    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
   5403    lea                dstq, [r4+64]
   5404    mova                 m0, [rsp+16*mmsize]
   5405    mova                 m1, [rsp+17*mmsize]
   5406    mova                 m2, [rsp+18*mmsize]
   5407    mova                 m3, [rsp+19*mmsize]
   5408    mova                 m4, [rsp+20*mmsize]
   5409    mova                 m5, [rsp+21*mmsize]
   5410    mova                 m6, [rsp+22*mmsize]
   5411    mova                 m7, [rsp+23*mmsize]
   5412    mova                m16, [rsp+24*mmsize]
   5413    mova                m17, [rsp+25*mmsize]
   5414    mova                m18, [rsp+26*mmsize]
   5415    mova                m19, [rsp+27*mmsize]
   5416    mova                m20, [rsp+28*mmsize]
   5417    mova                m21, [rsp+29*mmsize]
   5418    mova                m22, [rsp+30*mmsize]
   5419    mova                m23, [rsp+31*mmsize]
   5420    call .transpose
   5421    mova     [cq+128* 0+64], m0 ; park this transposed group in the +64 half of each coefficient row
   5422    mova     [cq+128* 1+64], m1
   5423    mova     [cq+128* 2+64], m2
   5424    mova     [cq+128* 3+64], m3
   5425    mova     [cq+128* 4+64], m14
   5426    mova     [cq+128* 5+64], m15
   5427    mova     [cq+128* 6+64], m16
   5428    mova     [cq+128* 7+64], m17
   5429    mova     [cq+128* 8+64], m22
   5430    mova     [cq+128* 9+64], m23
   5431    mova     [cq+128*10+64], m24
   5432    mova     [cq+128*11+64], m25
   5433    mova     [cq+128*12+64], m26
   5434    mova     [cq+128*13+64], m27
   5435    mova     [cq+128*14+64], m28
   5436    mova     [cq+128*15+64], m29
   5437    mova                 m0, [rsp+ 0*mmsize]
   5438    mova                 m1, [rsp+ 1*mmsize]
   5439    mova                 m2, [rsp+ 2*mmsize]
   5440    mova                 m3, [rsp+ 3*mmsize]
   5441    mova                 m4, [rsp+ 4*mmsize]
   5442    mova                 m5, [rsp+ 5*mmsize]
   5443    mova                 m6, [rsp+ 6*mmsize]
   5444    mova                 m7, [rsp+ 7*mmsize]
   5445    mova                m16, [rsp+ 8*mmsize]
   5446    mova                m17, [rsp+ 9*mmsize]
   5447    mova                m18, [rsp+10*mmsize]
   5448    mova                m19, [rsp+11*mmsize]
   5449    mova                m20, [rsp+12*mmsize]
   5450    mova                m21, [rsp+13*mmsize]
   5451    mova                m22, [rsp+14*mmsize]
   5452    mova                m23, [rsp+15*mmsize]
   5453    call .transpose
   5454    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
   5455    pxor                m12, m12
   5456 .right_zero_loop:
        ; Clears the +64 half of four coefficient rows per iteration, walking r7d
        ; down from the value chosen by the dispatcher until it goes negative.
   5457    mova [cq+r7*8+64+128*3], m12
   5458    mova [cq+r7*8+64+128*2], m12
   5459    mova [cq+r7*8+64+128*1], m12
   5460    mova [cq+r7*8+64+128*0], m12
   5461    sub                 r7d, 16*4
   5462    jge .right_zero_loop
   5463    mov                 r7d, 16*28 ; start index for the zero loop in .end
   5464    jmp .end
   5465 .fast: ; topleft 16x16 nonzero
        ; Taken for eob < 136; eob < 36 is forwarded to .fast2.
        ; Runs a single .pass1_fast, then the 32x32 pass2_fast_start / pass2_end pair
        ; for the first column group and pass2_fast_start for the group reloaded from
        ; rsp+16..31*mmsize; .end performs the last pass2_end and clears the coefficients.
   5466    cmp                eobd, 36
   5467    jl .fast2
   5468    call .pass1_fast
   5469    lea                  r5, [o_base_8bpc]
   5470    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
   5471    mov                  r4, dstq ; remember the current dst; the next group is addressed as r4+64
   5472    pxor                m12, m12
   5473    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
   5474    lea                dstq, [r4+64]
   5475    mova                 m0, [rsp+16*mmsize]
   5476    mova                 m1, [rsp+17*mmsize]
   5477    mova                 m2, [rsp+18*mmsize]
   5478    mova                 m3, [rsp+19*mmsize]
   5479    mova                 m4, [rsp+20*mmsize]
   5480    mova                 m5, [rsp+21*mmsize]
   5481    mova                 m6, [rsp+22*mmsize]
   5482    mova                 m7, [rsp+23*mmsize]
   5483    mova                m16, [rsp+24*mmsize]
   5484    mova                m17, [rsp+25*mmsize]
   5485    mova                m18, [rsp+26*mmsize]
   5486    mova                m19, [rsp+27*mmsize]
   5487    mova                m20, [rsp+28*mmsize]
   5488    mova                m21, [rsp+29*mmsize]
   5489    mova                m22, [rsp+30*mmsize]
   5490    mova                m23, [rsp+31*mmsize]
   5491    call .transpose
   5492    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
   5493    mov                 r7d, 16*12 ; start index for the zero loop in .end
   5494    pxor                m12, m12
   5495    jmp .end
   5496 .fast2: ; topleft 8x8 nonzero
        ; Taken for eob < 36. Mirrors the 64x16 .fast path but reads rows at 128-byte
        ; pitch and applies the extra scaling (multiply by m12, add m13, >> 12) to
        ; every input before the transform. Pass 1 is finished by the 64x16 .main_end
        ; with m11 = pd_1. Falls through into .end.
   5497    movshdup             m7, [o(permB)]
   5498    mova                ym0, [cq+128*1]
   5499    mova                ym2, [cq+128*5]
   5500    mova                ym3, [cq+128*3]
   5501    mova                ym1, [cq+128*7]
   5502    vpermt2q             m0, m7, m2 ;  1  5
   5503    vpermt2q             m1, m7, m3 ;  7  3
   5504    REPX    {pmulld x, m12}, m0, m1 ; the _rect2 entry point below adds the rounding and shifts
   5505    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed_rect2
   5506    mova    [rsp+ 0*mmsize], m0 ; spill t32..t63; .main_end reloads these rows
   5507    mova    [rsp+ 1*mmsize], m1
   5508    mova    [rsp+ 2*mmsize], m2
   5509    mova    [rsp+ 3*mmsize], m3
   5510    mova    [rsp+ 4*mmsize], m4
   5511    mova    [rsp+ 5*mmsize], m5
   5512    mova    [rsp+ 6*mmsize], m6
   5513    mova    [rsp+ 7*mmsize], m7
   5514    mova    [rsp+ 8*mmsize], m16
   5515    mova    [rsp+ 9*mmsize], m17
   5516    mova    [rsp+10*mmsize], m18
   5517    mova    [rsp+11*mmsize], m19
   5518    mova    [rsp+12*mmsize], m20
   5519    mova    [rsp+13*mmsize], m21
   5520    mova    [rsp+14*mmsize], m22
   5521    mova    [rsp+15*mmsize], m23
   5522
   5523    movshdup             m7, [o(permB)] ; m7 was overwritten above, so the permutation is reloaded
   5524    pmulld              ym0, ym12, [cq+128*0]
   5525    pmulld              ym4, ym12, [cq+128*4]
   5526    mova               ym16, [cq+128*2]
   5527    mova                ym5, [cq+128*6]
   5528    REPX    {paddd x, ym13}, ym0, ym4
   5529    REPX    {psrad x, 12  }, ym0, ym4
   5530    vpermt2q            m16, m7, m5 ;  2  6
   5531    vpermq               m0, m7, m0 ;  0  0
   5532    vpermq               m4, m7, m4 ;  4  4
   5533    pmulld              m16, m12 ; rows 2/6 are scaled after being combined into one register
   5534    paddd               m16, m13
   5535    psrad               m16, 12
   5536    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
   5537
   5538    vpbroadcastd        m11, [o(pd_1)] ; rounding bias and shift count for .main_end
   5539    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
   5540    mova    [rsp+16*mmsize], m24 ; save the second group; it is reloaded after the first group is written
   5541    mova    [rsp+17*mmsize], m25
   5542    mova    [rsp+18*mmsize], m26
   5543    mova    [rsp+19*mmsize], m27
   5544    mova    [rsp+20*mmsize], m28
   5545    mova    [rsp+21*mmsize], m29
   5546    mova    [rsp+22*mmsize], m30
   5547    mova    [rsp+23*mmsize], m31
   5548    vpbroadcastd        m13, [o(pd_2048)] ; m13 was used as a temporary by .main_end
   5549    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
   5550    mov                 r7d, 16*4 ; start index for the zero loop in .end
   5551    mov                  r4, dstq
   5552    pxor                m12, m12
   5553    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
   5554    lea                dstq, [r4+64]
   5555    mova                 m0, [rsp+16*mmsize]
   5556    mova                 m1, [rsp+17*mmsize]
   5557    mova                 m2, [rsp+18*mmsize]
   5558    mova                 m3, [rsp+19*mmsize]
   5559    mova                 m4, [rsp+20*mmsize]
   5560    mova                 m5, [rsp+21*mmsize]
   5561    mova                 m6, [rsp+22*mmsize]
   5562    mova                 m7, [rsp+23*mmsize]
   5563    lea                  r5, [o_base] ; o() below is relative to o_base, so r5 is reset before using it
   5564    vpbroadcastd        m13, [o(pd_2048)]
   5565    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
   5566    pxor                m12, m12
   5567 .end:
        ; Common tail of every non-DC path: last pass2_end call, then zero the
        ; coefficient buffer.
        ; In: r7d = start index chosen by the path taken, m12 = 0.
   5568    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
   5569 .zero_loop:
   5570    mova    [cq+r7*8+128*3], m12 ; clear the first 64 bytes of four 128-byte rows per iteration
   5571    mova    [cq+r7*8+128*2], m12
   5572    mova    [cq+r7*8+128*1], m12
   5573    mova    [cq+r7*8+128*0], m12
   5574    sub                 r7d, 16*4
   5575    jge .zero_loop ; continue while the index is still non-negative
   5576    RET
   5577 .dconly:
        ; DC-only shortcut, reached when eob == 0. Scales the single coefficient twice
        ; by 181 with rounding and hands the result in r6d to the shared 64x16 .dconly2.
   5578    imul                r6d, [cq], 181
   5579    mov                [cq], eobd ; eobd is zero on this path, so this clears the DC coefficient
   5580    or                  r3d, 32 ; r3 is the eob argument (zero here), so r3d becomes 32 - presumably the row count for .dconly2
   5581    add                 r6d, 128
   5582    sar                 r6d, 8 ; r6d = (dc*181 + 128) >> 8
   5583    imul                r6d, 181
   5584    add                 r6d, 384
   5585    sar                 r6d, 9 ; r6d = (r6d*181 + 384) >> 9
   5586    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
   5587 .pass1_fast:
        ; Pass 1 reading only coefficient rows 0-15 (128-byte pitch). Every input is
        ; pre-multiplied by m12; the *_rect2 helpers called below are expected to
        ; complete that scaling (as .main_oddhalf_packed_rect2 does in this file).
        ; In:  cq, m12-m15 as set up by the entry code.
        ; Ends by jumping to .pass1_end (defined outside this view).
   5588    lea                  r4, [idct64_mul_16bpc]
   5589    lea                  r6, [rsp+4*64+gprsize] ; scratch row pointer; +gprsize covers the return address pushed by call
   5590    pmulld               m0, m12, [cq+128* 1]
   5591    pmulld               m3, m12, [cq+128*15]
   5592    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
   5593    pmulld               m0, m12, [cq+128* 7]
   5594    pmulld               m3, m12, [cq+128* 9]
   5595    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
   5596    pmulld               m0, m12, [cq+128* 5]
   5597    pmulld               m3, m12, [cq+128*11]
   5598    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
   5599    pmulld               m0, m12, [cq+128* 3]
   5600    pmulld               m3, m12, [cq+128*13]
   5601    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
   5602    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 ; idct64 steps 6-9 over the rows produced above
   5603    pmulld               m0, m12, [cq+128* 0]
   5604    pmulld               m1, m12, [cq+128* 8]
   5605    pmulld              m16, m12, [cq+128* 4]
   5606    pmulld              m17, m12, [cq+128*12]
   5607    call m(idct_8x16_internal_10bpc).main_fast2_rect2
   5608    call m(idct_16x16_internal_10bpc).main_fast2_rect2
   5609    call .pass1_load_spill
   5610    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2_rect2
   5611    jmp .pass1_end
   5612 .pass1: ; first pass reading all 32 coefficient rows; each row is multiplied by m12 (set outside this view) as it is loaded
   5613    lea                  r4, [idct64_mul_16bpc] ; r4 = address of the idct64_mul_16bpc table
   5614    lea                  r6, [rsp+4*64+gprsize] ; r6 = rsp + 4*64 + gprsize; NOTE(review): presumably the scratch pointer used by the main_part1*/main_part2 helpers - confirm
   5615    pmulld               m0, m12, [cq+128* 1]
   5616    pmulld               m1, m12, [cq+128*31]
   5617    pmulld               m2, m12, [cq+128*17]
   5618    pmulld               m3, m12, [cq+128*15]
   5619    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 ; odd rows 1, 31, 17, 15
   5620    pmulld               m0, m12, [cq+128* 7]
   5621    pmulld               m1, m12, [cq+128*25]
   5622    pmulld               m2, m12, [cq+128*23]
   5623    pmulld               m3, m12, [cq+128* 9]
   5624    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 ; odd rows 7, 25, 23, 9
   5625    pmulld               m0, m12, [cq+128* 5]
   5626    pmulld               m1, m12, [cq+128*27]
   5627    pmulld               m2, m12, [cq+128*21]
   5628    pmulld               m3, m12, [cq+128*11]
   5629    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 ; odd rows 5, 27, 21, 11
   5630    pmulld               m0, m12, [cq+128* 3]
   5631    pmulld               m1, m12, [cq+128*29]
   5632    pmulld               m2, m12, [cq+128*19]
   5633    pmulld               m3, m12, [cq+128*13]
   5634    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 ; odd rows 3, 29, 19, 13
   5635    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
   5636    pmulld               m0, m12, [cq+128* 0] ; rows 0, 8, 16, 24 -> m0-m3
   5637    pmulld               m1, m12, [cq+128* 8]
   5638    pmulld               m2, m12, [cq+128*16]
   5639    pmulld               m3, m12, [cq+128*24]
   5640    pmulld              m16, m12, [cq+128* 4] ; rows 4, 12, 20, 28 -> m16-m19
   5641    pmulld              m17, m12, [cq+128*12]
   5642    pmulld              m18, m12, [cq+128*20]
   5643    pmulld              m19, m12, [cq+128*28]
   5644    call m(idct_8x16_internal_10bpc).main_fast_rect2
   5645    call m(idct_16x16_internal_10bpc).main_fast_rect2
   5646    call .pass1_load_spill ; spills 16 registers to cq and loads rows 2, 6, 10, 14 into m0-m3
   5647    pmulld               m4, m12, [cq+128*18] ; rows 18, 22, 26, 30 -> m4-m7
   5648    pmulld               m5, m12, [cq+128*22]
   5649    pmulld               m6, m12, [cq+128*26]
   5650    pmulld               m7, m12, [cq+128*30]
   5651    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2
   5652 .pass1_end: ; shared tail of .pass1 and .pass1_fast
   5653    vpbroadcastd        m11, [o(pd_1)] ; m11 = pd_1 in every lane; NOTE(review): presumably the rounding term for idct64_main_end - confirm
   5654    lea                  r3, [rsp+gprsize] ; r3 = rsp + gprsize (r3 is overwritten here)
   5655    lea                  r4, [cq+8*128] ; r4 = address of coefficient row 8
   5656    call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end
   5657    ; transpose one half immediately, we can transpose lower half later
   5658 .transpose: ; results end up in m0-m3, m14-m17 and m22-m29; m12 and m13 are overwritten; ends with ret
   5659    ; transpose m0-7,16-23
   5660    psrlq               m12, [permC], 24    ;  0  2  8 10  1  3  9 11
   5661    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
   5662    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
   5663    punpckhqdq          m22, m0, m20  ;  1
   5664    punpcklqdq           m0, m20      ;  0
   5665    punpckhqdq          m24, m2, m1   ;  5
   5666    punpcklqdq           m1, m2, m1   ;  4
   5667    punpcklqdq           m2, m14, m18 ;  8
   5668    punpckhqdq          m26, m14, m18 ;  9
   5669    punpcklqdq          m14, m15, m4  ;  2
   5670    punpckhqdq          m23, m15, m4  ;  3
   5671    punpckhqdq          m25, m3, m21  ;  7
   5672    punpcklqdq          m15, m3, m21  ;  6
   5673    punpckhqdq          m28, m6, m17  ; 13
   5674    punpcklqdq           m3, m6, m17  ; 12
   5675    punpckhqdq          m27, m5, m16  ; 11
   5676    punpcklqdq          m16, m5, m16  ; 10
   5677    punpckhqdq          m29, m7, m8   ; 15
   5678    punpcklqdq          m17, m7, m8   ; 14
   5679    ret
   5680 .pass1_load_spill: ; after idct16_sumsub: store 16 registers into cq rows 0-15 and fetch rows 2, 6, 10, 14 (times m12) into m0-m3
   5681    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
   5682    mova        [cq+128* 0], m0
   5683    mova        [cq+128* 1], m1
   5684    pmulld               m0, m12, [cq+128* 2] ; rows 2 and 6 are read here, before the stores below overwrite them
   5685    pmulld               m1, m12, [cq+128* 6]
   5686    mova        [cq+128* 2], m2
   5687    mova        [cq+128* 3], m3
   5688    pmulld               m2, m12, [cq+128*10] ; rows 10 and 14 are read here, before the stores below overwrite them
   5689    pmulld               m3, m12, [cq+128*14]
   5690    mova        [cq+128* 4], m4
   5691    mova        [cq+128* 5], m5
   5692    mova        [cq+128* 6], m6
   5693    mova        [cq+128* 7], m7
   5694    mova        [cq+128* 8], m23 ; rows 8-15 receive m23 down to m16, i.e. in descending register order
   5695    mova        [cq+128* 9], m22
   5696    mova        [cq+128*10], m21
   5697    mova        [cq+128*11], m20
   5698    mova        [cq+128*12], m19
   5699    mova        [cq+128*13], m18
   5700    mova        [cq+128*14], m17
   5701    mova        [cq+128*15], m16
   5702    ret
   5703 
   5704 cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob ; declared with 0 vector regs and no stack, so the eob == 0 case leaves before the full PROLOGUE below
   5705    lea                  r5, [o_base] ; r5 = address of o_base; NOTE(review): presumably the base register behind the o() macro - confirm at its definition
   5706    test               eobd, eobd
   5707    jz .dconly ; eob == 0: take the DC-only path
   5708 
   5709    PROLOGUE 4, 9, 32, -64*32, dst, stride, c, eob ; re-declare for the main paths: 9 GPRs, 32 vector regs, 64*32 bytes of stack (see x86inc for the meaning of the negative size)
   5710 %undef cmp
   5711    vpbroadcastd        m12, [o(pd_2896)] ; m12-m15 = broadcast constants pd_2896, pd_2048, clip_18b_min, clip_18b_max
   5712    vpbroadcastd        m13, [o(pd_2048)]
   5713    vpbroadcastd        m14, [o(clip_18b_min)]
   5714    vpbroadcastd        m15, [o(clip_18b_max)]
   5715    cmp                eobd, 136
   5716    jl .fast ; eob < 136: handled by .fast / .fast2
   5717    add                  cq, 64 ; address the second 64-byte vector of each 128-byte coefficient row
   5718    cmp                eobd, 543
   5719    jge .full ; eob >= 543: this half needs the full 32-row first pass
   5720    call .pass1_fast ; bottomright 16x16 zero
   5721    mov                 r7d, 16*12 ; start index for .right_zero_loop: it will clear rows 0-15
   5722    jmp .lefthalf
   5723 .full:
   5724    call .pass1
   5725    mov                 r7d, 16*28 ; start index for .right_zero_loop: it will clear rows 0-31
   5726 .lefthalf: ; store the 16 transposed registers over cq rows 0-15 (cq is still offset by 64), then run the first pass on the other half
   5727    mova        [cq+128* 0], m27
   5728    mova        [cq+128* 1], m14
   5729    mova        [cq+128* 2], m28
   5730    mova        [cq+128* 3], m15
   5731    mova        [cq+128* 4], m22
   5732    mova        [cq+128* 5], m23
   5733    mova        [cq+128* 6], m24
   5734    mova        [cq+128* 7], m25
   5735    mova        [cq+128* 8], m0
   5736    mova        [cq+128* 9], m26
   5737    mova        [cq+128*10], m20
   5738    mova        [cq+128*11], m21
   5739    mova        [cq+128*12], m18
   5740    mova        [cq+128*13], m16
   5741    mova        [cq+128*14], m17
   5742    mova        [cq+128*15], m3
   5743    sub                  cq, 64 ; back to the first 64-byte vector of each row
   5744    vpbroadcastd        m12, [o(pd_2896)] ; reload m12-m15: .transpose overwrote all four
   5745    vpbroadcastd        m13, [o(pd_2048)]
   5746    vpbroadcastd        m14, [o(clip_18b_min)]
   5747    vpbroadcastd        m15, [o(clip_18b_max)]
   5748    sub                 rsp, 16*64 ; lower rsp by 16 vectors before the second first-pass run
   5749    call .pass1
   5750    sub                 rsp, 24*64 ; rsp is now 40 vectors below its post-PROLOGUE value
   5751    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start
   5752    mov                  r8, dstq ; remember the original dst
   5753    pxor                m31, m31 ; m31 = 0
   5754    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
   5755    lea                dstq, [r8+64] ; dst = original dst + 64 bytes for the remaining pass-2 work
   5756    mova                 m0, [rsp+56*mmsize] ; reload 16 vectors from stack slots 56-71 (written outside this view) for .transpose
   5757    mova                 m1, [rsp+57*mmsize]
   5758    mova                 m2, [rsp+58*mmsize]
   5759    mova                 m3, [rsp+59*mmsize]
   5760    mova                 m4, [rsp+60*mmsize]
   5761    mova                 m5, [rsp+61*mmsize]
   5762    mova                 m6, [rsp+62*mmsize]
   5763    mova                 m7, [rsp+63*mmsize]
   5764    mova                m16, [rsp+64*mmsize]
   5765    mova                m17, [rsp+65*mmsize]
   5766    mova                m18, [rsp+66*mmsize]
   5767    mova                m19, [rsp+67*mmsize]
   5768    mova                m20, [rsp+68*mmsize]
   5769    mova                m21, [rsp+69*mmsize]
   5770    mova                m22, [rsp+70*mmsize]
   5771    mova                m23, [rsp+71*mmsize]
   5772    call .transpose
   5773    mova     [cq+128* 0+64], m27 ; same register order as .lefthalf, written to the second vector of rows 0-15
   5774    mova     [cq+128* 1+64], m14
   5775    mova     [cq+128* 2+64], m28
   5776    mova     [cq+128* 3+64], m15
   5777    mova     [cq+128* 4+64], m22
   5778    mova     [cq+128* 5+64], m23
   5779    mova     [cq+128* 6+64], m24
   5780    mova     [cq+128* 7+64], m25
   5781    mova     [cq+128* 8+64], m0
   5782    mova     [cq+128* 9+64], m26
   5783    mova     [cq+128*10+64], m20
   5784    mova     [cq+128*11+64], m21
   5785    mova     [cq+128*12+64], m18
   5786    mova     [cq+128*13+64], m16
   5787    mova     [cq+128*14+64], m17
   5788    mova     [cq+128*15+64], m3
   5789    mova                 m0, [rsp+40*mmsize] ; next 16 vectors: stack slots 40-55
   5790    mova                 m1, [rsp+41*mmsize]
   5791    mova                 m2, [rsp+42*mmsize]
   5792    mova                 m3, [rsp+43*mmsize]
   5793    mova                 m4, [rsp+44*mmsize]
   5794    mova                 m5, [rsp+45*mmsize]
   5795    mova                 m6, [rsp+46*mmsize]
   5796    mova                 m7, [rsp+47*mmsize]
   5797    mova                m16, [rsp+48*mmsize]
   5798    mova                m17, [rsp+49*mmsize]
   5799    mova                m18, [rsp+50*mmsize]
   5800    mova                m19, [rsp+51*mmsize]
   5801    mova                m20, [rsp+52*mmsize]
   5802    mova                m21, [rsp+53*mmsize]
   5803    mova                m22, [rsp+54*mmsize]
   5804    mova                m23, [rsp+55*mmsize]
   5805    add                 rsp, 32*64 ; rsp is now 8 vectors below its post-PROLOGUE value; .end adds those back
   5806    call .transpose
   5807    lea                  r5, [o_base] ; reload r5; NOTE(review): presumably clobbered by the pass-2 helpers - confirm
   5808    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start
   5809 .right_zero_loop: ; store m31 to the second vector of four rows per iteration, counting r7d down by 4 rows; relies on m31 still being zero
   5810    REPX {mova [cq+r7*8+64+128*x], m31}, 0, 1, 2, 3
   5811    sub                 r7d, 16*4
   5812    jge .right_zero_loop
   5813    mov                 r7d, 16*28 ; .zero_loop will then clear the first vector of rows 0-31
   5814    jmp .end
   5815 .fast: ; topleft 16x16 nonzero
   5816    cmp                eobd, 36
   5817    jl .fast2 ; eob < 36: use the 8x8 path
   5818    call .pass1_fast ; cq is not offset here, so only the first vector of rows 0-15 is read
   5819    sub                 rsp, 24*64 ; lower rsp by 24 vectors for the pass-2 helpers
   5820    vpbroadcastd        m10, [o(pd_2048)]
   5821    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start
   5822    mov                  r8, dstq ; remember the original dst
   5823    pxor                m31, m31 ; m31 = 0
   5824    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
   5825    lea                dstq, [r8+64] ; dst = original dst + 64 bytes for the second pass-2 run
   5826    mova                 m0, [rsp+40*mmsize] ; reload 16 vectors from stack slots 40-55 (written outside this view) for .transpose
   5827    mova                 m1, [rsp+41*mmsize]
   5828    mova                 m2, [rsp+42*mmsize]
   5829    mova                 m3, [rsp+43*mmsize]
   5830    mova                 m4, [rsp+44*mmsize]
   5831    mova                 m5, [rsp+45*mmsize]
   5832    mova                 m6, [rsp+46*mmsize]
   5833    mova                 m7, [rsp+47*mmsize]
   5834    mova                m16, [rsp+48*mmsize]
   5835    mova                m17, [rsp+49*mmsize]
   5836    mova                m18, [rsp+50*mmsize]
   5837    mova                m19, [rsp+51*mmsize]
   5838    mova                m20, [rsp+52*mmsize]
   5839    mova                m21, [rsp+53*mmsize]
   5840    mova                m22, [rsp+54*mmsize]
   5841    mova                m23, [rsp+55*mmsize]
   5842    add                 rsp, 16*64 ; rsp is now 8 vectors below its post-PROLOGUE value; .end adds those back
   5843    call .transpose
   5844    lea                  r5, [o_base] ; reload r5; NOTE(review): presumably clobbered by the pass-2 helpers - confirm
   5845    vpbroadcastd        m10, [o(pd_2048)]
   5846    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start
   5847    mov                 r7d, 16*12 ; .zero_loop will clear rows 0-15
   5848    jmp .end
   5849 .fast2: ; topleft 8x8 nonzero
   5850    movshdup             m7, [o(permB)] ; m7 = permutation indices for the vpermt2q pairs below
   5851    mova                ym0, [cq+128*1] ; ymm loads: only the low 32 bytes of rows 1, 5, 3, 7 are read
   5852    mova                ym2, [cq+128*5]
   5853    mova                ym3, [cq+128*3]
   5854    mova                ym1, [cq+128*7]
   5855    vpermt2q             m0, m7, m2 ;  1  5
   5856    vpermt2q             m1, m7, m3 ;  7  3
   5857    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed
   5858    mova    [rsp+ 0*mmsize], m0 ; save m0-m7 and m16-m23 to stack slots 0-15
   5859    mova    [rsp+ 1*mmsize], m1
   5860    mova    [rsp+ 2*mmsize], m2
   5861    mova    [rsp+ 3*mmsize], m3
   5862    mova    [rsp+ 4*mmsize], m4
   5863    mova    [rsp+ 5*mmsize], m5
   5864    mova    [rsp+ 6*mmsize], m6
   5865    mova    [rsp+ 7*mmsize], m7
   5866    mova    [rsp+ 8*mmsize], m16
   5867    mova    [rsp+ 9*mmsize], m17
   5868    mova    [rsp+10*mmsize], m18
   5869    mova    [rsp+11*mmsize], m19
   5870    mova    [rsp+12*mmsize], m20
   5871    mova    [rsp+13*mmsize], m21
   5872    mova    [rsp+14*mmsize], m22
   5873    mova    [rsp+15*mmsize], m23
   5874 
   5875    movshdup             m7, [o(permB)] ; reload the permutation: m7 was just saved as a data register above
   5876    mova                ym0, [cq+128*0] ; low 32 bytes of the even rows 0, 4, 2, 6
   5877    mova                ym4, [cq+128*4]
   5878    mova               ym16, [cq+128*2]
   5879    mova                ym5, [cq+128*6]
   5880    vpermt2q            m16, m7, m5 ;  2  6
   5881    vpermq               m0, m7, m0 ;  0  0
   5882    vpermq               m4, m7, m4 ;  4  4
   5883    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
   5884 
   5885    vpbroadcastd        m11, [o(pd_2)]
   5886    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
   5887    sub                 rsp, 16*64 ; lower rsp by 16 vectors
   5888    mova    [rsp+40*mmsize], m24 ; park m24-m31 in slots 40-47; they are reloaded into m0-m7 after the first pass-2 run
   5889    mova    [rsp+41*mmsize], m25
   5890    mova    [rsp+42*mmsize], m26
   5891    mova    [rsp+43*mmsize], m27
   5892    mova    [rsp+44*mmsize], m28
   5893    mova    [rsp+45*mmsize], m29
   5894    mova    [rsp+46*mmsize], m30
   5895    mova    [rsp+47*mmsize], m31
   5896    call .pass2_fast2_start
   5897    mov                 r7d, 16*4 ; .zero_loop will clear rows 0-7
   5898    mov                  r8, dstq ; remember the original dst
   5899    pxor                m31, m31 ; m31 = 0
   5900    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
   5901    lea                dstq, [r8+64] ; dst = original dst + 64 bytes for the second pass-2 run
   5902    mova                 m0, [rsp+40*mmsize] ; bring back the eight vectors parked above
   5903    mova                 m1, [rsp+41*mmsize]
   5904    mova                 m2, [rsp+42*mmsize]
   5905    mova                 m3, [rsp+43*mmsize]
   5906    mova                 m4, [rsp+44*mmsize]
   5907    mova                 m5, [rsp+45*mmsize]
   5908    mova                 m6, [rsp+46*mmsize]
   5909    mova                 m7, [rsp+47*mmsize]
   5910    add                 rsp, 8*64 ; rsp is now 8 vectors below its post-PROLOGUE value, matching the other paths into .end
   5911    lea                  r5, [o_base] ; reload r5; NOTE(review): presumably clobbered by the pass-2 helpers - confirm
   5912    call .pass2_fast2_start
   5913 .end: ; common exit: expects r7d = 16 * (first row of the last 4-row group to clear) and rsp 8 vectors below its post-PROLOGUE value
   5914    pxor                m31, m31 ; m31 = 0, the value stored by .zero_loop
   5915 .zero_loop: ; clear the first 64-byte vector of four rows per iteration, counting r7d down by 4 rows
   5916    REPX {mova [cq+r7*8+128*x], m31}, 0, 1, 2, 3
   5917    sub                 r7d, 16*4
   5918    jge .zero_loop
   5919    call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end
   5920    add                 rsp, 8*64  ; FIXME adjust stack_size_padded instead?
   5921    RET
   5922 .pass2_fast2_start: ; transpose the 8-row data, pair the results into m27, m0, m22, m26, m14, m20, m23, m21, then continue in the 32x64 function
   5923    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
   5924    punpcklqdq          m27, m0, m2 ; 0
   5925    punpckhqdq           m0, m2     ; 1
   5926    punpcklqdq          m22, m3, m4 ; 2
   5927    punpckhqdq          m26, m3, m4 ; 3
   5928    punpcklqdq          m14, m5, m7 ; 4
   5929    punpckhqdq          m20, m5, m7 ; 5
   5930    punpcklqdq          m23, m6, m8 ; 6
   5931    punpckhqdq          m21, m6, m8 ; 7
   5932    vpbroadcastd        m10, [o(pd_2048)]
   5933    jmp m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast2_start ; tail-jump: that routine's return goes to our caller
   5934 .dconly: ; eob == 0: pre-scale the DC coefficient and continue in the 64x16 function's shared DC tail
   5935    imul                r6d, [cq], 181 ; r6d = coefficient at [cq] * 181
   5936    mov                [cq], eobd ; eobd is 0 on this path, so this clears the coefficient
   5937    or                  r3d, 64 ; set the value-64 bit in r3d; NOTE(review): presumably the row count expected by .dconly1 - confirm there
   5938    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly1 ; tail-jump, no return to this function
   5939 .pass1_fast: ; first pass reading only coefficient rows 0-15 (rows are loaded unscaled)
   5940    lea                  r4, [idct64_mul_16bpc] ; r4 = address of the idct64_mul_16bpc table
   5941    lea                  r6, [rsp+4*64+gprsize] ; r6 = caller's rsp + 4*64; gprsize skips the return address pushed by the call into this routine
   5942    mova                 m0, [cq+128* 1]
   5943    mova                 m3, [cq+128*15]
   5944    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast ; odd rows 1, 15
   5945    mova                 m0, [cq+128* 7]
   5946    mova                 m3, [cq+128* 9]
   5947    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast ; odd rows 7, 9
   5948    mova                 m0, [cq+128* 5]
   5949    mova                 m3, [cq+128*11]
   5950    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast ; odd rows 5, 11
   5951    mova                 m0, [cq+128* 3]
   5952    mova                 m3, [cq+128*13]
   5953    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast ; odd rows 3, 13
   5954    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
   5955    mova                 m0, [cq+128* 0] ; rows 0, 8, 4, 12 feed the 8x16/16x16 helpers below
   5956    mova                 m1, [cq+128* 8]
   5957    mova                m16, [cq+128* 4]
   5958    mova                m17, [cq+128*12]
   5959    call m(idct_8x16_internal_10bpc).main_fast2
   5960    call m(idct_16x16_internal_10bpc).main_fast2
   5961    call .pass1_load_spill ; spills 16 registers to cq and loads rows 2, 6, 10, 14 into m0-m3
   5962    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
   5963    jmp .pass1_end ; shared tail; its final ret returns to the caller of .pass1_fast
   5964 .pass1: ; first pass reading all 32 coefficient rows (rows are loaded unscaled)
   5965    lea                  r4, [idct64_mul_16bpc] ; r4 = address of the idct64_mul_16bpc table
   5966    lea                  r6, [rsp+4*64+gprsize] ; r6 = caller's rsp + 4*64; gprsize skips the return address pushed by the call into this routine
   5967    mova                 m0, [cq+128* 1]
   5968    mova                 m1, [cq+128*31]
   5969    mova                 m2, [cq+128*17]
   5970    mova                 m3, [cq+128*15]
   5971    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 ; odd rows 1, 31, 17, 15
   5972    mova                 m0, [cq+128* 7]
   5973    mova                 m1, [cq+128*25]
   5974    mova                 m2, [cq+128*23]
   5975    mova                 m3, [cq+128* 9]
   5976    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 ; odd rows 7, 25, 23, 9
   5977    mova                 m0, [cq+128* 5]
   5978    mova                 m1, [cq+128*27]
   5979    mova                 m2, [cq+128*21]
   5980    mova                 m3, [cq+128*11]
   5981    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 ; odd rows 5, 27, 21, 11
   5982    mova                 m0, [cq+128* 3]
   5983    mova                 m1, [cq+128*29]
   5984    mova                 m2, [cq+128*19]
   5985    mova                 m3, [cq+128*13]
   5986    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 ; odd rows 3, 29, 19, 13
   5987    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
   5988    mova                 m0, [cq+128* 0] ; rows 0, 8, 16, 24 -> m0-m3
   5989    mova                 m1, [cq+128* 8]
   5990    mova                 m2, [cq+128*16]
   5991    mova                 m3, [cq+128*24]
   5992    mova                m16, [cq+128* 4] ; rows 4, 12, 20, 28 -> m16-m19
   5993    mova                m17, [cq+128*12]
   5994    mova                m18, [cq+128*20]
   5995    mova                m19, [cq+128*28]
   5996    call m(idct_8x16_internal_10bpc).main_fast
   5997    call m(idct_16x16_internal_10bpc).main_fast
   5998    call .pass1_load_spill ; spills 16 registers to cq and loads rows 2, 6, 10, 14 into m0-m3
   5999    mova                 m4, [cq+128*18] ; rows 18, 22, 26, 30 -> m4-m7
   6000    mova                 m5, [cq+128*22]
   6001    mova                 m6, [cq+128*26]
   6002    mova                 m7, [cq+128*30]
   6003    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
   6004 .pass1_end: ; shared tail of .pass1 and .pass1_fast
   6005    vpbroadcastd        m11, [o(pd_2)] ; m11 = pd_2 in every lane; NOTE(review): presumably the rounding term for idct64_main_end - confirm
   6006    lea                  r3, [rsp+gprsize] ; r3 = caller's rsp; this overwrites eob, which is not read again after pass 1
   6007    lea                  r4, [cq+8*128] ; r4 = address of coefficient row 8
   6008    call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end
   6009    ; transpose one half immediately, we can transpose lower half later
   6010 .transpose: ; also called directly from the main body; results end up in m0, m3, m14-m18 and m20-m28; m12 and m13 are overwritten
   6011    ; transpose m0-7,16-23
   6012    psrlq               m12, [permC], 24 ;  0  2  8 10  1  3  9 11
   6013    psrlq               m13, m12, 32        ;  4  6 12 14  5  7 13 15
   6014    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
   6015    punpcklqdq          m27, m0, m20  ;  0
   6016    punpckhqdq           m0, m20      ;  1
   6017    punpcklqdq          m24, m5, m16  ; 10
   6018    punpckhqdq          m16, m5, m16  ; 11
   6019    punpcklqdq          m23, m3, m21  ;  6
   6020    punpckhqdq          m21, m3, m21  ;  7
   6021    punpcklqdq          m25, m7, m8   ; 14
   6022    punpckhqdq           m3, m7, m8   ; 15
   6023    punpcklqdq          m22, m15, m4  ;  2
   6024    punpckhqdq          m26, m15, m4  ;  3
   6025    punpcklqdq          m15, m6, m17  ; 12
   6026    punpckhqdq          m17, m6, m17  ; 13
   6027    punpcklqdq          m28, m14, m18 ;  8
   6028    punpckhqdq          m18, m14, m18 ;  9
   6029    punpcklqdq          m14, m2, m1   ;  4
   6030    punpckhqdq          m20, m2, m1   ;  5
   6031    ret
   6032 .pass1_load_spill: ; after idct16_sumsub: store 16 registers into cq rows 0-15 and fetch rows 2, 6, 10, 14 into m0-m3
   6033    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
   6034    mova        [cq+128* 0], m0
   6035    mova        [cq+128* 1], m1
   6036    mova                 m0, [cq+128* 2] ; rows 2 and 6 are read here, before the stores below overwrite them
   6037    mova                 m1, [cq+128* 6]
   6038    mova        [cq+128* 2], m2
   6039    mova        [cq+128* 3], m3
   6040    mova                 m2, [cq+128*10] ; rows 10 and 14 are read here, before the stores below overwrite them
   6041    mova                 m3, [cq+128*14]
   6042    mova        [cq+128* 4], m4
   6043    mova        [cq+128* 5], m5
   6044    mova        [cq+128* 6], m6
   6045    mova        [cq+128* 7], m7
   6046    mova        [cq+128* 8], m23 ; rows 8-15 receive m23 down to m16, i.e. in descending register order
   6047    mova        [cq+128* 9], m22
   6048    mova        [cq+128*10], m21
   6049    mova        [cq+128*11], m20
   6050    mova        [cq+128*12], m19
   6051    mova        [cq+128*13], m18
   6052    mova        [cq+128*14], m17
   6053    mova        [cq+128*15], m16
   6054    ret
   6055 
   6056 %endif ; ARCH_X86_64