tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

itx16_avx2.asm (318764B)


      1 ; Copyright © 2021, VideoLAN and dav1d authors
      2 ; Copyright © 2021, Two Orioles, LLC
      3 ; Copyright © 2021, Matthias Dressel
      4 ; All rights reserved.
      5 ;
      6 ; Redistribution and use in source and binary forms, with or without
      7 ; modification, are permitted provided that the following conditions are met:
      8 ;
      9 ; 1. Redistributions of source code must retain the above copyright notice, this
     10 ;    list of conditions and the following disclaimer.
     11 ;
     12 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     13 ;    this list of conditions and the following disclaimer in the documentation
     14 ;    and/or other materials provided with the distribution.
     15 ;
     16 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 
     27 %include "config.asm"
     28 %include "ext/x86/x86inc.asm"
     29 
     30 %if ARCH_X86_64
     31 
     32 SECTION_RODATA 32
        ; Shuffle / permute tables for the transforms in this file.
        ;
        ; itx4_shuf is dual-purpose: the 10 bpc 4x4 adst/identity first passes
        ; load it once, use it as the vpermd index vector and then, after a
        ; `psrld 4`, as the pshufb byte mask.
     33 itx4_shuf:       dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6
     34                 dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7
        ; Dword index vectors. idct4_12_shuf / idct4_12_shuf2 are the vpermd
        ; indices read by idct_4x4_internal_12bpc; the readers of the other
        ; three tables are not in this part of the file.
     35 idct4_12_shuf:   dd 0, 2, 4, 6, 1, 3, 5, 7
     36 idct4_12_shuf2:  dd 2, 0, 6, 4, 3, 1, 7, 5
     37 iadst8_12_shuf:  dd 0, 4, 1, 5, 2, 6, 3, 7
     38 idct16_12_shuf:  dd 0, 4, 1, 5, 3, 7, 2, 6
     39 iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5
        ; Four words of +2048 followed by four words of -2048 (pmulhrsw operand
        ; in pass 2 of the 4x8 adst).
     40 pw_2048_m2048:   dw  2048,  2048,  2048,  2048, -2048, -2048, -2048, -2048
        ; 16-byte shuffle masks; idct4_shuf is broadcast to both 128-bit lanes
        ; and applied with pshufb by idct_4x4_internal_10bpc.
     41 idct4_shuf:   db  0,  1,  4,  5, 12, 13,  8,  9,  2,  3,  6,  7, 14, 15, 10, 11
     42 idct32_shuf:  db  0,  1,  8,  9,  4,  5, 12, 13,  2,  3, 10, 11,  6,  7, 14, 15
     43 
     44 %macro COEF_PAIR 2-3 0
        ; COEF_PAIR a, b [, neg]
        ; Emits the dwords a, a, b, b at label pd_a_b and defines pd_a / pd_b
        ; as the addresses of the first / third dword, so each name can be
        ; broadcast as a single dword while pd_a_b can be loaded as a 16-byte
        ; "a a b b" vector.
        ; When neg is nonzero, the dwords -b, -b are appended and pd_b_mb is
        ; defined as an alias of pd_b, i.e. 16 bytes read from it are
        ; b, b, -b, -b.
     45 pd_%1_%2: dd %1, %1, %2, %2
     46 %define pd_%1 (pd_%1_%2 + 4*0)
     47 %define pd_%2 (pd_%1_%2 + 4*2)
     48 %if %3
     49 dd -%2, -%2
     50 %define pd_%2_m%2 pd_%2
     51 %endif
     52 %endmacro
     53 
        ; Coefficient pairs used by the butterflies. Both rows whose first
        ; argument is 2896 define pd_2896; either location holds the value 2896.
     54 COEF_PAIR  201,  995
     55 COEF_PAIR  401, 1931
     56 COEF_PAIR  799, 3406
     57 COEF_PAIR 1380,  601
     58 COEF_PAIR 1751, 2440
     59 COEF_PAIR 2598, 1189
     60 COEF_PAIR 2751, 2106
     61 COEF_PAIR 2896, 1567, 1
     62 COEF_PAIR 2896, 3784, 1
     63 COEF_PAIR 3035, 3513
     64 COEF_PAIR 3166, 3920
     65 COEF_PAIR 3703, 3290
     66 COEF_PAIR 3857, 4052
     67 COEF_PAIR 4017, 2276
     68 COEF_PAIR 4076, 3612
     69 COEF_PAIR 4091, 3973
     70 
     71 pd_8:      dd     8 ; standalone dwords: pd_N holds N, pd_mN holds -N
     72 pd_m601:   dd  -601
     73 pd_m1189:  dd -1189
     74 pd_m1380:  dd -1380
     75 pd_m2106:  dd -2106
     76 pd_m2598:  dd -2598
     77 pd_m2751:  dd -2751
     78 pd_m3344:  dd -3344
     79 pd_1024:   dd  1024
     80 pd_1321:   dd  1321
     81 pd_1448:   dd  1448
     82 pd_1697:   dd  1697
     83 pd_2482:   dd  2482
     84 pd_3072:   dd  3072 ; 1024 + 2048
     85 pd_3803:   dd  3803
     86 pd_5119:   dd  5119 ; 1024 + 4096 - 1
     87 pd_5120:   dd  5120 ; 1024 + 4096
     88 pd_5793:   dd  5793
     89 pd_6144:   dd  6144 ; 2048 + 4096
     90 pd_17408:  dd 17408 ; 1024 + 16384
     91 
        ; Word constants stored twice so that a dword broadcast (vpbroadcastd)
        ; fills every word lane.
     92 pixel_10bpc_max: times 2 dw 0x03ff ; largest 10-bit pixel value
     93 pixel_12bpc_max: times 2 dw 0x0fff ; largest 12-bit pixel value
     94 dconly_10bpc:    times 2 dw 0x7c00 ; 0x7fff - 0x03ff, bias of the .dconly paths
     95 dconly_12bpc:    times 2 dw 0x7000 ; 0x7fff - 0x0fff, bias of the .dconly paths
        ; Bounds of the signed 18-bit and 20-bit ranges, used with pmaxsd/pminsd
        ; to clip intermediates.
     96 clip_18b_min:  dd -0x20000
     97 clip_18b_max:  dd  0x1ffff
     98 clip_20b_min:  dd -0x80000
     99 clip_20b_max:  dd  0x7ffff
    100 
    101 const idct64_mul_16bpc
        ; 4 rows of 12 dwords. Presumably the multipliers of the 64-point idct
        ; (per the name); the code that reads the table is not in this part of
        ; the file. Declared with `const` rather than a plain label -- see
        ; x86inc for what that implies for symbol visibility.
    102 dd 4095,  101, 2967, -2824,  3745, 1660, 3822, -1474,   401,  4076,   799,  4017
    103 dd -700, 4036, 2359,  3349, -2191, 3461,  897,  3996, -2598, -3166, -4017,  -799
    104 dd 4065,  501, 3229, -2520,  3564, 2019, 3948, -1092,  1931,  3612,  3406,  2276
    105 dd -301, 4085, 2675,  3102, -1842, 3659, 1285,  3889, -1189, -3920, -2276, -3406
    106 
    107 cextern deint_shuf
    108 cextern idct64_mul
    109 cextern pw_1697x8
    110 cextern pw_1697x16
    111 cextern pw_1567_3784
    112 cextern pw_m1567_m3784
    113 cextern pw_m3784_1567
    114 cextern pw_2896_2896
    115 cextern pw_m2896_2896
    116 cextern pw_5
    117 cextern pw_2048
    118 cextern pw_4096
    119 cextern pw_8192
    120 cextern pw_16384
    121 cextern pw_2896x8
    122 cextern pd_2048
    123 
    124 cextern idct_4x8_internal_8bpc_avx2.main
    125 cextern idct_4x16_internal_8bpc_avx2.main
    126 cextern idct_8x8_internal_8bpc_avx2.main
    127 cextern idct_8x16_internal_8bpc_avx2.main
    128 cextern idct_16x4_internal_8bpc_avx2.main
    129 cextern idct_16x8_internal_8bpc_avx2.main
    130 cextern idct_16x16_internal_8bpc_avx2.main
    131 cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main
    132 cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast
    133 cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf
    134 cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast
    135 cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1
    136 cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal
    137 
    138 cextern iadst_4x4_internal_8bpc_avx2.main
    139 cextern iadst_4x8_internal_8bpc_avx2.main_pass2
    140 cextern iadst_4x16_internal_8bpc_avx2.main2
    141 cextern iadst_8x4_internal_8bpc_avx2.main
    142 cextern iadst_8x8_internal_8bpc_avx2.main_pass2
    143 cextern iadst_8x16_internal_8bpc_avx2.main
    144 cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end
    145 cextern iadst_16x4_internal_8bpc_avx2.main
    146 cextern iadst_16x8_internal_8bpc_avx2.main
    147 cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end
    148 cextern iadst_16x16_internal_8bpc_avx2.main
    149 cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end
    150 
    151 SECTION .text
    152 
    153 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
        ; m(name): builds the full symbol of another function by joining
        ; private_prefix, an underscore, the given name and SUFFIX, then passing
        ; the result through mangle. Used for cross-function jumps and calls
        ; such as m(iadst_4x4_internal_10bpc).main.
    154 
        ; WRAP_XMM stmt: runs INIT_XMM, expands the given statement (everything
        ; after the macro name is taken as one argument), then runs INIT_YMM
        ; again. Used below to expand macros and RET with the xmm register
        ; mapping while the rest of the file stays in ymm mode.
    155 %macro WRAP_XMM 1+
    156    INIT_XMM cpuname
    157    %1
    158    INIT_YMM cpuname
    159 %endmacro
    160 
    161 %macro IWHT4_1D_PACKED 0
           ; One 1-D pass of the 4-point inverse WHT on dword data, two inputs
           ; per ymm register (one per 128-bit lane).
           ; In:  m0, m1 as described on the next line.
           ; Out: out0 in the high lane of m0, out3 in the low lane of m2 and
           ;      t1 t3 in m3, following the annotations below (lanes marked
           ;      ____ hold no result). m1 is overwritten.
    162    ; m0 = in0 in2, m1 = in1 in3
    163    psubd                m2, m0, m1 ; t2
    164    paddd               xm0, xm1    ; t0
    165    vpermq               m2, m2, q3322
    166    vpermq               m0, m0, q1100
    167    vpermq               m1, m1, q3120
    168    psubd                m3, m0, m2
    169    psrad                m3, 1
    170    psubd                m3, m1     ; t1 t3
    171    psubd                m0, m3     ; ____ out0
    172    paddd                m2, m3     ; out3 ____
    173 %endmacro
    174 
    175 INIT_YMM avx2
        ; inv_txfm_add_wht_wht_4x4_16bpc(dst, stride, c, eob, bdmax)
        ; 4x4 inverse WHT added to the 16-bit pixels at dst.
        ; Reads the 4x4 dword coefficients at c and zeroes that buffer, shifts
        ; the coefficients right by 2, runs IWHT4_1D_PACKED twice with a
        ; re-interleave in between, packs the result to words, adds it to four
        ; rows of dst and clamps every pixel to [0, bdmax] before storing.
    176 cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
    177    mova                xm0, [cq+16*0]
    178    vinserti128          m0, [cq+16*2], 1
    179    mova                xm1, [cq+16*1]
    180    vinserti128          m1, [cq+16*3], 1
    181    pxor                 m4, m4 ; zero: clears the coefficients, later the lower clamp bound
    182    mova          [cq+32*0], m4
    183    mova          [cq+32*1], m4
    184    lea                  r6, [dstq+strideq*2] ; r6 = address of dst row 2
    185    psrad                m0, 2
    186    psrad                m1, 2
    187    IWHT4_1D_PACKED
           ; regroup the first-pass results into the m0/m1 input layout
    188    punpckhdq            m0, m3
    189    punpckldq            m3, m2
    190    punpckhqdq           m1, m0, m3
    191    punpcklqdq           m0, m3
    192    IWHT4_1D_PACKED
    193    vpblendd             m0, m2, 0x33 ; dwords 0,1,4,5 from m2, the rest from m0
    194    packssdw             m0, m3
    195    vextracti128        xm2, m0, 1
    196    punpckhdq           xm1, xm0, xm2 ; out2 out1
    197    punpckldq           xm0, xm2      ; out3 out0
           ; dst rows are loaded in the same order as the packed outputs:
           ; xm2 = row3 row0, xm3 = row2 row1
    198    movq                xm2, [r6  +strideq*1]
    199    movhps              xm2, [dstq+strideq*0]
    200    movq                xm3, [r6  +strideq*0]
    201    movhps              xm3, [dstq+strideq*1]
    202 %ifidn bdmaxd, bdmaxm
    203    movd                xm5, bdmaxd
    204    vpbroadcastw        xm5, xm5
    205 %else   ; win64: load from stack
    206    vpbroadcastw        xm5, bdmaxm
    207 %endif
    208    paddsw              xm0, xm2
    209    paddsw              xm1, xm3
    210    pmaxsw              xm0, xm4 ; clamp below at 0
    211    pmaxsw              xm1, xm4
    212    pminsw              xm0, xm5 ; clamp above at bdmax
    213    pminsw              xm1, xm5
    214    movhps [dstq+strideq*0], xm0
    215    movhps [dstq+strideq*1], xm1
    216    movq   [r6  +strideq*0], xm1
    217    movq   [r6  +strideq*1], xm0
    218    RET
    219 
    220 ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
    221 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
    222 ; flags: 1 = packed, 2 = inv_dst2
    223 ; skip round/shift if rnd is not a number
        ;
        ; Operands are register numbers (m<N>) of dword vectors; the results
        ; replace src1 / src2 in place.
        ; - A coefficient below 32 is taken as the number of a register that
        ;   already holds the multiplier; anything else is loaded from the
        ;   pd_<coef> constant.
        ; - Flag 1 loads the constant with vbroadcasti128 (16 bytes, e.g. an
        ;   "a a b b" COEF_PAIR row) instead of broadcasting a single dword.
        ; - Flag 2 produces dst2 = rnd - src1 * coef2 - src2 * coef1 before the
        ;   shift; this path uses rnd as a register unconditionally.
        ; - tmp1 and tmp2 are always overwritten; tmp3 only when coef1 is
        ;   loaded from memory.
    224 %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
    225 %if %8 < 32
    226    pmulld              m%4, m%1, m%8
    227    pmulld              m%3, m%2, m%8
    228 %else
    229 %if %9 & 1
    230    vbroadcasti128      m%3, [pd_%8]
    231 %else
    232    vpbroadcastd        m%3, [pd_%8]
    233 %endif
    234    pmulld              m%4, m%1, m%3 ; tmp2 = src1 * coef2
    235    pmulld              m%3, m%2      ; tmp1 = src2 * coef2
    236 %endif
    237 %if %7 < 32
    238    pmulld              m%1, m%7
    239    pmulld              m%2, m%7
    240 %else
    241 %if %9 & 1
    242    vbroadcasti128      m%5, [pd_%7]
    243 %else
    244    vpbroadcastd        m%5, [pd_%7]
    245 %endif
    246    pmulld              m%1, m%5 ; src1 * coef1
    247    pmulld              m%2, m%5 ; src2 * coef1
    248 %endif
    249 %if %9 & 2
    250    psubd               m%4, m%6, m%4 ; rnd - src1 * coef2
    251    psubd               m%2, m%4, m%2 ; ... - src2 * coef1
    252 %else
    253 %ifnum %6
    254    paddd               m%4, m%6
    255 %endif
    256    paddd               m%2, m%4
    257 %endif
    258 %ifnum %6
    259    paddd               m%1, m%6
    260 %endif
    261    psubd               m%1, m%3
    262 %ifnum %6
    263    psrad               m%2, 12
    264    psrad               m%1, 12
    265 %endif
    266 %endmacro
    267 
    268 %macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth
           ; Emits the public entry point for one (type1, type2) transform pair
           ; of the given size and bit depth. The entry point stores the address
           ; of the second transform's .pass2 label in tx2q and then transfers
           ; control to the first transform's internal function, which ends its
           ; first pass with `jmp tx2q`.
           ; For dct_dct the jump is taken only when eob is nonzero; otherwise
           ; execution falls through to whatever the invoking macro emits next
           ; (the dc-only code in the per-size wrappers).
           ; For every other pair eob_offset, when nonzero, is added to eobd.
    269 cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2
    270    %define %%p1 m(i%1_%4_internal_%5bpc)
    271    ; Jump to the 1st txfm function if we're not taking the fast path, which
    272    ; in turn performs an indirect jump to the 2nd txfm function.
    273    lea tx2q, [m(i%2_%4_internal_%5bpc).pass2]
    274 %ifidn %1_%2, dct_dct
    275    test               eobd, eobd
    276    jnz %%p1
    277 %else
    278 %if %3
    279    add                eobd, %3
    280 %endif
    281    ; jump to the 1st txfm function unless it's located directly after this
           ; (the repeat count is bit 31 of the end-minus-target difference, so
           ; the jmp is emitted only when the target lies at a higher address
           ; than the end label; this relies on the internal function being
           ; placed after all of its wrappers)
    282    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
    283 ALIGN function_align
    284 %%end:
    285 %endif
    286 %endmacro
    287 
    288 %macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth
           ; Entry point for one 4x4 transform pair. For dct_dct it also emits
           ; the dc-only path that INV_TXFM_FN falls into when eob is zero:
           ; the single dc coefficient is scaled and the same value is added to
           ; every pixel of the block.
           ; The 10 bpc instance holds the shared code; the 12 bpc instance only
           ; loads its own bias into xm2 and jumps there. The labels .dconly2,
           ; .dconly3 and .dconly_loop are entry points for other block sizes
           ; (the 4x8 wrapper enters at .dconly2 with its row count in r3d).
    289    INV_TXFM_FN          %1, %2, 0, 4x4, %3
    290 %ifidn %1_%2, dct_dct
    291    vpbroadcastd        xm2, [dconly_%3bpc]
    292 %if %3 = 10
    293 .dconly:
    294    imul                r6d, [cq], 181
    295    mov                [cq], eobd ; 0
           ; r3d becomes the row count (presumably r3d is the eob register,
           ; which is zero on this path -- confirm against x86inc arg mapping)
    296    or                  r3d, 4
    297 .dconly2:
    298    add                 r6d, 128
    299    sar                 r6d, 8
    300 .dconly3:
    301    imul                r6d, 181
    302    add                 r6d, 2176 ; 2176 = 2048 + 128
    303    sar                 r6d, 12
    304    movd                xm0, r6d
    305    paddsw              xm0, xm2 ; dc + bias (0x7fff - pixel max)
    306    vpbroadcastw        xm0, xm0
    307 .dconly_loop:
           ; Two rows of four pixels per iteration. paddsw saturates at 0x7fff
           ; and psubusw at 0; with this bias the intent appears to be a clamp
           ; of pixel + dc to [0, pixel max] without separate min/max steps.
    308    movq                xm1, [dstq+strideq*0]
    309    movhps              xm1, [dstq+strideq*1]
    310    paddsw              xm1, xm0
    311    psubusw             xm1, xm2
    312    movq   [dstq+strideq*0], xm1
    313    movhps [dstq+strideq*1], xm1
    314    lea                dstq, [dstq+strideq*2]
    315    sub                 r3d, 2
    316    jg .dconly_loop
    317    WRAP_XMM RET
    318 %else
    319    jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly
    320 %endif
    321 %endif
    322 %endmacro
    323 
    324 %macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd
           ; 4-point inverse DCT on dword data with two values packed per
           ; register. The multiply stage uses the packed (flag 1) form of
           ; ITX_MULSUB_2D with the 2896/1567 and 2896/3784 constant rows and
           ; the given rounding register, followed by one add/sub butterfly.
           ; Results are left in the first two operands in the order given by
           ; the annotations; the three tmp registers are overwritten.
    325    ITX_MULSUB_2D        %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1
    326    punpckhqdq          m%3, m%2, m%1 ; t3 t2
    327    punpcklqdq          m%2, m%1      ; t0 t1
    328    paddd               m%1, m%2, m%3 ; out0 out1
    329    psubd               m%2, m%3      ; out3 out2
    330 %endmacro
    331 
    332 %macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
           ; Word-input variant of the packed 4-point inverse DCT: the inputs are
           ; interleaved and multiplied with pmaddwd against the external pw_
           ; constant pairs, the dword products are rounded with the rnd
           ; register and shifted right by 12, packed back to words with signed
           ; saturation and combined with a saturating add/sub butterfly.
           ; The constant loads are interleaved with the unpacks.
    333    vpbroadcastd        m%5, [pw_m3784_1567]
    334    punpckhwd           m%3, m%2, m%1
    335    vpbroadcastd        m%4, [pw_1567_3784]
    336    punpcklwd           m%2, m%1
    337    vpbroadcastd        m%1, [pw_m2896_2896]
    338    pmaddwd             m%5, m%3
    339    pmaddwd             m%3, m%4
    340    vpbroadcastd        m%4, [pw_2896_2896]
    341    pmaddwd             m%1, m%2
    342    pmaddwd             m%2, m%4
    343    REPX     {paddd x, m%6}, m%5, m%3, m%1, m%2
    344    REPX     {psrad x, 12 }, m%5, m%3, m%1, m%2
    345    packssdw            m%3, m%5      ; t3 t2
    346    packssdw            m%2, m%1      ; t0 t1
    347    paddsw              m%1, m%2, m%3 ; out0 out1
    348    psubsw              m%2, m%3      ; out3 out2
    349 %endmacro
    350 
    351 INV_TXFM_4X4_FN dct, dct
    352 INV_TXFM_4X4_FN dct, identity
    353 INV_TXFM_4X4_FN dct, adst
    354 INV_TXFM_4X4_FN dct, flipadst
    355 
        ; idct_4x4_internal_10bpc
        ; Entry (pass 1): dword inverse DCT of the 4x4 coefficients at c, packed
        ;   to words and shuffled with idct4_shuf into m0, then an indirect jump
        ;   to the second transform through tx2q (set up by INV_TXFM_FN).
        ; .pass2: word-domain inverse DCT of m0, scaled with pmulhrsw by 2048,
        ;   added to four rows of dst, clamped to [0, pixel_10bpc_max] and
        ;   stored; the 64-byte coefficient buffer is zeroed on the way.
        ;   Expects m5 = pd_2048 as left behind by the first pass (every 10 bpc
        ;   4x4 first pass in view loads it into m5).
        ; .main / .main2: the dword transform itself, also called from the
        ;   12 bpc dct. .main2 skips the loads and expects m0, m1 and m5 to be
        ;   set up by the caller.
    356 cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
    357    call .main
    358    vbroadcasti128       m2, [idct4_shuf]
    359    packssdw             m0, m1
    360    pshufb               m0, m2
    361    jmp                tx2q
    362 .pass2:
    363    vextracti128        xm1, m0, 1
    364    WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5
    365    packssdw            xm5, xm5 ; pw_2048
    366    pmulhrsw            xm0, xm5
    367    pmulhrsw            xm1, xm5
    368    movq                xm2, [dstq+strideq*0]
    369    movhps              xm2, [dstq+strideq*1]
    370    lea                  r6, [dstq+strideq*2]
           ; rows 3 and 2 are loaded swapped to match xm1 = out3 out2
    371    movq                xm3, [r6  +strideq*1]
    372    movhps              xm3, [r6  +strideq*0]
    373    vpbroadcastd        xm5, [pixel_10bpc_max]
    374    pxor                 m4, m4
    375    mova          [cq+32*0], m4
    376    mova          [cq+32*1], m4
    377    paddw               xm0, xm2
    378    paddw               xm1, xm3
    379    pmaxsw              xm0, xm4
    380    pmaxsw              xm1, xm4
    381    pminsw              xm0, xm5
    382    pminsw              xm1, xm5
    383    movq   [dstq+strideq*0], xm0
    384    movhps [dstq+strideq*1], xm0
    385    movhps [r6  +strideq*0], xm1
    386    movq   [r6  +strideq*1], xm1
    387    RET
    388 ALIGN function_align
    389 .main:
    390    vpermq               m0, [cq+32*0], q3120
    391    vpermq               m1, [cq+32*1], q3120
    392    vpbroadcastd         m5, [pd_2048] ; rounding term for the >> 12
    393 .main2:
    394    IDCT4_1D_PACKED       0, 1, 2, 3, 4, 5
    395    ret
    396 
    397 INV_TXFM_4X4_FN adst, dct
    398 INV_TXFM_4X4_FN adst, adst
    399 INV_TXFM_4X4_FN adst, flipadst
    400 INV_TXFM_4X4_FN adst, identity
    401 
        ; IADST4_1D: 4-point inverse ADST on dword data.
        ; In:  m0..m3 = in0..in3.
        ; Out: m4 = out0, m6 = out1, m2 = out2, m3 = out3, all unrounded (the
        ;      caller adds the rounding term and shifts).
        ; Overwrites m0, m1, m5 and m7 as well. The 3803 multiplier is formed
        ; in a register as 2482 + 1321 instead of being loaded.
    402 %macro IADST4_1D 0
    403    vpbroadcastd         m5, [pd_1321]
    404    vpbroadcastd         m7, [pd_2482]
    405    pmulld               m4, m0, m5    ; 1321*in0
    406    pmulld               m6, m3, m7    ; 2482*in3
    407    paddd                m4, m6        ; 1321*in0 + 2482*in3
    408    pmulld               m6, m0, m7    ; 2482*in0
    409    paddd                m0, m3        ; in0 + in3
    410    paddd                m7, m5        ; pd_3803
    411    pmulld               m5, m2        ; 1321*in2
    412    pmulld               m3, m7        ; 3803*in3
    413    pmulld               m7, m2        ; 3803*in2
    414    psubd                m2, m0        ; in2 - in0 - in3
    415    vpbroadcastd         m0, [pd_m3344]
    416    pmulld               m1, m0        ; -t3
    417    pmulld               m2, m0        ; out2 (unrounded)
    418    psubd                m6, m5        ; 2482*in0 - 1321*in2
    419    paddd                m4, m7        ;  t0
    420    psubd                m6, m3        ;  t1
    421    paddd                m3, m4, m6
    422    psubd                m4, m1        ; out0 (unrounded)
    423    psubd                m6, m1        ; out1 (unrounded)
    424    paddd                m3, m1        ; out3 (unrounded)
    425 %endmacro
    426 
    427 cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
           ; Entry (pass 1): dword inverse ADST of the 4x4 coefficients at c.
           ; .pass1_end: shared first-pass tail (also the jump target of the
           ;   flipadst function): round with pd_2048, shift by 12, pack to
           ;   words, reorder with itx4_shuf and jump to the second transform
           ;   through tx2q.
           ; .pass2: runs the 8 bpc word-domain adst on the packed words.
           ; .end: scale with pmulhrsw by pw_2048, add to four rows of dst,
           ;   clamp to [0, pixel_10bpc_max], zero the coefficients, store.
           ; .main / .main2: load the inputs and run IADST4_1D; .main2 is the
           ;   entry for callers that already hold in0..in3 in xm0..xm3.
    428    call .main
    429    vinserti128          m0, m4, xm6, 1 ; out0 | out1
    430    vinserti128          m1, m2, xm3, 1 ; out2 | out3
    431 .pass1_end:
    432    vpbroadcastd         m5, [pd_2048]
    433    mova                 m2, [itx4_shuf]
    434    paddd                m0, m5
    435    paddd                m1, m5
    436    psrad                m0, 12
    437    psrad                m1, 12
    438    packssdw             m0, m1
    439    vpermd               m0, m2, m0
    440    psrld                m2, 4 ; same table, shifted, is the pshufb mask
    441    pshufb               m0, m2
    442 %if WIN64
           ; restore the xmm6/xmm7 values saved in .main; the offsets are 8
           ; lower than at the save because the return address pushed by
           ; `call .main` has been popped again
    443    movaps             xmm6, [rsp+ 8]
    444    movaps             xmm7, [rsp+24]
    445 %endif
    446    jmp                tx2q
    447 .pass2:
           ; r6 is set up for the 8 bpc routine (presumably its constant base
           ; pointer -- confirm in the 8 bpc file)
    448    lea                  r6, [deint_shuf+128]
    449    vextracti128        xm1, m0, 1
    450    call m(iadst_4x4_internal_8bpc).main
    451 .end:
    452    vpbroadcastd        xm4, [pw_2048]
    453    movq                xm2, [dstq+strideq*0]
    454    movhps              xm2, [dstq+strideq*1]
    455    lea                  r6, [dstq+strideq*2]
    456    movq                xm3, [r6  +strideq*0]
    457    movhps              xm3, [r6  +strideq*1]
    458    vpbroadcastd        xm5, [pixel_10bpc_max]
    459    pmulhrsw            xm0, xm4
    460    pmulhrsw            xm1, xm4
    461    pxor                 m4, m4
    462    mova          [cq+32*0], m4
    463    mova          [cq+32*1], m4
    464    paddw               xm0, xm2
    465    paddw               xm1, xm3
    466    pmaxsw              xm0, xm4
    467    pmaxsw              xm1, xm4
    468    pminsw              xm0, xm5
    469    pminsw              xm1, xm5
    470    movq   [dstq+strideq*0], xm0
    471    movhps [dstq+strideq*1], xm0
    472    movq   [r6  +strideq*0], xm1
    473    movhps [r6  +strideq*1], xm1
    474    RET
    475 ALIGN function_align
    476 .main:
    477    mova                xm0, [cq+16*0]
    478    mova                xm1, [cq+16*1]
    479    mova                xm2, [cq+16*2]
    480    mova                xm3, [cq+16*3]
    481 %if WIN64
           ; IADST4_1D uses m6/m7; on WIN64 they are saved by hand here and
           ; restored in .pass1_end (NOTE(review): the slots are presumably the
           ; caller-provided shadow space -- confirm)
    482    movaps         [rsp+16], xmm6
    483    movaps         [rsp+32], xmm7
    484 %endif
    485 .main2:
    486    WRAP_XMM IADST4_1D
    487    ret
    488 
    489 INV_TXFM_4X4_FN flipadst, dct
    490 INV_TXFM_4X4_FN flipadst, adst
    491 INV_TXFM_4X4_FN flipadst, flipadst
    492 INV_TXFM_4X4_FN flipadst, identity
    493 
        ; iflipadst_4x4_internal_10bpc
        ; Same arithmetic as the adst function with the output order reversed.
        ; Pass 1 calls the adst .main, assembles the four outputs in the order
        ; out3, out2, out1, out0 and continues in the adst .pass1_end.
        ; Pass 2 calls the 8 bpc adst and performs the flip while adding to dst:
        ; the dst rows are loaded, paired with the opposite result register and
        ; stored back in reversed order.
    494 cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
    495    call m(iadst_4x4_internal_10bpc).main
    496    vinserti128          m0, m3, xm2, 1 ; out3 | out2
    497    vinserti128          m1, m6, xm4, 1 ; out1 | out0
    498    jmp m(iadst_4x4_internal_10bpc).pass1_end
    499 .pass2:
    500    lea                  r6, [deint_shuf+128]
    501    vextracti128        xm1, m0, 1
    502    call m(iadst_4x4_internal_8bpc).main
    503    vpbroadcastd        xm4, [pw_2048]
    504    movq                xm3, [dstq+strideq*1] ; xm3 = row1 row0
    505    movhps              xm3, [dstq+strideq*0]
    506    lea                  r6, [dstq+strideq*2]
    507    movq                xm2, [r6  +strideq*1] ; xm2 = row3 row2
    508    movhps              xm2, [r6  +strideq*0]
    509    vpbroadcastd        xm5, [pixel_10bpc_max]
    510    pmulhrsw            xm0, xm4
    511    pmulhrsw            xm1, xm4
    512    pxor                 m4, m4
    513    mova          [cq+32*0], m4
    514    mova          [cq+32*1], m4
    515    paddw               xm0, xm2
    516    paddw               xm1, xm3
    517    pmaxsw              xm0, xm4
    518    pmaxsw              xm1, xm4
    519    pminsw              xm0, xm5
    520    pminsw              xm1, xm5
    521    movhps [dstq+strideq*0], xm1
    522    movq   [dstq+strideq*1], xm1
    523    movhps [r6  +strideq*0], xm0
    524    movq   [r6  +strideq*1], xm0
    525    RET
    526 
    527 INV_TXFM_4X4_FN identity, dct
    528 INV_TXFM_4X4_FN identity, adst
    529 INV_TXFM_4X4_FN identity, flipadst
    530 INV_TXFM_4X4_FN identity, identity
    531 
        ; iidentity_4x4_internal_10bpc
        ; Pass 1: every coefficient becomes (c * 5793 + 2048) >> 12; the result
        ;   is packed to words and reordered with itx4_shuf (vpermd, then
        ;   pshufb with the table shifted right by 4) before the jump through
        ;   tx2q.
        ; Pass 2: x + pmulhrsw(x, pw_1697x8) with saturation, scaled with
        ;   pmulhrsw by 2048, added to four rows of dst, clamped to
        ;   [0, pixel_10bpc_max] and stored; the coefficients are zeroed.
        ;   Expects m5 = pd_2048 as left behind by the first pass.
    532 cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
    533    vpbroadcastd         m1, [pd_5793]
    534    pmulld               m0, m1, [cq+32*0]
    535    pmulld               m1,     [cq+32*1]
    536    vpbroadcastd         m5, [pd_2048]
    537    mova                 m3, [itx4_shuf]
    538    paddd                m0, m5
    539    paddd                m1, m5
    540    psrad                m0, 12
    541    psrad                m1, 12
    542    packssdw             m0, m1
    543    vpermd               m0, m3, m0
    544    psrld                m3, 4
    545    pshufb               m0, m3
    546    jmp                tx2q
    547 .pass2:
    548    vpbroadcastd         m1, [pw_1697x8]
    549    movq                xm2, [dstq+strideq*0]
    550    movhps              xm2, [dstq+strideq*1]
    551    lea                  r6, [dstq+strideq*2]
    552    pmulhrsw             m1, m0
    553    paddsw               m0, m1
    554    movq                xm3, [r6  +strideq*0]
    555    movhps              xm3, [r6  +strideq*1]
    556    vpbroadcastd        xm4, [pixel_10bpc_max]
    557    packssdw             m5, m5 ; pw_2048
    558    pmulhrsw             m0, m5
    559    pxor                 m5, m5 ; m5 is reused as the zero register from here on
    560    mova          [cq+32*0], m5
    561    mova          [cq+32*1], m5
    562    vextracti128        xm1, m0, 1
    563    paddw               xm0, xm2
    564    paddw               xm1, xm3
    565    pmaxsw              xm0, xm5
    566    pmaxsw              xm1, xm5
    567    pminsw              xm0, xm4
    568    pminsw              xm1, xm4
    569    movq   [dstq+strideq*0], xm0
    570    movhps [dstq+strideq*1], xm0
    571    movq   [r6  +strideq*0], xm1
    572    movhps [r6  +strideq*1], xm1
    573    RET
    574 
    575 INV_TXFM_4X4_FN dct, dct,      12
    576 INV_TXFM_4X4_FN dct, identity, 12
    577 INV_TXFM_4X4_FN dct, adst,     12
    578 INV_TXFM_4X4_FN dct, flipadst, 12
    579 
        ; idct_4x4_internal_12bpc
        ; The 12 bpc path stays in dwords between the passes instead of packing
        ; to words.
        ; Pass 1: reuses the 10 bpc .main, reorders its two result registers
        ;   with vpermd and continues in the 12 bpc adst .pass1_end2, which
        ;   clips to the 18-bit range and jumps through tx2q.
        ; Pass 2: applies to m0/m1 the same vpermq q3120 reordering that the
        ;   10 bpc .main performs when loading from memory, runs the 10 bpc
        ;   .main2 with m5 = pd_2048, reorders the results and finishes in the
        ;   shared 12 bpc adst .end.
    580 cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
    581    call m(idct_4x4_internal_10bpc).main
    582    mova                 m3, [idct4_12_shuf]
    583    mova                 m4, [idct4_12_shuf2]
    584    vpermd               m2, m4, m1
    585    vpermd               m1, m3, m0
    586    jmp m(iadst_4x4_internal_12bpc).pass1_end2
    587 .pass2:
    588    vpbroadcastd         m5, [pd_2048]
    589    vpermq               m0, m0, q3120
    590    vpermq               m1, m1, q3120
    591    call m(idct_4x4_internal_10bpc).main2
    592    vpermq               m0, m0, q3120
    593    vpermq               m1, m1, q2031
    594    jmp m(iadst_4x4_internal_12bpc).end
    595 
    596 INV_TXFM_4X4_FN adst, dct,      12
    597 INV_TXFM_4X4_FN adst, adst,     12
    598 INV_TXFM_4X4_FN adst, flipadst, 12
    599 INV_TXFM_4X4_FN adst, identity, 12
    600 
        ; iadst_4x4_internal_12bpc
        ; Besides its own two passes this function hosts the tails shared by
        ; all 12 bpc 4x4 transforms:
        ;   .pass1_end   round and shift the unrounded adst outputs (>> 1, then
        ;                + 1024 and >> 11) and reorder them with itx4_shuf
        ;   .pass1_end2  interleave m1/m2 into m0/m1, clip to the 18-bit range
        ;                and jump to the second transform through tx2q
        ;   .pass2_end   + 2048, >> 12 for the adst/flipadst second pass
        ;   .end / .end2 final >> 3, pack to words, pmulhrsw by 16384, add to
        ;                four rows of dst, clamp to [0, pixel_12bpc_max], zero
        ;                the coefficients and store
        ;   .main_pass2  split m0/m1 into xm0..xm3 and tail-jump into the
        ;                10 bpc adst .main2
    601 cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
    602    call m(iadst_4x4_internal_10bpc).main
    603    vinserti128          m1, m4, xm6, 1 ; out0 | out1
    604    vinserti128          m2, xm3, 1     ; out2 | out3
    605 .pass1_end:
    606    mova                 m3, [itx4_shuf]
    607    vpbroadcastd         m5, [pd_1024]
    608    psrad                m1, 1
    609    psrad                m2, 1
    610    vpermd               m1, m3, m1
    611    vpermd               m2, m3, m2
    612    paddd                m1, m5
    613    paddd                m2, m5
    614    psrad                m1, 11
    615    psrad                m2, 11
    616 .pass1_end2:
    617    vpbroadcastd         m3, [clip_18b_min]
    618    vpbroadcastd         m4, [clip_18b_max]
    619    punpcklqdq           m0, m1, m2
    620    punpckhqdq           m1, m2
    621    pmaxsd               m0, m3
    622    pmaxsd               m1, m3
    623    pminsd               m0, m4
    624    pminsd               m1, m4
    625    jmp                tx2q
    626 .pass2:
    627    call .main_pass2
    628    vinserti128          m0, m4, xm6, 1
    629    vinserti128          m1, m2, xm3, 1
    630 .pass2_end:
    631    vpbroadcastd         m5, [pd_2048]
    632    paddd                m0, m5
    633    paddd                m1, m5
    634    psrad                m0, 12
    635    psrad                m1, 12
    636 .end:
    637 %if WIN64
           ; NOTE(review): x86inc internals -- this appears to restore the
           ; callee-saved xmm registers here and then lower the recorded
           ; register count so that RET below does not restore them a second
           ; time; confirm against x86inc.asm. .end2 is the entry that skips
           ; this step.
    638    WIN64_RESTORE_XMM_INTERNAL
    639    %assign xmm_regs_used 6
    640 %endif
    641 .end2:
    642    vpbroadcastd         m4, [pw_16384]
    643    movq                xm2, [dstq+strideq*0]
    644    movq                xm3, [dstq+strideq*1]
    645    lea                  r6, [dstq+strideq*2]
    646    movhps              xm2, [r6  +strideq*0]   ; dst0 dst2
    647    movhps              xm3, [r6  +strideq*1]   ; dst1 dst3
    648    vpbroadcastd         m5, [pixel_12bpc_max]
    649    vinserti128          m2, xm3, 1
    650    psrad                m0, 3
    651    psrad                m1, 3
    652    packssdw             m0, m1     ; t0 t2 t1 t3
    653    pmulhrsw             m0, m4
    654    pxor                 m4, m4
    655    mova          [cq+32*0], m4
    656    mova          [cq+32*1], m4
    657    paddw                m0, m2     ; out0 out2 out1 out3
    658    pmaxsw               m0, m4
    659    pminsw               m0, m5
    660    vextracti128        xm1, m0, 1  ; out1 out3
    661    movq   [dstq+strideq*0], xm0
    662    movq   [dstq+strideq*1], xm1
    663    movhps [r6  +strideq*0], xm0
    664    movhps [r6  +strideq*1], xm1
    665    RET
    666 .main_pass2:
           ; xm0..xm3 = low m0, high m0, low m1, high m1; entering at .main2
           ; bypasses the WIN64 xmm6/xmm7 save done in the 10 bpc .main
    667    vextracti128        xm3, m1, 1
    668    mova                xm2, xm1
    669    vextracti128        xm1, m0, 1
    670    jmp m(iadst_4x4_internal_10bpc).main2
    671 
    672 INV_TXFM_4X4_FN flipadst, dct,      12
    673 INV_TXFM_4X4_FN flipadst, adst,     12
    674 INV_TXFM_4X4_FN flipadst, flipadst, 12
    675 INV_TXFM_4X4_FN flipadst, identity, 12
    676 
        ; iflipadst_4x4_internal_12bpc
        ; Both passes run the adst arithmetic (10 bpc .main in pass 1, the
        ; 12 bpc .main_pass2 in pass 2), assemble the four outputs in reversed
        ; order (out3, out2, out1, out0) and continue in the corresponding
        ; shared tail of the 12 bpc adst function.
    677 cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
    678    call m(iadst_4x4_internal_10bpc).main
    679    vinserti128          m1, m3, xm2, 1 ; out3 | out2
    680    vinserti128          m2, m6, xm4, 1 ; out1 | out0
    681    jmp m(iadst_4x4_internal_12bpc).pass1_end
    682 .pass2:
    683    call m(iadst_4x4_internal_12bpc).main_pass2
    684    vinserti128          m0, m3, xm2, 1
    685    vinserti128          m1, m6, xm4, 1
    686    jmp m(iadst_4x4_internal_12bpc).pass2_end
    687 
    688 INV_TXFM_4X4_FN identity, dct,      12
    689 INV_TXFM_4X4_FN identity, adst,     12
    690 INV_TXFM_4X4_FN identity, flipadst, 12
    691 INV_TXFM_4X4_FN identity, identity, 12
    692 
        ; iidentity_4x4_internal_12bpc
        ; Pass 1: loads the coefficients through a vpermd with itx4_shuf and
        ;   computes x + ((x * 1697 + 2048) >> 12) for each of them, then
        ;   continues in the 12 bpc adst .pass1_end2 (interleave, 18-bit clip,
        ;   jump through tx2q).
        ; Pass 2: (x * 5793 + 2048) >> 12, then the shared 12 bpc adst .end.
    693 cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
    694    mova                 m2, [itx4_shuf]
    695    vpbroadcastd         m3, [pd_1697]
    696    vpermd               m0, m2, [cq+32*0]
    697    vpermd               m2, m2, [cq+32*1]
    698    vpbroadcastd         m5, [pd_2048]
    699    pmulld               m1, m3, m0
    700    pmulld               m3, m2
    701    paddd                m1, m5
    702    paddd                m3, m5
    703    psrad                m1, 12
    704    psrad                m3, 12
    705    paddd                m1, m0 ; results are left in m1 and m2, as .pass1_end2 expects
    706    paddd                m2, m3
    707    jmp m(iadst_4x4_internal_12bpc).pass1_end2
    708 .pass2:
    709    ; m0 = in0 in1
    710    ; m1 = in2 in3
    711    vpbroadcastd         m3, [pd_5793]
    712    vpbroadcastd         m5, [pd_2048]
    713    pmulld               m0, m3
    714    pmulld               m1, m3
    715    paddd                m0, m5 ; 2048
    716    paddd                m1, m5
    717    psrad                m0, 12
    718    psrad                m1, 12
    719    jmp m(iadst_4x4_internal_12bpc).end
    720 
    721 %macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth
           ; Entry point for one 4x8 transform pair. For dct_dct it also emits
           ; the dc-only path: the dc coefficient gets one extra
           ; (x * 181 + 128) >> 8 stage compared with the 4x4 case, the row
           ; count 8 is placed in r3d, and the rest is done by the 4x4 dc-only
           ; code entered at its .dconly2 label. xm2 holds the bias for the
           ; requested bit depth; the 12 bpc instance reuses the 10 bpc code.
    722    INV_TXFM_FN          %1, %2, 0, 4x8, %3
    723 %ifidn %1_%2, dct_dct
    724    vpbroadcastd        xm2, [dconly_%3bpc]
    725 %if %3 = 10
    726 .dconly:
    727    imul                r6d, [cq], 181
    728    mov                [cq], eobd ; 0
    729    or                  r3d, 8
    730    add                 r6d, 128
    731    sar                 r6d, 8
    732    imul                r6d, 181
    733    jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2
    734 %else
    735    jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly
    736 %endif
    737 %endif
    738 %endmacro
    739 
    740 %macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
           ; 4-point inverse DCT on dword data with one input per register.
           ; src2/src4 go through the 1567/3784 rotation (t2, t3); src1/src3 are
           ; multiplied by 2896 and combined into t0 = (a + b + rnd) >> 12 and
           ; t1 = (a - b + rnd) >> 12. The four results replace src1..src4 in
           ; place (t0+t3, t1+t2, t1-t2, t0-t3 respectively). The rnd register
           ; is read only; the three tmp registers are overwritten.
    741    ITX_MULSUB_2D        %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3
    742    vpbroadcastd        m%5, [pd_2896]
    743    pmulld              m%1, m%5
    744    pmulld              m%3, m%5
    745    paddd               m%1, m%8
    746    paddd               m%5, m%1, m%3
    747    psubd               m%1, m%3
    748    psrad               m%5, 12 ; t0
    749    psrad               m%1, 12 ; t1
    750    psubd               m%3, m%1, m%2 ; t1 - t2
    751    paddd               m%2, m%1      ; t1 + t2
    752    paddd               m%1, m%5, m%4 ; t0 + t3
    753    psubd               m%4, m%5, m%4 ; t0 - t3
    754 %endmacro
    755 
    756 INV_TXFM_4X8_FN dct, dct
    757 INV_TXFM_4X8_FN dct, identity
    758 INV_TXFM_4X8_FN dct, adst
    759 INV_TXFM_4X8_FN dct, flipadst
    760 
        ; idct_4x8_internal_10bpc
        ; Pass 1: every coefficient is first scaled to (c * 2896 + 2048) >> 12,
        ;   then IDCT4_1D runs across m0..m3 and control goes to the second
        ;   transform through tx2q with the dword results still in m0..m3.
        ; Pass 2: packs the dwords to words, regroups them into xm0..xm3 as the
        ;   row pairs noted below, runs the 8 bpc 4x8 idct, scales with
        ;   pmulhrsw by pw_2048, adds the result to eight rows of dst, clamps
        ;   to [0, pixel_10bpc_max] and stores; the 128-byte coefficient buffer
        ;   is zeroed.
    761 cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
    762 .pass1:
    763    vpbroadcastd         m3, [pd_2896]
    764    pmulld               m0, m3, [cq+32*0]
    765    pmulld               m1, m3, [cq+32*1]
    766    pmulld               m2, m3, [cq+32*2]
    767    pmulld               m3, m3, [cq+32*3]
    768    vpbroadcastd         m7, [pd_2048]
    769    REPX      {paddd x, m7}, m0, m1, m2, m3
    770    REPX      {psrad x, 12}, m0, m1, m2, m3
    771    IDCT4_1D              0, 1, 2, 3, 4, 5, 6, 7
    772    jmp                tx2q
    773 .pass2:
    774    packssdw             m0, m2
    775    packssdw             m1, m3
           ; r6 is set up for the 8 bpc routine (presumably its constant base
           ; pointer -- confirm in the 8 bpc file)
    776    lea                  r6, [deint_shuf+128]
    777    punpckhwd            m2, m0, m1
    778    punpcklwd            m0, m1
    779    punpckhdq            m1, m0, m2 ; 2 3
    780    punpckldq            m0, m2     ; 0 1
    781    vextracti128        xm2, m0, 1  ; 4 5
    782    vextracti128        xm3, m1, 1  ; 6 7
    783    call m(idct_4x8_internal_8bpc).main
    784    vpbroadcastd        xm4, [pw_2048]
    785    REPX  {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
    786    lea                  r3, [strideq*3]        ; r3 = 3 * stride
    787    lea                  r6, [dstq+strideq*4]   ; r6 = address of dst row 4
           ; rows 3/2 and 7/6 are loaded swapped to match the result layout
    788    movq                xm4, [dstq+strideq*0]
    789    movhps              xm4, [dstq+strideq*1]
    790    movq                xm5, [dstq+r3       ]
    791    movhps              xm5, [dstq+strideq*2]
    792    movq                xm6, [r6  +strideq*0]
    793    movhps              xm6, [r6  +strideq*1]
    794    movq                xm7, [r6  +r3       ]
    795    movhps              xm7, [r6  +strideq*2]
    796    paddw               xm0, xm4 ; 0 1
    797    paddw               xm1, xm5 ; 3 2
    798    paddw               xm2, xm6 ; 4 5
    799    paddw               xm3, xm7 ; 7 6
    800    vpbroadcastd        xm5, [pixel_10bpc_max]
    801    pxor                 m4, m4
    802    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
    803    REPX    {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
    804    REPX    {pminsw x, xm5}, xm0, xm1, xm2, xm3
    805    movq   [dstq+strideq*0], xm0
    806    movhps [dstq+strideq*1], xm0
    807    movhps [dstq+strideq*2], xm1
    808    movq   [dstq+r3       ], xm1
    809    movq   [r6  +strideq*0], xm2
    810    movhps [r6  +strideq*1], xm2
    811    movhps [r6  +strideq*2], xm3
    812    movq   [r6  +r3       ], xm3
    813    RET
    814 
    ; Instantiate the 4x8 transforms whose first type is adst, one per second type.
    ; NOTE(review): INV_TXFM_4X8_FN is defined above this chunk - confirm its
    ; parameters there.
    815 INV_TXFM_4X8_FN adst, dct
    816 INV_TXFM_4X8_FN adst, adst
    817 INV_TXFM_4X8_FN adst, flipadst
    818 INV_TXFM_4X8_FN adst, identity
    819 
    ; iadst_4x8_internal_10bpc: ADST worker for 4x8 blocks at 10 bpc.
    ; Pass 1 delegates to iadst_8x4_internal_10bpc.main, rounds the result into
    ; m0-m3 and leaves through `jmp tx2q`. Pass 2 consumes m0-m3 and adds the
    ; result to the 8 rows at dstq.
    ; Local helpers:
    ;   .pass1_end  - final >>12 of pass 1, also used by the flipadst variant
    ;   .pass2_main - 16-bit transpose, then tail-jumps into the 8 bpc code
    ;   .main/.main2/.main3 - 32-bit ADST over inputs 0..7 (see notes at .main)
    ; NOTE(review): .main uses m8/m9 although this cglobal declares 8 vector
    ; registers; in this chunk it is only reached (via .main3) from the 12 bpc
    ; functions, which declare 10.
    820 cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
    821    call m(iadst_8x4_internal_10bpc).main ; defined outside this chunk; results are taken from m4, m6, m2, m3 below
    822    vpbroadcastd         m5, [pd_2048]
    823    paddd                m0, m5, m4
    824    paddd                m1, m5, m6
    825    paddd                m2, m5
    826    paddd                m3, m5
    827 .pass1_end: ; iflipadst_4x8_internal_10bpc jumps here with its own output order
    828    REPX      {psrad x, 12}, m0, m1, m2, m3
    829    jmp                tx2q
    830 .pass2:
    831    call .pass2_main
           ; Per the "+ + - -" note at the load in .main, the upper qword of
           ; pw_2048_m2048 is negative, so this pmulhrsw presumably also
           ; negates the row kept in the high half of each register - confirm
           ; against the constant's definition.
    832    mova                xm4, [pw_2048_m2048]
    833    REPX  {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
    834 .end:
    835    lea                  r3, [strideq*3]
    836    lea                  r6, [dstq+strideq*4]
           ; Load the 8 destination rows in natural order, two per register.
    837    movq                xm4, [dstq+strideq*0]
    838    movhps              xm4, [dstq+strideq*1]
    839    movq                xm5, [dstq+strideq*2]
    840    movhps              xm5, [dstq+r3       ]
    841    movq                xm6, [r6  +strideq*0]
    842    movhps              xm6, [r6  +strideq*1]
    843    movq                xm7, [r6  +strideq*2]
    844    movhps              xm7, [r6  +r3       ]
    845    paddw               xm0, xm4 ; 0 1
    846    paddw               xm1, xm5 ; 2 3
    847    paddw               xm2, xm6 ; 4 5
    848    paddw               xm3, xm7 ; 6 7
    849    vpbroadcastd        xm5, [pixel_10bpc_max]
    850    pxor                 m4, m4
    851    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 ; zero the 4*32 bytes of coefficients
    852    REPX    {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 ; clamp below at 0
    853    REPX    {pminsw x, xm5}, xm0, xm1, xm2, xm3 ; clamp above at pixel_10bpc_max
    854    movq   [dstq+strideq*0], xm0
    855    movhps [dstq+strideq*1], xm0
    856    movq   [dstq+strideq*2], xm1
    857    movhps [dstq+r3       ], xm1
    858    movq   [r6  +strideq*0], xm2
    859    movhps [r6  +strideq*1], xm2
    860    movq   [r6  +strideq*2], xm3
    861    movhps [r6  +r3       ], xm3
    862    RET
    863 ALIGN function_align
        ; .pass2_main: narrow m0-m3 to 16-bit, transpose, and hand over to the
        ; 8 bpc ADST. Ends in a jmp, so the 8 bpc routine's ret returns straight
        ; to the caller of .pass2_main.
    864 .pass2_main:
    865    packssdw             m0, m2
    866    packssdw             m1, m3
    867    lea                  r6, [deint_shuf+128] ; NOTE(review): presumably the constant base the 8bpc routine addresses from - confirm
    868    punpcklwd            m4, m0, m1
    869    punpckhwd            m0, m1
    870    punpckhdq            m5, m4, m0
    871    punpckldq            m4, m0
    872    vextracti128        xm2, m4, 1      ; 4 5
    873    vextracti128        xm3, m5, 1      ; 6 7
    874    pshufd              xm4, xm4, q1032 ; 1 0
    875    pshufd              xm5, xm5, q1032 ; 3 2
    876    jmp m(iadst_4x8_internal_8bpc).main_pass2
    877 ALIGN function_align
        ; .main: 32-bit ADST over inputs 0..7 read as 16-byte rows from cq.
        ; Three entry points:
        ;   .main  - loads the clamp bounds into m8/m9 first
        ;   .main2 - expects m8/m9 already loaded
        ;   .main3 - additionally expects the inputs in m0-m3 and the rounding
        ;            constant in m7 (iadst_4x8_internal_12bpc.pass2_main uses it)
        ; On return, per the comments at the end: m0 = out0 out7, m4 = out6 out1,
        ; m2 = out4 -out5, m5 = -out3 out2, and m6 still holds pw_2048_m2048.
    878 .main:
    879    vpbroadcastd         m8, [clip_18b_min]
    880    vpbroadcastd         m9, [clip_18b_max]
    881 .main2:
           ; Each row is broadcast to both lanes; shufpd 0x0c then pairs two
           ; rows per register in the order given by the comments.
    882    vbroadcasti128       m0, [cq+16*0]
    883    vbroadcasti128       m2, [cq+16*2]
    884    vbroadcasti128       m3, [cq+16*5]
    885    vbroadcasti128       m1, [cq+16*7]
    886    vpbroadcastd         m6, [pd_2896]
    887    shufpd               m0, m2, 0x0c ; 0 2
    888    shufpd               m1, m3, 0x0c ; 7 5
    889    vbroadcasti128       m2, [cq+16*4]
    890    vbroadcasti128       m4, [cq+16*6]
    891    vbroadcasti128       m5, [cq+16*1]
    892    vbroadcasti128       m3, [cq+16*3]
    893    vpbroadcastd         m7, [pd_2048]
    894    shufpd               m2, m4, 0x0c ; 4 6
    895    shufpd               m3, m5, 0x0c ; 3 1
           ; Same 2896/4096 rounding scale as the 4x8 pass 1 code.
    896    REPX {pmulld x, m6}, m0, m1, m2, m3
    897    REPX {paddd  x, m7}, m0, m1, m2, m3
    898    REPX {psrad  x, 12}, m0, m1, m2, m3
    899 .main3:
    900    ITX_MULSUB_2D         1, 0, 4, 5, 6, 7,  401_1931, 4076_3612, 1
    901    ITX_MULSUB_2D         3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1
    902    psubd                m4, m0, m2   ; t4  t6
    903    paddd                m0, m2       ; t0  t2
    904    psubd                m2, m1, m3   ; t5  t7
    905    paddd                m1, m3       ; t1  t3
    906    REPX     {pmaxsd x, m8}, m4, m2, m0, m1 ; keep intermediates inside [m8, m9]
    907    REPX     {pminsd x, m9}, m4, m2, m0, m1
    908    pxor                 m5, m5
    909    psubd                m5, m4       ; m5 = -m4, source of the -t6 blended in below
    910    vpblendd             m4, m2, 0xcc ; t4  t7
    911    vpblendd             m2, m5, 0xcc ; t5 -t6
    912    ITX_MULSUB_2D         4, 2, 3, 5, 6, 7, 1567, 3784
    913    vpbroadcastd         m5, [pd_2896]
    914    vbroadcasti128       m6, [pw_2048_m2048] ; + + - -
    915    punpckhqdq           m3, m0, m1
    916    punpcklqdq           m0, m1
    917    psubd                m1, m0, m3   ; t2  t3
    918    paddd                m0, m3       ;  out0 -out7
    919    punpckhqdq           m3, m4, m2   ; t7a t6a
    920    punpcklqdq           m4, m2       ; t5a t4a
    921    psubd                m2, m4, m3   ; t7  t6
    922    paddd                m4, m3       ;  out6 -out1
    923    REPX     {pmaxsd x, m8}, m1, m2
    924    REPX     {pminsd x, m9}, m1, m2
    925    vpblendd             m3, m1, m2, 0xcc
    926    shufpd               m1, m2, 0x05
    927    pmulld               m3, m5
    928    pmulld               m5, m1
    929    psignd               m0, m6       ;  out0  out7
    930    psignd               m4, m6       ;  out6  out1
    931    paddd                m3, m7
    932    psubd                m2, m3, m5
    933    paddd                m5, m3
    934    psrad                m2, 12       ;  out4 -out5
    935    psrad                m5, 12       ; -out3  out2
    936    ret
    937 
    ; Instantiate the 4x8 transforms whose first type is flipadst, one per
    ; second type. NOTE(review): INV_TXFM_4X8_FN is defined above this chunk -
    ; confirm its parameters there.
    938 INV_TXFM_4X8_FN flipadst, dct
    939 INV_TXFM_4X8_FN flipadst, adst
    940 INV_TXFM_4X8_FN flipadst, flipadst
    941 INV_TXFM_4X8_FN flipadst, identity
    942 
    ; iflipadst_4x8_internal_10bpc: flipped-ADST worker for 4x8 blocks, 10 bpc.
    ; It reuses the ADST code of iadst_4x8_internal_10bpc and differs only in
    ; output order: pass 1 assigns the results to m0-m3 in a different order,
    ; pass 2 writes the rows to dst bottom-to-top within each pair.
    943 cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
    944    call m(iadst_8x4_internal_10bpc).main
           ; Add the rounding term while permuting: m0<-m3, m1<-m2, m2<-m6,
           ; m3<-m4 (the adst variant uses m4, m6, m2, m3). Each source is read
           ; before the instruction that overwrites it.
    945    vpbroadcastd         m5, [pd_2048]
    946    paddd                m0, m5, m3
    947    paddd                m1, m5, m2
    948    paddd                m2, m5, m6
    949    paddd                m3, m5, m4
    950    jmp m(iadst_4x8_internal_10bpc).pass1_end ; shared >>12 and `jmp tx2q`
    951 .pass2:
    952    call m(iadst_4x8_internal_10bpc).pass2_main
    953    mova                xm4, [pw_2048_m2048] ; same scale/sign constant as the adst pass 2
    954    REPX  {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0
    955    lea                  r3, [strideq*3]
    956    lea                  r6, [dstq+strideq*4]
           ; Destination rows are loaded odd row first within each pair, and
           ; xm3..xm0 are matched with rows 0..7, which flips the output.
    957    movq                xm4, [dstq+strideq*1]
    958    movhps              xm4, [dstq+strideq*0]
    959    movq                xm5, [dstq+r3       ]
    960    movhps              xm5, [dstq+strideq*2]
    961    movq                xm6, [r6  +strideq*1]
    962    movhps              xm6, [r6  +strideq*0]
    963    movq                xm7, [r6  +r3       ]
    964    movhps              xm7, [r6  +strideq*2]
    965    paddw               xm3, xm4 ; 1 0
    966    paddw               xm2, xm5 ; 3 2
    967    paddw               xm1, xm6 ; 5 4
    968    paddw               xm0, xm7 ; 7 6
    969    vpbroadcastd        xm5, [pixel_10bpc_max]
    970    pxor                 m4, m4
    971    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 ; zero the 4*32 bytes of coefficients
    972    REPX    {pmaxsw x, xm4}, xm3, xm2, xm1, xm0 ; clamp below at 0
    973    REPX    {pminsw x, xm5}, xm3, xm2, xm1, xm0 ; clamp above at pixel_10bpc_max
    974    movhps [dstq+strideq*0], xm3
    975    movq   [dstq+strideq*1], xm3
    976    movhps [dstq+strideq*2], xm2
    977    movq   [dstq+r3       ], xm2
    978    movhps [r6  +strideq*0], xm1
    979    movq   [r6  +strideq*1], xm1
    980    movhps [r6  +strideq*2], xm0
    981    movq   [r6  +r3       ], xm0
    982    RET
    983 
    ; Instantiate the 4x8 transforms whose first type is identity, one per
    ; second type. NOTE(review): INV_TXFM_4X8_FN is defined above this chunk -
    ; confirm its parameters there.
    984 INV_TXFM_4X8_FN identity, dct
    985 INV_TXFM_4X8_FN identity, adst
    986 INV_TXFM_4X8_FN identity, flipadst
    987 INV_TXFM_4X8_FN identity, identity
    988 
    ; iidentity_4x8_internal_10bpc: identity-transform worker for 4x8 blocks.
    ; Pass 1 only rescales the coefficients (two rounded multiplies) into m0-m3
    ; and leaves through `jmp tx2q`. Pass 2 is a thin wrapper around .pass2_end,
    ; which takes the pixel maximum in m6 so the 12 bpc variant can share it.
    989 cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
    990 .pass1: ; also entered by a jump from iidentity_4x8_internal_12bpc
    991    vpbroadcastd         m3, [pd_2896]
    992    pmulld               m0, m3, [cq+32*0]
    993    pmulld               m1, m3, [cq+32*1]
    994    pmulld               m2, m3, [cq+32*2]
    995    pmulld               m3,     [cq+32*3]
    996    vpbroadcastd         m5, [pd_2048]
    997    vpbroadcastd         m4, [pd_5793]
           ; First scale: *2896, +2048, >>12 (~1/sqrt(2) going by the names).
    998    REPX     {paddd  x, m5}, m0, m1, m2, m3
    999    REPX     {psrad  x, 12}, m0, m1, m2, m3
           ; Second scale: *5793, +2048, >>12 (~sqrt(2) going by the names).
   1000    REPX     {pmulld x, m4}, m0, m1, m2, m3
   1001    REPX     {paddd  x, m5}, m0, m1, m2, m3
   1002    REPX     {psrad  x, 12}, m0, m1, m2, m3
   1003    jmp                tx2q
   1004 .pass2:
   1005    vpbroadcastd         m6, [pixel_10bpc_max] ; upper clamp bound expected by .pass2_end
   1006    call .pass2_end
   1007    RET
   1008 ALIGN function_align
        ; .pass2_end: in m0-m3 = 32-bit pass-1 output, m6 = pixel maximum.
        ; Narrows to 16-bit, transposes, scales, adds to the 8 rows at dstq,
        ; clamps to [0, m6] and clears the coefficient buffer. Uses r3 and r6.
   1009 .pass2_end:
   1010    vpbroadcastd         m4, [pw_4096] ; pmulhrsw by this is a rounding >>3 if the constant is words of 4096
   1011    packssdw             m0, m2
   1012    packssdw             m1, m3
   1013    punpckhwd            m2, m0, m1
   1014    punpcklwd            m0, m1
   1015    pmulhrsw             m2, m4
   1016    pmulhrsw             m0, m4
   1017    punpckhdq            m1, m0, m2 ; 2 3 6 7
   1018    punpckldq            m0, m2     ; 0 1 4 5
   1019    lea                  r3, [strideq*3]
   1020    lea                  r6, [dstq+strideq*4]
           ; Build m2 = dst rows 0 1 4 5 and m3 = dst rows 2 3 6 7: the low
           ; lane is loaded directly, the high lane is blended in from
           ; broadcast rows (0x30 = dwords 4-5, 0xc0 = dwords 6-7).
   1021    movq                xm2, [dstq+strideq*0]
   1022    movhps              xm2, [dstq+strideq*1]
   1023    vpbroadcastq         m4, [r6  +strideq*0]
   1024    vpbroadcastq         m5, [r6  +strideq*1]
   1025    movq                xm3, [dstq+strideq*2]
   1026    movhps              xm3, [dstq+r3       ]
   1027    vpblendd             m2, m4, 0x30
   1028    vpblendd             m2, m5, 0xc0
   1029    vpbroadcastq         m4, [r6  +strideq*2]
   1030    vpbroadcastq         m5, [r6  +r3       ]
   1031    vpblendd             m3, m4, 0x30
   1032    vpblendd             m3, m5, 0xc0
   1033    pxor                 m4, m4
   1034    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 ; zero the 4*32 bytes of coefficients
   1035    paddw                m0, m2 ; out0 out1 out4 out5
   1036    paddw                m1, m3 ; out2 out3 out6 out7
   1037    pmaxsw               m0, m4
   1038    pmaxsw               m1, m4
   1039    pminsw               m0, m6
   1040    pminsw               m1, m6
   1041    vextracti128        xm2, m0, 1  ; out4 out5
   1042    vextracti128        xm3, m1, 1  ; out6 out7
   1043    movq   [dstq+strideq*0], xm0
   1044    movhps [dstq+strideq*1], xm0
   1045    movq   [dstq+strideq*2], xm1
   1046    movhps [dstq+r3       ], xm1
   1047    movq   [r6  +strideq*0], xm2
   1048    movhps [r6  +strideq*1], xm2
   1049    movq   [r6  +strideq*2], xm3
   1050    movhps [r6  +r3       ], xm3
   1051    ret
   1052 
   ; 12 bpc instantiations of the dct-first 4x8 transforms. NOTE(review): the
   ; third argument is presumably the bitdepth, as in INV_TXFM_4X16_FN - the
   ; INV_TXFM_4X8_FN macro itself is defined above this chunk.
   1053 INV_TXFM_4X8_FN dct, dct,      12
   1054 INV_TXFM_4X8_FN dct, identity, 12
   1055 INV_TXFM_4X8_FN dct, adst,     12
   1056 INV_TXFM_4X8_FN dct, flipadst, 12
   1057 
   ; idct_4x8_internal_12bpc: DCT worker for 4x8 blocks at 12 bpc.
   ; Pass 1 is the 10 bpc pass 1, reached by a jump. Pass 2 stays in 32-bit
   ; precision: it clamps the inputs, transposes, runs the 10 bpc 8x4 DCT core
   ; and hands over to the shared 12 bpc store code. The cglobal declares 10
   ; vector registers because m8/m9 hold the clamp bounds.
   1058 cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
   1059    jmp m(idct_4x8_internal_10bpc).pass1
   1060 .pass2:
   1061    vpbroadcastd         m8, [clip_18b_min]
   1062    vpbroadcastd         m9, [clip_18b_max]
   1063    REPX     {pmaxsd x, m8}, m0, m1, m2, m3 ; clamp pass-1 output to [clip_18b_min, clip_18b_max]
   1064    REPX     {pminsd x, m9}, m0, m1, m2, m3
   1065    ; transpose & interleave
   1066    pshufd               m0, m0, q1320
   1067    pshufd               m1, m1, q1320
   1068    pshufd               m2, m2, q1320
   1069    pshufd               m3, m3, q1320
   1070    punpckldq            m4, m0, m1
   1071    punpckhdq            m0, m1
   1072    punpckldq            m5, m2, m3
   1073    punpckhdq            m2, m3
   1074    vpermq               m0, m0, q3102
   1075    vpermq               m2, m2, q3102
   1076    vperm2i128           m1, m0, m2, 0x31   ; 1 5 (interleaved)
   1077    vperm2i128           m3, m0, m2, 0x20   ; 7 3 (interleaved)
   1078    vperm2i128           m0, m4, m5, 0x20   ; 0 2 (interleaved)
   1079    vperm2i128           m2, m4, m5, 0x31   ; 4 6 (interleaved)
   1080    vpbroadcastd         m7, [pd_2048]
   1081    call m(idct_8x4_internal_10bpc).main ; defined outside this chunk; results are combined from m0, m2, m4, m5 below
           ; Final butterfly stage of the 8-point DCT.
   1082    psubd                m3, m0, m4  ; out7 out6
   1083    paddd                m0, m4      ; out0 out1
   1084    paddd                m1, m2, m5  ; out3 out2
   1085    psubd                m2, m5      ; out4 out5
   1086    pshufd               m1, m1, q1032 ; swap qwords: out2 out3
   1087    pshufd               m3, m3, q1032 ; swap qwords: out6 out7
   1088    jmp m(iadst_4x8_internal_12bpc).end ; shared rounding, add-to-dst and store
   1089 
   ; 12 bpc instantiations of the adst-first 4x8 transforms. NOTE(review): the
   ; third argument is presumably the bitdepth - INV_TXFM_4X8_FN is defined
   ; above this chunk.
   1090 INV_TXFM_4X8_FN adst, dct,      12
   1091 INV_TXFM_4X8_FN adst, adst,     12
   1092 INV_TXFM_4X8_FN adst, flipadst, 12
   1093 INV_TXFM_4X8_FN adst, identity, 12
   1094 
   ; iadst_4x8_internal_12bpc: ADST worker for 4x8 blocks at 12 bpc.
   ; Pass 1 shares the 10 bpc ADST core but rounds in two steps (>>1, then
   ; +1024 and >>11). Pass 2 stays in 32-bit: clamp, transpose, run
   ; iadst_4x8_internal_10bpc.main3, then fix order/sign and store.
   ; Labels used from other functions in this chunk:
   ;   .pass1_end  - iflipadst_4x8_internal_12bpc
   ;   .end        - idct_4x8_internal_12bpc, iflipadst_4x8_internal_12bpc
   ;   .pass2_main - iflipadst_4x8_internal_12bpc
   1095 cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
   1096    call m(iadst_8x4_internal_10bpc).main ; defined outside this chunk; results are taken from m4, m6, m2, m3
           ; NOTE(review): the >>1 ahead of the rounding add presumably leaves
           ; headroom so the add cannot overflow - confirm.
   1097    psrad                m0, m4, 1
   1098    psrad                m1, m6, 1
   1099    psrad                m2, 1
   1100    psrad                m3, 1
   1101 .pass1_end:
   1102    vpbroadcastd         m5, [pd_1024]
   1103    REPX      {paddd x, m5}, m0, m1, m2, m3
   1104    REPX      {psrad x, 11}, m0, m1, m2, m3
   1105    jmp                tx2q
   1106 .pass2:
   1107    vpbroadcastd         m8, [clip_18b_min]
   1108    vpbroadcastd         m9, [clip_18b_max]
   1109    REPX     {pmaxsd x, m8}, m0, m1, m2, m3 ; clamp pass-1 output; m8/m9 are also the bounds .main3 uses
   1110    REPX     {pminsd x, m9}, m0, m1, m2, m3
   1111    call .pass2_main
           ; .main3 returns m0 = out0 out7, m4 = out6 out1, m2 = out4 -out5,
           ; m5 = -out3 out2 and the sign pattern in m6; regroup into row
           ; pairs and undo the negations.
   1112    vpblendd             m3, m0, m4, 0x33 ; out6 out7
   1113    vpblendd             m0, m4, 0xcc     ; out0 out1
   1114    pshufd               m1, m5, q1032
   1115    psignd               m2, m6           ; out4 out5
   1116    psignd               m1, m6           ; out2 out3
   1117 .end:
           ; in: m0-m3 = 32-bit rows (0 1), (2 3), (4 5), (6 7).
   1118    vpbroadcastd         m4, [pw_16384] ; pmulhrsw by this is a rounding >>1 if the constant is words of 16384
   1119    REPX       {psrad x, 3}, m0, m1, m2, m3
   1120    packssdw             m0, m2     ; 0 1 4 5 (interleaved)
   1121    packssdw             m1, m3     ; 2 3 6 7 (interleaved)
   1122    mova                 m2, [iadst8_12_shuf]
   1123    vpermd               m0, m2, m0 ; 0 1 4 5
   1124    vpermd               m1, m2, m1 ; 2 3 6 7
   1125    pmulhrsw             m0, m4
   1126    pmulhrsw             m1, m4
   1127    lea                  r3, [strideq*3]
   1128    lea                  r6, [dstq+strideq*4]
           ; Gather dst rows 0 1 4 5 into m4 and rows 2 3 6 7 into m5.
   1129    movq                xm4, [dstq+strideq*0]
   1130    movhps              xm4, [dstq+strideq*1]
   1131    movq                xm5, [dstq+strideq*2]
   1132    movhps              xm5, [dstq+r3       ]
   1133    movq                xm6, [r6  +strideq*0]
   1134    movhps              xm6, [r6  +strideq*1]
   1135    vinserti128          m4, xm6, 1
   1136    movq                xm7, [r6  +strideq*2]
   1137    movhps              xm7, [r6  +r3       ]
   1138    vinserti128          m5, xm7, 1
   1139    paddw                m0, m4 ; 0 1 4 5
   1140    paddw                m1, m5 ; 2 3 6 7
   1141    vpbroadcastd         m5, [pixel_12bpc_max]
   1142    pxor                 m4, m4
   1143    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 ; zero the 4*32 bytes of coefficients
   1144    REPX    {pmaxsw x,  m4}, m0, m1 ; clamp below at 0
   1145    REPX    {pminsw x,  m5}, m0, m1 ; clamp above at pixel_12bpc_max
   1146    vextracti128        xm2, m0, 1  ; out4 out5
   1147    vextracti128        xm3, m1, 1  ; out6 out7
   1148    movq   [dstq+strideq*0], xm0
   1149    movhps [dstq+strideq*1], xm0
   1150    movq   [dstq+strideq*2], xm1
   1151    movhps [dstq+r3       ], xm1
   1152    movq   [r6  +strideq*0], xm2
   1153    movhps [r6  +strideq*1], xm2
   1154    movq   [r6  +strideq*2], xm3
   1155    movhps [r6  +r3       ], xm3
   1156    RET
   1157 ALIGN function_align
        ; .pass2_main: 32-bit transpose of m0-m3 into the input layout of
        ; iadst_4x8_internal_10bpc.main3, then a tail jump into it (its ret
        ; returns to our caller). The caller must have m8/m9 loaded.
   1158 .pass2_main:
   1159    ; transpose & interleave
   1160    pshufd               m0, m0, q1320
   1161    pshufd               m1, m1, q1320
   1162    pshufd               m2, m2, q1320
   1163    pshufd               m3, m3, q1320
   1164    punpckldq            m4, m0, m1
   1165    punpckhdq            m0, m1
   1166    punpckldq            m5, m2, m3
   1167    punpckhdq            m2, m3
   1168    vperm2i128           m1, m0, m2, 0x31   ; 7 5 (interleaved)
   1169    vperm2i128           m3, m0, m2, 0x20   ; 3 1 (interleaved)
   1170    vperm2i128           m0, m4, m5, 0x20   ; 0 2 (interleaved)
   1171    vperm2i128           m2, m4, m5, 0x31   ; 4 6 (interleaved)
   1172    vpbroadcastd         m7, [pd_2048] ; rounding constant .main3 expects in m7
   1173    jmp m(iadst_4x8_internal_10bpc).main3
   1174 
   ; 12 bpc instantiations of the flipadst-first 4x8 transforms. NOTE(review):
   ; the third argument is presumably the bitdepth - INV_TXFM_4X8_FN is defined
   ; above this chunk.
   1175 INV_TXFM_4X8_FN flipadst, dct,      12
   1176 INV_TXFM_4X8_FN flipadst, adst,     12
   1177 INV_TXFM_4X8_FN flipadst, flipadst, 12
   1178 INV_TXFM_4X8_FN flipadst, identity, 12
   1179 
   ; iflipadst_4x8_internal_12bpc: flipped-ADST worker for 4x8 blocks, 12 bpc.
   ; Both passes reuse iadst_4x8_internal_12bpc and only permute the ADST
   ; outputs into reversed order before joining its shared tails.
   1180 cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
   1181    call m(iadst_8x4_internal_10bpc).main
           ; >>1 while permuting: m0<-m3, m1<-m2, m2<-m6, m3<-m4 (the adst
           ; variant uses m4, m6, m2, m3). Sources are read before being
           ; overwritten.
   1182    psrad                m0, m3, 1
   1183    psrad                m1, m2, 1
   1184    psrad                m2, m6, 1
   1185    psrad                m3, m4, 1
   1186    jmp m(iadst_4x8_internal_12bpc).pass1_end ; shared +1024, >>11 and `jmp tx2q`
   1187 .pass2:
   1188    vpbroadcastd         m8, [clip_18b_min]
   1189    vpbroadcastd         m9, [clip_18b_max]
   1190    REPX     {pmaxsd x, m8}, m0, m1, m2, m3 ; clamp pass-1 output to the 18-bit bounds
   1191    REPX     {pminsd x, m9}, m0, m1, m2, m3
   1192    call m(iadst_4x8_internal_12bpc).pass2_main
           ; Regroup the ADST outputs in reversed order; m6 carries the sign
           ; pattern left by the ADST core and is qword-swapped for the last
           ; psignd.
   1193    shufpd               m3, m4, m0, 0x05 ; out1 out0
   1194    shufpd               m0, m4, 0x05     ; out7 out6
   1195    psignd               m2, m6
   1196    pshufd               m6, m6, q1032
   1197    pshufd               m1, m2, q1032    ; out5 out4
   1198    psignd               m2, m5, m6       ; out3 out2
   1199    jmp m(iadst_4x8_internal_12bpc).end ; shared rounding, add-to-dst and store
   1200 
   ; 12 bpc instantiations of the identity-first 4x8 transforms. NOTE(review):
   ; the third argument is presumably the bitdepth - INV_TXFM_4X8_FN is defined
   ; above this chunk.
   1201 INV_TXFM_4X8_FN identity, dct,      12
   1202 INV_TXFM_4X8_FN identity, adst,     12
   1203 INV_TXFM_4X8_FN identity, flipadst, 12
   1204 INV_TXFM_4X8_FN identity, identity, 12
   1205 
   ; iidentity_4x8_internal_12bpc: identity worker for 4x8 blocks at 12 bpc.
   ; Both passes are shared with the 10 bpc version; the only difference is the
   ; pixel maximum handed to the shared pass-2 tail in m6.
   1206 cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
   1207    jmp m(iidentity_4x8_internal_10bpc).pass1
   1208 .pass2:
   1209    ; m0 = in0 in1
   1210    ; m1 = in2 in3
   1211    ; m2 = in4 in5
   1212    ; m3 = in6 in7
   1213    vpbroadcastd         m6, [pixel_12bpc_max] ; upper clamp bound used by .pass2_end
   1214    call m(iidentity_4x8_internal_10bpc).pass2_end
   1215    RET
   1216 
   ; INV_TXFM_4X16_FN type1, type2[, bitdepth = 10]
   ; Forwards to INV_TXFM_FN with size 4x16 and, for the dct/dct pair only,
   ; appends a DC-only shortcut that tail-jumps into the 4x4 10 bpc .dconly3
   ; code. NOTE(review): INV_TXFM_FN is defined outside this chunk; presumably
   ; it branches around or falls into this shortcut depending on eob - confirm.
   1217 %macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth
   1218    INV_TXFM_FN          %1, %2, 0, 4x16, %3
   1219 %ifidn %1_%2, dct_dct
           ; r6d = (dc * 181 + 384) >> 9, computed across the lines below.
   1220    imul                r6d, [cq], 181
   1221    vpbroadcastd        xm2, [dconly_%3bpc] ; bitdepth-specific constant for the shared dconly code
   1222    mov                [cq], eobd ; 0
           ; NOTE(review): with the register naming used by the internal
           ; functions (dst, stride, c, eob) r3 is eob, which the existing
           ; comment above says is 0 here - so this presumably yields 16, the
           ; row count for .dconly3. Confirm against the entry-point macro.
   1223    or                  r3d, 16
   1224    add                 r6d, 384
   1225    sar                 r6d, 9
   1226    jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3
   1227 %endif
   1228 %endmacro
   1229 
   ; Instantiate the 4x16 transforms whose first type is dct (bitdepth defaults
   ; to 10). The dct/dct instance also gets the macro's DC-only shortcut.
   1230 INV_TXFM_4X16_FN dct, dct
   1231 INV_TXFM_4X16_FN dct, identity
   1232 INV_TXFM_4X16_FN dct, adst
   1233 INV_TXFM_4X16_FN dct, flipadst
   1234 
   ; idct_4x16_internal_10bpc: DCT worker for 4x16 blocks at 10 bpc.
   ; Pass 1 runs a 4-point DCT over eight vectors of 32-bit coefficients
   ; (cq+32*0..7), leaves the result in m0-m7 and exits through `jmp tx2q`.
   ; Pass 2 narrows to 16-bit, runs the 8 bpc 4x16 DCT core and adds the
   ; result to dst four rows at a time.
   ; Local helpers: .pass1_main / .pass1_main2 (the two halves of pass 1),
   ; .pass2_end (scale + store loop), .write_4x4 (add/clamp/store 4 rows).
   1235 cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
   1236 .pass1:
   1237    vpbroadcastd        m10, [pd_3072] ; rounding term added in .pass1_main2
   1238    mova                 m1, [cq+32*2]
   1239    mova                 m3, [cq+32*6]
   1240    mova                 m5, [cq+32*3]
   1241    mova                 m7, [cq+32*7]
   1242    call .pass1_main
           ; .pass1_main left pd_1448 in m6; use it to scale the other four
           ; input vectors.
   1243    pmulld               m0, m6, [cq+32*0]
   1244    pmulld               m2, m6, [cq+32*4]
   1245    pmulld               m4, m6, [cq+32*1]
   1246    pmulld               m6,     [cq+32*5]
   1247    call .pass1_main2
   1248    REPX       {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 ; extra >>1 on every pass-1 output
   1249    jmp                tx2q
   1250 .pass2:
           ; Narrow to saturated 16-bit words and transpose so that each xmm
           ; register holds two rows (see the row labels).
   1251    packssdw             m0, m4
   1252    packssdw             m1, m5
   1253    packssdw             m2, m6
   1254    packssdw             m3, m7
   1255    lea                  r6, [deint_shuf+128] ; NOTE(review): presumably the constant base the 8bpc routine addresses from - confirm
   1256    punpcklwd            m4, m2, m3
   1257    punpckhwd            m2, m3
   1258    punpckhwd            m5, m0, m1
   1259    punpcklwd            m0, m1
   1260    punpckhdq            m1, m0, m4     ; 2 3
   1261    punpckldq            m0, m4         ; 0 1
   1262    punpckldq            m4, m5, m2     ; 8 9
   1263    punpckhdq            m5, m2         ; a b
   1264    vextracti128        xm2, m0, 1      ; 4 5
   1265    vextracti128        xm3, m1, 1      ; 6 7
   1266    vextracti128        xm6, m4, 1      ; c d
   1267    vextracti128        xm7, m5, 1      ; e f
   1268    call m(idct_4x16_internal_8bpc).main
   1269    vpbroadcastd         m9, [pw_2048] ; scale factor applied by .pass2_end
           ; Merge the eight xmm results into four ymm registers of 4 rows.
   1270    vinserti128          m0, m0, xm1, 1 ; 0 1   3 2
   1271    vinserti128          m1, m2, xm3, 1 ; 4 5   7 6
   1272    vinserti128          m2, m4, xm5, 1 ; 8 9   b a
   1273    vinserti128          m3, m6, xm7, 1 ; c d   f e
   1274    vpbroadcastd         m8, [pixel_10bpc_max] ; upper clamp bound used by .write_4x4
   1275    call .pass2_end
   1276    RET
   1277 ALIGN function_align
        ; .pass1_main: in m1/m3 and m5/m7 = two input pairs; rotates each pair
        ; with the 1567/3784 constants (t2/t3 per the comments). Leaves
        ; pd_1448 in m6 for the caller; m0, m2, m4, m8, m9 are overwritten.
        ; NOTE(review): ITX_MULSUB_2D is defined outside this chunk.
   1278 .pass1_main:
   1279    vpbroadcastd         m4, [pd_3784]
   1280    vpbroadcastd         m8, [pd_1567]
   1281    vpbroadcastd         m9, [pd_2048]
   1282    vpbroadcastd         m6, [pd_1448]
   1283    ITX_MULSUB_2D         1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l
   1284    ITX_MULSUB_2D         5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h
   1285    ret
   1286 ALIGN function_align
        ; .pass1_main2: in m0/m2 and m4/m6 = scaled even inputs, m1/m3 and
        ; m5/m7 = rotated odd terms from .pass1_main, m10 = rounding term.
        ; Produces the DCT4 outputs in m0-m3 (low half) and m4-m7 (high half).
   1287 .pass1_main2:
   1288    paddd                m0, m10
   1289    paddd                m4, m10
   1290    paddd                m8, m0, m2
   1291    psubd                m0, m2
   1292    paddd                m9, m4, m6
   1293    psubd                m4, m6
   1294    REPX      {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
   1295    psubd                m2, m0, m1
   1296    paddd                m1, m0
   1297    psubd                m6, m4, m5
   1298    paddd                m5, m4
   1299    paddd                m0, m8, m3
   1300    psubd                m3, m8, m3
   1301    paddd                m4, m9, m7
   1302    psubd                m7, m9, m7
   1303    ret
   1304 ALIGN function_align
        ; .pass2_end: in m0-m3 = four groups of 4 rows, m9 = pmulhrsw scale,
        ; m8 = pixel maximum. Scales each group and writes it with .write_4x4,
        ; which advances dstq and cq itself. Sets r6 = 3*stride and m7 = 0.
   1305 .pass2_end:
   1306    lea                  r6, [strideq*3]
   1307    pxor                 m7, m7
   1308    pmulhrsw             m0, m9
   1309    call .write_4x4
   1310    pmulhrsw             m0, m1, m9
   1311    call .write_4x4
   1312    pmulhrsw             m0, m2, m9
   1313    call .write_4x4
   1314    pmulhrsw             m0, m3, m9
   1315    call .write_4x4
   1316    ret
   1317 ALIGN function_align
        ; .write_4x4: in m0 = residual for rows 0 1 | 3 2, m7 = 0, m8 = pixel
        ; maximum, r6 = 3*stride. Adds m0 to four dst rows, clamps to
        ; [0, m8], stores, clears 64 bytes of coefficients, then advances cq
        ; by 64 and dstq by four rows. Overwrites m4-m6.
   1318 .write_4x4:
   1319    movq                xm4, [dstq+strideq*0]
   1320    movhps              xm4, [dstq+strideq*1]
   1321    vpbroadcastq         m5, [dstq+strideq*2]
   1322    vpbroadcastq         m6, [dstq+r6       ]
   1323    mova          [cq+32*0], m7
   1324    mova          [cq+32*1], m7
   1325    add                  cq, 32*2
   1326    vpblendd             m4, m5, 0xc0 ; row 2 into dwords 6-7
   1327    vpblendd             m4, m6, 0x30 ; row 3 into dwords 4-5
   1328    paddw                m4, m0
   1329    pmaxsw               m4, m7
   1330    pminsw               m4, m8
   1331    vextracti128        xm5, m4, 1
   1332    movq   [dstq+strideq*0], xm4
   1333    movhps [dstq+strideq*1], xm4
   1334    movhps [dstq+strideq*2], xm5
   1335    movq   [dstq+r6       ], xm5
   1336    lea                dstq, [dstq+strideq*4]
   1337    ret
   1338 
   ; Instantiate the 4x16 transforms whose first type is adst (bitdepth
   ; defaults to 10 in INV_TXFM_4X16_FN).
   1339 INV_TXFM_4X16_FN adst, dct
   1340 INV_TXFM_4X16_FN adst, adst
   1341 INV_TXFM_4X16_FN adst, flipadst
   1342 INV_TXFM_4X16_FN adst, identity
   1343 
   ; iadst_4x16_internal_10bpc: ADST worker for 4x16 blocks at 10 bpc.
   ; Pass 1 delegates to iadst_16x4_internal_10bpc (.main / .main_end), shifts
   ; the results by 13 into m0-m7 and leaves through `jmp tx2q`. Pass 2
   ; narrows to 16-bit, runs the 8 bpc core and stores four rows at a time.
   ; Local helpers: .write_4x4, .pass2_main (also called by
   ; iflipadst_4x16_internal_10bpc) and .main/.main2, a 32-bit ADST over
   ; inputs 0..15.
   ; NOTE(review): .main/.main2 use m11-m13 although this cglobal declares 11
   ; vector registers; no caller is visible in this chunk - presumably they are
   ; only reached from functions that declare enough registers. Confirm.
   1344 cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
   1345    call m(iadst_16x4_internal_10bpc).main
   1346    vpbroadcastd         m6, [pd_6144] ; constant handed to .main_end in m6 (presumably its rounding term)
   1347    call m(iadst_16x4_internal_10bpc).main_end
           ; >>13 while collecting the results from m4, m5, m2, m3, m8, m9,
           ; m6, m7. m4 and m5 are read before they are overwritten.
   1348    psrad                m0, m4, 13
   1349    psrad                m1, m5, 13
   1350    psrad                m2, 13
   1351    psrad                m3, 13
   1352    psrad                m4, m8, 13
   1353    psrad                m5, m9, 13
   1354    psrad                m6, 13
   1355    psrad                m7, 13
   1356    jmp                tx2q
   1357 .pass2:
   1358    call .pass2_main
   1359    vpbroadcastd         m5, [pw_2048]
   1360    vpbroadcastd         m8, [pixel_10bpc_max] ; upper clamp bound used by .write_4x4
   1361    lea                  r6, [strideq*3]
   1362    vpblendd             m4, m3, m0, 0xcc ; -out3   out0   out2  -out1
   1363    pshufd               m2, m2, q1032    ; -out11  out8   out10 -out9
   1364    vpblendd             m3, m0, 0x33     ; -out15  out12  out14 -out13
   1365    pxor                 m7, m7
           ; m9 = 0 - pw_2048 everywhere, then the two middle qwords are
           ; replaced by +2048, so the pmulhrsw below scales and removes the
           ; minus signs noted above in one step.
   1366    psubw                m9, m7, m5
   1367    vpblendd             m9, m5, 0x3c     ; -2048   2048   2048  -2048
   1368    pmulhrsw             m0, m4, m9
   1369    call .write_4x4
   1370    pmulhrsw             m0, m1, m9
   1371    call .write_4x4
   1372    pmulhrsw             m0, m2, m9
   1373    call .write_4x4
   1374    pmulhrsw             m0, m3, m9
   1375    call .write_4x4
   1376    RET
   1377 ALIGN function_align
        ; .write_4x4: in m0 = residual for rows 3 0 | 2 1, m7 = 0, m8 = pixel
        ; maximum, r6 = 3*stride. Adds m0 to four dst rows, clamps to
        ; [0, m8], stores, clears 64 bytes of coefficients, then advances cq
        ; by 64 and dstq by four rows. Overwrites m4-m6.
   1378 .write_4x4:
   1379    movq                xm4, [dstq+r6       ]
   1380    movhps              xm4, [dstq+strideq*0]
   1381    vpbroadcastq         m5, [dstq+strideq*1]
   1382    vpbroadcastq         m6, [dstq+strideq*2]
   1383    mova          [cq+32*0], m7
   1384    mova          [cq+32*1], m7
   1385    add                  cq, 32*2
   1386    vpblendd             m4, m5, 0xc0 ; row 1 into dwords 6-7
   1387    vpblendd             m4, m6, 0x30 ; row 2 into dwords 4-5
   1388    paddw                m4, m0
   1389    pmaxsw               m4, m7
   1390    pminsw               m4, m8
   1391    vextracti128        xm5, m4, 1
   1392    movhps [dstq+strideq*0], xm4
   1393    movhps [dstq+strideq*1], xm5
   1394    movq   [dstq+strideq*2], xm5
   1395    movq   [dstq+r6       ], xm4
   1396    lea                dstq, [dstq+strideq*4]
   1397    ret
   1398 ALIGN function_align
        ; .pass2_main: narrow m0-m7 to 16-bit, transpose into the layout the
        ; 8 bpc routine takes, call it, then finish the two middle output
        ; groups (m1, m2) with a pw_2896x8 multiply. m0 and m3 are left for
        ; the caller to blend.
   1399 .pass2_main:
   1400    packssdw             m0, m4
   1401    packssdw             m1, m5
   1402    packssdw             m2, m6
   1403    packssdw             m3, m7
   1404    lea                  r6, [deint_shuf+128] ; NOTE(review): presumably the constant base the 8bpc routine addresses from - confirm
   1405    punpcklwd            m4, m2, m3
   1406    punpckhwd            m2, m3
   1407    punpckhwd            m5, m0, m1
   1408    punpcklwd            m0, m1
   1409    punpckhdq            m1, m0, m4
   1410    punpckldq            m0, m4
   1411    punpckldq            m4, m5, m2
   1412    punpckhdq            m5, m2
   1413    vpblendd             m3, m0, m1, 0x33
   1414    vpblendd             m0, m1, 0xcc
   1415    shufpd               m2, m5, m4, 0x05
   1416    shufpd               m4, m5, 0x05
   1417    vperm2i128           m1, m0, m3, 0x31 ; 4 7   6 5
   1418    vinserti128          m0, xm3, 1       ; 0 3   2 1
   1419    vperm2i128           m3, m2, m4, 0x31 ; c f   e d ; ????
   1420    vinserti128          m2, xm4, 1       ; b 8   9 a
   1421    call m(iadst_4x16_internal_8bpc).main2
   1422    vpbroadcastd         m5, [pw_2896x8]
   1423    paddsw               m1, m2, m4
   1424    psubsw               m2, m4
   1425    pmulhrsw             m1, m5           ; -out7   out4   out6  -out5
   1426    pmulhrsw             m2, m5           ;  out8  -out11 -out9   out10
   1427    ret
   1428 ALIGN function_align
        ; .main: 32-bit ADST over inputs 0..15, read as 16-byte rows from cq
        ; and paired two per register (see the shufpd comments). .main2 skips
        ; the loads and expects the pairs in m0-m7. Both need the clamp bounds
        ; in m12/m13. Results are labelled at the end of the routine.
   1429 .main:
   1430    vbroadcasti128       m0, [cq+16* 0]
   1431    vbroadcasti128       m4, [cq+16* 2]
   1432    vbroadcasti128       m1, [cq+16*15]
   1433    vbroadcasti128       m5, [cq+16*13]
   1434    vbroadcasti128       m2, [cq+16* 4]
   1435    vbroadcasti128       m6, [cq+16* 6]
   1436    vbroadcasti128       m3, [cq+16*11]
   1437    vbroadcasti128       m7, [cq+16* 9]
   1438    shufpd               m0, m4, 0x0c ;  0  2
   1439    shufpd               m1, m5, 0x0c ; 15 13
   1440    shufpd               m2, m6, 0x0c ;  4  6
   1441    shufpd               m3, m7, 0x0c ; 11  9
   1442    vbroadcasti128       m4, [cq+16* 8]
   1443    vbroadcasti128       m6, [cq+16*10]
   1444    vbroadcasti128       m5, [cq+16* 7]
   1445    vbroadcasti128       m7, [cq+16* 5]
   1446    shufpd               m4, m6, 0x0c ;  8 10
   1447    shufpd               m5, m7, 0x0c ;  7  5
   1448    vbroadcasti128       m6, [cq+16*12]
   1449    vbroadcasti128       m7, [cq+16*14]
   1450    shufpd               m6, m7, 0x0c ; 12 14
   1451    vbroadcasti128       m7, [cq+16* 3]
   1452    vbroadcasti128       m8, [cq+16* 1]
   1453    shufpd               m7, m8, 0x0c ;  3  1
   1454 .main2:
   1455    ; expects: m12 = clip_min   m13 = clip_max
   1456    vpbroadcastd        m11, [pd_2048] ; rounding term for the ITX_MULSUB_2D calls and the final stage
   1457    ITX_MULSUB_2D         1, 0, 8, 9, 10, 11,  201_995,  4091_3973, 1
   1458    ITX_MULSUB_2D         3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1
   1459    ITX_MULSUB_2D         5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1
   1460    ITX_MULSUB_2D         7, 6, 8, 9, 10, 11, 3857_4052, 1380_601,  1
   1461    psubd                m8, m0, m4 ; t8a  t10a
   1462    paddd                m0, m4     ; t0a  t2a
   1463    psubd                m4, m1, m5 ; t9a  t11a
   1464    paddd                m1, m5     ; t1a  t3a
   1465    psubd                m5, m2, m6 ; t12a t14a
   1466    paddd                m2, m6     ; t4a  t6a
   1467    psubd                m6, m3, m7 ; t13a t15a
   1468    paddd                m3, m7     ; t5a  t7a
   1469    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8 ; clamp intermediates to [m12, m13]
   1470    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8
   1471    ITX_MULSUB_2D         8, 4, 7, 9, 10, 11,  799_3406, 4017_2276, 1
   1472    ITX_MULSUB_2D         6, 5, 7, 9, 10, 11, 4017_2276, 10,        1
   1473    psubd                m7, m0, m2 ; t4   t6
   1474    paddd                m0, m2     ; t0   t2
   1475    psubd                m2, m1, m3 ; t5   t7
   1476    paddd                m1, m3     ; t1   t3
   1477    psubd                m3, m4, m6 ; t12a t14a
   1478    paddd                m4, m6     ; t8a  t10a
   1479    psubd                m6, m8, m5 ; t13a t15a
   1480    paddd                m8, m5     ; t9a  t11a
   1481    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8
   1482    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8
   1483    punpcklqdq           m5, m3, m7 ; t12a t4
   1484    punpckhqdq           m3, m7     ; t14a t6
   1485    punpckhqdq           m7, m6, m2 ; t15a t7
   1486    punpcklqdq           m6, m2     ; t13a t5
   1487    ITX_MULSUB_2D         7, 3, 2, 9, 10, 11, 3784, 1567
   1488    ITX_MULSUB_2D         5, 6, 2, 9, 10, 11, 1567, 10
   1489    vpbroadcastd        m10, [pd_2896]
   1490    vbroadcasti128       m9, [pw_2048_m2048] ; + + - -
   1491    punpckhqdq           m2, m4, m0 ; t10a t2
   1492    punpcklqdq           m4, m0     ; t8a  t0
   1493    punpckhqdq           m0, m8, m1 ; t11a t3
   1494    punpcklqdq           m8, m1     ; t9a  t1
   1495    paddd                m1, m6, m7 ; out2   -out3
   1496    psubd                m6, m7     ; t14a t6
   1497    paddd                m7, m5, m3 ; -out13  out12
   1498    psubd                m5, m3     ; t15a t7
   1499    psubd                m3, m8, m0 ; t11  t3a
   1500    paddd                m8, m0     ; out14  -out15
   1501    paddd                m0, m4, m2 ; -out1   out0
   1502    psubd                m4, m2     ; t10  t2a
   1503    REPX    {pmaxsd x, m12}, m6, m5, m3, m4
   1504    REPX    {pminsd x, m13}, m6, m5, m3, m4
   1505    REPX    {pmulld x, m10}, m6, m5, m3, m4
   1506    paddd                m6, m11
   1507    paddd                m4, m11
   1508    paddd                m2, m6, m5 ; -out5   out4
   1509    psubd                m6, m5     ;  out10 -out11
   1510    psubd                m5, m4, m3 ; -out9   out8
   1511    paddd                m3, m4     ;  out6  -out7
   1512    REPX     {psrad  x, 12}, m2, m3, m5, m6
           ; Apply the "+ + - -" sign pattern to the registers whose high half
           ; is negated, then its qword-swapped form to those whose low half
           ; is negated, as labelled above.
   1513    REPX     {psignd x, m9}, m1, m8, m3, m6
   1514    pshufd               m9, m9, q1032
   1515    REPX     {psignd x, m9}, m0, m7, m2, m5
   1516    ret
   1517 
   1518 INV_TXFM_4X16_FN flipadst, dct
   1519 INV_TXFM_4X16_FN flipadst, adst
   1520 INV_TXFM_4X16_FN flipadst, flipadst
   1521 INV_TXFM_4X16_FN flipadst, identity
   1522 
        ; iflipadst_4x16_internal_10bpc: flipadst stage for 4x16 blocks, 10 bpc build.
        ; NOTE(review): INV_TXFM_4X16_FN is defined outside this excerpt; the four
        ; invocations above presumably create the flipadst_* entry points that
        ; dispatch into this routine - confirm against the macro definition.
        ; Pass 1 ends with an indirect jump through tx2q; pass 2 adds the result to
        ; the rows at dstq and zeroes the coefficient buffer at cq while doing so.
   1523 cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
   1524 .pass1:
           ; Pass 1 reuses the helpers of iadst_16x4_internal_10bpc. m6 is loaded
           ; with pd_6144 between the two calls (presumably the rounding bias used
           ; by .main_end - that helper is not visible here).
   1525    call m(iadst_16x4_internal_10bpc).main
   1526    vpbroadcastd         m6, [pd_6144]
   1527    call m(iadst_16x4_internal_10bpc).main_end
           ; Shift every result right by 13 and renumber the registers on the way:
           ; m3,m2,m5,m4,m7,m6,m9,m8 become m0..m7.
   1528    psrad                m0, m3, 13
   1529    psrad                m1, m2, 13
   1530    psrad                m2, m5, 13
   1531    psrad                m3, m4, 13
   1532    psrad                m4, m7, 13
   1533    psrad                m5, m6, 13
   1534    psrad                m6, m9, 13
   1535    psrad                m7, m8, 13
   1536    jmp                tx2q
   1537 .pass2:
           ; The transform itself is done by .pass2_main of the non-flipped 4x16
           ; ADST (not visible here). What follows only regroups its outputs and
           ; sets up the registers that .write_4x4 expects:
           ;   m8 = pixel_10bpc_max, r6 = strideq*3, m7 = 0.
   1538    call m(iadst_4x16_internal_10bpc).pass2_main
   1539    vpbroadcastd         m5, [pw_2048]
   1540    vpbroadcastd         m8, [pixel_10bpc_max]
   1541    lea                  r6, [strideq*3]
   1542    vpblendd             m4, m3, m0, 0x33 ; -out0   out3   out1  -out2
   1543    pshufd               m2, m2, q1032    ; -out11  out8   out10 -out9
   1544    vpblendd             m3, m0, 0xcc     ; -out12  out15  out13 -out14
   1545    pxor                 m7, m7
           ; m9 = 0 - pw_2048, then dwords 2..5 are replaced by +pw_2048. The sign
           ; pattern matches the negated lanes listed in the comments above, so the
           ; pmulhrsw below both scales and restores the sign.
   1546    psubw                m9, m7, m5
   1547    vpblendd             m9, m5, 0x3c     ; -2048   2048   2048  -2048
           ; Four groups of four rows, emitted from m4, m2, m1, m3 in that order.
   1548    pmulhrsw             m0, m4, m9
   1549    call .write_4x4
   1550    pmulhrsw             m0, m2, m9
   1551    call .write_4x4
   1552    pmulhrsw             m0, m1, m9
   1553    call .write_4x4
   1554    pmulhrsw             m0, m3, m9
   1555    call .write_4x4
   1556    RET
   1557 ALIGN function_align
        ; .write_4x4: add one group of four rows to dst and clear 64 bytes of cq.
        ;   in:  m0 = 16-bit values to add, qword order row0, row3, row1, row2
        ;        m7 = zero (lower clamp and the value stored to cq)
        ;        m8 = upper clamp, r6 = strideq*3
        ;   out: dstq advanced by four rows, cq advanced by 32*2 bytes
        ;   clobbers: m4, m5, m6
   1558 .write_4x4:
           ; Gather four 8-byte rows into m4: rows 0 and 3 fill the low lane,
           ; rows 1 and 2 are blended into the high lane (dwords 4-5 and 6-7).
   1559    movq                xm4, [dstq+strideq*0]
   1560    movhps              xm4, [dstq+r6       ]
   1561    vpbroadcastq         m5, [dstq+strideq*1]
   1562    vpbroadcastq         m6, [dstq+strideq*2]
           ; Zero the coefficients this group consumed.
   1563    mova          [cq+32*0], m7
   1564    mova          [cq+32*1], m7
   1565    add                  cq, 32*2
   1566    vpblendd             m4, m5, 0x30
   1567    vpblendd             m4, m6, 0xc0
           ; dst + residual, clamped to [0, m8].
   1568    paddw                m4, m0
   1569    pmaxsw               m4, m7
   1570    pminsw               m4, m8
           ; Scatter the rows back to where they were loaded from.
   1571    vextracti128        xm5, m4, 1
   1572    movq   [dstq+strideq*0], xm4
   1573    movq   [dstq+strideq*1], xm5
   1574    movhps [dstq+strideq*2], xm5
   1575    movhps [dstq+r6       ], xm4
   1576    lea                dstq, [dstq+strideq*4]
   1577    ret
   1578 
   1579 INV_TXFM_4X16_FN identity, dct
   1580 INV_TXFM_4X16_FN identity, adst
   1581 INV_TXFM_4X16_FN identity, flipadst
   1582 INV_TXFM_4X16_FN identity, identity
   1583 
        ; iidentity_4x16_internal_10bpc: identity stage for 4x16 blocks, 10 bpc build.
        ; NOTE(review): INV_TXFM_4X16_FN is defined outside this excerpt; the
        ; invocations above presumably create the identity_* entry points.
        ; The local routine .pass2_end is also called by the 12 bpc identity routine
        ; in this file, which supplies its own m8 (multiplier) and m4 (pixel max).
   1584 cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
           ; Pass 1: each 32-bit coefficient becomes (x * [pd_5793] + [pd_6144]) >> 13.
           ; The eight 32-byte vectors at cq land in m0,m4,m1,m5,m2,m6,m3,m7.
   1585    vpbroadcastd         m7, [pd_5793]
   1586    pmulld               m0, m7, [cq+32*0]
   1587    pmulld               m4, m7, [cq+32*1]
   1588    pmulld               m1, m7, [cq+32*2]
   1589    pmulld               m5, m7, [cq+32*3]
   1590    pmulld               m2, m7, [cq+32*4]
   1591    pmulld               m6, m7, [cq+32*5]
   1592    pmulld               m3, m7, [cq+32*6]
   1593    pmulld               m7,     [cq+32*7] ; last use of the multiplier, overwritten in place
   1594    vpbroadcastd         m8, [pd_6144]
   1595    REPX      {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7
   1596    REPX      {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7
   1597    jmp                tx2q
   1598 .pass2:
           ; Narrow to 16 bits with signed saturation (m0:m4, m1:m5, m2:m6, m3:m7).
   1599    packssdw             m0, m4
   1600    packssdw             m1, m5
   1601    packssdw             m2, m6
   1602    packssdw             m3, m7
   1603    vpbroadcastd         m7, [pw_1697x16]
   1604    vpbroadcastd         m8, [pw_2048]
           ; y = 2*x + pmulhrsw(x, pw_1697x16), using saturating adds.
   1605    pmulhrsw             m4, m7, m0
   1606    pmulhrsw             m5, m7, m1
   1607    pmulhrsw             m6, m7, m2
   1608    pmulhrsw             m7, m3
   1609    REPX      {paddsw x, x}, m0, m1, m2, m3
   1610    paddsw               m0, m4
   1611    paddsw               m1, m5
   1612    paddsw               m2, m6
   1613    paddsw               m3, m7
   1614    vpbroadcastd         m4, [pixel_10bpc_max]
   1615    call .pass2_end
   1616    RET
   1617 ALIGN function_align
        ; .pass2_end: regroup four packed-word vectors, scale them and add to dst.
        ;   in:  m0..m3 = packed 16-bit values
        ;        m8 = word multiplier applied with pmulhrsw
        ;        m4 = upper pixel clamp
        ;   clobbers: m0..m3, m5..m7, r6; dstq and cq are advanced by the writers
   1618 .pass2_end:
   1619    punpckhwd            m7, m0, m1
   1620    punpcklwd            m0, m1
   1621    punpckhwd            m1, m2, m3
   1622    punpcklwd            m2, m3
   1623    lea                  r6, [strideq*5] ; row +5 offset used by .write_2x4x2
   1624    pxor                 m3, m3          ; zero: lower clamp and cq fill value
   1625    punpckhdq            m5, m0, m2 ; 2 3   6 7
   1626    punpckldq            m0, m2     ; 0 1   4 5
   1627    punpckldq            m6, m7, m1 ; 8 9   c d
   1628    punpckhdq            m7, m1     ; a b   e f
           ; Each call handles rows +0,+1,+4,+5 and then moves dstq down two rows, so
           ; two calls cover eight rows. The explicit +4 rows in between skips the
           ; part of the first eight rows that was already written.
   1629    pmulhrsw             m0, m8
   1630    call .write_2x4x2
   1631    pmulhrsw             m0, m5, m8
   1632    call .write_2x4x2
   1633    pmulhrsw             m0, m6, m8
   1634    lea                dstq, [dstq+strideq*4]
   1635    call .write_2x4x2
   1636    pmulhrsw             m0, m7, m8
   1637    call .write_2x4x2
   1638    ret
   1639 ALIGN function_align
        ; .write_2x4x2: add m0 to rows +0, +1, +4 and +5 of dst, clear 64 bytes of cq.
        ;   in:  m0 = 16-bit values to add (qword order row0, row1, row4, row5)
        ;        m3 = zero, m4 = upper clamp, r6 = strideq*5
        ;   out: dstq advanced by two rows, cq advanced by 32*2 bytes
        ;   clobbers: m1, m2
   1640 .write_2x4x2:
   1641    movq                xm1, [dstq+strideq*0]
   1642    movhps              xm1, [dstq+strideq*1]
   1643    vpbroadcastq         m2, [dstq+strideq*4]
   1644    vpblendd             m1, m2, 0x30
   1645    vpbroadcastq         m2, [dstq+r6       ]
   1646    vpblendd             m1, m2, 0xc0
   1647    mova          [cq+32*0], m3
   1648    mova          [cq+32*1], m3
   1649    add                  cq, 32*2
           ; dst + residual, clamped to [0, m4].
   1650    paddw                m1, m0
   1651    pmaxsw               m1, m3
   1652    pminsw               m1, m4
   1653    vextracti128        xm2, m1, 1
   1654    movq   [dstq+strideq*0], xm1
   1655    movhps [dstq+strideq*1], xm1
   1656    movq   [dstq+strideq*4], xm2
   1657    movhps [dstq+r6       ], xm2
   1658    lea                dstq, [dstq+strideq*2]
   1659    ret
   1660 
   1661 INV_TXFM_4X16_FN dct, dct,      12
   1662 INV_TXFM_4X16_FN dct, identity, 12
   1663 INV_TXFM_4X16_FN dct, adst,     12
   1664 INV_TXFM_4X16_FN dct, flipadst, 12
   1665 
        ; idct_4x16_internal_12bpc: DCT stage for 4x16 blocks, 12 bpc build.
        ; Pass 1 is shared with the 10 bpc routine (plain jump to its .pass1).
        ; Pass 2 keeps 32-bit precision: it regroups the pass-1 output, clamps it
        ; to the clip_18b_min/clip_18b_max range and runs the 16x4 DCT helpers on it
        ; before narrowing to 16 bits.
        ; NOTE(review): INV_TXFM_4X16_FN and the idct_16x4 helpers are defined
        ; outside this excerpt; their register contracts are not verified here.
   1666 cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
   1667    jmp m(idct_4x16_internal_10bpc).pass1
   1668 .pass2:
           ; dword / qword unpack network; the trailing comments give the input
           ; indices that end up paired in each register.
   1669    punpckldq            m8, m0, m1
   1670    punpckhdq            m0, m1
   1671    punpckldq            m9, m2, m3
   1672    punpckhdq            m2, m3
   1673    punpckldq            m1, m4, m5
   1674    punpckhdq            m4, m5
   1675    punpckldq            m3, m6, m7
   1676    punpckhdq            m6, m7
   1677    punpcklqdq           m5, m0, m2         ;  2  6
   1678    punpckhqdq          m12, m0, m2         ;  3  7
   1679    punpcklqdq           m0, m8, m9         ;  0  4
   1680    punpckhqdq          m10, m8, m9         ;  1  5
   1681    punpcklqdq           m2, m1, m3         ;  8 12
   1682    punpckhqdq          m13, m1, m3         ;  9 13
   1683    punpcklqdq           m9, m4, m6         ; 10 14
   1684    punpckhqdq           m4, m6             ; 11 15
   1685    vperm2i128           m1,  m5,  m9, 0x20 ;  2 10
   1686    vperm2i128           m3,  m9,  m5, 0x31 ; 14  6
   1687    vpermq              m11,  m4, q1302     ; 15 11
   1688    ; interleave
   1689    REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10
           ; m8/m9 are free again after the unpacks and now hold the clamp bounds.
   1690    vpbroadcastd         m8, [clip_18b_min]
   1691    vpbroadcastd         m9, [clip_18b_max]
   1692    REPX     {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13
   1693    REPX     {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13
   1694    call m(idct_16x4_internal_10bpc).pass1_main
           ; m6/m5 are built from m12/m13 only after the first helper returns.
   1695    vpermq               m6, m12, q1302 ;  7  3
   1696    vpermq               m5, m13, q3120 ;  9 13
   1697    call m(idct_16x4_internal_10bpc).pass1_main2
   1698    call m(idct_16x4_internal_10bpc).pass1_main3
           ; Drop three bits, then narrow pairs of registers to signed 16-bit words.
   1699    REPX       {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
   1700    packssdw             m0, m1
   1701    packssdw             m1, m2, m3
   1702    packssdw             m2, m4, m5
   1703    packssdw             m3, m6, m7
           ; Dword permutation taken from the idct16_12_shuf table.
   1704    mova                 m4, [idct16_12_shuf]
   1705    REPX  {vpermd x, m4, x}, m0, m1, m2, m3
           ; Hand over to the shared 10 bpc tail with 12 bpc constants in m9 / m8.
   1706    vpbroadcastd         m9, [pw_16384]
   1707    vpbroadcastd         m8, [pixel_12bpc_max]
   1708    call m(idct_4x16_internal_10bpc).pass2_end
   1709    RET
   1710 
   1711 INV_TXFM_4X16_FN adst, dct,      12
   1712 INV_TXFM_4X16_FN adst, adst,     12
   1713 INV_TXFM_4X16_FN adst, flipadst, 12
   1714 INV_TXFM_4X16_FN adst, identity, 12
   1715 
        ; iadst_4x16_internal_12bpc: ADST stage for 4x16 blocks, 12 bpc build.
        ; The local routines .pass2_end, .transpose_16x4 and .main_pass1 are also
        ; used by the 12 bpc flipadst routine in this file.
        ; NOTE(review): INV_TXFM_4X16_FN and the iadst_*_10bpc helpers called below
        ; are defined outside this excerpt.
   1716 cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
           ; Pass 1: .main_pass1 leaves biased results in m4,m5,m2,m3,m8,m9,m6,m7;
           ; shift each right by 12 and renumber them to m0..m7.
   1717    call .main_pass1
   1718    psrad                m0, m4, 12
   1719    psrad                m1, m5, 12
   1720    psrad                m2, 12
   1721    psrad                m3, 12
   1722    psrad                m4, m8, 12
   1723    psrad                m5, m9, 12
   1724    psrad                m6, 12
   1725    psrad                m7, 12
   1726    jmp                tx2q
   1727 .pass2:
           ; Clamp the pass-1 output to [clip_18b_min, clip_18b_max]. The bounds stay
           ; in m12/m13 across the calls below.
   1728    vpbroadcastd        m12, [clip_18b_min]
   1729    vpbroadcastd        m13, [clip_18b_max]
   1730    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   1731    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   1732    call .transpose_16x4
   1733    call m(iadst_4x16_internal_10bpc).main2
           ; Regroup the results into m0..m7, swapping the qwords of each lane
           ; (pshufd q1032) where needed, and drop three bits from every register.
           ; m5 and m7 are shifted while being moved, the rest by the REPX.
   1734    pshufd               m4, m5, q1032
   1735    psrad                m5, m6, 3
   1736    pshufd               m6, m7, q1032
   1737    psrad                m7, m8, 3
   1738    REPX {pshufd x, x, q1032}, m0, m2
   1739    REPX       {psrad x, 3}, m0, m1, m2, m3, m4, m6
        ; .pass2_end: shared tail. Expects eight 32-bit result registers m0..m7.
   1740 .pass2_end:
   1741    packssdw             m0, m1
   1742    packssdw             m1, m2, m3
   1743    packssdw             m2, m4, m5
   1744    packssdw             m3, m6, m7
   1745    mova                 m4, [iadst16_12_shuf]
   1746    REPX  {vpermd x, m4, x}, m0, m1, m2, m3
           ; Registers set up for .write_4x4 of the 10 bpc ADST (not visible here):
           ; m8 = pixel max, r6 = strideq*3, m7 = zero. m9 is the pmulhrsw scale.
   1747    vpbroadcastd         m9, [pw_16384]
   1748    vpbroadcastd         m8, [pixel_12bpc_max]
   1749    lea                  r6, [strideq*3]
   1750    pxor                 m7, m7
   1751    pmulhrsw             m0, m9
   1752    call m(iadst_4x16_internal_10bpc).write_4x4
   1753    pmulhrsw             m0, m9, m1
   1754    call m(iadst_4x16_internal_10bpc).write_4x4
   1755    pmulhrsw             m0, m9, m2
   1756    call m(iadst_4x16_internal_10bpc).write_4x4
   1757    pmulhrsw             m0, m9, m3
   1758    call m(iadst_4x16_internal_10bpc).write_4x4
   1759    RET
   1760 ALIGN function_align
        ; .transpose_16x4: regroup m0..m7 in place; m8..m11 are used as scratch.
        ; The trailing comments give the index pair held by each output register.
   1761 .transpose_16x4:
   1762    ; transpose & interleave
   1763    punpckldq            m8, m0, m1
   1764    punpckhdq            m0, m1
   1765    punpckldq            m9, m2, m3
   1766    punpckhdq            m2, m3
   1767    punpckldq            m1, m4, m5
   1768    punpckhdq            m4, m5
   1769    punpckldq            m3, m6, m7
   1770    punpckhdq            m6, m7
   1771    punpcklqdq          m10, m8, m0
   1772    punpckhqdq           m0, m8
   1773    punpcklqdq          m11, m9, m2
   1774    punpckhqdq           m2, m9
   1775    punpcklqdq           m8, m1, m4
   1776    punpckhqdq           m4, m1
   1777    punpcklqdq           m9, m3, m6
   1778    punpckhqdq           m6, m3
   1779    vperm2i128           m5,  m0,  m2, 0x31   ;  7  5
   1780    vperm2i128           m7,  m0,  m2, 0x20   ;  3  1
   1781    vperm2i128           m0, m10, m11, 0x20   ;  0  2
   1782    vperm2i128           m2, m10, m11, 0x31   ;  4  6
   1783    vperm2i128           m1,  m4,  m6, 0x31   ; 15 13
   1784    vperm2i128           m3,  m4,  m6, 0x20   ; 11  9
   1785    vperm2i128           m4,  m8,  m9, 0x20   ;  8 10
   1786    vperm2i128           m6,  m8,  m9, 0x31   ; 12 14
   1787    ret
   1788 ALIGN function_align
        ; .main_pass1: shared pass-1 core for the 12 bpc adst / flipadst routines.
        ; Runs the 16x4 ADST core, combines its outputs, halves every value and
        ; adds the pd_3072 bias. Results are left in m2..m9 for the caller to
        ; shift. m10 is scratch.
   1789 .main_pass1:
   1790    call m(iadst_16x4_internal_10bpc).main
   1791    vpbroadcastd         m6, [pd_3072]
   1792    paddd               m10, m4, m5
   1793    psubd                m4, m3
   1794    psubd                m5, m3
   1795    paddd                m3, m10
   1796    psubd                m8, m7, m1
   1797    paddd                m7, m9
   1798    psubd                m9, m1
   1799    paddd                m7, m1
   1800    REPX      {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7
   1801    REPX      {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7
           ; The eighth value is formed in m6 itself: bias + m0. m6 no longer holds
           ; the plain bias after this line.
   1802    paddd                m6, m0
   1803    ret
   1804 
   1805 INV_TXFM_4X16_FN flipadst, dct,      12
   1806 INV_TXFM_4X16_FN flipadst, adst,     12
   1807 INV_TXFM_4X16_FN flipadst, flipadst, 12
   1808 INV_TXFM_4X16_FN flipadst, identity, 12
   1809 
        ; iflipadst_4x16_internal_12bpc: flipadst stage for 4x16 blocks, 12 bpc build.
        ; Both passes share their arithmetic with iadst_4x16_internal_12bpc and differ
        ; only in how the result registers are renumbered afterwards.
        ; NOTE(review): INV_TXFM_4X16_FN is defined outside this excerpt.
   1810 cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
           ; Pass 1: same core as the adst routine; the >>12 step stores the results
           ; as m3,m2,m5,m4,m7,m6,m9,m8 -> m0..m7.
   1811    call m(iadst_4x16_internal_12bpc).main_pass1
   1812    psrad                m0, m3, 12
   1813    psrad                m1, m2, 12
   1814    psrad                m2, m5, 12
   1815    psrad                m3, m4, 12
   1816    psrad                m4, m7, 12
   1817    psrad                m5, m6, 12
   1818    psrad                m6, m9, 12
   1819    psrad                m7, m8, 12
   1820    jmp                tx2q
   1821 .pass2:
           ; Clamp to [clip_18b_min, clip_18b_max]; bounds stay in m12/m13.
   1822    vpbroadcastd        m12, [clip_18b_min]
   1823    vpbroadcastd        m13, [clip_18b_max]
   1824    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   1825    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   1826    call m(iadst_4x16_internal_12bpc).transpose_16x4
   1827    call m(iadst_4x16_internal_10bpc).main2
           ; Renumber the results for the shared tail. The statement order matters:
           ; each source register is read before a later line overwrites it.
           ; Registers moved with psrad are shifted here; those moved with pshufd
           ; (m0, m2, m4, m6) get their >>3 from the REPX below.
   1828    pshufd               m4, m3, q1032
   1829    psrad                m3, m5, 3
   1830    psrad                m5, m2, 3
   1831    pshufd               m2, m6, q1032
   1832    pshufd               m6, m1, q1032
   1833    psrad                m1, m7, 3
   1834    psrad                m7, m0, 3
   1835    pshufd               m0, m8, q1032
   1836    REPX       {psrad x, 3}, m0, m2, m4, m6
           ; Tail jump: the shared code packs, scales, writes the rows and returns.
   1837    jmp m(iadst_4x16_internal_12bpc).pass2_end
   1838 
   1839 INV_TXFM_4X16_FN identity, dct,      12
   1840 INV_TXFM_4X16_FN identity, adst,     12
   1841 INV_TXFM_4X16_FN identity, flipadst, 12
   1842 INV_TXFM_4X16_FN identity, identity, 12
   1843 
        ; iidentity_4x16_internal_12bpc: identity stage for 4x16 blocks, 12 bpc build.
        ; Pass 1 computes, for every 32-bit coefficient x loaded from cq:
        ;     (x + ((x * [pd_1697] + [pd_6144]) >> 12)) >> 1
        ; in two batches of four registers whose instructions are interleaved.
        ; NOTE(review): INV_TXFM_4X16_FN is defined outside this excerpt.
   1844 cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
   1845    vpbroadcastd         m8, [pd_1697]
   1846    mova                 m0, [cq+32*0]
   1847    mova                 m4, [cq+32*1]
   1848    mova                 m1, [cq+32*2]
   1849    mova                 m5, [cq+32*3]
   1850    vpbroadcastd         m9, [pd_6144]
           ; First batch: the products go to m2,m6,m3,m7 so that the inputs in
           ; m0,m4,m1,m5 survive for the later add.
   1851    pmulld               m2, m8, m0
   1852    pmulld               m6, m8, m4
   1853    pmulld               m3, m8, m1
   1854    pmulld               m7, m8, m5
           ; Second batch of inputs is parked in m10..m13.
   1855    mova                m10, [cq+32*4]
   1856    mova                m11, [cq+32*5]
   1857    mova                m12, [cq+32*6]
   1858    mova                m13, [cq+32*7]
   1859    REPX     {paddd  x, m9}, m2, m6, m3, m7
   1860    REPX     {psrad  x, 12}, m2, m6, m3, m7
           ; Finish batch one (x + scaled term) while reusing m2,m6,m3,m7 for the
           ; products of batch two.
   1861    paddd                m0, m2
   1862    pmulld               m2, m8, m10
   1863    paddd                m4, m6
   1864    pmulld               m6, m8, m11
   1865    paddd                m1, m3
   1866    pmulld               m3, m8, m12
   1867    paddd                m5, m7
   1868    pmulld               m7, m8, m13
   1869    REPX     {psrad  x, 1 }, m0, m4, m1, m5
   1870    REPX     {paddd  x, m9}, m2, m6, m3, m7
   1871    REPX     {psrad  x, 12}, m2, m6, m3, m7
   1872    paddd                m2, m10
   1873    paddd                m6, m11
   1874    paddd                m3, m12
   1875    paddd                m7, m13
   1876    REPX     {psrad  x, 1 }, m2, m6, m3, m7
   1877    jmp                tx2q
   1878 .pass2:
           ; Clamp to [clip_18b_min, clip_18b_max], then scale in 32-bit precision:
           ; (x * [pd_5793] + [pd_1024]) >> 14.
   1879    vpbroadcastd        m12, [clip_18b_min]
   1880    vpbroadcastd        m13, [clip_18b_max]
   1881    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   1882    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   1883    vpbroadcastd         m8, [pd_5793]
   1884    vpbroadcastd         m9, [pd_1024]
   1885    REPX     {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
   1886    REPX     {paddd  x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
   1887    REPX     {psrad  x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
           ; Narrow to 16 bits using the same register pairing as the 10 bpc pass 2.
   1888    packssdw             m0, m4
   1889    packssdw             m1, m5
   1890    packssdw             m2, m6
   1891    packssdw             m3, m7
           ; The shared tail takes its pmulhrsw multiplier in m8 and the pixel
           ; clamp in m4.
   1892    vpbroadcastd         m8, [pw_16384]
   1893    vpbroadcastd         m4, [pixel_12bpc_max]
   1894    call m(iidentity_4x16_internal_10bpc).pass2_end
   1895    RET
   1896 
   1897 %macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth
        ; Declares one 8x4 inverse-transform entry point by forwarding to INV_TXFM_FN
        ; (defined outside this excerpt) with a third argument of 0 and size 8x4.
        ; The bitdepth parameter defaults to 10.
        ; Only the dct_dct combination appends code after that:
        ;   - m2 is loaded from dconly_<bitdepth>bpc;
        ;   - the 10 bpc instance defines the local label .dconly;
        ;   - every other bit depth jumps to the 10 bpc .dconly.
   1898    INV_TXFM_FN          %1, %2, 0, 8x4, %3
   1899 %ifidn %1_%2, dct_dct
   1900    vpbroadcastd         m2, [dconly_%3bpc]
   1901 %if %3 = 10
   1902 .dconly:
           ; r6d = [cq] * 181, and the coefficient slot is overwritten with eobd.
   1903    imul                r6d, [cq], 181
   1904    mov                [cq], eobd ; 0
           ; NOTE(review): r3 is presumably the eob register (fourth named argument)
           ; and zero on this path, so this leaves 4 in r3d - the counter that
           ; .dconly_loop of the 8x8 routine decrements by 2 per iteration. Confirm.
   1905    or                  r3d, 4
           ; Two rounds of (v + 128) >> 8 with a *181 in between, then join the 8x8
           ; DC-only code at .dconly3.
   1906    add                 r6d, 128
   1907    sar                 r6d, 8
   1908    imul                r6d, 181
   1909    add                 r6d, 128
   1910    sar                 r6d, 8
   1911    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
   1912 %else
   1913    jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly
   1914 %endif
   1915 %endif
   1916 %endmacro
   1917 
   1918 INV_TXFM_8X4_FN dct, dct
   1919 INV_TXFM_8X4_FN dct, identity
   1920 INV_TXFM_8X4_FN dct, adst
   1921 INV_TXFM_8X4_FN dct, flipadst
   1922 
        ; idct_8x4_internal_10bpc: DCT stage for 8x4 blocks, 10 bpc build.
        ; m8/m9 carry the clamp bounds used inside .main. The 12 bpc routine in this
        ; file loads different bounds and then jumps straight to .pass1, which is
        ; why the bounds are set before that label.
   1923 cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
   1924    vpbroadcastd         m8, [clip_18b_min]
   1925    vpbroadcastd         m9, [clip_18b_max]
   1926 .pass1:
           ; Each 16-byte chunk of cq is broadcast to both lanes; shufpd 0x0c then
           ; keeps the low qword of both sources in the low lane and the high qword
           ; of both in the high lane, pairing two chunks per register.
   1927    vbroadcasti128       m1, [cq+16*1]
   1928    vbroadcasti128       m0, [cq+16*5]
   1929    vbroadcasti128       m2, [cq+16*3]
   1930    vbroadcasti128       m3, [cq+16*7]
   1931    vpbroadcastd         m6, [pd_2896]
   1932    shufpd               m1, m0, 0x0c ; 1 5
   1933    shufpd               m3, m2, 0x0c ; 7 3
   1934    vbroadcasti128       m0, [cq+16*0]
   1935    vbroadcasti128       m4, [cq+16*2]
   1936    vbroadcasti128       m2, [cq+16*4]
   1937    vbroadcasti128       m5, [cq+16*6]
   1938    vpbroadcastd         m7, [pd_2048]
   1939    shufpd               m0, m4, 0x0c ; 0 2
   1940    shufpd               m2, m5, 0x0c ; 4 6
           ; Pre-scale all inputs: (x * [pd_2896] + [pd_2048]) >> 12.
           ; m7 keeps pd_2048 for the rounding inside .main.
   1941    REPX {pmulld x, m6}, m1, m3, m0, m2
   1942    REPX {paddd  x, m7}, m1, m3, m0, m2
   1943    REPX {psrad  x, 12}, m1, m3, m0, m2
   1944    call .main
           ; Final butterflies on the registers returned by .main.
   1945    psubd                m3, m0, m4  ; out7 out6 (interleaved)
   1946    paddd                m0, m4      ; out0 out1 (interleaved)
   1947    paddd                m1, m2, m5  ; out3 out2 (interleaved)
   1948    psubd                m2, m5      ; out4 out5 (interleaved)
           ; Swap the qwords of each lane in m1 and m3.
   1949    pshufd               m1, m1, q1032
   1950    pshufd               m3, m3, q1032
   1951    jmp                tx2q
   1952 .pass2:
           ; Narrow the pass-1 output to 16-bit words, regroup it with the
           ; deint_shuf byte shuffle and run the packed-word 4-point DCT.
   1953    vbroadcasti128       m4, [deint_shuf]
   1954    packssdw             m0, m1
   1955    packssdw             m2, m3
   1956    vperm2i128           m1, m0, m2, 0x31
   1957    vinserti128          m0, xm2, 1
   1958    pshufb               m0, m4
   1959    pshufb               m1, m4
   1960    IDCT4_1D_PACKED_WORD  0, 1, 2, 3, 4, 7
   1961    vpermq               m0, m0, q3120 ; out0 out1
   1962    vpermq               m2, m1, q2031 ; out2 out3
           ; Rounding, clamping and the store to dst are shared with the ADST
           ; routine; its .end expects the two row pairs in m0 and m2.
   1963    jmp m(iadst_8x4_internal_10bpc).end
   1964 ALIGN function_align
        ; .main: 8-point DCT core on packed 32-bit pairs.
        ;   in:  m0 = (0 2), m2 = (4 6), m1 = (1 5), m3 = (7 3) as labelled above,
        ;        m7 = pd_2048 rounding term, m8/m9 = clamp min/max
        ;   out: m0, m2 as left by IDCT4_1D_PACKED (then clamped),
        ;        m5 = (t4 t5), m4 = (t7 t6)
        ;   clobbers: m1, m3, m6
        ; NOTE(review): ITX_MULSUB_2D and IDCT4_1D_PACKED are macros defined
        ; outside this excerpt; their exact register use is not verified here.
   1965 .main:
   1966    ITX_MULSUB_2D         1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1
   1967    IDCT4_1D_PACKED       0, 2, 4, 5, 6, 7
   1968    vpbroadcastd         m6, [pd_2896]
   1969    punpcklqdq           m4, m1, m3   ; t4a  t7a
   1970    punpckhqdq           m1, m3       ; t5a  t6a
   1971    psubd                m3, m4, m1   ; t5a  t6a
   1972    paddd                m4, m1       ; t4   t7
   1973    REPX     {pmaxsd x, m8}, m3, m4, m0, m2
   1974    REPX     {pminsd x, m9}, m3, m4, m0, m2
           ; m3 * pd_2896, then sum and difference of its two qword halves, each
           ; rounded with m7 and shifted right by 12.
   1975    pmulld               m3, m6
   1976    pshufd               m1, m3, q1032
   1977    paddd                m3, m7
   1978    psubd                m5, m3, m1
   1979    paddd                m1, m3
   1980    psrad                m5, 12
   1981    psrad                m1, 12
   1982    vpblendd             m5, m4, 0x33 ; t4   t5
   1983    punpckhqdq           m4, m1       ; t7   t6
   1984    ret
   1985 
   1986 INV_TXFM_8X4_FN adst, dct
   1987 INV_TXFM_8X4_FN adst, adst
   1988 INV_TXFM_8X4_FN adst, flipadst
   1989 INV_TXFM_8X4_FN adst, identity
   1990 
        ; iadst_8x4_internal_10bpc: ADST stage for 8x4 blocks, 10 bpc build.
        ; Several local labels are entry points for other routines in this file:
        ;   .end        - scale by pw_2048 and store (DCT and flipadst 10 bpc pass 2)
        ;   .end2       - store only; the caller supplies the pixel max in m5
        ;   .pass2_main - packing + transform helper (flipadst 10 bpc pass 2)
        ;   .main2      - bare IADST4_1D (12 bpc ADST pass 2)
        ; NOTE(review): iadst_4x8_internal_10bpc.main and the IADST4_1D macro are
        ; defined outside this excerpt.
   1991 cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
           ; Pass 1: the 4x8 ADST core does the arithmetic; the blends, the qword
           ; swap and the psignd with m6 (returned by the callee) arrange and sign
           ; the outputs as labelled.
   1992    call m(iadst_4x8_internal_10bpc).main
   1993    vpblendd             m3, m0, m4, 0x33 ; out6 out7
   1994    vpblendd             m0, m4, 0xcc     ; out0 out1
   1995    pshufd               m1, m5, q1032
   1996    psignd               m2, m6           ; out4 out5
   1997    psignd               m1, m6           ; out2 out3
   1998    jmp                tx2q
   1999 .pass2:
   2000    call .pass2_main
   2001    vpermq               m0, m0, q3120 ; out0 out1
   2002    vpermq               m2, m1, q3120 ; out2 out3
        ; .end: in m0 = rows 0/1, m2 = rows 2/3 as packed 16-bit words.
   2003 .end:
           ; pmulhrsw by pw_2048 gives (x + 8) >> 4; the second result is written
           ; over the constant in m1.
   2004    vpbroadcastd         m1, [pw_2048]
   2005    pmulhrsw             m0, m1
   2006    pmulhrsw             m1, m2
   2007    vpbroadcastd         m5, [pixel_10bpc_max]
        ; .end2: in m0 = rows 0/1, m1 = rows 2/3 (final residuals), m5 = pixel max.
   2008 .end2:
           ; Load four 16-byte rows; r6 points at row 2.
   2009    mova                xm2, [dstq+strideq*0]
   2010    vinserti128          m2, [dstq+strideq*1], 1
   2011    lea                  r6, [dstq+strideq*2]
   2012    mova                xm3, [r6  +strideq*0]
   2013    vinserti128          m3, [r6  +strideq*1], 1
           ; Zero the 4*32 bytes of coefficients at cq.
   2014    pxor                 m4, m4
   2015    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
           ; dst + residual, clamped to [0, m5], stored back to the same rows.
   2016    paddw                m0, m2
   2017    paddw                m1, m3
   2018    pmaxsw               m0, m4
   2019    pmaxsw               m1, m4
   2020    pminsw               m0, m5
   2021    pminsw               m1, m5
   2022    mova         [dstq+strideq*0], xm0
   2023    vextracti128 [dstq+strideq*1], m0, 1
   2024    mova         [r6  +strideq*0], xm1
   2025    vextracti128 [r6  +strideq*1], m1, 1
   2026    RET
   2027 ALIGN function_align
        ; .pass2_main: narrow the pass-1 output to 16-bit words, regroup it with
        ; deint_shuf and tail-jump into the 8 bpc ADST core, whose ret returns to
        ; this routine's caller.
        ; NOTE(review): iadst_8x4_internal_8bpc.main lives outside this file view;
        ; r6 = deint_shuf+128 is presumably the constant base it addresses from.
   2028 .pass2_main:
   2029    vbroadcasti128       m4, [deint_shuf]
   2030    packssdw             m0, m1
   2031    packssdw             m2, m3
   2032    lea                  r6, [deint_shuf+128]
   2033    vperm2i128           m1, m0, m2, 0x31
   2034    vinserti128          m0, xm2, 1
   2035    pshufb               m0, m4
   2036    pshufb               m1, m4
   2037    jmp m(iadst_8x4_internal_8bpc).main
   2038 ALIGN function_align
        ; .main: load four 32-byte vectors from cq, pre-scale each as
        ; (x * [pd_2896] + [pd_2048]) >> 12, then fall through to .main2.
        ; m1 holds the multiplier until its own product overwrites it.
   2039 .main:
   2040    vpbroadcastd         m1, [pd_2896]
   2041    pmulld               m0, m1, [cq+32*0]
   2042    pmulld               m3, m1, [cq+32*3]
   2043    pmulld               m2, m1, [cq+32*2]
   2044    pmulld               m1,     [cq+32*1]
   2045    vpbroadcastd         m4, [pd_2048]
   2046    REPX      {paddd x, m4}, m0, m3, m2, m1
   2047    REPX      {psrad x, 12}, m0, m3, m2, m1
        ; .main2: entry for callers whose inputs are already in m0..m3.
   2048 .main2:
   2049    IADST4_1D
   2050    ret
   2051 
   2052 INV_TXFM_8X4_FN flipadst, dct
   2053 INV_TXFM_8X4_FN flipadst, adst
   2054 INV_TXFM_8X4_FN flipadst, flipadst
   2055 INV_TXFM_8X4_FN flipadst, identity
   2056 
        ; iflipadst_8x4_internal_10bpc: flipadst stage for 8x4 blocks, 10 bpc build.
        ; It shares the arithmetic with the ADST routines and differs in how the
        ; results are permuted.
        ; NOTE(review): the meaning of m4/m5/m6 returned by
        ; iadst_4x8_internal_10bpc.main is not visible in this excerpt. m6 is used
        ; as a psignd sign mask below.
   2057 cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2
   2058    call m(iadst_4x8_internal_10bpc).main
           ; shufpd 0x05 takes, per lane, the high qword of the first source and the
           ; low qword of the second.
   2059    shufpd               m3, m4, m0, 0x05
   2060    shufpd               m0, m4, 0x05
   2061    psignd               m2, m6
           ; Swap the qword halves of the sign mask before applying it to m5.
   2062    pshufd               m6, m6, q1032
   2063    pshufd               m1, m2, q1032
   2064    psignd               m2, m5, m6
   2065    jmp                tx2q
   2066 .pass2:
           ; Same helper as the ADST pass 2, but the two result registers swap
           ; roles (m1 feeds m0, m0 feeds m2) and use the q2031 permutation before
           ; the shared scale-and-store tail.
   2067    call m(iadst_8x4_internal_10bpc).pass2_main
   2068    vpermq               m2, m0, q2031
   2069    vpermq               m0, m1, q2031
   2070    jmp m(iadst_8x4_internal_10bpc).end
   2071 
   2072 INV_TXFM_8X4_FN identity, dct
   2073 INV_TXFM_8X4_FN identity, adst
   2074 INV_TXFM_8X4_FN identity, flipadst
   2075 INV_TXFM_8X4_FN identity, identity
   2076 
        ; iidentity_8x4_internal_10bpc: identity stage for 8x4 blocks, 10 bpc build.
        ; .pass1 and .pass2_end are also entered from the 12 bpc identity routine in
        ; this file.
   2077 cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
   2078 .pass1:
           ; Load four 32-byte vectors with their middle qwords swapped (q3120),
           ; scale as (x * [pd_2896] + [pd_2048]) >> 12, then double the result.
           ; m7 is left holding pd_2048.
   2079    vpbroadcastd         m4, [pd_2896]
   2080    vpermq               m0, [cq+32*0], q3120
   2081    vpermq               m1, [cq+32*1], q3120
   2082    vpermq               m2, [cq+32*2], q3120
   2083    vpermq               m3, [cq+32*3], q3120
   2084    vpbroadcastd         m7, [pd_2048]
   2085    REPX     {pmulld x, m4}, m0, m1, m2, m3
   2086    REPX     {paddd  x, m7}, m0, m1, m2, m3
   2087    REPX     {psrad  x, 12}, m0, m1, m2, m3
   2088    REPX     {paddd  x, x }, m0, m1, m2, m3
   2089    jmp                tx2q
   2090 .pass2:
   2091    vpbroadcastd         m5, [pixel_10bpc_max]
   2092    vpbroadcastd         m4, [pw_1697x8]
           ; Narrow to 16-bit words, then y = x + pmulhrsw(x, pw_1697x8).
   2093    packssdw             m0, m1
   2094    packssdw             m2, m3
   2095    pmulhrsw             m1, m4, m0
   2096    pmulhrsw             m4, m2
   2097    paddsw               m0, m1
   2098    paddsw               m2, m4
           ; Relies on m7 still holding pd_2048 from pass 1; packing it with itself
           ; yields the word constant.
           ; NOTE(review): confirm every first-pass routine paired with this pass 2
           ; leaves pd_2048 in m7.
   2099    packssdw             m7, m7 ; pw_2048
        ; .pass2_end: in m0, m2 = packed 16-bit values, m7 = word multiplier for
        ; pmulhrsw, m5 = pixel max. Clobbers m1..m4 and r6.
   2100 .pass2_end:
           ; Three rounds of word interleaving regroup the data into row order;
           ; the scale is applied between the second and third round.
   2101    punpckhwd            m1, m0, m2
   2102    punpcklwd            m0, m2
   2103    lea                  r6, [dstq+strideq*2]
   2104    punpckhwd            m2, m0, m1
   2105    punpcklwd            m0, m1
   2106    pmulhrsw             m2, m7
   2107    pmulhrsw             m0, m7
   2108    punpckhwd            m1, m0, m2
   2109    punpcklwd            m0, m2
           ; m2 = rows 0 and 2, m3 = rows 1 and 3 (low lane / high lane).
   2110    mova                xm2, [dstq+strideq*0]
   2111    vinserti128          m2, [r6  +strideq*0], 1
   2112    mova                xm3, [dstq+strideq*1]
   2113    vinserti128          m3, [r6  +strideq*1], 1
           ; Zero the 4*32 bytes of coefficients at cq.
   2114    pxor                 m4, m4
   2115    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
           ; dst + residual, clamped to [0, m5], stored back to the same rows.
   2116    paddw                m0, m2
   2117    paddw                m1, m3
   2118    pmaxsw               m0, m4
   2119    pmaxsw               m1, m4
   2120    pminsw               m0, m5
   2121    pminsw               m1, m5
   2122    mova         [dstq+strideq*0], xm0
   2123    mova         [dstq+strideq*1], xm1
   2124    vextracti128 [r6  +strideq*0], m0, 1
   2125    vextracti128 [r6  +strideq*1], m1, 1
   2126    RET
   2127 
   2128 INV_TXFM_8X4_FN dct, dct,      12
   2129 INV_TXFM_8X4_FN dct, identity, 12
   2130 INV_TXFM_8X4_FN dct, adst,     12
   2131 INV_TXFM_8X4_FN dct, flipadst, 12
   2132 
        ; idct_8x4_internal_12bpc: DCT stage for 8x4 blocks, 12 bpc build.
        ; Pass 1 runs the 10 bpc code with the clamp bounds in m8/m9 taken from
        ; clip_20b_min/clip_20b_max instead of the 18-bit pair.
   2133 cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
   2134    vpbroadcastd         m8, [clip_20b_min]
   2135    vpbroadcastd         m9, [clip_20b_max]
   2136    jmp m(idct_8x4_internal_10bpc).pass1
   2137 .pass2:
           ; Clamp the pass-1 output to the 18-bit pair, transpose it and run the
           ; 4-point DCT on 32-bit values.
           ; NOTE(review): the IDCT4_1D macro is defined outside this excerpt.
   2138    vpbroadcastd         m8, [clip_18b_min]
   2139    vpbroadcastd         m9, [clip_18b_max]
   2140    REPX     {pmaxsd x, m8}, m0, m1, m2, m3
   2141    REPX     {pminsd x, m9}, m0, m1, m2, m3
   2142    call m(iadst_8x4_internal_12bpc).transpose_4x8
   2143    IDCT4_1D              0, 1, 2, 3, 4, 5, 6, 7
           ; Enter the shared 12 bpc tail at .end, i.e. after its >>12 step: it
           ; shifts by 3, packs, scales and stores.
   2144    jmp m(iadst_8x4_internal_12bpc).end
   2145 
   2146 INV_TXFM_8X4_FN adst, dct,      12
   2147 INV_TXFM_8X4_FN adst, adst,     12
   2148 INV_TXFM_8X4_FN adst, flipadst, 12
   2149 INV_TXFM_8X4_FN adst, identity, 12
   2150 
        ; iadst_8x4_internal_12bpc: ADST stage for 8x4 blocks, 12 bpc build.
        ; Local labels used by other 12 bpc 8x4 routines in this file:
        ;   .pass2_end     - >>12, then .end (flipadst pass 2)
        ;   .end           - >>3, pack, scale, store (DCT pass 2)
        ;   .pass2_main    - transpose + IADST4 (flipadst pass 2)
        ;   .transpose_4x8 - deinterleave + transpose (DCT pass 2)
        ; NOTE(review): iadst_4x8_internal_10bpc.main2 is defined outside this
        ; excerpt; it presumably consumes the clamp bounds placed in m8/m9.
   2151 cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
   2152    vpbroadcastd         m8, [clip_20b_min]
   2153    vpbroadcastd         m9, [clip_20b_max]
   2154    call m(iadst_4x8_internal_10bpc).main2
           ; Same output arrangement as pass 1 of the 10 bpc ADST routine.
   2155    vpblendd             m3, m0, m4, 0x33 ; out6 out7
   2156    vpblendd             m0, m4, 0xcc     ; out0 out1
   2157    pshufd               m1, m5, q1032
   2158    psignd               m2, m6           ; out4 out5
   2159    psignd               m1, m6           ; out2 out3
   2160    jmp                tx2q
   2161 .pass2:
           ; Clamp the pass-1 output to [clip_18b_min, clip_18b_max].
   2162    vpbroadcastd         m8, [clip_18b_min]
   2163    vpbroadcastd         m9, [clip_18b_max]
   2164    REPX     {pmaxsd x, m8}, m0, m1, m2, m3
   2165    REPX     {pminsd x, m9}, m0, m1, m2, m3
   2166    call .pass2_main
           ; Add the pd_2048 rounding term; two of the four results are taken from
           ; m4 and m6 rather than m0 and m1.
   2167    vpbroadcastd         m5, [pd_2048]
   2168    paddd                m0, m5, m4
   2169    paddd                m1, m5, m6
   2170    paddd                m2, m5
   2171    paddd                m3, m5
        ; .pass2_end: in m0..m3 = 32-bit results with the rounding term already added.
   2172 .pass2_end:
   2173    REPX      {psrad x, 12}, m0, m1, m2, m3
        ; .end: in m0..m3 = 32-bit results.
   2174 .end:
           ; >>3 in 32-bit, narrow to words, then pmulhrsw by pw_16384 performs the
           ; last rounded halving: (x + 1) >> 1.
   2175    vpbroadcastd         m4, [pw_16384]
   2176    REPX       {psrad x, 3}, m0, m1, m2, m3
   2177    packssdw             m0, m1
   2178    packssdw             m2, m3
   2179    pmulhrsw             m0, m4
   2180    pmulhrsw             m1, m2, m4
   2181    vpermq               m0, m0, q3120 ; out0 out1
   2182    vpermq               m1, m1, q3120 ; out2 out3
           ; The shared 10 bpc store code takes the pixel clamp in m5.
   2183    vpbroadcastd         m5, [pixel_12bpc_max]
   2184    jmp m(iadst_8x4_internal_10bpc).end2
   2185 ALIGN function_align
        ; .pass2_main: transpose, then tail-jump into the bare IADST4_1D of the
        ; 10 bpc routine, whose ret returns to this routine's caller.
   2186 .pass2_main:
   2187    call .transpose_4x8
   2188    jmp m(iadst_8x4_internal_10bpc).main2
   2189 ALIGN function_align
        ; .transpose_4x8: regroup m0..m3 in place; m4 and m5 are scratch.
   2190 .transpose_4x8:
   2191    ; deinterleave
   2192    pshufd               m0, m0, q3120
   2193    pshufd               m1, m1, q3120
   2194    pshufd               m2, m2, q3120
   2195    pshufd               m3, m3, q3120
   2196    ; transpose
   2197    punpcklqdq           m4, m0, m1
   2198    punpckhqdq           m0, m1
   2199    punpcklqdq           m5, m2, m3
   2200    punpckhqdq           m2, m3
           ; m0 is written last because m1 and m3 still read the old m0.
   2201    vperm2i128           m1, m0, m2, 0x20   ; out1
   2202    vperm2i128           m3, m0, m2, 0x31   ; out3
   2203    vperm2i128           m2, m4, m5, 0x31   ; out2
   2204    vperm2i128           m0, m4, m5, 0x20   ; out0
   2205    ret
   2206 
   2207 INV_TXFM_8X4_FN flipadst, dct,      12
   2208 INV_TXFM_8X4_FN flipadst, adst,     12
   2209 INV_TXFM_8X4_FN flipadst, flipadst, 12
   2210 INV_TXFM_8X4_FN flipadst, identity, 12
   2211 
        ; iflipadst_8x4_internal_12bpc: flipadst stage for 8x4 blocks, 12 bpc build.
        ; It shares the arithmetic with the 12 bpc ADST routine and only the
        ; register shuffling around the shared calls differs.
        ; NOTE(review): iadst_4x8_internal_10bpc.main2 is defined outside this
        ; excerpt; it presumably consumes the clamp bounds placed in m8/m9.
   2212 cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2
   2213    vpbroadcastd         m8, [clip_20b_min]
   2214    vpbroadcastd         m9, [clip_20b_max]
   2215    call m(iadst_4x8_internal_10bpc).main2
           ; Same output permutation as pass 1 of the 10 bpc flipadst routine.
   2216    shufpd               m3, m4, m0, 0x05
   2217    shufpd               m0, m4, 0x05
   2218    psignd               m2, m6
   2219    pshufd               m6, m6, q1032
   2220    pshufd               m1, m2, q1032
   2221    psignd               m2, m5, m6
   2222    jmp                tx2q
   2223 .pass2:
           ; Clamp the pass-1 output to [clip_18b_min, clip_18b_max].
   2224    vpbroadcastd         m8, [clip_18b_min]
   2225    vpbroadcastd         m9, [clip_18b_max]
   2226    REPX     {pmaxsd x, m8}, m0, m1, m2, m3
   2227    REPX     {pminsd x, m9}, m0, m1, m2, m3
   2228    call m(iadst_8x4_internal_12bpc).pass2_main
           ; Add the pd_2048 rounding term while reversing the output order relative
           ; to the ADST pass 2: m3,m2,m4,m6 become m0,m1,m3,m2. The old m3 and m2
           ; are read before being overwritten.
   2229    vpbroadcastd         m5, [pd_2048]
   2230    paddd                m0, m5, m3
   2231    paddd                m1, m5, m2
   2232    paddd                m3, m5, m4
   2233    paddd                m2, m5, m6
   2234    jmp m(iadst_8x4_internal_12bpc).pass2_end
   2235 
   2236 INV_TXFM_8X4_FN identity, dct,      12
   2237 INV_TXFM_8X4_FN identity, adst,     12
   2238 INV_TXFM_8X4_FN identity, flipadst, 12
   2239 INV_TXFM_8X4_FN identity, identity, 12
   2240 
        ; iidentity_8x4_internal_12bpc: identity stage for 8x4 blocks, 12 bpc build.
        ; Pass 1 is the 10 bpc code unchanged. Pass 2 stays in 32-bit precision
        ; until after the scale, then joins the shared 10 bpc store code.
   2241 cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
   2242    jmp m(iidentity_8x4_internal_10bpc).pass1
   2243 .pass2:
   2244    ; m0 = in0 in1 (interleaved)
   2245    ; m1 = in2 in3 (interleaved)
   2246    ; m2 = in4 in5 (interleaved)
   2247    ; m3 = in6 in7 (interleaved)
   2248    vpbroadcastd         m8, [clip_18b_min]
   2249    vpbroadcastd         m9, [clip_18b_max]
   2250    REPX     {pmaxsd x, m8}, m0, m1, m2, m3
   2251    REPX     {pminsd x, m9}, m0, m1, m2, m3
           ; (x * [pd_5793] + m7) >> 15. m7 is not loaded here: it is expected to
           ; still hold the rounding term left by pass 1.
           ; NOTE(review): presumably pd_2048, as set by the 10 bpc identity and DCT
           ; first passes in this file; confirm for the ADST first passes.
   2252    vpbroadcastd         m4, [pd_5793]
   2253    REPX     {pmulld x, m4}, m0, m1, m2, m3
   2254    REPX     {paddd  x, m7}, m0, m1, m2, m3
   2255    REPX     {psrad  x, 15}, m0, m1, m2, m3
           ; Only now is m7 reused, as the pmulhrsw multiplier of the shared tail;
           ; m5 carries the pixel clamp.
   2256    vpbroadcastd         m5, [pixel_12bpc_max]
   2257    vpbroadcastd         m7, [pw_16384]
   2258    packssdw             m0, m1
   2259    packssdw             m2, m3
   2260    jmp m(iidentity_8x4_internal_10bpc).pass2_end
   2261 
   2262 %macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth
        ; Instantiates INV_TXFM_FN for an 8x8 block (third argument fixed at 0;
        ; bitdepth defaults to 10). For the dct_dct pair it additionally emits
        ; the DC-only path:
        ;   - bitdepth 10: the .dconly/.dconly2/.dconly3/.dconly_loop code below
        ;   - otherwise  : a jump to the 10bpc .dconly label
        ; In both cases m2 is first loaded with the dconly constant for the
        ; selected bitdepth.
        ; NOTE(review): INV_TXFM_FN is defined outside this excerpt; it
        ; presumably branches around the code below unless the block is
        ; DC-only -- confirm.
   2263    INV_TXFM_FN          %1, %2, 0, 8x8, %3
   2264 %ifidn %1_%2, dct_dct
   2265    vpbroadcastd         m2, [dconly_%3bpc]
   2266 %if %3 = 10
   2267 .dconly:
            ; r6d = dc * 181; the coefficient is then overwritten with eobd
            ; (0 here, per the existing note); r3d |= 8 sets the row count
   2268    imul                r6d, [cq], 181
   2269    mov                [cq], eobd ; 0
   2270    or                  r3d, 8
   2271 .dconly2:
            ; Entry for callers that have already scaled r6d and set r3d
   2272    add                 r6d, 384
   2273    sar                 r6d, 9
   2274 .dconly3:
            ; r6d = (r6d * 181 + 2176) >> 12, then broadcast as words after
            ; adding the dconly constant with signed saturation
   2275    imul                r6d, 181
   2276    add                 r6d, 2176
   2277    sar                 r6d, 12
   2278    movd                xm0, r6d
   2279    paddsw              xm0, xm2
   2280    vpbroadcastw         m0, xm0
   2281 .dconly_loop:
            ; Two 16-byte destination rows per iteration: add m0 with signed
            ; saturation, subtract m2 with unsigned saturation, store back.
            ; NOTE(review): the paddsw/psubusw pair with the dconly constant
            ; presumably implements the clamp to the valid pixel range.
   2282    mova                xm1, [dstq+strideq*0]
   2283    vinserti128          m1, [dstq+strideq*1], 1
   2284    paddsw               m1, m0
   2285    psubusw              m1, m2
   2286    mova         [dstq+strideq*0], xm1
   2287    vextracti128 [dstq+strideq*1], m1, 1
   2288    lea                dstq, [dstq+strideq*2]
   2289    sub                 r3d, 2
   2290    jg .dconly_loop
   2291    RET
   2292 %else
   2293    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
   2294 %endif
   2295 %endif
   2296 %endmacro
   2297 
   2298 %macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2]
        ; 8-point inverse ADST on dword lanes, without the final rounding.
        ; Arguments are register numbers: 1-8 = inputs/outputs, 9-11 = scratch,
        ; 12 = register holding pd_2048, 13/14 = clip min/max registers.
        ; On exit (per the per-line notes below):
        ;   src1 =  out0      src2 = -out1      src7 = out6      src8 = -out7
        ;   src5 = (t2 - t3) * 1448            src4 = (t2 + t3) * 1448
        ;   src6 = (t6 - t7) * 1448            src3 = (t6 + t7) * 1448
        ;   tmp1 = broadcast pd_1448 (not modified after it is loaded)
        ; Callers are responsible for negating, rounding and shifting.
   2299    ITX_MULSUB_2D        %8, %1, %9, %10, %11, %12,  401, 4076 ; t1a, t0a
   2300    ITX_MULSUB_2D        %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a
   2301    ITX_MULSUB_2D        %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a
   2302    ITX_MULSUB_2D        %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a
   2303    psubd               m%9, m%3, m%7 ; t6
   2304    paddd               m%3, m%7      ; t2
   2305    psubd               m%7, m%1, m%5 ; t4
   2306    paddd               m%1, m%5      ; t0
   2307    psubd               m%5, m%6, m%2 ; t7
   2308    paddd               m%6, m%2      ; t3
   2309    psubd               m%2, m%8, m%4 ; t5
   2310    paddd               m%8, m%4      ; t1
            ; Clamp all eight intermediates to the clip range
   2311    REPX   {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
   2312    REPX   {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
            ; NOTE(review): the second call passes the tmp3 register number as
            ; its last coefficient instead of a literal; presumably the first
            ; call leaves the matching constant in that register -- confirm
            ; against ITX_MULSUB_2D.
   2313    ITX_MULSUB_2D        %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a
   2314    ITX_MULSUB_2D        %5, %9, %4, %10, %11, %12, 3784, %11  ; t6a, t7a
   2315    psubd              m%10, m%7, m%9 ;  t7
   2316    paddd               m%7, m%9      ;  out6
   2317    vpbroadcastd        m%9, [pd_1448]
   2318    psubd               m%4, m%8, m%6 ;  t3
   2319    paddd               m%8, m%6      ; -out7
   2320    psubd               m%6, m%1, m%3 ;  t2
   2321    paddd               m%1, m%3      ;  out0
   2322    psubd               m%3, m%2, m%5 ;  t6
   2323    paddd               m%2, m%5      ; -out1
            ; Clamp t2/t3/t6/t7, then scale them by 1448 before the last
            ; butterflies
   2324    REPX   {pmaxsd x, m%13}, m%6, m%4, m%3, m%10
   2325    REPX   {pminsd x, m%14}, m%6, m%4, m%3, m%10
   2326    REPX   {pmulld x, m%9 }, m%6, m%4, m%3, m%10
   2327    psubd               m%5, m%6, m%4  ; (t2 - t3) * 1448
   2328    paddd               m%4, m%6       ; (t2 + t3) * 1448
   2329    psubd               m%6, m%3, m%10 ; (t6 - t7) * 1448
   2330    paddd               m%3, m%10      ; (t6 + t7) * 1448
   2331 %endmacro
   2332 
   2333 INV_TXFM_8X8_FN dct, dct
   2334 INV_TXFM_8X8_FN dct, identity
   2335 INV_TXFM_8X8_FN dct, adst
   2336 INV_TXFM_8X8_FN dct, flipadst
   2337 
        ; idct_8x8_internal_10bpc
        ; Internal DCT routine for 8x8 blocks at 10 bpc, plus helpers that
        ; other routines in this file call or jump into by label.
        ;   entry/.pass1          : first pass on the dword coefficients at cq
        ;   .pass2                : second pass and store to dst
        ;   .write_8x4_start      : set up m10/m11/r6, then fall into .write_8x4
        ;   .write_8x4            : add four 16-byte rows to dst with clamping
        ;   .transpose_8x8_packed : pack m0-m7 to words and transpose
        ;   .main_rect2 / .main   : 8-point DCT core (rect2 rounds/shifts first)
        ;   .round_shift1         : final butterflies with (x + 1) >> 1
   2338 cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
            ; m12/m13 = 18-bit clip bounds; the 12bpc routine enters at .pass1
            ; with its own bounds instead
   2339    vpbroadcastd        m12, [clip_18b_min]
   2340    vpbroadcastd        m13, [clip_18b_max]
   2341 .pass1:
            ; Load eight 32-byte coefficient rows
   2342    mova                 m0, [cq+32*0]
   2343    mova                 m1, [cq+32*1]
   2344    mova                 m2, [cq+32*2]
   2345    mova                 m3, [cq+32*3]
   2346    mova                 m4, [cq+32*4]
   2347    mova                 m5, [cq+32*5]
   2348    mova                 m6, [cq+32*6]
   2349    mova                 m7, [cq+32*7]
   2350    vpbroadcastd        m11, [pd_2048]
   2351    call .main
   2352    call .round_shift1
   2353    jmp                tx2q
   2354 .pass2:
            ; Second pass runs on packed 16-bit data through the 8bpc core,
            ; which is defined outside this excerpt
   2355    call .transpose_8x8_packed
   2356    call m(idct_8x8_internal_8bpc).main
   2357    vpbroadcastd        m12, [pw_2048]
   2358    vpermq               m0, m0, q3120
   2359    vpermq               m1, m1, q2031
   2360    vpermq               m2, m2, q3120
   2361    vpermq               m3, m3, q2031
            ; Scale with pmulhrsw by pw_2048 and write the block as two groups
            ; of four rows
   2362    pmulhrsw             m0, m12
   2363    pmulhrsw             m1, m12
   2364    call .write_8x4_start
   2365    pmulhrsw             m0, m2, m12
   2366    pmulhrsw             m1, m3, m12
   2367    call .write_8x4
   2368    RET
   2369 ALIGN function_align
   2370 .write_8x4_start:
            ; m11 = 10 bpc pixel maximum, r6 = stride * 3, m10 = zero
   2371    vpbroadcastd        m11, [pixel_10bpc_max]
   2372    lea                  r6, [strideq*3]
   2373    pxor                m10, m10
   2374 .write_8x4:
            ; In : m0 = residual for rows 0/1, m1 = residual for rows 2/3
            ;      (low/high 128-bit lanes), m10 = zero, m11 = pixel maximum,
            ;      r6 = stride * 3
            ; Out: dstq advanced by four rows, cq advanced by 32*4 bytes
   2375    mova                xm8, [dstq+strideq*0]
   2376    vinserti128          m8, [dstq+strideq*1], 1
   2377    mova                xm9, [dstq+strideq*2]
   2378    vinserti128          m9, [dstq+r6       ], 1
            ; Clear 128 bytes of the coefficient buffer on every call
   2379    mova          [cq+32*0], m10
   2380    mova          [cq+32*1], m10
   2381    mova          [cq+32*2], m10
   2382    mova          [cq+32*3], m10
   2383    add                  cq, 32*4
            ; dst = clamp(dst + residual, 0, pixel maximum)
   2384    paddw                m0, m8
   2385    paddw                m1, m9
   2386    pmaxsw               m0, m10
   2387    pmaxsw               m1, m10
   2388    pminsw               m0, m11
   2389    pminsw               m1, m11
   2390    mova         [dstq+strideq*0], xm0
   2391    vextracti128 [dstq+strideq*1], m0, 1
   2392    mova         [dstq+strideq*2], xm1
   2393    vextracti128 [dstq+r6       ], m1, 1
   2394    lea                dstq, [dstq+strideq*4]
   2395    ret
   2396 ALIGN function_align
   2397 .transpose_8x8_packed:
            ; Narrow m0-m7 (dwords) to four registers of words with signed
            ; saturation, then transpose; the result is left in m0-m3
   2398    packssdw             m0, m4
   2399    packssdw             m1, m5
   2400    packssdw             m2, m6
   2401    packssdw             m3, m7
            ; NOTE(review): r6 is not used in this helper; it is presumably a
            ; table base expected by the 8bpc routines called next -- confirm.
   2402    lea                  r6, [deint_shuf+128]
   2403    punpckhwd            m4, m0, m1
   2404    punpcklwd            m0, m1
   2405    punpckhwd            m1, m2, m3
   2406    punpcklwd            m2, m3
   2407    punpckhdq            m3, m0, m2
   2408    punpckldq            m0, m2
   2409    punpckhdq            m2, m4, m1
   2410    punpckldq            m4, m1
   2411    vinserti128          m1, m3, xm2, 1
   2412    vperm2i128           m3, m2, 0x31
   2413    vperm2i128           m2, m0, m4, 0x31
   2414    vinserti128          m0, xm4, 1
   2415    ret
   2416 ALIGN function_align
   2417 .main_rect2:
            ; x = (x + m11) >> 12 on all eight inputs, then fall into .main
   2418    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   2419    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
   2420 .main:
            ; In : m0-m7 = inputs, m11 = pd_2048, m12/m13 = clip min/max
            ; Out: m0/m6/m5/m3 = dct4 out0/out1/out2/out3 (clamped),
            ;      m8 = t4, m9 = t7, m4/m7 = the rotated (t6a -/+ t5a) terms;
            ;      .round_shift1 or an equivalent tail finishes the transform
   2421    ITX_MULSUB_2D         5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a
   2422    ITX_MULSUB_2D         1, 7, 8, 9, 10, 11,  799, 4017 ; t4a t7a
   2423    ITX_MULSUB_2D         2, 6, 8, 9, 10, 11, 1567, 3784 ; t2  t3
   2424    paddd                m8, m1, m5 ; t4
   2425    psubd                m1, m5     ; t5a
   2426    paddd                m9, m7, m3 ; t7
   2427    psubd                m7, m3     ; t6a
   2428    vpbroadcastd         m3, [pd_2896]
   2429    REPX    {pmaxsd x, m12}, m1, m8, m7, m9
   2430    REPX    {pminsd x, m13}, m1, m8, m7, m9
            ; Scale by 2896; the m11 bias is added once per sum/difference
            ; pair so both butterfly outputs carry it before the >> 12
   2431    REPX    {pmulld x, m3 }, m0, m4, m7, m1
   2432    paddd                m0, m11
   2433    paddd                m7, m11
   2434    psubd                m5, m0, m4
   2435    paddd                m0, m4
   2436    psubd                m4, m7, m1
   2437    paddd                m7, m1
   2438    REPX    {psrad  x, 12 }, m5, m0, m4, m7
   2439    psubd                m3, m0, m6 ; dct4 out3
   2440    paddd                m0, m6     ; dct4 out0
   2441    paddd                m6, m5, m2 ; dct4 out1
   2442    psubd                m5, m2     ; dct4 out2
   2443    REPX    {pmaxsd x, m12}, m0, m6, m5, m3
   2444    REPX    {pminsd x, m13}, m0, m6, m5, m3
   2445    ret
   2446 ALIGN function_align
   2447 .round_shift1:
            ; pcmpeqd sets m1 to all ones (-1), so the psubd adds 1 to each of
            ; the four dct4 outputs; both results of every butterfly below
            ; therefore include the +1 before the final >> 1
   2448    pcmpeqd              m1, m1
   2449    REPX      {psubd x, m1}, m0, m6, m5, m3
   2450    paddd                m1, m6, m7 ; out1
   2451    psubd                m6, m7     ; out6
   2452    psubd                m7, m0, m9 ; out7
   2453    paddd                m0, m9     ; out0
   2454    paddd                m2, m5, m4 ; out2
   2455    psubd                m5, m4     ; out5
   2456    psubd                m4, m3, m8 ; out4
   2457    paddd                m3, m8     ; out3
   2458    REPX      {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
   2459    ret
   2460 
   2461 INV_TXFM_8X8_FN adst, dct
   2462 INV_TXFM_8X8_FN adst, adst
   2463 INV_TXFM_8X8_FN adst, flipadst
   2464 INV_TXFM_8X8_FN adst, identity
   2465 
        ; iadst_8x8_internal_10bpc
        ; Internal ADST routine for 8x8 blocks at 10 bpc.
        ;   entry/.pass1 : first pass (.main + .main_end), then jump via tx2q
        ;   .pass2       : second pass on packed words and store to dst
        ;   .main        : load eight coefficient rows, then fall into .main2
        ;   .main2       : IADST8_1D; leaves m8 = pd_1 and m9 = pd_3072
        ;   .main_end    : negate, round and shift the IADST8_1D results
   2466 cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
   2467    vpbroadcastd        m12, [clip_18b_min]
   2468    vpbroadcastd        m13, [clip_18b_max]
   2469 .pass1:
   2470    call .main
   2471    call .main_end
   2472    jmp                tx2q
   2473 .pass2:
   2474    call m(idct_8x8_internal_10bpc).transpose_8x8_packed
   2475    pshufd               m4, m0, q1032
   2476    pshufd               m5, m1, q1032
   2477    call m(iadst_8x8_internal_8bpc).main_pass2
            ; Build the pmulhrsw multiplier in m12: the xmm-width broadcast
            ; leaves the upper 128 bits zero, so after the subtraction the low
            ; lane holds 4096 - 2048 = 2048 and the high lane 0 - 2048 = -2048.
            ; The high lane of every product is therefore negated.
   2478    vpbroadcastd         m5, [pw_2048]
   2479    vpbroadcastd       xm12, [pw_4096]
   2480    psubw               m12, m5
   2481    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
   2482    pmulhrsw             m0, m12
   2483    pmulhrsw             m1, m12
   2484    call m(idct_8x8_internal_10bpc).write_8x4_start
   2485    pmulhrsw             m0, m2, m12
   2486    pmulhrsw             m1, m3, m12
   2487    call m(idct_8x8_internal_10bpc).write_8x4
   2488    RET
   2489 ALIGN function_align
   2490 .main:
            ; Load the eight 32-byte coefficient rows into m0-m7
   2491    mova                 m0, [cq+32*0]
   2492    mova                 m7, [cq+32*7]
   2493    mova                 m1, [cq+32*1]
   2494    mova                 m6, [cq+32*6]
   2495    mova                 m2, [cq+32*2]
   2496    mova                 m5, [cq+32*5]
   2497    mova                 m3, [cq+32*3]
   2498    mova                 m4, [cq+32*4]
   2499    vpbroadcastd        m11, [pd_2048]
   2500 .main2:
            ; In : m0-m7 = inputs, m11 = pd_2048, m12/m13 = clip min/max
            ; The macro leaves pd_1448 in m8 (its first scratch register);
            ; 1448 >> 10 = 1, which yields pd_1 without a memory load
   2501    IADST8_1D             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
   2502    psrld                m8, 10 ; pd_1
   2503    vpbroadcastd         m9, [pd_3072]
   2504    ret
   2505 ALIGN function_align
   2506 .main_end:
            ; In : IADST8_1D results in m0-m7, m8 = pd_1, m9 = pd_3072
            ; m0/m6 hold out0/out6 and m1/m7 hold -out1/-out7, so the latter
            ; are subtracted from the bias: all four become (1 + out) >> 1
   2507    paddd                m0, m8
   2508    psubd                m1, m8, m1
   2509    paddd                m6, m8
   2510    psubd                m7, m8, m7
   2511    REPX      {psrad x, 1 }, m0, m1, m6, m7
   2512    ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12
   2513    ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12
   2514    psubd                m8, m9, m8 ; pd_3071
   2515    paddd                m2, m9
   2516    psubd                m3, m8, m3
   2517    paddd                m4, m9
   2518    psubd                m5, m8, m5
   2519    REPX      {psrad x, 12}, m2, m3, m4, m5
   2520    ret
   2521 
   2522 INV_TXFM_8X8_FN flipadst, dct
   2523 INV_TXFM_8X8_FN flipadst, adst
   2524 INV_TXFM_8X8_FN flipadst, flipadst
   2525 INV_TXFM_8X8_FN flipadst, identity
   2526 
        ; iflipadst_8x8_internal_10bpc
        ; Internal flipadst routine for 8x8 blocks at 10 bpc. It reuses the
        ; ADST core and differs only in which register each result lands in:
        ; .main_end and .pass2 emit the outputs in reversed order.
   2527 cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
   2528    vpbroadcastd        m12, [clip_18b_min]
   2529    vpbroadcastd        m13, [clip_18b_max]
   2530 .pass1:
   2531    call m(iadst_8x8_internal_10bpc).main
   2532    call .main_end
   2533    jmp                tx2q
   2534 .pass2:
   2535    call m(idct_8x8_internal_10bpc).transpose_8x8_packed
   2536    pshufd               m4, m0, q1032
   2537    pshufd               m5, m1, q1032
   2538    call m(iadst_8x8_internal_8bpc).main_pass2
            ; Build the pmulhrsw multiplier in m12: the xmm-width broadcast
            ; into xm5 leaves the upper 128 bits of m5 zero, so the low lane
            ; becomes 2048 - 4096 = -2048 and the high lane 2048 - 0 = 2048
            ; (the opposite lane signs from the non-flipped ADST routine)
   2539    vpbroadcastd        m12, [pw_2048]
   2540    vpbroadcastd        xm5, [pw_4096]
   2541    psubw               m12, m5
            ; Take the rows in reverse register order (m3, m2, m1, m0)
   2542    vpermq               m8, m3, q2031
   2543    vpermq               m9, m2, q2031
   2544    vpermq               m2, m1, q2031
   2545    vpermq               m3, m0, q2031
   2546    pmulhrsw             m0, m8, m12
   2547    pmulhrsw             m1, m9, m12
   2548    call m(idct_8x8_internal_10bpc).write_8x4_start
   2549    pmulhrsw             m0, m2, m12
   2550    pmulhrsw             m1, m3, m12
   2551    call m(idct_8x8_internal_10bpc).write_8x4
   2552    RET
   2553 ALIGN function_align
   2554 .main_end:
            ; In : IADST8_1D results in m0-m7, m8 = pd_1, m9 = pd_3072 (as left
            ;      by iadst_8x8_internal_10bpc.main2)
            ; Same rounding as the ADST .main_end, but each result is written
            ; to the mirrored register (m0<->m7, m1<->m6, m2<->m5, m3<->m4)
   2555    paddd               m10, m8, m0
   2556    psubd                m0, m8, m7
   2557    psubd                m7, m8, m1
   2558    paddd                m1, m8, m6
   2559    psrad                m0, 1
   2560    psrad                m1, 1
   2561    psrad                m6, m7, 1
   2562    psrad                m7, m10, 1
   2563    psubd                m8, m9, m8 ; pd_3071 (pd_3072 - pd_1)
   2564    psubd               m10, m8, m5
   2565    paddd                m5, m9, m2
   2566    psubd                m2, m8, m3
   2567    paddd                m3, m9, m4
   2568    psrad                m4, m2, 12
   2569    psrad                m2, m10, 12
   2570    psrad                m3, 12
   2571    psrad                m5, 12
   2572    ret
   2573 
   2574 INV_TXFM_8X8_FN identity, dct
   2575 INV_TXFM_8X8_FN identity, adst
   2576 INV_TXFM_8X8_FN identity, flipadst
   2577 INV_TXFM_8X8_FN identity, identity
   2578 
        ; iidentity_8x8_internal_10bpc
        ; Internal identity routine for 8x8 blocks at 10 bpc.
        ;   .pass1        : load the coefficients unchanged, jump via tx2q
        ;   .pass2        : pack, transpose, scale and store to dst
        ;   .pass2_main   : shared entry; expects m3 already packed with m7 and
        ;                   the pixel maximum in m7
        ;   .write_2x8x2* : store helper writing rows n, n+1, n+4 and n+5
   2579 cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
   2580 .pass1:
   2581    mova                 m0, [cq+32*0]
   2582    mova                 m1, [cq+32*1]
   2583    mova                 m2, [cq+32*2]
   2584    mova                 m3, [cq+32*3]
   2585    mova                 m4, [cq+32*4]
   2586    mova                 m5, [cq+32*5]
   2587    mova                 m6, [cq+32*6]
   2588    mova                 m7, [cq+32*7]
   2589    jmp                tx2q
   2590 .pass2:
            ; m7 is consumed by the pack first so it can then be reused for
            ; the pixel maximum
   2591    packssdw             m3, m7
   2592    vpbroadcastd         m7, [pixel_10bpc_max]
   2593 .pass2_main:
   2594    packssdw             m0, m4
   2595    packssdw             m1, m5
   2596    packssdw             m2, m6
   2597    vpbroadcastd        m12, [pw_4096]
            ; Word transpose; the resulting row pairs are noted on the last
            ; four unpack instructions
   2598    punpckhwd            m4, m0, m1
   2599    punpcklwd            m0, m1
   2600    punpckhwd            m1, m2, m3
   2601    punpcklwd            m2, m3
   2602    punpckhdq            m3, m0, m2
   2603    punpckldq            m0, m2
   2604    punpckldq            m2, m4, m1
   2605    punpckhdq            m4, m1
   2606    punpckhqdq           m1, m0, m2 ; 1 5
   2607    punpcklqdq           m0, m2     ; 0 4
   2608    punpcklqdq           m2, m3, m4 ; 2 6
   2609    punpckhqdq           m3, m4     ; 3 7
            ; Scale with pmulhrsw by pw_4096; rows 0/4 and 1/5 are written
            ; first, then rows 2/6 and 3/7
   2610    pmulhrsw             m0, m12
   2611    pmulhrsw             m1, m12
   2612    call .write_2x8x2_start
   2613    pmulhrsw             m0, m2, m12
   2614    pmulhrsw             m1, m3, m12
   2615    call .write_2x8x2_zero
   2616    RET
   2617 .write_2x8x2_start:
            ; r6 = stride * 5, m6 = zero
   2618    lea                  r6, [strideq*5]
   2619    pxor                 m6, m6
   2620 .write_2x8x2_zero:
            ; Clear 128 bytes of the coefficient buffer and advance cq
   2621    mova          [cq+32*0], m6
   2622    mova          [cq+32*1], m6
   2623    mova          [cq+32*2], m6
   2624    mova          [cq+32*3], m6
   2625    add                  cq, 32*4
   2626 .write_2x8x2:
            ; In : m0 = residual for rows 0 (low lane) and 4 (high lane),
            ;      m1 = residual for rows 1 and 5, m6 = zero,
            ;      m7 = pixel maximum, r6 = stride * 5
            ; Out: dstq advanced by two rows
   2627    mova                xm4, [dstq+strideq*0]
   2628    vinserti128          m4, [dstq+strideq*4], 1
   2629    mova                xm5, [dstq+strideq*1]
   2630    vinserti128          m5, [dstq+r6       ], 1
            ; dst = clamp(dst + residual, 0, pixel maximum)
   2631    paddw                m0, m4
   2632    paddw                m1, m5
   2633    pmaxsw               m0, m6
   2634    pmaxsw               m1, m6
   2635    pminsw               m0, m7
   2636    pminsw               m1, m7
   2637    mova         [dstq+strideq*0], xm0
   2638    mova         [dstq+strideq*1], xm1
   2639    vextracti128 [dstq+strideq*4], m0, 1
   2640    vextracti128 [dstq+r6       ], m1, 1
   2641    lea                dstq, [dstq+strideq*2]
   2642    ret
   2643 
   2644 %macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
        ; In-place transpose of an 8x8 matrix of dwords held in eight ymm
        ; registers (arguments 1-8), using four scratch registers (9-12).
        ; Three stages: dword interleave, qword interleave, then 128-bit lane
        ; selection (0x20 = both low lanes, 0x31 = both high lanes).
        ; The letters in the per-line notes name individual elements, one group
        ; of four per 128-bit lane.
   2645    punpckldq            m%9,  m%1,  m%2 ; aibj emfn
   2646    punpckhdq            m%1,  m%2       ; ckdl gohp
   2647    punpckldq           m%10,  m%3,  m%4 ; qyrz uCvD
   2648    punpckhdq            m%3,  m%4       ; sAtB wExF
   2649    punpckldq           m%11,  m%5,  m%6 ; GOHP KSLT
   2650    punpckhdq            m%5,  m%6       ; IQJR MUNV
   2651    punpckldq           m%12,  m%7,  m%8 ; WeXf aibj
   2652    punpckhdq            m%7,  m%8       ; YgZh ckdl
   2653    punpcklqdq           m%2,  m%9, m%10 ; aiqy emuC
   2654    punpckhqdq           m%9, m%10       ; bjrz fnvD
   2655    punpcklqdq           m%4,  m%1,  m%3 ; cksA gowE
   2656    punpckhqdq          m%10,  m%1,  m%3 ; dltB hpxF
   2657    punpcklqdq           m%6, m%11, m%12 ; GOWe KSai
   2658    punpckhqdq          m%11, m%12       ; HPXf LTbj
   2659    punpcklqdq           m%8,  m%5,  m%7 ; IQYg MUck
   2660    punpckhqdq          m%12,  m%5,  m%7 ; JRZh NVdl
            ; The order below matters: each source register is overwritten
            ; only after its last read
   2661    vperm2i128           m%1,  m%2,  m%6, 0x20   ; out0
   2662    vperm2i128           m%5,  m%2,  m%6, 0x31   ; out4
   2663    vperm2i128           m%2,  m%9, m%11, 0x20   ; out1
   2664    vperm2i128           m%6,  m%9, m%11, 0x31   ; out5
   2665    vperm2i128           m%3,  m%4,  m%8, 0x20   ; out2
   2666    vperm2i128           m%7,  m%4,  m%8, 0x31   ; out6
   2667    vperm2i128           m%4, m%10, m%12, 0x20   ; out3
   2668    vperm2i128           m%8, m%10, m%12, 0x31   ; out7
   2669 %endmacro
   2670 
   2671 INV_TXFM_8X8_FN dct, dct,      12
   2672 INV_TXFM_8X8_FN dct, identity, 12
   2673 INV_TXFM_8X8_FN dct, adst,     12
   2674 INV_TXFM_8X8_FN dct, flipadst, 12
   2675 
        ; idct_8x8_internal_12bpc
        ; Internal DCT routine for 8x8 blocks at 12 bpc. The first pass is the
        ; 10bpc code run with 20-bit clip bounds; the second pass stays in
        ; dword precision instead of using the packed 8bpc core.
        ;   .write_8x4_start : set up m10/m11/r6 for the 10bpc .write_8x4
        ;   .transpose_8x8   : dword transpose of m0-m7 (scratch m8-m11)
        ;   .round_shift4    : final butterflies with (x + 8) >> 4
   2676 cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
   2677    vpbroadcastd        m12, [clip_20b_min]
   2678    vpbroadcastd        m13, [clip_20b_max]
   2679    jmp m(idct_8x8_internal_10bpc).pass1
   2680 .pass2:
            ; Clamp the first-pass output to the 18-bit bounds
   2681    vpbroadcastd        m12, [clip_18b_min]
   2682    vpbroadcastd        m13, [clip_18b_max]
   2683    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   2684    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   2685    call .transpose_8x8
   2686    vpbroadcastd        m11, [pd_2048]
   2687    call m(idct_8x8_internal_10bpc).main
   2688    call .round_shift4
            ; Pack-and-store tail shared with the 12bpc ADST routine
   2689    jmp m(iadst_8x8_internal_12bpc).pass2_end
   2690 ALIGN function_align
   2691 .write_8x4_start:
            ; Unlike the 10bpc label of the same name, this one returns
            ; instead of falling through, so callers invoke the 10bpc
            ; .write_8x4 themselves afterwards.
            ; m11 = 12 bpc pixel maximum, r6 = stride * 3, m10 = zero
   2692    vpbroadcastd        m11, [pixel_12bpc_max]
   2693    lea                  r6, [strideq*3]
   2694    pxor                m10, m10
   2695    ret
   2696 ALIGN function_align
   2697 .transpose_8x8:
   2698    TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
   2699    ret
   2700 ALIGN function_align
   2701 .round_shift4:
            ; In : register layout produced by idct_8x8_internal_10bpc.main
            ; Adds 8 to the four dct4 outputs so both results of each
            ; butterfly carry the bias before the final >> 4
   2702    vpbroadcastd         m1, [pd_8]
   2703    REPX      {paddd x, m1}, m0, m6, m5, m3
   2704    paddd                m1, m6, m7 ; out1
   2705    psubd                m6, m7     ; out6
   2706    psubd                m7, m0, m9 ; out7
   2707    paddd                m0, m9     ; out0
   2708    paddd                m2, m5, m4 ; out2
   2709    psubd                m5, m4     ; out5
   2710    psubd                m4, m3, m8 ; out4
   2711    paddd                m3, m8     ; out3
   2712    REPX       {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
   2713    ret
   2714 
   2715 INV_TXFM_8X8_FN adst, dct,      12
   2716 INV_TXFM_8X8_FN adst, adst,     12
   2717 INV_TXFM_8X8_FN adst, flipadst, 12
   2718 INV_TXFM_8X8_FN adst, identity, 12
   2719 
        ; iadst_8x8_internal_12bpc
        ; Internal ADST routine for 8x8 blocks at 12 bpc. The first pass is the
        ; 10bpc code run with 20-bit clip bounds.
        ;   .pass2       : second pass in dword precision
        ;   .pass2_end   : pack m0-m7 to words and store; also the tail of the
        ;                  12bpc DCT second pass
        ;   .pass2_main  : clamp, transpose, ADST core, round and shift
        ;   .pass2_main2 : entry that skips the clamp/transpose setup
   2720 cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
   2721    vpbroadcastd        m12, [clip_20b_min]
   2722    vpbroadcastd        m13, [clip_20b_max]
   2723    jmp m(iadst_8x8_internal_10bpc).pass1
   2724 .pass2:
   2725    call .pass2_main
   2726 .pass2_end:
            ; In: m0-m7 = eight rows of dwords in output order.
            ; Rows 0-3 are packed and written first, then rows 4-7.
   2727    packssdw             m0, m1
   2728    packssdw             m1, m2, m3
   2729    REPX {vpermq x, x, q3120}, m0, m1
   2730    call m(idct_8x8_internal_12bpc).write_8x4_start
   2731    call m(idct_8x8_internal_10bpc).write_8x4
   2732    packssdw             m0, m4, m5
   2733    packssdw             m1, m6, m7
   2734    REPX {vpermq x, x, q3120}, m0, m1
   2735    call m(idct_8x8_internal_10bpc).write_8x4
   2736    RET
   2737 ALIGN function_align
   2738 .pass2_main:
            ; Clamp the first-pass output to the 18-bit bounds, then transpose
   2739    vpbroadcastd        m12, [clip_18b_min]
   2740    vpbroadcastd        m13, [clip_18b_max]
   2741    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   2742    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   2743    call m(idct_8x8_internal_12bpc).transpose_8x8
   2744    vpbroadcastd        m11, [pd_2048]
   2745 .pass2_main2:
            ; The 10bpc .main2 returns with m8 = pd_1; shifting it left by 3
            ; yields pd_8 without a memory load
   2746    call m(iadst_8x8_internal_10bpc).main2
   2747    pslld                m9, m8, 3  ; pd_8
            ; out0/out6 and the negated out1/out7: (8 + out) >> 4
   2748    paddd                m0, m9
   2749    psubd                m1, m9, m1 ; 8+x
   2750    paddd                m6, m9
   2751    psubd                m7, m9, m7
   2752    REPX       {psrad x, 4}, m0, m1, m6, m7
            ; The four results still scaled by 1448 fold the scaling shift and
            ; the final >> 4 into a single >> 15:
            ; (8 + ((x + 1024) >> 11)) >> 4 = (17408 + x) >> 15, with
            ; 17407 - x used for the two outputs that must be negated
   2753    vpbroadcastd         m9, [pd_17408]
   2754    psubd                m8, m9, m8 ; 17407
   2755    paddd                m2, m9
   2756    psubd                m3, m8, m3
   2757    paddd                m4, m9
   2758    psubd                m5, m8, m5
   2759    REPX      {psrad x, 15}, m2, m3, m4, m5
   2760    ret
   2761 
   2762 INV_TXFM_8X8_FN flipadst, dct,      12
   2763 INV_TXFM_8X8_FN flipadst, adst,     12
   2764 INV_TXFM_8X8_FN flipadst, flipadst, 12
   2765 INV_TXFM_8X8_FN flipadst, identity, 12
   2766 
        ; iflipadst_8x8_internal_12bpc
        ; Internal flipadst routine for 8x8 blocks at 12 bpc. The first pass is
        ; the 10bpc flipadst code run with 20-bit clip bounds; the second pass
        ; reuses the 12bpc ADST core and packs its outputs in reverse order.
   2767 cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
   2768    vpbroadcastd        m12, [clip_20b_min]
   2769    vpbroadcastd        m13, [clip_20b_max]
   2770    jmp m(iflipadst_8x8_internal_10bpc).pass1
   2771 .pass2:
   2772    call m(iadst_8x8_internal_12bpc).pass2_main
            ; Pack with the operands swapped relative to the ADST .pass2_end,
            ; so the rows come out as 7,6 / 5,4 / 3,2 / 1,0. The 1,0 pair is
            ; parked in m6 because .write_8x4 takes its inputs in m0/m1.
   2773    packssdw             m7, m7, m6
   2774    packssdw             m6, m1, m0
   2775    packssdw             m1, m5, m4
   2776    vpermq               m0, m7, q3120
   2777    vpermq               m1, m1, q3120
   2778    call m(idct_8x8_internal_12bpc).write_8x4_start
   2779    call m(idct_8x8_internal_10bpc).write_8x4
   2780    packssdw             m0, m3, m2
   2781    vpermq               m0, m0, q3120
   2782    vpermq               m1, m6, q3120
   2783    call m(idct_8x8_internal_10bpc).write_8x4
   2784    RET
   2785 
   2786 INV_TXFM_8X8_FN identity, dct,      12
   2787 INV_TXFM_8X8_FN identity, adst,     12
   2788 INV_TXFM_8X8_FN identity, flipadst, 12
   2789 INV_TXFM_8X8_FN identity, identity, 12
   2790 
        ; iidentity_8x8_internal_12bpc
        ; Internal identity routine for 8x8 blocks at 12 bpc. Both passes reuse
        ; the 10bpc code; the only difference is the pixel maximum placed in
        ; m7 before joining the shared .pass2_main.
   2791 cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
   2792    jmp m(iidentity_8x8_internal_10bpc).pass1
   2793 .pass2:
            ; m7 is consumed by the pack first so it can then be reused for
            ; the pixel maximum
   2794    packssdw             m3, m7
   2795    vpbroadcastd         m7, [pixel_12bpc_max]
   2796    jmp m(iidentity_8x8_internal_10bpc).pass2_main
   2797 
   2798 %macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
        ; Instantiates INV_TXFM_FN for an 8x16 block (eob_offset defaults to 0,
        ; bitdepth to 10). For the dct_dct pair it additionally emits the
        ; DC-only setup, which pre-scales the DC value and then joins the 8x8
        ; 10bpc code at .dconly2 with a row count of 16.
        ; NOTE(review): INV_TXFM_FN is defined outside this excerpt; it
        ; presumably branches around the code below unless the block is
        ; DC-only -- confirm.
   2799    INV_TXFM_FN          %1, %2, %3, 8x16, %4
   2800 %ifidn %1_%2, dct_dct
            ; r6d = ((dc * 181 + 128) >> 8) * 181; m2 = dconly constant for the
            ; selected bitdepth; the coefficient is overwritten with eobd
            ; (0 here, per the existing note); r3d |= 16 sets the row count
   2801    imul                r6d, [cq], 181
   2802    vpbroadcastd         m2, [dconly_%4bpc]
   2803    mov                [cq], eobd ; 0
   2804    or                  r3d, 16
   2805    add                 r6d, 128
   2806    sar                 r6d, 8
   2807    imul                r6d, 181
   2808    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
   2809 %endif
   2810 %endmacro
   2811 
   2812 INV_TXFM_8X16_FN dct, dct
   2813 INV_TXFM_8X16_FN dct, identity, 35
   2814 INV_TXFM_8X16_FN dct, adst
   2815 INV_TXFM_8X16_FN dct, flipadst
   2816 
        ; idct_8x16_internal_10bpc
        ; Internal DCT routine for 8x16 blocks at 10 bpc, plus the 16-point
        ; DCT helpers that are reachable by label.
        ;   entry/.pass1        : first pass; result in m0-m15, jump via tx2q
        ;   .fast               : first pass when eob < 43 (m8-m15 zeroed)
        ;   .pass2 / .end       : second pass and store of 16 rows
        ;   .transpose          : pack m0-m15 to words and transpose
        ;   .pass1_main         : one 8-point DCT over every other 32-byte row
        ;   .main_evenhalf      : final butterflies of the 8-point even half
        ;   .main_oddhalf*      : odd half of the 16-point DCT; the results
        ;                         are stored to the buffer addressed by r6
   2817 cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
   2818 %undef cmp
   2819    vpbroadcastd        m12, [clip_18b_min]
   2820    vpbroadcastd        m13, [clip_18b_max]
   2821 .pass1:
   2822    vpbroadcastd        m14, [pd_2896]
   2823    vpbroadcastd        m11, [pd_2048]
   2824    cmp                eobd, 43
   2825    jl .fast
            ; Full path: transform the odd 32-byte rows first (cq + 32), park
            ; seven results in the odd coefficient slots and the eighth in
            ; m15, then transform the even rows into m0-m7
   2826    add                  cq, 32
   2827    call .pass1_main
   2828    sub                  cq, 32
   2829    mova         [cq+32* 1], m0
   2830    mova         [cq+32* 3], m1
   2831    mova         [cq+32* 5], m2
   2832    mova         [cq+32* 7], m3
   2833    mova         [cq+32* 9], m4
   2834    mova         [cq+32*11], m5
   2835    mova         [cq+32*13], m6
   2836    mova                m15, m7
   2837    call .pass1_main
            ; Reload the parked results into m8-m14. This overwrites the
            ; constants held in m11-m14, which are no longer needed here.
   2838    mova                 m8, [cq+32* 1]
   2839    mova                 m9, [cq+32* 3]
   2840    mova                m10, [cq+32* 5]
   2841    mova                m11, [cq+32* 7]
   2842    mova                m12, [cq+32* 9]
   2843    mova                m13, [cq+32*11]
   2844    mova                m14, [cq+32*13]
   2845    jmp                tx2q
   2846 .fast:
            ; Only the even rows are transformed; m8-m15 are set to zero
   2847    call .pass1_main
   2848    pxor                 m8, m8
   2849    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
   2850    jmp                tx2q
   2851 .pass2:
            ; Second pass runs on packed 16-bit data through the 8bpc core,
            ; which is defined outside this excerpt
   2852    call .transpose
   2853    call m(idct_8x16_internal_8bpc).main
   2854    vpbroadcastd        m12, [pw_2048]
   2855    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
   2856    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
   2857 .end:
            ; In: m0-m7 = 16 rows of packed words, m12 = pmulhrsw multiplier.
            ; Written as four groups of four rows.
   2858    pmulhrsw             m0, m12
   2859    pmulhrsw             m1, m12
   2860    call m(idct_8x8_internal_10bpc).write_8x4_start
   2861    pmulhrsw             m0, m2, m12
   2862    pmulhrsw             m1, m3, m12
   2863    call m(idct_8x8_internal_10bpc).write_8x4
   2864    pmulhrsw             m0, m4, m12
   2865    pmulhrsw             m1, m5, m12
   2866    call m(idct_8x8_internal_10bpc).write_8x4
   2867    pmulhrsw             m0, m6, m12
   2868    pmulhrsw             m1, m7, m12
   2869    call m(idct_8x8_internal_10bpc).write_8x4
   2870    RET
   2871 ALIGN function_align
   2872 .transpose:
            ; Narrow m0-m15 (dwords) to eight registers of words with signed
            ; saturation, then transpose; the result is left in m0-m7
   2873    packssdw             m0, m8
   2874    packssdw             m1, m9
   2875    packssdw             m2, m10
   2876    packssdw             m3, m11
   2877    packssdw             m4, m12
   2878    packssdw             m5, m13
   2879    packssdw             m6, m14
   2880    packssdw             m7, m15
            ; NOTE(review): r6 is not used in this helper; it is presumably a
            ; table base expected by the 8bpc routines called next -- confirm.
   2881    lea                  r6, [deint_shuf+128]
   2882    punpckhwd            m8, m0, m1
   2883    punpcklwd            m0, m1
   2884    punpckhwd            m1, m2, m3
   2885    punpcklwd            m2, m3
   2886    punpcklwd            m3, m4, m5
   2887    punpckhwd            m4, m5
   2888    punpckhwd            m5, m6, m7
   2889    punpcklwd            m6, m7
   2890    punpckhdq            m7, m3, m6
   2891    punpckldq            m3, m6
   2892    punpckhdq            m6, m4, m5
   2893    punpckldq            m4, m5
   2894    punpckhdq            m5, m8, m1
   2895    punpckldq            m8, m1
   2896    punpckhdq            m1, m0, m2
   2897    punpckldq            m0, m2
   2898    vperm2i128           m2, m0, m3, 0x31
   2899    vinserti128          m0, xm3, 1
   2900    vperm2i128           m3, m1, m7, 0x31
   2901    vinserti128          m1, xm7, 1
   2902    vperm2i128           m7, m5, m6, 0x31
   2903    vinserti128          m5, xm6, 1
   2904    vperm2i128           m6, m8, m4, 0x31
   2905    vinserti128          m4, m8, xm4, 1
   2906    ret
   2907 ALIGN function_align
   2908 .pass1_main:
            ; Load every other 32-byte row starting at cq, multiplied by m14
            ; (pd_2896); .main_rect2 then applies the matching (x + m11) >> 12.
            ; The final jmp is a tail call: .round_shift1 returns directly to
            ; the caller of .pass1_main.
   2909    pmulld               m0, m14, [cq+32* 0]
   2910    pmulld               m1, m14, [cq+32* 2]
   2911    pmulld               m2, m14, [cq+32* 4]
   2912    pmulld               m3, m14, [cq+32* 6]
   2913    pmulld               m4, m14, [cq+32* 8]
   2914    pmulld               m5, m14, [cq+32*10]
   2915    pmulld               m6, m14, [cq+32*12]
   2916    pmulld               m7, m14, [cq+32*14]
   2917    call m(idct_8x8_internal_10bpc).main_rect2
   2918    jmp  m(idct_8x8_internal_10bpc).round_shift1
   2919 ALIGN function_align
   2920 .main_evenhalf:
            ; In : register layout produced by idct_8x8_internal_10bpc.main
            ; Out: m0-m7 = idct8 out0-out7, clamped to m12/m13, with no
            ;      rounding or shift applied
   2921    paddd                m1, m6, m7  ; idct8 out1
   2922    psubd                m6, m7      ; idct8 out6
   2923    psubd                m7, m0, m9  ; idct8 out7
   2924    paddd                m0, m9      ; idct8 out0
   2925    paddd                m2, m5, m4  ; idct8 out2
   2926    psubd                m5, m4      ; idct8 out5
   2927    psubd                m4, m3, m8  ; idct8 out4
   2928    paddd                m3, m8      ; idct8 out3
   2929    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   2930    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   2931    ret
   2932 .main_oddhalf_fast_rect2:
            ; x = (x + m11) >> 12 on the four inputs, then fall through
   2933    REPX     {paddd x, m11}, m0, m1, m2, m3
   2934    REPX     {psrad x, 12 }, m0, m1, m2, m3
   2935 .main_oddhalf_fast: ; lower half zero
            ; With only m0-m3 as inputs each rotation reduces to two plain
            ; multiplies: every input is multiplied by its coefficient pair,
            ; giving the eight unrounded products in m0-m7
   2936    vpbroadcastd         m7, [pd_4076]
   2937    vpbroadcastd         m8, [pd_401]
   2938    vpbroadcastd         m6, [pd_m1189]
   2939    vpbroadcastd         m9, [pd_3920]
   2940    vpbroadcastd         m5, [pd_3612]
   2941    vpbroadcastd        m10, [pd_1931]
   2942    vpbroadcastd         m4, [pd_m2598]
   2943    vpbroadcastd        m15, [pd_3166]
   2944    pmulld               m7, m0
   2945    pmulld               m0, m8
   2946    pmulld               m6, m1
   2947    pmulld               m1, m9
   2948    pmulld               m5, m2
   2949    pmulld               m2, m10
   2950    pmulld               m4, m3
   2951    pmulld               m3, m15
   2952    jmp .main_oddhalf_fast2
   2953 .main_oddhalf_rect2:
            ; x = (x + m11) >> 12 on all eight inputs, then fall through
   2954    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   2955    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
   2956 .main_oddhalf:
            ; In : m0-m7 = odd inputs, m11 = pd_2048, m12/m13 = clip min/max,
            ;      m14 = pd_2896, r6 = pointer into a 256-byte output buffer
            ;      (stores cover r6-128 .. r6+127)
            ; NOTE(review): the rnd argument is `_` here; presumably that makes
            ; ITX_MULSUB_2D skip its round/shift, which is done for both paths
            ; at .main_oddhalf_fast2 -- confirm against the macro.
   2957    ITX_MULSUB_2D         0, 7, 8, 9, 10, _,  401, 4076 ; t8a,  t15a
   2958    ITX_MULSUB_2D         6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
   2959    ITX_MULSUB_2D         2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
   2960    ITX_MULSUB_2D         4, 3, 8, 9, 10, _, 3166, 2598 ; t9a,  t14a
   2961 .main_oddhalf_fast2:
            ; Round and shift the eight products: (x + m11) >> 12
   2962    REPX     {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
   2963    REPX     {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
   2964    psubd                m8, m0, m4 ; t9
   2965    paddd                m0, m4     ; t8
   2966    psubd                m4, m6, m2 ; t10
   2967    paddd                m2, m6     ; t11
   2968    psubd                m6, m1, m5 ; t13
   2969    paddd                m5, m1     ; t12
   2970    psubd                m1, m7, m3 ; t14
   2971    paddd                m7, m3     ; t15
   2972    REPX    {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
   2973    REPX    {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
            ; The coefficients are passed as register numbers (m10 = pd_1567,
            ; m15 = pd_3784) so both rotations share the two loads
   2974    vpbroadcastd        m15, [pd_3784]
   2975    vpbroadcastd        m10, [pd_1567]
   2976    ITX_MULSUB_2D         1, 8, 3, 9, _, 11, 10, 15
   2977    ITX_MULSUB_2D         6, 4, 3, 9, _, 11, 10, 15, 2
   2978    psubd                m3, m1, m4 ; t10
   2979    paddd                m1, m4     ; t9
   2980    psubd                m4, m0, m2 ; t11a
   2981    paddd                m0, m2     ; t8a
   2982    psubd                m2, m8, m6 ; t13
   2983    paddd                m6, m8     ; t14
   2984    psubd                m8, m7, m5 ; t12a
   2985    paddd                m7, m5     ; t15a
   2986    REPX    {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
   2987    REPX    {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
            ; Scale by m14 (pd_2896); the m11 bias is added once per pair so
            ; both butterfly outputs carry it before the >> 12
   2988    REPX    {pmulld x, m14}, m2, m8, m3, m4
   2989    paddd                m2, m11
   2990    paddd                m8, m11
   2991    paddd                m5, m2, m3 ; t13a
   2992    psubd                m2, m3     ; t10a
   2993    psubd                m3, m8, m4 ; t11
   2994    paddd                m4, m8     ; t12
   2995    REPX      {psrad x, 12}, m5, m2, m3, m4
            ; Store t15a, t14, t13a, t12, t11, t10a, t9, t8a at ascending
            ; offsets from r6-32*4
   2996    mova          [r6-32*4], m7
   2997    mova          [r6-32*3], m6
   2998    mova          [r6-32*2], m5
   2999    mova          [r6-32*1], m4
   3000    mova          [r6+32*0], m3
   3001    mova          [r6+32*1], m2
   3002    mova          [r6+32*2], m1
   3003    mova          [r6+32*3], m0
   3004    ret
   3005 
   3006 INV_TXFM_8X16_FN adst, dct
   3007 INV_TXFM_8X16_FN adst, adst
   3008 INV_TXFM_8X16_FN adst, flipadst
   3009 INV_TXFM_8X16_FN adst, identity, 35
   3010 
        ; iadst_8x16_internal_10bpc -- inverse-ADST stage for 8x16 blocks, 10 bpc.
        ; The INV_TXFM_8X16_FN lines above instantiate the adst_* entry points; the
        ; macro is defined outside this chunk.
        ; NOTE(review): presumably those entry points dispatch here with tx2q
        ; pointing at the second-pass routine -- confirm in the macro.
        ;   entry/.pass1 : first pass over the dword coefficients at cq; leaves
        ;                  sixteen registers m0-m15 and jumps to tx2q
        ;   .fast        : first pass when eob < 43 (one helper call, m8-m15 zero)
        ;   .pass2       : second pass, finished by idct_8x16_internal_10bpc.end
        ;   .pass1_main  : scaling helper, also called by
        ;                  iflipadst_8x16_internal_10bpc
   3011 cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
        ; NOTE(review): presumably needed because the argument named c makes x86inc
        ; define a cmp alias that would shadow the cmp instruction used below --
        ; confirm in x86inc.asm.
   3012 %undef cmp
        ; m12/m13 = clip bounds. The 12bpc variant loads 20-bit bounds instead and
        ; then joins at .pass1.
   3013    vpbroadcastd        m12, [clip_18b_min]
   3014    vpbroadcastd        m13, [clip_18b_max]
   3015 .pass1:
   3016    vpbroadcastd        m14, [pd_2896] ; multiplier used by .pass1_main
   3017    vpbroadcastd        m11, [pd_2048] ; rounding term for the >>12 in .pass1_main
   3018    cmp                eobd, 43
   3019    jl .fast ; eob < 43: a single .pass1_main call, second register half zeroed
        ; .pass1_main only reads rows 0,2,...,14 relative to cq, so bumping cq by
        ; one 32-byte row makes this first call consume rows 1,3,...,15.
   3020    add                  cq, 32
   3021    call .pass1_main
   3022    call m(iadst_8x8_internal_10bpc).main_end
   3023    sub                  cq, 32
        ; Park the first result set in the odd rows of the coefficient buffer
        ; (its last register is kept in m15) so that m0-m7 can be reused.
   3024    mova         [cq+32* 1], m0
   3025    mova         [cq+32* 3], m1
   3026    mova         [cq+32* 5], m2
   3027    mova         [cq+32* 7], m3
   3028    mova         [cq+32* 9], m4
   3029    mova         [cq+32*11], m5
   3030    mova         [cq+32*13], m6
   3031    mova                m15, m7
        ; Second call: rows 0,2,...,14 -> m0-m7.
   3032    call .pass1_main
   3033    call m(iadst_8x8_internal_10bpc).main_end
        ; Bring the parked results back as m8-m14. This overwrites the constants
        ; held in m11-m14, which are no longer needed in this pass.
   3034    mova                 m8, [cq+32* 1]
   3035    mova                 m9, [cq+32* 3]
   3036    mova                m10, [cq+32* 5]
   3037    mova                m11, [cq+32* 7]
   3038    mova                m12, [cq+32* 9]
   3039    mova                m13, [cq+32*11]
   3040    mova                m14, [cq+32*13]
   3041    jmp                tx2q
   3042 .fast:
        ; Only rows 0,2,...,14 are transformed; m8-m15 are set to zero.
   3043    call .pass1_main
   3044    call m(iadst_8x8_internal_10bpc).main_end
   3045    pxor                 m8, m8
   3046    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
   3047    jmp                tx2q
   3048 .pass2:
        ; Transpose, then reuse the 8bpc ADST kernel.
   3049    call m(idct_8x16_internal_10bpc).transpose
   3050    call m(iadst_8x16_internal_8bpc).main
   3051    call m(iadst_8x16_internal_8bpc).main_pass2_end
   3052    vpbroadcastd         m8, [pw_2048]
        ; xmm-width destination: only the low 128 bits receive pw_4096.
        ; NOTE(review): with VEX encoding the upper 128 bits become zero, so the
        ; psubw below yields +2048 in the low lane and -2048 in the high lane --
        ; presumably the per-lane scale/sign that .end expects in m12; confirm.
   3053    vpbroadcastd       xm12, [pw_4096]
   3054    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
   3055    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
   3056    psubw               m12, m8
   3057    jmp m(idct_8x16_internal_10bpc).end
   3058 ALIGN function_align
   3059 .pass1_main:
        ; In:  cq  = base of eight rows read at a 64-byte stride (32*0 .. 32*14)
        ;      m14 = multiplier (pd_2896 at the call sites in this chunk)
        ;      m11 = rounding term (pd_2048 at those call sites)
        ; Each row becomes (row * m14 + m11) >> 12, then control tail-jumps into
        ; iadst_8x8_internal_10bpc.main2, so that routine returns to our caller.
        ; NOTE(review): the row-to-register pairing (m0/m7 = rows 0/14, m1/m6 =
        ; rows 2/12, ...) is presumably the input order main2 expects; confirm.
   3060    pmulld               m0, m14, [cq+32* 0]
   3061    pmulld               m7, m14, [cq+32*14]
   3062    pmulld               m1, m14, [cq+32* 2]
   3063    pmulld               m6, m14, [cq+32*12]
   3064    pmulld               m2, m14, [cq+32* 4]
   3065    pmulld               m5, m14, [cq+32*10]
   3066    pmulld               m3, m14, [cq+32* 6]
   3067    pmulld               m4, m14, [cq+32* 8]
   3068    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   3069    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
   3070    jmp m(iadst_8x8_internal_10bpc).main2
   3071 
   3072 INV_TXFM_8X16_FN flipadst, dct
   3073 INV_TXFM_8X16_FN flipadst, adst
   3074 INV_TXFM_8X16_FN flipadst, flipadst
   3075 INV_TXFM_8X16_FN flipadst, identity, 35
   3076 
        ; iflipadst_8x16_internal_10bpc -- flipped inverse-ADST stage for 8x16
        ; blocks, 10 bpc. Structure mirrors iadst_8x16_internal_10bpc: it reuses
        ; that function's .pass1_main helper but finishes each first-pass half with
        ; iflipadst_8x8_internal_10bpc.main_end, and its second pass hands the
        ; result registers to the shared store code in reversed order.
   3077 cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
        ; NOTE(review): presumably needed because the argument named c makes x86inc
        ; define a cmp alias that would shadow the cmp instruction used below --
        ; confirm in x86inc.asm.
   3078 %undef cmp
        ; m12/m13 = clip bounds. The 12bpc variant loads 20-bit bounds instead and
        ; then joins at .pass1.
   3079    vpbroadcastd        m12, [clip_18b_min]
   3080    vpbroadcastd        m13, [clip_18b_max]
   3081 .pass1:
   3082    vpbroadcastd        m14, [pd_2896] ; multiplier used by .pass1_main
   3083    vpbroadcastd        m11, [pd_2048] ; rounding term for the >>12 in .pass1_main
   3084    cmp                eobd, 43
   3085    jl .fast ; eob < 43: a single helper call, second register half zeroed
        ; The helper reads rows 0,2,...,14 relative to cq; offsetting cq by one
        ; 32-byte row makes this first call consume rows 1,3,...,15.
   3086    add                  cq, 32
   3087    call m(iadst_8x16_internal_10bpc).pass1_main
   3088    call m(iflipadst_8x8_internal_10bpc).main_end
   3089    sub                  cq, 32
        ; Park the first result set in the odd rows of the coefficient buffer
        ; (last register kept in m15) while m0-m7 are reused.
   3090    mova         [cq+32* 1], m0
   3091    mova         [cq+32* 3], m1
   3092    mova         [cq+32* 5], m2
   3093    mova         [cq+32* 7], m3
   3094    mova         [cq+32* 9], m4
   3095    mova         [cq+32*11], m5
   3096    mova         [cq+32*13], m6
   3097    mova                m15, m7
        ; Second call: rows 0,2,...,14 -> m0-m7.
   3098    call m(iadst_8x16_internal_10bpc).pass1_main
   3099    call m(iflipadst_8x8_internal_10bpc).main_end
        ; Reload the parked results as m8-m14 (overwrites the constants in m11-m14).
   3100    mova                 m8, [cq+32* 1]
   3101    mova                 m9, [cq+32* 3]
   3102    mova                m10, [cq+32* 5]
   3103    mova                m11, [cq+32* 7]
   3104    mova                m12, [cq+32* 9]
   3105    mova                m13, [cq+32*11]
   3106    mova                m14, [cq+32*13]
   3107    jmp                tx2q
   3108 .fast:
        ; Only rows 0,2,...,14 are transformed; m8-m15 are set to zero.
   3109    call m(iadst_8x16_internal_10bpc).pass1_main
   3110    call m(iflipadst_8x8_internal_10bpc).main_end
   3111    pxor                 m8, m8
   3112    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
   3113    jmp                tx2q
   3114 .pass2:
        ; Same kernel calls as the non-flipped ADST second pass.
   3115    call m(idct_8x16_internal_10bpc).transpose
   3116    call m(iadst_8x16_internal_8bpc).main
   3117    call m(iadst_8x16_internal_8bpc).main_pass2_end
   3118    vpbroadcastd        m12, [pw_2048]
        ; xmm-width destination: only the low 128 bits of m13 receive pw_4096.
        ; NOTE(review): with VEX encoding the upper half is zeroed, so the final
        ; psubw gives m12 = -2048 (low lane) / +2048 (high lane), the opposite
        ; signs from the non-flipped variant; confirm against .end.
   3119    vpbroadcastd       xm13, [pw_4096]
        ; Reverse the register order: new m0..m3 come from old m7..m4 (q2031) and
        ; new m4..m7 from old m3..m0 (q3120). m8-m11 serve as temporaries for the
        ; old m3..m0 values.
   3120    mova                m11, m0
   3121    vpermq               m0, m7, q2031
   3122    mova                m10, m1
   3123    vpermq               m1, m6, q2031
   3124    mova                 m9, m2
   3125    vpermq               m2, m5, q2031
   3126    mova                 m8, m3
   3127    vpermq               m3, m4, q2031
   3128    vpermq               m4, m8, q3120
   3129    vpermq               m5, m9, q3120
   3130    vpermq               m6, m10, q3120
   3131    vpermq               m7, m11, q3120
   3132    psubw               m12, m13
   3133    jmp m(idct_8x16_internal_10bpc).end
   3134 
   3135 INV_TXFM_8X16_FN identity, dct
   3136 INV_TXFM_8X16_FN identity, adst
   3137 INV_TXFM_8X16_FN identity, flipadst
   3138 INV_TXFM_8X16_FN identity, identity
        ; The four lines above instantiate the identity_* 8x16 entry points; the
        ; macro itself is defined outside this chunk.
   3139 
        ; IDTX16 -- word-domain scaling step used by the 16-point identity code.
        ;   arg 1: register index of the source, updated in place
        ;   arg 2: register index of a scratch register (clobbered)
        ;   arg 3: register index holding the pw_1697x16 constant
        ;   arg 4: optional; selects the "downshift by 1" form
        ; With three arguments:  src = sat(2*src) + mulhrs(src, c)
        ; With four arguments:   src = src + (mulhrs(src, c) halved), where the
        ;   halving is a rounded pmulhrsw by register arg 4 when arg 4 is numeric,
        ;   or a plain arithmetic shift right by 1 when it is not.
        ; All additions are signed-saturating (paddsw).
        ; NOTE(review): assuming pw_1697x16 holds 1697*16, the three-argument form
        ; scales by roughly 2 + 1697/2048 -- confirm against the constant table.
   3140 %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
   3141    pmulhrsw            m%2, m%3, m%1
   3142 %if %0 == 4 ; if downshifting by 1
   3143 %ifnum %4
   3144    pmulhrsw            m%2, m%4
   3145 %else ; without rounding
   3146    psraw               m%2, 1
   3147 %endif
   3148 %else
   3149    paddsw              m%1, m%1
   3150 %endif
   3151    paddsw              m%1, m%2
   3152 %endmacro
   3153 
   3154 cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
        ; Identity-transform stage for 8x16 blocks, 10 bpc.
        ;   .pass1     : scales all sixteen 32-byte coefficient rows at cq by
        ;                (x * pd_2896 + pd_2048) >> 12, leaves them in m0-m15 and
        ;                jumps to tx2q. The 12bpc variant also enters here.
        ;   .pass2     : packs to words, applies IDTX16, then calls .pass2_end
        ;   .pass2_end : word transpose + final scale + store; also called by
        ;                the 12bpc variant
   3155 .pass1:
        ; m15 holds the multiplier until the last pmulld consumes it in place.
   3156    vpbroadcastd        m15, [pd_2896]
   3157    pmulld               m0, m15, [cq+32* 0]
   3158    pmulld               m8, m15, [cq+32* 1]
   3159    pmulld               m1, m15, [cq+32* 2]
   3160    pmulld               m9, m15, [cq+32* 3]
   3161    pmulld               m2, m15, [cq+32* 4]
   3162    pmulld              m10, m15, [cq+32* 5]
   3163    pmulld               m3, m15, [cq+32* 6]
   3164    pmulld              m11, m15, [cq+32* 7]
   3165    pmulld               m4, m15, [cq+32* 8]
   3166    pmulld              m12, m15, [cq+32* 9]
   3167    pmulld               m5, m15, [cq+32*10]
   3168    pmulld              m13, m15, [cq+32*11]
   3169    pmulld               m6, m15, [cq+32*12]
   3170    pmulld              m14, m15, [cq+32*13]
   3171    pmulld               m7, m15, [cq+32*14]
   3172    pmulld              m15,      [cq+32*15]
        ; All sixteen registers are live, so the m7 product is spilled to [cq]
        ; (row 0 has already been read) to free m7 for the rounding constant.
   3173    mova               [cq], m7
   3174    vpbroadcastd         m7, [pd_2048]
   3175    REPX     {paddd  x, m7}, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
   3176                             m8,  m9,  m10, m11, m12, m13, m14, m15
   3177    paddd                m7, [cq] ; m7 = rounding constant + spilled product
   3178    REPX     {psrad  x, 12}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   3179                             m8,  m9,  m10, m11, m12, m13, m14, m15
   3180    jmp                tx2q
   3181 .pass2:
        ; Pack dword pairs (mN, mN+8) to signed-saturated words. The last pair
        ; lands in m13 because m7 is needed for the pixel maximum below.
   3182    packssdw             m0, m8
   3183    packssdw             m1, m9
   3184    packssdw             m2, m10
   3185    packssdw             m3, m11
   3186    packssdw             m4, m12
   3187    packssdw             m5, m13
   3188    packssdw             m6, m14
   3189    packssdw            m13, m7, m15
   3190    vpbroadcastd         m8, [pw_1697x16]
        ; Three-argument IDTX16 form on each packed register (scratch = m9).
   3191    REPX {IDTX16   x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13
   3192    vpbroadcastd         m7, [pixel_10bpc_max]
   3193    vpbroadcastd        m12, [pw_2048]
   3194    call .pass2_end
   3195    RET
   3196 ALIGN function_align
   3197 .pass2_end:
        ; In:  m0-m6, m13 = eight registers of packed words
        ;      m12        = pmulhrsw scale (pw_2048 from the 10bpc caller above)
        ;      m7         = pixel maximum set by the caller
        ; NOTE(review): m7 is not referenced in this routine; presumably the
        ; write_2x8x2_* helpers use it for clamping -- confirm.
        ; The punpck*wd -> punpck*dq -> punpck*qdq cascade below interleaves the
        ; eight inputs at word, dword and qword granularity (an in-register
        ; transpose); results end up in m0-m3 and m8-m11.
   3198    punpckhwd            m9, m0, m1
   3199    punpcklwd            m0, m1
   3200    punpckhwd            m1, m6, m13
   3201    punpcklwd            m6, m13
   3202    punpckhwd           m13, m4, m5
   3203    punpcklwd            m4, m5
   3204    punpcklwd            m5, m2, m3
   3205    punpckhwd            m2, m3
   3206    punpckhdq            m3, m0, m5
   3207    punpckldq            m0, m5
   3208    punpckhdq           m11, m9, m2
   3209    punpckldq            m9, m2
   3210    punpckldq            m2, m4, m6
   3211    punpckhdq            m4, m6
   3212    punpckldq            m6, m13, m1
   3213    punpckhdq           m13, m1
   3214    punpckhqdq           m1, m0, m2
   3215    punpcklqdq           m0, m2
   3216    punpcklqdq           m2, m3, m4
   3217    punpckhqdq           m3, m4
   3218    punpcklqdq           m8, m9, m6
   3219    punpckhqdq           m9, m6
   3220    punpcklqdq          m10, m11, m13
   3221    punpckhqdq          m11, m13
        ; Scale and emit two registers at a time through the 8x8 identity write
        ; helpers, which take their data in m0/m1.
   3222    pmulhrsw             m0, m12
   3223    pmulhrsw             m1, m12
   3224    call m(iidentity_8x8_internal_10bpc).write_2x8x2_start
   3225    pmulhrsw             m0, m12, m2
   3226    pmulhrsw             m1, m12, m3
   3227    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
   3228    pmulhrsw             m0, m12, m8
   3229    pmulhrsw             m1, m12, m9
   3230    lea                dstq, [dstq+strideq*4] ; advance dst by four rows
   3231    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
   3232    pmulhrsw             m0, m12, m10
   3233    pmulhrsw             m1, m12, m11
   3234    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
   3235    ret
   3236 
   3237 INV_TXFM_8X16_FN dct, dct,       0, 12
   3238 INV_TXFM_8X16_FN dct, identity, 35, 12
   3239 INV_TXFM_8X16_FN dct, adst,      0, 12
   3240 INV_TXFM_8X16_FN dct, flipadst,  0, 12
   3241 
        ; idct_8x16_internal_12bpc -- inverse-DCT stage for 8x16 blocks, 12 bpc.
        ; The first pass is shared with the 10bpc function and only differs in the
        ; clip bounds loaded here (20-bit instead of 18-bit).
        ;   .pass2     : dword-domain second pass built from the 10bpc DCT helpers
        ;   .end       : pack to words and store; also the jump target of
        ;                iadst_8x16_internal_12bpc.pass2_end
        ;   .transpose : two-step 8x8 dword transposes; also called by
        ;                iadst_8x16_internal_12bpc.pass2_main
        ; NOTE(review): the 32*8 in the cglobal line is presumably the stack
        ; reservation (x86inc stack_size argument); .pass2 addresses it via r6.
   3242 cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
   3243    vpbroadcastd        m12, [clip_20b_min]
   3244    vpbroadcastd        m13, [clip_20b_max]
   3245    jmp m(idct_8x16_internal_10bpc).pass1
   3246 .pass2:
        ; r6 points at the middle of the 32*8-byte stack area. main_oddhalf
        ; stores its eight results to [r6-32*4] .. [r6+32*3].
   3247    lea                  r6, [rsp+32*4]
   3248    call .transpose
        ; After .transpose: cq rows 0-7 hold one transposed half, m0-m7 the other.
   3249    vpbroadcastd        m12, [clip_18b_min]
   3250    vpbroadcastd        m13, [clip_18b_max]
        ; Stash the even registers of the in-register half (used later by the
        ; even-half pass) ...
   3251    mova         [cq+32* 8], m0
   3252    mova         [cq+32*10], m2
   3253    mova         [cq+32*12], m4
   3254    mova         [cq+32*14], m6
        ; ... and gather the odd-half inputs into m0-m7, applying the lower clamp
        ; on the way: rows 1,3,5,7 from memory plus the odd registers m1,m3,m5,m7.
   3255    pmaxsd               m0, m12, [cq+32* 1]
   3256    pmaxsd               m4, m12, m1
   3257    pmaxsd               m1, m12, [cq+32* 3]
   3258    pmaxsd               m2, m12, [cq+32* 5]
   3259    pmaxsd               m6, m12, m5
   3260    pmaxsd               m5, m12, m3
   3261    pmaxsd               m3, m12, [cq+32* 7]
   3262    pmaxsd               m7, m12
   3263    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   3264    vpbroadcastd        m11, [pd_2048]
   3265    vpbroadcastd        m14, [pd_2896]
   3266    call m(idct_8x16_internal_10bpc).main_oddhalf
        ; Even half: clamp rows 0,2,...,14 (rows 8-14 were stashed above).
   3267    pmaxsd               m0, m12, [cq+32* 0]
   3268    pmaxsd               m1, m12, [cq+32* 2]
   3269    pmaxsd               m2, m12, [cq+32* 4]
   3270    pmaxsd               m3, m12, [cq+32* 6]
   3271    pmaxsd               m4, m12, [cq+32* 8]
   3272    pmaxsd               m5, m12, [cq+32*10]
   3273    pmaxsd               m6, m12, [cq+32*12]
   3274    pmaxsd               m7, m12, [cq+32*14]
   3275    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   3276    call m(idct_8x8_internal_10bpc).main
   3277    call m(idct_8x16_internal_10bpc).main_evenhalf
        ; Add the rounding term (8) to the even-half outputs before the
        ; rotations, then shift all sixteen results right by 4.
   3278    vpbroadcastd        m11, [pd_8]
   3279    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   3280    call m(idct_16x8_internal_10bpc).pass1_rotations
   3281    REPX       {psrad x, 4}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   3282                             m8,  m9,  m10, m11, m12, m13, m14, m15
   3283 .end:
        ; In: m0-m15 = dword results. Pack adjacent register pairs to words, then
        ; write four groups of two registers. Each vpermq q3120 swaps the two
        ; middle qwords, undoing the per-lane interleave left by packssdw.
   3284    packssdw             m0, m1
   3285    packssdw             m1, m2, m3
   3286    packssdw             m2, m4, m5
   3287    packssdw             m3, m6, m7
   3288    packssdw             m4, m8, m9
   3289    packssdw             m5, m10, m11
   3290    packssdw             m6, m12, m13
   3291    packssdw             m7, m14, m15
   3292    vpermq               m0, m0, q3120
   3293    vpermq               m1, m1, q3120
   3294    call m(idct_8x8_internal_12bpc).write_8x4_start
   3295    call m(idct_8x8_internal_10bpc).write_8x4
   3296    vpermq               m0, m2, q3120
   3297    vpermq               m1, m3, q3120
   3298    call m(idct_8x8_internal_10bpc).write_8x4
   3299    vpermq               m0, m4, q3120
   3300    vpermq               m1, m5, q3120
   3301    call m(idct_8x8_internal_10bpc).write_8x4
   3302    vpermq               m0, m6, q3120
   3303    vpermq               m1, m7, q3120
   3304    call m(idct_8x8_internal_10bpc).write_8x4
   3305    RET
   3306 ALIGN function_align
   3307 .transpose:
        ; In:  m0-m15 = first-pass output.
        ; Out: cq rows 0-7 = transpose_8x8 result of the original m0-m7
        ;      m0-m7       = transpose_8x8 result of the original m8-m15
        ; m8-m11 are saved to memory around the first transpose; m12-m15 are
        ; copied from the registers afterwards.
        ; NOTE(review): this relies on transpose_8x8 leaving m12-m15 and the cq
        ; buffer untouched -- confirm in that helper.
   3308    mova         [cq+32* 8], m8
   3309    mova         [cq+32* 9], m9
   3310    mova         [cq+32*10], m10
   3311    mova         [cq+32*11], m11
   3312    call m(idct_8x8_internal_12bpc).transpose_8x8
   3313    mova         [cq+32* 0], m0
   3314    mova         [cq+32* 1], m1
   3315    mova         [cq+32* 2], m2
   3316    mova         [cq+32* 3], m3
   3317    mova         [cq+32* 4], m4
   3318    mova         [cq+32* 5], m5
   3319    mova         [cq+32* 6], m6
   3320    mova         [cq+32* 7], m7
   3321    mova                 m0, [cq+32* 8]
   3322    mova                 m1, [cq+32* 9]
   3323    mova                 m2, [cq+32*10]
   3324    mova                 m3, [cq+32*11]
   3325    mova                 m4, m12
   3326    mova                 m5, m13
   3327    mova                 m6, m14
   3328    mova                 m7, m15
        ; Tail call: the second transpose returns directly to our caller.
   3329    jmp m(idct_8x8_internal_12bpc).transpose_8x8
   3330 
   3331 INV_TXFM_8X16_FN adst, dct,       0, 12
   3332 INV_TXFM_8X16_FN adst, adst,      0, 12
   3333 INV_TXFM_8X16_FN adst, flipadst,  0, 12
   3334 INV_TXFM_8X16_FN adst, identity, 35, 12
   3335 
        ; iadst_8x16_internal_12bpc -- inverse-ADST stage for 8x16 blocks, 12 bpc.
        ; The first pass is the 10bpc .pass1 run with 20-bit clip bounds. The
        ; second pass stays in the dword domain and is assembled from the
        ; iadst_16x8_internal_10bpc helpers.
        ;   .pass2_end  : final shifts, then the shared pack/store code; also the
        ;                 jump target of iflipadst_8x16_internal_12bpc
        ;   .pass2_main : transpose + ADST core; also called by
        ;                 iflipadst_8x16_internal_12bpc
   3336 cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
   3337    vpbroadcastd        m12, [clip_20b_min]
   3338    vpbroadcastd        m13, [clip_20b_max]
   3339    jmp m(iadst_8x16_internal_10bpc).pass1
   3340 .pass2:
        ; r6 = middle of the 32*8-byte stack area.
        ; NOTE(review): r6 is not referenced in this function; presumably the
        ; main_part1/main_part2 helpers use it as scratch -- confirm.
   3341    lea                  r6, [rsp+32*4]
   3342    call .pass2_main
   3343    call m(iadst_16x8_internal_10bpc).pass1_rotations
   3344 .pass2_end:
        ; Two register groups are shifted by different amounts: m0-m3 and m12-m15
        ; by 4, m4-m11 by 15.
        ; NOTE(review): the matching rounding terms are presumably the constants
        ; prepared at the end of .pass2_main (m13 = 17407, m14 = 17408, m15 = 8)
        ; and applied inside pass1_rotations -- confirm.
   3345    REPX      {psrad x, 4 }, m0,  m1,  m2,  m3,  m12, m13, m14, m15
   3346    REPX      {psrad x, 15}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
   3347    jmp m(idct_8x16_internal_12bpc).end
   3348 ALIGN function_align
   3349 .pass2_main:
        ; After .transpose: cq rows 0-7 hold one transposed half, m0-m7 the other.
   3350    call m(idct_8x16_internal_12bpc).transpose
   3351    vpbroadcastd        m13, [clip_18b_min]
   3352    vpbroadcastd        m14, [clip_18b_max]
        ; Stash the four registers needed only by main_part2 into cq.
   3353    mova         [cq+32* 8], m0
   3354    mova         [cq+32*11], m3
   3355    mova         [cq+32*12], m4
   3356    mova         [cq+32*15], m7
        ; Gather and clamp the main_part1 inputs; the trailing numbers are the
        ; input indices.
   3357    pmaxsd               m0, m13, [cq+32* 2] ;  2
   3358    pmaxsd               m3, m13, m1         ;  9
   3359    pmaxsd               m1, m13, m5         ; 13
   3360    pmaxsd               m4, m13, m2         ; 10
   3361    pmaxsd               m2, m13, [cq+32* 6] ;  6
   3362    pmaxsd               m5, m13, [cq+32* 5] ;  5
   3363    pmaxsd               m6, m13, m6         ; 14
   3364    pmaxsd               m7, m13, [cq+32* 1] ;  1
   3365    REPX    {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
   3366    vpbroadcastd        m12, [pd_2048]
   3367    vpbroadcastd        m15, [pd_2896]
   3368    call m(iadst_16x8_internal_10bpc).main_part1
        ; Gather and clamp the main_part2 inputs (rows 8, 11, 12, 15 were stashed
        ; above).
   3369    pmaxsd               m0, m13, [cq+32* 0] ;  0
   3370    pmaxsd               m1, m13, [cq+32*15] ; 15
   3371    pmaxsd               m2, m13, [cq+32* 4] ;  4
   3372    pmaxsd               m3, m13, [cq+32*11] ; 11
   3373    pmaxsd               m4, m13, [cq+32* 8] ;  8
   3374    pmaxsd               m5, m13, [cq+32* 7] ;  7
   3375    pmaxsd               m6, m13, [cq+32*12] ; 12
   3376    pmaxsd               m7, m13, [cq+32* 3] ;  3
   3377    REPX    {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
   3378    call m(iadst_16x8_internal_10bpc).main_part2
        ; Derive the constants for the caller from registers instead of loading
        ; each one. The pd_1 step assumes m15 still holds pd_2896 after
        ; main_part2 (2896 >> 11 = 1).
   3379    vpbroadcastd        m14, [pd_17408]
   3380    psrld               m15, 11              ; pd_1
   3381    psubd               m13, m14, m15        ; pd_17407
   3382    pslld               m15, 3               ; pd_8
   3383    ret
   3384 
   3385 INV_TXFM_8X16_FN flipadst, dct,       0, 12
   3386 INV_TXFM_8X16_FN flipadst, adst,      0, 12
   3387 INV_TXFM_8X16_FN flipadst, flipadst,  0, 12
   3388 INV_TXFM_8X16_FN flipadst, identity, 35, 12
   3389 
        ; iflipadst_8x16_internal_12bpc -- flipped inverse-ADST stage for 8x16
        ; blocks, 12 bpc. It contains no arithmetic of its own: the first pass is
        ; the 10bpc flipadst .pass1 run with 20-bit clip bounds, and the second
        ; pass reuses the 12bpc ADST core, swapping only the rotation helper for
        ; the flipadst one.
   3390 cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
   3391    vpbroadcastd        m12, [clip_20b_min]
   3392    vpbroadcastd        m13, [clip_20b_max]
   3393    jmp m(iflipadst_8x16_internal_10bpc).pass1
   3394 .pass2:
        ; r6 = middle of the 32*8-byte stack area, set up exactly as in the
        ; non-flipped 12bpc ADST second pass.
   3395    lea                  r6, [rsp+32*4]
   3396    call m(iadst_8x16_internal_12bpc).pass2_main
   3397    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
        ; Shared tail: final shifts, then pack/store.
   3398    jmp m(iadst_8x16_internal_12bpc).pass2_end
   3399 
   3400 INV_TXFM_8X16_FN identity, dct,      0, 12
   3401 INV_TXFM_8X16_FN identity, adst,     0, 12
   3402 INV_TXFM_8X16_FN identity, flipadst, 0, 12
   3403 INV_TXFM_8X16_FN identity, identity, 0, 12
   3404 
        ; iidentity_8x16_internal_12bpc -- identity-transform stage for 8x16
        ; blocks, 12 bpc. The first pass is the 10bpc .pass1 unchanged. The second
        ; pass does its scaling in the dword domain (.pass2_main) before packing,
        ; then reuses the 10bpc .pass2_end with a different scale and pixel
        ; maximum.
   3405 cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
   3406    jmp m(iidentity_8x16_internal_10bpc).pass1
   3407 .pass2:
   3408    call .pass2_main
        ; Pack dword pairs (mN, mN+8) to words. The last pair goes to m13, the
        ; register layout the 10bpc .pass2_end takes as input.
   3409    packssdw             m0, m8
   3410    packssdw             m1, m9
   3411    packssdw             m2, m10
   3412    packssdw             m3, m11
   3413    packssdw             m4, m12
   3414    packssdw             m5, m13
   3415    packssdw             m6, m14
   3416    packssdw            m13, m7, m15
   3417    vpbroadcastd         m7, [pixel_12bpc_max]
   3418    vpbroadcastd        m12, [pw_16384] ; pmulhrsw scale used by .pass2_end
   3419    call m(iidentity_8x16_internal_10bpc).pass2_end
   3420    RET
   3421 ALIGN function_align
   3422 .pass2_main:
        ; For each of m0-m15:
        ;   x = (clamp(x, clip_18b_min, clip_18b_max) * pd_5793 + pd_1024) >> 14
        ; All sixteen registers are live, so each step frees one register for its
        ; constant by spilling a data register to [cq]. The steps alternate
        ; between m7 and m15 as the constant holder, and the final op of each
        ; step combines the constant with the spilled value in place.
   3423    mova               [cq], m7
   3424    vpbroadcastd         m7, [clip_18b_min]
   3425    REPX     {pmaxsd x, m7}, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
   3426                             m8,  m9,  m10, m11, m12, m13, m14, m15
   3427    pmaxsd               m7, [cq] ; m7 = max(min bound, spilled m7)
   3428    mova               [cq], m15
   3429    vpbroadcastd        m15, [clip_18b_max]
   3430    REPX    {pminsd x, m15}, m0,  m1,  m2,  m3,  m4,  m5,  m6, m7, \
   3431                             m8,  m9,  m10, m11, m12, m13, m14
   3432    pminsd              m15, [cq] ; m15 = min(max bound, spilled m15)
   3433    mova               [cq], m7
   3434    vpbroadcastd         m7, [pd_5793]
   3435    REPX     {pmulld x, m7}, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
   3436                             m8,  m9,  m10, m11, m12, m13, m14, m15
   3437    pmulld               m7, [cq] ; m7 = 5793 * spilled m7
   3438    mova               [cq], m15
   3439    vpbroadcastd        m15, [pd_1024]
   3440    REPX    {paddd  x, m15}, m0,  m1,  m2,  m3,  m4,  m5,  m6, m7, \
   3441                             m8,  m9,  m10, m11, m12, m13, m14
   3442    paddd               m15, [cq] ; m15 = 1024 + spilled m15
   3443    REPX     {psrad  x, 14}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   3444                             m8,  m9,  m10, m11, m12, m13, m14, m15
   3445    ret
   3446 
   3447 %macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
        ; Instantiates one 16x4 inverse-transform entry point through INV_TXFM_FN
        ; (defined outside this chunk). The bitdepth argument defaults to 10.
        ; For the dct_dct combination the macro also emits the DC-only path:
        ; the 10-bit instantiation carries the code, other bit depths only load
        ; their own dconly constant into m3 and jump to the 10-bit copy.
        ; The .dconly, .dconly2 and .dconly3 labels are separate entry points
        ; into the same scaling chain.
        ; NOTE(review): presumably other block sizes jump to the later labels
        ; with r6d and r3d already prepared -- confirm at their call sites.
   3448    INV_TXFM_FN          %1, %2, 0, 16x4, %3
   3449 %ifidn %1_%2, dct_dct
   3450    vpbroadcastd         m3, [dconly_%3bpc]
   3451 %if %3 = 10
   3452 .dconly:
        ; r6d = DC coefficient * 181. The coefficient slot is then overwritten
        ; with eobd, which the existing comment marks as 0 on this path.
   3453    imul                r6d, [cq], 181
   3454    mov                [cq], eobd ; 0
        ; NOTE(review): r3d is presumably the eob register, zero here, so this
        ; sets the row counter to 4 -- confirm.
   3455    or                  r3d, 4
   3456 .dconly2:
   3457    add                 r6d, 384
   3458    sar                 r6d, 9
   3459 .dconly3:
   3460    imul                r6d, 181
   3461    add                 r6d, 2176
   3462    sar                 r6d, 12
        ; Broadcast (dc + bias) to every word, where bias is the dconly constant
        ; in m3.
   3463    movd                xm0, r6d
   3464    paddsw              xm0, xm3
   3465    vpbroadcastw         m0, xm0
   3466 .dconly_loop:
        ; Two destination rows per iteration: signed-saturating add of
        ; (dc + bias), then unsigned-saturating subtract of the bias.
        ; NOTE(review): the add/subtract pair presumably clamps the result to
        ; the valid pixel range without a separate min/max -- confirm against
        ; the dconly constant values.
   3467    paddsw               m1, m0, [dstq+strideq*0]
   3468    paddsw               m2, m0, [dstq+strideq*1]
   3469    psubusw              m1, m3
   3470    psubusw              m2, m3
   3471    mova   [dstq+strideq*0], m1
   3472    mova   [dstq+strideq*1], m2
   3473    lea                dstq, [dstq+strideq*2]
   3474    sub                 r3d, 2
   3475    jg .dconly_loop
   3476    RET
   3477 %else
   3478    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
   3479 %endif
   3480 %endif
   3481 %endmacro
   3482 
   3483 INV_TXFM_16X4_FN dct, dct
   3484 INV_TXFM_16X4_FN dct, identity
   3485 INV_TXFM_16X4_FN dct, adst
   3486 INV_TXFM_16X4_FN dct, flipadst
   3487 
        ; idct_16x4_internal_10bpc -- inverse-DCT stage for 16x4 blocks, 10 bpc.
        ;   entry/.pass1 : 16-point first pass on packed dword pairs; leaves
        ;                  m0-m7 and jumps to tx2q
        ;   .pass2       : pack/transpose, then the 8bpc word-domain kernel
        ;   .end/.end2/.end3 : scale, add to dst, clear coefficients, clamp, store
        ;   .pass1_main, .pass1_main2, .pass1_main3 : first-pass pieces
        ;   .transpose_4x16_packed : dword -> word pack + transpose, also called
        ;                  by the adst/flipadst 16x4 functions
        ; m8/m9 hold the clip bounds throughout the first pass.
   3488 cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
   3489    vpbroadcastd         m8, [clip_18b_min]
   3490    vpbroadcastd         m9, [clip_18b_max]
   3491 .pass1:
        ; Each 16-byte input row is broadcast to both 128-bit lanes. shufpd with
        ; mask 0x0c then merges two rows into one register; the trailing comments
        ; give the row indices of each pair.
   3492    vbroadcasti128       m0, [cq+16* 0]
   3493    vbroadcasti128       m4, [cq+16* 4]
   3494    vbroadcasti128       m1, [cq+16* 2]
   3495    vbroadcasti128       m7, [cq+16* 6]
   3496    vbroadcasti128       m5, [cq+16*10]
   3497    vbroadcasti128       m2, [cq+16* 8]
   3498    vbroadcasti128       m6, [cq+16*12]
   3499    vbroadcasti128       m3, [cq+16*14]
   3500    shufpd               m0, m4, 0x0c ;  0  4
   3501    shufpd               m1, m5, 0x0c ;  2 10
   3502    shufpd               m2, m6, 0x0c ;  8 12
   3503    shufpd               m3, m7, 0x0c ; 14  6
   3504    call .pass1_main ; even rows -> 8-point outputs in m0-m3
        ; Odd rows, paired the same way into m10, m11, m5, m6.
   3505    vbroadcasti128      m10, [cq+16* 1]
   3506    vbroadcasti128       m4, [cq+16* 5]
   3507    vbroadcasti128      m11, [cq+16*15]
   3508    vbroadcasti128       m5, [cq+16*11]
   3509    shufpd              m10, m4, 0x0c ;  1  5
   3510    shufpd              m11, m5, 0x0c ; 15 11
   3511    vbroadcasti128       m5, [cq+16* 9]
   3512    vbroadcasti128       m4, [cq+16*13]
   3513    shufpd               m5, m4, 0x0c ;  9 13
   3514    vbroadcasti128       m6, [cq+16* 7]
   3515    vbroadcasti128       m4, [cq+16* 3]
   3516    shufpd               m6, m4, 0x0c ;  7  3
   3517    call .pass1_main2
        ; m4 = all ones (-1), so the psubd adds 1 to m0-m3. .pass1_main3 forms
        ; every output as one of m0-m3 plus or minus an odd-half term, so all
        ; outputs carry this +1 before the final >>1.
   3518    pcmpeqd              m4, m4
   3519    REPX      {psubd x, m4}, m0, m1, m2, m3
   3520    call .pass1_main3
   3521    REPX      {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
   3522    jmp                tx2q
   3523 .pass2:
   3524    call .transpose_4x16_packed
        ; NOTE(review): r6 is presumably the constant-table base the 8bpc kernel
        ; addresses relative to -- confirm in the 8bpc source.
   3525    lea                  r6, [deint_shuf+128]
   3526    call m(idct_16x4_internal_8bpc).main
   3527 .end:
        ; In: m0-m3 = four rows of word results.
   3528    vpbroadcastd         m4, [pw_2048]
   3529    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
   3530    vpbroadcastd         m5, [pixel_10bpc_max]
   3531 .end2:
        ; In: m0-m3 = scaled rows, m5 = pixel maximum.
   3532    paddw                m0, [dstq+strideq*0]
   3533    paddw                m1, [dstq+strideq*1]
   3534 .end3:
        ; In: m0/m1 already added to dst rows 0/1, m2/m3 not yet, m5 = pixel max.
   3535    lea                  r6, [dstq+strideq*2]
   3536    paddw                m2, [r6  +strideq*0]
   3537    paddw                m3, [r6  +strideq*1]
   3538    pxor                 m4, m4
        ; Clear the coefficient buffer: eight 32-byte rows = 256 bytes.
   3539    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
        ; Clamp to [0, m5] and store the four rows.
   3540    REPX     {pmaxsw x, m4}, m0, m1, m2, m3
   3541    REPX     {pminsw x, m5}, m0, m1, m2, m3
   3542    mova   [dstq+strideq*0], m0
   3543    mova   [dstq+strideq*1], m1
   3544    mova   [r6  +strideq*0], m2
   3545    mova   [r6  +strideq*1], m3
   3546    RET
   3547 ALIGN function_align
   3548 .pass1_main:
        ; In:  m0-m3 = packed even-row pairs. Sets m7 = pd_2048 (rounding term),
        ;      which the later .pass1_main2 / .pass1_main3 calls read as well.
        ; Out: m0-m3 = 8-point outputs, packed as noted per line.
   3549    vpbroadcastd         m7, [pd_2048]
   3550    call m(idct_8x4_internal_10bpc).main
   3551    psubd                m3, m0, m4   ; idct8 out7 out6
   3552    paddd                m0, m4       ; idct8 out0 out1
   3553    paddd                m1, m2, m5   ; idct8 out3 out2
   3554    psubd                m2, m5       ; idct8 out4 out5
   3555    ret
   3556 ALIGN function_align
   3557 .pass1_main2:
        ; Odd half of the 16-point transform on packed pairs.
        ; In:  m10, m11, m5, m6 = packed odd-row pairs; m7 = pd_2048;
        ;      m8/m9 = clip min/max; m0-m3 = even-half outputs, which are only
        ;      clamped here.
        ; Out: m10, m11 = clamped t8a/t9 and t15a/t14 pairs; m5, m6 = products
        ;      by pd_2896, not yet rounded or shifted (done in .pass1_main3).
        ; Clobbers m4, m12, m13.
   3558    ITX_MULSUB_2D        10, 11, 4, 12, 13, 7,  401_1931, 4076_3612, 1
   3559    ITX_MULSUB_2D         5,  6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
        ; psignd by a {+,-} constant negates one element of each packed pair, so
        ; a single psubd yields both differences with the signs the next stage
        ; needs.
   3560    vbroadcasti128      m12, [pd_3784_m3784]
   3561    psubd                m4, m10, m5
   3562    paddd               m10, m5       ;  t8  t11
   3563    psignd               m4, m12      ;  t9  t10
   3564    psubd                m5, m11, m6
   3565    paddd               m11, m6       ; t15  t12
   3566    psignd               m5, m12      ; t14  t13
   3567    vpbroadcastd         m6, [pd_1567]
   3568    vpbroadcastd        m13, [pd_3784]
   3569    REPX     {pmaxsd x, m8}, m5, m4
   3570    REPX     {pminsd x, m9}, m5, m4
   3571    pmulld              m12, m5
   3572    pmulld               m5, m6
   3573    vbroadcasti128       m6, [pd_1567_m1567]
   3574    pmulld              m13, m4
   3575    pmulld               m4, m6
   3576    REPX     {pmaxsd x, m8}, m10, m11, m0, m1
   3577    REPX     {pminsd x, m9}, m10, m11, m0, m1
   3578    paddd               m12, m7
   3579    paddd                m5, m7
   3580    paddd                m4, m12
   3581    psubd                m5, m13
   3582    psrad                m4, 12       ; t14a t10a
   3583    psrad                m5, 12       ; t9a  t13a
   3584    vpbroadcastd        m12, [pd_2896]
        ; Regroup the qword halves so the next butterflies pair the right terms.
   3585    punpckhqdq           m6, m11, m5
   3586    punpcklqdq          m11, m4
   3587    punpckhqdq           m4, m10, m4
   3588    punpcklqdq          m10, m5
   3589    psubd                m5, m11, m6  ; t12a t13
   3590    paddd               m11, m6       ; t15a t14
   3591    psubd                m6, m10, m4  ; t11a t10
   3592    paddd               m10, m4       ; t8a  t9
   3593    REPX     {pmaxsd x, m8}, m5, m6
   3594    REPX     {pminsd x, m9}, m5, m6
   3595    pmulld               m5, m12
   3596    pmulld               m6, m12
   3597    REPX     {pmaxsd x, m8}, m2, m3, m11, m10
   3598    REPX     {pminsd x, m9}, m2, m3, m11, m10
   3599    ret
   3600 ALIGN function_align
   3601 .pass1_main3:
        ; Final butterflies of the 16-point transform.
        ; In:  m0-m3 = even-half outputs; m10, m11 and the m5/m6 products from
        ;      .pass1_main2; m7 = rounding term for the >>12 (pd_2048).
        ; Out: m0-m7 = packed outputs as noted per line. The closing pshufd swaps
        ;      the qword halves within each lane of m1, m3, m5 and m7.
   3602    paddd                m5, m7
   3603    psubd                m4, m5, m6
   3604    paddd                m5, m6
   3605    psrad                m4, 12      ; t11 t10a
   3606    psrad                m5, 12      ; t12 t13a
   3607    psubd                m7, m0, m11 ; out15 out14
   3608    paddd                m0, m11     ; out0  out1
   3609    psubd                m6, m1, m5  ; out12 out13
   3610    paddd                m1, m5      ; out3  out2
   3611    psubd                m5, m2, m4  ; out11 out10
   3612    paddd                m2, m4      ; out4  out5
   3613    psubd                m4, m3, m10 ; out8  out9
   3614    paddd                m3, m10     ; out7  out6
   3615    REPX {pshufd x, x, q1032}, m1, m3, m5, m7
   3616    ret
   3617 ALIGN function_align
   3618 .transpose_4x16_packed:
        ; In:  m0-m7 = dword data from the first pass.
        ; Out: m0-m3 = the same data packed to signed-saturated words, shuffled
        ;      by deint_shuf and regrouped at qword and 128-bit lane level.
        ; Clobbers m4, m6, m8.
   3619    vbroadcasti128       m8, [deint_shuf]
   3620    packssdw             m0, m1
   3621    packssdw             m2, m3
   3622    packssdw             m4, m5
   3623    packssdw             m6, m7
   3624    REPX     {pshufb x, m8}, m0, m2, m4, m6
   3625    punpckhqdq           m1, m0, m2
   3626    punpcklqdq           m0, m2
   3627    punpckhqdq           m2, m4, m6
   3628    punpcklqdq           m4, m6
   3629    vperm2i128           m3, m1, m2, 0x31
   3630    vinserti128          m1, xm2, 1
   3631    vperm2i128           m2, m0, m4, 0x31
   3632    vinserti128          m0, xm4, 1
   3633    ret
   3634 
   3635 INV_TXFM_16X4_FN adst, dct
   3636 INV_TXFM_16X4_FN adst, adst
   3637 INV_TXFM_16X4_FN adst, flipadst
   3638 INV_TXFM_16X4_FN adst, identity
   3639 
        ; iadst_16x4_internal_10bpc -- inverse-ADST stage for 16x4 blocks, 10 bpc.
        ;   entry/.pass1 : first pass through iadst_4x16_internal_10bpc.main, then
        ;                  a (x + 1) >> 1 rounding step
        ;   .pass1_end   : shared first-pass tail; also the jump target of
        ;                  iflipadst_16x4_internal_10bpc
        ;   .pass2       : pack/transpose, 8bpc kernel, shared idct 16x4 store
        ;   .main, .main_end : 4-point ADST core on dword rows read from cq
        ; NOTE(review): no call site for .main or .main_end is visible in this
        ; chunk; presumably other block sizes call them -- confirm.
   3640 cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
   3641    vpbroadcastd        m12, [clip_18b_min]
   3642    vpbroadcastd        m13, [clip_18b_max]
   3643 .pass1:
   3644    call m(iadst_4x16_internal_10bpc).main
        ; NOTE(review): per the existing comment m11 becomes pd_1; the value it
        ; holds on return from main (presumably pd_2048) is not visible here.
   3645    psrad               m11, 11 ; pd_1
        ; Add the rounding term. The helper returned its results in m0-m3 and
        ; m5-m8; the second group moves down into m4-m7.
   3646    REPX     {paddd x, m11}, m0, m1, m2, m3
   3647    paddd                m4, m5, m11
   3648    paddd                m5, m6, m11
   3649    paddd                m6, m7, m11
   3650    paddd                m7, m8, m11
   3651 .pass1_end:
        ; In: m0-m7 with the rounding term already added. Swap the qword halves
        ; within each lane of the even registers, then shift everything by 1.
   3652    REPX {pshufd x, x, q1032}, m0, m2, m4, m6
   3653    REPX       {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
   3654    jmp                tx2q
   3655 .pass2:
   3656    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
        ; NOTE(review): r6 is presumably the constant-table base the 8bpc kernel
        ; addresses relative to -- confirm in the 8bpc source.
   3657    lea                  r6, [deint_shuf+128]
   3658    call m(iadst_16x4_internal_8bpc).main
   3659    jmp m(idct_16x4_internal_10bpc).end
   3660 ALIGN function_align
   3661 .main:
        ; Reads eight 32-byte rows from cq. Rows 0/1 act as in0, rows 2/3 are
        ; only multiplied by pd_m3344, rows 4/5 act as in2 and rows 6/7 as in3,
        ; following the naming in the line comments. Each row pair is processed
        ; as two independent registers.
        ; Out (unrounded products and sums): m4/m7 = t0, m5/m9 = t1, m2/m0 = t2,
        ; m3/m1 = -t3. .main_end turns these into outputs.
        ; Clobbers m6, m8, m10.
   3662    vpbroadcastd         m6, [pd_1321]
   3663    mova                 m0, [cq+32*0]
   3664    mova                 m1, [cq+32*1]
   3665    vpbroadcastd         m7, [pd_2482]
   3666    mova                 m2, [cq+32*6]
   3667    mova                 m3, [cq+32*7]
   3668    pmulld               m4, m0, m6
   3669    pmulld               m5, m1, m6    ; 1321*in0
   3670    pmulld               m9, m2, m7
   3671    pmulld               m8, m3, m7    ; 2482*in3
   3672    paddd                m4, m9
   3673    paddd                m8, m5        ; 1321*in0 + 2482*in3
   3674    pmulld               m5, m0, m7
   3675    pmulld               m9, m1, m7    ; 2482*in0
   3676    paddd                m0, m2
   3677    paddd                m1, m3        ; in0 + in3
        ; The third constant is formed as 1321 + 2482 rather than loaded.
   3678    paddd                m7, m6        ; pd_3803
   3679    pmulld               m2, m7
   3680    pmulld               m3, m7        ; 3803*in3
   3681    psubd                m5, m2
   3682    psubd                m9, m3        ; 2482*in0 - 3803*in3
   3683    mova                 m2, [cq+32*4]
   3684    pmulld              m10, m7, m2
   3685    pmulld               m3, m6, m2
   3686    psubd                m2, m0
   3687    mova                 m0, [cq+32*5]
   3688    pmulld               m7, m0        ; 3803*in2
   3689    pmulld               m6, m0        ; 1321*in2
   3690    psubd                m0, m1        ; in2 - in0 - in3
   3691    vpbroadcastd         m1, [pd_m3344]
   3692    paddd                m4, m10
   3693    paddd                m7, m8        ; t0
   3694    psubd                m5, m3
   3695    psubd                m9, m6        ; t1
   3696    pmulld               m2, m1
   3697    pmulld               m0, m1        ; t2
   3698    pmulld               m3, m1, [cq+32*2]
   3699    pmulld               m1, [cq+32*3] ; -t3
   3700    ret
   3701 ALIGN function_align
   3702 .main_end:
        ; Combines the .main results into eight outputs and folds the caller
        ; supplied rounding term (m6) into each of them; the caller performs the
        ; shift afterwards.
        ; Out: m4, m5, m2, m3 = out0-out3 and m8, m9, m6, m7 = out4-out7.
        ; Clobbers m10.
   3703    ; expects: m6 = rnd
   3704    paddd                m5, m6
   3705    paddd                m9, m6
   3706    paddd               m10, m4, m5
   3707    paddd                m4, m6
   3708    paddd                m8, m7, m6
   3709    paddd                m7, m9
   3710    psubd                m4, m3        ; out0 (unshifted)
   3711    psubd                m5, m3        ; out1 (unshifted)
   3712    paddd                m2, m6        ; out2 (unshifted)
   3713    paddd                m3, m10       ; out3 (unshifted)
   3714    psubd                m8, m1        ; out4 (unshifted)
   3715    psubd                m9, m1        ; out5 (unshifted)
   3716    paddd                m6, m0        ; out6 (unshifted)
   3717    paddd                m7, m1        ; out7 (unshifted)
   3718    ret
   3719 
   3720 INV_TXFM_16X4_FN flipadst, dct
   3721 INV_TXFM_16X4_FN flipadst, adst
   3722 INV_TXFM_16X4_FN flipadst, flipadst
   3723 INV_TXFM_16X4_FN flipadst, identity
   3724 
        ; 16x4 flipped-ADST worker, 10 bpc.
        ; Pass 1 reuses the 4x16 ADST core and only differs from the plain ADST
        ; variant in how the result registers are renumbered before the shared
        ; tail (.pass1_end of iadst_16x4_internal_10bpc, outside this view).
        ; Pass 2 reuses the 8 bpc 16x4 ADST core on packed words and hands the
        ; four result rows to the writer in reverse register order.
   3725 cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
   3726    vpbroadcastd        m12, [clip_18b_min] ; m12/m13 = clip bounds for pass 1
   3727    vpbroadcastd        m13, [clip_18b_max]
   3728 .pass1:
           ; entry point shared with the 12 bpc variant, which jumps here with
           ; its own clip bounds already loaded in m12/m13
   3729    call m(iadst_4x16_internal_10bpc).main
   3730    psrad               m11, 11 ; pd_1
           ; add the rounding term while renumbering the eight results
           ; (source registers m3, m5, m2, m6, m1, m7, m0, m8 -> m4, m3, m5,
           ; m2, m6, m1, m7, m0); presumably this is the "flip" relative to
           ; the non-flipped pass 1 -- confirm against iadst_16x4 .pass1
   3731    paddd                m4, m3, m11
   3732    paddd                m3, m5, m11
   3733    paddd                m5, m2, m11
   3734    paddd                m2, m6, m11
   3735    paddd                m6, m1, m11
   3736    paddd                m1, m7, m11
   3737    paddd                m7, m0, m11
   3738    paddd                m0, m8, m11
   3739    jmp m(iadst_16x4_internal_10bpc).pass1_end
   3740 .pass2:
   3741    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
   3742    lea                  r6, [deint_shuf+128] ; r6 is set up for the 8 bpc routine called next
   3743    call m(iadst_16x4_internal_8bpc).main
   3744    vpbroadcastd         m4, [pw_2048]
           ; scale with pmulhrsw and reverse the row order: m3 -> row 0,
           ; m2 -> row 1, m1 -> m2, m0 -> m3
   3745    pmulhrsw             m5, m3, m4
   3746    pmulhrsw             m6, m2, m4
   3747    pmulhrsw             m2, m1, m4
   3748    pmulhrsw             m3, m0, m4
   3749    paddw                m0, m5, [dstq+strideq*0] ; add the residual to destination rows 0 and 1
   3750    paddw                m1, m6, [dstq+strideq*1]
   3751    vpbroadcastd         m5, [pixel_10bpc_max]
   3752    jmp m(idct_16x4_internal_10bpc).end3
   3753 
   3754 INV_TXFM_16X4_FN identity, dct
   3755 INV_TXFM_16X4_FN identity, adst
   3756 INV_TXFM_16X4_FN identity, flipadst
   3757 INV_TXFM_16X4_FN identity, identity
   3758 
        ; 16x4 identity worker, 10 bpc.
        ; Pass 1 scales every 32-bit coefficient as (x*5793 + 3072) >> 12 and
        ; leaves the eight result registers m0-m7 for the second-pass routine
        ; reached through tx2q.
        ; Pass 2 works on packed words: x += pmulhrsw(x, pw_1697x8), saturating,
        ; then continues in the shared 16x4 write-out code.
   3759 cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
   3760    vpbroadcastd         m8, [pd_5793]
           ; load all eight 32-byte coefficient blocks, swapping the two
           ; middle qwords of each (q3120)
   3761    vpermq               m0, [cq+32*0], q3120 ; 0 1
   3762    vpermq               m1, [cq+32*1], q3120 ; 2 3
   3763    vpermq               m2, [cq+32*2], q3120 ; 4 5
   3764    vpermq               m3, [cq+32*3], q3120 ; 6 7
   3765    vpermq               m4, [cq+32*4], q3120 ; 8 9
   3766    vpermq               m5, [cq+32*5], q3120 ; a b
   3767    vpermq               m6, [cq+32*6], q3120 ; c d
   3768    vpermq               m7, [cq+32*7], q3120 ; e f
   3769    vpbroadcastd         m9, [pd_3072]
   3770    REPX     {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
   3771    REPX     {paddd  x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
   3772    REPX     {psrad  x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
   3773    jmp                tx2q
   3774 .pass2:
   3775    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
   3776    vpbroadcastd         m7, [pw_1697x8]
   3777    pmulhrsw             m4, m7, m0
   3778    pmulhrsw             m5, m7, m1
   3779    pmulhrsw             m6, m7, m2
   3780    pmulhrsw             m7, m3        ; last use of the constant: m7 is overwritten in place
   3781    paddsw               m0, m4
   3782    paddsw               m1, m5
   3783    paddsw               m2, m6
   3784    paddsw               m3, m7
   3785    jmp m(idct_16x4_internal_10bpc).end
   3786 
   3787 INV_TXFM_16X4_FN dct, dct,      12
   3788 INV_TXFM_16X4_FN dct, identity, 12
   3789 INV_TXFM_16X4_FN dct, adst,     12
   3790 INV_TXFM_16X4_FN dct, flipadst, 12
   3791 
        ; 16x4 DCT worker, 12 bpc.
        ; Pass 1 is the 10 bpc code entered at its .pass1 label with the 20-bit
        ; clip bounds preloaded in m8/m9.
        ; Pass 2 stays in 32-bit precision: the pass-1 results are clamped to
        ; the 18-bit bounds, rearranged, run through the 4x16 DCT helpers, and
        ; only then shifted and packed to words for the shared write-out code.
   3792 cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
   3793    vpbroadcastd         m8, [clip_20b_min]
   3794    vpbroadcastd         m9, [clip_20b_max]
   3795    jmp m(idct_16x4_internal_10bpc).pass1
   3796 .pass2:
   3797    vpbroadcastd        m12, [clip_18b_min]
   3798    vpbroadcastd        m13, [clip_18b_max]
   3799    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   3800    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   3801    ; deinterleave
   3802    REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
   3803    ; transpose
   3804    punpcklqdq           m8, m0, m1
   3805    punpckhqdq           m0, m1
   3806    punpcklqdq           m9, m2, m3
   3807    punpckhqdq           m2, m3
   3808    punpcklqdq          m10, m4, m5
   3809    punpckhqdq           m4, m5
   3810    punpcklqdq          m11, m6, m7
   3811    punpckhqdq           m6, m7
           ; regroup 128-bit lanes; m12/m13 (clip bounds) are no longer
           ; needed here and are reused as data registers
   3812    vperm2i128           m3,  m0,  m2, 0x31   ; out6
   3813    vperm2i128           m1,  m0,  m2, 0x20   ; out2
   3814    vperm2i128           m7,  m4,  m6, 0x31   ; out7
   3815    vperm2i128           m5,  m4,  m6, 0x20   ; out3
   3816    vperm2i128          m13, m10, m11, 0x31   ; out5
   3817    vperm2i128          m12, m10, m11, 0x20   ; out1
   3818    vperm2i128          m11,  m8,  m9, 0x31   ; out4
   3819    vperm2i128          m10,  m8,  m9, 0x20   ; out0
   3820    call m(idct_4x16_internal_10bpc).pass1_main
           ; multiply the four remaining inputs (m10-m13) by the factor the
           ; helper left in m6 -- NOTE(review): value of m6 is set inside
           ; pass1_main, which is outside this view
   3821    pmulld               m0, m6, m10
   3822    pmulld               m2, m6, m11
   3823    pmulld               m4, m6, m12
   3824    pmulld               m6, m13
   3825    vpbroadcastd        m10, [pd_17408]
   3826    call m(idct_4x16_internal_10bpc).pass1_main2
   3827    REPX       {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
           ; narrow to signed words, pairing m0-m3 with m4-m7
   3828    packssdw             m0, m4
   3829    packssdw             m1, m5
   3830    packssdw             m2, m6
   3831    packssdw             m3, m7
   3832    vpbroadcastd         m5, [pixel_12bpc_max] ; upper pixel bound for the shared tail
   3833    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
   3834    jmp m(idct_16x4_internal_10bpc).end2
   3835 
   3836 INV_TXFM_16X4_FN adst, dct,      12
   3837 INV_TXFM_16X4_FN adst, adst,     12
   3838 INV_TXFM_16X4_FN adst, flipadst, 12
   3839 INV_TXFM_16X4_FN adst, identity, 12
   3840 
        ; 16x4 ADST worker, 12 bpc.
        ; Pass 1 is the 10 bpc code entered at its .pass1 label with the 20-bit
        ; clip bounds preloaded in m12/m13.
        ; Pass 2 is done in 32-bit precision by .pass2_main, which is also
        ; called by the flipped variant.
   3841 cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
   3842    vpbroadcastd        m12, [clip_20b_min]
   3843    vpbroadcastd        m13, [clip_20b_max]
   3844    jmp m(iadst_16x4_internal_10bpc).pass1
   3845 .pass2:
   3846    call .pass2_main
   3847    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
   3848    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3 ; m4 = pw_16384 (loaded by .pass2_main)
   3849    jmp m(idct_16x4_internal_10bpc).end2
   3850 ALIGN function_align
   3851 .pass2_main:
        ; In:  m0-m7 = 32-bit pass-1 results.
        ; Out: m0-m3 = four rows of packed words (already shifted right by 15),
        ;      m4 = pw_16384, m5 = pixel_12bpc_max.
        ; The coefficient buffer at cq is reused as scratch: the clamped and
        ; transposed inputs are written back so the 10 bpc ADST core, which
        ; reads its input from cq, can be reused.
        ; NOTE(review): relies on transpose_4x8 preserving m6-m9, m12 and m13;
        ; that helper is outside this view.
   3852    vpbroadcastd        m12, [clip_18b_min]
   3853    vpbroadcastd        m13, [clip_18b_max]
   3854    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7
   3855    pmaxsd               m8, m4, m12   ; second half is parked in m8, m9, m6, m7
   3856    pmaxsd               m9, m5, m12
   3857    REPX    {pminsd x, m13}, m0, m1, m2, m3
   3858    call m(iadst_8x4_internal_12bpc).transpose_4x8
           ; first half goes to the even 32-byte slots
   3859    mova          [cq+32*0], m0
   3860    mova          [cq+32*2], m1
   3861    mova          [cq+32*4], m2
   3862    mova          [cq+32*6], m3
           ; finish clamping the second half (upper bound) while moving it
           ; into m0-m3 for the same transpose
   3863    pminsd               m0, m8, m13
   3864    pminsd               m1, m9, m13
   3865    pminsd               m2, m6, m13
   3866    pminsd               m3, m7, m13
   3867    call m(iadst_8x4_internal_12bpc).transpose_4x8
           ; second half goes to the odd 32-byte slots
   3868    mova          [cq+32*1], m0
   3869    mova          [cq+32*3], m1
   3870    mova          [cq+32*5], m2
   3871    mova          [cq+32*7], m3
   3872    call m(iadst_16x4_internal_10bpc).main
   3873    vpbroadcastd         m6, [pd_2048]  ; rounding constant expected by .main_end
   3874    call m(iadst_16x4_internal_10bpc).main_end
           ; .main_end leaves out0-3 in m4, m5, m2, m3 and out4-7 in
           ; m8, m9, m6, m7; shift them and move them into m0-m7 order
   3875    psrad                m0, m4, 15
   3876    psrad                m1, m5, 15
   3877    psrad                m2, 15
   3878    psrad                m3, 15
   3879    psrad                m4, m8, 15
   3880    psrad                m5, m9, 15
   3881    psrad                m6, 15
   3882    psrad                m7, 15
           ; narrow to signed words: out0|out4, out1|out5, out2|out6, out3|out7
   3883    packssdw             m0, m4
   3884    packssdw             m1, m5
   3885    packssdw             m2, m6
   3886    packssdw             m3, m7
   3887    vpbroadcastd         m4, [pw_16384]
   3888    vpbroadcastd         m5, [pixel_12bpc_max]
   3889    ret
   3890 
   3891 INV_TXFM_16X4_FN flipadst, dct,      12
   3892 INV_TXFM_16X4_FN flipadst, adst,     12
   3893 INV_TXFM_16X4_FN flipadst, flipadst, 12
   3894 INV_TXFM_16X4_FN flipadst, identity, 12
   3895 
        ; 16x4 flipped-ADST worker, 12 bpc.
        ; Pass 1 is the 10 bpc flipadst code entered at its .pass1 label with
        ; the 20-bit clip bounds preloaded in m12/m13.
        ; Pass 2 shares .pass2_main with the plain 12 bpc ADST and then hands
        ; the four rows to the write-out code in reverse register order.
   3896 cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
   3897    vpbroadcastd        m12, [clip_20b_min]
   3898    vpbroadcastd        m13, [clip_20b_max]
   3899    jmp m(iflipadst_16x4_internal_10bpc).pass1
   3900 .pass2:
   3901    call m(iadst_16x4_internal_12bpc).pass2_main
           ; same qword shuffle as the non-flipped variant, but with the rows
           ; reversed: m3 -> m0, m2 -> m1, m1 -> m2, m0 -> m3 (m6/m7 are
           ; temporaries for the two rows that cannot be swapped in place)
   3902    vpermq               m7, m0, q3120
   3903    vpermq               m6, m1, q3120
   3904    vpermq               m1, m2, q3120
   3905    vpermq               m0, m3, q3120
   3906    pmulhrsw             m0, m4        ; m4 = pw_16384 (loaded by .pass2_main)
   3907    pmulhrsw             m1, m4
   3908    pmulhrsw             m2, m6, m4
   3909    pmulhrsw             m3, m7, m4
   3910    jmp m(idct_16x4_internal_10bpc).end2
   3911 
   3912 INV_TXFM_16X4_FN identity, dct,      12
   3913 INV_TXFM_16X4_FN identity, adst,     12
   3914 INV_TXFM_16X4_FN identity, flipadst, 12
   3915 INV_TXFM_16X4_FN identity, identity, 12
   3916 
        ; 16x4 identity worker, 12 bpc.
        ; Pass 1 computes x + ((x*1697 + 3072) >> 12) for every 32-bit
        ; coefficient, in two interleaved groups of four registers, and leaves
        ; the results in m0-m7 for the routine reached through tx2q.
        ; Pass 2 clamps to the 18-bit bounds, computes (x*5793 + 2048) >> 15,
        ; packs/transposes to words and continues in the shared write-out code.
   3917 cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
   3918    vpbroadcastd         m8, [pd_1697]
   3919    vpermq               m0, [cq+32*0], q3120 ; 0 1
   3920    vpermq               m1, [cq+32*1], q3120 ; 2 3
   3921    vpermq               m2, [cq+32*2], q3120 ; 4 5
   3922    vpermq               m3, [cq+32*3], q3120 ; 6 7
   3923    vpbroadcastd         m9, [pd_3072]
           ; first group: products go to m4-m7, the originals stay in m0-m3
   3924    pmulld               m4, m8, m0
   3925    pmulld               m5, m8, m1
   3926    pmulld               m6, m8, m2
   3927    pmulld               m7, m8, m3
   3928    vpermq              m10, [cq+32*4], q3120 ; 8 9
   3929    vpermq              m11, [cq+32*5], q3120 ; a b
   3930    vpermq              m12, [cq+32*6], q3120 ; c d
   3931    vpermq              m13, [cq+32*7], q3120 ; e f
   3932    REPX     {paddd  x, m9}, m4, m5, m6, m7
   3933    REPX     {psrad  x, 12}, m4, m5, m6, m7
           ; finish the first group in m0-m3 while starting the products of
           ; the second group (originals in m10-m13) in the freed m4-m7
   3934    paddd                m0, m4
   3935    pmulld               m4, m8, m10
   3936    paddd                m1, m5
   3937    pmulld               m5, m8, m11
   3938    paddd                m2, m6
   3939    pmulld               m6, m8, m12
   3940    paddd                m3, m7
   3941    pmulld               m7, m8, m13
   3942    REPX     {paddd  x, m9}, m4, m5, m6, m7
   3943    REPX     {psrad  x, 12}, m4, m5, m6, m7
   3944    paddd                m4, m10
   3945    paddd                m5, m11
   3946    paddd                m6, m12
   3947    paddd                m7, m13
   3948    jmp                tx2q
   3949 .pass2:
   3950    vpbroadcastd        m12, [clip_18b_min]
   3951    vpbroadcastd        m13, [clip_18b_max]
   3952    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   3953    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   3954    vpbroadcastd         m8, [pd_5793]
   3955    vpbroadcastd         m9, [pd_2048]
   3956    REPX     {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
   3957    REPX     {paddd  x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
   3958    REPX     {psrad  x, 15}, m0, m1, m2, m3, m4, m5, m6, m7
   3959    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
   3960    vpbroadcastd         m4, [pw_16384]
   3961    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
   3962    vpbroadcastd         m5, [pixel_12bpc_max] ; upper pixel bound for the shared tail
   3963    jmp m(idct_16x4_internal_10bpc).end2
   3964 
   3965 %macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth
        ; Emits the 16x8 entry point for the (type1, type2) transform pair via
        ; INV_TXFM_FN; the bit depth defaults to 10 when the third argument is
        ; omitted. For the dct_dct pair the code below is appended as well: it
        ; handles the DC coefficient only and finishes in the shared 16x4
        ; .dconly2 code.
        ; NOTE(review): how control reaches this DC-only code is decided inside
        ; INV_TXFM_FN, which is outside this view.
   3966    INV_TXFM_FN          %1, %2, 0, 16x8, %3
   3967 %ifidn %1_%2, dct_dct
   3968    imul                r6d, [cq], 181   ; r6d = dc * 181
   3969    vpbroadcastd         m3, [dconly_%3bpc]
   3970    mov                [cq], eobd ; 0
           ; presumably r3d is the register behind eobd (4th named argument),
           ; so this turns the 0 into 8 for .dconly2 -- TODO confirm
   3971    or                  r3d, 8
   3972    add                 r6d, 128
   3973    sar                 r6d, 8           ; r6d = (dc*181 + 128) >> 8
   3974    imul                r6d, 181         ; second multiply by 181; .dconly2 continues from here
   3975    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
   3976 %endif
   3977 %endmacro
   3978 
   3979 INV_TXFM_16X8_FN dct, dct
   3980 INV_TXFM_16X8_FN dct, identity
   3981 INV_TXFM_16X8_FN dct, adst
   3982 INV_TXFM_16X8_FN dct, flipadst
   3983 
        ; 16x8 DCT worker, 10 bpc. Reserves 32*8 bytes of stack scratch.
        ; Pass 1: 16-point DCT in 32-bit precision. Odd-numbered coefficient
        ; blocks are processed first and their results parked in the stack
        ; scratch (r6 = rsp+32*4), then the even-numbered blocks are processed
        ; and both halves are combined by .pass1_rotations. All 16 results
        ; (m0-m15) are rounded and shifted right by 1 before jumping to tx2q.
        ; Pass 2: packs to words, transposes, runs the 8 bpc 16x8 DCT core and
        ; writes the 16x8 block as two groups of four rows.
   3984 cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
   3985    vpbroadcastd        m12, [clip_18b_min]
   3986    vpbroadcastd        m13, [clip_18b_max]
   3987 .pass1:
   3988    vpbroadcastd        m14, [pd_2896]
           ; every input is pre-multiplied by 2896; the *_rect2 helpers
           ; called below are outside this view
   3989    pmulld               m0, m14, [cq+32* 1]
   3990    pmulld               m1, m14, [cq+32* 3]
   3991    pmulld               m2, m14, [cq+32* 5]
   3992    pmulld               m3, m14, [cq+32* 7]
   3993    pmulld               m4, m14, [cq+32* 9]
   3994    pmulld               m5, m14, [cq+32*11]
   3995    pmulld               m6, m14, [cq+32*13]
   3996    pmulld               m7, m14, [cq+32*15]
   3997    vpbroadcastd        m11, [pd_2048]
   3998    lea                  r6, [rsp+32*4]     ; middle of the scratch area, addressed as r6-32*4 .. r6+32*3
   3999    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
   4000    pmulld               m0, m14, [cq+32* 0]
   4001    pmulld               m1, m14, [cq+32* 2]
   4002    pmulld               m2, m14, [cq+32* 4]
   4003    pmulld               m3, m14, [cq+32* 6]
   4004    pmulld               m4, m14, [cq+32* 8]
   4005    pmulld               m5, m14, [cq+32*10]
   4006    pmulld               m6, m14, [cq+32*12]
   4007    pmulld               m7, m14, [cq+32*14]
   4008    call m(idct_8x8_internal_10bpc).main_rect2
   4009    call m(idct_8x16_internal_10bpc).main_evenhalf
   4010    psrld               m11, 11 ; pd_1
           ; adding 1 to the even half before the add/sub rotations puts the
           ; rounding term into all 16 outputs
   4011    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   4012    call .pass1_rotations
   4013    REPX       {psrad x, 1}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   4014                             m8,  m9,  m10, m11, m12, m13, m14, m15
   4015    jmp                tx2q
   4016 .pass2:
   4017    call .transpose
   4018    call m(idct_16x8_internal_8bpc).main
   4019    vpbroadcastd        m10, [pw_2048]
   4020 .end:
           ; rows 0-3: scale m0-m3 with pmulhrsw by m10 and write them out
   4021    pmulhrsw             m0, m10
   4022    pmulhrsw             m1, m10
   4023    pmulhrsw             m2, m10
   4024    pmulhrsw             m3, m10
   4025    call .write_16x4_start
   4026 .end2:
           ; rows 4-7: same for m4-m7; the writer advanced dstq and cq, so
           ; the second call also clears the second half of the coefficients
   4027    pmulhrsw             m0, m4, m10
   4028    pmulhrsw             m1, m5, m10
   4029    pmulhrsw             m2, m6, m10
   4030    pmulhrsw             m3, m7, m10
   4031    call .write_16x4_zero
   4032    RET
   4033 ALIGN function_align
   4034 .pass1_rotations:
        ; Final add/sub stage of the 16-point pass.
        ; In:  m0-m7 = one half of the intermediate values (in registers),
        ;      [r6-32*4] .. [r6+32*3] = the other half (eight 32-byte slots).
        ; Out: m(i) = m(i) + slot(i) = out(i), m(15-i) = m(i) - slot(i) =
        ;      out(15-i), for i = 0..7, where slot(0) is [r6-32*4].
   4035    mova                m14, [r6-32*4]
   4036    mova                m13, [r6-32*3]
   4037    mova                m12, [r6-32*2]
   4038    mova                m11, [r6-32*1]
   4039    mova                m10, [r6+32*0]
   4040    mova                 m9, [r6+32*1]
   4041    mova                 m8, [r6+32*2]
   4042    psubd               m15, m0, m14       ; out15
   4043    paddd                m0, m14           ; out0
   4044    psubd               m14, m1, m13       ; out14
   4045    paddd                m1, m13           ; out1
   4046    psubd               m13, m2, m12       ; out13
   4047    paddd                m2, m12           ; out2
   4048    psubd               m12, m3, m11       ; out12
   4049    paddd                m3, m11           ; out3
   4050    psubd               m11, m4, m10       ; out11
   4051    paddd                m4, m10           ; out4
   4052    psubd               m10, m5, m9        ; out10
   4053    paddd                m5, m9            ; out5
   4054    psubd                m9, m6, m8        ; out9
   4055    paddd                m6, m8            ; out6
           ; the eighth slot is used straight from memory: m8 is needed as
           ; the destination and no free register is left
   4056    psubd                m8, m7, [r6+32*3] ; out8
   4057    paddd                m7, [r6+32*3]     ; out7
   4058    ret
   4059 ALIGN function_align
   4060 .transpose:
        ; Three entry points:
        ;   .transpose  - additionally sets r6 = deint_shuf+128 (not used by
        ;                 the code below; presumably a constant base for the
        ;                 8 bpc routines the callers run next -- confirm)
        ;   .transpose2 - packs the 32-bit values in m0-m15 to words first
        ;   .transpose3 - shuffle network only, expects packed words in m0-m7
        ; Out: m0-m7. m8 is clobbered.
   4061    lea                  r6, [deint_shuf+128]
   4062 .transpose2:
           ; narrow dwords to words with signed saturation, m(i) low part
           ; paired with m(i+8)
   4063    packssdw             m0, m8
   4064    packssdw             m1, m9
   4065    packssdw             m2, m10
   4066    packssdw             m3, m11
   4067    packssdw             m4, m12
   4068    packssdw             m5, m13
   4069    packssdw             m6, m14
   4070    packssdw             m7, m15
   4071 .transpose3:
           ; stage 1: interleave words of adjacent register pairs
   4072    punpckhwd            m8, m0, m1
   4073    punpcklwd            m0, m1
   4074    punpcklwd            m1, m2, m3
   4075    punpckhwd            m2, m3
   4076    punpckhwd            m3, m4, m5
   4077    punpcklwd            m4, m5
   4078    punpckhwd            m5, m6, m7
   4079    punpcklwd            m6, m7
           ; stage 2: interleave dwords
   4080    punpckhdq            m7, m4, m6
   4081    punpckldq            m4, m6
   4082    punpckldq            m6, m8, m2
   4083    punpckhdq            m8, m2
   4084    punpckhdq            m2, m0, m1
   4085    punpckldq            m0, m1
   4086    punpckhdq            m1, m3, m5
   4087    punpckldq            m3, m5
           ; stage 3: interleave qwords
   4088    punpcklqdq           m5, m6, m3
   4089    punpckhqdq           m6, m3
   4090    punpckhqdq           m3, m2, m7
   4091    punpcklqdq           m2, m7
   4092    punpcklqdq           m7, m8, m1
   4093    punpckhqdq           m8, m1
   4094    punpckhqdq           m1, m0, m4
   4095    punpcklqdq           m0, m4
           ; stage 4: regroup 128-bit lanes; each pair yields one register
           ; built from the two high lanes (m4-m7) and one built from the
           ; two low lanes (m0-m3)
   4096    vperm2i128           m4, m0, m5, 0x31
   4097    vinserti128          m0, xm5, 1
   4098    vperm2i128           m5, m1, m6, 0x31
   4099    vinserti128          m1, xm6, 1
   4100    vperm2i128           m6, m2, m7, 0x31
   4101    vinserti128          m2, xm7, 1
   4102    vperm2i128           m7, m3, m8, 0x31
   4103    vinserti128          m3, xm8, 1
   4104    ret
   4105 ALIGN function_align
   4106 .write_16x4_start:
        ; Adds four rows of word residuals (m0-m3) to the destination, clamps
        ; and stores them, then advances dstq by four rows.
        ; Three entry points:
        ;   .write_16x4_start - sets up m9 = pixel_10bpc_max, r3 = stride*3 and
        ;                       m8 = 0, then falls through
        ;   .write_16x4_zero  - stores m8 over eight 32-byte coefficient slots
        ;                       and advances cq by 32*8, then falls through;
        ;                       callers entering here must still have m8 = 0
        ;                       and m9/r3 set up
        ;   .write_16x4       - add, clamp to [m8, m9], store
   4107    vpbroadcastd         m9, [pixel_10bpc_max]
   4108    lea                  r3, [strideq*3]
   4109    pxor                 m8, m8
   4110 .write_16x4_zero:
   4111    REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7
   4112    add                  cq, 32*8
   4113 .write_16x4:
   4114    paddw                m0, [dstq+strideq*0]
   4115    paddw                m1, [dstq+strideq*1]
   4116    paddw                m2, [dstq+strideq*2]
   4117    paddw                m3, [dstq+r3       ]
   4118    REPX     {pmaxsw x, m8}, m0, m1, m2, m3
   4119    REPX     {pminsw x, m9}, m0, m1, m2, m3
   4120    mova   [dstq+strideq*0], m0
   4121    mova   [dstq+strideq*1], m1
   4122    mova   [dstq+strideq*2], m2
   4123    mova   [dstq+r3       ], m3
   4124    lea                dstq, [dstq+strideq*4]
   4125    ret
   4126 
   4127 INV_TXFM_16X8_FN adst, dct
   4128 INV_TXFM_16X8_FN adst, adst
   4129 INV_TXFM_16X8_FN adst, flipadst
   4130 INV_TXFM_16X8_FN adst, identity
   4131 
        ; 16x8 ADST worker, 10 bpc. Reserves 32*8 bytes of stack scratch.
        ; Pass 1: .main computes the 16-point ADST in 32-bit precision and
        ; returns a mix of positive/negated and shifted/unshifted outputs;
        ; .pass1_rotations fixes signs and adds the rounding terms, and
        ; .pass1_end applies the two different final shifts.
        ; Pass 2: packs to words, transposes, runs the 8 bpc 16x8 ADST core and
        ; writes the block as two groups of four rows, negating every second
        ; row through the sign of the pmulhrsw factor.
   4132 cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
   4133    vpbroadcastd        m13, [clip_18b_min]
   4134    vpbroadcastd        m14, [clip_18b_max]
   4135 .pass1:
   4136    lea                  r6, [rsp+32*4] ; middle of the scratch area used by .main
   4137    call .main
   4138    vpbroadcastd        m14, [pd_3072]
           ; m15 was loaded with pd_2896 at the top of .main and is not
           ; written by the code visible there, so the shift leaves 1
   4139    psrld               m15, 11       ; pd_1
   4140    psubd               m13, m14, m15 ; pd_3071
   4141    call .pass1_rotations
   4142 .pass1_end:
           ; also the tail of the flipped variant, which jumps here
   4143    REPX      {psrad x, 1 }, m0,  m1,  m2,  m3,  m12, m13, m14, m15
   4144    REPX      {psrad x, 12}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
   4145    jmp                tx2q
   4146 .pass2:
   4147    call m(idct_16x8_internal_10bpc).transpose
   4148    call m(iadst_16x8_internal_8bpc).main
   4149    call m(iadst_16x8_internal_8bpc).main_pass2_end
   4150    vpbroadcastd        m10, [pw_2048]
   4151    pxor                m11, m11
   4152    psubw               m11, m10      ; m11 = -pw_2048
           ; even rows are scaled by +2048, odd rows by -2048
   4153    pmulhrsw             m0, m10
   4154    pmulhrsw             m1, m11
   4155    pmulhrsw             m2, m10
   4156    pmulhrsw             m3, m11
   4157    call m(idct_16x8_internal_10bpc).write_16x4_start
   4158    pmulhrsw             m0, m4, m10
   4159    pmulhrsw             m1, m5, m11
   4160    pmulhrsw             m2, m6, m10
   4161    pmulhrsw             m3, m7, m11
   4162    call m(idct_16x8_internal_10bpc).write_16x4_zero
   4163    RET
   4164 ALIGN function_align
   4165 .pass1_rotations:
        ; Moves the outputs of .main into out0..out15 order in m0..m15, undoing
        ; the negation of the odd outputs and adding the rounding terms.
        ; In (as left by .main, see its comments):
        ;   m0-m3 = out0, -out1, out2, -out3
        ;   m4-m7 = out4, -out5, out6, -out7        (unshifted)
        ;   m9-m12 = out8, -out9, out10, -out11      (unshifted)
        ;   [r6-32*1] = out12, [r6-32*2] = -out13,
        ;   [r6-32*3] = out14, [r6-32*4] = -out15
        ;   m13 = pd_3071, m14 = pd_3072, m15 = pd_1 (set up by the caller)
        ; Positive inputs get "+ bias"; negated inputs get "bias - x". The
        ; unshifted group uses 3072 for the former and 3071 for the latter,
        ; all others use 1.
   4166    paddd                m0, m15
   4167    psubd                m1, m15, m1
   4168    paddd                m2, m15
   4169    psubd                m3, m15, m3
   4170    paddd                m4, m14
   4171    psubd                m5, m13, m5
   4172    paddd                m6, m14
   4173    psubd                m7, m13, m7
           ; out8-out11 move down by one register (m9-m12 -> m8-m11)
   4174    paddd                m8, m14, m9
   4175    psubd                m9, m13, m10
   4176    paddd               m10, m14, m11
   4177    psubd               m11, m13, m12
           ; out12-out15 come from the scratch slots; m13/m14 are consumed
           ; as constants above before being overwritten here
   4178    paddd               m12, m15, [r6-32*1]
   4179    psubd               m13, m15, [r6-32*2]
   4180    paddd               m14, m15, [r6-32*3]
   4181    psubd               m15,      [r6-32*4]
   4182    ret
   4183 ALIGN function_align
   4184 .main:
        ; 16-point ADST core in 32-bit precision.
        ; In:  cq = 16 coefficient blocks of 32 bytes, r6 = middle of a
        ;      32*8-byte scratch area (slots r6-32*4 .. r6+32*3).
        ; Every input is first scaled as (x*2896 + 2048) >> 12. The inputs are
        ; processed in two groups of eight: .main_part1 handles the first group
        ; and parks its eight results in the scratch area, then the second
        ; group falls through into .main_part2, which also merges both groups.
        ; Out (per the comments on the final stages below):
        ;   m0 = out0, m1 = -out15 is stored instead (see below), m2 = out2,
        ;   m1 = -out1, m3 = -out3,
        ;   m4 = out4, m5 = -out5, m6 = out6, m7 = -out7      (unshifted)
        ;   m9 = out8, m10 = -out9, m11 = out10, m12 = -out11  (unshifted)
        ;   [r6-32*1] = out12, [r6-32*2] = -out13,
        ;   [r6-32*3] = out14, [r6-32*4] = -out15
        ; m8 is scratch. m15 is loaded with pd_2896 and not written again by
        ; the code visible here.
   4185    ; expects: m13 = clip_min   m14 = clip_max
   4186    vpbroadcastd        m15, [pd_2896]
   4187    pmulld               m0, m15, [cq+32* 2]
   4188    pmulld               m1, m15, [cq+32*13]
   4189    pmulld               m2, m15, [cq+32* 6]
   4190    pmulld               m3, m15, [cq+32* 9]
   4191    pmulld               m4, m15, [cq+32*10]
   4192    pmulld               m5, m15, [cq+32* 5]
   4193    pmulld               m6, m15, [cq+32*14]
   4194    pmulld               m7, m15, [cq+32* 1]
   4195    vpbroadcastd        m12, [pd_2048]
   4196    REPX     {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   4197    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
   4198    call .main_part1
   4199    pmulld               m0, m15, [cq+32* 0]
   4200    pmulld               m1, m15, [cq+32*15]
   4201    pmulld               m2, m15, [cq+32* 4]
   4202    pmulld               m3, m15, [cq+32*11]
   4203    pmulld               m4, m15, [cq+32* 8]
   4204    pmulld               m5, m15, [cq+32* 7]
   4205    pmulld               m6, m15, [cq+32*12]
   4206    pmulld               m7, m15, [cq+32* 3]
   4207    REPX     {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   4208    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
   4209 .main_part2:
           ; separate label: entry without the loads/scaling above; expects
           ; m0-m7 = second input group, m12 = pd_2048, m13/m14 = clip
           ; bounds and the .main_part1 results in the scratch area
   4210    ITX_MULSUB_2D         1, 0, 8, 9, 10, 12,  201, 4091
   4211    ITX_MULSUB_2D         3, 2, 8, 9, 10, 12, 1751, 3703
   4212    ITX_MULSUB_2D         5, 4, 8, 9, 10, 12, 3035, 2751
   4213    ITX_MULSUB_2D         7, 6, 8, 9, 10, 12, 3857, 1380
   4214    psubd                m8, m0, m4 ; t8a
   4215    paddd                m0, m4     ; t0a
   4216    psubd                m4, m1, m5 ; t9a
   4217    paddd                m1, m5     ; t1a
   4218    psubd                m5, m2, m6 ; t12a
   4219    paddd                m2, m6     ; t4a
   4220    psubd                m6, m3, m7 ; t13a
   4221    paddd                m7, m3     ; t5a
   4222    REPX    {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
   4223    REPX    {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
   4224    vpbroadcastd        m11, [pd_4017]
   4225    vpbroadcastd        m10, [pd_799]
   4226    ITX_MULSUB_2D         8, 4, 3, 9, _, 12, 10, 11
   4227    ITX_MULSUB_2D         6, 5, 3, 9, _, 12, 11, 10
   4228    psubd                m3, m0, m2 ; t4
   4229    paddd                m0, m2     ; t0
   4230    psubd                m2, m1, m7 ; t5
   4231    paddd                m1, m7     ; t1
   4232    psubd                m7, m4, m6 ; t12a
   4233    paddd                m4, m6     ; t8a
   4234    psubd                m6, m8, m5 ; t13a
   4235    paddd                m5, m8     ; t9a
   4236    REPX    {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
   4237    REPX    {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5
   4238    vpbroadcastd        m11, [pd_3784]
   4239    vpbroadcastd        m10, [pd_1567]
   4240    ITX_MULSUB_2D         3, 2, 8, 9, _, 12, 10, 11
   4241    ITX_MULSUB_2D         7, 6, 8, 9, _, 12, 10, 11
           ; merge with the .main_part1 results; the pminsd on the loads
           ; applies the upper clip that .main_part1 left for later
   4242    pminsd              m10, m14, [r6-32*4] ;  t2
   4243    pminsd               m8, m14, [r6-32*3] ;  t3
   4244    psubd                m9, m0, m10        ;  t2a
   4245    paddd                m0, m10            ;  out0
   4246    psubd               m10, m1, m8         ;  t3a
   4247    paddd                m1, m8             ; -out15
   4248    pmaxsd               m9, m13
   4249    pmaxsd              m10, m13
   4250    pminsd               m9, m14
   4251    pminsd              m10, m14
   4252    mova          [r6-32*4], m1             ; park -out15 in the slot just consumed
   4253    mova                m11, [r6-32*1]      ;  t7a
   4254    mova                 m1, [r6-32*2]      ;  t6a
   4255    psubd                m8, m3, m11        ;  t7
   4256    paddd               m11, m3             ;  out12
   4257    paddd                m3, m2, m1         ; -out3
   4258    psubd                m2, m1             ;  t6
   4259    pmaxsd               m8, m13
   4260    pmaxsd               m2, m13
   4261    pminsd               m8, m14
   4262    pminsd               m2, m14
   4263    mova          [r6-32*1], m11            ; park out12
   4264    mova          [r6-32*3], m2             ; park t6 temporarily (reloaded for the pd_1448 multiply)
   4265    mova                 m1, [r6+32*3]      ;  t15
   4266    mova                 m2, [r6+32*2]      ;  t14
   4267    paddd               m12, m7, m1         ; -out13
   4268    psubd                m7, m1             ;  t15a
   4269    psubd               m11, m6, m2         ;  t14a
   4270    paddd                m2, m6             ;  out2
   4271    pmaxsd               m7, m13
   4272    pmaxsd              m11, m13
   4273    pminsd               m7, m14
   4274    pminsd              m11, m14
   4275    mova          [r6-32*2], m12            ; park -out13
   4276    pminsd               m1, m14, [r6+32*0] ;  t10a
   4277    pminsd              m12, m14, [r6+32*1] ;  t11a
   4278    psubd                m6, m4, m1         ;  t10
   4279    paddd                m1, m4             ; -out1
   4280    psubd                m4, m5, m12        ;  t11
   4281    paddd                m5, m12            ;  out14
   4282    vpbroadcastd        m12, [pd_1448]
   4283    pmaxsd               m6, m13
   4284    pmaxsd               m4, m13
   4285    pminsd               m6, m14
   4286    pminsd               m4, m14
           ; the eight middle outputs are sums/differences of values scaled
           ; by 1448; rounding and the shift are left to the caller
   4287    REPX    {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4
   4288    pmulld              m12, [r6-32*3]      ;  t6
   4289    mova          [r6-32*3], m5             ; park out14 in the slot that held t6
   4290    paddd                m5, m11, m7        ; -out5  (unshifted)
   4291    psubd               m11, m7             ;  out10 (unshifted)
   4292    paddd                m7, m9, m10        ; -out7  (unshifted)
   4293    psubd                m9, m10            ;  out8  (unshifted)
   4294    psubd               m10, m6, m4         ; -out9  (unshifted)
   4295    paddd                m6, m4             ;  out6  (unshifted)
   4296    paddd                m4, m12, m8        ;  out4  (unshifted)
   4297    psubd               m12, m8             ; -out11 (unshifted)
   4298    ret
   4299 .main_part1:
        ; First half of the 16-point ADST: processes the first input group and
        ; parks its eight results in the scratch area for .main_part2.
        ; In:  m0-m7 = scaled inputs, m12 = pd_2048, m13/m14 = clip bounds,
        ;      r6 = middle of the scratch area.
        ; Out: [r6-32*4] = t2, [r6-32*3] = t3, [r6+32*0] = t10a,
        ;      [r6+32*1] = t11a (these four are only clipped from below here),
        ;      [r6-32*2], [r6-32*1], [r6+32*2], [r6+32*3] = the two rotated
        ;      pairs (read back in .main as t6a, t7a, t14, t15).
        ; m8-m11 are scratch.
   4300    ITX_MULSUB_2D         1, 0, 8, 9, 10, 12,  995, 3973
   4301    ITX_MULSUB_2D         3, 2, 8, 9, 10, 12, 2440, 3290
   4302    ITX_MULSUB_2D         5, 4, 8, 9, 10, 12, 3513, 2106
   4303    ITX_MULSUB_2D         7, 6, 8, 9, 10, 12, 4052,  601
   4304    psubd                m8, m0, m4 ; t10a
   4305    paddd                m0, m4     ; t2a
   4306    psubd                m4, m1, m5 ; t11a
   4307    paddd                m1, m5     ; t3a
   4308    psubd                m5, m2, m6 ; t14a
   4309    paddd                m2, m6     ; t6a
   4310    psubd                m6, m3, m7 ; t15a
   4311    paddd                m7, m3     ; t7a
   4312    REPX    {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
   4313    REPX    {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
   4314    vpbroadcastd        m11, [pd_2276]
   4315    vpbroadcastd        m10, [pd_3406]
   4316    ITX_MULSUB_2D         8, 4, 3, 9, _, 12, 10, 11
   4317    ITX_MULSUB_2D         6, 5, 3, 9, _, 12, 11, 10
   4318    psubd                m3, m0, m2 ; t6
   4319    paddd                m0, m2     ; t2
   4320    psubd                m2, m1, m7 ; t7
   4321    paddd                m1, m7     ; t3
   4322    psubd                m7, m4, m6 ; t14a
   4323    paddd                m4, m6     ; t10a
   4324    psubd                m6, m8, m5 ; t15a
   4325    paddd                m5, m8     ; t11a
   4326    REPX    {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
   4327    REPX    {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later
   4328    vpbroadcastd        m11, [pd_1567]
   4329    vpbroadcastd        m10, [pd_3784]
   4330    ITX_MULSUB_2D         2, 3, 8, 9, _, 12, 10, 11
   4331    ITX_MULSUB_2D         6, 7, 8, 9, _, 12, 10, 11
           ; m0, m1, m4, m5 still lack the upper clip; .main applies it with
           ; pminsd when it loads these slots
   4332    mova          [r6-32*4], m0
   4333    mova          [r6-32*3], m1
   4334    mova          [r6+32*0], m4
   4335    mova          [r6+32*1], m5
   4336    mova          [r6-32*2], m2
   4337    mova          [r6-32*1], m3
   4338    mova          [r6+32*2], m6
   4339    mova          [r6+32*3], m7
   4340    ret
   4341 
   4342 INV_TXFM_16X8_FN flipadst, dct
   4343 INV_TXFM_16X8_FN flipadst, adst
   4344 INV_TXFM_16X8_FN flipadst, flipadst
   4345 INV_TXFM_16X8_FN flipadst, identity
   4346 
        ; 16x8 flipped-ADST worker, 10 bpc. Reserves 32*8 bytes of stack scratch.
        ; Pass 1 shares the ADST core (.main) and the final shifts (.pass1_end)
        ; with iadst_16x8_internal_10bpc; only the local .pass1_rotations
        ; differs, placing the outputs in reverse register order.
        ; Pass 2 shares the 8 bpc ADST core and writes the eight result rows
        ; in reverse order, with the sign pattern reversed accordingly.
   4347 cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
   4348    vpbroadcastd        m13, [clip_18b_min]
   4349    vpbroadcastd        m14, [clip_18b_max]
   4350 .pass1:
   4351    lea                  r6, [rsp+32*4] ; middle of the scratch area used by .main
   4352    call m(iadst_16x8_internal_10bpc).main
   4353    vpbroadcastd        m14, [pd_3072]
   4354    psrld               m15, 11        ; same setup as the non-flipped variant, whose comment gives pd_1
   4355    psubd               m13, m14, m15  ; pd_3072 - m15
   4356    call .pass1_rotations
   4357    jmp m(iadst_16x8_internal_10bpc).pass1_end
   4358 .pass2:
   4359    call m(idct_16x8_internal_10bpc).transpose
   4360    call m(iadst_16x8_internal_8bpc).main
   4361    call m(iadst_16x8_internal_8bpc).main_pass2_end
   4362    vpbroadcastd        m10, [pw_2048]
   4363    pxor                m11, m11
   4364    psubw               m11, m10       ; m11 = -pw_2048
           ; rows 0-3 come from m7, m6, m5, m4 (scaled by -, +, -, +) while
           ; m0-m3 are saved in m12, m7, m6, m5 for the second group
   4365    mova                m12, m0
   4366    pmulhrsw             m0, m7, m11
   4367    mova                 m7, m1
   4368    pmulhrsw             m1, m6, m10
   4369    mova                 m6, m2
   4370    pmulhrsw             m2, m5, m11
   4371    mova                 m5, m3
   4372    pmulhrsw             m3, m4, m10
   4373    call m(idct_16x8_internal_10bpc).write_16x4_start
           ; rows 4-7 come from the saved m3, m2, m1, m0
   4374    pmulhrsw             m0, m5, m11
   4375    pmulhrsw             m1, m6, m10
   4376    pmulhrsw             m2, m7, m11
   4377    pmulhrsw             m3, m12, m10
   4378    call m(idct_16x8_internal_10bpc).write_16x4_zero
   4379    RET
   4380 ALIGN function_align
   4381 .pass1_rotations:
        ; Flipped counterpart of the ADST .pass1_rotations: takes the same
        ; inputs from iadst_16x8_internal_10bpc.main (m0-m7, m9-m12 and the
        ; four slots at r6-32*4 .. r6-32*1) and the same constants in
        ; m13/m14/m15, but leaves out(15-i) in m(i).
        ; Inputs that .main returns negated are subtracted from the bias,
        ; the others are added to it. The statement order matters: every
        ; register is read before it is overwritten.
   4382    psubd                m8, m13, m7
   4383    paddd                m7, m14, m9
   4384    paddd                m9, m14, m6
   4385    psubd                m6, m13, m10
   4386    psubd               m10, m13, m5
   4387    paddd                m5, m14, m11
   4388    paddd               m11, m14, m4
   4389    psubd                m4, m13, m12
           ; last use of m13/m14 as constants is above; from here on they
           ; are output registers and only the m15 bias is needed
   4390    psubd               m12, m15, m3
   4391    paddd                m3, m15, [r6-32*1]
   4392    paddd               m13, m15, m2
   4393    psubd                m2, m15, [r6-32*2]
   4394    psubd               m14, m15, m1
   4395    mova                 m1, m15       ; keep a copy of the bias: m15 becomes an output next
   4396    paddd               m15, m0
   4397    psubd                m0, m1, [r6-32*4]
   4398    paddd                m1,     [r6-32*3]
   4399    ret
   4400 
   4401 INV_TXFM_16X8_FN identity, dct
   4402 INV_TXFM_16X8_FN identity, adst
   4403 INV_TXFM_16X8_FN identity, flipadst
   4404 INV_TXFM_16X8_FN identity, identity
   4405 
   4406 cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
        ; Identity transform, 16x8.
        ; Pass 1 loads 16 vectors from cq into m0-m15 and rescales them in two
        ; multiply/round/shift stages, then jumps to tx2q.
        ; Pass 2 transposes and tail-jumps into idct_16x8_internal_10bpc.end.
        ; All 16 vector registers hold data, so [rsp] is used as a one-vector
        ; spill slot whenever a register is needed for a constant.
   4407 .pass1:
   4408    vpbroadcastd        m15, [pd_2896] ; stage 1 multiplier
   4409    pmulld               m0, m15, [cq+32* 0]
   4410    pmulld               m1, m15, [cq+32* 1]
   4411    pmulld               m2, m15, [cq+32* 2]
   4412    pmulld               m3, m15, [cq+32* 3]
   4413    pmulld               m4, m15, [cq+32* 4]
   4414    pmulld               m5, m15, [cq+32* 5]
   4415    pmulld               m6, m15, [cq+32* 6]
   4416    pmulld               m7, m15, [cq+32* 7]
   4417    pmulld               m8, m15, [cq+32* 8]
   4418    pmulld               m9, m15, [cq+32* 9]
   4419    pmulld              m10, m15, [cq+32*10]
   4420    pmulld              m11, m15, [cq+32*11]
   4421    pmulld              m12, m15, [cq+32*12]
   4422    pmulld              m13, m15, [cq+32*13]
   4423    pmulld              m14, m15, [cq+32*14]
   4424    pmulld              m15,      [cq+32*15] ; done last: m15 was the multiplier until here
   4425    mova              [rsp], m7 ; spill m7 so it can hold the rounding constant
   4426    vpbroadcastd         m7, [pd_2048]
   4427    REPX    {paddd  x, m7 }, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
   4428                             m8,  m9,  m10, m11, m12, m13, m14, m15
   4429    paddd                m7, [rsp] ; m7 = spilled value + rounding constant
   4430    REPX    {psrad  x, 12 }, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   4431                             m8,  m9,  m10, m11, m12, m13, m14, m15
           ; stage 1 complete: x = (x*pd_2896 + pd_2048) >> 12
   4432    mova              [rsp], m15 ; spill m15 so it can hold the stage 2 multiplier
   4433    vpbroadcastd        m15, [pd_5793]
   4434    REPX    {pmulld x, m15}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   4435                             m8,  m9,  m10, m11, m12, m13, m14
   4436    pmulld              m15, [rsp] ; m15 = spilled value * multiplier
   4437    mova              [rsp], m7 ; same spill trick as above for the second rounding constant
   4438    vpbroadcastd         m7, [pd_3072]
   4439    REPX    {paddd  x, m7 }, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
   4440                             m8,  m9,  m10, m11, m12, m13, m14, m15
   4441    paddd                m7, [rsp]
   4442    REPX    {psrad  x, 12 }, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   4443                             m8,  m9,  m10, m11, m12, m13, m14, m15
           ; stage 2 complete: x = (x*pd_5793 + pd_3072) >> 12
   4444    jmp                tx2q ; NOTE(review): presumably the second transform's .pass2 - confirm against INV_TXFM_FN
   4445 .pass2:
   4446    call m(idct_16x8_internal_10bpc).transpose
   4447    vpbroadcastd        m10, [pw_4096] ; NOTE(review): presumably the scale factor consumed at .end - confirm
   4448    jmp m(idct_16x8_internal_10bpc).end ; tail-jump: the shared code finishes the function
   4449 
   4450 INV_TXFM_16X8_FN dct, dct,      12 ; dct as type1 paired with each type2; trailing 12 is an extra argument (NOTE(review): presumably the bitdepth - confirm against the macro definition)
   4451 INV_TXFM_16X8_FN dct, identity, 12
   4452 INV_TXFM_16X8_FN dct, adst,     12
   4453 INV_TXFM_16X8_FN dct, flipadst, 12
   4454 
   4455 cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
        ; 12bpc DCT 16x8.
        ; Pass 1 only loads the 20-bit clip bounds and then reuses the 10bpc
        ; pass-1 code. Pass 2 runs the 8-point DCT twice, once per half held in
        ; m0-m7 and at cq+32*0..7, then packs and writes the pixels.
   4456    vpbroadcastd        m12, [clip_20b_min]
   4457    vpbroadcastd        m13, [clip_20b_max]
   4458    jmp m(idct_16x8_internal_10bpc).pass1 ; enter the shared pass 1 with m12/m13 already set
   4459 .pass2:
   4460    call .pass2_main ; falls through .end and returns here via the tail-jumped write_16x4_zero
   4461    RET
   4462 ALIGN function_align
   4463 .pass2_main:
   4464    call m(idct_8x16_internal_12bpc).transpose
   4465    vpbroadcastd        m12, [clip_18b_min]
   4466    vpbroadcastd        m13, [clip_18b_max]
   4467    vpbroadcastd        m11, [pd_2048]
   4468    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 ; clamp m0-m7 to [clip_18b_min, clip_18b_max]
   4469    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   4470    call m(idct_8x8_internal_10bpc).main
   4471    call m(idct_8x8_internal_12bpc).round_shift4
   4472    mova         [cq+32* 8], m0 ; park the first result set at cq+32*8..15 until .end packs it
   4473    mova         [cq+32* 9], m1
   4474    mova         [cq+32*10], m2
   4475    mova         [cq+32*11], m3
   4476    mova         [cq+32*12], m4
   4477    mova         [cq+32*13], m5
   4478    mova         [cq+32*14], m6
   4479    mova         [cq+32*15], m7
   4480    pmaxsd               m0, m12, [cq+32*0] ; load the other half from cq+32*0..7, fused with the lower clamp
   4481    pmaxsd               m1, m12, [cq+32*1]
   4482    pmaxsd               m2, m12, [cq+32*2]
   4483    pmaxsd               m3, m12, [cq+32*3]
   4484    pmaxsd               m4, m12, [cq+32*4]
   4485    pmaxsd               m5, m12, [cq+32*5]
   4486    pmaxsd               m6, m12, [cq+32*6]
   4487    pmaxsd               m7, m12, [cq+32*7]
   4488    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   4489    call m(idct_8x8_internal_10bpc).main
   4490    call m(idct_8x8_internal_12bpc).round_shift4
        ; no ret here: .pass2_main deliberately falls through into .end
   4491 .end:
   4492    packssdw             m0, [cq+32* 8] ; narrow to 16 bits with signed saturation, pairing with the parked first result set
   4493    packssdw             m1, [cq+32* 9]
   4494    packssdw             m2, [cq+32*10]
   4495    packssdw             m3, [cq+32*11]
   4496    packssdw             m4, [cq+32*12]
   4497    packssdw             m5, [cq+32*13]
   4498    packssdw             m6, [cq+32*14]
   4499    packssdw             m7, [cq+32*15]
   4500    REPX {vpermq x, x, q3120}, m0, m1, m2, m3 ; swap the two middle qwords: packssdw works per 128-bit lane
   4501    call .write_16x4_start
   4502    call m(idct_16x8_internal_10bpc).write_16x4_zero ; first group, from m0-m3
   4503    vpermq               m0, m4, q3120
   4504    vpermq               m1, m5, q3120
   4505    vpermq               m2, m6, q3120
   4506    vpermq               m3, m7, q3120
   4507    jmp m(idct_16x8_internal_10bpc).write_16x4_zero ; second group (from m4-m7); tail-jump, its ret returns to our caller
   4508 ALIGN function_align
   4509 .write_16x4_start:
        ; Sets up the registers used by the write_16x4_zero calls that follow:
        ; m9 = broadcast pixel_12bpc_max, r3 = stride*3, m8 = 0.
   4510    vpbroadcastd         m9, [pixel_12bpc_max]
   4511    lea                  r3, [strideq*3]
   4512    pxor                 m8, m8
   4513    ret
   4514 
   4515 INV_TXFM_16X8_FN adst, dct,      12 ; adst as type1 paired with each type2; trailing 12 as in the other 12bpc groups (NOTE(review): presumably the bitdepth - confirm)
   4516 INV_TXFM_16X8_FN adst, adst,     12
   4517 INV_TXFM_16X8_FN adst, flipadst, 12
   4518 INV_TXFM_16X8_FN adst, identity, 12
   4519 
   4520 cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
        ; 12bpc ADST 16x8.
        ; Pass 1 loads the 20-bit clip bounds and reuses the 10bpc pass-1 code.
        ; Pass 2 runs iadst_8x8_internal_12bpc.pass2_main2 twice via .pass2_main,
        ; then uses idct_16x8_internal_12bpc.end to pack and write.
   4521    vpbroadcastd        m13, [clip_20b_min]
   4522    vpbroadcastd        m14, [clip_20b_max]
   4523    jmp m(iadst_16x8_internal_10bpc).pass1 ; enter the shared pass 1 with m13/m14 already set
   4524 .pass2:
   4525    call .pass2_main
   4526    call m(idct_16x8_internal_12bpc).end ; packs against cq+32*8..15 and writes all rows
   4527    RET
   4528 ALIGN function_align
   4529 .pass2_main:
        ; Returns with one result set in m0-m7 and the other at cq+32*8..15.
        ; Also called by iflipadst_16x8_internal_12bpc.
   4530    call m(idct_8x16_internal_12bpc).transpose
   4531    vpbroadcastd        m12, [clip_18b_min]
   4532    vpbroadcastd        m13, [clip_18b_max]
   4533    vpbroadcastd        m11, [pd_2048]
   4534    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 ; clamp m0-m7 to [clip_18b_min, clip_18b_max]
   4535    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   4536    call m(iadst_8x8_internal_12bpc).pass2_main2
   4537    mova         [cq+32* 8], m0 ; park the first result set at cq+32*8..15
   4538    mova         [cq+32* 9], m1
   4539    mova         [cq+32*10], m2
   4540    mova         [cq+32*11], m3
   4541    mova         [cq+32*12], m4
   4542    mova         [cq+32*13], m5
   4543    mova         [cq+32*14], m6
   4544    mova         [cq+32*15], m7
   4545    pmaxsd               m0, m12, [cq+32*0] ; load the other half from cq+32*0..7, fused with the lower clamp
   4546    pmaxsd               m1, m12, [cq+32*1]
   4547    pmaxsd               m2, m12, [cq+32*2]
   4548    pmaxsd               m3, m12, [cq+32*3]
   4549    pmaxsd               m4, m12, [cq+32*4]
   4550    pmaxsd               m5, m12, [cq+32*5]
   4551    pmaxsd               m6, m12, [cq+32*6]
   4552    pmaxsd               m7, m12, [cq+32*7]
   4553    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   4554    call m(iadst_8x8_internal_12bpc).pass2_main2
   4555    ret
   4556 
   4557 INV_TXFM_16X8_FN flipadst, dct,      12 ; flipadst as type1 paired with each type2; trailing 12 as in the other 12bpc groups (NOTE(review): presumably the bitdepth - confirm)
   4558 INV_TXFM_16X8_FN flipadst, adst,     12
   4559 INV_TXFM_16X8_FN flipadst, flipadst, 12
   4560 INV_TXFM_16X8_FN flipadst, identity, 12
   4561 
   4562 cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
        ; 12bpc flipped ADST 16x8.
        ; Pass 1 loads the 20-bit clip bounds and reuses the 10bpc flipadst
        ; pass-1 code. Pass 2 shares iadst_16x8_internal_12bpc.pass2_main and
        ; then packs with the register order reversed, so the rows are written
        ; in the opposite order to idct_16x8_internal_12bpc.end.
   4563    vpbroadcastd        m13, [clip_20b_min]
   4564    vpbroadcastd        m14, [clip_20b_max]
   4565    jmp m(iflipadst_16x8_internal_10bpc).pass1
   4566 .pass2:
   4567    call m(iadst_16x8_internal_12bpc).pass2_main
   4568    packssdw            m13, m0, [cq+32* 8] ; m0..m3 land in m13..m10: these are written in the second group
   4569    packssdw            m12, m1, [cq+32* 9]
   4570    packssdw            m11, m2, [cq+32*10]
   4571    packssdw            m10, m3, [cq+32*11]
   4572    packssdw             m3, m4, [cq+32*12] ; m4..m7 land in m3..m0: these are written in the first group
   4573    packssdw             m2, m5, [cq+32*13]
   4574    packssdw             m1, m6, [cq+32*14]
   4575    packssdw             m0, m7, [cq+32*15]
   4576    REPX {vpermq x, x, q3120}, m0, m1, m2, m3 ; swap the two middle qwords: packssdw works per 128-bit lane
   4577    call m(idct_16x8_internal_12bpc).write_16x4_start
   4578    call m(idct_16x8_internal_10bpc).write_16x4_zero
   4579    vpermq               m0, m10, q3120
   4580    vpermq               m1, m11, q3120
   4581    vpermq               m2, m12, q3120
   4582    vpermq               m3, m13, q3120
   4583    call m(idct_16x8_internal_10bpc).write_16x4_zero
   4584    RET
   4585 
   4586 INV_TXFM_16X8_FN identity, dct,      12 ; identity as type1 paired with each type2; trailing 12 as in the other 12bpc groups (NOTE(review): presumably the bitdepth - confirm)
   4587 INV_TXFM_16X8_FN identity, adst,     12
   4588 INV_TXFM_16X8_FN identity, flipadst, 12
   4589 INV_TXFM_16X8_FN identity, identity, 12
   4590 
   4591 cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
        ; 12bpc identity 16x8. Pass 1 is the 10bpc identity pass 1 unchanged.
        ; Pass 2 differs from the 10bpc version only in calling the 12bpc
        ; write_16x4_start (which loads pixel_12bpc_max) before the first write.
   4592    jmp m(iidentity_16x8_internal_10bpc).pass1
   4593 .pass2:
   4594    call m(idct_16x8_internal_10bpc).transpose2
   4595    vpbroadcastd        m10, [pw_4096]
   4596    pmulhrsw             m0, m10 ; scale the first four vectors here; m10 stays live for the code jumped to below
   4597    pmulhrsw             m1, m10
   4598    pmulhrsw             m2, m10
   4599    pmulhrsw             m3, m10
   4600    call m(idct_16x8_internal_12bpc).write_16x4_start
   4601    call m(idct_16x8_internal_10bpc).write_16x4_zero
   4602    jmp m(idct_16x8_internal_10bpc).end2 ; NOTE(review): presumably scales and writes the remaining vectors, then returns - confirm
   4603 
   4604 %macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
        ; Forwards to INV_TXFM_FN with the size fixed to 16x16. eob_offset
        ; defaults to 0 and bitdepth to 10.
        ; For the dct_dct combination only, the code below is emitted as well.
        ; It scales the single coefficient at [cq] and jumps to the 16x4
        ; dconly3 code:
        ;   r6d = ([cq] * 181 + 640) >> 10
        ;   m3  = broadcast of dconly_<bitdepth>bpc
        ;   r3d |= 16 (NOTE(review): presumably the row count for dconly3 - confirm)
        ; [cq] is overwritten with eobd, which the inline comment says is 0 here.
   4605    INV_TXFM_FN          %1, %2, %3, 16x16, %4
   4606 %ifidn %1_%2, dct_dct
   4607    imul                r6d, [cq], 181
   4608    vpbroadcastd         m3, [dconly_%4bpc]
   4609    mov                [cq], eobd ; 0
   4610    or                  r3d, 16
   4611    add                 r6d, 640
   4612    sar                 r6d, 10
   4613    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
   4614 %endif
   4615 %endmacro
   4616 
   4617 INV_TXFM_16X16_FN dct, dct ; defaults apply: eob_offset 0, bitdepth 10; this is the only pair that also gets the dc-only code
   4618 INV_TXFM_16X16_FN dct, identity, 28 ; eob_offset = 28
   4619 INV_TXFM_16X16_FN dct, adst
   4620 INV_TXFM_16X16_FN dct, flipadst
   4621 
   4622 cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
        ; DCT 16x16, 10bpc.
        ; Pass 1 runs .main once or twice. Each call covers one 32-byte half of
        ; every 64-byte coefficient row. Outputs are formed as sum/difference
        ; pairs and shifted right by 2.
        ; Pass 2 transposes, runs the 8bpc 16x16 DCT core and writes 16 rows.
        ; Also provides .write_16x16, .transpose and .main, which other 16x16
        ; functions in this file call.
   4623    vpbroadcastd        m12, [clip_18b_min]
   4624    vpbroadcastd        m13, [clip_18b_max]
   4625 .pass1:
   4626    vpbroadcastd        m11, [pd_2048]
   4627    vpbroadcastd        m14, [pd_2896]
   4628    lea                  r6, [rsp+32*4] ; r6 addresses an 8-vector window [r6-32*4 .. r6+32*3]
   4629    sub                eobd, 36 ; eobd stays biased by -36; .transpose later tests its sign
   4630    jl .fast ; eob < 36: only one .main call is made
   4631    add                  cq, 32 ; select the half at byte offset 32 of each 64-byte row
   4632    call .main
   4633    sub                  cq, 32
           ; Combine m0-m7 with the eight vectors at [r6-32*4 .. r6+32*3]
           ; (NOTE(review): presumably left there by .main's odd-half call - confirm).
           ; This is done three or four outputs at a time because only m8-m15 are free.
   4634    mova                m10, [r6-32*4]
   4635    mova                 m9, [r6-32*3]
   4636    mova                 m8, [r6-32*2]
   4637    psubd               m15, m0, m10 ; out15
   4638    paddd                m0, m10     ; out0
   4639    psubd               m10, m1, m9  ; out14
   4640    paddd                m1, m9      ; out1
   4641    psubd                m9, m2, m8  ; out13
   4642    paddd                m2, m8      ; out2
   4643    REPX       {psrad x, 2}, m0, m1, m2
   4644    mova          [r6-32*4], m0
   4645    mova          [r6-32*3], m1
   4646    mova          [r6-32*2], m2
   4647    mova                 m2, [r6-32*1]
   4648    mova                 m1, [r6+32*0]
   4649    mova                 m0, [r6+32*1]
   4650    REPX       {psrad x, 2}, m9, m10, m15
   4651    psubd                m8, m3, m2 ; out12
   4652    paddd                m3, m2     ; out3
   4653    psubd                m2, m4, m1 ; out11
   4654    paddd                m4, m1     ; out4
   4655    psubd                m1, m5, m0 ; out10
   4656    paddd                m5, m0     ; out5
   4657    REPX       {psrad x, 2}, m3, m4, m5
   4658    mova          [r6-32*1], m3
   4659    mova          [r6+32*0], m4
   4660    mova          [r6+32*1], m5
   4661    mova                 m4, [r6+32*2]
   4662    mova                 m3, [r6+32*3]
   4663    REPX       {psrad x, 2}, m1, m2, m8
   4664    psubd                m5, m6, m4 ; out9
   4665    paddd                m6, m4     ; out6
   4666    psubd                m4, m7, m3 ; out8
   4667    paddd                m7, m3     ; out7
   4668    REPX       {psrad x, 2}, m6, m7, m4, m5
   4669    mova          [r6+32*2], m6
   4670    mova          [r6+32*3], m7
   4671    add                  r6, 32*8 ; advance to the next window; out8..out15 are stored there in order
   4672    mova          [r6-32*4], m4
   4673    mova          [r6-32*3], m5
   4674    mova          [r6-32*2], m1
   4675    mova          [r6-32*1], m2
   4676    mova          [r6+32*0], m8
   4677    mova          [r6+32*1], m9
   4678    mova          [r6+32*2], m10
   4679    mova          [r6+32*3], m15
   4680 .fast:
   4681    add                  r6, 32*8 ; scratch window for this .main call (past any outputs stored above)
   4682    call .main ; cq is at its original value here: the half at byte offset 0 of each row
   4683    mova                m14, [r6-32*4]
   4684    mova                m13, [r6-32*3]
   4685    mova                m12, [r6-32*2]
   4686    mova                m11, [r6-32*1]
   4687    mova                m10, [r6+32*0]
   4688    mova                 m9, [r6+32*1]
   4689    mova                 m8, [r6+32*2]
   4690    psubd               m15, m0, m14       ; out15
   4691    paddd                m0, m14           ; out0
   4692    psubd               m14, m1, m13       ; out14
   4693    paddd                m1, m13           ; out1
   4694    psubd               m13, m2, m12       ; out13
   4695    paddd                m2, m12           ; out2
   4696    psubd               m12, m3, m11       ; out12
   4697    paddd                m3, m11           ; out3
   4698    psubd               m11, m4, m10       ; out11
   4699    paddd                m4, m10           ; out4
   4700    psubd               m10, m5, m9        ; out10
   4701    paddd                m5, m9            ; out5
   4702    psubd                m9, m6, m8        ; out9
   4703    paddd                m6, m8            ; out6
   4704    psubd                m8, m7, [r6+32*3] ; out8
   4705    paddd                m7, [r6+32*3]     ; out7
   4706    sub                  r6, 32*8 ; undo the add at .fast; on the full path r6 then addresses the stored out8..out15
   4707    REPX       {psrad x, 2}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   4708                             m8,  m9,  m10, m11, m12, m13, m14, m15
   4709    jmp                tx2q ; NOTE(review): presumably the second transform's .pass2 - confirm against INV_TXFM_FN
   4710 .pass2:
   4711    call .transpose
   4712    lea                  r6, [pw_5+128] ; NOTE(review): presumably the constant base pointer the 8bpc code expects in r6 - confirm
   4713    mova              [rsp], m15
   4714    call m(idct_16x16_internal_8bpc).main
   4715    mova                 m1, [rsp+32*1] ; NOTE(review): presumably a row the 8bpc main left on the stack - confirm
   4716 .end:
   4717    call .write_16x16
   4718    RET
   4719 ALIGN function_align
   4720 .write_16x16:
        ; Scales 16 word vectors with pmulhrsw by pw_2048 and writes them four
        ; at a time, in the order m0-m3, m4-m7, m8-m11, m12-m15.
        ; m8, m9 and m12 are saved to the stack first: write_16x4_start
        ; overwrites m8/m9 in the 12bpc variant visible in this file (assumed
        ; the same for the 10bpc one called here), and m12 is reused for the
        ; scale factor.
        ; This code is reached by call, so rsp is one return address lower than
        ; in the caller. NOTE(review): gprsize presumably compensates for that,
        ; so these are the caller's [rsp+32*0..2] slots - confirm.
   4721    mova [rsp+gprsize+32*0], m8
   4722    mova [rsp+gprsize+32*1], m9
   4723    mova [rsp+gprsize+32*2], m12
   4724    vpbroadcastd        m12, [pw_2048]
   4725    pmulhrsw             m0, m12
   4726    pmulhrsw             m1, m12
   4727    pmulhrsw             m2, m12
   4728    pmulhrsw             m3, m12
   4729    call m(idct_16x8_internal_10bpc).write_16x4_start
   4730 .write_16x16_2:
   4731    pmulhrsw             m0, m12, m4
   4732    pmulhrsw             m1, m12, m5
   4733    pmulhrsw             m2, m12, m6
   4734    pmulhrsw             m3, m12, m7
   4735    call m(idct_16x8_internal_10bpc).write_16x4_zero
   4736    pmulhrsw             m0, m12, [rsp+gprsize+32*0] ; saved m8
   4737    pmulhrsw             m1, m12, [rsp+gprsize+32*1] ; saved m9
   4738    pmulhrsw             m2, m12, m10
   4739    pmulhrsw             m3, m12, m11
   4740    call m(idct_16x8_internal_10bpc).write_16x4_zero
   4741    pmulhrsw             m0, m12, [rsp+gprsize+32*2] ; saved m12
   4742    pmulhrsw             m1, m12, m13
   4743    pmulhrsw             m2, m12, m14
   4744    pmulhrsw             m3, m12, m15
   4745    jmp m(idct_16x8_internal_10bpc).write_16x4_zero ; tail-jump: its ret returns to the caller of .write_16x16
   4746 ALIGN function_align
   4747 .transpose:
        ; Packs the pass-1 output to 16-bit words and transposes it, leaving
        ; the result in m0-m15.
        ; Full path: m0-m15 are packed (signed saturation) with the 16 vectors
        ; pass 1 stored in the two windows at r6 and r6-32*8. [r6] is then
        ; reused as a one-vector spill slot during the shuffles.
        ; Fast path: only m0-m7 hold data, so transpose2 is used and m8-m15
        ; are zeroed.
   4748    test               eobd, eobd
   4749    jl .transpose_fast ; negative: the sub eobd, 36 in pass 1 took the .fast branch (assumes eobd is untouched since)
   4750    packssdw             m8, [r6-32*4]
   4751    packssdw             m9, [r6-32*3]
   4752    packssdw            m10, [r6-32*2]
   4753    packssdw            m11, [r6-32*1]
   4754    packssdw            m12, [r6+32*0]
   4755    packssdw            m13, [r6+32*1]
   4756    packssdw            m14, [r6+32*2]
   4757    packssdw            m15, [r6+32*3]
   4758    sub                  r6, 32*8
   4759    packssdw             m0, [r6-32*4]
   4760    packssdw             m1, [r6-32*3]
   4761    packssdw             m2, [r6-32*2]
   4762    packssdw             m3, [r6-32*1]
   4763    packssdw             m4, [r6+32*0]
   4764    packssdw             m5, [r6+32*1]
   4765    packssdw             m6, [r6+32*2]
   4766    packssdw             m7, [r6+32*3]
   4767    mova               [r6], m8 ; spill m8: it becomes the temporary for interleaving m0-m7
           ; interleave m0-m7 at word, dword, then qword granularity
   4768    punpckhwd            m8, m0, m1
   4769    punpcklwd            m0, m1
   4770    punpcklwd            m1, m2, m3
   4771    punpckhwd            m2, m3
   4772    punpckhwd            m3, m6, m7
   4773    punpcklwd            m6, m7
   4774    punpcklwd            m7, m4, m5
   4775    punpckhwd            m4, m5
   4776    punpckldq            m5, m8, m2
   4777    punpckhdq            m8, m2
   4778    punpckhdq            m2, m0, m1
   4779    punpckldq            m0, m1
   4780    punpckhdq            m1, m7, m6
   4781    punpckldq            m7, m6
   4782    punpckhdq            m6, m4, m3
   4783    punpckldq            m4, m3
   4784    punpckhqdq           m3, m2, m1
   4785    punpcklqdq           m2, m1
   4786    punpckhqdq           m1, m0, m7
   4787    punpcklqdq           m0, m7
   4788    punpcklqdq           m7, m8, m6
   4789    punpckhqdq           m8, m6
   4790    punpckhqdq           m6, m5, m4
   4791    punpcklqdq           m5, m4
   4792    mova                 m4, [r6] ; reload the spilled m8 into m4
   4793    mova               [r6], m8 ; and park the finished first-group vector in its place
           ; same three interleave steps for the second group (old m8 now in m4, plus m9-m15)
   4794    punpcklwd            m8, m4, m9
   4795    punpckhwd            m4, m9
   4796    punpcklwd            m9, m10, m11
   4797    punpckhwd           m10, m11
   4798    punpckhwd           m11, m14, m15
   4799    punpcklwd           m14, m15
   4800    punpckhwd           m15, m12, m13
   4801    punpcklwd           m12, m13
   4802    punpckldq           m13, m4, m10
   4803    punpckhdq            m4, m10
   4804    punpckhdq           m10, m8, m9
   4805    punpckldq            m8, m9
   4806    punpckhdq            m9, m12, m14
   4807    punpckldq           m12, m14
   4808    punpckhdq           m14, m15, m11
   4809    punpckldq           m15, m11
   4810    punpckhqdq          m11, m10, m9
   4811    punpcklqdq          m10, m9
   4812    punpckhqdq           m9, m8, m12
   4813    punpcklqdq           m8, m12
   4814    punpcklqdq          m12, m13, m15
   4815    punpckhqdq          m13, m15
   4816    punpckhqdq          m15, m4, m14
   4817    punpcklqdq          m14, m4, m14
           ; exchange 128-bit halves between the two groups
   4818    vperm2i128           m4, m0, m8, 0x31
   4819    vinserti128          m0, xm8, 1
   4820    vinserti128          m8, m5, xm12, 1
   4821    vperm2i128          m12, m5, 0x13
   4822    vperm2i128           m5, m1, m9, 0x31
   4823    vinserti128          m1, xm9, 1
   4824    vinserti128          m9, m6, xm13, 1
   4825    vperm2i128          m13, m6, 0x13
   4826    vperm2i128           m6, m2, m10, 0x31
   4827    vinserti128          m2, xm10, 1
   4828    vinserti128         m10, m7, xm14, 1
   4829    vperm2i128          m14, m7, 0x13
   4830    vperm2i128           m7, m3, m11, 0x31
   4831    vinserti128          m3, xm11, 1
   4832    mova               xm11, [r6] ; low 128 bits of the parked vector
   4833    vinserti128         m11, xm15, 1
   4834    vinserti128         m15, [r6+16], 0 ; high 128 bits of the parked vector go into m15's low lane
   4835    ret
   4836 .transpose_fast:
   4837    call m(idct_16x8_internal_10bpc).transpose2
   4838    pxor                 m8, m8
   4839    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
   4840    ret
   4841 ALIGN function_align
   4842 .main:
        ; One 16-point DCT over the 16 rows at cq+64*0..15, 32 bytes per row.
        ; Odd rows go through idct_8x16 main_oddhalf; even rows go through
        ; idct_8x8 main followed by idct_8x16 main_evenhalf.
        ; On return m0-m7 have pd_2 added, which rounds the psrad 2 that the
        ; visible callers apply after the final add/sub.
   4843    mova                 m0, [cq+64* 1]
   4844    mova                 m1, [cq+64* 3]
   4845    mova                 m2, [cq+64* 5]
   4846    mova                 m3, [cq+64* 7]
   4847    mova                 m4, [cq+64* 9]
   4848    mova                 m5, [cq+64*11]
   4849    mova                 m6, [cq+64*13]
   4850    mova                 m7, [cq+64*15]
   4851    call m(idct_8x16_internal_10bpc).main_oddhalf
   4852    mova                 m0, [cq+64* 0]
   4853    mova                 m1, [cq+64* 2]
   4854    mova                 m2, [cq+64* 4]
   4855    mova                 m3, [cq+64* 6]
   4856    mova                 m4, [cq+64* 8]
   4857    mova                 m5, [cq+64*10]
   4858    mova                 m6, [cq+64*12]
   4859    mova                 m7, [cq+64*14]
   4860    call m(idct_8x8_internal_10bpc).main
   4861    call m(idct_8x16_internal_10bpc).main_evenhalf
   4862    psrld               m10, m11, 10 ; pd_2
   4863    REPX    {paddd  x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
   4864    ret
   4865 
   4866 INV_TXFM_16X16_FN adst, dct ; adst as type1; defaults apply: eob_offset 0, bitdepth 10
   4867 INV_TXFM_16X16_FN adst, adst
   4868 INV_TXFM_16X16_FN adst, flipadst
   4869 
   4870 cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
        ; ADST 16x16, 10bpc.
        ; Pass 1 runs .main once or twice (same eob < 36 split and r6 window
        ; layout as idct_16x16_internal_10bpc), then applies rounding, sign
        ; flips and shifts:
        ;   biased by pd_2 (m15 >> 10) and shifted by 2:  outputs 0-3 and 12-15
        ;   biased by pd_5120/pd_5119 and shifted by 13:  outputs 4-11
        ; Outputs are negated by subtracting from the bias (psubd bias - x).
        ; Pass 2 transposes, runs the 8bpc ADST core, then scales with
        ; +pw_2048 and -pw_2048 and writes 16 rows.
   4871    vpbroadcastd        m13, [clip_18b_min]
   4872    vpbroadcastd        m14, [clip_18b_max]
   4873 .pass1:
   4874    vpbroadcastd        m15, [pd_2896]
   4875    lea                  r6, [rsp+32*4] ; r6 addresses an 8-vector window [r6-32*4 .. r6+32*3]
   4876    sub                eobd, 36 ; eobd stays biased by -36; the shared .transpose tests its sign in pass 2
   4877    jl .fast ; eob < 36: only one .main call is made
   4878    add                  cq, 32 ; select the half at byte offset 32 of each 64-byte row
   4879    call .main
   4880    sub                  cq, 32
   4881    vpbroadcastd         m8, [pd_5120]
   4882    paddd                m4, m8
   4883    paddd                m6, m8
   4884    paddd                m9, m8
   4885    paddd               m11, m8
   4886    vpbroadcastd         m8, [pd_5119]
   4887    psubd                m5, m8, m5 ; bias - x: negate and round in one step
   4888    psubd                m7, m8, m7
   4889    psubd               m10, m8, m10
   4890    psubd               m12, m8, m12
   4891    REPX      {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12
   4892    mova          [r6+32*0], m4
   4893    mova          [r6+32*1], m5
   4894    mova          [r6+32*2], m6
   4895    mova          [r6+32*3], m7
   4896    psrld                m4, m15, 10 ; pd_2
   4897    paddd                m0, m4
   4898    psubd                m1, m4, m1
   4899    paddd                m2, m4
   4900    psubd                m3, m4, m3
   4901    psubd                m7, m4, [r6-32*4] ; NOTE(review): [r6-32*4 .. r6-32*1] presumably hold four more outputs left by .main - confirm
   4902    paddd                m6, m4, [r6-32*3]
   4903    psubd                m5, m4, [r6-32*2]
   4904    paddd                m4,     [r6-32*1]
   4905    REPX      {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
   4906    mova          [r6-32*4], m0
   4907    mova          [r6-32*3], m1
   4908    mova          [r6-32*2], m2
   4909    mova          [r6-32*1], m3
   4910    add                  r6, 32*8 ; advance to the next window for the remaining eight outputs
   4911    mova          [r6-32*4], m9
   4912    mova          [r6-32*3], m10
   4913    mova          [r6-32*2], m11
   4914    mova          [r6-32*1], m12
   4915    mova          [r6+32*0], m4
   4916    mova          [r6+32*1], m5
   4917    mova          [r6+32*2], m6
   4918    mova          [r6+32*3], m7
   4919 .fast:
   4920    add                  r6, 32*8 ; scratch window for this .main call
   4921    call .main ; cq is at its original value here: the half at byte offset 0 of each row
   4922    vpbroadcastd        m14, [pd_5120]
   4923    vpbroadcastd        m13, [pd_5119]
   4924    psrld               m15, 10 ; pd_2
   4925    paddd                m0, m15
   4926    psubd                m1, m15, m1
   4927    paddd                m2, m15
   4928    psubd                m3, m15, m3
   4929    paddd                m4, m14
   4930    psubd                m5, m13, m5
   4931    paddd                m6, m14
   4932    psubd                m7, m13, m7
   4933    paddd                m8, m14, m9 ; m9..m12 shift down into m8..m11 while being biased
   4934    psubd                m9, m13, m10
   4935    paddd               m10, m14, m11
   4936    psubd               m11, m13, m12
   4937    paddd               m12, m15, [r6-32*1] ; last four outputs come from the r6 window, in reverse address order
   4938    psubd               m13, m15, [r6-32*2]
   4939    paddd               m14, m15, [r6-32*3]
   4940    psubd               m15,      [r6-32*4]
   4941 .pass1_end:
        ; Shared with iflipadst_16x16_internal_10bpc, which jumps here with the
        ; same register classes: m0-m3/m12-m15 carry the pd_2 bias, m4-m11 the
        ; pd_5120/pd_5119 bias.
   4942    REPX      {psrad x, 2 }, m0,  m1,  m2,  m3,  m12, m13, m14, m15
   4943    REPX      {psrad x, 13}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
   4944    sub                  r6, 32*8 ; undo the add at .fast
   4945    jmp                tx2q ; NOTE(review): presumably the second transform's .pass2 - confirm against INV_TXFM_FN
   4946 .pass2:
   4947    call m(idct_16x16_internal_10bpc).transpose
   4948    lea                  r6, [pw_5+128] ; NOTE(review): presumably the constant base pointer the 8bpc code expects in r6 - confirm
   4949    mova              [rsp], m15
   4950    call m(iadst_16x16_internal_8bpc).main
   4951    call m(iadst_16x16_internal_8bpc).main_pass2_end
   4952    mova         [rsp+32*0], m8 ; save m8, m12, m13: m12/m13 are about to hold the scale factors
   4953    mova         [rsp+32*2], m12
   4954    mova         [rsp+32*3], m13
   4955    vpbroadcastd        m12, [pw_2048]
   4956    pxor                m13, m13
   4957    psubw               m13, m12 ; m13 = 0 - pw_2048: scales and negates in one pmulhrsw
   4958    pmulhrsw             m0, m12
   4959    pmulhrsw             m1, m13, [rsp+32*1] ; NOTE(review): row 1 presumably left at [rsp+32*1] by the 8bpc code - confirm
   4960    mova         [rsp+32*1], m9 ; the slot is free now, reuse it to save m9
   4961    pmulhrsw             m2, m12
   4962    pmulhrsw             m3, m13
   4963    call m(idct_16x8_internal_10bpc).write_16x4_start
   4964    pmulhrsw             m0, m12, m4 ; even-indexed rows use +scale, odd-indexed rows use -scale
   4965    pmulhrsw             m1, m13, m5
   4966    pmulhrsw             m2, m12, m6
   4967    pmulhrsw             m3, m13, m7
   4968    call m(idct_16x8_internal_10bpc).write_16x4_zero
   4969    pmulhrsw             m0, m12, [rsp+32*0] ; saved m8
   4970    pmulhrsw             m1, m13, [rsp+32*1] ; saved m9
   4971    pmulhrsw             m2, m12, m10
   4972    pmulhrsw             m3, m13, m11
   4973    call m(idct_16x8_internal_10bpc).write_16x4_zero
   4974    pmulhrsw             m0, m12, [rsp+32*2] ; saved m12
   4975    pmulhrsw             m1, m13, [rsp+32*3] ; saved m13
   4976    pmulhrsw             m2, m12, m14
   4977    pmulhrsw             m3, m13, m15
   4978    call m(idct_16x8_internal_10bpc).write_16x4_zero
   4979    RET
   4980 ALIGN function_align
   4981 .main:
        ; Loads two groups of eight rows from cq+64*k in the fixed orders shown
        ; and feeds them to the 16x8 ADST core (main_part1, then main_part2).
        ; The jump to main_part2 is a tail-jump, so its ret returns to the
        ; caller of .main. Also called by iflipadst_16x16_internal_10bpc.
   4982    mova                 m0, [cq+64* 2]
   4983    mova                 m1, [cq+64*13]
   4984    mova                 m2, [cq+64* 6]
   4985    mova                 m3, [cq+64* 9]
   4986    mova                 m4, [cq+64*10]
   4987    mova                 m5, [cq+64* 5]
   4988    mova                 m6, [cq+64*14]
   4989    mova                 m7, [cq+64* 1]
   4990    vpbroadcastd        m12, [pd_2048]
   4991    call m(iadst_16x8_internal_10bpc).main_part1
   4992    mova                 m0, [cq+64* 0]
   4993    mova                 m1, [cq+64*15]
   4994    mova                 m2, [cq+64* 4]
   4995    mova                 m3, [cq+64*11]
   4996    mova                 m4, [cq+64* 8]
   4997    mova                 m5, [cq+64* 7]
   4998    mova                 m6, [cq+64*12]
   4999    mova                 m7, [cq+64* 3]
   5000    jmp m(iadst_16x8_internal_10bpc).main_part2
   5001 
   5002 INV_TXFM_16X16_FN flipadst, dct ; flipadst as type1; defaults apply: eob_offset 0, bitdepth 10
   5003 INV_TXFM_16X16_FN flipadst, adst
   5004 INV_TXFM_16X16_FN flipadst, flipadst
   5005 
   5006 cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
        ; Flipped ADST 16x16, 10bpc.
        ; Pass 1 uses iadst_16x16_internal_10bpc.main and applies the same
        ; biases and sign flips as the non-flipped version, but assigns output
        ; k to register/slot 15-k. It finishes in the shared
        ; iadst_16x16_internal_10bpc.pass1_end.
        ; Pass 2 runs the 8bpc ADST core and writes the rows in reverse order,
        ; with the +/- pw_2048 pattern swapped to match.
   5007    vpbroadcastd        m13, [clip_18b_min]
   5008    vpbroadcastd        m14, [clip_18b_max]
   5009 .pass1:
   5010    vpbroadcastd        m15, [pd_2896]
   5011    lea                  r6, [rsp+32*4] ; r6 addresses an 8-vector window [r6-32*4 .. r6+32*3]
   5012    sub                eobd, 36 ; eobd stays biased by -36; the shared .transpose tests its sign in pass 2
   5013    jl .fast ; eob < 36: only one .main call is made
   5014    add                  cq, 32 ; select the half at byte offset 32 of each 64-byte row
   5015    call m(iadst_16x16_internal_10bpc).main
   5016    sub                  cq, 32
   5017    vpbroadcastd         m8, [pd_5120]
   5018    paddd               m11, m8
   5019    paddd                m9, m8
   5020    paddd                m6, m8
   5021    paddd                m4, m8
   5022    vpbroadcastd         m8, [pd_5119]
   5023    psubd               m12, m8, m12 ; bias - x: negate and round in one step
   5024    psubd               m10, m8, m10
   5025    psubd                m7, m8, m7
   5026    psubd                m5, m8, m5
   5027    REPX      {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4
   5028    mova          [r6+32*0], m12 ; stored in descending register order: this is the flip
   5029    mova          [r6+32*1], m11
   5030    mova          [r6+32*2], m10
   5031    mova          [r6+32*3], m9
   5032    psrld                m9, m15, 10 ; pd_2
   5033    psubd                m3, m9, m3
   5034    paddd                m2, m9
   5035    psubd                m1, m9, m1
   5036    paddd                m0, m9
   5037    psubd               m12, m9, [r6-32*4]
   5038    paddd               m11, m9, [r6-32*3]
   5039    psubd               m10, m9, [r6-32*2]
   5040    paddd                m9,     [r6-32*1]
   5041    REPX      {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0
   5042    mova          [r6-32*4], m12
   5043    mova          [r6-32*3], m11
   5044    mova          [r6-32*2], m10
   5045    mova          [r6-32*1], m9
   5046    add                  r6, 32*8 ; advance to the next window for the remaining eight outputs
   5047    mova          [r6-32*4], m7
   5048    mova          [r6-32*3], m6
   5049    mova          [r6-32*2], m5
   5050    mova          [r6-32*1], m4
   5051    mova          [r6+32*0], m3
   5052    mova          [r6+32*1], m2
   5053    mova          [r6+32*2], m1
   5054    mova          [r6+32*3], m0
   5055 .fast:
   5056    add                  r6, 32*8 ; scratch window for this .main call
   5057    call m(iadst_16x16_internal_10bpc).main
   5058    vpbroadcastd        m14, [pd_5120]
   5059    vpbroadcastd        m13, [pd_5119]
   5060    psrld               m15, 10 ; pd_2
           ; The 17 instructions below match iflipadst_16x8_internal_10bpc.pass1_rotations
           ; one for one. Order matters: each source register is read before it is overwritten.
   5061    psubd                m8, m13, m7
   5062    paddd                m7, m14, m9
   5063    paddd                m9, m14, m6
   5064    psubd                m6, m13, m10
   5065    psubd               m10, m13, m5
   5066    paddd                m5, m14, m11
   5067    paddd               m11, m14, m4
   5068    psubd                m4, m13, m12
   5069    psubd               m12, m15, m3 ; m13 (pd_5119) and m14 (pd_5120) are not read after this point
   5070    paddd                m3, m15, [r6-32*1]
   5071    paddd               m13, m15, m2
   5072    psubd                m2, m15, [r6-32*2]
   5073    psubd               m14, m15, m1
   5074    mova                 m1, m15 ; keep a copy of pd_2: m15 is overwritten on the next line
   5075    paddd               m15, m0
   5076    psubd                m0, m1, [r6-32*4]
   5077    paddd                m1,     [r6-32*3]
   5078    jmp m(iadst_16x16_internal_10bpc).pass1_end ; shared shifts, r6 restore and jump to tx2q
   5079 .pass2:
   5080    call m(idct_16x16_internal_10bpc).transpose
   5081    lea                  r6, [pw_5+128] ; NOTE(review): presumably the constant base pointer the 8bpc code expects in r6 - confirm
   5082    mova              [rsp], m15
   5083    call m(iadst_16x16_internal_8bpc).main
   5084    call m(iadst_16x16_internal_8bpc).main_pass2_end
   5085    mova         [rsp+32*3], m3 ; m3, m2, m0 are written last; save them before m0-m3 become the write staging registers
   5086    mova         [rsp+32*2], m2
   5087    mova         [rsp+32*0], m0
   5088    mova                 m2, m13 ; move m13/m12 out of the way: they are about to hold the scale factors
   5089    mova                 m3, m12
   5090    vpbroadcastd        m12, [pw_2048]
   5091    pxor                m13, m13
   5092    psubw               m13, m12 ; m13 = 0 - pw_2048: scales and negates in one pmulhrsw
   5093    pmulhrsw             m0, m13, m15 ; first group written: old m15, m14, m13, m12
   5094    pmulhrsw             m1, m12, m14
   5095    pmulhrsw             m2, m13
   5096    pmulhrsw             m3, m12
   5097    mova                m14, m8 ; keep m8/m9 in m14/m15 (same reason as the m8/m9 save in idct_16x16's .write_16x16)
   5098    mova                m15, m9
   5099    call m(idct_16x8_internal_10bpc).write_16x4_start
   5100    pmulhrsw             m0, m13, m11
   5101    pmulhrsw             m1, m12, m10
   5102    pmulhrsw             m2, m13, m15 ; old m9
   5103    pmulhrsw             m3, m12, m14 ; old m8
   5104    call m(idct_16x8_internal_10bpc).write_16x4_zero
   5105    pmulhrsw             m0, m13, m7
   5106    pmulhrsw             m1, m12, m6
   5107    pmulhrsw             m2, m13, m5
   5108    pmulhrsw             m3, m12, m4
   5109    call m(idct_16x8_internal_10bpc).write_16x4_zero
   5110    pmulhrsw             m0, m13, [rsp+32*3] ; saved m3
   5111    pmulhrsw             m1, m12, [rsp+32*2] ; saved m2
   5112    pmulhrsw             m2, m13, [rsp+32*1] ; NOTE(review): row 1 presumably left at [rsp+32*1] by the 8bpc code - confirm
   5113    pmulhrsw             m3, m12, [rsp+32*0] ; saved m0
   5114    call m(idct_16x8_internal_10bpc).write_16x4_zero
   5115    RET
   5116 
   5117 INV_TXFM_16X16_FN identity, dct, -92 ; eob_offset = -92
   5118 INV_TXFM_16X16_FN identity, identity ; defaults apply: eob_offset 0, bitdepth 10
   5119 
   5120 cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
   5121    vpbroadcastd        m15, [pd_5793]
   5122    vpbroadcastd         m7, [pd_5120]
   5123    lea                  r6, [rsp+32*4]
   5124    sub                eobd, 36
   5125    jl .fast
   5126    mov                  r3, -32*8*4
   5127 .righthalf:
   5128    pmulld               m0, m15, [cq+r3+32*33]
   5129    pmulld               m1, m15, [cq+r3+32*35]
   5130    pmulld               m2, m15, [cq+r3+32*37]
   5131    pmulld               m3, m15, [cq+r3+32*39]
   5132    add                  r6, 32*4
   5133    REPX      {paddd x, m7}, m0, m1, m2, m3
   5134    REPX      {psrad x, 13}, m0, m1, m2, m3
   5135    mova          [r6+32*0], m0
   5136    mova          [r6+32*1], m1
   5137    mova          [r6+32*2], m2
   5138    mova          [r6+32*3], m3
   5139    add                  r3, 32*8
   5140    jl .righthalf
   5141 .fast:
   5142    pmulld               m0, m15, [cq+64* 0]
   5143    pmulld               m1, m15, [cq+64* 1]
   5144    pmulld               m2, m15, [cq+64* 2]
   5145    pmulld               m3, m15, [cq+64* 3]
   5146    pmulld               m4, m15, [cq+64* 4]
   5147    pmulld               m5, m15, [cq+64* 5]
   5148    pmulld               m6, m15, [cq+64* 6]
   5149    pmulld               m8, m15, [cq+64* 7]
   5150    mova               [cq], m8
   5151    pmulld               m8, m15, [cq+64* 8]
   5152    pmulld               m9, m15, [cq+64* 9]
   5153    pmulld              m10, m15, [cq+64*10]
   5154    pmulld              m11, m15, [cq+64*11]
   5155    pmulld              m12, m15, [cq+64*12]
   5156    pmulld              m13, m15, [cq+64*13]
   5157    pmulld              m14, m15, [cq+64*14]
   5158    pmulld              m15,      [cq+64*15]
   5159    REPX      {paddd x, m7}, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
   5160                             m8,  m9,  m10, m11, m12, m13, m14, m15
   5161    paddd                m7, [cq]
   5162    REPX      {psrad x, 13}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   5163                             m8,  m9,  m10, m11, m12, m13, m14, m15
   5164    jmp                tx2q
   5165 .pass2:
   5166    call m(idct_16x16_internal_10bpc).transpose
   5167 
   5168    mova          [cq+32*0], m15
   5169    mova          [cq+32*1], m0
   5170    vpbroadcastd        m15, [pw_1697x16]
   5171 
   5172    REPX  {IDTX16 x, 0, 15},  1,  2,  3,  4,  5,  6,  7, \
   5173                              8,  9, 10, 11, 12, 13, 14
   5174    mova                 m0, [cq+32*1]
   5175    mova          [cq+32*1], m1
   5176    IDTX16                0, 1, 15
   5177    mova                 m1, [cq+32*0]
   5178    pmulhrsw            m15, m1
   5179    paddsw               m1, m1
   5180    paddsw              m15, m1
   5181    mova                 m1, [cq+32*1]
   5182    jmp m(idct_16x16_internal_10bpc).end
   5183 
   5184 INV_TXFM_16X16_FN dct, dct,       0, 12 ; macro defined outside this chunk; presumably emits the public 12 bpc dct_* entry points - confirm
   5185 INV_TXFM_16X16_FN dct, identity, 28, 12
   5186 INV_TXFM_16X16_FN dct, adst,      0, 12
   5187 INV_TXFM_16X16_FN dct, flipadst,  0, 12
   5188 
        ; idct_16x16_internal_12bpc
        ;   Pass 1 reuses the 10 bpc implementation after loading the
        ;   clip_20b_min/clip_20b_max constants into m12/m13.
        ;   .pass2 runs .pass2_main twice: first on m0-m7 as they arrive, then
        ;   on the m8-m15 values saved to cq. The first result is packed to
        ;   words and stored at r6; r5 records that address for the shared
        ;   output code in iadst_16x16_internal_12bpc.end.
   5189 cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
   5190    vpbroadcastd        m12, [clip_20b_min]
   5191    vpbroadcastd        m13, [clip_20b_max]
   5192    jmp m(idct_16x16_internal_10bpc).pass1
   5193 .pass2:
            ; Save the upper eight registers; .pass2_main uses all 16.
   5194    mova         [cq+32* 8], m8
   5195    mova         [cq+32* 9], m9
   5196    mova         [cq+32*10], m10
   5197    mova         [cq+32*11], m11
   5198    mova         [cq+32*12], m12
   5199    mova         [cq+32*13], m13
   5200    mova         [cq+32*14], m14
   5201    mova         [cq+32*15], m15
   5202    call .pass2_main
            ; Narrow the 16 dword results to 8 registers of saturated words.
   5203    packssdw             m0,  m1
   5204    packssdw             m1,  m2,  m3
   5205    packssdw             m2,  m4,  m5
   5206    packssdw             m3,  m6,  m7
   5207    packssdw             m4,  m8,  m9
   5208    packssdw             m5, m10, m11
   5209    packssdw             m6, m12, m13
   5210    packssdw             m7, m14, m15
   5211    mova          [r6-32*4], m0
   5212    mova          [r6-32*3], m1
   5213    mova          [r6-32*2], m2
   5214    mova          [r6-32*1], m3
   5215    mova          [r6+32*0], m4
   5216    mova          [r6+32*1], m5
   5217    mova          [r6+32*2], m6
   5218    mova          [r6+32*3], m7
            ; Reload the saved upper registers as the input of the second run.
   5219    mova                 m0, [cq+32* 8]
   5220    mova                 m1, [cq+32* 9]
   5221    mova                 m2, [cq+32*10]
   5222    mova                 m3, [cq+32*11]
   5223    mova                 m4, [cq+32*12]
   5224    mova                 m5, [cq+32*13]
   5225    mova                 m6, [cq+32*14]
   5226    mova                 m7, [cq+32*15]
   5227    mov                  r5, r6              ; r5 = address of the packed first-run results (read by .end)
   5228    add                  r6, 32*16
   5229    call .pass2_main
   5230    jmp m(iadst_16x16_internal_12bpc).end    ; shared pack/blend/write-out tail
   5231 ALIGN function_align
        ; .write_16x16
        ;   Saves m8, m9 and m12 to the stack (offsets include gprsize to step
        ;   over the return address pushed by the call), scales m0-m3 with
        ;   pmulhrsw by [pw_16384], writes them through the write_16x4 helpers
        ;   and continues in the 10 bpc .write_16x16_2 tail.
        ;   Called from iidentity_16x16_internal_12bpc.pass2_fast.
   5232 .write_16x16:
   5233    mova [rsp+gprsize+32*0], m8
   5234    mova [rsp+gprsize+32*1], m9
   5235    mova [rsp+gprsize+32*2], m12
   5236    vpbroadcastd        m12, [pw_16384]      ; pmulhrsw by 16384 is a rounded halving, if the constant matches its name
   5237    pmulhrsw             m0, m12
   5238    pmulhrsw             m1, m12
   5239    pmulhrsw             m2, m12
   5240    pmulhrsw             m3, m12
   5241    call m(idct_16x8_internal_12bpc).write_16x4_start
   5242    call m(idct_16x8_internal_10bpc).write_16x4_zero
   5243    jmp m(idct_16x16_internal_10bpc).write_16x16_2 ; tail jump: that code returns to our caller
   5244 ALIGN function_align
        ; .pass2_main
        ;   One second-pass run over eight dword vectors in m0-m7, leaving 16
        ;   dword results in m0-m15 (after a final >> 4).
        ;   Inputs are clamped to [clip_18b_min, clip_18b_max] before use.
        ;   eobd < 0 selects the fast path, which treats the second group of
        ;   eight inputs as zero; otherwise that group is loaded from the stack
        ;   area below r6 (r6 is lowered by 32*8 here and left that way).
        ;   The eobd sign is assumed to be set up by pass 1 - confirm there.
   5245 .pass2_main:
   5246    call m(idct_8x8_internal_12bpc).transpose_8x8
            ; Park the even registers in cq for the later even-half stage and
            ; clamp the odd ones into m0-m3 for main_oddhalf.
   5247    mova         [cq+32* 0], m0
   5248    mova         [cq+32* 1], m2
   5249    mova         [cq+32* 2], m4
   5250    mova         [cq+32* 3], m6
   5251    vpbroadcastd        m12, [clip_18b_min]
   5252    vpbroadcastd        m13, [clip_18b_max]
   5253    pmaxsd               m0, m12, m1
   5254    pmaxsd               m1, m12, m3
   5255    pmaxsd               m2, m12, m5
   5256    pmaxsd               m3, m12, m7
   5257    REPX    {pminsd x, m13}, m0, m1, m2, m3
   5258    test               eobd, eobd
   5259    jge .pass2_slow
   5260    pxor                 m4, m4              ; fast path: remaining odd-half inputs are zero
   5261    REPX       {mova x, m4}, m5, m6, m7
   5262    jmp .pass2_fast
   5263 .pass2_slow:
            ; Fetch the second group of eight vectors from the stack, transpose
            ; it, park its even registers in cq+32*4..7 and clamp the odd ones.
   5264    sub                  r6, 32*8
   5265    mova                 m8, [r6-32*4]
   5266    mova                 m4, [r6-32*3]
   5267    mova                m10, [r6-32*2]
   5268    mova                 m5, [r6-32*1]
   5269    mova                m12, [r6+32*0]
   5270    mova                 m6, [r6+32*1]
   5271    mova                m14, [r6+32*2]
   5272    mova                 m7, [r6+32*3]
   5273    TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15
   5274    mova         [cq+32* 4], m8
   5275    mova         [cq+32* 5], m10
   5276    mova         [cq+32* 6], m12
   5277    mova         [cq+32* 7], m14
   5278    vpbroadcastd        m12, [clip_18b_min]  ; reload: m12/m13 were used by the loads/transpose above
   5279    vpbroadcastd        m13, [clip_18b_max]
   5280    REPX    {pmaxsd x, m12}, m4, m5, m6, m7
   5281    REPX    {pminsd x, m13}, m4, m5, m6, m7
   5282 .pass2_fast:
   5283    vpbroadcastd        m11, [pd_2048]
   5284    vpbroadcastd        m14, [pd_2896]
   5285    call m(idct_8x16_internal_10bpc).main_oddhalf
            ; Even half: bring back the parked values, clamped the same way.
   5286    pmaxsd               m0, m12, [cq+32* 0]
   5287    pmaxsd               m1, m12, [cq+32* 1]
   5288    pmaxsd               m2, m12, [cq+32* 2]
   5289    pmaxsd               m3, m12, [cq+32* 3]
   5290    REPX    {pminsd x, m13}, m0, m1, m2, m3
   5291    test               eobd, eobd
   5292    jge .pass2_slow2
   5293    pxor                 m4, m4              ; fast path: second group is zero
   5294    REPX       {mova x, m4}, m5, m6, m7
   5295    jmp .pass2_fast2
   5296 .pass2_slow2:
   5297    pmaxsd               m4, m12, [cq+32* 4]
   5298    pmaxsd               m5, m12, [cq+32* 5]
   5299    pmaxsd               m6, m12, [cq+32* 6]
   5300    pmaxsd               m7, m12, [cq+32* 7]
   5301    REPX    {pminsd x, m13}, m4, m5, m6, m7
   5302 .pass2_fast2:
   5303    call m(idct_8x8_internal_10bpc).main
   5304    call m(idct_8x16_internal_10bpc).main_evenhalf
   5305    psrad               m11, 8  ; pd_8
   5306    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 ; rounding bias for the >> 4 below
   5307    call m(idct_16x8_internal_10bpc).pass1_rotations
   5308    REPX       {psrad x, 4}, m0, m1, m2,  m3,  m4,  m5,  m6,  m7, \
   5309                             m8, m9, m10, m11, m12, m13, m14, m15
   5310    ret
   5311 
   5312 INV_TXFM_16X16_FN adst, dct,      0, 12 ; macro defined outside this chunk; presumably emits the public 12 bpc adst_* entry points - confirm
   5313 INV_TXFM_16X16_FN adst, adst,     0, 12
   5314 INV_TXFM_16X16_FN adst, flipadst, 0, 12
   5315 
        ; iadst_16x16_internal_12bpc
        ;   Pass 1 reuses the 10 bpc implementation after loading the
        ;   clip_20b_min/clip_20b_max constants into m13/m14.
        ;   .pass2 processes the block in two runs (.pass2_part1, .pass2_part2),
        ;   each followed by the 16x8 ADST rotations.
        ;   .pass2_part3 and .end are also entered from other functions:
        ;   .pass2_part3 from iflipadst_16x16_internal_12bpc and .end from
        ;   idct_16x16_internal_12bpc. .end expects the second-run dwords in
        ;   m0-m15 and the packed first-run words at [r5-32*4 .. r5+32*3].
   5316 cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
   5317    vpbroadcastd        m13, [clip_20b_min]
   5318    vpbroadcastd        m14, [clip_20b_max]
   5319    jmp m(iadst_16x16_internal_10bpc).pass1
   5320 .pass2:
   5321    call .pass2_part1
   5322    call m(iadst_16x8_internal_10bpc).pass1_rotations
   5323    call .pass2_part2
   5324    call m(iadst_16x8_internal_10bpc).pass1_rotations
   5325 .pass2_part3:
            ; Two register groups need different final shifts (4 vs 15).
   5326    REPX      {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
   5327    REPX      {psrad x, 15}, m4, m5, m6, m7, m8,  m9,  m10, m11
   5328 .end:
            ; Pack dword pairs to saturated words, from the top registers down,
            ; so each destination has been read before it is overwritten.
            ; Results end up in m6, m7 and m10-m15.
   5329    packssdw            m15, m14
   5330    packssdw            m14, m13, m12
   5331    packssdw            m13, m11, m10
   5332    packssdw            m12,  m9,  m8
   5333    packssdw            m11,  m7,  m6
   5334    packssdw            m10,  m5,  m4
   5335    packssdw             m7,  m3,  m2
   5336    packssdw             m6,  m1,  m0
            ; For each group of four output vectors: merge the second-run words
            ; with the stored first-run words (mask 0x33 takes dwords 0,1,4,5
            ; from memory, 0xcc takes dwords 2,3,6,7), reorder the qwords with
            ; vpermq, then hand m0-m3 to the write_16x4 helpers.
   5337    vpblendd             m0, m6, [r5-32*4], 0x33
   5338    vpblendd             m1, m6, [r5-32*4], 0xcc
   5339    vpblendd             m2, m7, [r5-32*3], 0x33
   5340    vpblendd             m3, m7, [r5-32*3], 0xcc
   5341    vpermq               m0, m0, q3120
   5342    vpermq               m1, m1, q2031
   5343    vpermq               m2, m2, q3120
   5344    vpermq               m3, m3, q2031
   5345    call m(idct_16x8_internal_12bpc).write_16x4_start
   5346    call m(idct_16x8_internal_10bpc).write_16x4_zero
   5347    vpblendd             m0, m10, [r5-32*2], 0x33
   5348    vpblendd             m1, m10, [r5-32*2], 0xcc
   5349    vpblendd             m2, m11, [r5-32*1], 0x33
   5350    vpblendd             m3, m11, [r5-32*1], 0xcc
   5351    vpermq               m0, m0, q3120
   5352    vpermq               m1, m1, q2031
   5353    vpermq               m2, m2, q3120
   5354    vpermq               m3, m3, q2031
   5355    call m(idct_16x8_internal_10bpc).write_16x4_zero
   5356    vpblendd             m0, m12, [r5+32*0], 0x33
   5357    vpblendd             m1, m12, [r5+32*0], 0xcc
   5358    vpblendd             m2, m13, [r5+32*1], 0x33
   5359    vpblendd             m3, m13, [r5+32*1], 0xcc
   5360    vpermq               m0, m0, q3120
   5361    vpermq               m1, m1, q2031
   5362    vpermq               m2, m2, q3120
   5363    vpermq               m3, m3, q2031
   5364    call m(idct_16x8_internal_10bpc).write_16x4_zero
   5365    vpblendd             m0, m14, [r5+32*2], 0x33
   5366    vpblendd             m1, m14, [r5+32*2], 0xcc
   5367    vpblendd             m2, m15, [r5+32*3], 0x33
   5368    vpblendd             m3, m15, [r5+32*3], 0xcc
   5369    vpermq               m0, m0, q3120
   5370    vpermq               m1, m1, q2031
   5371    vpermq               m2, m2, q3120
   5372    vpermq               m3, m3, q2031
   5373    call m(idct_16x8_internal_10bpc).write_16x4_zero
   5374    RET
   5375 ALIGN function_align
        ; .pass2_part1 / .pass2_main
        ;   .pass2_part1 saves m8-m15 to cq+32*8..15 and falls into .pass2_main.
        ;   .pass2_main runs the 16x8 ADST core (main_part1 + main_part2) on the
        ;   eight dword vectors in m0-m7. Inputs are clamped to
        ;   [clip_18b_min, clip_18b_max]. eobd < 0 selects the fast path, which
        ;   treats the second group of eight inputs as zero; otherwise that
        ;   group is loaded from the stack area below r6 (r6 is lowered by 32*8).
        ;   On return m14 = [pd_17408], m13 = m14 - 1 and m15 = 8 (see the
        ;   inline notes below); the callers run the rotation helpers next.
   5376 .pass2_part1:
   5377    mova         [cq+32* 8], m8
   5378    mova         [cq+32* 9], m9
   5379    mova         [cq+32*10], m10
   5380    mova         [cq+32*11], m11
   5381    mova         [cq+32*12], m12
   5382    mova         [cq+32*13], m13
   5383    mova         [cq+32*14], m14
   5384    mova         [cq+32*15], m15
   5385 .pass2_main:
   5386    call m(idct_8x8_internal_12bpc).transpose_8x8
            ; Park the values needed by main_part2 (reloaded below with the
            ; index notes 0, 3, 4, 7) and clamp the rest for main_part1.
   5387    mova         [cq+32* 0], m0
   5388    mova         [cq+32* 1], m3
   5389    mova         [cq+32* 2], m4
   5390    mova         [cq+32* 3], m7
   5391    vpbroadcastd        m13, [clip_18b_min]
   5392    vpbroadcastd        m14, [clip_18b_max]
   5393    pmaxsd               m0, m13, m2
   5394    pmaxsd               m2, m13, m6
   5395    pmaxsd               m5, m13, m5
   5396    pmaxsd               m7, m13, m1
   5397    REPX    {pminsd x, m14}, m0, m2, m5, m7
   5398    test               eobd, eobd
   5399    jge .pass2_slow
   5400    pxor                 m1, m1              ; fast path: inputs from the second group are zero
   5401    REPX       {mova x, m1}, m3, m4, m6
   5402    jmp .pass2_fast
   5403 .pass2_slow:
            ; Fetch the second group of eight vectors from the stack, transpose
            ; it, park four results in cq+32*4..7 and clamp the other four.
   5404    sub                  r6, 32*8
   5405    mova                 m8, [r6-32*4]
   5406    mova                 m3, [r6-32*3]
   5407    mova                 m4, [r6-32*2]
   5408    mova                m11, [r6-32*1]
   5409    mova                m12, [r6+32*0]
   5410    mova                 m1, [r6+32*1]
   5411    mova                 m6, [r6+32*2]
   5412    mova                m15, [r6+32*3]
   5413    TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14
   5414    mova         [cq+32* 4], m8
   5415    mova         [cq+32* 5], m11
   5416    mova         [cq+32* 6], m12
   5417    mova         [cq+32* 7], m15
   5418    vpbroadcastd        m13, [clip_18b_min]  ; reload: m13/m14 served as transpose temporaries
   5419    vpbroadcastd        m14, [clip_18b_max]
   5420    REPX    {pmaxsd x, m13}, m1, m3, m4, m6
   5421    REPX    {pminsd x, m14}, m1, m3, m4, m6
   5422 .pass2_fast:
   5423    vpbroadcastd        m12, [pd_2048]
   5424    vpbroadcastd        m15, [pd_2896]
   5425    call m(iadst_16x8_internal_10bpc).main_part1
   5426    pmaxsd               m0, m13, [cq+32* 0] ;  0
   5427    pmaxsd               m7, m13, [cq+32* 1] ;  3
   5428    pmaxsd               m2, m13, [cq+32* 2] ;  4
   5429    pmaxsd               m5, m13, [cq+32* 3] ;  7
   5430    REPX    {pminsd x, m14}, m0, m2, m5, m7
   5431    test               eobd, eobd
   5432    jge .pass2_slow2
   5433    pxor                 m1, m1              ; fast path: second group is zero
   5434    REPX       {mova x, m1}, m3, m4, m6
   5435    jmp .pass2_fast2
   5436 .pass2_slow2:
   5437    pmaxsd               m4, m13, [cq+32* 4] ;  8
   5438    pmaxsd               m3, m13, [cq+32* 5] ; 11
   5439    pmaxsd               m6, m13, [cq+32* 6] ; 12
   5440    pmaxsd               m1, m13, [cq+32* 7] ; 15
   5441    REPX    {pminsd x, m14}, m1, m3, m4, m6
   5442 .pass2_fast2:
   5443    call m(iadst_16x8_internal_10bpc).main_part2
            ; Derive the constants left in m13-m15 for the caller's next step;
            ; the notes below assume main_part2 leaves pd_2896 in m15 - confirm.
   5444    vpbroadcastd        m14, [pd_17408]
   5445    psrld               m15, 11              ; pd_1
   5446    psubd               m13, m14, m15        ; pd_17407
   5447    pslld               m15, 3               ; pd_8
   5448    ret
   5449 ALIGN function_align
        ; .pass2_part2
        ;   Finishes the first run: applies the final shifts, packs the 16
        ;   dword registers to 8 word registers and stores them at
        ;   [r6-32*4 .. r6+32*3]. It then reloads the eight registers saved by
        ;   .pass2_part1 from cq+32*8..15 as the input of the second run,
        ;   records the store address in r5, advances r6 by 32*16 and
        ;   tail-jumps to .pass2_main, whose ret returns to our caller.
   5450 .pass2_part2:
   5451    REPX      {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
   5452    REPX      {psrad x, 15}, m4, m5, m6, m7, m8,  m9,  m10, m11
   5453    packssdw             m0,  m1
   5454    packssdw             m1,  m2,  m3
   5455    packssdw             m2,  m4,  m5
   5456    packssdw             m3,  m6,  m7
   5457    packssdw             m4,  m8,  m9
   5458    packssdw             m5, m10, m11
   5459    packssdw             m6, m12, m13
   5460    packssdw             m7, m14, m15
   5461    mova          [r6-32*4], m0
   5462    mova          [r6-32*3], m1
   5463    mova          [r6-32*2], m2
   5464    mova          [r6-32*1], m3
   5465    mova          [r6+32*0], m4
   5466    mova          [r6+32*1], m5
   5467    mova          [r6+32*2], m6
   5468    mova          [r6+32*3], m7
   5469    mova                 m0, [cq+32* 8]
   5470    mova                 m1, [cq+32* 9]
   5471    mova                 m2, [cq+32*10]
   5472    mova                 m3, [cq+32*11]
   5473    mova                 m4, [cq+32*12]
   5474    mova                 m5, [cq+32*13]
   5475    mova                 m6, [cq+32*14]
   5476    mova                 m7, [cq+32*15]
   5477    mov                  r5, r6              ; .end reads the packed first-run words relative to r5
   5478    add                  r6, 32*16
   5479    jmp .pass2_main
   5480 
   5481 INV_TXFM_16X16_FN flipadst, dct,      0, 12 ; macro defined outside this chunk; presumably emits the public 12 bpc flipadst_* entry points - confirm
   5482 INV_TXFM_16X16_FN flipadst, adst,     0, 12
   5483 INV_TXFM_16X16_FN flipadst, flipadst, 0, 12
   5484 
        ; iflipadst_16x16_internal_12bpc
        ;   Same structure as iadst_16x16_internal_12bpc: pass 1 reuses the
        ;   10 bpc code with clip_20b_min/clip_20b_max in m13/m14, and .pass2
        ;   reuses the ADST .pass2_part1/.pass2_part2/.pass2_part3 code. The
        ;   only difference visible here is that the rotation step calls the
        ;   flipadst variant of pass1_rotations.
   5485 cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
   5486    vpbroadcastd        m13, [clip_20b_min]
   5487    vpbroadcastd        m14, [clip_20b_max]
   5488    jmp m(iflipadst_16x16_internal_10bpc).pass1
   5489 .pass2:
   5490    call m(iadst_16x16_internal_12bpc).pass2_part1
   5491    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
   5492    call m(iadst_16x16_internal_12bpc).pass2_part2
   5493    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
   5494    jmp m(iadst_16x16_internal_12bpc).pass2_part3 ; shared shift/pack/write-out tail (ends in RET)
   5495 
   5496 INV_TXFM_16X16_FN identity, dct,    -92, 12 ; macro defined outside this chunk; presumably emits the public 12 bpc identity_* entry points - confirm
   5497 INV_TXFM_16X16_FN identity, identity, 0, 12
   5498 
   5499 %macro IDTX16_12BPC 1 ; src
            ; In place on dword register m%1:
            ;   m%1 = (((m%1 * m7 + m15) >> 12) + m%1) >> 1
            ; Expects the multiplier in m7 and the rounding bias in m15.
            ; Clobbers m6, so m6 itself cannot be used as the argument.
   5500    pmulld               m6, m7, m%1
   5501    paddd                m6, m15
   5502    psrad                m6, 12
   5503    paddd                m6, m%1
   5504    psrad               m%1, m6, 1
   5505 %endmacro
   5506 
   5507 cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
            ; Internal 16x16 identity transform, 12 bpc.
            ;   Pass 1 scales each 32-bit coefficient as
            ;   (((x * [pd_1697] + [pd_5120]) >> 12) + x) >> 1 and then jumps
            ;   to the second-pass routine whose address is held in tx2q.
            ;   eobd is biased by -36; a negative result selects .fast (skipping
            ;   .righthalf) and is tested again by .pass2.
   5508    vpbroadcastd         m7, [pd_1697]       ; multiplier
   5509    vpbroadcastd        m15, [pd_5120]       ; rounding bias
   5510    lea                  r6, [rsp+32*4]
   5511    sub                eobd, 36
   5512    jl .fast
   5513    mov                  r3, -32*8*4         ; negative byte offset; loop ends when it reaches 0 (4 iterations)
   5514 .righthalf:
            ; Scale four vectors taken from the odd 32-byte slots of cq and
            ; store them to the stack area addressed by r6.
   5515    mova                m10, [cq+r3+32*33]
   5516    mova                m11, [cq+r3+32*35]
   5517    mova                m12, [cq+r3+32*37]
   5518    mova                m13, [cq+r3+32*39]
   5519    add                  r6, 32*4
   5520    pmulld               m0, m7, m10
   5521    pmulld               m1, m7, m11
   5522    pmulld               m2, m7, m12
   5523    pmulld               m3, m7, m13
   5524    REPX     {paddd x, m15}, m0, m1, m2, m3
   5525    REPX     {psrad x, 12 }, m0, m1, m2, m3
   5526    paddd                m0, m10
   5527    paddd                m1, m11
   5528    paddd                m2, m12
   5529    paddd                m3, m13
   5530    REPX     {psrad x, 1  }, m0, m1, m2, m3
   5531    mova          [r6+32*0], m0
   5532    mova          [r6+32*1], m1
   5533    mova          [r6+32*2], m2
   5534    mova          [r6+32*3], m3
   5535    add                  r3, 32*8
   5536    jl .righthalf
   5537 .fast:
            ; m6 is the macro's scratch and m7 holds the multiplier, so slots 6
            ; and 7 are processed in m8/m9 and parked in cq until the end.
   5538    mova                 m0, [cq+64* 0]
   5539    mova                 m1, [cq+64* 1]
   5540    mova                 m2, [cq+64* 2]
   5541    mova                 m3, [cq+64* 3]
   5542    mova                 m4, [cq+64* 4]
   5543    mova                 m5, [cq+64* 5]
   5544    mova                 m8, [cq+64* 6]
   5545    mova                 m9, [cq+64* 7]
   5546    REPX   {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9
   5547    mova          [cq+64*0], m8              ; park result 6
   5548    mova          [cq+64*1], m9              ; park result 7
   5549    mova                 m8, [cq+64* 8]
   5550    mova                 m9, [cq+64* 9]
   5551    mova                m10, [cq+64*10]
   5552    mova                m11, [cq+64*11]
   5553    mova                m12, [cq+64*12]
   5554    mova                m13, [cq+64*13]
   5555    mova                m14, [cq+64*14]
   5556    REPX   {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14
            ; Slot 15 is done by hand: the constants in m7 and m15 are consumed
            ; here, so the result can land in m15 and m6/m7 can be restored.
   5557    mova                 m6, [cq+64*15]
   5558    pmulld               m7, m6
   5559    paddd                m7, m15
   5560    psrad                m7, 12
   5561    paddd                m7, m6
   5562    mova                 m6, [cq+64*0]       ; restore result 6
   5563    psrad               m15, m7, 1
   5564    mova                 m7, [cq+64*1]       ; restore result 7
   5565    jmp                tx2q                  ; continue in the selected second pass
   5566 .pass2:
            ; Second-pass entry: run the 8x16 identity pass-2 helper on m0-m7
            ; and, unless eobd < 0, on a second set of 16 vectors loaded from
            ; the stack around r6, then write all rows.
   5567    call m(iidentity_8x16_internal_12bpc).pass2_main
   5568    call m(idct_16x16_internal_10bpc).transpose_fast
   5569    test               eobd, eobd
   5570    jl .pass2_fast
            ; Park the first set in cq while the second set is processed.
   5571    mova         [cq+32* 8], m0
   5572    mova         [cq+32* 9], m1
   5573    mova         [cq+32*10], m2
   5574    mova         [cq+32*11], m3
   5575    mova         [cq+32*12], m4
   5576    mova         [cq+32*13], m5
   5577    mova         [cq+32*14], m6
   5578    mova         [cq+32*15], m7
   5579    mova                 m8, [r6-32*4]
   5580    mova                 m9, [r6-32*3]
   5581    mova                m10, [r6-32*2]
   5582    mova                m11, [r6-32*1]
   5583    mova                m12, [r6+32*0]
   5584    mova                m13, [r6+32*1]
   5585    mova                m14, [r6+32*2]
   5586    mova                m15, [r6+32*3]
   5587    sub                  r6, 32*8
   5588    mova                 m0, [r6-32*4]
   5589    mova                 m1, [r6-32*3]
   5590    mova                 m2, [r6-32*2]
   5591    mova                 m3, [r6-32*1]
   5592    mova                 m4, [r6+32*0]
   5593    mova                 m5, [r6+32*1]
   5594    mova                 m6, [r6+32*2]
   5595    mova                 m7, [r6+32*3]
   5596    call m(iidentity_8x16_internal_12bpc).pass2_main
   5597    call m(idct_16x8_internal_10bpc).transpose2
            ; Move the second set to m8-m15 and bring the first set back.
   5598    mova                 m8, m0
   5599    mova                 m9, m1
   5600    mova                m10, m2
   5601    mova                m11, m3
   5602    mova                m12, m4
   5603    mova                m13, m5
   5604    mova                m14, m6
   5605    mova                m15, m7
   5606    mova                 m0, [cq+32* 8]
   5607    mova                 m1, [cq+32* 9]
   5608    mova                 m2, [cq+32*10]
   5609    mova                 m3, [cq+32*11]
   5610    mova                 m4, [cq+32*12]
   5611    mova                 m5, [cq+32*13]
   5612    mova                 m6, [cq+32*14]
   5613    mova                 m7, [cq+32*15]
   5614 .pass2_fast:
   5615    call m(idct_16x16_internal_12bpc).write_16x16
   5616    RET
   5617 
   5618 %macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack
            ; Final butterfly stage producing four outputs from the value in
            ; m%1 and three values loaded from memory:
            ;   m%4 <- [r6+32*(%1-4)], m%2 <- [r5+32*(3-%1)], m%5 <- [r4+32*(%1-4)]
            ; The first sum/difference pair is clamped to [m12, m13] and biased
            ; by m11 before the second butterfly; all four results are then
            ; shifted right by %6. If bit 0 of %7 (default 1) is set, they are
            ; packed to saturated words into m%1 and m%2.
            ; Clobbers m%3, m%4 and m%5.
   5619    mova                m%4, [r6+32*(%1-4)]
   5620    mova                m%2, [r5+32*(3-%1)]
   5621    mova                m%5, [r4+32*(%1-4)]
   5622    psubd               m%3, m%1, m%4 ; idct16 out15 - n
   5623    paddd               m%1, m%4      ; idct16 out0  + n
   5624    pmaxsd              m%1, m12
   5625    pmaxsd              m%3, m12
   5626    pminsd              m%1, m13
   5627    pminsd              m%3, m13
   5628    paddd               m%1, m11
   5629    paddd               m%3, m11
   5630    psubd               m%4, m%1, m%2 ; out31 - n
   5631    paddd               m%1, m%2      ; out0  + n
   5632    paddd               m%2, m%3, m%5 ; out15 - n
   5633    psubd               m%3, m%5      ; out16 + n
   5634    REPX      {psrad x, %6}, m%1, m%3, m%2, m%4
   5635 %if %7 & 1
   5636    packssdw            m%1, m%3      ; out0  + n, out16 + n
   5637    packssdw            m%2, m%4      ; out15 - n, out31 - n
   5638 %endif
   5639 %endmacro
   5640 
   5641 cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
            ; 8x32 DCT/DCT inverse transform, 10 bpc.
            ;   eob == 0 takes the DC-only shortcut (.dconly).
            ;   Otherwise pass 1 calls .pass1_main once per group of
            ;   coefficients; the eob thresholds 43, 107 and 171 decide whether
            ;   a second, third and fourth group are computed or replaced by
            ;   zeros. Pass 2 is delegated to the 8 bpc 8x32 routines
            ;   (.main_fast / .main) and .end writes the results in eight
            ;   groups of two registers.
   5642    test               eobd, eobd
   5643    jz .dconly
   5644    PROLOGUE              0, 7, 16, 32*12, dst, stride, c, eob
   5645 %undef cmp
            ; (%undef removes any macro named cmp, so the cmp lines below
            ;  assemble as the plain instruction; the macro is not visible here.)
   5646    vpbroadcastd        m11, [pd_2048]
   5647    vpbroadcastd        m12, [clip_18b_min]
   5648    vpbroadcastd        m13, [clip_18b_max]
   5649    vbroadcasti128      m14, [idct32_shuf]   ; pshufb control used by .pass1_main
   5650    mov                  r4, cq              ; .pass1_main advances cq; r4 keeps the original for .end
   5651    call .pass1_main
   5652    mova         [rsp+32*0], m2
   5653    mova         [rsp+32*1], m3
   5654    cmp                eobd, 43
   5655    jge .eob43
            ; Only the first group is present: zero the remaining inputs.
   5656    pxor                 m4, m4
   5657    REPX       {mova x, m4}, [rsp+32*2], m2, m3, m11
   5658    jmp .pass1_end_fast
   5659 .eob43:
   5660    lea                  r6, [rsp+32*8]
   5661    mova          [r6-32*4], m0
   5662    mova          [r6-32*3], m1
   5663    call .pass1_main
   5664    mova         [rsp+32*2], m2
   5665    cmp                eobd, 107
   5666    jge .eob107
            ; Two groups present: arrange the registers as main_fast is called
            ; with; m4-m7 are zeroed here and at .pass1_end_fast.
   5667    mova                m11, m3
   5668    mova                 m2, m0
   5669    mova                 m3, m1
   5670    mova                 m0, [r6-32*4]
   5671    mova                 m1, [r6-32*3]
   5672    pxor                 m4, m4
   5673 .pass1_end_fast:
   5674    vpbroadcastd        m10, [pw_2048]
   5675    lea                  r6, [deint_shuf+128]
   5676    REPX       {mova x, m4}, m5, m6, m7
   5677    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
   5678    jmp .end
   5679 .eob107:
   5680    mova         [rsp+32*3], m3
   5681    mova          [r6-32*2], m0
   5682    mova          [r6-32*1], m1
   5683    call .pass1_main
   5684    cmp                eobd, 171
   5685    jge .eob171
            ; Three groups present: the registers of the fourth group are zeroed.
   5686    pshufd              m12, m2, q1032
   5687    pshufd              m13, m3, q1032
   5688    mova                 m4, m0
   5689    mova                 m5, m1
   5690    pxor                 m6, m6
   5691    REPX       {mova x, m6}, m7, m14, m15
   5692    jmp .pass1_end
   5693 .eob171:
   5694    mova          [r6+32*0], m0
   5695    mova          [r6+32*1], m1
   5696    mova          [r6+32*2], m2
   5697    mova          [r6+32*3], m3
   5698    call .pass1_main
   5699    pshufd              m12, [r6+32*2], q1032 ; out19 out17
   5700    pshufd              m13, [r6+32*3], q1032 ; out23 out21
   5701    mova                 m4, [r6+32*0]        ; out16 out18
   5702    mova                 m5, [r6+32*1]        ; out20 out22
   5703    pshufd              m14, m2, q1032        ; out27 out25
   5704    pshufd              m15, m3, q1032        ; out31 out29
   5705    mova                 m6, m0               ; out24 out26
   5706    mova                 m7, m1               ; out28 out30
   5707 .pass1_end:
   5708    mova                 m0, [r6-32*4]        ; out0  out2
   5709    mova                 m1, [r6-32*3]        ; out4  out6
   5710    mova                 m2, [r6-32*2]        ; out8  out10
   5711    mova                 m3, [r6-32*1]        ; out12 out14
   5712    lea                  r6, [deint_shuf+128]
   5713    mova                m11, [rsp+32*3]       ; out13 out15
   5714    vpbroadcastd        m10, [pw_2048]
   5715    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
   5716 .end: ; [rsp+0*32] = m12
            ; Each output pair is reordered with vpermq, scaled with pmulhrsw
            ; by [pw_2048] and passed to the write_8x4 helpers in m0/m1.
            ; m12 now holds the scale, so its former content is taken from the
            ; stack slot named in the label comment.
   5717    vpbroadcastd        m12, [pw_2048]
   5718    mov                  cq, r4               ; restore the coefficient pointer saved on entry
   5719    mova         [rsp+32*1], m8               ; m8-m11 go to the stack; presumably the write helpers use them as scratch - confirm
   5720    mova         [rsp+32*2], m9
   5721    mova         [rsp+32*3], m10
   5722    mova         [rsp+32*4], m11
   5723    vpermq               m0, m0, q3120
   5724    vpermq               m1, m1, q2031
   5725    pmulhrsw             m0, m12
   5726    pmulhrsw             m1, m12
   5727    call m(idct_8x8_internal_10bpc).write_8x4_start
   5728    vpermq               m0, m2, q3120
   5729    vpermq               m1, m3, q2031
   5730    pmulhrsw             m0, m12
   5731    pmulhrsw             m1, m12
   5732    call m(idct_8x8_internal_10bpc).write_8x4
   5733    vpermq               m0, m4, q3120
   5734    vpermq               m1, m5, q2031
   5735    pmulhrsw             m0, m12
   5736    pmulhrsw             m1, m12
   5737    call m(idct_8x8_internal_10bpc).write_8x4
   5738    vpermq               m0, m6, q3120
   5739    vpermq               m1, m7, q2031
   5740    pmulhrsw             m0, m12
   5741    pmulhrsw             m1, m12
   5742    call m(idct_8x8_internal_10bpc).write_8x4
   5743    vpermq               m0, [rsp+32*1], q3120 ; saved m8
   5744    vpermq               m1, [rsp+32*2], q2031 ; saved m9
   5745    pmulhrsw             m0, m12
   5746    pmulhrsw             m1, m12
   5747    call m(idct_8x8_internal_10bpc).write_8x4
   5748    vpermq               m0, [rsp+32*3], q3120 ; saved m10
   5749    vpermq               m1, [rsp+32*4], q2031 ; saved m11
   5750    pmulhrsw             m0, m12
   5751    pmulhrsw             m1, m12
   5752    call m(idct_8x8_internal_10bpc).write_8x4
   5753    vpermq               m0, [rsp+32*0], q3120 ; the value the label comment calls m12
   5754    vpermq               m1, m13, q2031
   5755    pmulhrsw             m0, m12
   5756    pmulhrsw             m1, m12
   5757    call m(idct_8x8_internal_10bpc).write_8x4
   5758    vpermq               m0, m14, q3120
   5759    vpermq               m1, m15, q2031
   5760    pmulhrsw             m0, m12
   5761    pmulhrsw             m1, m12
   5762    call m(idct_8x8_internal_10bpc).write_8x4
   5763    RET
   5764 .dconly:
            ; DC-only shortcut (reached with eobd == 0):
            ;   r6d = ([cq] * 181 + 640) >> 10, the coefficient is cleared and
            ;   the shared 8x8 .dconly3 code finishes the job.
   5765    imul                r6d, [cq], 181
   5766    vpbroadcastd         m2, [dconly_10bpc]
   5767    mov                [cq], eobd ; 0
   5768    or                  r3d, 32               ; sets r3d to 32 if r3d is eobd (eob is the 4th named argument), which is 0 here; presumably a row count for .dconly3 - confirm
   5769    add                 r6d, 640
   5770    sar                 r6d, 10
   5771    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
   5772 ALIGN function_align
        ; .pass1_main_part1
        ;   Loads eight dword vectors spaced 128 bytes apart from cq, runs the
        ;   shared 8-point IDCT core and finishes its last butterfly stage here
        ;   with a rounding bias of 2 and a right shift by 2.
        ;   Results are left in m0-m7 (out0..out7 per the inline notes).
        ;   Expects m11 = pd_2048; m8/m9 are read as outputs of the core.
   5773 .pass1_main_part1:
   5774    mova                 m0, [cq+128*0]
   5775    mova                 m1, [cq+128*1]
   5776    mova                 m2, [cq+128*2]
   5777    mova                 m3, [cq+128*3]
   5778    mova                 m4, [cq+128*4]
   5779    mova                 m5, [cq+128*5]
   5780    mova                 m6, [cq+128*6]
   5781    mova                 m7, [cq+128*7]
   5782    call m(idct_8x8_internal_10bpc).main
   5783    psrld                m1, m11, 10 ; pd_2
   5784    REPX      {paddd x, m1}, m0, m6, m5, m3 ; bias one operand of each butterfly so both outputs get rounded
   5785    paddd                m1, m6, m7  ; out1
   5786    psubd                m6, m7      ; out6
   5787    psubd                m7, m0, m9  ; out7
   5788    paddd                m0, m9      ; out0
   5789    paddd                m2, m5, m4  ; out2
   5790    psubd                m5, m4      ; out5
   5791    psubd                m4, m3, m8  ; out4
   5792    paddd                m3, m8      ; out3
   5793    REPX      {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
   5794    ret
   5795 ALIGN function_align
        ; .pass1_main
        ;   Runs .pass1_main_part1 on the current cq position, advances cq by
        ;   32 bytes, packs the eight dword results to words and rearranges
        ;   them (pshufb with the control in m14, then dword unpacks and
        ;   128-bit lane permutes) into four registers m0-m3 whose contents
        ;   are labelled by the inline notes.
   5796 .pass1_main:
   5797    call .pass1_main_part1
   5798    add                  cq, 32      ; next group of coefficients for the following call
   5799    packssdw             m0, m1
   5800    packssdw             m2, m3
   5801    packssdw             m4, m5
   5802    packssdw             m6, m7
   5803    pshufb               m0, m14
   5804    pshufb               m2, m14
   5805    pshufb               m4, m14
   5806    pshufb               m6, m14
   5807    punpckhdq            m3, m0, m2
   5808    punpckldq            m0, m2
   5809    punpckldq            m2, m4, m6
   5810    punpckhdq            m4, m6
   5811    vperm2i128           m1, m0, m2, 0x31 ; 4 6
   5812    vinserti128          m0, xm2, 1       ; 0 2
   5813    vinserti128          m2, m3, xm4, 1   ; 1 3
   5814    vperm2i128           m3, m4, 0x31     ; 5 7
   5815    ret
   5816 .main_oddhalf_part1_fast_rect2:
            ; .main_oddhalf_part1 family: first part of the 32-point IDCT odd
            ; half. Produces t16..t19 and t28..t31 (per the inline notes) and
            ; stores them to [r6-32*4 .. r6+32*3].
            ; Entry points:
            ;   *_fast_rect2 : pre-scale m0-m3 as (x + m11) >> 12, then *_fast
            ;   *_fast       : only m0-m3 carry input ("lower half zero")
            ;   *_rect2      : pre-scale m0-m7 as (x + m11) >> 12, then full path
            ;   (no suffix)  : full path with inputs in m0-m7
            ; Expects m11 = rounding bias and m12/m13 = clamp min/max; these are
            ; assumed to be set by the caller - confirm at the call sites.
   5817    REPX     {paddd x, m11}, m0, m1, m2, m3
   5818    REPX     {psrad x, 12 }, m0, m1, m2, m3
   5819 .main_oddhalf_part1_fast: ; lower half zero
            ; With the partner inputs zero each rotation reduces to two plain
            ; multiplies of the same input.
   5820    vpbroadcastd         m7, [pd_4091]
   5821    vpbroadcastd         m8, [pd_201]
   5822    vpbroadcastd         m6, [pd_m1380]
   5823    vpbroadcastd         m9, [pd_3857]
   5824    vpbroadcastd         m5, [pd_3703]
   5825    vpbroadcastd        m10, [pd_1751]
   5826    vpbroadcastd         m4, [pd_m2751]
   5827    vpbroadcastd        m15, [pd_3035]
   5828    pmulld               m7, m0
   5829    pmulld               m0, m8
   5830    pmulld               m6, m1
   5831    pmulld               m1, m9
   5832    pmulld               m5, m2
   5833    pmulld               m2, m10
   5834    pmulld               m4, m3
   5835    pmulld               m3, m15
   5836    jmp .main_oddhalf_part1_fast2
   5837 .main_oddhalf_part1_rect2:
   5838    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   5839    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
   5840 .main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
   5841    ITX_MULSUB_2D         0, 7, 8, 9, 10, _,  201, 4091 ; t16a, t31a
   5842    ITX_MULSUB_2D         6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
   5843    ITX_MULSUB_2D         2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
   5844    ITX_MULSUB_2D         4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
   5845 .main_oddhalf_part1_fast2:
            ; Common tail: round and shift the eight products, then two rounds
            ; of butterflies with a clamp to [m12, m13] after each round.
   5846    REPX     {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
   5847    REPX     {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
   5848    psubd                m8, m0, m4 ; t17
   5849    paddd                m0, m4     ; t16
   5850    psubd                m4, m6, m2 ; t18
   5851    paddd                m6, m2     ; t19
   5852    psubd                m2, m1, m5 ; t29
   5853    paddd                m1, m5     ; t28
   5854    psubd                m5, m7, m3 ; t30
   5855    paddd                m7, m3     ; t31
   5856    REPX    {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
   5857    REPX    {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
   5858    vpbroadcastd        m15, [pd_4017]
   5859    vpbroadcastd        m10, [pd_799]
   5860    ITX_MULSUB_2D         5, 8, 3, 9, _, 11, 10, 15    ; t17a, t30a
   5861    ITX_MULSUB_2D         2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a
   5862    psubd                m3, m0, m6 ; t19a
   5863    paddd                m0, m6     ; t16a
   5864    psubd                m6, m7, m1 ; t28a
   5865    paddd                m7, m1     ; t31a
   5866    psubd                m1, m5, m4 ; t18
   5867    paddd                m5, m4     ; t17
   5868    psubd                m4, m8, m2 ; t29
   5869    paddd                m8, m2     ; t30
   5870    REPX    {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
   5871    REPX    {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
   5872    vpbroadcastd        m15, [pd_3784]
   5873    vpbroadcastd        m10, [pd_1567]
   5874    ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
   5875    ITX_MULSUB_2D         6, 3, 2, 9, _, 11, 10, 15 ; t19,  t28
            ; Store the eight intermediates for the later stages.
   5876    mova          [r6-32*4], m0
   5877    mova          [r6-32*3], m5
   5878    mova          [r6-32*2], m4
   5879    mova          [r6-32*1], m6
   5880    mova          [r6+32*0], m3
   5881    mova          [r6+32*1], m1
   5882    mova          [r6+32*2], m8
   5883    mova          [r6+32*3], m7
   5884    ret
   5885 .main_oddhalf_part2_fast_rect2:
        ; Second half of the odd-coefficient stage, with the same four entry
        ; points as part1 (_fast_rect2, _fast, _rect2, plain). After computing
        ; its own eight terms it reloads the eight values stored at
        ; [r6-32*4] .. [r6+32*3], combines both sets and writes sixteen results:
        ; eight back to that group and eight to the group 32*8 bytes further on.
        ; Reads m11 (rounding term), m12/m13 (clamp bounds), m14 (multiplier
        ; applied to the t20-t27 terms) and r6.
        ; Writes m0-m10, m15, r4, r5 and r6. On return:
        ;   r4 = r6 on entry, r5 = r4 + 32*8, r6 = r4 + 32*16.
        ; NOTE(review): the callers visible in this file load pd_2896 into m14
        ; before calling - confirm for other callers.
   5886    REPX     {paddd x, m11}, m0, m1, m2, m3
   5887    REPX     {psrad x, 12 }, m0, m1, m2, m3
   5888 .main_oddhalf_part2_fast: ; lower half zero
        ; Each of the four inputs is multiplied by both constants of its
        ; rotation, filling m0-m7 with the same layout the full path produces.
   5889    vpbroadcastd         m7, [pd_m601]
   5890    vpbroadcastd         m8, [pd_4052]
   5891    vpbroadcastd         m6, [pd_3973]
   5892    vpbroadcastd         m9, [pd_995]
   5893    vpbroadcastd         m5, [pd_m2106]
   5894    vpbroadcastd        m10, [pd_3513]
   5895    vpbroadcastd         m4, [pd_3290]
   5896    vpbroadcastd        m15, [pd_2440]
   5897    pmulld               m7, m0
   5898    pmulld               m0, m8
   5899    pmulld               m6, m1
   5900    pmulld               m1, m9
   5901    pmulld               m5, m2
   5902    pmulld               m2, m10
   5903    pmulld               m4, m3
   5904    pmulld               m3, m15
   5905    jmp .main_oddhalf_part2_fast2
   5906 .main_oddhalf_part2_rect2:
   5907    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   5908    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
   5909 .main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
   5910    ITX_MULSUB_2D         7, 0, 8, 9, 10, _, 4052,  601 ; t23a, t24a
   5911    ITX_MULSUB_2D         1, 6, 8, 9, 10, _,  995, 3973 ; t20a, t27a
   5912    ITX_MULSUB_2D         5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
   5913    ITX_MULSUB_2D         3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
   5914 .main_oddhalf_part2_fast2:
        ; Round and shift all eight terms (both lines cover m0-m7).
   5915    REPX     {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
   5916    REPX     {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
   5917    psubd                m8, m0, m4 ; t25
   5918    paddd                m0, m4     ; t24
   5919    psubd                m4, m6, m2 ; t26
   5920    paddd                m6, m2     ; t27
   5921    psubd                m2, m1, m5 ; t21
   5922    paddd                m1, m5     ; t20
   5923    psubd                m5, m7, m3 ; t22
   5924    paddd                m7, m3     ; t23
   5925    REPX    {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
   5926    REPX    {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
   5927    vpbroadcastd        m15, [pd_2276]
   5928    vpbroadcastd        m10, [pd_3406]
   5929    ITX_MULSUB_2D         4, 2, 3, 9, _, 11, 10, 15    ; t21a, t26a
   5930    ITX_MULSUB_2D         8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a
   5931    psubd                m3, m0, m6 ; t27a
   5932    paddd                m0, m6     ; t24a
   5933    psubd                m6, m7, m1 ; t20a
   5934    paddd                m7, m1     ; t23a
   5935    psubd                m1, m5, m4 ; t21
   5936    paddd                m5, m4     ; t22
   5937    psubd                m4, m8, m2 ; t26
   5938    paddd                m8, m2     ; t25
   5939    REPX    {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
   5940    REPX    {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
   5941    vpbroadcastd        m15, [pd_3784]
   5942    vpbroadcastd        m10, [pd_1567]
   5943    ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a
   5944    ITX_MULSUB_2D         3, 6, 2, 9, _, 11, 10, 15, 2 ; t27,  t20
        ; Merge with the stored t16-t19 / t28-t31 values, two slots at a
        ; time, using m9/m10 as the reload registers.
   5945    mova                 m9, [r6-32*4] ; t16a
   5946    mova                m10, [r6-32*3] ; t17
   5947    psubd                m2, m9, m7    ; t23
   5948    paddd                m9, m7        ; t16
   5949    psubd                m7, m10, m5   ; t22a
   5950    paddd               m10, m5        ; t17a
   5951    REPX    {pmaxsd x, m12}, m9, m10, m2, m7
   5952    REPX    {pminsd x, m13}, m9, m10, m2, m7
   5953    mova          [r6-32*4], m9
   5954    mova          [r6-32*3], m10
   5955    mova                 m9, [r6-32*2] ; t18a
   5956    mova                m10, [r6-32*1] ; t19
   5957    psubd                m5, m9, m1    ; t21
   5958    paddd                m9, m1        ; t18
   5959    psubd                m1, m10, m6   ; t20a
   5960    paddd               m10, m6        ; t19a
   5961    REPX    {pmaxsd x, m12}, m9, m10, m5, m1
   5962    REPX    {pminsd x, m13}, m9, m10, m5, m1
   5963    mova          [r6-32*2], m9
   5964    mova          [r6-32*1], m10
   5965    mova                 m9, [r6+32*0] ; t28
   5966    mova                m10, [r6+32*1] ; t29a
   5967    psubd                m6, m9, m3    ; t27a
   5968    paddd                m9, m3        ; t28a
   5969    psubd                m3, m10, m4   ; t26
   5970    paddd               m10, m4        ; t29
   5971    REPX    {pmaxsd x, m12}, m9, m10, m6, m3
   5972    REPX    {pminsd x, m13}, m9, m10, m6, m3
        ; (a +- b) * m14 with rounding: all four operands are scaled first and
        ; the rounding term is added once to the register that feeds both the
        ; sum and the difference, then everything is shifted right by 12.
   5973    REPX    {pmulld x, m14}, m6, m3, m1, m5
   5974    paddd                m6, m11
   5975    paddd                m3, m11
   5976    psubd                m4, m6, m1    ; t20
   5977    paddd                m6, m1        ; t27
   5978    psubd                m1, m3, m5    ; t21a
   5979    paddd                m3, m5        ; t26a
   5980    REPX    {psrad  x, 12 }, m4, m1, m3, m6
   5981    mova          [r6+32*0], m4
   5982    mova          [r6+32*1], m1
   5983    mova                 m4, [r6+32*2] ; t30
   5984    mova                 m1, [r6+32*3] ; t31a
   5985    psubd                m5, m4, m8    ; t25a
   5986    paddd                m4, m8        ; t30a
   5987    psubd                m8, m1, m0    ; t24
   5988    paddd                m1, m0        ; t31
   5989    REPX    {pmaxsd x, m12}, m8, m5, m4, m1
   5990    REPX    {pminsd x, m13}, m8, m5, m4, m1
        ; Same scaled sum/difference pattern for t22-t25.
   5991    REPX    {pmulld x, m14}, m5, m8, m7, m2
   5992    paddd                m5, m11
   5993    paddd                m8, m11
   5994    psubd                m0, m5, m7    ; t22
   5995    paddd                m5, m7        ; t25
   5996    psubd                m7, m8, m2    ; t23a
   5997    paddd                m2, m8        ; t24a
   5998    REPX    {psrad  x, 12 }, m0, m7, m2, m5
   5999    mova          [r6+32*2], m0
   6000    mova          [r6+32*3], m7
        ; First group (t16..t23a) is complete: remember its base in r4 and
        ; move r6 on to the next group of eight slots.
   6001    mov                  r4, r6
   6002    add                  r6, 32*8
   6003    mova          [r6-32*4], m2  ; t24a
   6004    mova          [r6-32*3], m5  ; t25
   6005    mova          [r6-32*2], m3  ; t26a
   6006    mova          [r6-32*1], m6  ; t27
   6007    mova          [r6+32*0], m9  ; t28a
   6008    mova          [r6+32*1], m10 ; t29
   6009    mova          [r6+32*2], m4  ; t30a
   6010    mova          [r6+32*3], m1  ; t31
        ; Second group base goes to r5; r6 is left pointing one group further.
   6011    mov                  r5, r6
   6012    add                  r6, 32*8
   6013    ret
   6014 ALIGN function_align
   6015 .main_end:
        ; Final combine for the word-output path. m11 is shifted down to become
        ; the rounding term for the 2-bit shift passed to IDCT32_END. Each pair
        ; of IDCT32_END results is then word-interleaved: per the existing
        ; comments, row pairs 0-15 stay in m0-m7 and row pairs 16-31 are written
        ; to the eight slots [r5-32*4] .. [r5+32*3]. Falls through into
        ; .transpose for the in-register half.
        ; NOTE(review): IDCT32_END is defined outside this chunk; the roles of
        ; its arguments (two data registers, three temporaries, shift) are
        ; inferred from these call sites - confirm against the macro.
   6016    psrld               m11, 10 ; pd_2
   6017    IDCT32_END            0, 15, 8, 9, 10, 2
   6018    IDCT32_END            1, 14, 8, 9, 10, 2
   6019    punpckhwd            m8, m0, m1   ; 16 17
   6020    punpcklwd            m0, m1       ;  0  1
   6021    punpcklwd            m1, m14, m15 ; 14 15
   6022    punpckhwd           m14, m15      ; 30 31
   6023    mova          [r5+32*3], m8
   6024    mova          [r5+32*2], m14
   6025    IDCT32_END            2, 15, 8, 9, 10, 2
   6026    IDCT32_END            3, 14, 8, 9, 10, 2
   6027    punpckhwd            m8, m2, m3   ; 18 19
   6028    punpcklwd            m2, m3       ;  2  3
   6029    punpcklwd            m3, m14, m15 ; 12 13
   6030    punpckhwd           m14, m15      ; 28 29
   6031    mova          [r5+32*1], m8
   6032    mova          [r5+32*0], m14
   6033    IDCT32_END            4, 15, 8, 9, 10, 2
   6034    IDCT32_END            5, 14, 8, 9, 10, 2
   6035    punpckhwd            m8, m4, m5   ; 20 21
   6036    punpcklwd            m4, m5       ;  4  5
   6037    punpcklwd            m5, m14, m15 ; 10 11
   6038    punpckhwd           m14, m15      ; 26 27
   6039    mova          [r5-32*1], m8
   6040    mova          [r5-32*2], m14
   6041    IDCT32_END            6, 15, 8, 9, 10, 2
   6042    IDCT32_END            7, 14, 8, 9, 10, 2
   6043    punpckhwd            m8, m6, m7   ; 22 23
   6044    punpcklwd            m6, m7       ;  6  7
   6045    punpcklwd            m7, m14, m15 ;  8  9
   6046    punpckhwd           m14, m15      ; 24 25
   6047    mova          [r5-32*3], m8
   6048    mova          [r5-32*4], m14
   6049 .transpose:
        ; Rearranges the eight row-pair registers m0-m7 in place using m15 as
        ; the only scratch register: dword interleaves, then qword interleaves,
        ; then 128-bit lane inserts/permutes. Expected input order, from the
        ; comments above: m0, m2, m4, m6 hold the ascending pairs and m7, m5,
        ; m3, m1 the following ones. Also called on its own by the 32x8 function
        ; after it reloads row pairs 16-31 from the r5 slots.
   6050    punpckhdq           m15, m3, m1
   6051    punpckldq            m3, m1
   6052    punpckhdq            m1, m4, m6
   6053    punpckldq            m4, m6
   6054    punpckhdq            m6, m0, m2
   6055    punpckldq            m0, m2
   6056    punpckhdq            m2, m7, m5
   6057    punpckldq            m7, m5
   6058    punpcklqdq           m5, m2, m15
   6059    punpckhqdq           m2, m15
   6060    punpckhqdq          m15, m7, m3
   6061    punpcklqdq           m7, m3
   6062    punpckhqdq           m3, m6, m1
   6063    punpcklqdq           m6, m1
   6064    punpckhqdq           m1, m0, m4
   6065    punpcklqdq           m0, m4
        ; Lane step: vinserti128 ..., 1 joins the two low 128-bit lanes,
        ; vperm2i128 ..., 0x31 joins the two high lanes.
   6066    vperm2i128           m4, m0, m7, 0x31
   6067    vinserti128          m0, xm7, 1
   6068    vperm2i128           m7, m3, m2, 0x31
   6069    vinserti128          m3, xm2, 1
   6070    vinserti128          m2, m6, xm5, 1
   6071    vperm2i128           m6, m5, 0x31
   6072    vperm2i128           m5, m1, m15, 0x31
   6073    vinserti128          m1, xm15, 1
   6074    ret
   6075 
   6076 cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob
        ; Identity/identity 8x32 transform-and-add, 10 bpc variant.
        ; Arguments: dst, stride, c (coefficient buffer), eob.
        ; Register roles for the whole function:
        ;   m5 = rounding constant (pw_5)    m6 = zero    m7 = pixel maximum
        ;   r6 = stride*3    r5 = stride*5    r4 = stride*7
        ; The 12 bpc variant loads its own maximum into m7 and jumps to .pass1.
   6077    vpbroadcastd         m7, [pixel_10bpc_max]
   6078 .pass1:
   6079    vpbroadcastd         m5, [pw_5]
   6080    pxor                 m6, m6
        ; Add 21 to the low byte of eob only; if that byte add carries out, the
        ; saved copy is restored so eob keeps its original value.
   6081    mov                 r6d, eobd
   6082    add                eobb, 21
   6083    cmovc              eobd, r6d ; 43, 107, 171 -> 64, 128, 192
   6084    lea                  r6, [strideq*3]
   6085    lea                  r5, [strideq*5]
   6086    lea                  r4, [strideq+r6*2] ; strideq*7
   6087 .loop:
        ; One iteration handles eight 32-byte coefficient rows spaced 128 bytes
        ; apart and writes eight destination rows.
        ; Pairs of dword rows are packed to words with signed saturation.
   6088    mova                 m0, [cq+128*0]
   6089    packssdw             m0, [cq+128*1]
   6090    mova                 m1, [cq+128*2]
   6091    packssdw             m1, [cq+128*3]
   6092    mova                 m2, [cq+128*4]
   6093    packssdw             m2, [cq+128*5]
   6094    mova                 m3, [cq+128*6]
   6095    packssdw             m3, [cq+128*7]
        ; (x + 5) >> 3, with the add saturating.
   6096    REPX     {paddsw x, m5}, m0, m1, m2, m3
   6097    REPX     {psraw  x, 3 }, m0, m1, m2, m3
   6098    call .main_zero
        ; Next 32-byte column group, next eight destination rows; keep going
        ; while the eob budget stays non-negative.
   6099    add                  cq, 32
   6100    lea                dstq, [dstq+strideq*8]
   6101    sub                eobd, 64
   6102    jge .loop
   6103    RET
   6104 ALIGN function_align
   6105 .main_zero:
        ; Clear the eight coefficient rows that were just consumed.
   6106    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
   6107 .main:
        ; Entry used when the caller has already zeroed its coefficients (the
        ; 32x8 identity function calls this label directly).
        ; In: m0-m3 = packed words, m6 = zero, m7 = pixel maximum,
        ;     r4/r5/r6 = stride*7/5/3. Clobbers m0-m4.
        ; Word and qword interleaves reorder m0-m3 so that each register holds
        ; destination rows n (low lane) and n+4 (high lane).
   6108    punpckhwd            m4, m0, m1
   6109    punpcklwd            m0, m1
   6110    punpckhwd            m1, m2, m3
   6111    punpcklwd            m2, m3
   6112    punpckhwd            m3, m0, m4
   6113    punpcklwd            m0, m4
   6114    punpckhwd            m4, m2, m1
   6115    punpcklwd            m2, m1
   6116    punpckhqdq           m1, m0, m2
   6117    punpcklqdq           m0, m2
   6118    punpcklqdq           m2, m3, m4
   6119    punpckhqdq           m3, m4
        ; Gather destination rows 0/4, 1/5, 2/6 and 3/7 (16 bytes each) and add.
   6120    mova                xm4, [dstq+strideq*0]
   6121    vinserti128          m4, [dstq+strideq*4], 1
   6122    paddw                m0, m4
   6123    mova                xm4, [dstq+strideq*1]
   6124    vinserti128          m4, [dstq+r5       ], 1
   6125    paddw                m1, m4
   6126    mova                xm4, [dstq+strideq*2]
   6127    vinserti128          m4, [dstq+r6*2     ], 1
   6128    paddw                m2, m4
   6129    mova                xm4, [dstq+r6       ]
   6130    vinserti128          m4, [dstq+r4       ], 1
   6131    paddw                m3, m4
        ; Clamp to [0, pixel maximum] and scatter back to the same rows.
   6132    REPX     {pmaxsw x, m6}, m0, m1, m2, m3
   6133    REPX     {pminsw x, m7}, m0, m1, m2, m3
   6134    mova         [dstq+strideq*0], xm0
   6135    vextracti128 [dstq+strideq*4], m0, 1
   6136    mova         [dstq+strideq*1], xm1
   6137    vextracti128 [dstq+r5       ], m1, 1
   6138    mova         [dstq+strideq*2], xm2
   6139    vextracti128 [dstq+r6*2     ], m2, 1
   6140    mova         [dstq+r6       ], xm3
   6141    vextracti128 [dstq+r4       ], m3, 1
   6142    ret
   6143 
   6144 cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob
        ; DCT/DCT 8x32 transform-and-add, 12 bpc variant.
        ; Arguments: dst, stride, c (coefficient buffer), eob.
        ; eob == 0 takes the dc-only shortcut. Otherwise pass 1 runs one to four
        ; times depending on eob (thresholds 43, 107, 171), and pass 2 runs
        ; either a reduced-input ("fast") or a full-input version.
        ; The cglobal line requests no vector registers; the full path issues
        ; its own PROLOGUE for 16 vector registers and 32*24 bytes of stack
        ; once the dc-only check has been passed.
   6145    test               eobd, eobd
   6146    jz .dconly
   6147    PROLOGUE              0, 7, 16, 32*24, dst, stride, c, eob
   6148 %undef cmp
        ; Pass-1 setup: m11 = rounding term, m12/m13 = 20-bit clamp bounds,
        ; r4 = saved start of the coefficient buffer (each .pass1_main call
        ; advances cq), r6 = scratch pointer into the stack area.
   6149    vpbroadcastd        m11, [pd_2048]
   6150    vpbroadcastd        m12, [clip_20b_min]
   6151    vpbroadcastd        m13, [clip_20b_max]
   6152    mov                  r4, cq
   6153    lea                  r6, [rsp+32*4]
   6154    call .pass1_main
   6155    cmp                eobd, 43
   6156    jge .eob43
   6157    jmp .pass2_fast
   6158 .eob43:
   6159    call .pass1_main
   6160    cmp                eobd, 107
   6161    jge .eob107
   6162 .pass2_fast:
        ; Reached after one or two .pass1_main calls. Only the +0 and +32 byte
        ; column groups of each 128-byte row are read, and the _fast helper
        ; entry points are used. Inputs are clamped to the 18-bit bounds while
        ; being loaded (pmaxsd straight from memory, then pminsd).
   6163    mov                  cq, r4
   6164    vpbroadcastd        m12, [clip_18b_min]
   6165    vpbroadcastd        m13, [clip_18b_max]
   6166    pmaxsd               m0, m12, [cq+128*1+ 0]
   6167    pmaxsd               m1, m12, [cq+128*7+ 0]
   6168    pmaxsd               m2, m12, [cq+128*1+32]
   6169    pmaxsd               m3, m12, [cq+128*7+32]
   6170    REPX    {pminsd x, m13}, m0, m1, m2, m3
   6171    vpbroadcastd        m14, [pd_2896]
   6172    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
   6173    pmaxsd               m0, m12, [cq+128*3+ 0]
   6174    pmaxsd               m1, m12, [cq+128*5+ 0]
   6175    pmaxsd               m2, m12, [cq+128*3+32]
   6176    pmaxsd               m3, m12, [cq+128*5+32]
   6177    REPX    {pminsd x, m13}, m0, m1, m2, m3
   6178    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
   6179    pmaxsd               m0, m12, [cq+128*2+ 0]
   6180    pmaxsd               m1, m12, [cq+128*6+ 0]
   6181    pmaxsd               m2, m12, [cq+128*2+32]
   6182    pmaxsd               m3, m12, [cq+128*6+32]
   6183    REPX    {pminsd x, m13}, m0, m1, m2, m3
   6184    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
   6185    pmaxsd               m0, m12, [cq+128*0+ 0]
   6186    pmaxsd               m1, m12, [cq+128*4+ 0]
   6187    pmaxsd               m2, m12, [cq+128*0+32]
   6188    pmaxsd               m3, m12, [cq+128*4+32]
   6189    REPX    {pminsd x, m13}, m0, m1, m2, m3
        ; The 8-point stage has no fast entry here, so its upper four inputs
        ; are supplied as explicit zeros.
   6190    pxor                 m4, m4
   6191    REPX       {mova x, m4}, m5, m6, m7
   6192    call m(idct_8x8_internal_10bpc).main
   6193    call m(idct_8x16_internal_10bpc).main_evenhalf
   6194    jmp .pass2_end
   6195 .eob107:
   6196    call .pass1_main
   6197    cmp                eobd, 171
   6198    jge .eob171
   6199    jmp .pass2
   6200 .eob171:
   6201    call .pass1_main
   6202 .pass2:
        ; Full pass 2, reached after three or four .pass1_main calls: all four
        ; column groups (+0, +32, +64, +96) of each row are read.
   6203    mov                  cq, r4
   6204    vpbroadcastd        m12, [clip_18b_min]
   6205    vpbroadcastd        m13, [clip_18b_max]
   6206    pmaxsd               m0, m12, [cq+128*1+ 0]
   6207    pmaxsd               m1, m12, [cq+128*7+ 0]
   6208    pmaxsd               m2, m12, [cq+128*1+32]
   6209    pmaxsd               m3, m12, [cq+128*7+32]
   6210    pmaxsd               m4, m12, [cq+128*1+64]
   6211    pmaxsd               m5, m12, [cq+128*7+64]
   6212    pmaxsd               m6, m12, [cq+128*1+96]
   6213    pmaxsd               m7, m12, [cq+128*7+96]
   6214    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   6215    vpbroadcastd        m14, [pd_2896]
   6216    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
   6217    pmaxsd               m0, m12, [cq+128*3+ 0]
   6218    pmaxsd               m1, m12, [cq+128*5+ 0]
   6219    pmaxsd               m2, m12, [cq+128*3+32]
   6220    pmaxsd               m3, m12, [cq+128*5+32]
   6221    pmaxsd               m4, m12, [cq+128*3+64]
   6222    pmaxsd               m5, m12, [cq+128*5+64]
   6223    pmaxsd               m6, m12, [cq+128*3+96]
   6224    pmaxsd               m7, m12, [cq+128*5+96]
   6225    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   6226    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
   6227    pmaxsd               m0, m12, [cq+128*2+ 0]
   6228    pmaxsd               m1, m12, [cq+128*6+ 0]
   6229    pmaxsd               m2, m12, [cq+128*2+32]
   6230    pmaxsd               m3, m12, [cq+128*6+32]
   6231    pmaxsd               m4, m12, [cq+128*2+64]
   6232    pmaxsd               m5, m12, [cq+128*6+64]
   6233    pmaxsd               m6, m12, [cq+128*2+96]
   6234    pmaxsd               m7, m12, [cq+128*6+96]
   6235    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   6236    call m(idct_8x16_internal_10bpc).main_oddhalf
   6237    pmaxsd               m0, m12, [cq+128*0+ 0]
   6238    pmaxsd               m1, m12, [cq+128*4+ 0]
   6239    pmaxsd               m2, m12, [cq+128*0+32]
   6240    pmaxsd               m3, m12, [cq+128*4+32]
   6241    pmaxsd               m4, m12, [cq+128*0+64]
   6242    pmaxsd               m5, m12, [cq+128*4+64]
   6243    pmaxsd               m6, m12, [cq+128*0+96]
   6244    pmaxsd               m7, m12, [cq+128*4+96]
   6245    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   6246    call m(idct_8x8_internal_10bpc).main
   6247    call m(idct_8x16_internal_10bpc).main_evenhalf
   6248 .pass2_end:
        ; Final combine with a 4-bit shift; m11 becomes the matching rounding
        ; term. Qword interleaves pair up neighbouring rows: per the existing
        ; comments, pairs 0-15 stay in registers and pairs 16-31 go to the
        ; slots [r5-32*4] .. [r5+32*3].
        ; NOTE(review): IDCT32_END is defined outside this chunk - argument
        ; roles are inferred from the call sites.
   6249    psrld               m11, 8 ; pd_8
   6250    IDCT32_END            0, 15, 8, 9, 10, 4
   6251    IDCT32_END            1, 14, 8, 9, 10, 4
   6252    punpckhqdq           m8, m0, m1   ; 16 17 (interleaved)
   6253    punpcklqdq           m0, m1       ;  0  1 (interleaved)
   6254    punpcklqdq           m1, m14, m15 ; 14 15 (interleaved)
   6255    punpckhqdq          m14, m15      ; 30 31 (interleaved)
   6256    mova          [r5+32*3], m8
   6257    mova          [r5+32*2], m14
   6258    IDCT32_END            2, 15, 8, 9, 10, 4
   6259    IDCT32_END            3, 14, 8, 9, 10, 4
   6260    punpckhqdq            m8, m2, m3   ; 18 19 (interleaved)
   6261    punpcklqdq            m2, m3       ;  2  3 (interleaved)
   6262    punpcklqdq            m3, m14, m15 ; 12 13 (interleaved)
   6263    punpckhqdq           m14, m15      ; 28 29 (interleaved)
   6264    mova          [r5+32*1], m8
   6265    mova          [r5+32*0], m14
   6266    IDCT32_END            4, 15, 8, 9, 10, 4
   6267    IDCT32_END            5, 14, 8, 9, 10, 4
   6268    punpckhqdq            m8, m4, m5   ; 20 21 (interleaved)
   6269    punpcklqdq            m4, m5       ;  4  5 (interleaved)
   6270    punpcklqdq            m5, m14, m15 ; 10 11 (interleaved)
   6271    punpckhqdq           m14, m15      ; 26 27 (interleaved)
   6272    mova          [r5-32*1], m8
   6273    mova          [r5-32*2], m14
   6274    IDCT32_END            6, 15, 8, 9, 10, 4
   6275    IDCT32_END            7, 14, 8, 9, 10, 4
   6276    punpckhqdq            m8, m6, m7   ; 22 23 (interleaved)
   6277    punpcklqdq            m6, m7       ;  6  7 (interleaved)
   6278    punpcklqdq            m7, m14, m15 ;  8  9 (interleaved)
   6279    punpckhqdq           m14, m15      ; 24 25 (interleaved)
   6280    mova          [r5-32*3], m8
   6281    mova          [r5-32*4], m14
        ; m1 (rows 14 15) is about to be reused as the second write operand,
        ; so keep a copy in m15.
   6282    mova                m15, m1
   6283 .end:
        ; Eight write calls, each fed two row pairs in m0/m1 after a q3120
        ; qword permute. Register pairs first, then the pairs parked at r5,
        ; in ascending row order.
   6284    vpermq               m0, m0, q3120
   6285    vpermq               m1, m2, q3120
   6286    call m(idct_8x8_internal_12bpc).write_8x4_start
   6287    call m(idct_8x8_internal_10bpc).write_8x4
   6288    vpermq               m0, m4, q3120
   6289    vpermq               m1, m6, q3120
   6290    call m(idct_8x8_internal_10bpc).write_8x4
   6291    vpermq               m0, m7, q3120
   6292    vpermq               m1, m5, q3120
   6293    call m(idct_8x8_internal_10bpc).write_8x4
   6294    vpermq               m0, m3, q3120
   6295    vpermq               m1, m15, q3120
   6296    call m(idct_8x8_internal_10bpc).write_8x4
   6297    vpermq               m0, [r5+32*3], q3120
   6298    vpermq               m1, [r5+32*1], q3120
   6299    call m(idct_8x8_internal_10bpc).write_8x4
   6300    vpermq               m0, [r5-32*1], q3120
   6301    vpermq               m1, [r5-32*3], q3120
   6302    call m(idct_8x8_internal_10bpc).write_8x4
   6303    vpermq               m0, [r5-32*4], q3120
   6304    vpermq               m1, [r5-32*2], q3120
   6305    call m(idct_8x8_internal_10bpc).write_8x4
   6306    vpermq               m0, [r5+32*0], q3120
   6307    vpermq               m1, [r5+32*2], q3120
   6308    call m(idct_8x8_internal_10bpc).write_8x4
   6309    RET
   6310 .dconly:
        ; dc-only path: r6d = ((coef[0] * 181) + 640) >> 10, the coefficient is
        ; cleared (eobd is known to be zero here), r3d becomes the row count 32,
        ; and the shared tail in the 8x8 function finishes the job.
   6311    imul                r6d, [cq], 181
   6312    vpbroadcastd         m2, [dconly_12bpc]
   6313    mov                [cq], eobd ; 0
   6314    or                  r3d, 32
   6315    add                 r6d, 640
   6316    sar                 r6d, 10
   6317    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
   6318 ALIGN function_align
   6319 .pass1_main:
        ; One pass-1 step: run the shared part-1 routine, transpose the 8x8
        ; dword block in m0-m7 (m8-m10 and m15 given to the macro as scratch),
        ; store the rows back 128 bytes apart and advance cq by 32 bytes.
   6320    call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1
   6321    TRANSPOSE_8X8_DWORD   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15
   6322    mova         [cq+128*0], m0
   6323    mova         [cq+128*1], m1
   6324    mova         [cq+128*2], m2
   6325    mova         [cq+128*3], m3
   6326    mova         [cq+128*4], m4
   6327    mova         [cq+128*5], m5
   6328    mova         [cq+128*6], m6
   6329    mova         [cq+128*7], m7
   6330    add                  cq, 32
   6331    ret
   6332 ALIGN function_align
   6333 .main_end:
        ; Final combine that keeps the first sixteen results in m0-m15 and
        ; writes the other sixteen to [cq+32*16] .. [cq+32*31]. Called from the
        ; 32x8 12 bpc function. Slots [cq+32*0..2] and [cq+32*12..14] serve as
        ; temporary spill space and are reloaded before returning.
        ; NOTE(review): judging by the store offsets, with a trailing 0 argument
        ; IDCT32_END n leaves results 16+n and 31-n in its first two temporaries
        ; - confirm against the macro, which is outside this chunk.
   6334    psrld               m11, 10 ; pd_2
   6335    IDCT32_END            0, 15, 8, 9, 10, 2, 0
   6336    mova         [cq+32*16], m8
   6337    mova         [cq+32*31], m9
   6338    IDCT32_END            1, 14, 8, 9, 10, 2, 0
   6339    mova         [cq+32*17], m8
   6340    mova         [cq+32*30], m9
        ; m14 is reused as the second operand of the next call: spill it.
   6341    mova         [cq+32*14], m14
   6342    IDCT32_END            2, 14, 8, 9, 10, 2, 0
   6343    mova         [cq+32*18], m8
   6344    mova         [cq+32*29], m9
   6345    mova         [cq+32*13], m14
   6346    IDCT32_END            3, 14, 8, 9, 10, 2, 0
   6347    mova         [cq+32*19], m8
   6348    mova         [cq+32*28], m9
   6349    mova         [cq+32*12], m14
   6350    IDCT32_END            4, 14, 8, 9, 10, 2, 0
   6351    mova         [cq+32*20], m8
   6352    mova         [cq+32*27], m9
        ; From here m8-m10 are passed as data registers, so m0-m2 take over
        ; as temporaries; save their contents first.
   6353    mova         [cq+32* 0], m0
   6354    mova         [cq+32* 1], m1
   6355    mova         [cq+32* 2], m2
   6356    IDCT32_END            5, 10, 0, 1, 2, 2, 0
   6357    mova         [cq+32*21], m0
   6358    mova         [cq+32*26], m1
   6359    IDCT32_END            6, 9, 0, 1, 2, 2, 0
   6360    mova         [cq+32*22], m0
   6361    mova         [cq+32*25], m1
   6362    IDCT32_END            7, 8, 0, 1, 2, 2, 0
   6363    mova         [cq+32*23], m0
   6364    mova         [cq+32*24], m1
        ; Restore the spilled registers; the last m14 result moves to m11 and
        ; the three spilled m14 values return as m12-m14.
   6365    mova                 m0, [cq+32* 0]
   6366    mova                 m1, [cq+32* 1]
   6367    mova                 m2, [cq+32* 2]
   6368    mova                m11, m14
   6369    mova                m12, [cq+32*12]
   6370    mova                m13, [cq+32*13]
   6371    mova                m14, [cq+32*14]
   6372    ret
   6373 
   6374 cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob
        ; 12 bpc variant: only the pixel maximum in m7 differs, so load it and
        ; continue in the 10 bpc implementation at its .pass1 label.
   6375    vpbroadcastd         m7, [pixel_12bpc_max]
   6376    jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1
   6377 
   6378 cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
        ; DCT/DCT 32x8 transform-and-add, 10 bpc variant.
        ; Arguments: dst, stride, c (coefficient buffer), eob.
        ; eob == 0 falls through into the dc-only path; anything else goes to
        ; .full. The 12 bpc function jumps into .dconly and calls .pass1.
   6379    test               eobd, eobd
   6380    jnz .full
        ; dc-only: r6d = coef[0] * 181, m3 = bias constant, clear the
        ; coefficient (eobd is zero here), r3d = 8 rows.
   6381    imul                r6d, [cq], 181
   6382    vpbroadcastd         m3, [dconly_10bpc]
   6383    mov                [cq], eobd ; 0
   6384    or                  r3d, 8
   6385 .dconly:
        ; Expects r6d = coef[0] * 181, m3 = bias constant, r3d = row count.
   6386    add                 r6d, 640
   6387    sar                 r6d, 10
   6388 .dconly2:
        ; r6d = (r6d * 181 + 2176) >> 12, then broadcast (value + bias) as
        ; words into m0.
   6389    imul                r6d, 181
   6390    add                 r6d, 2176
   6391    sar                 r6d, 12
   6392    movd                xm0, r6d
   6393    paddsw              xm0, xm3
   6394    vpbroadcastw         m0, xm0
   6395 .dconly_loop:
        ; Per row: two 32-byte vectors of pixels get the biased dc added with
        ; signed saturation, then the bias removed with unsigned saturation.
        ; NOTE(review): this pair presumably clamps to the valid pixel range;
        ; the exact bounds depend on the dconly_* constant defined elsewhere.
   6396    paddsw               m1, m0, [dstq+32*0]
   6397    paddsw               m2, m0, [dstq+32*1]
   6398    psubusw              m1, m3
   6399    psubusw              m2, m3
   6400    mova        [dstq+32*0], m1
   6401    mova        [dstq+32*1], m2
   6402    add                dstq, strideq
   6403    dec                 r3d
   6404    jg .dconly_loop
   6405    RET
   6406 .full:
        ; Full path: request 16 vector registers and 32*24 bytes of stack,
        ; point r6 at the scratch area and load the 18-bit clamp bounds.
   6407    PROLOGUE              0, 7, 16, 32*24, dst, stride, c, eob
   6408    lea                  r6, [rsp+32*4]
   6409    vpbroadcastd        m12, [clip_18b_min]
   6410    vpbroadcastd        m13, [clip_18b_max]
   6411    call .pass1
   6412    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
        ; Pass-2 setup: r6 = constant-table pointer, m11 = pw_2048 multiplier,
        ; r4 = saved dst so the second half can be addressed as r4+32.
   6413    lea                  r6, [deint_shuf+128]
   6414    vpbroadcastd        m11, [pw_2048]
   6415    mov                  r4, dstq
   6416    call .pass2
        ; Reload the row pairs parked by .main_end, in the register order
        ; .transpose expects.
   6417    mova                 m0, [r5+32*3] ; 16 17
   6418    mova                 m1, [r5+32*2] ; 30 31
   6419    mova                 m2, [r5+32*1] ; 18 19
   6420    mova                 m3, [r5+32*0] ; 28 29
   6421    mova                 m4, [r5-32*1] ; 20 21
   6422    mova                 m5, [r5-32*2] ; 26 27
   6423    mova                 m6, [r5-32*3] ; 22 23
   6424    mova                 m7, [r5-32*4] ; 24 25
   6425    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
   6426    lea                dstq, [r4+32]
   6427    call .pass2
   6428    RET
   6429 ALIGN function_align
   6430 .pass2:
        ; Second pass for one half of the block: run the 8 bpc 16x8 kernel on
        ; m0-m7, scale with pmulhrsw by m11, and write in two steps of four
        ; registers. The final jmp is a tail call, so write_16x4_zero's ret
        ; returns to .pass2's caller.
   6431    call m(idct_16x8_internal_8bpc).main
   6432    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
   6433    call m(idct_16x8_internal_10bpc).write_16x4_start
   6434    pmulhrsw             m0, m11, m4
   6435    pmulhrsw             m1, m11, m5
   6436    pmulhrsw             m2, m11, m6
   6437    pmulhrsw             m3, m11, m7
   6438    jmp m(idct_16x8_internal_10bpc).write_16x4_zero
   6439 ALIGN function_align
   6440 .pass1:
        ; First pass: feeds the 32 coefficient rows (32 bytes apart) to the
        ; shared helpers in four groups of eight. Expects m12/m13 = clamp
        ; bounds and r6 = scratch pointer; loads m11 and m14 itself.
        ; Group 1: rows 1, 7, 9, 15, 17, 23, 25, 31.
   6441    mova                 m0, [cq+32* 1]
   6442    mova                 m1, [cq+32* 7]
   6443    mova                 m2, [cq+32* 9]
   6444    mova                 m3, [cq+32*15]
   6445    mova                 m4, [cq+32*17]
   6446    mova                 m5, [cq+32*23]
   6447    mova                 m6, [cq+32*25]
   6448    mova                 m7, [cq+32*31]
   6449    vpbroadcastd        m11, [pd_2048]
   6450    vpbroadcastd        m14, [pd_2896]
   6451    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
        ; Group 2: rows 3, 5, 11, 13, 19, 21, 27, 29.
   6452    mova                 m0, [cq+32* 3]
   6453    mova                 m1, [cq+32* 5]
   6454    mova                 m2, [cq+32*11]
   6455    mova                 m3, [cq+32*13]
   6456    mova                 m4, [cq+32*19]
   6457    mova                 m5, [cq+32*21]
   6458    mova                 m6, [cq+32*27]
   6459    mova                 m7, [cq+32*29]
   6460    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
        ; Group 3: rows 2, 6, 10, ..., 30.
   6461    mova                 m0, [cq+32* 2]
   6462    mova                 m1, [cq+32* 6]
   6463    mova                 m2, [cq+32*10]
   6464    mova                 m3, [cq+32*14]
   6465    mova                 m4, [cq+32*18]
   6466    mova                 m5, [cq+32*22]
   6467    mova                 m6, [cq+32*26]
   6468    mova                 m7, [cq+32*30]
   6469    call m(idct_8x16_internal_10bpc).main_oddhalf
        ; Group 4: rows 0, 4, 8, ..., 28.
   6470    mova                 m0, [cq+32* 0]
   6471    mova                 m1, [cq+32* 4]
   6472    mova                 m2, [cq+32* 8]
   6473    mova                 m3, [cq+32*12]
   6474    mova                 m4, [cq+32*16]
   6475    mova                 m5, [cq+32*20]
   6476    mova                 m6, [cq+32*24]
   6477    mova                 m7, [cq+32*28]
   6478    call m(idct_8x8_internal_10bpc).main
   6479    call m(idct_8x16_internal_10bpc).main_evenhalf
   6480    ret
   6481 
   6482 cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
        ; Identity/identity 32x8 transform-and-add, 10 bpc variant.
        ; Arguments: dst, stride, c (coefficient buffer), eob.
        ; Register roles: m5 = pmulhrsw multiplier (pw_4096), m6 = zero,
        ; m7 = pixel maximum, r6/r5/r4 = stride*3/5/7 (consumed by the shared
        ; .main routine of the 8x32 identity function).
        ; The 12 bpc variant loads its own maximum into m7 and jumps to .pass1.
   6483    vpbroadcastd         m7, [pixel_10bpc_max]
   6484 .pass1:
   6485    vpbroadcastd         m5, [pw_4096]
   6486    pxor                 m6, m6
        ; Add 21 to the low byte of eob only; restore the saved value if that
        ; byte add carries out.
   6487    mov                 r6d, eobd
   6488    add                eobb, 21
   6489    cmovc              eobd, r6d
   6490    lea                  r6, [strideq*3]
   6491    lea                  r5, [strideq*5]
   6492    lea                  r4, [strideq+r6*2] ; strideq*7
   6493 .loop:
        ; One iteration consumes eight consecutive 32-byte coefficient rows,
        ; zeroing them as it goes, and produces a 16-byte-wide strip of eight
        ; destination rows.
   6494    mova                 m0, [cq+32*0]
   6495    packssdw             m0, [cq+32*1]
   6496    mova                 m1, [cq+32*2]
   6497    packssdw             m1, [cq+32*3]
   6498    REPX {mova [cq+32*x], m6}, 0, 1, 2, 3
        ; cq is advanced in the middle so the second half can use negative
        ; offsets.
   6499    add                  cq, 32*8
   6500    mova                 m2, [cq-32*4]
   6501    packssdw             m2, [cq-32*3]
   6502    mova                 m3, [cq-32*2]
   6503    packssdw             m3, [cq-32*1]
   6504    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
   6505    REPX {mova [cq+32*x], m6}, -4, -3, -2, -1
        ; Coefficients are already cleared, so enter at .main rather than
        ; .main_zero.
   6506    call m(inv_txfm_add_identity_identity_8x32_10bpc).main
        ; Next strip to the right; keep going while the eob budget stays
        ; non-negative.
   6507    add                dstq, 16
   6508    sub                eobd, 64
   6509    jge .loop
   6510    RET
   6511 
   6512 cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob
        ; DCT/DCT 32x8 transform-and-add, 12 bpc variant.
        ; Arguments: dst, stride, c (coefficient buffer), eob.
        ; Reuses the 10 bpc function's dc-only tail and .pass1, with 12 bpc
        ; constants and the 12 bpc final-combine and pass-2 routines.
   6513    test               eobd, eobd
   6514    jnz .full
        ; dc-only: same setup as the 10 bpc function, but with the 12 bpc bias.
   6515    imul                r6d, [cq], 181
   6516    vpbroadcastd         m3, [dconly_12bpc]
   6517    mov                [cq], eobd ; 0
   6518    or                  r3d, 8
   6519    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
   6520 .full:
   6521    PROLOGUE              0, 7, 16, 32*24, dst, stride, c, eob
   6522    lea                  r6, [rsp+32*4]
   6523    vpbroadcastd        m12, [clip_20b_min]
   6524    vpbroadcastd        m13, [clip_20b_max]
   6525    call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1
        ; .main_end keeps sixteen results in m0-m15 and stores the other
        ; sixteen in the coefficient buffer.
   6526    call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end
   6527    mov                  r4, dstq
   6528    call m(idct_16x8_internal_12bpc).pass2_main
        ; Load the second set of sixteen for the right-hand half.
        ; NOTE(review): .main_end stored results 16-31 at [cq+32*16..31], so
        ; reading them at offsets 0-15 relies on pass2_main having advanced cq
        ; by 32*16 - confirm in idct_16x8_internal_12bpc, which is outside this
        ; chunk.
   6529    mova                 m0, [cq+32* 0] ; 16
   6530    mova                 m1, [cq+32* 1] ; 17
   6531    mova                 m2, [cq+32* 2] ; 18
   6532    mova                 m3, [cq+32* 3] ; 19
   6533    mova                 m4, [cq+32* 4] ; 20
   6534    mova                 m5, [cq+32* 5] ; 21
   6535    mova                 m6, [cq+32* 6] ; 22
   6536    mova                 m7, [cq+32* 7] ; 23
   6537    mova                 m8, [cq+32* 8] ; 24
   6538    mova                 m9, [cq+32* 9] ; 25
   6539    mova                m10, [cq+32*10] ; 26
   6540    mova                m11, [cq+32*11] ; 27
   6541    mova                m12, [cq+32*12] ; 28
   6542    mova                m13, [cq+32*13] ; 29
   6543    mova                m14, [cq+32*14] ; 30
   6544    mova                m15, [cq+32*15] ; 31
        ; Second half starts 32 bytes to the right of the saved dst.
   6545    lea                dstq, [r4+32]
   6546    call m(idct_16x8_internal_12bpc).pass2_main
   6547    RET
   6548 
   6549 cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob
        ; 12 bpc entry point for the 32x8 identity/identity transform: it only
        ; broadcasts pixel_12bpc_max into m7 and then continues in the 10 bpc
        ; function at its .pass1 label, so both bit depths share one body.
   6550    vpbroadcastd         m7, [pixel_12bpc_max]
   6551    jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1
   6552 
   6553 %macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2]
        ; Final add/subtract butterfly plus add-to-destination for two rows.
        ;   %1  register number of the first coefficient vector; overwritten
        ;       with the difference result
        ;   %2  address expression (no brackets) of the second coefficient vector
        ;   %3  register number that receives the sum result
        ;   %4  register number used to load [%2]
        ;   %5  offset added to dstq for the row that gets the sum
        ;   %6  offset added to r2 for the row that gets the difference
        ; Expects m15 = pmulhrsw scale factor and m7 = upper clamp bound.
        ; m6 is the lower clamp bound; it is zeroed by the invocation whose
        ; %1 is 0 and is assumed to stay zero for later invocations.
   6554    mova                m%4, [%2]
           ; Saturating 16-bit sum and difference of the two coefficients.
   6555    paddsw              m%3, m%1, m%4
   6556    psubsw              m%1, m%4
           ; Zero m6 only after the add/sub above, so that the same invocation
           ; may still use m6 as its %4 load register.
   6557 %if %1 == 0
   6558    pxor                 m6, m6
   6559 %endif
           ; Scale with rounding, add the existing pixels, clamp to [m6, m7]
           ; and write both rows back.
   6560    pmulhrsw            m%3, m15
   6561    pmulhrsw            m%1, m15
   6562    paddw               m%3, [dstq+%5]
   6563    paddw               m%1, [r2+%6]
   6564    pmaxsw              m%3, m6
   6565    pmaxsw              m%1, m6
   6566    pminsw              m%3, m7
   6567    pminsw              m%1, m7
   6568    mova          [dstq+%5], m%3
   6569    mova            [r2+%6], m%1
   6570 %endmacro
   6571 
   6572 cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
        ; 16x32 DCT_DCT inverse transform added into dst, 10 bpc.
        ; Named arguments (in cglobal order): dst, stride, c, eob.
        ; Flow:
        ;   eob == 0 -> .dconly (shared DC-only code of the 16x4 function).
        ;   Otherwise .main runs one to four times, gated by the cumulative
        ;   eob thresholds 44 / 151 / 279. Every run stores four 32-byte
        ;   vectors below r6, returns four more in m0/m3/m1/m4 and advances
        ;   cq by 32 bytes.
        ;   .fast / .full feed the register-returned vectors to the 8 bpc
        ;   16x32 odd-half helpers, .idct16 feeds the stored vectors to the
        ;   8 bpc idct16 core, and .pass2_end adds the result to dst.
   6573    test               eobd, eobd
   6574    jz .dconly
           ; Full path: prologue with 8 GPRs, 16 vector registers and 32*36
           ; bytes of stack (argument order per x86inc's PROLOGUE).
   6575    PROLOGUE              0, 8, 16, 32*36, dst, stride, c, eob
   6576 %undef cmp
           ; Constants kept in m11-m14 for .main and the helpers it calls.
   6577    vpbroadcastd        m11, [pd_2048]
   6578    vpbroadcastd        m12, [clip_18b_min]
   6579    vpbroadcastd        m13, [clip_18b_max]
   6580    vpbroadcastd        m14, [pd_2896]
           ; r6: store pointer of .main (lowered by 32*4 on every call).
           ; r4/r5: fixed anchors 32*8 and 32*16 bytes above the initial r6,
           ; used to park the vectors that .main returns in registers.
   6581    lea                  r6, [rsp+32*16]
   6582    lea                  r4, [r6+32*8]
   6583    lea                  r5, [r6+32*16]
   6584    call .main
   6585    sub                eobd, 44
   6586    jge .eob44
           ; eob < 44: only one .main pass. Assemble the vectors annotated
           ; 1/3/5/7 from the 128-bit halves of m0/m3/m1/m4, zero m4-m7 and
           ; zero the four stack slots at r6 (r6 was already lowered by .main,
           ; so these are the slots a second pass would have stored to).
   6587    vperm2i128           m2, m0, m3, 0x31 ;  5
   6588    vinserti128          m0, xm3, 1       ;  1
   6589    vperm2i128           m3, m1, m4, 0x31 ;  7
   6590    vinserti128          m1, xm4, 1       ;  3
   6591    pxor                 m4, m4
   6592    REPX       {mova x, m4}, m5, m6, m7
   6593    REPX {mova [r6+32*x], m4}, 0, 1, 2, 3
   6594    jmp .fast
   6595 .dconly:
           ; DC-only: scale the first coefficient by 181, clear it in memory
           ; (eobd is zero here), leave 32 in r3d (eob is r3 under x86inc
           ; naming; presumably the row count for the shared code - TODO
           ; confirm), then round-shift by 8 and scale by 181 again before
           ; continuing in the 16x4 function's .dconly2.
   6596    imul                r6d, [cq], 181
   6597    vpbroadcastd         m3, [dconly_10bpc]
   6598    mov                [cq], eobd ; 0
   6599    or                  r3d, 32
   6600    add                 r6d, 128
   6601    sar                 r6d, 8
   6602    imul                r6d, 181
   6603    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
   6604 .eob44:
           ; Park the halves of m0/m3/m1/m4 at r4+0..127, ordered so that
           ; [r4+32*0..3] later load directly as the vectors 1/3/5/7.
   6605    mova          [r4+16*0], xm0
   6606    mova          [r4+16*1], xm3
   6607    mova          [r4+16*2], xm1
   6608    mova          [r4+16*3], xm4
   6609    vextracti128  [r4+16*4], m0, 1
   6610    vextracti128  [r4+16*5], m3, 1
   6611    vextracti128  [r4+16*6], m1, 1
   6612    vextracti128  [r4+16*7], m4, 1
   6613    call .main
           ; Cumulative threshold: 44 + 107 = 151.
   6614    sub                eobd, 107
   6615    jge .eob151
           ; eob < 151: two passes. Build 15/11/13/9 in m7/m5/m6/m4 (m4 is
           ; written last because it is still a source operand for m7 and
           ; m5), then reload 1/3/5/7 from the r4 area.
   6616    vperm2i128           m7, m1, m4, 0x31 ; 15
   6617    vinserti128          m5, m1, xm4, 1   ; 11
   6618    vperm2i128           m6, m0, m3, 0x31 ; 13
   6619    vinserti128          m4, m0, xm3, 1   ;  9
   6620    mova                 m0, [r4+32*0]
   6621    mova                 m1, [r4+32*1]
   6622    mova                 m2, [r4+32*2]
   6623    mova                 m3, [r4+32*3]
   6624 .fast:
           ; r6 is repurposed as a pointer into the constant area at pw_5+128.
           ; NOTE(review): presumably the table base the 8 bpc helpers
           ; address relative to r6 - confirm in the 8 bpc source.
   6625    lea                  r6, [pw_5+128]
   6626    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
           ; m8-m15 enter .idct16 as zero on this path.
   6627    pxor                 m8, m8
   6628    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
   6629    jmp .idct16
   6630 .eob151:
           ; Park the second pass's register outputs just below r4, giving
           ; [r4-32*4..-1] = vectors 9/11/13/15 (same ordering as at .eob44).
   6631    mova          [r4-16*8], xm0
   6632    mova          [r4-16*7], xm3
   6633    mova          [r4-16*6], xm1
   6634    mova          [r4-16*5], xm4
   6635    vextracti128  [r4-16*4], m0, 1
   6636    vextracti128  [r4-16*3], m3, 1
   6637    vextracti128  [r4-16*2], m1, 1
   6638    vextracti128  [r4-16*1], m4, 1
   6639    call .main
           ; Cumulative threshold: 151 + 128 = 279.
   6640    sub                eobd, 128
   6641    jge .eob279
           ; eob < 279: three passes. Build 21/17/23/19 in m10/m8/m11/m9,
           ; zero m12-m15 and the four stack slots at r6 that a fourth pass
           ; would have stored to.
   6642    vperm2i128          m10, m0, m3, 0x31 ; 21
   6643    vinserti128          m8, m0, xm3, 1   ; 17
   6644    vperm2i128          m11, m1, m4, 0x31 ; 23
   6645    vinserti128          m9, m1, xm4, 1   ; 19
   6646    pxor                m12, m12
   6647    REPX      {mova x, m12}, m13, m14, m15
   6648    REPX {mova [r6+32*x], m12}, 0, 1, 2, 3
   6649    jmp .full
   6650 .eob279:
           ; Park the third pass's register outputs at r5 (vectors
           ; 17/19/21/23), run the fourth pass and build 29/25/31/27 from it.
   6651    mova          [r5+16*0], xm0
   6652    mova          [r5+16*1], xm3
   6653    mova          [r5+16*2], xm1
   6654    mova          [r5+16*3], xm4
   6655    vextracti128  [r5+16*4], m0, 1
   6656    vextracti128  [r5+16*5], m3, 1
   6657    vextracti128  [r5+16*6], m1, 1
   6658    vextracti128  [r5+16*7], m4, 1
   6659    call .main
   6660    vperm2i128          m14, m0, m3, 0x31 ; 29
   6661    vinserti128         m12, m0, xm3, 1   ; 25
   6662    vperm2i128          m15, m1, m4, 0x31 ; 31
   6663    vinserti128         m13, m1, xm4, 1   ; 27
   6664    mova                 m8, [r5+32*0]
   6665    mova                 m9, [r5+32*1]
   6666    mova                m10, [r5+32*2]
   6667    mova                m11, [r5+32*3]
   6668 .full:
           ; Reload the vectors parked around r4 into m0-m7; together with
           ; m8-m15 they are the sixteen inputs of the full odd-half helper.
   6669    mova                 m0, [r4+32*0]
   6670    mova                 m1, [r4+32*1]
   6671    mova                 m2, [r4+32*2]
   6672    mova                 m3, [r4+32*3]
   6673    mova                 m4, [r4-32*4]
   6674    mova                 m5, [r4-32*3]
   6675    mova                 m6, [r4-32*2]
   6676    mova                 m7, [r4-32*1]
   6677    lea                  r6, [pw_5 + 128]
   6678    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
           ; Load the vectors that the third and fourth .main passes stored
           ; at rsp+32*8..11 and rsp+32*4..7 (zero-filled on the eob < 279
           ; path) into m8-m15.
   6679    lea                  r3, [rsp+32*8]
   6680    mova                 m8, [r3+32*0]
   6681    mova                 m9, [r3+32*1]
   6682    mova                m10, [r3+32*2]
   6683    mova                m11, [r3+32*3]
   6684    mova                m12, [r3-32*4]
   6685    mova                m13, [r3-32*3]
   6686    mova                m14, [r3-32*2]
   6687    mova                m15, [r3-32*1]
   6688 .idct16:
           ; Load the vectors stored by the first and second .main passes
           ; (rsp+32*16..19 and rsp+32*12..15) into m0-m7.
   6689    lea                  r3, [rsp+32*16]
   6690    mova                 m0, [r3+32*0]
   6691    mova                 m1, [r3+32*1]
   6692    mova                 m2, [r3+32*2]
   6693    mova                 m3, [r3+32*3]
   6694    mova                 m4, [r3-32*4]
   6695    mova                 m5, [r3-32*3]
   6696    mova                 m6, [r3-32*2]
   6697    mova                 m7, [r3-32*1]
           ; m15 is handed over through the first stack slot.
           ; NOTE(review): presumably because the idct16 core needs the
           ; register as scratch - confirm in the 8 bpc source.
   6698    mova              [rsp], m15
   6699    call m(idct_16x16_internal_8bpc).main
           ; r2 = dst + 19*stride, r3 = 3*stride: row anchors for .pass2_end.
   6700    imul                 r2, strideq, 19
   6701    lea                  r3, [strideq*3]
   6702    add                  r2, dstq
   6703    call .pass2_end
   6704    RET
   6705 ALIGN function_align
   6706 .main:
        ; One first-pass run over sixteen coefficient lines that lie 128 bytes
        ; apart, using one 32-byte vector of each line.
        ; In:  cq, r6 (store pointer), m11 = pd_2048, m14 = pd_2896; m12/m13
        ;      hold the clip constants (presumably used by the helpers).
        ; Out: four vectors stored at [r6+0..127], four more left in
        ;      m0/m1/m3/m4; the sixteen source lines are zeroed; cq += 32;
        ;      r6 -= 32*4. Clobbers r7 and m0-m10, m15.
           ; Odd lines, pre-multiplied by m14, go to the odd-half helper.
   6707    pmulld               m0, m14, [cq+128* 1]
   6708    pmulld               m1, m14, [cq+128* 3]
   6709    pmulld               m2, m14, [cq+128* 5]
   6710    pmulld               m3, m14, [cq+128* 7]
   6711    pmulld               m4, m14, [cq+128* 9]
   6712    pmulld               m5, m14, [cq+128*11]
   6713    pmulld               m6, m14, [cq+128*13]
   6714    pmulld               m7, m14, [cq+128*15]
   6715    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
           ; Even lines, pre-multiplied the same way, go to the 8-point and
           ; even-half helpers.
   6716    pmulld               m0, m14, [cq+128* 0]
   6717    pmulld               m1, m14, [cq+128* 2]
   6718    pmulld               m2, m14, [cq+128* 4]
   6719    pmulld               m3, m14, [cq+128* 6]
   6720    pmulld               m4, m14, [cq+128* 8]
   6721    pmulld               m5, m14, [cq+128*10]
   6722    pmulld               m6, m14, [cq+128*12]
   6723    pmulld               m7, m14, [cq+128*14]
   6724    call m(idct_8x8_internal_10bpc).main_rect2
   6725    call m(idct_8x16_internal_10bpc).main_evenhalf
           ; Final butterflies. m0-m7 are combined with eight vectors read
           ; from [r6-32*4 .. r6+32*3] (presumably the odd-half results left
           ; there by the helper - TODO confirm). pd_1 is added to m0-m7
           ; first, so the arithmetic shift right by 1 below rounds both the
           ; sums and the differences. Each output pair is then packed to
           ; signed 16 bit with saturation.
   6726    psrld               m15, m11, 11 ; pd_1
   6727    mova                 m8, [r6-32*4]
   6728    mova                 m9, [r6-32*3]
   6729    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
   6730    psubd               m10, m0, m8 ; out15
   6731    paddd                m0, m8     ; out0
   6732    mova                 m8, [r6-32*2]
   6733    paddd               m15, m1, m9 ; out1
   6734    psubd                m1, m9     ; out14
   6735    mova                 m9, [r6-32*1]
   6736    REPX       {psrad x, 1}, m0, m15, m10, m1
   6737    packssdw             m0, m15
   6738    packssdw             m1, m10
   6739    psubd               m10, m2, m8 ; out13
   6740    paddd                m2, m8     ; out2
   6741    mova                 m8, [r6+32*0]
   6742    paddd               m15, m3, m9 ; out3
   6743    psubd                m3, m9     ; out12
   6744    mova                 m9, [r6+32*1]
   6745    REPX       {psrad x, 1}, m2, m15, m10, m3
   6746    packssdw             m2, m15
   6747    packssdw             m3, m10
   6748    psubd               m10, m4, m8 ; out11
   6749    paddd                m4, m8     ; out4
   6750    mova                 m8, [r6+32*2]
   6751    paddd               m15, m5, m9 ; out5
   6752    psubd                m5, m9     ; out10
   6753    mova                 m9, [r6+32*3]
   6754    REPX       {psrad x, 1}, m4, m10, m15, m5
   6755    packssdw             m4, m15
   6756    packssdw             m5, m10
   6757    psubd               m10, m6, m8 ; out9
   6758    paddd                m6, m8     ; out6
   6759    paddd               m15, m7, m9 ; out7
   6760    psubd                m7, m9     ; out8
   6761    REPX       {psrad x, 1}, m6, m10, m15, m7
   6762    packssdw             m6, m15
   6763    packssdw             m7, m10
           ; 16-bit transpose, first stage (word interleaves).
   6764    punpckhwd            m8, m0, m2
   6765    punpcklwd            m0, m2
   6766    punpckhwd            m2, m3, m1
   6767    punpcklwd            m3, m1
   6768    punpckhwd            m1, m4, m6
   6769    punpcklwd            m4, m6
   6770    punpcklwd            m6, m7, m5
   6771    punpckhwd            m7, m5
           ; Zero the sixteen source lines just consumed. m5 is free at this
           ; point and serves as the zero register. r7 starts at line 13 and
           ; drops four lines per iteration, so the stores cover lines
           ; 12-15, 8-11, 4-7 and 0-3 before r7 goes negative.
   6772    pxor                 m5, m5
   6773    mov                 r7d, 128*13
   6774 .main_zero_loop:
   6775    mova      [cq+r7-128*1], m5
   6776    mova      [cq+r7+128*0], m5
   6777    mova      [cq+r7+128*1], m5
   6778    mova      [cq+r7+128*2], m5
   6779    sub                 r7d, 128*4
   6780    jg .main_zero_loop
           ; Step to the next 32-byte column group for the following call.
   6781    add                  cq, 32
           ; Transpose, remaining stages (word, then qword interleaves).
   6782    punpcklwd            m5, m3, m2
   6783    punpckhwd            m3, m2
   6784    punpcklwd            m2, m4, m1
   6785    punpckhwd            m4, m1
   6786    punpckhwd            m1, m0, m8
   6787    punpcklwd            m0, m8
   6788    punpckhwd            m8, m6, m7
   6789    punpcklwd            m6, m7
   6790    punpcklqdq           m7, m1, m4
   6791    punpckhqdq           m1, m4
   6792    punpckhqdq           m4, m8, m3
   6793    punpcklqdq           m8, m3
   6794    punpckhqdq           m3, m6, m5
   6795    punpcklqdq           m6, m5
   6796    punpcklqdq           m5, m0, m2
   6797    punpckhqdq           m0, m2
           ; Store m5-m8 split into 128-bit halves (low halves first); the
           ; other four results stay in m0/m1/m3/m4 for the caller.
   6798    mova          [r6+16*0], xm5
   6799    mova          [r6+16*1], xm6
   6800    mova          [r6+16*2], xm7
   6801    mova          [r6+16*3], xm8
   6802    vextracti128  [r6+16*4], m5, 1
   6803    vextracti128  [r6+16*5], m6, 1
   6804    vextracti128  [r6+16*6], m7, 1
   6805    vextracti128  [r6+16*7], m8, 1
   6806    sub                  r6, 32*4
   6807    ret
   6808 ALIGN function_align
   6809 .pass2_end:
        ; Adds the finished second pass to the destination with sixteen
        ; IDCT32_PASS2_END invocations (two rows each).
        ; In:  m0, m2-m15 and stack slot 32*1 hold the first operands; the
        ;      second operands are read from memory around r4 and r5; dstq;
        ;      r2 and r3 as set up by the callers visible in this file
        ;      (r2 = dstq + 19*stride, r3 = 3*stride).
        ; With that setup the first group touches rows 0/31, 4/27, 8/23 and
        ; 12/19; each later group moves dstq one row down and r2 one row up.
        ; The gprsize term in the stack offsets skips the return address
        ; pushed by the call to this label.
           ; Free m6, m7 and m15: the macro needs them for the zero clamp,
           ; the pixel maximum and the pmulhrsw scale.
   6810    mova [rsp+gprsize+32*0], m6
   6811    mova [rsp+gprsize+32*2], m7
   6812    mova [rsp+gprsize+32*3], m15
   6813    vpbroadcastd        m15, [pw_2048]
   6814    vpbroadcastd         m7, [pixel_10bpc_max]
           ; The first invocation uses m1 and m6 as temporaries and, because
           ; its %1 is 0, also zeroes m6 for every later clamp.
   6815    IDCT32_PASS2_END      0, r5+32*3, 1, 6, strideq*0, r3*4
   6816    IDCT32_PASS2_END      4, r5-32*1, 0, 1, strideq*4, strideq*8
   6817    IDCT32_PASS2_END      8, r4+32*3, 0, 4, strideq*8, strideq*4
   6818    IDCT32_PASS2_END     12, r4-32*1, 0, 4, r3*4,      strideq*0
   6819    add                dstq, strideq
   6820    sub                  r2, strideq
           ; Slot 32*1 is not written in this block.
           ; NOTE(review): presumably filled by the preceding idct16 call
           ; with the vector that belongs between m0 and m2 - confirm.
   6821    mova                 m1, [rsp+gprsize+32*1]
   6822    IDCT32_PASS2_END      1, r5+32*2, 0, 4, strideq*0, r3*4
   6823    IDCT32_PASS2_END      5, r5-32*2, 0, 4, strideq*4, strideq*8
   6824    IDCT32_PASS2_END      9, r4+32*2, 0, 4, strideq*8, strideq*4
   6825    IDCT32_PASS2_END     13, r4-32*2, 0, 4, r3*4,      strideq*0
   6826    add                dstq, strideq
   6827    sub                  r2, strideq
           ; m1 = the m6 value saved on entry.
   6828    mova                 m1, [rsp+gprsize+32*0]
   6829    IDCT32_PASS2_END      2, r5+32*1, 0, 4, strideq*0, r3*4
   6830    IDCT32_PASS2_END      1, r5-32*3, 0, 4, strideq*4, strideq*8
   6831    IDCT32_PASS2_END     10, r4+32*1, 0, 4, strideq*8, strideq*4
   6832    IDCT32_PASS2_END     14, r4-32*3, 0, 4, r3*4,      strideq*0
   6833    add                dstq, strideq
   6834    sub                  r2, strideq
           ; m1/m2 = the m7 and m15 values saved on entry.
   6835    mova                 m1, [rsp+gprsize+32*2]
   6836    mova                 m2, [rsp+gprsize+32*3]
   6837    IDCT32_PASS2_END      3, r5+32*0, 0, 4, strideq*0, r3*4
   6838    IDCT32_PASS2_END      1, r5-32*4, 0, 4, strideq*4, strideq*8
   6839    IDCT32_PASS2_END     11, r4+32*0, 0, 4, strideq*8, strideq*4
   6840    IDCT32_PASS2_END      2, r4-32*4, 0, 4, r3*4,      strideq*0
   6841    ret
   6842 
   6843 cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob
        ; 16x32 identity/identity inverse transform added into dst, 10 bpc.
        ; Named arguments (in cglobal order): dst, stride, c, eob.
        ; .pass1 is also the entry used by the 12 bpc function, which arrives
        ; with its own pixel maximum already in m7.
        ; .main handles one tile (eight coefficient lines, 32 bytes of each).
        ; It is called once, then in pairs, with an early return whenever eob
        ; falls below the cumulative thresholds 36 / 143 / 271 / 399.
   6844    vpbroadcastd         m7, [pixel_10bpc_max]
   6845 .pass1:
           ; Constants for .main: m8/m9/m10/m11 hold pw_2896x8, pw_1697x16,
           ; pw_16384 (built as pw_8192 * 2) and pw_8192; m6 is zero.
           ; r6 = 5*stride is not read in this block (presumably used by the
           ; write helper - TODO confirm). r5 remembers the left edge of the
           ; current row band.
   6846    vpbroadcastd         m8, [pw_2896x8]
   6847    vpbroadcastd         m9, [pw_1697x16]
   6848    vpbroadcastd        m11, [pw_8192]
   6849    lea                  r6, [strideq*5]
   6850    pxor                 m6, m6
   6851    paddw               m10, m11, m11 ; pw_16384
   6852    mov                  r5, dstq
   6853    call .main
   6854    sub                eobd, 36
   6855    jl .ret
           ; Tile 2: coefficient lines 8-15, written 16 bytes right of r5.
   6856    add                  cq, 128*8
   6857    lea                dstq, [r5+16]
   6858    call .main
           ; Tile 3: back to lines 0-7 but 32 bytes further along each line,
           ; written eight rows below r5; r5 becomes the new band origin.
   6859    sub                  cq, 128*8-32
   6860    lea                dstq, [r5+strideq*8]
   6861    mov                  r5, dstq
   6862    call .main
   6863    sub                eobd, 107 ; eob < 143
   6864    jl .ret
           ; Tiles 4/5: same right-then-down stepping as tiles 2/3.
   6865    add                  cq, 128*8
   6866    lea                dstq, [r5+16]
   6867    call .main
   6868    sub                  cq, 128*8-32
   6869    lea                dstq, [r5+strideq*8]
   6870    mov                  r5, dstq
   6871    call .main
   6872    sub                eobd, 128 ; eob < 271
   6873    jl .ret
           ; Tiles 6/7.
   6874    add                  cq, 128*8
   6875    lea                dstq, [r5+16]
   6876    call .main
   6877    sub                  cq, 128*8-32
   6878    lea                dstq, [r5+strideq*8]
   6879    mov                  r5, dstq
   6880    call .main
   6881    sub                eobd, 128 ; eob < 399
   6882    jl .ret
           ; Tile 8: right half of the last band.
   6883    add                  cq, 128*8
   6884    lea                dstq, [r5+16]
   6885    call .main
   6886 .ret:
   6887    RET
   6888 ALIGN function_align
   6889 .main:
        ; Loads eight coefficient lines (128 bytes apart, 32 bytes each),
        ; packs line pairs to signed 16 bit with saturation, scales them and
        ; zeroes the source lines, then falls through to .main2 to write.
   6890    mova                 m0, [cq+128*0]
   6891    packssdw             m0, [cq+128*1]
   6892    mova                 m1, [cq+128*2]
   6893    packssdw             m1, [cq+128*3]
   6894    mova                 m2, [cq+128*4]
   6895    packssdw             m2, [cq+128*5]
   6896    mova                 m3, [cq+128*6]
   6897    packssdw             m3, [cq+128*7]
           ; Scaling chain: pmulhrsw by m8 (pw_2896x8), then the IDTX16 macro
           ; (defined outside this chunk) instantiated for m0-m3 with the
           ; arguments 4, 9, 10, then pmulhrsw by m11 (pw_8192), which is a
           ; rounded shift right by 2 if that constant holds 8192.
   6898    REPX  {pmulhrsw x, m8 }, m0, m1, m2, m3
   6899    REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3
   6900    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
           ; Clear the eight source lines (m6 is zero).
   6901    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
   6902 .main2:
        ; Shared tail, also jumped to from the 32x16 identity function's
        ; .main. Word/qword interleaves rearrange m0-m3 into two register
        ; pairs; each pair is passed in m0/m1 to the 8x8 identity function's
        ; write_2x8x2, the second time as a tail call.
   6903    punpckhwd            m4, m0, m1
   6904    punpcklwd            m0, m1
   6905    punpckhwd            m1, m2, m3
   6906    punpcklwd            m2, m3
   6907    punpckhwd            m3, m0, m4
   6908    punpcklwd            m0, m4
   6909    punpcklwd            m4, m2, m1
   6910    punpckhwd            m2, m1
   6911    punpckhqdq           m1, m0, m4
   6912    punpcklqdq           m0, m4
   6913    call m(iidentity_8x8_internal_10bpc).write_2x8x2
   6914    punpcklqdq           m0, m3, m2
   6915    punpckhqdq           m1, m3, m2
   6916    jmp m(iidentity_8x8_internal_10bpc).write_2x8x2
   6917 
   6918 cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob
        ; 12 bpc entry point for the 16x32 identity/identity transform: it
        ; only broadcasts pixel_12bpc_max into m7 and then continues in the
        ; 10 bpc function at .pass1, i.e. right after that function's own
        ; m7 load.
   6919    vpbroadcastd         m7, [pixel_12bpc_max]
   6920    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1
   6921 
   6922 cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
        ; 32x16 DCT_DCT inverse transform added into dst, 10 bpc.
        ; Named arguments (in cglobal order): dst, stride, c, eob.
        ; Flow:
        ;   eob == 0 -> .dconly (continues in the 32x8 function's .dconly2).
        ;   eob < 36 -> one .main pass; the registers a second pass would
        ;               have supplied are zero-filled.
        ;   else     -> .full: two .main passes, 32 bytes apart in cq.
        ;   Either way the 8 bpc idct16 core and .write_16x16 then run
        ;   twice: once at dst and once at dst + 32 bytes (.end).
   6923    test               eobd, eobd
   6924    jz .dconly
           ; Full path: prologue with 8 GPRs, 16 vector registers and 32*40
           ; bytes of stack (argument order per x86inc's PROLOGUE).
   6925    PROLOGUE              0, 8, 16, 32*40, dst, stride, c, eob
   6926 %undef cmp
   6927    vpbroadcastd        m12, [clip_18b_min]
   6928    vpbroadcastd        m13, [clip_18b_max]
   6929    lea                  r6, [rsp+32*4]
   6930    call .main
   6931    cmp                eobd, 36
   6932    jge .full
           ; eob < 36: transpose the single pass's output and zero m8-m14
           ; plus [rsp], the stack slot that carries the sixteenth vector.
   6933    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
   6934    pxor                 m8, m8
   6935    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
           ; r6 now points into the constant area at pw_5+128 (presumably the
           ; table base the 8 bpc core addresses - TODO confirm); r7 keeps
           ; the original dst for .end.
   6936    lea                  r6, [pw_5+128]
   6937    mov                  r7, dstq
   6938    call m(idct_16x16_internal_8bpc).main
   6939    call .write_16x16
           ; Reload the eight vectors that .main_end parked around r5
           ; (annotated there as outputs 16-31) for the right half.
   6940    mova                 m0, [r5+32*3]
   6941    mova                 m1, [r5+32*2]
   6942    mova                 m2, [r5+32*1]
   6943    mova                 m3, [r5+32*0]
   6944    mova                 m4, [r5-32*1]
   6945    mova                 m5, [r5-32*2]
   6946    mova                 m6, [r5-32*3]
   6947    mova                 m7, [r5-32*4]
   6948    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
   6949    pxor                 m8, m8
   6950    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
   6951    jmp .end
   6952 .dconly:
           ; DC-only: scale the first coefficient by 181, clear it in memory
           ; (eobd is zero here), leave 16 in r3d (eob is r3 under x86inc
           ; naming; presumably the row count for the shared code - TODO
           ; confirm), then apply (x+128)>>8, *181 and (x+384)>>9 before
           ; continuing in the 32x8 function's .dconly2.
   6953    imul                r6d, [cq], 181
   6954    vpbroadcastd         m3, [dconly_10bpc]
   6955    mov                [cq], eobd ; 0
   6956    or                  r3d, 16
   6957    add                 r6d, 128
   6958    sar                 r6d, 8
   6959    imul                r6d, 181
   6960    add                 r6d, 384
   6961    sar                 r6d, 9
   6962    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
   6963 .full:
           ; Second first-pass run, 32 bytes further into the coefficient
           ; lines. The first run's m0-m7 are parked around r4 meanwhile.
           ; NOTE(review): r4 and r5 are never initialised in this block;
           ; presumably the helpers called from .main leave them pointing
           ; into the stack frame - confirm before relying on the offsets.
   6964    add                  cq, 32
   6965    mova          [r4+32*3], m0
   6966    mova          [r4+32*2], m1
   6967    mova          [r4+32*1], m2
   6968    mova          [r4+32*0], m3
   6969    mova          [r4-32*1], m4
   6970    mova          [r4-32*2], m5
   6971    mova          [r4-32*3], m6
   6972    mova          [r4-32*4], m7
   6973    call .main
   6974    sub                  r4, 32*16 ; topleft 16x8
   6975    call .transpose_16x16
   6976    lea                  r6, [pw_5+128]
   6977    mov                  r7, dstq
   6978    call m(idct_16x16_internal_8bpc).main
   6979    call .write_16x16
           ; Right half: reload the vectors parked around r5 by the second
           ; run's .main_end, then combine them with the area r4 now selects.
   6980    mova                 m0, [r5+32*3]
   6981    mova                 m1, [r5+32*2]
   6982    mova                 m2, [r5+32*1]
   6983    mova                 m3, [r5+32*0]
   6984    mova                 m4, [r5-32*1]
   6985    mova                 m5, [r5-32*2]
   6986    mova                 m6, [r5-32*3]
   6987    mova                 m7, [r5-32*4]
   6988    add                  r4, 32*8 ; bottomleft 16x8
   6989    call .transpose_16x16
   6990 .end:
           ; Second idct16 + write, 32 bytes to the right of the saved dst.
   6991    lea                dstq, [r7+32]
   6992    call m(idct_16x16_internal_8bpc).main
   6993    call .write_16x16
   6994    RET
   6995 ALIGN function_align
   6996 .transpose_16x16:
        ; Transposes the vectors currently in m0-m7 with dword/qword
        ; interleaves plus 128-bit lane merges and leaves the result in
        ; m8-m15 (m15 goes to the first stack slot). It then loads eight
        ; more vectors from around r4 into m0-m7 and tail-jumps into the
        ; 8x32 function's .transpose to process those.
   6997    punpckhdq            m8, m3, m1
   6998    punpckldq            m3, m1
   6999    punpckhdq            m1, m0, m2
   7000    punpckldq            m0, m2
   7001    punpckhdq            m2, m7, m5
   7002    punpckldq            m7, m5
   7003    punpckhdq            m5, m4, m6
   7004    punpckldq            m4, m6
   7005    punpckhqdq           m6, m0, m4
   7006    punpcklqdq           m0, m4
   7007    punpckhqdq           m4, m1, m5
   7008    punpcklqdq           m1, m5
   7009    punpckhqdq           m5, m7, m3
   7010    punpcklqdq           m7, m3
   7011    punpckhqdq           m3, m2, m8
   7012    punpcklqdq           m2, m8
           ; Merge lanes: m8-m11 take the low 128-bit halves of each register
           ; pair, m12-m15 the high halves.
   7013    vinserti128          m8, m0, xm7, 1
   7014    vperm2i128          m12, m0, m7, 0x31
   7015    vinserti128          m9, m6, xm5, 1
   7016    vperm2i128          m13, m6, m5, 0x31
   7017    vinserti128         m10, m1, xm2, 1
   7018    vperm2i128          m14, m1, m2, 0x31
   7019    vinserti128         m11, m4, xm3, 1
   7020    vperm2i128          m15, m4, m3, 0x31
   7021    mova                 m0, [r4+32*3]
   7022    mova                 m1, [r4+32*2]
   7023    mova                 m2, [r4+32*1]
   7024    mova                 m3, [r4+32*0]
   7025    mova                 m4, [r4-32*1]
   7026    mova                 m5, [r4-32*2]
   7027    mova                 m6, [r4-32*3]
   7028    mova                 m7, [r4-32*4]
           ; gprsize skips this routine's return address, so the store lands
           ; in the slot the caller addresses as [rsp].
   7029    mova      [rsp+gprsize], m15
   7030    jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
   7031 ALIGN function_align
   7032 .main:
        ; One first-pass run over 32 coefficient lines that lie 64 bytes
        ; apart, using one 32-byte vector of each line. Every group of eight
        ; lines is pre-multiplied by m14 (pd_2896) and handed to the matching
        ; helper; the lines are then zeroed and .main_end finishes the run.
        ; Sets m11 and m14 itself; m12/m13 (clip constants) come from the
        ; caller. Clobbers r7.
   7033    vpbroadcastd        m14, [pd_2896]
   7034    vpbroadcastd        m11, [pd_2048]
   7035    pmulld               m0, m14, [cq+64* 1]
   7036    pmulld               m1, m14, [cq+64* 7]
   7037    pmulld               m2, m14, [cq+64* 9]
   7038    pmulld               m3, m14, [cq+64*15]
   7039    pmulld               m4, m14, [cq+64*17]
   7040    pmulld               m5, m14, [cq+64*23]
   7041    pmulld               m6, m14, [cq+64*25]
   7042    pmulld               m7, m14, [cq+64*31]
   7043    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
   7044    pmulld               m0, m14, [cq+64* 3]
   7045    pmulld               m1, m14, [cq+64* 5]
   7046    pmulld               m2, m14, [cq+64*11]
   7047    pmulld               m3, m14, [cq+64*13]
   7048    pmulld               m4, m14, [cq+64*19]
   7049    pmulld               m5, m14, [cq+64*21]
   7050    pmulld               m6, m14, [cq+64*27]
   7051    pmulld               m7, m14, [cq+64*29]
   7052    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
   7053    pmulld               m0, m14, [cq+64* 2]
   7054    pmulld               m1, m14, [cq+64* 6]
   7055    pmulld               m2, m14, [cq+64*10]
   7056    pmulld               m3, m14, [cq+64*14]
   7057    pmulld               m4, m14, [cq+64*18]
   7058    pmulld               m5, m14, [cq+64*22]
   7059    pmulld               m6, m14, [cq+64*26]
   7060    pmulld               m7, m14, [cq+64*30]
   7061    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
   7062    pmulld               m0, m14, [cq+64* 0]
   7063    pmulld               m1, m14, [cq+64* 4]
   7064    pmulld               m2, m14, [cq+64* 8]
   7065    pmulld               m3, m14, [cq+64*12]
   7066    pmulld               m4, m14, [cq+64*16]
   7067    pmulld               m5, m14, [cq+64*20]
   7068    pmulld               m6, m14, [cq+64*24]
   7069    pmulld               m7, m14, [cq+64*28]
   7070    call m(idct_8x8_internal_10bpc).main_rect2
   7071    call m(idct_8x16_internal_10bpc).main_evenhalf
           ; Zero the 32 source lines just consumed: r7 starts at line 30
           ; and drops four lines per iteration, so the stores cover lines
           ; 28-31 down to 0-3 before r7 goes negative.
   7072    pxor                 m8, m8
   7073    mov                 r7d, 64*30
   7074 .main_zero_loop:
   7075    mova       [cq+r7-64*2], m8
   7076    mova       [cq+r7-64*1], m8
   7077    mova       [cq+r7+64*0], m8
   7078    mova       [cq+r7+64*1], m8
   7079    sub                 r7d, 64*4
   7080    jg .main_zero_loop
   7081 .main_end:
        ; Applies the IDCT32_END macro (defined outside this chunk) to
        ; m0-m7 in pairs. After each pair, word interleaves leave the low
        ; outputs (annotated 0-15) in m0-m7 and the matching high outputs
        ; (16-31) are parked around r5 for the caller to reload.
   7082    psrld               m11, 11 ; pd_1
   7083    IDCT32_END            0, 15, 8, 9, 10, 1
   7084    IDCT32_END            1, 14, 8, 9, 10, 1
   7085    punpckhwd            m8, m0, m1   ; 16 17
   7086    punpcklwd            m0, m1       ;  0  1
   7087    punpcklwd            m1, m14, m15 ; 14 15
   7088    punpckhwd           m14, m15      ; 30 31
   7089    mova          [r5+32*3], m8
   7090    mova          [r5+32*2], m14
   7091    IDCT32_END            2, 15, 8, 9, 10, 1
   7092    IDCT32_END            3, 14, 8, 9, 10, 1
   7093    punpckhwd            m8, m2, m3   ; 18 19
   7094    punpcklwd            m2, m3       ;  2  3
   7095    punpcklwd            m3, m14, m15 ; 12 13
   7096    punpckhwd           m14, m15      ; 28 29
   7097    mova          [r5+32*1], m8
   7098    mova          [r5+32*0], m14
   7099    IDCT32_END            4, 15, 8, 9, 10, 1
   7100    IDCT32_END            5, 14, 8, 9, 10, 1
   7101    punpckhwd            m8, m4, m5   ; 20 21
   7102    punpcklwd            m4, m5       ;  4  5
   7103    punpcklwd            m5, m14, m15 ; 10 11
   7104    punpckhwd           m14, m15      ; 26 27
   7105    mova          [r5-32*1], m8
   7106    mova          [r5-32*2], m14
   7107    IDCT32_END            6, 15, 8, 9, 10, 1
   7108    IDCT32_END            7, 14, 8, 9, 10, 1
   7109    punpckhwd            m8, m6, m7   ; 22 23
   7110    punpcklwd            m6, m7       ;  6  7
   7111    punpcklwd            m7, m14, m15 ;  8  9
   7112    punpckhwd           m14, m15      ; 24 25
   7113    mova          [r5-32*3], m8
   7114    mova          [r5-32*4], m14
   7115    ret
   7116 ALIGN function_align
   7117 .write_16x16:
        ; Scales sixteen result vectors by pw_2048 with pmulhrsw and writes
        ; them in four groups through the 16x8 function's write_16x4.
        ; In:  m0, m2-m15 in registers; the second vector comes from stack
        ;      slot 32*1 (presumably left there by the idct16 core - TODO
        ;      confirm); dstq, strideq.
        ; The gprsize term skips this routine's return address.
           ; Fetch the second vector first, then reuse slots 0-2 to save
           ; m8, m9 and m12, which are needed below for the zero register,
           ; the pixel maximum and the scale constant.
   7118    mova                 m1, [rsp+gprsize+32*1]
   7119    mova [rsp+gprsize+32*0], m8
   7120    mova [rsp+gprsize+32*1], m9
   7121    mova [rsp+gprsize+32*2], m12
   7122    vpbroadcastd        m12, [pw_2048]
   7123    vpbroadcastd         m9, [pixel_10bpc_max]
   7124    lea                  r3, [strideq*3]
   7125    pxor                 m8, m8
           ; Group 1: vectors 0-3.
   7126    pmulhrsw             m0, m12
   7127    pmulhrsw             m1, m12
   7128    pmulhrsw             m2, m12
   7129    pmulhrsw             m3, m12
   7130    call m(idct_16x8_internal_10bpc).write_16x4
           ; Group 2: m4-m7.
   7131    pmulhrsw             m0, m12, m4
   7132    pmulhrsw             m1, m12, m5
   7133    pmulhrsw             m2, m12, m6
   7134    pmulhrsw             m3, m12, m7
   7135    call m(idct_16x8_internal_10bpc).write_16x4
           ; Group 3: the saved m8 and m9 from the stack, then m10 and m11.
   7136    pmulhrsw             m0, m12, [rsp+gprsize+32*0]
   7137    pmulhrsw             m1, m12, [rsp+gprsize+32*1]
   7138    pmulhrsw             m2, m12, m10
   7139    pmulhrsw             m3, m12, m11
   7140    call m(idct_16x8_internal_10bpc).write_16x4
           ; Group 4: the saved m12 from the stack, then m13-m15. The last
           ; write is a tail call.
   7141    pmulhrsw             m0, m12, [rsp+gprsize+32*2]
   7142    pmulhrsw             m1, m12, m13
   7143    pmulhrsw             m2, m12, m14
   7144    pmulhrsw             m3, m12, m15
   7145    jmp m(idct_16x8_internal_10bpc).write_16x4
   7146 
   7147 cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob
        ; 32x16 identity/identity inverse transform added into dst, 10 bpc.
        ; Named arguments (in cglobal order): dst, stride, c, eob.
        ; .pass1 is also the entry used by the 12 bpc function, which arrives
        ; with its own pixel maximum already in m7.
        ; .main handles one tile (eight coefficient lines, 32 bytes of each).
        ; It is called once, then in pairs, with an early return whenever eob
        ; falls below the cumulative thresholds 36 / 143 / 271 / 399.
   7148    vpbroadcastd         m7, [pixel_10bpc_max]
   7149 .pass1:
           ; Constants for .main: m8/m9/m10 hold pw_2896x8, pw_1697x16 and
           ; pw_4096; m6 is zero. r6 = 5*stride is not read in this block
           ; (presumably used by the write helper - TODO confirm). r5 keeps
           ; the original dst as the base for the 16-byte column steps.
   7150    vpbroadcastd         m8, [pw_2896x8]
   7151    vpbroadcastd         m9, [pw_1697x16]
   7152    vpbroadcastd        m10, [pw_4096]
   7153    lea                  r6, [strideq*5]
   7154    pxor                 m6, m6
   7155    mov                  r5, dstq
   7156    call .main
   7157    sub                eobd, 36
   7158    jl .ret
           ; Tile 2: 32 bytes further along the same coefficient lines; dst
           ; moves 4*stride on from wherever .main left dstq.
   7159    add                  cq, 32
   7160    lea                dstq, [dstq+strideq*4]
   7161    call .main
           ; Tile 3: the next eight coefficient lines (net cq advance since
           ; tile 1 is 64*8), written 16 bytes right of the original dst.
   7162    add                  cq, 64*8-32
   7163    lea                dstq, [r5+16*1]
   7164    call .main
   7165    sub                eobd, 107 ; eob < 143
   7166    jl .ret
           ; Tiles 4/5: same stepping, column offset 16*2.
   7167    add                  cq, 32
   7168    lea                dstq, [dstq+strideq*4]
   7169    call .main
   7170    add                  cq, 64*8-32
   7171    lea                dstq, [r5+16*2]
   7172    call .main
   7173    sub                eobd, 128 ; eob < 271
   7174    jl .ret
           ; Tiles 6/7: column offset 16*3.
   7175    add                  cq, 32
   7176    lea                dstq, [dstq+strideq*4]
   7177    call .main
   7178    add                  cq, 64*8-32
   7179    lea                dstq, [r5+16*3]
   7180    call .main
   7181    sub                eobd, 128 ; eob < 399
   7182    jl .ret
           ; Tile 8: lower part of the last column.
   7183    add                  cq, 32
   7184    lea                dstq, [dstq+strideq*4]
   7185    call .main
   7186 .ret:
   7187    RET
   7188 ALIGN function_align
   7189 .main:
        ; Loads eight coefficient lines (64 bytes apart, 32 bytes each),
        ; packs line pairs to signed 16 bit with saturation, scales them and
        ; zeroes the source lines, then tail-jumps to the 16x32 identity
        ; function's .main2 for the transpose and write-out.
   7190    mova                 m0, [cq+64*0]
   7191    packssdw             m0, [cq+64*1]
   7192    mova                 m1, [cq+64*2]
   7193    packssdw             m1, [cq+64*3]
   7194    mova                 m2, [cq+64*4]
   7195    packssdw             m2, [cq+64*5]
   7196    mova                 m3, [cq+64*6]
   7197    packssdw             m3, [cq+64*7]
           ; Scaling chain: pmulhrsw by m8 (pw_2896x8), saturating doubling,
           ; the IDTX16 macro (defined outside this chunk) instantiated for
           ; m0-m3 with the arguments 4, 9, _, then pmulhrsw by m10
           ; (pw_4096), which is a rounded shift right by 3 if that constant
           ; holds 4096.
   7198    REPX  {pmulhrsw x, m8 }, m0, m1, m2, m3
   7199    REPX  {paddsw   x, x  }, m0, m1, m2, m3
   7200    REPX  {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3
   7201    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3
           ; Clear the eight source lines (m6 is zero).
   7202    REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
   7203    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
   7204 
   7205 cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob
        ; 12 bpc entry point for the 32x16 identity/identity transform: it
        ; only broadcasts pixel_12bpc_max into m7 and then continues in the
        ; 10 bpc function at .pass1, i.e. right after that function's own
        ; m7 load.
   7206    vpbroadcastd         m7, [pixel_12bpc_max]
   7207    jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1
   7208 
   7209 cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
   7210    test               eobd, eobd
   7211    jz .dconly
   7212    PROLOGUE              0, 8, 16, 32*83, dst, stride, c, eob
   7213 %undef cmp
   7214    vpbroadcastd        m12, [clip_18b_min]
   7215    vpbroadcastd        m13, [clip_18b_max]
   7216    lea                  r6, [rsp+32*7]
   7217    call .main
   7218    cmp                eobd, 36
   7219    jl .fast
   7220    call .main
   7221    cmp                eobd, 136
   7222    jl .fast
   7223    call .main
   7224    cmp                eobd, 300
   7225    jl .fast
   7226    call .main
   7227    jmp .pass2
   7228 .dconly:
   7229    imul                r6d, [cq], 181
   7230    vpbroadcastd         m3, [dconly_10bpc]
   7231    mov                [cq], eobd ; 0
   7232    or                  r3d, 32
   7233    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
   7234 .fast:
   7235    lea                  r4, [rsp+32*71]
   7236    pxor                 m0, m0
   7237 .fast_loop:
   7238    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
   7239    add                  r6, 32*8
   7240    cmp                  r6, r4
   7241    jl .fast_loop
   7242 .pass2:
   7243    lea                  r3, [rsp+32*3]
   7244    mov                  r4, r6
   7245    lea                  r5, [r6+32*8]
   7246    lea                  r6, [pw_5+128]
   7247    call .pass2_oddhalf
   7248    call .pass2_evenhalf
   7249    imul                 r2, strideq, 19
   7250    lea                  r3, [strideq*3]
   7251    add                  r2, dstq
   7252    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
   7253    sub                dstq, r3
   7254    lea                  r2, [r2+r3+32]
   7255    add                dstq, 32
   7256    lea                  r3, [rsp+32*11]
   7257    call .pass2_oddhalf
   7258    call .pass2_evenhalf
   7259    lea                  r3, [strideq*3]
   7260    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
   7261    RET
   7262 ALIGN function_align
   7263 .main:
        ; First-pass worker: transforms one 32-byte-wide slice of the
        ; coefficient buffer at cq (32 rows, 128 bytes apart), clears those
        ; coefficients, advances cq by 32 and stores the result rows through
        ; r4 and r5.
        ; NOTE(review): r4 and r5 are not written in this routine; presumably
        ; one of the called helpers leaves them pointing at the two 8-row
        ; output blocks - confirm against the helpers.
           ; rows 1, 7, 9, 15, 17, 23, 25, 31
   7264    mova                 m0, [cq+128* 1]
   7265    mova                 m1, [cq+128* 7]
   7266    mova                 m2, [cq+128* 9]
   7267    mova                 m3, [cq+128*15]
   7268    mova                 m4, [cq+128*17]
   7269    mova                 m5, [cq+128*23]
   7270    mova                 m6, [cq+128*25]
   7271    mova                 m7, [cq+128*31]
   7272    vpbroadcastd        m11, [pd_2048]
   7273    vpbroadcastd        m14, [pd_2896]
   7274    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
           ; rows 3, 5, 11, 13, 19, 21, 27, 29
   7275    mova                 m0, [cq+128* 3]
   7276    mova                 m1, [cq+128* 5]
   7277    mova                 m2, [cq+128*11]
   7278    mova                 m3, [cq+128*13]
   7279    mova                 m4, [cq+128*19]
   7280    mova                 m5, [cq+128*21]
   7281    mova                 m6, [cq+128*27]
   7282    mova                 m7, [cq+128*29]
   7283    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
           ; rows 2, 6, 10, ..., 30
   7284    mova                 m0, [cq+128* 2]
   7285    mova                 m1, [cq+128* 6]
   7286    mova                 m2, [cq+128*10]
   7287    mova                 m3, [cq+128*14]
   7288    mova                 m4, [cq+128*18]
   7289    mova                 m5, [cq+128*22]
   7290    mova                 m6, [cq+128*26]
   7291    mova                 m7, [cq+128*30]
   7292    call m(idct_8x16_internal_10bpc).main_oddhalf
           ; rows 0, 4, 8, ..., 28
   7293    mova                 m0, [cq+128* 0]
   7294    mova                 m1, [cq+128* 4]
   7295    mova                 m2, [cq+128* 8]
   7296    mova                 m3, [cq+128*12]
   7297    mova                 m4, [cq+128*16]
   7298    mova                 m5, [cq+128*20]
   7299    mova                 m6, [cq+128*24]
   7300    mova                 m7, [cq+128*28]
   7301    call m(idct_8x8_internal_10bpc).main
   7302    call m(idct_8x16_internal_10bpc).main_evenhalf
   7303    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
           ; Clear the 32 rows just consumed: r7 takes the values 128*29,
           ; 128*25, ..., 128*1 and each iteration zeroes four rows (32 bytes
           ; per row).
   7304    pxor                m15, m15
   7305    mov                 r7d, 128*29
   7306 .main_zero_loop:
   7307    mova      [cq+r7-128*1], m15
   7308    mova      [cq+r7+128*0], m15
   7309    mova      [cq+r7+128*1], m15
   7310    mova      [cq+r7+128*2], m15
   7311    sub                 r7d, 128*4
   7312    jg .main_zero_loop
           ; step to the next 32-byte slice for the following call
   7313    add                  cq, 32
           ; store m0-m7 as the 8-row block around r4
   7314    mova          [r4-32*4], m0
   7315    mova          [r4-32*3], m1
   7316    mova          [r4-32*2], m2
   7317    mova          [r4-32*1], m3
   7318    mova          [r4+32*0], m4
   7319    mova          [r4+32*1], m5
   7320    mova          [r4+32*2], m6
   7321    mova          [r4+32*3], m7
           ; reload the 8-row block around r5 in reverse row order, transpose
           ; it and write it back in forward order
   7322    mova                 m0, [r5+32*3]
   7323    mova                 m1, [r5+32*2]
   7324    mova                 m2, [r5+32*1]
   7325    mova                 m3, [r5+32*0]
   7326    mova                 m4, [r5-32*1]
   7327    mova                 m5, [r5-32*2]
   7328    mova                 m6, [r5-32*3]
   7329    mova                 m7, [r5-32*4]
   7330    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
   7331    mova          [r5-32*4], m0
   7332    mova          [r5-32*3], m1
   7333    mova          [r5-32*2], m2
   7334    mova          [r5-32*1], m3
   7335    mova          [r5+32*0], m4
   7336    mova          [r5+32*1], m5
   7337    mova          [r5+32*2], m6
   7338    mova          [r5+32*3], m7
   7339    ret
   7340 ALIGN function_align
   7341 .pass2_oddhalf:
        ; Loads the 16 odd-indexed second-pass inputs (1, 3, ..., 31) from
        ; the buffer at r3 into m0-m15 and tail-jumps to the 8bpc 16x32
        ; main_oddhalf routine, which returns to this routine's caller.
        ; Input n lives at r3+32*((n/8)*16 + n%8): each group of 8 rows is
        ; 32*16 bytes after the previous one.
   7342    mova                 m0, [r3+32* 1] ;  1
   7343    mova                 m1, [r3+32* 3] ;  3
   7344    mova                 m2, [r3+32* 5] ;  5
   7345    mova                 m3, [r3+32* 7] ;  7
   7346    mova                 m4, [r3+32*17] ;  9
   7347    mova                 m5, [r3+32*19] ; 11
   7348    mova                 m6, [r3+32*21] ; 13
   7349    mova                 m7, [r3+32*23] ; 15
   7350    mova                 m8, [r3+32*33] ; 17
   7351    mova                 m9, [r3+32*35] ; 19
   7352    mova                m10, [r3+32*37] ; 21
   7353    mova                m11, [r3+32*39] ; 23
   7354    mova                m12, [r3+32*49] ; 25
   7355    mova                m13, [r3+32*51] ; 27
   7356    mova                m14, [r3+32*53] ; 29
   7357    mova                m15, [r3+32*55] ; 31
   7358    jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
   7359 ALIGN function_align
   7360 .pass2_evenhalf:
        ; Loads the 16 even-indexed second-pass inputs (0, 2, ..., 30) from
        ; the buffer at r3 (same row layout as .pass2_oddhalf) into m0-m15
        ; and tail-jumps to the 8bpc 16x16 idct main routine.
   7361    mova                 m0, [r3+32* 0] ;  0
   7362    mova                 m1, [r3+32* 2] ;  2
   7363    mova                 m2, [r3+32* 4] ;  4
   7364    mova                 m3, [r3+32* 6] ;  6
   7365    mova                 m4, [r3+32*16] ;  8
   7366    mova                 m5, [r3+32*18] ; 10
   7367    mova                 m6, [r3+32*20] ; 12
   7368    mova                 m7, [r3+32*22] ; 14
   7369    mova                 m8, [r3+32*32] ; 16
   7370    mova                 m9, [r3+32*34] ; 18
   7371    mova                m10, [r3+32*36] ; 20
   7372    mova                m11, [r3+32*38] ; 22
   7373    mova                m12, [r3+32*48] ; 24
   7374    mova                m13, [r3+32*50] ; 26
   7375    mova                m14, [r3+32*52] ; 28
   7376    mova                m15, [r3+32*54] ; 30
           ; Input 30 is also spilled to the stack. The gprsize offset skips
           ; the return address pushed by the call into this routine, so the
           ; slot is the caller's [rsp] - the same slot other callers in this
           ; file fill before calling idct_16x16_internal_8bpc .main.
   7377    mova      [rsp+gprsize], m15
   7378    jmp m(idct_16x16_internal_8bpc).main
   7379 
   7380 cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob
        ; 32x32 identity/identity inverse transform and add, 10 bpc.
        ; x86inc cglobal: 4 arguments (dst, stride, c, eob), 8 GPRs, 8 vector
        ; registers.
        ; The block is handled as up to sixteen tiles, each fed by .main from
        ; 8 coefficient rows of 32 bytes. Tiles are visited anti-diagonal by
        ; anti-diagonal (1, 2, 3, 4, 3, 2, 1 tiles) and processing stops
        ; after a diagonal once eob is below its threshold
        ; (36, 136, 300, 535, 755, 911).
        ; The 12bpc entry point jumps to .pass1 with a different m7.
   7381 %undef cmp
           ; m7 = maximum pixel value; presumably the clamp bound used by the
           ; shared tail that .main jumps to - confirm there
   7382    vpbroadcastd         m7, [pixel_10bpc_max]
   7383 .pass1:
           ; m5 = pmulhrsw multiplier for .main, m6 = 0,
           ; r6/r5/r4 = stride*3 / stride*5 / stride*7
   7384    vpbroadcastd         m5, [pw_8192]
   7385    pxor                 m6, m6
   7386    lea                  r6, [strideq*3]
   7387    lea                  r5, [strideq*5]
   7388    lea                  r4, [strideq+r6*2] ; strideq*7
   7389    call .main                              ; 0
   7390    cmp                eobd, 36
   7391    jl .ret
   7392    add                  cq, 128*8          ; 0 1
   7393    mov                  r7, dstq           ; 1
   7394    add                dstq, 16
   7395    call .main
   7396    call .main2
   7397    cmp                eobd, 136
   7398    jl .ret
   7399    add                  cq, 128*16-32      ; 0 1 2
   7400    lea                dstq, [r7+16*2]      ; 1 2
   7401    call .main                              ; 2
   7402    call .main2
   7403    call .main2
   7404    cmp                eobd, 300
   7405    jl .ret
   7406    add                  cq, 128*24-64      ; 0 1 2 3
   7407    add                  r7, 16*3           ; 1 2 3
   7408    mov                dstq, r7             ; 2 3
   7409    call .main                              ; 3
   7410    call .main2
   7411    call .main2
   7412    call .main2
   7413    cmp                eobd, 535
   7414    jl .ret
   7415    add                  cq, 128*24-64      ; 0 1 2 3
   7416    lea                dstq, [r7+strideq*8] ; 1 2 3 4
   7417    mov                  r7, dstq           ; 2 3 4
   7418    call .main                              ; 3 4
   7419    call .main2
   7420    call .main2
   7421    cmp                eobd, 755
   7422    jl .ret
   7423    add                  cq, 128*16-32      ; 0 1 2 3
   7424    lea                dstq, [r7+strideq*8] ; 1 2 3 4
   7425    call .main                              ; 2 3 4 5
   7426    call .main2                             ; 3 4 5
   7427    cmp                eobd, 911
   7428    jl .ret
   7429    add                  cq, 128*8          ; 0 1 2 3
   7430    add                dstq, 16             ; 1 2 3 4
   7431    call .main                              ; 2 3 4 5
   7432 .ret:                                       ; 3 4 5 6
   7433    RET
   7434 ALIGN function_align
   7435 .main2:
           ; Step to the next tile on the current anti-diagonal: coefficients
           ; move back 8 rows and forward 32 bytes, the destination moves
           ; down 8 lines and back 16 bytes. Falls through into .main.
   7436    sub                  cq, 128*8-32
   7437    lea                dstq, [dstq+strideq*8-16]
   7438 .main:
           ; Load 8 rows of 32-bit coefficients and pack each pair of rows to
           ; 16 bit with signed saturation (m0-m3), then scale with pmulhrsw
           ; by m5. With 8192 in every word (as the name pw_8192 suggests)
           ; that is a rounding shift, (x + 2) >> 2.
   7439    mova                 m0, [cq+128*0]
   7440    packssdw             m0, [cq+128*1]
   7441    mova                 m1, [cq+128*2]
   7442    packssdw             m1, [cq+128*3]
   7443    mova                 m2, [cq+128*4]
   7444    packssdw             m2, [cq+128*5]
   7445    mova                 m3, [cq+128*6]
   7446    packssdw             m3, [cq+128*7]
   7447    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
           ; Tail-jump: the shared 8x32 routine finishes the tile and returns
           ; to our caller. NOTE(review): presumably it zeroes the
           ; coefficients and adds to dst using m6/m7 - confirm.
   7448    jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero
   7449 
   7450 cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob
        ; 12 bpc entry point: differs from the 10bpc version only in the
        ; value loaded into m7 (pixel_12bpc_max), then shares its code from
        ; .pass1 onwards. The cglobal register counts match the 10bpc
        ; function, whose RET is reused.
   7451    vpbroadcastd         m7, [pixel_12bpc_max]
   7452    jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1
   7453 
   7454 %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
        ; Final idct64 butterfly plus add-to-destination for four output rows
        ; n, 31-n, 32+n and 63-n, where n is arg 1.
        ;   arg 1      output index n (its parity selects the operand layout)
        ;   args 2-3   source registers; both are overwritten
        ;   args 4-6   temporary registers
        ;   args 7-10  destination offsets of the four rows
        ; Reads the idct16 / idct32 partial results through r4 and r5, and
        ; expects m14 to hold the pmulhrsw multiplier for the final rounding
        ; (the visible caller loads pw_2048). Results are clamped to
        ; [0, pixel_10bpc_max]; the upper bound is hard-coded to 10 bpc.
   7455 %if %1 & 1
   7456    mova                m%5, [r5-32*(51-%1)] ; idct16 out 0+n
   7457    mova                m%4, [r4-32*(14+%1)] ; idct32 out31-n
   7458 %else
   7459    mova                m%5, [r4-32*(45-%1)]
   7460    mova                m%4, [r5-32*(20+%1)]
   7461 %endif
   7462    paddsw              m%6, m%5, m%4 ; idct32 out 0+n
   7463    psubsw              m%5, m%4      ; idct32 out31-n
   7464    paddsw              m%4, m%5, m%3 ; out31-n
   7465    psubsw              m%5, m%3      ; out32+n
   7466    paddsw              m%3, m%6, m%2 ; out 0+n
   7467    psubsw              m%6, m%2      ; out63-n
   7468    REPX  {pmulhrsw x, m14}, m%5, m%6, m%4, m%3
           ; Odd n swaps the two base pointers: rows addressed through d0 and
           ; d1 come from r2 and dstq instead of dstq and r2.
   7469 %if %1 & 1
   7470    %define %%d0 r2
   7471    %define %%d1 dstq
   7472 %else
   7473    %define %%d0 dstq
   7474    %define %%d1 r2
   7475 %endif
           ; add the existing destination pixels
   7476    paddw               m%3, [%%d0+%7 ]
   7477    paddw               m%4, [%%d1+%8 ]
   7478    paddw               m%5, [%%d0+%9 ]
   7479    paddw               m%6, [%%d1+%10]
           ; clamp: arg 2's register is reused first as zero, then as the
           ; pixel maximum
   7480    pxor                m%2, m%2
   7481    REPX    {pmaxsw x, m%2}, m%3, m%4, m%5, m%6
   7482    vpbroadcastd        m%2, [pixel_10bpc_max]
   7483    REPX    {pminsw x, m%2}, m%3, m%4, m%5, m%6
   7484    mova         [%%d0+%7 ], m%3
   7485    mova         [%%d1+%8 ], m%4
   7486    mova         [%%d0+%9 ], m%5
   7487    mova         [%%d1+%10], m%6
   7488 %endmacro
   7489 
   7490 cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
        ; 16x64 inverse DCT/DCT and add, 10 bpc. Arguments: dst, stride,
        ; c (coefficients), eob.
        ; eob == 0 takes the DC-only shortcut. Otherwise .main runs on one to
        ; four 32-byte coefficient slices, any slices not processed are
        ; zero-filled by .fast, and .pass2 runs the second pass with the 8bpc
        ; (16-bit) helper routines before adding the result to dst.
   7491    test               eobd, eobd
   7492    jz .dconly
           ; full path: 10 GPRs, 16 vector registers, 32*98 bytes of stack
   7493    PROLOGUE              0, 10, 16, 32*98, dst, stride, c, eob
   7494 %undef cmp
   7495    vpbroadcastd        m11, [pd_2048]
   7496    vpbroadcastd        m12, [clip_18b_min]
   7497    vpbroadcastd        m13, [clip_18b_max]
   7498    vpbroadcastd        m14, [pd_2896]
           ; r6 = centre of the first 8-row output block (rows are stored at
           ; [r6-32*4 .. r6+32*3]); .main advances it by 32*8 per call.
           ; The thresholds accumulate: stop after 1/2/3 slices when
           ; eob < 44 / 151 / 279.
   7499    lea                  r6, [rsp+32*6]
   7500    call .main
   7501    sub                eobd, 44
   7502    jl .fast
   7503    call .main
   7504    sub                eobd, 107
   7505    jl .fast
   7506    call .main
   7507    sub                eobd, 128
   7508    jl .fast
   7509    call .main
   7510    jmp .pass2
   7511 .dconly:
           ; DC-only: scale the single coefficient by 181 with rounding
           ; ((x*181 + 640) >> 10), clear it in the buffer (eobd is 0 here),
           ; and hand over to the shared 16-wide DC routine.
           ; NOTE(review): r3d is the eob register (0 here); 64 is presumably
           ; the row count expected by .dconly3 - confirm.
   7512    imul                r6d, [cq], 181
   7513    vpbroadcastd         m3, [dconly_10bpc]
   7514    mov                [cq], eobd ; 0
   7515    or                  r3d, 64
   7516    add                 r6d, 640
   7517    sar                 r6d, 10
   7518    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
   7519 .fast:
           ; zero the 8-row blocks that .main did not produce, up to the end
           ; of the 32-row buffer (r6 reaches rsp+32*38)
   7520    lea                  r4, [rsp+32*38]
   7521    pxor                 m0, m0
   7522 .fast_loop:
   7523    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
   7524    add                  r6, 32*8
   7525    cmp                  r6, r4
   7526    jl .fast_loop
   7527 .pass2:
           ; Second pass. Input row n (n = 0..31) is at rsp+32*(2+n). The
           ; upper inputs of the 16-point stage are passed as zero (m8-m14
           ; and the stack slot at [rsp]).
   7528    lea                  r6, [pw_5+128]
   7529    mova                 m0, [rsp+32* 2] ; in0
   7530    mova                 m1, [rsp+32* 6] ; in4
   7531    mova                 m2, [rsp+32*10] ; in8
   7532    mova                 m3, [rsp+32*14] ; in12
   7533    mova                 m4, [rsp+32*18] ; in16
   7534    mova                 m5, [rsp+32*22] ; in20
   7535    mova                 m6, [rsp+32*26] ; in24
   7536    mova                 m7, [rsp+32*30] ; in28
   7537    pxor                 m8, m8
   7538    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
   7539    mova              [rsp], m8
   7540    call m(idct_16x16_internal_8bpc).main
           ; m1 is reloaded from the stack after the call; the 16 results are
           ; then stored as two 8-row blocks starting at rsp+32*34
   7541    mova                 m1, [rsp+32*1]
   7542    lea                  r4, [rsp+32*38]
   7543    mova          [r4-32*4], m0
   7544    mova          [r4-32*3], m1
   7545    mova          [r4-32*2], m2
   7546    mova          [r4-32*1], m3
   7547    mova          [r4+32*0], m4
   7548    mova          [r4+32*1], m5
   7549    mova          [r4+32*2], m6
   7550    mova          [r4+32*3], m7
   7551    add                  r4, 32*8
   7552    mova          [r4-32*4], m8
   7553    mova          [r4-32*3], m9
   7554    mova          [r4-32*2], m10
   7555    mova          [r4-32*1], m11
   7556    mova          [r4+32*0], m12
   7557    mova          [r4+32*1], m13
   7558    mova          [r4+32*2], m14
   7559    mova          [r4+32*3], m15
   7560    mova                 m0, [rsp+32* 4] ; in2
   7561    mova                 m1, [rsp+32* 8] ; in6
   7562    mova                 m2, [rsp+32*12] ; in10
   7563    mova                 m3, [rsp+32*16] ; in14
   7564    mova                 m4, [rsp+32*20] ; in18
   7565    mova                 m5, [rsp+32*24] ; in22
   7566    mova                 m6, [rsp+32*28] ; in26
   7567    mova                 m7, [rsp+32*32] ; in30
   7568    lea                  r5, [r4+32*16]
   7569    add                  r4, 32*8
   7570    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
           ; odd inputs go through the idct64 first stage in two groups of
           ; eight; r6 walks the idct64_mul table
   7571    mova                 m0, [rsp+32* 3] ; in1
   7572    mova                 m1, [rsp+32*33] ; in31
   7573    mova                 m2, [rsp+32*19] ; in17
   7574    mova                 m3, [rsp+32*17] ; in15
   7575    mova                 m4, [rsp+32*11] ; in9
   7576    mova                 m5, [rsp+32*25] ; in23
   7577    mova                 m6, [rsp+32*27] ; in25
   7578    mova                 m7, [rsp+32* 9] ; in7
   7579    lea                  r6, [idct64_mul - 8]
   7580    add                  r4, 32*16
   7581    add                  r5, 32*32
   7582    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   7583    mova                 m0, [rsp+32* 7] ; in5
   7584    mova                 m1, [rsp+32*29] ; in27
   7585    mova                 m2, [rsp+32*23] ; in21
   7586    mova                 m3, [rsp+32*13] ; in11
   7587    mova                 m4, [rsp+32*15] ; in13
   7588    mova                 m5, [rsp+32*21] ; in19
   7589    mova                 m6, [rsp+32*31] ; in29
   7590    mova                 m7, [rsp+32* 5] ; in3
   7591    add                  r6, 8
   7592    add                  r4, 32*8
   7593    sub                  r5, 32*8
   7594    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
           ; stride multiples used as row offsets by .main_part2_pass2:
           ; r8 = stride*4, r9 = stride*5, r3 = stride*6, r7 = stride*7
   7595    lea                  r8, [strideq*4]
   7596    lea                  r9, [strideq*5]
   7597    lea                  r3, [r9+strideq*1] ; stride*6
   7598    lea                  r7, [r9+strideq*2] ; stride*7
   7599    call .main_part2_pass2
   7600    RET
   7601 ALIGN function_align
   7602 .main:
        ; First-pass worker: 16-point transform of one 32-byte-wide
        ; coefficient slice (16 rows, 128 bytes apart, starting at cq).
        ; Clears the consumed coefficients, advances cq by 32, and stores 8
        ; transposed rows of packed 16-bit results at [r6-32*4 .. r6+32*3]
        ; before advancing r6 by 32*8.
        ; Expects m11 = pd_2048 (from the caller).
           ; odd rows 1, 3, ..., 15
   7603    mova                 m0, [cq+128* 1]
   7604    mova                 m1, [cq+128* 3]
   7605    mova                 m2, [cq+128* 5]
   7606    mova                 m3, [cq+128* 7]
   7607    mova                 m4, [cq+128* 9]
   7608    mova                 m5, [cq+128*11]
   7609    mova                 m6, [cq+128*13]
   7610    mova                 m7, [cq+128*15]
   7611    call m(idct_8x16_internal_10bpc).main_oddhalf
           ; even rows 0, 2, ..., 14
   7612    mova                 m0, [cq+128* 0]
   7613    mova                 m1, [cq+128* 2]
   7614    mova                 m2, [cq+128* 4]
   7615    mova                 m3, [cq+128* 6]
   7616    mova                 m4, [cq+128* 8]
   7617    mova                 m5, [cq+128*10]
   7618    mova                 m6, [cq+128*12]
   7619    mova                 m7, [cq+128*14]
   7620    call m(idct_8x8_internal_10bpc).main
   7621    call m(idct_8x16_internal_10bpc).main_evenhalf
           ; Clear rows 0-15 of this slice: r7 takes 128*13, 128*9, 128*5,
           ; 128*1 and each iteration zeroes four rows.
   7622    pxor                m15, m15
   7623    mov                 r7d, 128*13
   7624 .main_zero_loop:
   7625    mova      [cq+r7-128*1], m15
   7626    mova      [cq+r7+128*0], m15
   7627    mova      [cq+r7+128*1], m15
   7628    mova      [cq+r7+128*2], m15
   7629    sub                 r7d, 128*4
   7630    jg .main_zero_loop
   7631    add                  cq, 32
           ; Final butterflies with rounding: m15 = 2 is added to m0-m7, each
           ; is combined (sum and difference) with one of the eight values
           ; read back from [r6-32*4 .. r6+32*3], shifted right by 2 and
           ; packed to 16 bit.
           ; NOTE(review): the values at r6 are presumably the odd-half
           ; results left there by main_oddhalf - confirm.
   7632    psrld               m15, m11, 10 ; pd_2
   7633    mova                 m8, [r6-32*4]
   7634    mova                 m9, [r6+32*3]
   7635    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
   7636    psubd               m10, m0, m8 ; out15
   7637    paddd                m0, m8     ; out0
   7638    mova                 m8, [r6-32*3]
   7639    psubd               m15, m7, m9 ; out8
   7640    paddd                m7, m9     ; out7
   7641    mova                 m9, [r6+32*2]
   7642    REPX       {psrad x, 2}, m0, m15, m10, m7
   7643    packssdw             m0, m15
   7644    packssdw             m7, m10
   7645    psubd               m10, m1, m8 ; out14
   7646    paddd                m1, m8     ; out1
   7647    mova                 m8, [r6-32*2]
   7648    psubd               m15, m6, m9 ; out9
   7649    paddd                m6, m9     ; out6
   7650    mova                 m9, [r6+32*1]
   7651    REPX       {psrad x, 2}, m1, m15, m10, m6
   7652    packssdw             m1, m15
   7653    packssdw             m6, m10
   7654    psubd               m10, m2, m8 ; out13
   7655    paddd                m2, m8     ; out2
   7656    mova                 m8, [r6-32*1]
   7657    psubd               m15, m5, m9 ; out10
   7658    paddd                m5, m9     ; out5
   7659    mova                 m9, [r6+32*0]
   7660    REPX       {psrad x, 2}, m2, m15, m10, m5
   7661    packssdw             m2, m15
   7662    packssdw             m5, m10
   7663    psubd               m10, m3, m8 ; out12
   7664    paddd                m3, m8     ; out3
   7665    psubd               m15, m4, m9 ; out11
   7666    paddd                m4, m9     ; out4
   7667    REPX       {psrad x, 2}, m3, m15, m10, m4
   7668    packssdw             m3, m15
   7669    packssdw             m4, m10
   7670    call m(idct_16x8_internal_10bpc).transpose3
           ; store the transposed block and advance r6 to the next one
   7671    mova          [r6-32*4], m0
   7672    mova          [r6-32*3], m1
   7673    mova          [r6-32*2], m2
   7674    mova          [r6-32*1], m3
   7675    mova          [r6+32*0], m4
   7676    mova          [r6+32*1], m5
   7677    mova          [r6+32*2], m6
   7678    mova          [r6+32*3], m7
   7679    add                  r6, 32*8
   7680    ret
   7681 .main_part2_pass2:
        ; Second-pass tail: runs the remaining idct64 stages through the 8bpc
        ; helper and writes all 64 output rows with IDCT64_PART2_END.
        ; Expects r3/r7/r8/r9 = stride*6/7/4/5 (set by the callers) and
        ; r4/r5 pointing into the intermediate buffer.
        ; dstq walks down from row 0 while r2 walks up from row 7, one row
        ; per iteration; each iteration emits 16 rows.
        ; NOTE(review): r4 and r5 are not modified here; presumably
        ; main_part2_internal advances them until they meet - confirm.
   7682    vpbroadcastd        m11, [pw_1567_3784]
   7683    vpbroadcastd        m12, [pw_m3784_1567]
   7684    vpbroadcastd        m13, [pw_2896_2896]
   7685    lea                  r6, [pw_5+128]
   7686    lea                  r2, [dstq+r7]
   7687 .main_part2_pass2_loop:
           ; m14 is loaded twice per iteration: first a constant for the
           ; helper, then the pmulhrsw multiplier used by IDCT64_PART2_END
   7688    vpbroadcastd        m14, [pw_m2896_2896]
   7689    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal
   7690    vpbroadcastd        m14, [pw_2048]
   7691    IDCT64_PART2_END      0,  7,  0,  6,  9, 10, strideq*0, r3*4, r8*8, r7*8
   7692    IDCT64_PART2_END      7,  8,  5,  0,  6,  7, strideq*0, r3*4, r8*8, r7*8
   7693    IDCT64_PART2_END      8,  2,  1,  0,  6,  7, strideq*8, r8*4, r9*8, r3*8
   7694    IDCT64_PART2_END     15,  3,  4,  0,  6,  7, strideq*8, r8*4, r9*8, r3*8
   7695    add                dstq, strideq
   7696    sub                  r2, strideq
   7697    cmp                  r4, r5
   7698    jne .main_part2_pass2_loop
   7699    ret
   7700 ALIGN function_align
   7701 .main_part1_rect2:
        ; Alternate entry: rounds m0-m3 first (add m11, arithmetic shift
        ; right by 12), then falls through into .main_part1.
        ; NOTE(review): presumably the caller has pre-multiplied the inputs
        ; for rectangular scaling - confirm.
   7702    REPX     {paddd x, m11}, m0, m1, m2, m3
   7703    REPX     {psrad x, 12 }, m0, m1, m2, m3
   7704 .main_part1: ; idct64 steps 1-5
        ; 32-bit idct64 first stage on four inputs (m0-m3).
        ;   r5  pointer to 12 dword multipliers; advanced by 4*12 on return
        ;   r6  output pointer; 8 rows are written to [r6-32*4 .. r6+32*3]
        ;       and r6 is advanced by 32*8
        ;   m11 rounding constant added before each shift by 12
        ;   m12 / m13  lower / upper clamp applied to intermediates
        ; Clobbers m0-m10 and m15.
   7705    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
   7706    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
   7707    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
   7708    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
   7709    vpbroadcastd         m7, [r5+4*0]
   7710    vpbroadcastd         m8, [r5+4*1]
   7711    vpbroadcastd         m6, [r5+4*2]
   7712    vpbroadcastd         m9, [r5+4*3]
   7713    vpbroadcastd         m5, [r5+4*4]
   7714    vpbroadcastd        m10, [r5+4*5]
   7715    vpbroadcastd         m4, [r5+4*6]
   7716    vpbroadcastd        m15, [r5+4*7]
           ; each input is multiplied by two table entries, giving a pair of
           ; terms per input
   7717    pmulld               m7, m0     ; t63a
   7718    pmulld               m0, m8     ; t32a
   7719    pmulld               m6, m1     ; t62a
   7720    pmulld               m1, m9     ; t33a
   7721    pmulld               m5, m2     ; t61a
   7722    pmulld               m2, m10    ; t34a
   7723    pmulld               m4, m3     ; t60a
   7724    pmulld               m3, m15    ; t35a
   7725    vpbroadcastd        m10, [r5+4*8]
   7726    vpbroadcastd        m15, [r5+4*9]
   7727    REPX     {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
   7728    REPX     {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
   7729    psubd                m8, m0, m1 ; t33
   7730    paddd                m0, m1     ; t32
   7731    psubd                m1, m7, m6 ; t62
   7732    paddd                m7, m6     ; t63
   7733    psubd                m6, m3, m2 ; t34
   7734    paddd                m3, m2     ; t35
   7735    psubd                m2, m4, m5 ; t61
   7736    paddd                m4, m5     ; t60
   7737    REPX    {pmaxsd x, m12}, m8, m1, m6, m2
   7738    REPX    {pminsd x, m13}, m8, m1, m6, m2
   7739    ITX_MULSUB_2D         1, 8, 5, 9, _, 11, 10, 15    ; t33a, t62a
   7740    ITX_MULSUB_2D         2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a
   7741    REPX    {pmaxsd x, m12}, m0, m3, m7, m4
   7742    REPX    {pminsd x, m13}, m0, m3, m7, m4
   7743    vpbroadcastd        m10, [r5+4*10]
   7744    vpbroadcastd        m15, [r5+4*11]
   7745    psubd                m5, m0, m3 ; t35a
   7746    paddd                m0, m3     ; t32a
   7747    psubd                m3, m7, m4 ; t60a
   7748    paddd                m7, m4     ; t63a
   7749    psubd                m4, m1, m6 ; t34
   7750    paddd                m1, m6     ; t33
   7751    psubd                m6, m8, m2 ; t61
   7752    paddd                m8, m2     ; t62
   7753    REPX    {pmaxsd x, m12}, m5, m3, m4, m6
   7754    REPX    {pminsd x, m13}, m5, m3, m4, m6
   7755    ITX_MULSUB_2D         3, 5, 2, 9, _, 11, 10, 15 ; t35,  t60
   7756    ITX_MULSUB_2D         6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
   7757    REPX    {pmaxsd x, m12}, m0, m7, m1, m8
   7758    REPX    {pminsd x, m13}, m0, m7, m1, m8
   7759    add                  r5, 4*12
           ; results are stored mirrored around r6: each row at r6-32*k is
           ; paired with the row at r6+32*(k-1)
   7760    mova          [r6-32*4], m0
   7761    mova          [r6+32*3], m7
   7762    mova          [r6-32*3], m1
   7763    mova          [r6+32*2], m8
   7764    mova          [r6-32*2], m6
   7765    mova          [r6+32*1], m4
   7766    mova          [r6-32*1], m3
   7767    mova          [r6+32*0], m5
   7768    add                  r6, 32*8
   7769    ret
   7770 .main_part2: ; idct64 steps 6-9
        ; 32-bit idct64 second stage, working in place on the four 8-row
        ; blocks written before r6 (addressed at -32*32, -32*24, -32*16 and
        ; -32*8 from the two cursors).
        ; On entry r6 is the pointer left by the last .main_part1 call. r6
        ; then walks forward and r5 backward by one row per iteration until
        ; they cross, which takes four iterations.
        ; Expects m11 = rounding constant, m12 / m13 = clamp bounds, m14 =
        ; multiplier for the final pmulld stage. Loads m10 = pd_1567 and
        ; m15 = pd_3784 for ITX_MULSUB_2D.
   7771    lea                  r5, [r6+32*3]
   7772    sub                  r6, 32*4
   7773    vpbroadcastd        m10, [pd_1567]
   7774    vpbroadcastd        m15, [pd_3784]
   7775 .main_part2_loop:
   7776    mova                 m0, [r6-32*32] ; t32a
   7777    mova                 m1, [r5-32*24] ; t39a
   7778    mova                 m2, [r5-32*32] ; t63a
   7779    mova                 m3, [r6-32*24] ; t56a
   7780    mova                 m4, [r6-32*16] ; t40a
   7781    mova                 m5, [r5-32* 8] ; t47a
   7782    mova                 m6, [r5-32*16] ; t55a
   7783    mova                 m7, [r6-32* 8] ; t48a
   7784    psubd                m8, m0, m1 ; t39
   7785    paddd                m0, m1     ; t32
   7786    psubd                m1, m2, m3 ; t56
   7787    paddd                m2, m3     ; t63
   7788    psubd                m3, m5, m4 ; t40
   7789    paddd                m5, m4     ; t47
   7790    psubd                m4, m7, m6 ; t55
   7791    paddd                m7, m6     ; t48
   7792    REPX    {pmaxsd x, m12}, m8, m1, m3, m4
   7793    REPX    {pminsd x, m13}, m8, m1, m3, m4
   7794    ITX_MULSUB_2D         1, 8, 6, 9, _, 11, 10, 15    ; t39a, t56a
   7795    ITX_MULSUB_2D         4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a
   7796    REPX    {pmaxsd x, m12}, m0, m2, m5, m7
   7797    REPX    {pminsd x, m13}, m0, m5, m2, m7
   7798    psubd                m6, m2, m7 ; t48a
   7799    paddd                m2, m7     ; t63a
   7800    psubd                m7, m0, m5 ; t47a
   7801    paddd                m0, m5     ; t32a
   7802    psubd                m5, m8, m4 ; t55
   7803    paddd                m8, m4     ; t56
   7804    psubd                m4, m1, m3 ; t40
   7805    paddd                m1, m3     ; t39
   7806    REPX    {pmaxsd x, m12}, m6, m7, m5, m4
   7807    REPX    {pminsd x, m13}, m6, m7, m5, m4
           ; Scale the four difference terms by m14. The rounding constant is
           ; added to only one operand of each pair, so both the sum and the
           ; difference below get it exactly once before the shift by 12.
   7808    REPX    {pmulld x, m14}, m6, m7, m5, m4
   7809    REPX    {pmaxsd x, m12}, m2, m0, m8, m1
   7810    REPX    {pminsd x, m13}, m2, m0, m8, m1
   7811    paddd                m6, m11
   7812    paddd                m5, m11
   7813    psubd                m3, m6, m7 ; t47
   7814    paddd                m6, m7     ; t48
   7815    psubd                m7, m5, m4 ; t40a
   7816    paddd                m5, m4     ; t55a
   7817    REPX      {psrad x, 12}, m3, m6, m7, m5
           ; write back in place; the destination slots differ from the ones
           ; the inputs were loaded from
   7818    mova         [r5-32* 8], m2
   7819    mova         [r6-32*32], m0
   7820    mova         [r6-32* 8], m8
   7821    mova         [r5-32*32], m1
   7822    mova         [r5-32*24], m3
   7823    mova         [r6-32*16], m6
   7824    mova         [r6-32*24], m7
   7825    mova         [r5-32*16], m5
   7826    add                  r6, 32
   7827    sub                  r5, 32
   7828    cmp                  r6, r5
   7829    jl .main_part2_loop
   7830    ret
   7831 
   7832 cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
        ; 32x64 inverse DCT/DCT and add, 10 bpc. Arguments: dst, stride,
        ; c (coefficients), eob.
        ; eob == 0 takes the DC-only shortcut. Otherwise .main runs on one to
        ; four coefficient slices, any blocks not produced are zero-filled by
        ; .fast, and .pass2 runs the second pass twice, once per 32-byte-wide
        ; half of the destination, using the 8bpc (16-bit) helper routines.
   7833    test               eobd, eobd
   7834    jz .dconly
           ; full path: 11 GPRs, 16 vector registers, 32*134 bytes of stack
   7835    PROLOGUE              0, 11, 16, 32*134, dst, stride, c, eob
   7836 %undef cmp
   7837    vpbroadcastd        m12, [clip_18b_min]
   7838    vpbroadcastd        m13, [clip_18b_max]
           ; stop after 1/2/3 slices when eob < 36 / 136 / 300
   7839    lea                  r6, [rsp+32*6]
   7840    call .main
   7841    cmp                eobd, 36
   7842    jl .fast
   7843    call .main
   7844    cmp                eobd, 136
   7845    jl .fast
   7846    call .main
   7847    cmp                eobd, 300
   7848    jl .fast
   7849    call .main
   7850    jmp .pass2
   7851 .dconly:
           ; DC-only: the coefficient is scaled by 181 twice with rounding,
           ; ((x*181 + 128) >> 8) then ((y*181 + 384) >> 9), cleared in the
           ; buffer (eobd is 0 here), and passed to the shared 32-wide DC
           ; routine.
           ; NOTE(review): r3d is the eob register (0 here); 64 is presumably
           ; the row count expected by .dconly2 - confirm.
   7852    imul                r6d, [cq], 181
   7853    vpbroadcastd         m3, [dconly_10bpc]
   7854    mov                [cq], eobd ; 0
   7855    or                  r3d, 64
   7856    add                 r6d, 128
   7857    sar                 r6d, 8
   7858    imul                r6d, 181
   7859    add                 r6d, 384
   7860    sar                 r6d, 9
   7861    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
   7862 .fast:
           ; zero the 8-row blocks that were not produced, until r6 reaches
           ; rsp+32*70
   7863    lea                  r4, [rsp+32*70]
   7864    pxor                 m0, m0
   7865 .fast_loop:
   7866    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
   7867    add                  r6, 32*8
   7868    cmp                  r6, r4
   7869    jl .fast_loop
   7870 .pass2:
           ; r10 = base of the current half's input rows, where input row n
           ; is at r10+32*(2 + (n/8)*16 + n%8);
           ; r8/r9/r3/r7 = stride*4/5/6/7 for .main_part2_pass2
   7871    lea                  r6, [pw_5 + 128]
   7872    mov                 r10, rsp
   7873    lea                  r8, [strideq*4]
   7874    lea                  r9, [strideq*5]
   7875    lea                  r3, [r9+strideq*1] ; stride*6
   7876    lea                  r7, [r9+strideq*2] ; stride*7
   7877 .pass2_loop:
   7878    mova                 m0, [r10+32* 2] ; in0
   7879    mova                 m1, [r10+32* 6] ; in4
   7880    mova                 m2, [r10+32*18] ; in8
   7881    mova                 m3, [r10+32*22] ; in12
   7882    mova                 m4, [r10+32*34] ; in16
   7883    mova                 m5, [r10+32*38] ; in20
   7884    mova                 m6, [r10+32*50] ; in24
   7885    mova                 m7, [r10+32*54] ; in28
           ; the upper inputs of the 16-point stage are passed as zero
           ; (m8-m14 and the stack slot at [rsp])
   7886    pxor                 m8, m8
   7887    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
   7888    mova              [rsp], m8
   7889    call m(idct_16x16_internal_8bpc).main
           ; m1 is reloaded from the stack after the call; the 16 results are
           ; stored as two 8-row blocks starting at rsp+32*66
   7890    mova                 m1, [rsp+32*1]
   7891    lea                  r4, [rsp+32*70]
   7892    mova          [r4-32*4], m0
   7893    mova          [r4-32*3], m1
   7894    mova          [r4-32*2], m2
   7895    mova          [r4-32*1], m3
   7896    mova          [r4+32*0], m4
   7897    mova          [r4+32*1], m5
   7898    mova          [r4+32*2], m6
   7899    mova          [r4+32*3], m7
   7900    add                  r4, 32*8
   7901    mova          [r4-32*4], m8
   7902    mova          [r4-32*3], m9
   7903    mova          [r4-32*2], m10
   7904    mova          [r4-32*1], m11
   7905    mova          [r4+32*0], m12
   7906    mova          [r4+32*1], m13
   7907    mova          [r4+32*2], m14
   7908    mova          [r4+32*3], m15
   7909    mova                 m0, [r10+32* 4] ; in2
   7910    mova                 m1, [r10+32* 8] ; in6
   7911    mova                 m2, [r10+32*20] ; in10
   7912    mova                 m3, [r10+32*24] ; in14
   7913    mova                 m4, [r10+32*36] ; in18
   7914    mova                 m5, [r10+32*40] ; in22
   7915    mova                 m6, [r10+32*52] ; in26
   7916    mova                 m7, [r10+32*56] ; in30
   7917    lea                  r5, [r4+32*16]
   7918    add                  r4, 32*8
   7919    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
           ; odd inputs go through the idct64 first stage in two groups of
           ; eight; r6 walks the idct64_mul table
   7920    mova                 m0, [r10+32* 3] ; in1
   7921    mova                 m1, [r10+32*57] ; in31
   7922    mova                 m2, [r10+32*35] ; in17
   7923    mova                 m3, [r10+32*25] ; in15
   7924    mova                 m4, [r10+32*19] ; in9
   7925    mova                 m5, [r10+32*41] ; in23
   7926    mova                 m6, [r10+32*51] ; in25
   7927    mova                 m7, [r10+32* 9] ; in7
   7928    lea                  r6, [idct64_mul - 8]
   7929    add                  r4, 32*16
   7930    add                  r5, 32*32
   7931    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   7932    mova                 m0, [r10+32* 7] ; in5
   7933    mova                 m1, [r10+32*53] ; in27
   7934    mova                 m2, [r10+32*39] ; in21
   7935    mova                 m3, [r10+32*21] ; in11
   7936    mova                 m4, [r10+32*23] ; in13
   7937    mova                 m5, [r10+32*37] ; in19
   7938    mova                 m6, [r10+32*55] ; in29
   7939    mova                 m7, [r10+32* 5] ; in3
   7940    add                  r6, 8
   7941    add                  r4, 32*8
   7942    sub                  r5, 32*8
   7943    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   7944    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
           ; Next half: input base moves on by 8 rows, and r4 becomes the
           ; loop bound (rsp+32*16), so the loop body runs for r10 = rsp and
           ; rsp+32*8. dstq steps back by stride*4 and right by 32 bytes.
           ; NOTE(review): the stride*4 step-back presumably undoes four
           ; one-row advances made inside .main_part2_pass2 - confirm.
   7945    add                 r10, 32*8
   7946    sub                  r4, 32*98 ; rsp+32*16
   7947    sub                dstq, r8
   7948    add                dstq, 32
   7949    cmp                 r10, r4
   7950    jl .pass2_loop
   7951    RET
   7952 ALIGN function_align
        ; .main -- first-pass worker of the function whose cglobal header lies
        ; above this chunk. One call consumes a 32-byte-wide slice of 32
        ; coefficient rows (row pitch 128 bytes). Every input is pre-multiplied
        ; by pd_2896 (m14) before it is handed to a *_rect2 helper.
        ; Visible effects:
        ;   - the 32-byte slice at cq+128*k (k = 0..31) is zeroed,
        ;   - cq advances by 32 bytes,
        ;   - transposed results are stored at [r4-32*4 .. r4+32*3] and
        ;     [r5-32*4 .. r5+32*3].
        ; r4/r5 are not initialised here; they must be set up by the caller.
        ; NOTE(review): the helpers called below are defined elsewhere; confirm
        ; their register contracts (including whether they move r4/r5) there.
   7953 .main:
   7954    vpbroadcastd        m14, [pd_2896]
   7955    vpbroadcastd        m11, [pd_2048]
        ; rows 1, 7, 9, 15, 17, 23, 25, 31
   7956    pmulld               m0, m14, [cq+128* 1]
   7957    pmulld               m1, m14, [cq+128* 7]
   7958    pmulld               m2, m14, [cq+128* 9]
   7959    pmulld               m3, m14, [cq+128*15]
   7960    pmulld               m4, m14, [cq+128*17]
   7961    pmulld               m5, m14, [cq+128*23]
   7962    pmulld               m6, m14, [cq+128*25]
   7963    pmulld               m7, m14, [cq+128*31]
   7964    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
        ; rows 3, 5, 11, 13, 19, 21, 27, 29 (the remaining odd rows)
   7965    pmulld               m0, m14, [cq+128* 3]
   7966    pmulld               m1, m14, [cq+128* 5]
   7967    pmulld               m2, m14, [cq+128*11]
   7968    pmulld               m3, m14, [cq+128*13]
   7969    pmulld               m4, m14, [cq+128*19]
   7970    pmulld               m5, m14, [cq+128*21]
   7971    pmulld               m6, m14, [cq+128*27]
   7972    pmulld               m7, m14, [cq+128*29]
   7973    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
        ; rows 2, 6, 10, ..., 30
   7974    pmulld               m0, m14, [cq+128* 2]
   7975    pmulld               m1, m14, [cq+128* 6]
   7976    pmulld               m2, m14, [cq+128*10]
   7977    pmulld               m3, m14, [cq+128*14]
   7978    pmulld               m4, m14, [cq+128*18]
   7979    pmulld               m5, m14, [cq+128*22]
   7980    pmulld               m6, m14, [cq+128*26]
   7981    pmulld               m7, m14, [cq+128*30]
   7982    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
        ; rows 0, 4, 8, ..., 28 stay in m0-m7 for the main_rect2 call below
   7983    pmulld               m0, m14, [cq+128* 0]
   7984    pmulld               m1, m14, [cq+128* 4]
   7985    pmulld               m2, m14, [cq+128* 8]
   7986    pmulld               m3, m14, [cq+128*12]
   7987    pmulld               m4, m14, [cq+128*16]
   7988    pmulld               m5, m14, [cq+128*20]
   7989    pmulld               m6, m14, [cq+128*24]
   7990    pmulld               m7, m14, [cq+128*28]
        ; All 32 rows of this slice have been read: clear them. r7 takes the
        ; values 128*29, 128*25, ..., 128*1 and each pass zeroes rows
        ; r7/128-1 .. r7/128+2, i.e. rows 28-31 down to rows 0-3.
   7991    pxor                m15, m15
   7992    mov                 r7d, 128*29
   7993 .main_zero_loop:
   7994    mova      [cq+r7-128*1], m15
   7995    mova      [cq+r7+128*0], m15
   7996    mova      [cq+r7+128*1], m15
   7997    mova      [cq+r7+128*2], m15
   7998    sub                 r7d, 128*4
   7999    jg .main_zero_loop
   8000    add                  cq, 32  ; step to the next 32-byte slice for the following call
   8001    call m(idct_8x8_internal_10bpc).main_rect2
   8002    call m(idct_8x16_internal_10bpc).main_evenhalf
   8003    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end
   8004    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
        ; first transposed group -> eight consecutive 32-byte slots around r4
   8005    mova          [r4-32*4], m0
   8006    mova          [r4-32*3], m1
   8007    mova          [r4-32*2], m2
   8008    mova          [r4-32*1], m3
   8009    mova          [r4+32*0], m4
   8010    mova          [r4+32*1], m5
   8011    mova          [r4+32*2], m6
   8012    mova          [r4+32*3], m7
        ; second group: read from the r5 area in descending address order
        ; (m0 <- [r5+32*3] ... m7 <- [r5-32*4]), transpose, then store back in
        ; ascending order, so the slot order is reversed by the round trip.
   8013    mova                 m0, [r5+32*3]
   8014    mova                 m1, [r5+32*2]
   8015    mova                 m2, [r5+32*1]
   8016    mova                 m3, [r5+32*0]
   8017    mova                 m4, [r5-32*1]
   8018    mova                 m5, [r5-32*2]
   8019    mova                 m6, [r5-32*3]
   8020    mova                 m7, [r5-32*4]
   8021    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
   8022    mova          [r5-32*4], m0
   8023    mova          [r5-32*3], m1
   8024    mova          [r5-32*2], m2
   8025    mova          [r5-32*1], m3
   8026    mova          [r5+32*0], m4
   8027    mova          [r5+32*1], m5
   8028    mova          [r5+32*2], m6
   8029    mova          [r5+32*3], m7
   8030    ret
   8031 
   8032 cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
        ; Entry point: 64x16 DCT_DCT inverse transform + add, 10 bpc.
        ; Named arguments: dst, stride, c (coefficients), eob.
        ; eob == 0 takes the DC-only path below; anything else jumps to .normal,
        ; which re-declares the register/stack frame with PROLOGUE.
        ; .dconly and .dconly2 are also jump targets for the 64x64 and 64x32
        ; functions later in this file. They expect the intermediate DC value
        ; in r6d and the row count in r3d.
   8033    test               eobd, eobd
   8034    jnz .normal
   8035    imul                r6d, [cq], 181
   8036    mov                [cq], eobd ; 0
   8037    or                  r3d, 16   ; r3d (eob) is zero here, so this sets the row counter to 16
   8038 .dconly:
        ; r6d = (r6d + 640) >> 10
   8039    add                 r6d, 640
   8040    sar                 r6d, 10
   8041 .dconly2:
        ; r6d = (r6d * 181 + 2176) >> 12, then broadcast as a 16-bit value
   8042    vpbroadcastd         m5, [dconly_10bpc]
   8043    imul                r6d, 181
   8044    add                 r6d, 2176
   8045    sar                 r6d, 12
   8046    movd                xm0, r6d
   8047    paddsw              xm0, xm5  ; fold the dconly_10bpc bias into the DC term once, outside the loop
   8048    vpbroadcastw         m0, xm0
   8049 .dconly_loop:
        ; One row per iteration: 4 x 32 bytes = 64 16-bit pixels.
        ; Signed-saturating add of (dc + bias) followed by unsigned-saturating
        ; subtract of the bias clamps the result.
        ; NOTE(review): the resulting range depends on the value of
        ; dconly_10bpc, which is defined outside this chunk -- presumably it
        ; clamps to the valid 10-bit pixel range; confirm.
   8050    paddsw               m1, m0, [dstq+32*0]
   8051    paddsw               m2, m0, [dstq+32*1]
   8052    paddsw               m3, m0, [dstq+32*2]
   8053    paddsw               m4, m0, [dstq+32*3]
   8054    REPX    {psubusw x, m5}, m1, m2, m3, m4
   8055    mova        [dstq+32*0], m1
   8056    mova        [dstq+32*1], m2
   8057    mova        [dstq+32*2], m3
   8058    mova        [dstq+32*3], m4
   8059    add                dstq, strideq
   8060    dec                 r3d
   8061    jg .dconly_loop
   8062    RET
   8063 .normal:
        ; Full (non-DC-only) path of the 64x16 transform.
        ; Pass 1 runs .main + .shift_transpose once, and a second time only
        ; when eob >= 36. Otherwise .fast zero-fills the buffer instead.
        ; Pass 2 (.pass2_loop) then processes the intermediate buffer in
        ; groups of 16 vectors and advances dst by 32 bytes per iteration.
   8064    PROLOGUE              0, 8, 16, 32*96, dst, stride, c, eob
        ; NOTE(review): presumably removes an x86inc wrapper around cmp for
        ; the compares below -- confirm against x86inc.asm.
   8065 %undef cmp
   8066    vpbroadcastd        m11, [pd_2048]
   8067    vpbroadcastd        m12, [clip_18b_min]
   8068    vpbroadcastd        m13, [clip_18b_max]
   8069    vpbroadcastd        m14, [pd_2896]
   8070    lea                  r6, [rsp+32*4]
   8071    call .main
   8072    call .shift_transpose
   8073    cmp                eobd, 36
   8074    jl .fast
   8075    call .main
   8076    call .shift_transpose
   8077    jmp .pass2
   8078 .fast:
        ; Four iterations, each clearing eight 32-byte slots at
        ; [r6-32*4 .. r6+32*3]; r6 ends up advanced by 32*32 bytes.
   8079    pxor                 m0, m0
   8080    mov                 r3d, 4
   8081 .fast_loop:
   8082    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
   8083    add                  r6, 32*8
   8084    dec                 r3d
   8085    jg .fast_loop
   8086 .pass2:
   8087    lea                  r7, [r6-32*64] ; read pointer for the loop below
   8088    lea                  r4, [r6-32*32] ; loop end bound for r7
   8089    lea                  r6, [pw_5+128] ; NOTE(review): presumably the constant-table base expected by the 8bpc helper -- confirm
   8090    mov                  r5, dstq       ; keep the current column base; dstq is reloaded from r5 each iteration
   8091 .pass2_loop:
        ; m0-m7 come from [r7-32*4 .. r7+32*3], m8-m15 from the same offsets
        ; 32*32 bytes further on. Net r7 advance per iteration is 32*8
        ; (+32*32 then -32*24), so the loop runs four times before r7 reaches r4.
   8092    mova                 m0, [r7-32*4]
   8093    mova                 m1, [r7-32*3]
   8094    mova                 m2, [r7-32*2]
   8095    mova                 m3, [r7-32*1]
   8096    mova                 m4, [r7+32*0]
   8097    mova                 m5, [r7+32*1]
   8098    mova                 m6, [r7+32*2]
   8099    mova                 m7, [r7+32*3]
   8100    add                  r7, 32*32
   8101    mova                 m8, [r7-32*4]
   8102    mova                 m9, [r7-32*3]
   8103    mova                m10, [r7-32*2]
   8104    mova                m11, [r7-32*1]
   8105    mova                m12, [r7+32*0]
   8106    mova                m13, [r7+32*1]
   8107    mova                m14, [r7+32*2]
   8108    mova                m15, [r7+32*3]
   8109    sub                  r7, 32*24
   8110    mova              [rsp], m15 ; 16th input is passed to the helper through the stack slot at [rsp]
   8111    call m(idct_16x16_internal_8bpc).main
   8112    mova                 m1, [rsp+32*1] ; reload m1 from the stack slot (NOTE(review): presumably spilled there by the helper -- confirm)
   8113    call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16
   8114    add                  r5, 32  ; next group of 16 16-bit pixels (32 bytes) to the right
   8115    mov                dstq, r5
   8116    cmp                  r7, r4
   8117    jl .pass2_loop
   8118    RET
   8119 ALIGN function_align
        ; .main -- first-pass worker for 64x16. One call consumes a 32-byte-wide
        ; slice of coefficient rows 0..31 (row pitch 64 bytes); no row index
        ; above 31 is read. The slice is zeroed afterwards and cq advances by
        ; 32 bytes (in .main_end2).
        ; Additional entry points, both called from other functions in this file:
        ;   .main_end  - sets m15 = pd_2 and falls through to .main_end2
        ;   .main_end2 - expects m0-m3 and m15 already prepared by the caller
        ; Register use visible here: r5 = idct64_mul_16bpc table pointer,
        ; later reused as the ascending buffer pointer of .main_end_loop;
        ; r6 = buffer pointer (descending in the loop); m11 = pd_2048,
        ; m12/m13 = clip bounds used by the loop.
        ; NOTE(review): the called helpers are defined elsewhere; how they move
        ; r5/r6 between calls cannot be seen from this chunk.
   8120 .main:
   8121    lea                  r5, [idct64_mul_16bpc]
   8122    mova                 m0, [cq+64* 1]
   8123    mova                 m1, [cq+64*31]
   8124    mova                 m2, [cq+64*17]
   8125    mova                 m3, [cq+64*15]
   8126    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
   8127    mova                 m0, [cq+64* 7]
   8128    mova                 m1, [cq+64*25]
   8129    mova                 m2, [cq+64*23]
   8130    mova                 m3, [cq+64* 9]
   8131    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
   8132    mova                 m0, [cq+64* 5]
   8133    mova                 m1, [cq+64*27]
   8134    mova                 m2, [cq+64*21]
   8135    mova                 m3, [cq+64*11]
   8136    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
   8137    mova                 m0, [cq+64* 3]
   8138    mova                 m1, [cq+64*29]
   8139    mova                 m2, [cq+64*19]
   8140    mova                 m3, [cq+64*13]
   8141    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
   8142    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
        ; rows 2, 14, 18, 30
   8143    mova                 m0, [cq+64* 2]
   8144    mova                 m1, [cq+64*14]
   8145    mova                 m2, [cq+64*18]
   8146    mova                 m3, [cq+64*30]
   8147    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
        ; rows 6, 10, 22, 26
   8148    mova                 m0, [cq+64* 6]
   8149    mova                 m1, [cq+64*10]
   8150    mova                 m2, [cq+64*22]
   8151    mova                 m3, [cq+64*26]
   8152    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
        ; rows 4, 12, 20, 28
   8153    mova                 m0, [cq+64* 4]
   8154    mova                 m1, [cq+64*12]
   8155    mova                 m2, [cq+64*20]
   8156    mova                 m3, [cq+64*28]
   8157    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
        ; rows 0, 8, 16, 24 stay in m0-m3 for the idct8 call in .main_end2
   8158    mova                 m0, [cq+64* 0]
   8159    mova                 m1, [cq+64* 8]
   8160    mova                 m2, [cq+64*16]
   8161    mova                 m3, [cq+64*24]
        ; Clear the consumed slice: r7 = 64*30, 64*26, ..., 64*2; each pass
        ; zeroes rows r7/64-2 .. r7/64+1 (rows 28-31 down to rows 0-3).
   8162    pxor                m15, m15
   8163    mov                 r7d, 64*30
   8164 .main_zero_loop:
   8165    mova       [cq+r7-64*2], m15
   8166    mova       [cq+r7-64*1], m15
   8167    mova       [cq+r7+64*0], m15
   8168    mova       [cq+r7+64*1], m15
   8169    sub                 r7d, 64*4
   8170    jg .main_zero_loop
   8171 .main_end:
   8172    psrld               m15, m11, 10 ; pd_2
   8173 .main_end2:
        ; m15 is the bias added in .main_end_loop before the final idct64
        ; butterflies. This file pairs pd_2 with IDCT64_SHIFT_TRANSPOSE 2 and
        ; pd_1 (64x32) with IDCT64_SHIFT_TRANSPOSE 1.
   8174    add                  cq, 32
   8175    pxor                 m4, m4
   8176    REPX       {mova x, m4}, m5, m6, m7
   8177    call m(idct_8x8_internal_10bpc).main
   8178    add                  r6, 32*8
   8179    call m(idct_8x16_internal_10bpc).main_evenhalf
        ; m1..m7 are parked at [r6+32*2] down to [r6-32*4]; m0 stays in its
        ; register and enters the loop through .main_end_loop_start, which
        ; skips the m0 load. Because r6 drops by 32 per iteration, the load at
        ; [r6+32*3] in later iterations picks up these parked values in order.
   8180    mova          [r6+32*2], m1
   8181    mova          [r6+32*1], m2
   8182    mova          [r6+32*0], m3
   8183    mova          [r6-32*1], m4
   8184    mova          [r6-32*2], m5
   8185    mova          [r6-32*3], m6
   8186    mova          [r6-32*4], m7
   8187    jmp .main_end_loop_start
   8188 .main_end_loop:
   8189    mova                 m0, [r6+32* 3] ; idct8  0  + n
   8190 .main_end_loop_start:
   8191    mova                 m1, [r5+32* 4] ; idct16 15 - n
   8192    mova                 m2, [r5-32*12] ; idct32 16 + n
   8193    mova                 m3, [r6-32*13] ; idct32 31 - n
   8194    mova                 m4, [r6-32*29] ; idct64 63 - n
   8195    mova                 m5, [r5-32*28] ; idct64 48 + n
   8196    mova                 m6, [r6-32*45] ; idct64 47 - n
   8197    mova                 m7, [r5-32*44] ; idct64 32 + n
   8198    paddd                m8, m0, m1     ; idct16 out0  + n
   8199    psubd                m0, m1         ; idct16 out15 - n
        ; clamp each butterfly stage to [m12, m13] (clip_18b_min/max in the callers)
   8200    REPX    {pmaxsd x, m12}, m8, m0
   8201    REPX    {pminsd x, m13}, m8, m0
   8202    paddd                m1, m8, m3     ; idct32 out0  + n
   8203    psubd                m8, m3         ; idct32 out31 - n
   8204    paddd                m3, m0, m2     ; idct32 out15 - n
   8205    psubd                m0, m2         ; idct32 out16 + n
   8206    REPX    {pmaxsd x, m12}, m1, m8, m3, m0
   8207    REPX    {pminsd x, m13}, m1, m3, m8, m0
        ; add the bias once here, so all eight sums/differences below carry it
   8208    REPX    {paddd  x, m15}, m1, m3, m0, m8
   8209    paddd                m2, m1, m4     ; idct64 out0  + n (unshifted)
   8210    psubd                m1, m4         ; idct64 out63 - n (unshifted)
   8211    paddd                m4, m3, m5     ; idct64 out15 - n (unshifted)
   8212    psubd                m3, m5         ; idct64 out48 + n (unshifted)
   8213    paddd                m5, m0, m6     ; idct64 out16 + n (unshifted)
   8214    psubd                m0, m6         ; idct64 out47 - n (unshifted)
   8215    paddd                m6, m8, m7     ; idct64 out31 - n (unshifted)
   8216    psubd                m8, m7         ; idct64 out32 + n (unshifted)
        ; results overwrite the eight slots that were loaded above
   8217    mova         [r5-32*44], m2
   8218    mova         [r6+32* 3], m1
   8219    mova         [r6-32*45], m4
   8220    mova         [r5+32* 4], m3
   8221    mova         [r5-32*28], m5
   8222    mova         [r6-32*13], m0
   8223    mova         [r6-32*29], m6
   8224    mova         [r5-32*12], m8
        ; r5 walks up and r6 walks down by one slot; stop when they meet
   8225    add                  r5, 32
   8226    sub                  r6, 32
   8227    cmp                  r5, r6
   8228    jl .main_end_loop
   8229    ret
   8230 .shift_transpose:
        ; .shift_transpose -- expands IDCT64_SHIFT_TRANSPOSE with a shift of 2
        ; and returns. The 64x64 .main tail-jumps here; the 64x32 .main expands
        ; the same macro inline with a shift of 1.
        ;
        ; IDCT64_SHIFT_TRANSPOSE shift:
        ;   Rewinds r6 by 32*48 bytes, then for each group of 16 vectors of
        ;   32-bit values: arithmetic-shift right by <shift>, pack pairs down
        ;   to 16-bit with signed saturation (packssdw), transpose via
        ;   idct_16x8_internal_10bpc.transpose3 and store 8 vectors at
        ;   [r5-32*4 .. r5+32*3].
        ;   The read pointer r6 advances 32*16 and the write pointer r5
        ;   advances 32*8 per group, so the packed output is written over the
        ;   already-consumed start of the same buffer.
        ;   Loops while r5 < r4, then leaves r6 = r4.
        ;   Uses m0-m9 directly.
        ;   NOTE(review): r4 (end bound) is not initialised by the visible
        ;   64x16 caller before the first use -- presumably left in place by a
        ;   helper called from .main; confirm.
   8231 %macro IDCT64_SHIFT_TRANSPOSE 1 ; shift
   8232    sub                  r6, 32*48
   8233    mov                  r5, r6
   8234 %%loop:
        ; pair up slot k with slot k+8 so each packssdw merges two source vectors
   8235    mova                 m0, [r6-32* 4]
   8236    mova                 m4, [r6+32* 4]
   8237    mova                 m1, [r6-32* 3]
   8238    mova                 m5, [r6+32* 5]
   8239    mova                 m2, [r6-32* 2]
   8240    mova                 m6, [r6+32* 6]
   8241    mova                 m3, [r6-32* 1]
   8242    mova                 m7, [r6+32* 7]
   8243    REPX      {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7
   8244    packssdw             m0, m4
   8245    packssdw             m1, m5
   8246    packssdw             m2, m6
   8247    packssdw             m3, m7
   8248    mova                 m4, [r6+32* 0]
   8249    mova                 m6, [r6+32* 8]
   8250    mova                 m5, [r6+32* 1]
   8251    mova                 m7, [r6+32* 9]
   8252    REPX      {psrad x, %1}, m4, m6, m5, m7
   8253    packssdw             m4, m6
   8254    packssdw             m5, m7
   8255    mova                 m6, [r6+32* 2]
   8256    mova                 m8, [r6+32*10]
   8257    mova                 m7, [r6+32* 3]
   8258    mova                 m9, [r6+32*11]
   8259    REPX      {psrad x, %1}, m6, m8, m7, m9
   8260    packssdw             m6, m8
   8261    packssdw             m7, m9
   8262    call m(idct_16x8_internal_10bpc).transpose3
   8263    mova          [r5-32*4], m0
   8264    mova          [r5-32*3], m1
   8265    mova          [r5-32*2], m2
   8266    mova          [r5-32*1], m3
   8267    mova          [r5+32*0], m4
   8268    mova          [r5+32*1], m5
   8269    mova          [r5+32*2], m6
   8270    mova          [r5+32*3], m7
   8271    add                  r6, 32*16
   8272    add                  r5, 32*8
   8273    cmp                  r5, r4
   8274    jl %%loop
   8275    mov                  r6, r4  ; hand the end pointer back in r6 for the next stage
   8276 %endmacro
   8277    IDCT64_SHIFT_TRANSPOSE 2
   8278    ret
   8279 
   8280 cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
        ; Entry point: 64x32 DCT_DCT inverse transform + add, 10 bpc.
        ; Named arguments: dst, stride, c (coefficients), eob.
        ; eob == 0 -> .dconly, which finishes in the 64x16 function's .dconly2.
        ; Otherwise pass 1 calls .main between one and four times, gated by the
        ; eob thresholds 36, 136 and 300; an early exit goes through .fast,
        ; which zero-fills the buffer up to rsp+32*135. Pass 2 follows.
   8281    test               eobd, eobd
   8282    jz .dconly
   8283    PROLOGUE              0, 8, 16, 32*163, dst, stride, c, eob
        ; NOTE(review): presumably removes an x86inc wrapper around cmp for
        ; the compares below -- confirm against x86inc.asm.
   8284 %undef cmp
   8285    vpbroadcastd        m11, [pd_2048]
   8286    vpbroadcastd        m12, [clip_18b_min]
   8287    vpbroadcastd        m13, [clip_18b_max]
   8288    vpbroadcastd        m14, [pd_2896]
   8289    lea                  r6, [rsp+32*7]
   8290    call .main
   8291    cmp                eobd, 36
   8292    jl .fast
   8293    call .main
   8294    cmp                eobd, 136
   8295    jl .fast
   8296    call .main
   8297    cmp                eobd, 300
   8298    jl .fast
   8299    call .main
   8300    jmp .pass2
   8301 .dconly:
        ; r6d = ((((dc*181 + 128) >> 8) * 181) + 384) >> 9, row count = 32.
        ; The final scaling and the add loop are shared with 64x16 (.dconly2).
   8302    imul                r6d, [cq], 181
   8303    mov                [cq], eobd ; 0
   8304    or                  r3d, 32   ; r3d (eob) is zero here, so this sets the row counter to 32
   8305    add                 r6d, 128
   8306    sar                 r6d, 8
   8307    imul                r6d, 181
   8308    add                 r6d, 384
   8309    sar                 r6d, 9
   8310    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
   8311 .fast:
        ; Clear eight 32-byte slots per iteration at [r6-32*4 .. r6+32*3]
        ; until r6 reaches r4 = rsp+32*135.
   8312    pxor                 m0, m0
   8313    lea                  r4, [rsp+32*135]
   8314 .fast_loop:
   8315    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
   8316    add                  r6, 32*8
   8317    cmp                  r6, r4
   8318    jl .fast_loop
   8319 .pass2:
        ; From here on r2 and r3 no longer hold c/eob:
        ;   r2 = dst + stride*19, r3 = stride*3.
        ; NOTE(review): how r2/r3/r5/r6 are consumed is decided by the helpers
        ; called in the loop (defined elsewhere); confirm there.
   8320    lea                  r7, [r6-32*32]
   8321    lea                  r5, [r6+32*8]
   8322    lea                  r6, [pw_5+128]
   8323    imul                 r2, strideq, 19
   8324    lea                  r3, [strideq*3]
   8325    add                  r2, dstq
   8326 .pass2_loop:
        ; odd-offset slots relative to r7, in four groups of four spaced 32 slots apart
   8327    mova                 m0, [r7-32*99]
   8328    mova                 m1, [r7-32*97]
   8329    mova                 m2, [r7-32*95]
   8330    mova                 m3, [r7-32*93]
   8331    mova                 m4, [r7-32*67]
   8332    mova                 m5, [r7-32*65]
   8333    mova                 m6, [r7-32*63]
   8334    mova                 m7, [r7-32*61]
   8335    mova                 m8, [r7-32*35]
   8336    mova                 m9, [r7-32*33]
   8337    mova                m10, [r7-32*31]
   8338    mova                m11, [r7-32*29]
   8339    mova                m12, [r7-32* 3]
   8340    mova                m13, [r7-32* 1]
   8341    mova                m14, [r7+32* 1]
   8342    mova                m15, [r7+32* 3]
   8343    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
        ; even-offset slots (each one slot below the odd ones loaded above)
   8344    mova                 m0, [r7-32*100]
   8345    mova                 m1, [r7-32*98]
   8346    mova                 m2, [r7-32*96]
   8347    mova                 m3, [r7-32*94]
   8348    mova                 m4, [r7-32*68]
   8349    mova                 m5, [r7-32*66]
   8350    mova                 m6, [r7-32*64]
   8351    mova                 m7, [r7-32*62]
   8352    mova                 m8, [r7-32*36]
   8353    mova                 m9, [r7-32*34]
   8354    mova                m10, [r7-32*32]
   8355    mova                m11, [r7-32*30]
   8356    mova                m12, [r7-32* 4]
   8357    mova                m13, [r7-32* 2]
   8358    mova                m14, [r7+32* 0]
   8359    mova                m15, [r7+32* 2]
   8360    add                  r7, 32*8
   8361    mova              [rsp], m15 ; 16th input is passed to the helper through the stack slot at [rsp]
   8362    call m(idct_16x16_internal_8bpc).main
   8363    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
        ; Move both row pointers to the next 32-byte column group.
        ; NOTE(review): the -r3/+r3 adjustments presumably undo or compensate
        ; pointer movement inside pass2_end -- confirm in that routine.
   8364    sub                dstq, r3
   8365    lea                  r2, [r2+r3+32]
   8366    add                dstq, 32
   8367    cmp                  r7, r4
   8368    jl .pass2_loop
   8369    RET
   8370 ALIGN function_align
        ; .main -- first-pass worker for 64x32. It uses the same row grouping
        ; as the 64x16 .main, but with a row pitch of 128 bytes. Every input is
        ; pre-multiplied by m14 (pd_2896, set by the caller) and passed to the
        ; *_rect2 helper variants.
        ; Rows 0/8/16/24 are rounded inline ((x + 2048) >> 12) because they go
        ; through the shared 64x16 .main_end2 tail rather than a _rect2 helper.
        ; The consumed 32-byte slice of rows 0..31 is zeroed; cq is advanced
        ; inside .main_end2.
        ; Ends with IDCT64_SHIFT_TRANSPOSE 1, paired with m15 = pd_1.
   8371 .main:
   8372    lea                  r5, [idct64_mul_16bpc]
   8373    pmulld               m0, m14, [cq+128* 1]
   8374    pmulld               m1, m14, [cq+128*31]
   8375    pmulld               m2, m14, [cq+128*17]
   8376    pmulld               m3, m14, [cq+128*15]
   8377    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
   8378    pmulld               m0, m14, [cq+128* 7]
   8379    pmulld               m1, m14, [cq+128*25]
   8380    pmulld               m2, m14, [cq+128*23]
   8381    pmulld               m3, m14, [cq+128* 9]
   8382    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
   8383    pmulld               m0, m14, [cq+128* 5]
   8384    pmulld               m1, m14, [cq+128*27]
   8385    pmulld               m2, m14, [cq+128*21]
   8386    pmulld               m3, m14, [cq+128*11]
   8387    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
   8388    pmulld               m0, m14, [cq+128* 3]
   8389    pmulld               m1, m14, [cq+128*29]
   8390    pmulld               m2, m14, [cq+128*19]
   8391    pmulld               m3, m14, [cq+128*13]
   8392    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
   8393    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
        ; rows 2, 14, 18, 30
   8394    pmulld               m0, m14, [cq+128* 2]
   8395    pmulld               m1, m14, [cq+128*14]
   8396    pmulld               m2, m14, [cq+128*18]
   8397    pmulld               m3, m14, [cq+128*30]
   8398    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2
        ; rows 6, 10, 22, 26
   8399    pmulld               m0, m14, [cq+128* 6]
   8400    pmulld               m1, m14, [cq+128*10]
   8401    pmulld               m2, m14, [cq+128*22]
   8402    pmulld               m3, m14, [cq+128*26]
   8403    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2
        ; rows 4, 12, 20, 28
   8404    pmulld               m0, m14, [cq+128* 4]
   8405    pmulld               m1, m14, [cq+128*12]
   8406    pmulld               m2, m14, [cq+128*20]
   8407    pmulld               m3, m14, [cq+128*28]
   8408    call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2
        ; rows 0, 8, 16, 24 stay in m0-m3
   8409    pmulld               m0, m14, [cq+128* 0]
   8410    pmulld               m1, m14, [cq+128* 8]
   8411    pmulld               m2, m14, [cq+128*16]
   8412    pmulld               m3, m14, [cq+128*24]
        ; Clear the consumed slice: r7 = 128*29, 128*25, ..., 128*1; each pass
        ; zeroes rows r7/128-1 .. r7/128+2 (rows 28-31 down to rows 0-3).
   8413    pxor                m15, m15
   8414    mov                 r7d, 128*29
   8415 .main_zero_loop:
   8416    mova      [cq+r7-128*1], m15
   8417    mova      [cq+r7+128*0], m15
   8418    mova      [cq+r7+128*1], m15
   8419    mova      [cq+r7+128*2], m15
   8420    sub                 r7d, 128*4
   8421    jg .main_zero_loop
   8422    psrld               m15, m11, 11 ; pd_1
        ; finish the pd_2896 scaling of m0-m3: (x + 2048) >> 12
   8423    REPX     {paddd x, m11}, m0, m1, m2, m3
   8424    REPX     {psrad x, 12 }, m0, m1, m2, m3
   8425    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2
   8426    IDCT64_SHIFT_TRANSPOSE 1
   8427    ret
   8428 
   8429 cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
        ; Entry point: 64x64 DCT_DCT inverse transform + add, 10 bpc.
        ; Named arguments: dst, stride, c (coefficients), eob.
        ; eob == 0 -> .dconly, which sets 64 rows and finishes in the 64x16
        ; function's .dconly.
        ; Otherwise pass 1 calls .main between one and four times, gated by the
        ; eob thresholds 36, 136 and 300; an early exit goes through .fast,
        ; which zero-fills the buffer up to rsp+32*135. Pass 2 follows.
   8430    test               eobd, eobd
   8431    jz .dconly
   8432    PROLOGUE              0, 11, 16, 32*195, dst, stride, c, eob
        ; NOTE(review): presumably removes an x86inc wrapper around cmp for
        ; the compares below -- confirm against x86inc.asm.
   8433 %undef cmp
   8434    vpbroadcastd        m11, [pd_2048]
   8435    vpbroadcastd        m12, [clip_18b_min]
   8436    vpbroadcastd        m13, [clip_18b_max]
   8437    vpbroadcastd        m14, [pd_2896]
   8438    lea                  r6, [rsp+32*7]
   8439    call .main
   8440    cmp                eobd, 36
   8441    jl .fast
   8442    call .main
   8443    cmp                eobd, 136
   8444    jl .fast
   8445    call .main
   8446    cmp                eobd, 300
   8447    jl .fast
   8448    call .main
   8449    jmp .pass2
   8450 .dconly:
   8451    imul                r6d, [cq], 181
   8452    mov                [cq], eobd ; 0
   8453    or                  r3d, 64   ; r3d (eob) is zero here, so this sets the row counter to 64
   8454    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly
   8455 .fast:
        ; Clear eight 32-byte slots per iteration at [r6-32*4 .. r6+32*3]
        ; until r6 reaches r4 = rsp+32*135.
   8456    pxor                 m0, m0
   8457    lea                  r4, [rsp+32*135]
   8458 .fast_loop:
   8459    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
   8460    add                  r6, 32*8
   8461    cmp                  r6, r4
   8462    jl .fast_loop
   8463 .pass2:
        ; r10 = read pointer into the pass-1 output.
        ; Stride multiples are precomputed: r8 = stride*4, r9 = stride*5,
        ; r3 = stride*6, r7 = stride*7.
        ; NOTE(review): presumably consumed by main_part2_pass2 when writing
        ; rows; confirm in that routine.
   8464    lea                 r10, [r6-32*32]
   8465    lea                  r6, [pw_5+128]
   8466    lea                  r8, [strideq*4]
   8467    lea                  r9, [strideq*5]
   8468    lea                  r3, [r9+strideq*1] ; stride*6
   8469    lea                  r7, [r9+strideq*2] ; stride*7
   8470 .pass2_loop:
   8471    mova                 m0, [r10-32*100] ; in0
   8472    mova                 m1, [r10-32*96]  ; in4
   8473    mova                 m2, [r10-32*68]  ; in8
   8474    mova                 m3, [r10-32*64]  ; in12
   8475    mova                 m4, [r10-32*36]  ; in16
   8476    mova                 m5, [r10-32*32]  ; in20
   8477    mova                 m6, [r10-32* 4]  ; in24
   8478    mova                 m7, [r10+32* 0]  ; in28
        ; Only eight inputs (in0..in28) are non-zero for this call: m8-m14 and
        ; the [rsp] slot used for the 16th input are cleared.
   8479    pxor                 m8, m8
   8480    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
   8481    mova              [rsp], m8
   8482    call m(idct_16x16_internal_8bpc).main
   8483    mova                 m1, [rsp+32*1] ; reload m1 from the stack slot (NOTE(review): presumably spilled there by the helper -- confirm)
        ; park the 16 results in two groups of eight slots at r4
   8484    mova          [r4-32*4], m0
   8485    mova          [r4-32*3], m1
   8486    mova          [r4-32*2], m2
   8487    mova          [r4-32*1], m3
   8488    mova          [r4+32*0], m4
   8489    mova          [r4+32*1], m5
   8490    mova          [r4+32*2], m6
   8491    mova          [r4+32*3], m7
   8492    add                  r4, 32*8
   8493    mova          [r4-32*4], m8
   8494    mova          [r4-32*3], m9
   8495    mova          [r4-32*2], m10
   8496    mova          [r4-32*1], m11
   8497    mova          [r4+32*0], m12
   8498    mova          [r4+32*1], m13
   8499    mova          [r4+32*2], m14
   8500    mova          [r4+32*3], m15
   8501    mova                 m0, [r10-32*98] ; in2
   8502    mova                 m1, [r10-32*94] ; in6
   8503    mova                 m2, [r10-32*66] ; in10
   8504    mova                 m3, [r10-32*62] ; in14
   8505    mova                 m4, [r10-32*34] ; in18
   8506    mova                 m5, [r10-32*30] ; in22
   8507    mova                 m6, [r10-32* 2] ; in26
   8508    mova                 m7, [r10+32* 2] ; in30
   8509    lea                  r5, [r4+32*16]  ; taken before r4 is advanced on the next line
   8510    add                  r4, 32*8
   8511    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   8512    mova                 m0, [r10-32*99] ; in1
   8513    mova                 m1, [r10+32* 3] ; in31
   8514    mova                 m2, [r10-32*35] ; in17
   8515    mova                 m3, [r10-32*61] ; in15
   8516    mova                 m4, [r10-32*67] ; in9
   8517    mova                 m5, [r10-32*29] ; in23
   8518    mova                 m6, [r10-32* 3] ; in25
   8519    mova                 m7, [r10-32*93] ; in7
   8520    lea                  r6, [idct64_mul - 8] ; table pointer for the first main_part1 call; bumped by 8 before the second
   8521    add                  r4, 32*16
   8522    add                  r5, 32*32
   8523    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   8524    mova                 m0, [r10-32*95] ; in5
   8525    mova                 m1, [r10-32* 1] ; in27
   8526    mova                 m2, [r10-32*31] ; in21
   8527    mova                 m3, [r10-32*65] ; in11
   8528    mova                 m4, [r10-32*63] ; in13
   8529    mova                 m5, [r10-32*33] ; in19
   8530    mova                 m6, [r10+32* 1] ; in29
   8531    mova                 m7, [r10-32*97] ; in3
   8532    add                  r6, 8
   8533    add                  r4, 32*8
   8534    sub                  r5, 32*8
   8535    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   8536    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
        ; Advance to the next column group: r10 by eight slots, dst by 32 bytes.
        ; NOTE(review): the dstq -= stride*4 and r4 -= 32*44 adjustments
        ; presumably compensate pointer movement inside the helpers -- confirm.
   8537    add                 r10, 32*8
   8538    sub                dstq, r8
   8539    sub                  r4, 32*44
   8540    add                dstq, 32
   8541    cmp                 r10, r4
   8542    jl .pass2_loop
   8543    RET
   8544 ALIGN function_align
        ; .main -- first-pass worker for 64x64. It uses the same row grouping
        ; and helper sequence as the 64x16 .main, but with a row pitch of
        ; 128 bytes. Inputs are loaded unscaled (plain mova, no pd_2896
        ; multiply). Only rows 0..31 of the slice are read; the slice is zeroed
        ; afterwards.
        ; Finishes through the shared 64x16 tail: call .main_end, then a
        ; tail-jump to .shift_transpose, whose ret returns to this routine's
        ; caller.
   8545 .main:
   8546    lea                  r5, [idct64_mul_16bpc]
   8547    mova                 m0, [cq+128* 1]
   8548    mova                 m1, [cq+128*31]
   8549    mova                 m2, [cq+128*17]
   8550    mova                 m3, [cq+128*15]
   8551    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
   8552    mova                 m0, [cq+128* 7]
   8553    mova                 m1, [cq+128*25]
   8554    mova                 m2, [cq+128*23]
   8555    mova                 m3, [cq+128* 9]
   8556    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
   8557    mova                 m0, [cq+128* 5]
   8558    mova                 m1, [cq+128*27]
   8559    mova                 m2, [cq+128*21]
   8560    mova                 m3, [cq+128*11]
   8561    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
   8562    mova                 m0, [cq+128* 3]
   8563    mova                 m1, [cq+128*29]
   8564    mova                 m2, [cq+128*19]
   8565    mova                 m3, [cq+128*13]
   8566    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
   8567    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
        ; rows 2, 14, 18, 30
   8568    mova                 m0, [cq+128* 2]
   8569    mova                 m1, [cq+128*14]
   8570    mova                 m2, [cq+128*18]
   8571    mova                 m3, [cq+128*30]
   8572    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
        ; rows 6, 10, 22, 26
   8573    mova                 m0, [cq+128* 6]
   8574    mova                 m1, [cq+128*10]
   8575    mova                 m2, [cq+128*22]
   8576    mova                 m3, [cq+128*26]
   8577    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
        ; rows 4, 12, 20, 28
   8578    mova                 m0, [cq+128* 4]
   8579    mova                 m1, [cq+128*12]
   8580    mova                 m2, [cq+128*20]
   8581    mova                 m3, [cq+128*28]
   8582    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
        ; rows 0, 8, 16, 24 stay in m0-m3 for the shared .main_end tail
   8583    mova                 m0, [cq+128* 0]
   8584    mova                 m1, [cq+128* 8]
   8585    mova                 m2, [cq+128*16]
   8586    mova                 m3, [cq+128*24]
        ; Clear the consumed slice: r7 = 128*29, 128*25, ..., 128*1; each pass
        ; zeroes rows r7/128-1 .. r7/128+2 (rows 28-31 down to rows 0-3).
   8587    pxor                m15, m15
   8588    mov                 r7d, 128*29
   8589 .main_zero_loop:
   8590    mova      [cq+r7-128*1], m15
   8591    mova      [cq+r7+128*0], m15
   8592    mova      [cq+r7+128*1], m15
   8593    mova      [cq+r7+128*2], m15
   8594    sub                 r7d, 128*4
   8595    jg .main_zero_loop
   8596    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
   8597    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose
   8598 
   8599 %endif ; ARCH_X86_64