tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

itx_sse.asm (264136B)


      1 ; Copyright © 2018-2021, VideoLAN and dav1d authors
      2 ; Copyright © 2018, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 
     30 SECTION_RODATA 16 ; read-only constant tables follow (16 is presumably the section alignment; the macro is defined outside this file - confirm)
     31 
     32 deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15 ; pshufb mask: words 0,2,4,6 go to the low half, words 1,3,5,7 to the high half
     33 
     34 deint_shuf1: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15 ; pshufb mask: word order 0,4,1,5,2,6,3,7 (interleaves low-half and high-half words)
     35 deint_shuf2: db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7 ; pshufb mask: word order 4,0,5,1,6,2,7,3 (same interleave, high-half word first)
     36 
     37 %macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1
        ; Emits 16-byte word-pair constants for the coefficient pair (a, b) given
        ; as the first two arguments; every pair is repeated 4 times:
        ;   pw_<a>_m<b>   = { a, -b }  always
        ;   pw_<b>_<a>    = { b,  a }  unless the third argument is 2
        ;   pw_m<a>_m<b>  = {-a, -b }  only when the third argument is non-zero
        ; The third argument defaults to 0.
     38 pw_%1_m%2:  times 4 dw  %1, -%2
     39 %if %3 != 2
     40 pw_%2_%1:   times 4 dw  %2,  %1
     41 %endif
     42 %if %3
     43 pw_m%1_m%2: times 4 dw -%1, -%2
     44 %endif
     45 %endmacro
     46 
     47 ;adst4
        ; Word-pair multipliers read by pmaddwd in iadst_4x4_internal_8bpc.main.
        ; Each pair is repeated 4 times so one constant fills a 16-byte vector.
     48 pw_1321_3803:   times 4 dw  1321,  3803
     49 pw_2482_m1321:  times 4 dw  2482, -1321
     50 pw_3344_2482:   times 4 dw  3344,  2482
     51 pw_3344_m3803:  times 4 dw  3344, -3803
     52 pw_3344_m3344:  times 4 dw  3344, -3344
     53 pw_0_3344:      times 4 dw     0,  3344
     54 pw_m6688_m3803: times 4 dw -6688, -3803
     55 
     56 COEF_PAIR 2896, 2896 ; -> pw_2896_m2896 and pw_2896_2896
     57 COEF_PAIR 1567, 3784 ; -> pw_1567_m3784 and pw_3784_1567
     58 COEF_PAIR  799, 4017
     59 COEF_PAIR 3406, 2276
     60 COEF_PAIR  401, 4076
     61 COEF_PAIR 1931, 3612
     62 COEF_PAIR 3166, 2598
     63 COEF_PAIR 3920, 1189
     64 COEF_PAIR 3784, 1567, 1 ; flag 1: additionally emits pw_m3784_m1567
     65 COEF_PAIR  995, 3973
     66 COEF_PAIR 1751, 3703
     67 COEF_PAIR 3513, 2106
     68 COEF_PAIR 3857, 1380
     69 COEF_PAIR 4017,  799, 1 ; flag 1: additionally emits pw_m4017_m799
     70 COEF_PAIR  201, 4091
     71 COEF_PAIR 2440, 3290
     72 COEF_PAIR 3035, 2751
     73 COEF_PAIR 4052,  601
     74 COEF_PAIR 2276, 3406, 1 ; flag 1: additionally emits pw_m2276_m3406
     75 COEF_PAIR 4076,  401, 2 ; flag 2: emits pw_4076_m401 and pw_m4076_m401, omits pw_401_4076
     76 COEF_PAIR 2598, 3166, 2 ; flag 2: emits pw_2598_m3166 and pw_m2598_m3166, omits pw_3166_2598
     77 COEF_PAIR 3612, 1931, 2 ; flag 2: emits pw_3612_m1931 and pw_m3612_m1931, omits pw_1931_3612
     78 COEF_PAIR 1189, 3920, 2 ; flag 2: emits pw_1189_m3920 and pw_m1189_m3920, omits pw_3920_1189
     79 
     80 pd_2048:        times 4 dd  2048 ; dword rounding bias (1 << 11) added before the ">> 12" shifts
     81 pw_2048:        times 8 dw  2048 ; pmulhrsw multiplier: rounds x/16 (default scale in ITX4_END)
     82 pw_m2048:       times 8 dw -2048
     83 pw_4096:        times 8 dw  4096 ; pmulhrsw multiplier: rounds x/8
     84 pw_16384:       times 8 dw  16384
     85 pw_m16384:      times 8 dw  -16384
     86 pw_1697x16:     times 8 dw  1697*16
     87 pw_1697x8:      times 8 dw  1697*8 ; identity-transform multiplier (result is added back to x)
     88 pw_2896x8:      times 8 dw  2896*8 ; pmulhrsw by c*8 computes (x*c + 2048) >> 12
     89 pw_3344x8:      times 8 dw  3344*8
     90 pw_8192:        times 8 dw  8192
     91 pw_m8192:       times 8 dw -8192
     92 pw_5:           times 8 dw  5
     93 pw_201x8:       times 8 dw   201*8 ; from here on: coefficient*8 words, same pmulhrsw convention as pw_2896x8
     94 pw_4091x8:      times 8 dw  4091*8
     95 pw_m2751x8:     times 8 dw -2751*8
     96 pw_3035x8:      times 8 dw  3035*8
     97 pw_1751x8:      times 8 dw  1751*8
     98 pw_3703x8:      times 8 dw  3703*8
     99 pw_m1380x8:     times 8 dw -1380*8
    100 pw_3857x8:      times 8 dw  3857*8
    101 pw_995x8:       times 8 dw   995*8
    102 pw_3973x8:      times 8 dw  3973*8
    103 pw_m2106x8:     times 8 dw -2106*8
    104 pw_3513x8:      times 8 dw  3513*8
    105 pw_2440x8:      times 8 dw  2440*8
    106 pw_3290x8:      times 8 dw  3290*8
    107 pw_m601x8:      times 8 dw  -601*8
    108 pw_4052x8:      times 8 dw  4052*8
    109 
    110 pw_4095x8:      times 8 dw  4095*8 ; second group of coefficient*8 words (8 copies each); their users are outside this part of the file
    111 pw_101x8:       times 8 dw   101*8
    112 pw_2967x8:      times 8 dw  2967*8
    113 pw_m2824x8:     times 8 dw -2824*8
    114 pw_3745x8:      times 8 dw  3745*8
    115 pw_1660x8:      times 8 dw  1660*8
    116 pw_3822x8:      times 8 dw  3822*8
    117 pw_m1474x8:     times 8 dw -1474*8
    118 pw_3996x8:      times 8 dw  3996*8
    119 pw_897x8:       times 8 dw   897*8
    120 pw_3461x8:      times 8 dw  3461*8
    121 pw_m2191x8:     times 8 dw -2191*8
    122 pw_3349x8:      times 8 dw  3349*8
    123 pw_2359x8:      times 8 dw  2359*8
    124 pw_4036x8:      times 8 dw  4036*8
    125 pw_m700x8:      times 8 dw  -700*8
    126 pw_4065x8:      times 8 dw  4065*8
    127 pw_501x8:       times 8 dw   501*8
    128 pw_3229x8:      times 8 dw  3229*8
    129 pw_m2520x8:     times 8 dw -2520*8
    130 pw_3564x8:      times 8 dw  3564*8
    131 pw_2019x8:      times 8 dw  2019*8
    132 pw_3948x8:      times 8 dw  3948*8
    133 pw_m1092x8:     times 8 dw -1092*8
    134 pw_3889x8:      times 8 dw  3889*8
    135 pw_1285x8:      times 8 dw  1285*8
    136 pw_3659x8:      times 8 dw  3659*8
    137 pw_m1842x8:     times 8 dw -1842*8
    138 pw_3102x8:      times 8 dw  3102*8
    139 pw_2675x8:      times 8 dw  2675*8
    140 pw_4085x8:      times 8 dw  4085*8
    141 pw_m301x8:      times 8 dw  -301*8
    142 
    143 SECTION .text
    144 
    145 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) ; symbol name of function x: private_prefix, "_", x and SUFFIX pasted together, then passed to mangle()
    146 
        ; o(x): address expression for constant x.
        ;   x86-64: the symbol itself.
        ;   otherwise: offset of x from the section start ($$) added to r5;
        ;   INV_TXFM_FN loads r5 with $$ on x86-32 so the sum is x's address.
    147 %if ARCH_X86_64
    148 %define o(x) x
    149 %else
    150 %define o(x) r5-$$+x ; PIC
    151 %endif
    152 
    153 %macro WRITE_4X4 9  ;src[1-2], tmp[1-3], row[1-4]
        ; Adds 16 residual words (two registers, four words per row) to a 4x4
        ; block of 8-bit pixels at dstq and stores the sums with unsigned
        ; saturation.
        ;   src[1-2]: register numbers holding the residuals; the four qwords
        ;             (src1 low, src1 high, src2 low, src2 high) are written to
        ;             destination rows row[1], row[2], row[3], row[4]
        ;   tmp[1-3]: register numbers used as scratch
        ;   row[1-4]: destination row index (0-3) for each of the four qwords
        ; Clobbers r2 (set to dstq + 2*stride) and the three scratch registers.
    154    lea                  r2, [dstq+strideq*2]
    155 %assign %%i 1
    156 %rotate 5
        ; After rotating by 5 the first macro argument is row[1]; each iteration
        ; defines the address of one destination row and rotates to the next
        ; row index. 5 + 4 rotations return the argument list to its start.
    157 %rep 4
    158    %if %1 & 2
    159        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
    160    %else
    161        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
    162    %endif
    163    %assign %%i %%i + 1
    164    %rotate 1
    165 %endrep
    166 
    167    movd                 m%3, [%%row_adr1]        ;dst0
    168    movd                 m%5, [%%row_adr2]        ;dst1
    169    punpckldq            m%3, m%5                 ;high: dst1 :low: dst0
    170    movd                 m%4, [%%row_adr3]        ;dst2
    171    movd                 m%5, [%%row_adr4]        ;dst3
    172    punpckldq            m%4, m%5                 ;high: dst3 :low: dst2
    173 
    174    pxor                 m%5, m%5
    175    punpcklbw            m%3, m%5                 ;extend byte to word
    176    punpcklbw            m%4, m%5                 ;extend byte to word
    177 
    178    paddw                m%3, m%1                 ;high: dst1 + out1 ;low: dst0 + out0
    179    paddw                m%4, m%2                 ;high: dst3 + out3 ;low: dst2 + out2
    180 
    181    packuswb             m%3, m%4                 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
    182 
    183    movd        [%%row_adr1], m%3                  ;store dst0 + out0
    184    pshuflw              m%4, m%3, q1032           ;bring dword 1 down to dword 0
    185    movd        [%%row_adr2], m%4                  ;store dst1 + out1
    186    punpckhqdq           m%3, m%3                  ;bring the high qword (dwords 2,3) down
    187    movd        [%%row_adr3], m%3                  ;store dst2 + out2
    188    psrlq                m%3, 32
    189    movd        [%%row_adr4], m%3                  ;store dst3 + out3
    190 %endmacro
    191 
    192 %macro ITX4_END 4-5 2048 ; row[1-4], rnd
        ; Common tail of the 4x4 transforms: optionally scales m0/m1, adds them
        ; to the destination through WRITE_4X4 (scratch m2-m4) and returns.
        ;   row[1-4]: destination row index for each qword of m0/m1
        ;   rnd:      suffix of the pw_ constant used as pmulhrsw multiplier
        ;             (default 2048); 0 skips the scaling entirely
    193 %if %5
    194    mova                 m2, [o(pw_%5)]
    195    pmulhrsw             m0, m2
    196    pmulhrsw             m1, m2
    197 %endif
    198 
    199    WRITE_4X4            0, 1, 2, 3, 4, %1, %2, %3, %4
    200    ret
    201 %endmacro
    202 
    203 ; flags: 1 = swap, 2: coef_regs, 4: no_pack
    204 %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
        ; Multiplies the interleaved word pairs (a, b) in dst/src by a coefficient
        ; pair (c1, c2) with pmaddwd, adds the rnd register and shifts right by 12.
        ;   default: dst = a*c2 + b*c1,  tmp = a*c1 - b*c2
        ;   flag 1:  the two results are exchanged
        ;   flag 2:  coef[1-2] are register numbers holding ready-made coefficient
        ;            vectors (tmp uses coef1, dst uses coef2)
        ;   flag 4:  leave the two dword results unpacked in dst and tmp
        ; Without flag 4 the results are packed to words: dst results in the low
        ; half of dst, tmp results in the high half.
    205 %if %6 & 2
    206    pmaddwd              m%2, m%4, m%1
    207    pmaddwd              m%1, m%5
    208 %elif %6 & 1
    209    pmaddwd              m%2, m%1, [o(pw_%5_%4)]
    210    pmaddwd              m%1, [o(pw_%4_m%5)]
    211 %else
    212    pmaddwd              m%2, m%1, [o(pw_%4_m%5)]
    213    pmaddwd              m%1, [o(pw_%5_%4)]
    214 %endif
    215    paddd                m%2, m%3
    216    paddd                m%1, m%3
    217    psrad                m%2, 12
    218    psrad                m%1, 12
        ; NASM evaluates "&" before "==", so the test below is (flags & 4) == 0.
    219 %if %6 & 4 == 0
    220    packssdw             m%1, m%2
    221 %endif
    222 %endmacro
    223 
    224 %macro IDCT4_1D_PACKED 0-1   ;pw_2896x8
        ; 4-point inverse DCT on packed rows.
        ;   in:  m0 = in0 (low) / in1 (high), m1 = in2 (low) / in3 (high)
        ;   out: m0 = out0 (low) / out1 (high), m1 = out3 (low) / out2 (high)
        ; Clobbers m2 and m3 (m3 is left holding pd_2048).
        ; The optional argument is accepted but not referenced in the body.
    225    mova                 m3, [o(pd_2048)]
    226    punpckhwd            m2, m0, m1            ;unpacked in1 in3
    227    punpcklwd            m0, m1                ;unpacked in0 in2
    228    ITX_MUL2X_PACK        2, 1, 3, 1567, 3784
    229    ITX_MUL2X_PACK        0, 1, 3, 2896, 2896
    230    psubsw               m1, m0, m2            ;high: out2 ;low: out3
    231    paddsw               m0, m2                ;high: out1 ;low: out0
    232 %endmacro
    233 
    234 %macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
        ; Emits the public entry point inv_txfm_add_<type1>_<type2>_<size>_8bpc.
        ; The entry loads tx2q with the address of the second-pass routine
        ; (.pass2 of i<type2>_<size>_internal_8bpc) and transfers control to the
        ; first-pass routine i<type1>_<size>_internal_8bpc (%%p1 below).
        ; For dct_dct with eob == 0 control instead continues into whatever the
        ; invoking macro places right after this macro (the DC-only path).
    235 cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2
    236    %define %%p1 m(i%1_%3_internal_8bpc)
    237 %if ARCH_X86_32
    238    LEA                    r5, $$                 ; base register used by the o() macro
    239 %endif
    240 %if has_epilogue
        ; An epilogue is required: call the first pass and finish with RET here.
    241 %ifidn %1_%2, dct_dct
    242    test                 eobd, eobd
    243    jz %%end
    244 %endif
    245    lea                  tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
    246    call %%p1
    247    RET
    248 %%end:
    249 %else
        ; No epilogue: reach the first pass by jump or by falling through.
    250    lea                  tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
    251 %ifidn %1_%2, dct_dct
    252    test                 eobd, eobd
    253    jnz %%p1
    254 %else
        ; The jmp is assembled only when %%end - %%p1 is negative (bit 31 set),
        ; i.e. %%p1 lies beyond %%end. When both labels coincide nothing is
        ; emitted and execution falls through into the first-pass routine.
    255    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
    256 ALIGN function_align
    257 %%end:
    258 %endif
    259 %endif
    260 %endmacro
    261 
    262 %macro INV_TXFM_4X4_FN 2 ; type1, type2
        ; Public 4x4 entry point for the given transform pair. For dct_dct the
        ; code after INV_TXFM_FN is the path taken when eob == 0: the first
        ; coefficient is broadcast, scaled and added to all 16 pixels.
    263    INV_TXFM_FN          %1, %2, 4x4, 6
    264 %ifidn %1_%2, dct_dct
    265    pshuflw              m0, [coeffq], q0000      ; replicate coefficient word 0 across the low qword
    266    punpcklqdq           m0, m0                   ; ... and across the whole register
    267    mova                 m1, [o(pw_2896x8)]
    268    pmulhrsw             m0, m1
    269    mov            [coeffq], eobd                ;0
    270    pmulhrsw             m0, m1
    271    mova                 m1, m0
    272    TAIL_CALL m(iadst_4x4_internal_8bpc).end2     ; .end2 = ITX4_END 0, 1, 2, 3 (scale, add to dst, return)
    273 %endif
    274 %endmacro
    275 
    276 INIT_XMM ssse3
    277 ; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16.
    278 
    279 INV_TXFM_4X4_FN dct, dct
    280 INV_TXFM_4X4_FN dct, adst
    281 INV_TXFM_4X4_FN dct, flipadst
    282 INV_TXFM_4X4_FN dct, identity
    283 
    284 cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; First pass of the 4x4 inverse DCT; reached from the entry points
        ; generated above. Loads the 16 coefficients, transforms them,
        ; transposes the result and jumps to the second pass held in tx2q.
    285    mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0
    286    mova                 m1, [coeffq+16*1]      ;high: in3 ;low in2
    287 
    288    IDCT4_1D_PACKED
    289 
    290    mova                 m2, [o(deint_shuf)]
    291    shufps               m3, m0, m1, q1331
    292    shufps               m0, m1, q0220
    293    pshufb               m0, m2                 ;high: in1 ;low: in0
    294    pshufb               m1, m3, m2             ;high: in3 ;low :in2
    295    jmp                tx2q
    296 
    297 .pass2:
        ; Second pass: transform again, zero the 32-byte coefficient buffer and
        ; add the result to dst. m1 holds out3 (low) / out2 (high), hence the
        ; row order 0, 1, 3, 2.
    298    IDCT4_1D_PACKED
    299 
    300    pxor                 m2, m2
    301    mova      [coeffq+16*0], m2
    302    mova      [coeffq+16*1], m2                 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
    303 
    304    ITX4_END     0, 1, 3, 2
    305 
    306 INV_TXFM_4X4_FN adst, dct
    307 INV_TXFM_4X4_FN adst, adst
    308 INV_TXFM_4X4_FN adst, flipadst
    309 INV_TXFM_4X4_FN adst, identity
    310 
    311 cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; First pass of the 4x4 inverse ADST: transform, transpose with word
        ; unpacks, then jump to the second pass held in tx2q.
    312    mova                 m0, [coeffq+16*0]
    313    mova                 m1, [coeffq+16*1]
    314    call .main
    315    punpckhwd            m2, m0, m1
    316    punpcklwd            m0, m1
    317    punpckhwd            m1, m0, m2       ;high: in3 ;low :in2
    318    punpcklwd            m0, m2           ;high: in1 ;low: in0
    319    jmp                tx2q
    320 
    321 .pass2:
    322    call .main
    323 
        ; .end: zero the 32-byte coefficient buffer, then fall into .end2.
        ; Also used as the tail of iidentity_4x4_internal_8bpc.pass2.
    324 .end:
    325    pxor                 m2, m2
    326    mova      [coeffq+16*0], m2
    327    mova      [coeffq+16*1], m2
    328 
        ; .end2: scale m0/m1, add them to dst rows 0-3 and return. Also the
        ; target of the dct_dct DC-only path in INV_TXFM_4X4_FN.
    329 .end2:
    330    ITX4_END              0, 1, 2, 3
    331 
    332 ALIGN function_align
    333 cglobal_label .main
        ; 4-point inverse ADST on packed rows.
        ;   in:  m0 = in0 (low) / in1 (high), m1 = in2 (low) / in3 (high)
        ;   out: m0 = out0 (low) / out1 (high), m1 = out2 (low) / out3 (high)
        ; Clobbers m2-m5.
    334    punpcklwd            m2, m0, m1                ;unpacked in0 in2
    335    punpckhwd            m0, m1                    ;unpacked in1 in3
    336    mova                 m3, m0
    337    pmaddwd              m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
    338    pmaddwd              m0, [o(pw_0_3344)]        ;3344 * in3
    339    paddd                m1, m0                    ;t2
    340    pmaddwd              m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
    341    pmaddwd              m2, [o(pw_2482_m1321)]    ;2482 * in0 - 1321 * in2
    342    pmaddwd              m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
    343    pmaddwd              m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
    344    paddd                m4, m0                    ;t0 + t3
    345    pmaddwd              m3, [o(pw_m6688_m3803)]   ;-2 * 3344 * in1 - 3803 * in3
    346    mova                 m0, [o(pd_2048)]
    347    paddd                m1, m0                    ;t2 + 2048
    348    paddd                m2, m0
    349    paddd                m0, m4                    ;t0 + t3 + 2048
    350    paddd                m5, m2                    ;t1 + t3 + 2048
    351    paddd                m2, m4
    352    paddd                m2, m3                    ;t0 + t1 - t3 + 2048
    353    REPX      {psrad x, 12}, m1, m0, m5, m2
    354    packssdw             m0, m5                    ;high: out1 ;low: out0
    355    packssdw             m1, m2                    ;high: out3 ;low: out2
    356    ret
    357 
    358 INV_TXFM_4X4_FN flipadst, dct
    359 INV_TXFM_4X4_FN flipadst, adst
    360 INV_TXFM_4X4_FN flipadst, flipadst
    361 INV_TXFM_4X4_FN flipadst, identity
    362 
    363 cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; First pass of the flipped 4x4 inverse ADST: runs the shared ADST core
        ; and transposes with the source operands in reversed order, then jumps
        ; to the second pass held in tx2q.
    364    mova                 m0, [coeffq+16*0]
    365    mova                 m1, [coeffq+16*1]
    366    call m(iadst_4x4_internal_8bpc).main
    367    punpcklwd            m2, m1, m0
    368    punpckhwd            m1, m0
    369    punpcklwd            m0, m1, m2            ;high: in3 ;low :in2
    370    punpckhwd            m1, m2                ;high: in1 ;low: in0
    371    jmp                tx2q
    372 
    373 .pass2:
    374    call m(iadst_4x4_internal_8bpc).main
    375 
    376 .end:
    377    pxor                 m2, m2                 ; zero the 32-byte coefficient buffer
    378    mova      [coeffq+16*0], m2
    379    mova      [coeffq+16*1], m2
    380 
    381 .end2:
        ; Same tail as the plain ADST but with the destination rows reversed.
    382    ITX4_END              3, 2, 1, 0
    383 
    384 INV_TXFM_4X4_FN identity, dct
    385 INV_TXFM_4X4_FN identity, adst
    386 INV_TXFM_4X4_FN identity, flipadst
    387 INV_TXFM_4X4_FN identity, identity
    388 
    389 cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; First pass of the 4x4 identity transform: each coefficient x becomes
        ; x + pmulhrsw(x, 1697*8) with signed saturation; the block is then
        ; transposed and control jumps to the second pass held in tx2q.
    390    mova                 m0, [coeffq+16*0]
    391    mova                 m1, [coeffq+16*1]
    392    mova                 m3, [o(pw_1697x8)]
    393    pmulhrsw             m2, m0, m3
    394    pmulhrsw             m3, m1
    395    paddsw               m0, m2
    396    paddsw               m1, m3
    397    punpckhwd            m2, m0, m1
    398    punpcklwd            m0, m1
    399    punpckhwd            m1, m0, m2            ;high: in3 ;low :in2
    400    punpcklwd            m0, m2                ;high: in1 ;low: in0
    401    jmp                tx2q
    402 
    403 .pass2:
        ; Second pass: same scaling, then reuse the ADST tail (.end clears the
        ; coefficient buffer, scales, writes rows 0-3 and returns).
    404    mova                 m3, [o(pw_1697x8)]
    405    pmulhrsw             m2, m3, m0
    406    pmulhrsw             m3, m1
    407    paddsw               m0, m2
    408    paddsw               m1, m3
    409    jmp m(iadst_4x4_internal_8bpc).end
    410 
    411 %macro IWHT4_1D_PACKED 0
        ; One 4-point inverse Walsh-Hadamard pass on m0/m1, using only word
        ; add, subtract, shift and qword unpack instructions.
        ;   in:  m0 = in0 (low) / in1 (high), m1 = in2 (low) / in3 (high)
        ;        (as implied by the unpack comments below)
        ;   out: m0 high = out0, m1 = out2 (low) / out1 (high), m2 low = out3
        ; Clobbers m3.
    412    punpckhqdq           m3, m0, m1            ;low: in1 high: in3
    413    punpcklqdq           m0, m1                ;low: in0 high: in2
    414    psubw                m2, m0, m3            ;low: in0 - in1 high: in2 - in3
    415    paddw                m0, m3                ;low: in0 + in1 high: in2 + in3
    416    punpckhqdq           m2, m2                ;t2 t2
    417    punpcklqdq           m0, m0                ;t0 t0
    418    psubw                m1, m0, m2
    419    psraw                m1, 1                 ;t4 t4
    420    psubw                m1, m3                ;low: t1/out2 high: t3/out1
    421    psubw                m0, m1                ;high: out0
    422    paddw                m2, m1                ;low: out3
    423 %endmacro
    424 
    425 INIT_XMM sse2
    426 cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff
        ; 4x4 inverse Walsh-Hadamard transform added to dst. Loads the 16
        ; coefficients, zeroes the coefficient buffer, shifts the inputs right
        ; by 2, runs two IWHT passes with a transpose in between and writes the
        ; result without any further scaling.
    427    mova                 m0, [coeffq+16*0]
    428    mova                 m1, [coeffq+16*1]
    429    pxor                 m2, m2
    430    mova      [coeffq+16*0], m2
    431    mova      [coeffq+16*1], m2
    432    psraw                m0, 2
    433    psraw                m1, 2
    434    IWHT4_1D_PACKED
    435    punpckhwd            m0, m1                 ; transpose the first-pass output for the second pass
    436    punpcklwd            m3, m1, m2
    437    punpckhdq            m1, m0, m3
    438    punpckldq            m0, m3
    439    IWHT4_1D_PACKED
    440    shufpd               m0, m2, 0x01           ; m0 = out0 (from m0 high) / out3 (from m2 low)
    441    ITX4_END              0, 3, 2, 1, 0         ; rows 0,3 from m0 and 2,1 from m1; rnd 0 = no pmulhrsw scaling
    442 
    443 %macro IDCT8_1D_PACKED 0
        ; 8-point inverse DCT on four packed registers.
        ;   in:  m0 = in0/in1, m1 = in2/in3, m2 = in4/in5, m3 = in6/in7 (low/high)
        ;   out: m0 = out0/out1, m1 = out3/out2, m2 = out4/out5, m3 = out7/out6
        ; Clobbers m4 and m6 (m6 is left holding pd_2048).
    444    mova                 m6, [o(pd_2048)]
    445    punpckhwd            m4, m0, m3                 ;unpacked in1 in7
    446    punpcklwd            m0, m2                     ;unpacked in0 in4
    447    punpckhwd            m2, m1                     ;unpacked in5 in3
    448    punpcklwd            m1, m3                     ;unpacked in2 in6
    449    ITX_MUL2X_PACK        4, 3, 6,  799, 4017       ;low: t7a high: t4a
    450    ITX_MUL2X_PACK        2, 3, 6, 3406, 2276       ;low: t6a high: t5a
    451    ITX_MUL2X_PACK        1, 3, 6, 1567, 3784       ;low: t3  high: t2
    452    psubsw               m3, m4, m2                 ;low: t6a high: t5a
    453    paddsw               m4, m2                     ;low: t7  high: t4
    454    pshufb               m3, [o(deint_shuf1)]
    455    ITX_MUL2X_PACK        0, 2, 6, 2896, 2896       ;low: t0  high: t1
    456    ITX_MUL2X_PACK        3, 2, 6, 2896, 2896       ;low: t6  high: t5
    457    psubsw               m2, m0, m1                 ;low: tmp3 high: tmp2
    458    paddsw               m0, m1                     ;low: tmp0 high: tmp1
    459    punpcklqdq           m1, m4, m3                 ;low: t7   high: t6
    460    punpckhqdq           m4, m3                     ;low: t4   high: t5
    461    psubsw               m3, m0, m1                 ;low: out7 high: out6
    462    paddsw               m0, m1                     ;low: out0 high: out1
    463    paddsw               m1, m2, m4                 ;low: out3 high: out2
    464    psubsw               m2, m4                     ;low: out4 high: out5
    465 %endmacro
    466 
    467 ;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
    468 ;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
    469 %macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
        ; Rotation of two full 8-word registers.
        ;   dst/src[1-2]: register numbers of src1 and src2
        ;   tmp[1-2]:     scratch register numbers
        ;   rnd:          register number holding the dword rounding constant
        ;   coef[1-2]:    coefficient values selecting pw_ constants; when coef2
        ;                 is below 8 both are taken as register numbers holding
        ;                 ready-made coefficient vectors
        ;   dst2_in_tmp1: when non-zero dst2 is returned in tmp1 and src2 is
        ;                 left as scratch; otherwise dst2 replaces src2
        ; dst1 always replaces src1.
        ; NOTE(review): in the register-coefficient path the low-half products
        ; sit in src2 and the high-half products in tmp1, so the pack order is
        ; only right when dst2_in_tmp1 is 0 - confirm that no caller combines
        ; register coefficients with dst2_in_tmp1.
    470    punpckhwd           m%4, m%1, m%2
    471    punpcklwd           m%1, m%2
    472 %if %7 < 8
    473    pmaddwd             m%2, m%7, m%1
    474    pmaddwd             m%3, m%7, m%4
    475 %else
    476    mova                m%2, [o(pw_%7_%6)]
    477 %if %8
    478    pmaddwd             m%3, m%1, m%2
    479    pmaddwd             m%2, m%4
    480 %else
    481    pmaddwd             m%3, m%4, m%2
    482    pmaddwd             m%2, m%1
    483 %endif
    484 %endif
    485    paddd               m%3, m%5
    486    paddd               m%2, m%5
    487    psrad               m%3, 12
    488    psrad               m%2, 12
    489 %if %8
    490    packssdw            m%3, m%2
    491 %else
    492    packssdw            m%2, m%3                 ;dst2
    493 %endif
    494 %if %7 < 8
    495    pmaddwd             m%4, m%6
    496    pmaddwd             m%1, m%6
    497 %elif %8
    498    mova                m%2, [o(pw_%6_m%7)]
    499    pmaddwd             m%4, m%2
    500    pmaddwd             m%1, m%2
    501 %else
    502    mova                m%3, [o(pw_%6_m%7)]
    503    pmaddwd             m%4, m%3
    504    pmaddwd             m%1, m%3
    505 %endif
    506    paddd               m%4, m%5
    507    paddd               m%1, m%5
    508    psrad               m%4, 12
    509    psrad               m%1, 12
    510    packssdw            m%1, m%4                 ;dst1
    511 %endmacro
    512 
    513 %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
        ; 4-point inverse DCT on four full registers (one input per register).
        ; The outputs out0-out3 replace src1-src4 in order. The last argument
        ; is the register number holding the rounding constant. Both scratch
        ; registers are clobbered.
    514    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
    515    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
    516    psubsw              m%3, m%1, m%2                      ;out2
    517    paddsw              m%2, m%1                           ;out1
    518    paddsw              m%1, m%5, m%4                      ;out0
    519    psubsw              m%4, m%5                           ;out3
    520 %endmacro
    521 
    522 %macro WRITE_4X8 4 ;row[1-4]
        ; Adds m0-m3 to a 4-wide, 8-row block at dstq: m0/m1 go to the first
        ; four rows, then dstq is advanced by four rows and m2/m3 are written
        ; with the same row order. Scratch: m4-m6 (and r2 inside WRITE_4X4).
    523    WRITE_4X4             0, 1, 4, 5, 6, %1, %2, %3, %4
    524    lea                dstq, [dstq+strideq*4]
    525    WRITE_4X4             2, 3, 4, 5, 6, %1, %2, %3, %4
    526 %endmacro
    527 
    528 %macro INV_4X8 0
        ; Transposes the first-pass output held in m0-m3 with word and dword
        ; unpacks so that m0-m3 end up holding the second-pass inputs
        ; in0/in1, in2/in3, in4/in5, in6/in7 (low/high). Clobbers m4.
    529    punpckhwd            m4, m2, m3
    530    punpcklwd            m2, m3
    531    punpckhwd            m3, m0, m1
    532    punpcklwd            m0, m1
    533    punpckhdq            m1, m0, m2                  ;low: in2 high: in3
    534    punpckldq            m0, m2                      ;low: in0 high: in1
    535    punpckldq            m2, m3, m4                  ;low: in4 high: in5
    536    punpckhdq            m3, m4                      ;low: in6 high: in7
    537 %endmacro
    538 
    539 %macro INV_TXFM_4X8_FN 2 ; type1, type2
        ; Public 4x8 entry point for the given transform pair. For dct_dct the
        ; code after INV_TXFM_FN is the path taken when eob == 0: the first
        ; coefficient is broadcast, scaled three times by 2896/4096 and once by
        ; pw_2048, copied to m1-m3 and added to the whole block.
    540    INV_TXFM_FN          %1, %2, 4x8, 8
    541 %ifidn %1_%2, dct_dct
    542    pshuflw              m0, [coeffq], q0000
    543    punpcklqdq           m0, m0
    544    mova                 m1, [o(pw_2896x8)]
    545    pmulhrsw             m0, m1
    546    mov           [coeffq], eobd                  ; eob is zero on this path, so this stores 0
    547    pmulhrsw             m0, m1
    548    pmulhrsw             m0, m1
    549    pmulhrsw             m0, [o(pw_2048)]
    550    mova                 m1, m0
    551    mova                 m2, m0
    552    mova                 m3, m0
    553    TAIL_CALL m(iadst_4x8_internal_8bpc).end3     ; .end3 = WRITE_4X8 0, 1, 2, 3 followed by RET
    554 %endif
    555 %endmacro
    556 
    557 INIT_XMM ssse3
    558 INV_TXFM_4X8_FN dct, dct
    559 INV_TXFM_4X8_FN dct, adst
    560 INV_TXFM_4X8_FN dct, flipadst
    561 INV_TXFM_4X8_FN dct, identity
    562 
    563 cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; First pass of the 4x8 inverse DCT. All 32 coefficients are scaled by
        ; 2896/4096 while being loaded, transformed by the 8x4 DCT core
        ; (defined elsewhere), then handed to the shared transpose at
        ; iadst_4x8_internal_8bpc.pass1_end, which jumps to tx2q.
    564    mova                 m3, [o(pw_2896x8)]
    565    pmulhrsw             m0, m3, [coeffq+16*0]
    566    pmulhrsw             m1, m3, [coeffq+16*1]
    567    pmulhrsw             m2, m3, [coeffq+16*2]
    568    pmulhrsw             m3,     [coeffq+16*3]
    569 
    570 .pass1:
    571    call m(idct_8x4_internal_8bpc).main
    572    jmp m(iadst_4x8_internal_8bpc).pass1_end
    573 
    574 .pass2:
        ; Second pass: 8-point DCT, then swap the qwords of m1 and m3 so rows
        ; come out in ascending order (out2/out3, out6/out7) before the shared
        ; tail scales by pw_2048, clears the coefficients and writes the block.
    575    call .main
    576    shufps               m1, m1, q1032
    577    shufps               m3, m3, q1032
    578    mova                 m4, [o(pw_2048)]
    579    jmp m(iadst_4x8_internal_8bpc).end2
    580 
    581 ALIGN function_align
    582 cglobal_label .main
        ; 8-point inverse DCT on m0-m3; see IDCT8_1D_PACKED for the layout.
    583    IDCT8_1D_PACKED
    584    ret
    585 
    586 
    587 INV_TXFM_4X8_FN adst, dct
    588 INV_TXFM_4X8_FN adst, adst
    589 INV_TXFM_4X8_FN adst, flipadst
    590 INV_TXFM_4X8_FN adst, identity
    591 
    592 cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; First pass of the 4x8 inverse ADST. All 32 coefficients are scaled by
        ; 2896/4096 while being loaded, transformed by the 8x4 ADST core
        ; (defined elsewhere), transposed and passed on to tx2q.
    593    mova                 m3, [o(pw_2896x8)]
    594    pmulhrsw             m0, m3, [coeffq+16*0]
    595    pmulhrsw             m1, m3, [coeffq+16*1]
    596    pmulhrsw             m2, m3, [coeffq+16*2]
    597    pmulhrsw             m3,     [coeffq+16*3]
    598 
    599 .pass1:
    600    call m(iadst_8x4_internal_8bpc).main
    601 
        ; .pass1_end: transpose and jump to the second pass; shared with the
        ; DCT and identity 4x8 first passes.
    602 .pass1_end:
    603    INV_4X8
    604    jmp                tx2q
    605 
    606 .pass2:
        ; .main expects m0 and m1 with their qwords swapped (in1/in0, in3/in2).
    607    shufps               m0, m0, q1032
    608    shufps               m1, m1, q1032
    609    call .main
    610    mova                 m4, [o(pw_2048)]
    611    pxor                 m5, m5
    612    psubw                m5, m4                    ; m5 = -2048
    613 
        ; .end: build the scale vector from m4 (low qword) and m5 (high qword).
        ; Here that is +2048 / -2048, which undoes the negated high halves
        ; (-out1, -out3, ...) that .main produces.
    614 .end:
    615    punpcklqdq           m4, m5
    616 
        ; .end2: scale m0-m3 by m4 via pmulhrsw and zero the 64-byte
        ; coefficient buffer.
    617 .end2:
    618    pmulhrsw             m0, m4
    619    pmulhrsw             m1, m4
    620    pmulhrsw             m2, m4
    621    pmulhrsw             m3, m4
    622    pxor                 m5, m5
    623    mova      [coeffq+16*0], m5
    624    mova      [coeffq+16*1], m5
    625    mova      [coeffq+16*2], m5
    626    mova      [coeffq+16*3], m5
    627 
        ; .end3: add m0-m3 to the 4x8 destination block and return.
    628 .end3:
    629    WRITE_4X8             0, 1, 2, 3
    630    RET
    631 
    632 ALIGN function_align
    633 cglobal_label .main
        ; 8-point inverse ADST on packed registers.
        ;   in:  m0 = in1/in0, m1 = in3/in2, m2 = in4/in5, m3 = in6/in7 (low/high)
        ;   out: m0 = out0/-out1, m1 = out2/-out3, m2 = out4/-out5, m3 = out6/-out7
        ; Clobbers m4-m7 (m6 is left holding pd_2048).
    634    mova                 m6, [o(pd_2048)]
    635    punpckhwd            m4, m3, m0                ;unpacked in7 in0
    636    punpckhwd            m5, m2, m1                ;unpacked in5 in2
    637    punpcklwd            m1, m2                    ;unpacked in3 in4
    638    punpcklwd            m0, m3                    ;unpacked in1 in6
    639    ITX_MUL2X_PACK        4, 2, 6,  401, 4076      ;low:  t0a   high:  t1a
    640    ITX_MUL2X_PACK        5, 2, 6, 1931, 3612      ;low:  t2a   high:  t3a
    641    ITX_MUL2X_PACK        1, 2, 6, 3166, 2598      ;low:  t4a   high:  t5a
    642    ITX_MUL2X_PACK        0, 2, 6, 3920, 1189      ;low:  t6a   high:  t7a
    643 
    644    psubsw               m3, m4, m1                ;low:  t4    high:  t5
    645    paddsw               m4, m1                    ;low:  t0    high:  t1
    646    psubsw               m2, m5, m0                ;low:  t6    high:  t7
    647    paddsw               m5, m0                    ;low:  t2    high:  t3
    648 
    649    shufps               m1, m3, m2, q1032
    650    punpckhwd            m2, m1
    651    punpcklwd            m3, m1
    652    ITX_MUL2X_PACK        3, 0, 6, 1567, 3784, 1   ;low:  t5a   high:  t4a
    653    ITX_MUL2X_PACK        2, 0, 6, 3784, 1567      ;low:  t7a   high:  t6a
    654 
    655    psubsw               m1, m4, m5                ;low:  t2    high:  t3
    656    paddsw               m4, m5                    ;low:  out0  high: -out7
    657    psubsw               m5, m3, m2                ;low:  t7    high:  t6
    658    paddsw               m3, m2                    ;low:  out6  high: -out1
    659    shufps               m0, m4, m3, q3210         ;low:  out0  high: -out1
    660    shufps               m3, m4, q3210             ;low:  out6  high: -out7
    661 
    662    mova                 m2, [o(pw_2896_m2896)]
    663    mova                 m7, [o(pw_2896_2896)]
    664    shufps               m4, m1, m5, q1032         ;low:  t3    high:  t7
    665    shufps               m1, m5, q3210             ;low:  t2    high:  t6
    666    punpcklwd            m5, m1, m4
    667    punpckhwd            m1, m4
    668    pmaddwd              m4, m2, m1                ;-out5
    669    pmaddwd              m2, m5                    ; out4
    670    pmaddwd              m1, m7                    ; out2
    671    pmaddwd              m5, m7                    ;-out3
    672    REPX      {paddd x, m6}, m4, m2, m1, m5
    673    REPX      {psrad x, 12}, m4, m2, m1, m5
    674    packssdw             m1, m5                    ;low:  out2  high: -out3
    675    packssdw             m2, m4                    ;low:  out4  high: -out5
    676    ret
    677 
    678 INV_TXFM_4X8_FN flipadst, dct
    679 INV_TXFM_4X8_FN flipadst, adst
    680 INV_TXFM_4X8_FN flipadst, flipadst
    681 INV_TXFM_4X8_FN flipadst, identity
    682 
    683 cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; First pass of the flipped 4x8 inverse ADST: same load/scale and ADST
        ; core as the plain ADST, followed by a transpose that takes the source
        ; registers in reversed order, then a jump to tx2q.
    684    mova                 m3, [o(pw_2896x8)]
    685    pmulhrsw             m0, m3, [coeffq+16*0]
    686    pmulhrsw             m1, m3, [coeffq+16*1]
    687    pmulhrsw             m2, m3, [coeffq+16*2]
    688    pmulhrsw             m3,     [coeffq+16*3]
    689 
    690 .pass1:
    691    call m(iadst_8x4_internal_8bpc).main
    692 
    693    punpcklwd            m4, m3, m2
    694    punpckhwd            m3, m2
    695    punpcklwd            m5, m1, m0
    696    punpckhwd            m1, m0
    697    punpckldq            m2, m3, m1                  ;low: in4 high: in5
    698    punpckhdq            m3, m1                      ;low: in6 high: in7
    699    punpckldq            m0, m4, m5                  ;low: in0 high: in1
    700    punpckhdq            m1, m4, m5                  ;low: in2 high: in3
    701    jmp                tx2q
    702 
    703 .pass2:
        ; Second pass: run the shared ADST core, then reverse the output order
        ; (register order m3..m0, each with its qwords swapped).
    704    shufps               m0, m0, q1032
    705    shufps               m1, m1, q1032
    706    call m(iadst_4x8_internal_8bpc).main
    707 
    708    mova                 m4, m0
    709    mova                 m5, m1
    710    pshufd               m0, m3, q1032
    711    pshufd               m1, m2, q1032
    712    pshufd               m2, m5, q1032
    713    pshufd               m3, m4, q1032
        ; After the qword swap the negated outputs sit in the low halves, so
        ; the scale vector built at .end is -2048 (low) / +2048 (high).
    714    mova                 m5, [o(pw_2048)]
    715    pxor                 m4, m4
    716    psubw                m4, m5
    717    jmp m(iadst_4x8_internal_8bpc).end
    718 
    719 INV_TXFM_4X8_FN identity, dct
    720 INV_TXFM_4X8_FN identity, adst
    721 INV_TXFM_4X8_FN identity, flipadst
    722 INV_TXFM_4X8_FN identity, identity
    723 
    724 cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; First pass of the 4x8 identity transform: coefficients are scaled by
        ; 2896/4096 while being loaded, then each value x becomes
        ; x + pmulhrsw(x, 1697*8) with signed saturation before the shared
        ; transpose at iadst_4x8_internal_8bpc.pass1_end.
    725    mova                 m3, [o(pw_2896x8)]
    726    pmulhrsw             m0, m3, [coeffq+16*0]
    727    pmulhrsw             m1, m3, [coeffq+16*1]
    728    pmulhrsw             m2, m3, [coeffq+16*2]
    729    pmulhrsw             m3,     [coeffq+16*3]
    730 
    731 .pass1:
    732    mova                 m7, [o(pw_1697x8)]
    733    pmulhrsw             m4, m7, m0
    734    pmulhrsw             m5, m7, m1
    735    pmulhrsw             m6, m7, m2
    736    pmulhrsw             m7, m3
    737    paddsw               m0, m4
    738    paddsw               m1, m5
    739    paddsw               m2, m6
    740    paddsw               m3, m7
    741    jmp m(iadst_4x8_internal_8bpc).pass1_end
    742 
    743 .pass2:
        ; Second pass: only the final scaling differs - the shared tail
        ; multiplies by pw_4096 (rounded x/8) instead of pw_2048.
    744    mova                 m4, [o(pw_4096)]
    745    jmp m(iadst_4x8_internal_8bpc).end2
    746 
    747 
    748 %macro WRITE_8X2 5       ;coefs[1-2], tmp[1-3]
        ; Adds two rows of 8 residual words to the 8-bit pixels at dstq and
        ; dstq+strideq and stores the sums with unsigned saturation.
        ;   coefs[1-2]: residuals for row 0 and row 1; a number is taken as a
        ;               register index, anything else is used verbatim as the
        ;               paddw source operand
        ;   tmp[1-3]:   scratch register numbers
    749    movq                 m%3, [dstq        ]
    750    movq                 m%4, [dstq+strideq]
    751    pxor                 m%5, m%5
    752    punpcklbw            m%3, m%5                 ;extend byte to word
    753    punpcklbw            m%4, m%5                 ;extend byte to word
    754 %ifnum %1
    755    paddw                m%3, m%1
    756 %else
    757    paddw                m%3, %1
    758 %endif
    759 %ifnum %2
    760    paddw                m%4, m%2
    761 %else
    762    paddw                m%4, %2
    763 %endif
    764    packuswb             m%3, m%4                 ; row 0 in the low qword, row 1 in the high qword
    765    movq      [dstq        ], m%3
    766    punpckhqdq           m%3, m%3
    767    movq      [dstq+strideq], m%3
    768 %endmacro
    769 
    770 %macro WRITE_8X4 7      ;coefs[1-4], tmp[1-3]
        ; Add four rows of residuals to four rows of eight pixels using two
        ; WRITE_8X2 invocations. The scratch registers (args 5-7) are shared by both.
        ; Side effect: dstq is left advanced by two rows (strideq*2), not four.
    771    WRITE_8X2             %1, %2, %5, %6, %7
    772    lea                dstq, [dstq+strideq*2]
    773    WRITE_8X2             %3, %4, %5, %6, %7
    774 %endmacro
    775 
    776 %macro INV_TXFM_8X4_FN 2 ; type1, type2
        ; Emits the 8x4 wrapper for one (type1, type2) pair through INV_TXFM_FN
        ; (defined outside this chunk). For dct_dct only, it also emits a shortcut
        ; that derives all outputs from the first coefficient.
        ; NOTE(review): presumably INV_TXFM_FN routes only DC-only blocks here -- confirm.
    777    INV_TXFM_FN          %1, %2, 8x4, 8
    778 %ifidn %1_%2, dct_dct
    779    pshuflw              m0, [coeffq], q0000             ;replicate the first coefficient across the low four words
    780    punpcklqdq           m0, m0                          ;and into the high four words as well
    781    mova                 m1, [o(pw_2896x8)]
    782    pmulhrsw             m0, m1
    783    pmulhrsw             m0, m1
    784    mova                 m2, [o(pw_2048)]
    785    pmulhrsw             m0, m1                          ;third multiply by pw_2896x8
    786    pmulhrsw             m0, m2                          ;final rounding multiply by pw_2048
    787    mova                 m1, m0                          ;all four output rows are identical
    788    mova                 m2, m0
    789    mova                 m3, m0
    790    TAIL_CALL m(iadst_8x4_internal_8bpc).end2            ;.end2 clears the coefficients and writes m0-m3 to dst
    791 %endif
    792 %endmacro
    793 
    794 INV_TXFM_8X4_FN dct, dct                        ; 8x4 wrappers with type1 = dct
    795 INV_TXFM_8X4_FN dct, adst
    796 INV_TXFM_8X4_FN dct, flipadst
    797 INV_TXFM_8X4_FN dct, identity
    798 
    799 cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Inverse-DCT kernel for 8x4 blocks, 8 bpc.
        ; Pass 1: load four coefficient rows scaled by pw_2896x8, run the .main
        ;         routine of idct_4x8_internal_8bpc (outside this chunk), regroup
        ;         the results into in0..in3 (m0..m3), then jump to tx2q.
        ; Pass 2: run the local 4-point .main and finish in iadst_8x4's .end.
    800    mova                 m3, [o(pw_2896x8)]
    801    pmulhrsw             m0, m3, [coeffq+16*0]
    802    pmulhrsw             m1, m3, [coeffq+16*1]
    803    pmulhrsw             m2, m3, [coeffq+16*2]
    804    pmulhrsw             m3,     [coeffq+16*3]
    805 
    806    call m(idct_4x8_internal_8bpc).main
    807 
           ; byte shuffles followed by dword/qword unpacks rearrange the four
           ; result registers into the row layout expected by the second pass
    808    mova                 m4, [o(deint_shuf1)]
    809    mova                 m5, [o(deint_shuf2)]
    810    pshufb               m0, m4
    811    pshufb               m1, m5
    812    pshufb               m2, m4
    813    pshufb               m3, m5
    814    punpckhdq            m4, m0, m1
    815    punpckldq            m0, m1
    816    punpckhdq            m5, m2, m3
    817    punpckldq            m2, m3
    818    punpckhqdq           m1, m0, m2                      ;in1
    819    punpcklqdq           m0, m2                          ;in0
    820    punpckhqdq           m3, m4, m5                      ;in3
    821    punpcklqdq           m2 ,m4, m5                      ;in2
    822    jmp                tx2q                              ;continue with the second transform
    823 
    824 .pass2:
    825    call .main
    826    jmp m(iadst_8x4_internal_8bpc).end                   ;shared tail: scale by pw_2048, clear coefficients, write 8x4
    827 
    828 ALIGN function_align
    829 cglobal_label .main
           ; 4-point inverse DCT on m0-m3 via IDCT4_1D (defined outside this chunk).
           ; m4 and m5 are passed as temporaries, m6 is loaded with pd_2048.
    830    mova                 m6, [o(pd_2048)]
    831    IDCT4_1D             0, 1, 2, 3, 4, 5, 6
    832    ret
    833 
    834 INV_TXFM_8X4_FN adst, dct                       ; 8x4 wrappers with type1 = adst
    835 INV_TXFM_8X4_FN adst, adst
    836 INV_TXFM_8X4_FN adst, flipadst
    837 INV_TXFM_8X4_FN adst, identity
    838 
    839 cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Inverse-ADST kernel for 8x4 blocks, 8 bpc.
        ; Pass 1: load four coefficient rows scaled by pw_2896x8, run the .main
        ;         routine of iadst_4x8_internal_8bpc (outside this chunk), negate
        ;         part of the result, transpose into in0..in3 and jump to tx2q.
        ; Pass 2: local 4-point .main, then the .end/.end2/.end3 tail. Other 8x4
        ;         kernels and the dct_dct shortcut in this chunk jump into that tail.
    840    mova                 m3, [o(pw_2896x8)]
    841    pmulhrsw             m0, m3, [coeffq+16*0]
    842    pmulhrsw             m1, m3, [coeffq+16*1]
    843    pmulhrsw             m2, m3, [coeffq+16*2]
    844    pmulhrsw             m3,     [coeffq+16*3]
    845 
    846    shufps               m0, m0, q1032                   ;swap the two qword halves of m0
    847    shufps               m1, m1, q1032                   ;swap the two qword halves of m1
    848    call m(iadst_4x8_internal_8bpc).main
    849 
    850    punpckhwd            m4, m0, m1
    851    punpcklwd            m0, m1
    852    punpckhwd            m1, m2, m3
    853    punpcklwd            m2, m3
    854    pxor                 m5, m5
    855    psubsw               m3, m5, m1                      ;m3 = 0 - m1 (saturating negate)
    856    psubsw               m5, m4                          ;m5 = 0 - m4 (saturating negate)
    857    punpckhdq            m4, m5, m3
    858    punpckldq            m5, m3
    859    punpckhdq            m3, m0, m2
    860    punpckldq            m0, m2
    861    punpckhwd            m1, m0, m5      ;in1
    862    punpcklwd            m0, m5          ;in0
    863    punpcklwd            m2, m3, m4      ;in2
    864    punpckhwd            m3, m4          ;in3
    865    jmp              tx2q
    866 
    867 .pass2:
    868    call .main
    869 
    870 .end:
           ; final rounding multiply of all four rows by pw_2048
    871    mova                 m4, [o(pw_2048)]
    872    pmulhrsw             m0, m4
    873    pmulhrsw             m1, m4
    874    pmulhrsw             m2, m4
    875    pmulhrsw             m3, m4
    876 
    877 .end2:
           ; zero the four 16-byte coefficient rows that were consumed
    878    pxor                 m6, m6
    879    mova      [coeffq+16*0], m6
    880    mova      [coeffq+16*1], m6
    881    mova      [coeffq+16*2], m6
    882    mova      [coeffq+16*3], m6
    883 .end3:
    884    WRITE_8X4             0, 1, 2, 3, 4, 5, 6            ;add m0-m3 to dst; m4-m6 are scratch
    885    RET
    886 
    887 ALIGN function_align
    888 cglobal_label .main
           ; 4-point inverse ADST over eight columns.
           ;   in : m0..m3 = in0..in3 (eight words each)
           ;   out: m0..m3 = out0..out3; m4-m7 are clobbered
           ; The words are interleaved into pairs so pmaddwd can form 32-bit
           ; two-term sums; m0/m1 carry the low four columns and m6/m7 the high
           ; four. Each sum gets pd_2048 added and is shifted right by 12 before
           ; being packed back to words.
    889    punpckhwd            m6, m0, m2                    ;unpacked in0 in2
    890    punpcklwd            m0, m2                        ;unpacked in0 in2
    891    punpckhwd            m7, m1, m3                    ;unpacked in1 in3
    892    punpcklwd            m1, m3                        ;unpacked in1 in3
    893 
    894    mova                 m2, [o(pw_3344_m3344)]
    895    mova                 m4, [o(pw_0_3344)]
    896    pmaddwd              m3, m2, m6                    ;3344 * in0 - 3344 * in2
    897    pmaddwd              m5, m4, m7                    ;3344 * in3
    898    pmaddwd              m2, m0                        ;same products for the low four columns
    899    pmaddwd              m4, m1
    900    paddd                m3, m5
    901    paddd                m2, m4
    902    mova                 m4, [o(pd_2048)]
    903    paddd                m3, m4                        ;t2 + 2048
    904    paddd                m2, m4
    905    psrad                m3, 12
    906    psrad                m2, 12
    907    packssdw             m2, m3                        ;out2
    908 
           ; low four columns: out0, out1 and out3
    909    pmaddwd              m4, m0, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
    910    pmaddwd              m0, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
    911    pmaddwd              m3, m1, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
    912    pmaddwd              m5, m1, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
    913    paddd                m3, m4                        ;t0 + t3
    914 
    915    pmaddwd              m1, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
    916    mova                 m4, [o(pd_2048)]
    917    paddd                m0, m4
    918    paddd                m4, m3                        ;t0 + t3 + 2048
    919    paddd                m5, m0                        ;t1 + t3 + 2048
    920    paddd                m3, m0
    921    paddd                m3, m1                        ;t0 + t1 - t3 + 2048
    922 
    923    psrad                m4, 12                        ;out0
    924    psrad                m5, 12                        ;out1
    925    psrad                m3, 12                        ;out3
    926    packssdw             m0, m4, m5                    ;low: out0  high: out1
    927 
           ; high four columns: the same computation on m6/m7
    928    pmaddwd              m4, m6, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
    929    pmaddwd              m6, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
    930    pmaddwd              m1, m7, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
    931    pmaddwd              m5, m7, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
    932    paddd                m1, m4                        ;t0 + t3
    933    pmaddwd              m7, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
    934 
    935    mova                 m4, [o(pd_2048)]
    936    paddd                m6, m4
    937    paddd                m4, m1                        ;t0 + t3 + 2048
    938    paddd                m5, m6                        ;t1 + t3 + 2048
    939    paddd                m1, m6
    940    paddd                m1, m7                        ;t0 + t1 - t3 + 2048
    941 
    942    psrad                m4, 12                        ;out0
    943    psrad                m5, 12                        ;out1
    944    psrad                m1, 12                        ;out3
    945    packssdw             m3, m1                        ;out3
    946    packssdw             m4, m5                        ;low: out0  high: out1
    947 
           ; m0 and m4 each hold out0 | out1 for four columns; regroup per output row
    948    punpckhqdq           m1, m0, m4                    ;out1
    949    punpcklqdq           m0, m4                        ;out0
    950    ret
    951 
    952 INV_TXFM_8X4_FN flipadst, dct                   ; 8x4 wrappers with type1 = flipadst
    953 INV_TXFM_8X4_FN flipadst, adst
    954 INV_TXFM_8X4_FN flipadst, flipadst
    955 INV_TXFM_8X4_FN flipadst, identity
    956 
    957 cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Flipped inverse-ADST kernel for 8x4 blocks, 8 bpc.
        ; Uses the same .main routines as iadst_8x4_internal_8bpc; the difference is
        ; that the result registers are consumed in reverse order (m3..m0) after
        ; each transform.
    958    mova                 m3, [o(pw_2896x8)]
    959    pmulhrsw             m0, m3, [coeffq+16*0]
    960    pmulhrsw             m1, m3, [coeffq+16*1]
    961    pmulhrsw             m2, m3, [coeffq+16*2]
    962    pmulhrsw             m3,     [coeffq+16*3]
    963 
    964    shufps               m0, m0, q1032                   ;swap the two qword halves of m0
    965    shufps               m1, m1, q1032                   ;swap the two qword halves of m1
    966    call m(iadst_4x8_internal_8bpc).main
    967 
           ; same interleave as the adst pass 1, but with the sources taken in
           ; reverse register order (m3, m2, m1, m0)
    968    punpckhwd            m5, m3, m2
    969    punpcklwd            m3, m2
    970    punpckhwd            m2, m1, m0
    971    punpcklwd            m1, m0
    972 
    973    pxor                 m0, m0
    974    psubsw               m4, m0, m2                      ;m4 = 0 - m2 (saturating negate)
    975    psubsw               m0, m5                          ;m0 = 0 - m5 (saturating negate)
    976    punpckhdq            m2, m0, m4
    977    punpckldq            m0, m4
    978    punpckhdq            m4, m3, m1
    979    punpckldq            m3, m1
    980    punpckhwd            m1, m0, m3      ;in1
    981    punpcklwd            m0, m3          ;in0
    982    punpckhwd            m3, m2, m4      ;in3
    983    punpcklwd            m2, m4          ;in2
    984    jmp                  tx2q
    985 
    986 .pass2:
    987    call m(iadst_8x4_internal_8bpc).main
           ; reverse the row order: (m0, m1, m2, m3) <- (m3, m2, m1, m0)
    988    mova                 m4, m0
    989    mova                 m5, m1
    990    mova                 m0, m3
    991    mova                 m1, m2
    992    mova                 m2, m5
    993    mova                 m3, m4
    994    jmp m(iadst_8x4_internal_8bpc).end                   ;shared tail: scale by pw_2048, clear coefficients, write 8x4
    995 
    996 INV_TXFM_8X4_FN identity, dct                   ; 8x4 wrappers with type1 = identity
    997 INV_TXFM_8X4_FN identity, adst
    998 INV_TXFM_8X4_FN identity, flipadst
    999 INV_TXFM_8X4_FN identity, identity
   1000 
   1001 cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Identity-transform kernel for 8x4 blocks, 8 bpc.
        ; Pass 1: load four coefficient rows scaled by pw_2896x8, double them with
        ;         saturation, transpose the words into in0..in3, jump to tx2q.
        ; Pass 2: x += pmulhrsw(x, pw_1697x8), then the shared iadst_8x4 tail.
   1002    mova                 m3, [o(pw_2896x8)]
   1003    pmulhrsw             m0, m3, [coeffq+16*0]
   1004    pmulhrsw             m1, m3, [coeffq+16*1]
   1005    pmulhrsw             m2, m3, [coeffq+16*2]
   1006    pmulhrsw             m3,     [coeffq+16*3]
   1007    paddsw               m0, m0                          ;x *= 2 with signed saturation
   1008    paddsw               m1, m1
   1009    paddsw               m2, m2
   1010    paddsw               m3, m3
   1011 
           ; word/dword interleaves that transpose the data for the second pass
   1012    punpckhwd            m4, m0, m1
   1013    punpcklwd            m0, m1
   1014    punpckhwd            m1, m2, m3
   1015    punpcklwd            m2, m3
   1016    punpckhdq            m5, m4, m1
   1017    punpckldq            m4, m1
   1018    punpckhdq            m3, m0, m2
   1019    punpckldq            m0, m2
   1020    punpckhwd            m1, m0, m4      ;in1
   1021    punpcklwd            m0, m4          ;in0
   1022    punpcklwd            m2, m3, m5      ;in2
   1023    punpckhwd            m3, m5          ;in3
   1024    jmp                tx2q
   1025 
   1026 .pass2:
   1027    mova                 m7, [o(pw_1697x8)]
   1028    pmulhrsw             m4, m7, m0
   1029    pmulhrsw             m5, m7, m1
   1030    pmulhrsw             m6, m7, m2
   1031    pmulhrsw             m7, m3                          ;multiplier register reused for the last product
   1032    paddsw               m0, m4
   1033    paddsw               m1, m5
   1034    paddsw               m2, m6
   1035    paddsw               m3, m7
   1036    jmp m(iadst_8x4_internal_8bpc).end                   ;shared tail: scale by pw_2048, clear coefficients, write 8x4
   1037 
   1038 %macro INV_TXFM_8X8_FN 2 ; type1, type2
        ; Emits the 8x8 wrapper for one (type1, type2) pair through INV_TXFM_FN
        ; (defined outside this chunk), passing 16*4 as its last argument.
        ; For dct_dct only, it also emits a shortcut that derives every output
        ; pixel offset from the first coefficient.
        ; NOTE(review): presumably INV_TXFM_FN routes only DC-only blocks here -- confirm.
   1039    INV_TXFM_FN          %1, %2, 8x8, 8, 16*4
   1040 %ifidn %1_%2, dct_dct
   1041    pshuflw              m0, [coeffq], q0000             ;replicate the first coefficient across the low four words
   1042    punpcklwd            m0, m0                          ;and then across all eight words
   1043    mova                 m1, [o(pw_2896x8)]
   1044    pmulhrsw             m0, m1
   1045    mova                 m2, [o(pw_16384)]
   1046    mov            [coeffq], eobd                        ;NOTE(review): presumably eob is 0 on this path, clearing the DC coefficient -- confirm
   1047    pmulhrsw             m0, m2
   1048    psrlw                m2, 3                           ;shift the pw_16384 words right by 3 (16384 -> 2048 if the name gives the value)
   1049    pmulhrsw             m0, m1
   1050    pmulhrsw             m0, m2
   1051 .end:
   1052    mov                 r3d, 2                           ;two iterations of four rows each
   1053    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)]
   1054 .loop:
   1055    WRITE_8X4             0, 0, 0, 0, 1, 2, 3            ;every row adds the same m0; m1-m3 are scratch
   1056    lea                dstq, [dstq+strideq*2]            ;WRITE_8X4 advanced two rows; this adds the other two
   1057    dec                 r3d
   1058    jg .loop
   1059    jmp                tx2q                              ;exit through the address held in tx2q
   1060 .end3:
   1061    RET
   1062 %endif
   1063 %endmacro
   1064 
   1065 %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
        ; Load eight rows into m0-m7 from src + stride*n, n = 0..7.
        ; With is_rect2 non-zero every row is multiplied by pw_2896x8 (pmulhrsw)
        ; while loading; otherwise the rows are loaded unchanged. is_rect2
        ; defaults to 0.
   1066 %if %3
   1067    mova                 m7, [o(pw_2896x8)]
   1068    pmulhrsw             m0, m7, [%1+%2*0]
   1069    pmulhrsw             m1, m7, [%1+%2*1]
   1070    pmulhrsw             m2, m7, [%1+%2*2]
   1071    pmulhrsw             m3, m7, [%1+%2*3]
   1072    pmulhrsw             m4, m7, [%1+%2*4]
   1073    pmulhrsw             m5, m7, [%1+%2*5]
   1074    pmulhrsw             m6, m7, [%1+%2*6]
   1075    pmulhrsw             m7, [%1+%2*7]                   ;m7 held the multiplier, so row 7 is loaded last
   1076 %else
   1077    mova                 m0, [%1+%2*0]
   1078    mova                 m1, [%1+%2*1]
   1079    mova                 m2, [%1+%2*2]
   1080    mova                 m3, [%1+%2*3]
   1081    mova                 m4, [%1+%2*4]
   1082    mova                 m5, [%1+%2*5]
   1083    mova                 m6, [%1+%2*6]
   1084    mova                 m7, [%1+%2*7]
   1085 %endif
   1086 %endmacro
   1087 
   1088 %macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
        ; Odd half of an 8-point inverse DCT, built from three ITX_MULSUB_2W
        ; rotations (macro defined outside this chunk) and saturating butterflies.
        ; Per the step comments below, on exit arg 1 holds t4, arg 2 holds t5,
        ; arg 5 holds t6 and arg 4 holds t7. Args 3 and 6 are scratch and arg 7
        ; is the register passed through to ITX_MULSUB_2W as pd_2048.
   1089    ITX_MULSUB_2W         %1, %4, %5, %6, %7,  799, 4017    ;t4a, t7a
   1090    ITX_MULSUB_2W         %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
   1091    psubsw               m%2, m%4, m%5                      ;t6a
   1092    paddsw               m%4, m%5                           ;t7
   1093    psubsw               m%5, m%1, m%3                      ;t5a
   1094    paddsw               m%1, m%3                           ;t4
   1095    ITX_MULSUB_2W         %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
   1096 %endmacro
   1097 
   1098 INV_TXFM_8X8_FN dct, dct                        ; 8x8 wrappers with type1 = dct
   1099 INV_TXFM_8X8_FN dct, adst
   1100 INV_TXFM_8X8_FN dct, flipadst
   1101 INV_TXFM_8X8_FN dct, identity
   1102 
   1103 cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Inverse-DCT kernel for 8x8 blocks, 8 bpc.
        ; Uses three 16-byte stack slots, [rsp+gprsize+16*0..2], as spill space.
        ; Inside routines reached through `call` the same slots are addressed as
        ; [rsp+gprsize*2+16*n], because the call pushed one return address.
        ; The labels .pass1_end2, .pass1_end3, .end2, .end3 and .end4 are also
        ; jump targets for the other 8x8 kernels in this chunk.
   1104    LOAD_8ROWS          coeffq, 16
   1105 
   1106 .pass1:
   1107    call .main
   1108 
   1109 .pass1_end:
   1110    mova                    m7, [o(pw_16384)]              ;pass-1 rounding multiplier
   1111 
   1112 .pass1_end1:
   1113    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
   1114    mova    [rsp+gprsize+16*1], m6                         ;row 6 is parked in slot 1 for the transpose
   1115 
   1116 .pass1_end2:
   1117    REPX      {pmulhrsw x, m7}, m1, m3, m5
   1118    pmulhrsw                m7, [rsp+gprsize+16*0]         ;row 7 was left in slot 0 by .main
   1119 
   1120 cglobal_label .pass1_end3
           ; 8x8 word transpose. Expects rows 0-5 in m0-m5, row 7 in m7 and row 6
           ; in stack slot 1. The digit pairs in the comments are row/column
           ; indices of the words held in each register after the instruction.
   1121    punpcklwd               m6, m1, m5             ;10 50 11 51 12 52 13 53
   1122    punpckhwd               m1, m5                 ;14 54 15 55 16 56 17 57
   1123    punpckhwd               m5, m0, m4             ;04 44 05 45 06 46 07 47
   1124    punpcklwd               m0, m4                 ;00 40 01 41 02 42 03 43
   1125    punpckhwd               m4, m3, m7             ;34 74 35 75 36 76 37 77
   1126    punpcklwd               m3, m7                 ;30 70 31 71 32 72 33 73
   1127    punpckhwd               m7, m1, m4             ;16 36 56 76 17 37 57 77
   1128    punpcklwd               m1, m4                 ;14 34 54 74 15 35 55 75
   1129    punpckhwd               m4, m6, m3             ;12 32 52 72 13 33 53 73
   1130    punpcklwd               m6, m3                 ;10 30 50 70 11 31 51 71
   1131    mova    [rsp+gprsize+16*2], m6                 ;spill to slot 2 to free m6
   1132    mova                    m6, [rsp+gprsize+16*1] ;fetch row 6 from slot 1
   1133    punpckhwd               m3, m2, m6             ;24 64 25 65 26 66 27 67
   1134    punpcklwd               m2, m6                 ;20 60 21 61 22 62 23 63
   1135    punpckhwd               m6, m5, m3             ;06 26 46 66 07 27 47 67
   1136    punpcklwd               m5, m3                 ;04 24 44 64 05 25 45 65
   1137    punpckhwd               m3, m0, m2             ;02 22 42 62 03 23 43 63
   1138    punpcklwd               m0, m2                 ;00 20 40 60 01 21 41 61
   1139 
   1140    punpckhwd               m2, m6, m7             ;07 17 27 37 47 57 67 77
   1141    punpcklwd               m6, m7                 ;06 16 26 36 46 56 66 76
   1142    mova    [rsp+gprsize+16*0], m2                 ;column 7 goes to slot 0 (reloaded into m7 below)
   1143    punpcklwd               m2, m3, m4             ;02 12 22 32 42 52 62 72
   1144    punpckhwd               m3, m4                 ;03 13 23 33 43 53 63 73
   1145    punpcklwd               m4, m5, m1             ;04 14 24 34 44 54 64 74
   1146    punpckhwd               m5, m1                 ;05 15 25 35 45 55 65 75
   1147    mova                    m7, [rsp+gprsize+16*2]
   1148    punpckhwd               m1, m0, m7             ;01 11 21 31 41 51 61 71
   1149    punpcklwd               m0, m7                 ;00 10 20 30 40 50 60 70
   1150    mova                    m7, [rsp+gprsize+16*0]
   1151    jmp                   tx2q                     ;m0-m7 now hold the transposed block
   1152 
   1153 .pass2:
   1154    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]  ;.end3 exits through tx2q
   1155 
   1156 .pass2_main:
   1157    call .main
   1158 
   1159 .end:
   1160    mova                    m7, [o(pw_2048)]               ;pass-2 rounding multiplier
   1161    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
   1162    mova    [rsp+gprsize+16*1], m6                         ;row 6 -> slot 1 (m6 is scratch in .end3)
   1163 
   1164 .end2:
   1165    REPX      {pmulhrsw x, m7}, m1, m3, m5
   1166    pmulhrsw                m7, [rsp+gprsize+16*0]         ;row 7 was left in slot 0 by .main
   1167    mova    [rsp+gprsize+16*2], m5                         ;row 5 -> slot 2 (m5 is scratch in .end3)
   1168    mova    [rsp+gprsize+16*0], m7                         ;row 7 -> slot 0 (m7 is scratch in .end3)
   1169 
   1170 .end3:
           ; rows 0-3 come from m0-m3; rows 4-7 from m4 and stack slots 2, 1, 0
   1171    WRITE_8X4                0, 1, 2, 3, 5, 6, 7
   1172    lea                   dstq, [dstq+strideq*2]           ;WRITE_8X4 advanced two rows; this adds the other two
   1173    WRITE_8X4                4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
   1174    jmp                   tx2q
   1175 
   1176 .end4:
           ; zero the eight 16-byte coefficient rows
   1177    pxor                    m7, m7
   1178    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
   1179    ret
   1180 
   1181 ALIGN function_align
   1182 cglobal_label .main
           ; 8-point inverse DCT.
           ;   in : m0..m7 = in0..in7
           ;   out: m0..m6 = out0..out6, out7 in stack slot 0; m7 is clobbered
           ; The even half (IDCT4_1D on m0, m2, m4, m6) and the odd half
           ; (IDCT8_1D_ODDHALF) each need scratch registers, so inputs and
           ; intermediate results are rotated through the three stack slots.
   1183    mova  [rsp+gprsize*2+16*0], m7
   1184    mova  [rsp+gprsize*2+16*1], m3
   1185    mova  [rsp+gprsize*2+16*2], m1
   1186    mova                    m7, [o(pd_2048)]
   1187    IDCT4_1D                 0, 2, 4, 6, 1, 3, 7
   1188    mova                    m3, [rsp+gprsize*2+16*2]      ;reload the value saved from m1
   1189    mova  [rsp+gprsize*2+16*2], m2
   1190    mova                    m2, [rsp+gprsize*2+16*1]      ;reload the value saved from m3
   1191    mova  [rsp+gprsize*2+16*1], m4
   1192    mova                    m4, [rsp+gprsize*2+16*0]      ;reload the value saved from m7
   1193    mova  [rsp+gprsize*2+16*0], m6
   1194    IDCT8_1D_ODDHALF         3, 2, 5, 4, 1, 6, 7
   1195    mova                    m6, [rsp+gprsize*2+16*0]
   1196    psubsw                  m7, m0, m4                    ;out7
   1197    paddsw                  m0, m4                        ;out0
   1198    mova  [rsp+gprsize*2+16*0], m7                        ;out7 is returned on the stack
   1199    mova                    m1, [rsp+gprsize*2+16*2]
   1200    psubsw                  m4, m6, m3                    ;out4
   1201    paddsw                  m3, m6                        ;out3
   1202    mova                    m7, [rsp+gprsize*2+16*1]
   1203    psubsw                  m6, m1, m5                    ;out6
   1204    paddsw                  m1, m5                        ;out1
   1205    psubsw                  m5, m7, m2                    ;out5
   1206    paddsw                  m2, m7                        ;out2
   1207    ret
   1208 
   1209 
   1210 INV_TXFM_8X8_FN adst, dct                       ; 8x8 wrappers with type1 = adst
   1211 INV_TXFM_8X8_FN adst, adst
   1212 INV_TXFM_8X8_FN adst, flipadst
   1213 INV_TXFM_8X8_FN adst, identity
   1214 
   1215 cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Inverse-ADST kernel for 8x8 blocks, 8 bpc.
        ; .main leaves the odd outputs negated (-out1, -out3, -out5, -out7), so
        ; both passes scale the even rows by +k and then negate the multiplier
        ; before handing the odd rows to the shared idct_8x8 tails.
        ; Stack slots are shared with idct_8x8_internal_8bpc: [rsp+gprsize+16*n]
        ; here, [rsp+gprsize*2+16*n] inside called routines.
   1216    LOAD_8ROWS          coeffq, 16
   1217 
   1218 .pass1:
   1219    call .main
   1220    call .main_pass1_end
   1221 
   1222 .pass1_end:
   1223    mova                    m7, [o(pw_16384)]
   1224 
   1225 .pass1_end1:
   1226    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
   1227    mova    [rsp+gprsize+16*1], m6                         ;row 6 -> slot 1, as idct_8x8's .pass1_end3 expects
   1228    pxor                    m6, m6
   1229    psubw                   m6, m7
   1230    mova                    m7, m6                         ;m7 = -m7, so the negated odd rows come out positive
   1231    jmp m(idct_8x8_internal_8bpc).pass1_end2
   1232 
   1233 ALIGN function_align
   1234 .pass2:
   1235    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
   1236 
   1237 .pass2_main:
   1238    call .main
   1239    call .main_pass2_end
   1240 
   1241 .end:
   1242    mova                    m7, [o(pw_2048)]
   1243    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
   1244    mova    [rsp+gprsize+16*1], m6                         ;row 6 -> slot 1, as idct_8x8's .end3 expects
   1245    pxor                    m6, m6
   1246    psubw                   m6, m7
   1247    mova                    m7, m6                         ;m7 = -m7, so the negated odd rows come out positive
   1248    jmp m(idct_8x8_internal_8bpc).end2
   1249 
   1250 ALIGN function_align
   1251 cglobal_label .main
           ; First part of the 8-point inverse ADST.
           ;   in : m0..m7 = in0..in7
           ;   out: m0 = out0, m1 = -out1, m6 = out6, stack slot 0 = -out7,
           ;        m2 = t7, m3 = t3, m4 = t2, m5 = t6 (finished by
           ;        .main_pass1_end or .main_pass2_end); m7 is clobbered
   1252    mova  [rsp+gprsize*2+16*0], m7
   1253    mova  [rsp+gprsize*2+16*1], m3
   1254    mova  [rsp+gprsize*2+16*2], m4
   1255    mova                    m7, [o(pd_2048)]
   1256    ITX_MULSUB_2W            5, 2, 3, 4, 7, 1931, 3612    ;t3a, t2a
   1257    ITX_MULSUB_2W            1, 6, 3, 4, 7, 3920, 1189    ;t7a, t6a
   1258    paddsw                  m3, m2, m6                    ;t2
   1259    psubsw                  m2, m6                        ;t6
   1260    paddsw                  m4, m5, m1                    ;t3
   1261    psubsw                  m5, m1                        ;t7
   1262    ITX_MULSUB_2W            5, 2, 1, 6, 7, 3784, 1567    ;t6a, t7a
   1263 
           ; swap the saved inputs back in while parking t6a, t7a and t2
   1264    mova                    m6, [rsp+gprsize*2+16*2]
   1265    mova  [rsp+gprsize*2+16*2], m5
   1266    mova                    m1, [rsp+gprsize*2+16*1]
   1267    mova  [rsp+gprsize*2+16*1], m2
   1268    mova                    m5, [rsp+gprsize*2+16*0]
   1269    mova  [rsp+gprsize*2+16*0], m3
   1270    ITX_MULSUB_2W            5, 0, 2, 3, 7,  401, 4076    ;t1a, t0a
   1271    ITX_MULSUB_2W            1, 6, 2, 3, 7, 3166, 2598    ;t5a, t4a
   1272    psubsw                  m2, m0, m6                    ;t4
   1273    paddsw                  m0, m6                        ;t0
   1274    paddsw                  m3, m5, m1                    ;t1
   1275    psubsw                  m5, m1                        ;t5
   1276    ITX_MULSUB_2W            2, 5, 1, 6, 7, 1567, 3784    ;t5a, t4a
   1277 
   1278    mova                    m7, [rsp+gprsize*2+16*0]
   1279    paddsw                  m1, m3, m4                    ;-out7
   1280    psubsw                  m3, m4                        ;t3
   1281    mova  [rsp+gprsize*2+16*0], m1                        ;-out7 is returned on the stack
   1282    psubsw                  m4, m0, m7                    ;t2
   1283    paddsw                  m0, m7                        ;out0
   1284    mova                    m6, [rsp+gprsize*2+16*2]
   1285    mova                    m7, [rsp+gprsize*2+16*1]
   1286    paddsw                  m1, m5, m6                    ;-out1
   1287    psubsw                  m5, m6                        ;t6
   1288    paddsw                  m6, m2, m7                    ;out6
   1289    psubsw                  m2, m7                        ;t7
   1290    ret
   1291 ALIGN function_align
   1292 .main_pass1_end:
           ; Final rotation for pass 1, done in 32 bits: pmaddwd with +-2896,
           ; add pd_2048, shift right by 12, pack back to words.
           ;   in : m2 = t7, m3 = t3, m4 = t2, m5 = t6 (as left by .main)
           ;   out: m2 = out2, m3 = -out3, m4 = out4, m5 = -out5
           ; m1 and m6 are saved to stack slots 1 and 2 and restored; m7 is clobbered.
   1293    mova  [rsp+gprsize*2+16*1], m1
   1294    mova  [rsp+gprsize*2+16*2], m6
   1295    punpckhwd               m1, m4, m3                    ;t2/t3 pairs, high four columns
   1296    punpcklwd               m4, m3                        ;t2/t3 pairs, low four columns
   1297    punpckhwd               m7, m5, m2                    ;t6/t7 pairs, high four columns
   1298    punpcklwd               m5, m2                        ;t6/t7 pairs, low four columns
   1299    mova                    m2, [o(pw_2896_2896)]
   1300    mova                    m6, [o(pd_2048)]
   1301    pmaddwd                 m3, m2, m7
   1302    pmaddwd                 m2, m5
   1303    paddd                   m3, m6
   1304    paddd                   m2, m6
   1305    psrad                   m3, 12
   1306    psrad                   m2, 12
   1307    packssdw                m2, m3                        ;out2
   1308    mova                    m3, [o(pw_2896_m2896)]
   1309    pmaddwd                 m7, m3
   1310    pmaddwd                 m5, m3
   1311    paddd                   m7, m6
   1312    paddd                   m5, m6
   1313    psrad                   m7, 12
   1314    psrad                   m5, 12
   1315    packssdw                m5, m7                        ;-out5
   1316    mova                    m3, [o(pw_2896_2896)]
   1317    pmaddwd                 m7, m3, m1
   1318    pmaddwd                 m3, m4
   1319    paddd                   m7, m6
   1320    paddd                   m3, m6
   1321    psrad                   m7, 12
   1322    psrad                   m3, 12
   1323    packssdw                m3, m7                        ;-out3
   1324    mova                    m7, [o(pw_2896_m2896)]
   1325    pmaddwd                 m1, m7
   1326    pmaddwd                 m4, m7
   1327    paddd                   m1, m6
   1328    paddd                   m4, m6
   1329    psrad                   m1, 12
   1330    psrad                   m4, 12
   1331    packssdw                m4, m1                        ;out4
   1332    mova                    m1, [rsp+gprsize*2+16*1]
   1333    mova                    m6, [rsp+gprsize*2+16*2]
   1334    ret
   1335 ALIGN function_align
   1336 cglobal_label .main_pass2_end
           ; Final rotation for pass 2, done in 16 bits: saturating add/sub
           ; followed by pmulhrsw with pw_2896x8.
           ;   in : m2 = t7, m3 = t3, m4 = t2, m5 = t6 (as left by .main)
           ;   out: m2 = out2, m3 = -out3, m4 = out4, m5 = -out5; m7 is clobbered
   1337    paddsw                  m7, m4, m3                    ;t2 + t3
   1338    psubsw                  m4, m3                        ;t2 - t3
   1339    paddsw                  m3, m5, m2                    ;t6 + t7
   1340    psubsw                  m5, m2                        ;t6 - t7
   1341    mova                    m2, [o(pw_2896x8)]
   1342    pmulhrsw                m4, m2                        ;out4
   1343    pmulhrsw                m5, m2                        ;-out5
   1344    pmulhrsw                m7, m2                        ;-out3
   1345    pmulhrsw                m2, m3                        ;out2
   1346    mova                    m3, m7
   1347    ret
   1348 
   1349 INV_TXFM_8X8_FN flipadst, dct                   ; 8x8 wrappers with type1 = flipadst
   1350 INV_TXFM_8X8_FN flipadst, adst
   1351 INV_TXFM_8X8_FN flipadst, flipadst
   1352 INV_TXFM_8X8_FN flipadst, identity
   1353 
   1354 cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Flipped inverse-ADST kernel for 8x8 blocks, 8 bpc.
        ; Runs the iadst_8x8 routines, then reverses the row order (row n takes
        ; output 7-n) while applying the rounding multiplier. The iadst routines
        ; return the odd outputs negated, so those are multiplied by -k and the
        ; even outputs by +k.
   1355    LOAD_8ROWS          coeffq, 16
   1356 
   1357 .pass1:
   1358    call m(iadst_8x8_internal_8bpc).main
   1359    call m(iadst_8x8_internal_8bpc).main_pass1_end
   1360 
   1361 .pass1_end:
   1362    mova                    m7, [o(pw_m16384)]             ;negative multiplier, used first on the negated odd outputs
   1363 
   1364 .pass1_end1:
           ; on entry: m0 = out0, m1 = -out1, m2 = out2, m3 = -out3, m4 = out4,
           ;           m5 = -out5, m6 = out6, stack slot 0 = -out7
   1365    pmulhrsw                m1, m7
   1366    mova    [rsp+gprsize+16*1], m1                         ;out1 becomes row 6 (slot 1)
   1367    mova                    m1, m6                         ;out6 -> row 1 (scaled below)
   1368    mova                    m6, m2
   1369    pmulhrsw                m2, m5, m7                     ;out5 -> row 2
   1370    mova                    m5, m6                         ;out2 -> row 5 (scaled below)
   1371    mova                    m6, m4
   1372    pmulhrsw                m4, m3, m7                     ;out3 -> row 4
   1373    mova                    m3, m6                         ;out4 -> row 3 (scaled below)
   1374    mova                    m6, m0                         ;keep out0 while m0 is reused
   1375    mova                    m0, m7
   1376    pxor                    m7, m7
   1377    psubw                   m7, m0                         ;m7 = positive multiplier
   1378    pmulhrsw                m0, [rsp+gprsize+16*0]         ;out7 -> row 0
   1379    REPX      {pmulhrsw x, m7}, m1, m3, m5
   1380    pmulhrsw                m7, m6                         ;out0 -> row 7
   1381    jmp m(idct_8x8_internal_8bpc).pass1_end3               ;shared 8x8 transpose
   1382 
   1383 ALIGN function_align
   1384 .pass2:
   1385    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
   1386 
   1387 .pass2_main:
   1388    call m(iadst_8x8_internal_8bpc).main
   1389    call m(iadst_8x8_internal_8bpc).main_pass2_end
   1390 
   1391 .end:
           ; same reversal as pass 1, but laid out for idct_8x8's .end3:
           ; rows 0-4 in m0-m4, row 5 in slot 2, row 6 in slot 1, row 7 in slot 0
   1392    mova                    m7, [o(pw_2048)]
   1393    REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
   1394    mova    [rsp+gprsize+16*2], m2                         ;out2 -> row 5
   1395    mova                    m2, m0
   1396    pxor                    m0, m0
   1397    psubw                   m0, m7                         ;m0 = negated multiplier for the negated odd outputs
   1398    mova                    m7, m2                         ;hold out0 in m7 until slot 0 has been read
   1399    pmulhrsw                m1, m0
   1400    pmulhrsw                m2, m5, m0                     ;out5 -> row 2
   1401    mova    [rsp+gprsize+16*1], m1                         ;out1 -> row 6
   1402    mova                    m5, m4
   1403    mova                    m1, m6                         ;out6 -> row 1
   1404    pmulhrsw                m4, m3, m0                     ;out3 -> row 4
   1405    pmulhrsw                m0, [rsp+gprsize+16*0]         ;out7 -> row 0
   1406    mova                    m3, m5                         ;out4 -> row 3
   1407    mova    [rsp+gprsize+16*0], m7                         ;out0 -> row 7
   1408    jmp m(idct_8x8_internal_8bpc).end3
   1409 
   1410 INV_TXFM_8X8_FN identity, dct                   ; 8x8 wrappers with type1 = identity
   1411 INV_TXFM_8X8_FN identity, adst
   1412 INV_TXFM_8X8_FN identity, flipadst
   1413 INV_TXFM_8X8_FN identity, identity
   1414 
   1415 cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Identity-transform kernel for 8x8 blocks, 8 bpc.
        ; Pass 1 applies no arithmetic: the rows are loaded and handed straight to
        ; the shared transpose. Pass 2 multiplies every row by pw_4096 (pmulhrsw)
        ; and reuses idct_8x8's output tail.
   1416    LOAD_8ROWS          coeffq, 16
   1417    mova    [rsp+gprsize+16*1], m6                         ;.pass1_end3 reads row 6 from stack slot 1
   1418    jmp   m(idct_8x8_internal_8bpc).pass1_end3
   1419 
   1420 ALIGN function_align
   1421 .pass2:
   1422    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
   1423 
   1424 .end:
           ; lay the rows out for idct_8x8's .end3: rows 0-4 in m0-m4, row 5 in
           ; slot 2, row 6 in slot 1, row 7 in slot 0
   1425    pmulhrsw                m7, [o(pw_4096)]
   1426    mova    [rsp+gprsize+16*0], m7                         ;row 7 -> slot 0, which frees m7 for the multiplier
   1427    mova                    m7, [o(pw_4096)]
   1428    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
   1429    mova    [rsp+gprsize+16*2], m5
   1430    mova    [rsp+gprsize+16*1], m6
   1431    jmp m(idct_8x8_internal_8bpc).end3
   1432 
   1433 
   1434 %macro INV_TXFM_4X16_FN 2 ; type1, type2
        ; Emits the 4x16 wrapper for one (type1, type2) pair through INV_TXFM_FN
        ; (defined outside this chunk). For dct_dct only, it also emits a shortcut
        ; that derives every output from the first coefficient and writes the
        ; block as four 4x4 tiles.
        ; NOTE(review): presumably INV_TXFM_FN routes only DC-only blocks here -- confirm.
   1435    INV_TXFM_FN          %1, %2, 4x16, 8
   1436 %ifidn %1_%2, dct_dct
   1437    pshuflw               m0, [coeffq], q0000            ;replicate the first coefficient across the low four words
   1438    punpcklwd             m0, m0                         ;and then across all eight words
   1439    mova                  m1, [o(pw_2896x8)]
   1440    pmulhrsw              m0, m1
   1441    mov             [coeffq], eobd                       ;NOTE(review): presumably eob is 0 on this path, clearing the DC coefficient -- confirm
   1442    pmulhrsw              m0, [o(pw_16384)]
   1443    pmulhrsw              m0, m1
   1444    pmulhrsw              m0, [o(pw_2048)]
   1445 .end:
           ; WRITE_4X4 is defined outside this chunk; dstq is advanced four rows
           ; between consecutive invocations
   1446    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
   1447    lea                dstq, [dstq+strideq*4]
   1448    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
   1449    lea                dstq, [dstq+strideq*4]
   1450    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
   1451    lea                dstq, [dstq+strideq*4]
   1452    WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
   1453    RET
   1454 %endif
   1455 %endmacro
   1456 
   1457 INV_TXFM_4X16_FN dct, dct                       ; 4x16 wrappers with type1 = dct
   1458 INV_TXFM_4X16_FN dct, adst
   1459 INV_TXFM_4X16_FN dct, flipadst
   1460 INV_TXFM_4X16_FN dct, identity
   1461 
   1462 cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Inverse-DCT kernel for 4x16 blocks, 8 bpc.
        ; Pass 1 runs the 4x8 pass-1 routine whose address is in r3 twice: first on
        ; coefficient rows 1, 3, 5, 7, then on rows 0, 2, 4, 6. tx2q is used as the
        ; continuation address for those runs, so the caller's tx2q is kept on the
        ; stack meanwhile. Other 4x16 kernels enter at .pass1 with their own r3
        ; (iadst_4x16_internal_8bpc does so in this chunk).
        ; Pass 2 calls the .main of idct_16x4_internal_8bpc (outside this chunk)
        ; and writes the block as two 4x8 halves.
   1463    lea                  r3, [o(m(idct_4x8_internal_8bpc).pass1)]
   1464 
   1465 .pass1:
   1466    mova                 m0, [coeffq+16*1]
   1467    mova                 m1, [coeffq+16*3]
   1468    mova                 m2, [coeffq+16*5]
   1469    mova                 m3, [coeffq+16*7]
   1470    push               tx2q                              ;save the real second-pass address
   1471    lea                tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)]
   1472    jmp                  r3                              ;NOTE(review): the routine at r3 presumably ends with jmp tx2q -- confirm
   1473 
   1474 .pass1_2:
           ; store the first set of results back in place and process the other rows
   1475    mova      [coeffq+16*1], m0
   1476    mova      [coeffq+16*3], m1
   1477    mova      [coeffq+16*5], m2
   1478    mova      [coeffq+16*7], m3
   1479    mova                 m0, [coeffq+16*0]
   1480    mova                 m1, [coeffq+16*2]
   1481    mova                 m2, [coeffq+16*4]
   1482    mova                 m3, [coeffq+16*6]
   1483    lea                tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)]
   1484    jmp                  r3
   1485 
   1486 .pass1_end:
   1487    pop                tx2q                              ;restore the real second-pass address
   1488 
           ; m0-m3 hold the second set of results; reload the first set and scale
           ; everything by pw_16384. The eighth vector stays in memory at
           ; coeffq+16*7 because all xmm registers are in use.
   1489    mova                 m4, [coeffq+16*1]
   1490    mova                 m5, [coeffq+16*3]
   1491    mova                 m6, [coeffq+16*5]
   1492    mova                 m7, [o(pw_16384)]
   1493    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
   1494 
   1495    pmulhrsw             m7, [coeffq+16*7]
   1496    mova       [coeffq+16*7], m7
   1497    jmp                tx2q
   1498 
   1499 .pass2:
   1500    call m(idct_16x4_internal_8bpc).main
   1501 
   1502 .end:
   1503    mova                  m7, [o(pw_2048)]               ;pass-2 rounding multiplier
   1504    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
   1505    pmulhrsw              m7, [coeffq+16*7]              ;eighth vector is read from memory and kept in m7
   1506    mova       [coeffq+16*4], m4
   1507 
   1508 .end1:
           ; park m4-m6 in the coefficient buffer, write the first half from m0-m3,
           ; then reload the parked vectors (plus m7) for the second half
   1509    mova       [coeffq+16*5], m5
   1510    mova       [coeffq+16*6], m6
   1511    mov                   r3, coeffq                     ;NOTE(review): presumably WRITE_4X8 clobbers coeffq, hence the copy -- confirm
   1512    WRITE_4X8              0, 1, 3, 2
   1513 
   1514    mova                  m0, [r3+16*4]
   1515    mova                  m1, [r3+16*5]
   1516    mova                  m2, [r3+16*6]
   1517    mova                  m3, m7
   1518    lea                 dstq, [dstq+strideq*4]
   1519    WRITE_4X8              0, 1, 3, 2
   1520 
   1521 .end2:
           ; zero the eight 16-byte coefficient rows (addressed through r3)
   1522    pxor                  m7, m7
   1523    REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
   1524    ret
   1525 
   1526 INV_TXFM_4X16_FN adst, dct                         ; 4x16 instantiations whose type1 is adst
   1527 INV_TXFM_4X16_FN adst, adst
   1528 INV_TXFM_4X16_FN adst, flipadst
   1529 INV_TXFM_4X16_FN adst, identity
   1530 
   1531 cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Inverse ADST worker for 4x16 blocks.
        ; Pass 1 reuses idct_4x16's .pass1 driver with r3 pointing at the 4x8
        ; ADST first pass. Pass 2 runs the 16x4 ADST kernel (.main followed by
        ; .main_pass2_end), regroups the packed outputs by sign, scales them and
        ; writes 16 rows.
   1532    lea                   r3, [o(m(iadst_4x8_internal_8bpc).pass1)]
   1533    jmp   m(idct_4x16_internal_8bpc).pass1
   1534 
   1535 .pass2:
   1536    call m(iadst_16x4_internal_8bpc).main
   1537    call m(iadst_16x4_internal_8bpc).main_pass2_end
   1538 
           ; Regroup qword halves so that each register holds either two
           ; positive or two negated outputs; two of them are spilled to
           ; coeffq+16*4 and coeffq+16*5 to free registers.
   1539    punpcklqdq            m6, m5, m4                ;low: -out5  high: -out7
   1540    punpckhqdq            m4, m5                    ;low:  out8  high:  out10
   1541    punpcklqdq            m5, m7, m2                ;low:  out4  high:  out6
   1542    punpckhqdq            m2, m7                    ;low: -out9  high: -out11
   1543    mova       [coeffq+16*4], m2
   1544    mova       [coeffq+16*5], m6
   1545    mova                  m2, [coeffq+16*6]         ; stored by .main: -out1 / out14
   1546    mova                  m6, [coeffq+16*7]         ; stored by .main:  out2 / -out13
   1547    punpckhqdq            m1, m6, m0                ;low: -out13 high: -out15
   1548    punpcklqdq            m0, m6                    ;low:  out0  high:  out2
   1549    punpckhqdq            m6, m3, m2                ;low:  out12 high:  out14
   1550    punpcklqdq            m2, m3                    ;low: -out1  high: -out3
   1551 
   1552    mova                  m7, [o(pw_2048)]
   1553 
   1554 .end1:
           ; Shared tail: iflipadst_4x16 jumps here with the opposite sign
           ; pattern and a negated multiplier in m7. The first group is scaled
           ; by m7, the second by -m7, so all outputs come out positive.
   1555    REPX    {pmulhrsw x, m7}, m0, m5, m4, m6
   1556    pxor                  m3, m3
   1557    psubw                 m3, m7                    ; m3 = -m7
   1558    mova                  m7, [coeffq+16*4]
   1559    REPX    {pmulhrsw x, m3}, m2, m7, m1
   1560    pmulhrsw              m3, [coeffq+16*5]         ; m3 becomes the scaled vector spilled at coeffq+16*5
   1561    mova       [coeffq+16*7], m5                    ; park out4/out6 while m5 is reused
   1562 
   1563    punpckhqdq            m5, m4, m7                ;low:  out10 high:  out11
   1564    punpcklqdq            m4, m7                    ;low:  out8  high:  out9
   1565    punpckhqdq            m7, m6, m1                ;low:  out14 high:  out15
   1566    punpcklqdq            m6, m1                    ;low:  out12 high:  out13
   1567    punpckhqdq            m1, m0, m2                ;low:  out2  high:  out3
   1568    punpcklqdq            m0, m2                    ;low:  out0  high:  out1
   1569    mova       [coeffq+16*4], m4
   1570    mova                  m4, [coeffq+16*7]
   1571    punpcklqdq            m2, m4, m3                ;low:  out4  high:  out5
   1572    punpckhqdq            m4, m3                    ;low:  out6  high:  out7
   1573    mova                  m3, m4
   1574 
   1575 .end2:
           ; Also entered from iidentity_4x16's pass 2. Expects m0-m3 ready to
           ; write, m5/m6/m7 still in registers and the fifth vector already at
           ; coeffq+16*4.
   1576    mova       [coeffq+16*5], m5
   1577    mova       [coeffq+16*6], m6
   1578    mov                   r3, coeffq                ; buffer is addressed through r3 from here on
   1579    WRITE_4X8              0, 1, 2, 3
   1580 
   1581    mova                  m0, [r3+16*4]
   1582    mova                  m1, [r3+16*5]
   1583    mova                  m2, [r3+16*6]
   1584    mova                  m3, m7
   1585    lea                 dstq, [dstq+strideq*4]
   1586    WRITE_4X8              0, 1, 2, 3
   1587 
   1588 .end3:
           ; Zero the eight 16-byte coefficient rows before returning.
   1589    pxor                  m7, m7
   1590    REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
   1591    ret
   1592 
   1593 
   1594 INV_TXFM_4X16_FN flipadst, dct                     ; 4x16 instantiations whose type1 is flipadst
   1595 INV_TXFM_4X16_FN flipadst, adst
   1596 INV_TXFM_4X16_FN flipadst, flipadst
   1597 INV_TXFM_4X16_FN flipadst, identity
   1598 
   1599 cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Inverse flipped-ADST worker for 4x16 blocks.
        ; Pass 1 reuses idct_4x16's .pass1 driver with r3 pointing at the 4x8
        ; flipadst first pass. Pass 2 runs the same 16x4 ADST kernel as
        ; iadst_4x16 but picks the opposite qword halves, so registers that hold
        ; positive outputs there hold negated ones here; the shared tail is
        ; therefore entered with pw_m2048 instead of pw_2048.
   1600    lea                   r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)]
   1601    jmp   m(idct_4x16_internal_8bpc).pass1
   1602 
   1603 .pass2:
   1604    call m(iadst_16x4_internal_8bpc).main
   1605    call m(iadst_16x4_internal_8bpc).main_pass2_end
   1606 
   1607    punpckhqdq            m6, m5, m4                ;low:  out5  high:  out7
   1608    punpcklqdq            m4, m5                    ;low: -out8  high: -out10
   1609    punpckhqdq            m5, m7, m2                ;low: -out4  high: -out6
   1610    punpcklqdq            m2, m7                    ;low:  out9  high:  out11
   1611    mova       [coeffq+16*4], m2
   1612    mova       [coeffq+16*5], m6
   1613    mova                  m2, [coeffq+16*6]         ; stored by .main
   1614    mova                  m6, [coeffq+16*7]         ; stored by .main
   1615    punpcklqdq            m1, m6, m0                ;low:  out13 high:  out15
   1616    punpckhqdq            m0, m6                    ;low: -out0  high: -out2
   1617    punpcklqdq            m6, m3, m2                ;low: -out12 high: -out14
   1618    punpckhqdq            m2, m3                    ;low:  out1  high:  out3
   1619 
   1620    mova                  m7, [o(pw_m2048)]         ; negated multiplier: the shared tail scales m0/m5/m4/m6 by m7 and the rest by -m7
   1621    jmp   m(iadst_4x16_internal_8bpc).end1
   1622 
   1623 
   1624 INV_TXFM_4X16_FN identity, dct                     ; 4x16 instantiations whose type1 is identity
   1625 INV_TXFM_4X16_FN identity, adst
   1626 INV_TXFM_4X16_FN identity, flipadst
   1627 INV_TXFM_4X16_FN identity, identity
   1628 
   1629 %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
        ; 16-point identity scaling of one register. All arguments are register
        ; numbers. With c the constant register (argument 3):
        ;   3 arguments: dst = dst + dst + pmulhrsw(c, dst)
        ;   4 arguments: dst = dst + pmulhrsw(pmulhrsw(c, dst), argument 4)
        ; Additions saturate (paddsw). tmp is clobbered; tmp and the constant
        ; may be the same register, since the constant is read only by the
        ; first instruction.
   1630    pmulhrsw            m%2, m%3, m%1
   1631 %if %0 == 4 ; if downshifting by 1
   1632    pmulhrsw            m%2, m%4
   1633 %else
   1634    paddsw              m%1, m%1
   1635 %endif
   1636    paddsw              m%1, m%2
   1637 %endmacro
   1638 
   1639 cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Identity-transform worker for 4x16 blocks.
        ; Pass 1 scales rows 1,3,5,7 and then rows 0,2,4,6 with the .pass1 loop
        ; body, each time leaving through iadst_4x8's .pass1_end. The incoming
        ; tx2q is kept in r3 while tx2q carries the continuation label.
        ; NOTE(review): this relies on iadst_4x8's .pass1_end finishing with a
        ; jump through tx2q; that code is outside this excerpt - confirm.
   1640    mova                  m0, [coeffq+16*1]
   1641    mova                  m6, [o(pw_1697x8)]
   1642    mova                  m1, [coeffq+16*3]
   1643    mova                  m2, [coeffq+16*5]
   1644    mova                  m3, [coeffq+16*7]
   1645    pcmpeqw               m7, m7                    ; all bits set: every word is -1
   1646    mov                   r3, tx2q                  ; remember the caller's second-pass address
   1647    lea                 tx2q, [o(.pass1_2)]
   1648 .pass1:
           ; For each of m0-m3: t = pavgw(pmulhrsw(pw_1697x8, x), x), then the
           ; words where x was -1 are forced to 0 by the pcmpeqw/pandn pair.
           ; pavgw is an unsigned average, so presumably x = -1 is the one input
           ; it would get wrong - confirm. The four lanes are interleaved.
   1649    pmulhrsw              m4, m6, m0
   1650    pmulhrsw              m5, m6, m1
   1651    pavgw                 m4, m0
   1652    pcmpeqw               m0, m7                    ; mask of words equal to -1
   1653    pavgw                 m5, m1
   1654    pcmpeqw               m1, m7
   1655    pandn                 m0, m4                    ; result, with the masked words cleared
   1656    pmulhrsw              m4, m6, m2
   1657    pandn                 m1, m5
   1658    pmulhrsw              m5, m6, m3
   1659    pavgw                 m4, m2
   1660    pcmpeqw               m2, m7
   1661    pavgw                 m5, m3
   1662    pcmpeqw               m3, m7
   1663    pandn                 m2, m4
   1664    pandn                 m3, m5
   1665    jmp m(iadst_4x8_internal_8bpc).pass1_end
   1666 .pass1_2:
           ; Park the first run's results, load rows 0,2,4,6 and repeat.
   1667    mova       [coeffq+16*1], m0
   1668    mova       [coeffq+16*3], m1
   1669    mova       [coeffq+16*5], m2
   1670    mova       [coeffq+16*7], m3
   1671    mova                  m0, [coeffq+16*0]
   1672    mova                  m1, [coeffq+16*2]
   1673    mova                  m2, [coeffq+16*4]
   1674    mova                  m3, [coeffq+16*6]
   1675    lea                 tx2q, [o(.pass1_end)]
   1676    jmp .pass1
   1677 .pass1_end:
           ; Reload the first run's results into m4-m6 (the eighth vector stays
           ; at coeffq+16*7) and continue with the second pass saved in r3.
   1678    mova                  m4, [coeffq+16*1]
   1679    mova                  m5, [coeffq+16*3]
   1680    mova                  m6, [coeffq+16*5]
   1681    jmp                   r3
   1682 .pass2:
           ; Apply IDTX16 (3-argument form) to all eight vectors. m6 doubles as
           ; the temporary, so its data is spilled to coeffq+16*6 first and
           ; processed last with the same three operations written out by hand.
   1683    mova                  m7, [o(pw_1697x16)]
   1684    mova       [coeffq+16*6], m6
   1685    REPX    {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
   1686    mova                  m6, [coeffq+16*7]
   1687    IDTX16                 6, 7, 7                  ; tmp = constant register: m7 is overwritten, hence the memory operand below
   1688    mova       [coeffq+16*7], m6
   1689    mova                  m6, [coeffq+16*6]
   1690    pmulhrsw              m7, m6, [o(pw_1697x16)]
   1691    paddsw                m6, m6
   1692    paddsw                m6, m7
   1693    mova                  m7, [o(pw_2048)]
   1694    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
   1695    pmulhrsw              m7, [coeffq+16*7]         ; m7 becomes the scaled eighth vector
   1696    mova       [coeffq+16*4], m4                    ; .end2 expects the fifth vector in this slot
   1697    jmp m(iadst_4x16_internal_8bpc).end2
   1698 
   1699 
   1700 %macro INV_TXFM_16X4_FN 2 ; type1, type2
        ; Expands INV_TXFM_FN for the 16x4 size with the given transform-type
        ; pair. Only for the dct_dct pair, the code below is emitted as well: it
        ; turns coefficient word 0 into one value and adds it to 16 pixels per
        ; row, two rows per loop iteration, r2d iterations in total.
        ; .dconly takes its row-pair count in r2d and its exit address in tx2q.
        ; NOTE(review): INV_TXFM_FN is defined outside this excerpt; when this
        ; path is taken, and whether .dconly has other users, is decided
        ; elsewhere - confirm.
   1701    INV_TXFM_FN          %1, %2, 16x4, 8
   1702 %ifidn %1_%2, dct_dct
   1703    movd                 m1, [o(pw_2896x8)]
   1704    pmulhrsw             m0, m1, [coeffq]
   1705    movd                 m2, [o(pw_16384)]
   1706    mov            [coeffq], eobd                   ; overwrite the consumed coefficient (presumably eob is 0 here, clearing it - confirm)
   1707    mov                 r2d, 2                      ; two iterations of two rows each
   1708    lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)]
   1709 .dconly:
   1710    pmulhrsw             m0, m2
   1711    movd                 m2, [o(pw_2048)]              ;intentionally rip-relative
   1712    pmulhrsw             m0, m1
   1713    pmulhrsw             m0, m2
   1714    pshuflw              m0, m0, q0000              ; replicate word 0 into the low four words
   1715    punpcklwd            m0, m0                     ; ...and from there into all eight words
   1716    pxor                 m5, m5                     ; zero, used to widen bytes to words
   1717 .dconly_loop:
   1718    mova                 m1, [dstq]
   1719    mova                 m3, [dstq+strideq]
   1720    punpckhbw            m2, m1, m5                 ; widen the two rows' pixels to words
   1721    punpcklbw            m1, m5
   1722    punpckhbw            m4, m3, m5
   1723    punpcklbw            m3, m5
   1724    paddw                m2, m0                     ; add the broadcast value
   1725    paddw                m1, m0
   1726    paddw                m4, m0
   1727    paddw                m3, m0
   1728    packuswb             m1, m2                     ; back to bytes with unsigned saturation
   1729    packuswb             m3, m4
   1730    mova             [dstq], m1
   1731    mova     [dstq+strideq], m3
   1732    lea                dstq, [dstq+strideq*2]
   1733    dec                 r2d
   1734    jg .dconly_loop
   1735    jmp                tx2q                         ; leave through the address chosen by whoever entered .dconly
   1736 .end:
   1737    RET
   1738 %endif
   1739 %endmacro
   1740 
   1741 %macro LOAD_7ROWS 2 ;src, stride
        ; Loads seven consecutive rows, src + stride*0 .. src + stride*6, into
        ; m0-m6 with mova. m7 is left untouched.
   1742    mova                 m0, [%1+%2*0]
   1743    mova                 m1, [%1+%2*1]
   1744    mova                 m2, [%1+%2*2]
   1745    mova                 m3, [%1+%2*3]
   1746    mova                 m4, [%1+%2*4]
   1747    mova                 m5, [%1+%2*5]
   1748    mova                 m6, [%1+%2*6]
   1749 %endmacro
   1750 
   1751 %macro SAVE_7ROWS 2 ;dst, stride
        ; Counterpart of LOAD_7ROWS: stores m0-m6 with mova to seven consecutive
        ; rows, dst + stride*0 .. dst + stride*6.
   1752    mova          [%1+%2*0], m0
   1753    mova          [%1+%2*1], m1
   1754    mova          [%1+%2*2], m2
   1755    mova          [%1+%2*3], m3
   1756    mova          [%1+%2*4], m4
   1757    mova          [%1+%2*5], m5
   1758    mova          [%1+%2*6], m6
   1759 %endmacro
   1760 
   1761 %macro IDCT16_1D_PACKED_ODDHALF 7  ;src[1-4], tmp[1-3]
        ; Odd-input half of a 16-point inverse DCT on registers that each carry
        ; two values, one per 64-bit half. All arguments are register numbers.
        ; Inputs as used by the caller in this file (low / high half):
        ;   argument 1: in1 / in3     argument 2: in7  / in5
        ;   argument 3: in9 / in11    argument 4: in15 / in13
        ; Results, per the trailing comments on the last four instructions:
        ;   argument 1: t8a / t9      argument 2: t11  / t10a
        ;   argument 3: t12 / t13a    argument 4: t15a / t14
        ; Arguments 5-7 are scratch; argument 7 is loaded with pd_2048 and given
        ; to every ITX_MUL2X_PACK. Also reads the deint_shuf2 table.
        ; NOTE(review): ITX_MUL2X_PACK is defined outside this excerpt; the
        ; meaning of its last operand (1 or 4) is not visible here.
   1762    punpckhwd            m%5, m%4, m%1                ;packed in13 in3
   1763    punpcklwd            m%1, m%4                     ;packed in1  in15
   1764    punpcklwd            m%4, m%3, m%2                ;packed in9  in7
   1765    punpckhwd            m%2, m%3                     ;packed in5  in11
   1766    mova                 m%7, [o(pd_2048)]
   1767    ITX_MUL2X_PACK        %1, %6, %7,  401, 4076, 1    ;low: t8a   high: t15a
   1768    ITX_MUL2X_PACK        %4, %6, %7, 3166, 2598, 1    ;low: t9a   high: t14a
   1769    ITX_MUL2X_PACK        %2, %6, %7, 1931, 3612, 1    ;low: t10a  high: t13a
   1770    ITX_MUL2X_PACK        %5, %6, %7, 3920, 1189, 1    ;low: t11a  high: t12a
   1771    psubsw               m%6, m%1, m%4                 ;low: t9    high: t14
   1772    paddsw               m%1, m%4                      ;low: t8    high: t15
   1773    psubsw               m%4, m%5, m%2                 ;low: t10   high: t13
   1774    paddsw               m%5, m%2                      ;low: t11   high: t12
   1775    mova                 m%2, [o(deint_shuf2)]
   1776    pshufb               m%6, m%2
   1777    pshufb               m%4, m%2
   1778    ITX_MUL2X_PACK        %6, %3, %7, 1567, 3784, 1    ;low: t9a   high: t14a
   1779    ITX_MUL2X_PACK        %4, %3, %7, m3784, 1567, 1   ;low: t10a  high: t13a
   1780    psubsw               m%3, m%1, m%5                 ;low: t11a  high: t12a
   1781    paddsw               m%1, m%5                      ;low: t8a   high: t15a
   1782    psubsw               m%5, m%6, m%4                 ;low: t10   high: t13
   1783    paddsw               m%6, m%4                      ;low: t9    high: t14
   1784    pshufb               m%3, m%2
   1785    pshufb               m%5, m%2
   1786    ITX_MUL2X_PACK        %3, %2, %7, 2896, 2896, 4    ;t12,  t11
   1787    ITX_MUL2X_PACK        %5, %4, %7, 2896, 2896, 4    ;t13a, t10a
   1788    packssdw             m%2, m%4                      ;low: t11   high: t10a
   1789    packssdw             m%3, m%5                      ;low: t12   high: t13a
   1790    punpckhqdq           m%4, m%1, m%6                 ;low: t15a  high: t14
   1791    punpcklqdq           m%1, m%6                      ;low: t8a   high: t9
   1792 %endmacro
   1793 
   1794 INV_TXFM_16X4_FN dct, dct                          ; the dct_dct pair also emits the .dconly code and its .end label
        ; 16x4 instantiations whose type1 is dct.
   1795 INV_TXFM_16X4_FN dct, adst
   1796 INV_TXFM_16X4_FN dct, flipadst
   1797 INV_TXFM_16X4_FN dct, identity
   1798 
   1799 cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Inverse DCT worker for 16x4 blocks.
        ; Pass 1: load coefficient rows 0-6 into m0-m6 (row 7 stays at
        ; coeffq+16*7, where .main reads it), run the 16-point kernel in .main,
        ; scale by pw_16384 and transpose, then continue through tx2q.
        ; Pass 2: run a 4-point second pass, supplied as an 8x4 .pass2 routine,
        ; once per 8-pixel-wide half of the block.
   1800    LOAD_7ROWS        coeffq, 16
   1801    call .main
   1802 
   1803 .pass1_end:
           ; First interleave step. coeffq+16*6 is borrowed for the out1/out5
           ; vector while m7 fetches the eighth result from coeffq+16*7.
   1804    punpckhwd             m7, m0, m2                 ;packed out1,  out5
   1805    punpcklwd             m0, m2                     ;packed out0,  out4
   1806    punpcklwd             m2, m1, m3                 ;packed out3,  out7
   1807    punpckhwd             m1, m3                     ;packed out2,  out6
   1808    mova       [coeffq+16*6], m7
   1809    mova                  m7, [coeffq+16*7]
   1810    punpckhwd             m3, m4, m6                 ;packed out9,  out13
   1811    punpcklwd             m4, m6                     ;packed out8,  out12
   1812    punpcklwd             m6, m5, m7                 ;packed out11, out15
   1813    punpckhwd             m5, m7                     ;packed out10, out14
   1814 
   1815 .pass1_end2:
   1816    mova                  m7, [o(pw_16384)]          ; under pmulhrsw: rounded halving, assuming the constant holds 16384 as named
   1817    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
   1818    pmulhrsw              m7, [coeffq+16*6]          ; scale the vector parked in memory as well
   1819    mova       [coeffq+16*6], m7
   1820 
   1821 .pass1_end3:
           ; Also entered from iadst_16x4's .pass1_end. Expects seven vectors in
           ; m0-m6 and the out1/out5 vector at coeffq+16*6. Leaves columns 0-7
           ; in m0-m3 and columns 8-15 in m4, m5, m6 and at coeffq+16*7.
   1822    punpckhwd             m7, m3, m6                 ;packed 9, 11, 13, 15 high
   1823    punpcklwd             m3, m6                     ;packed 9, 11, 13, 15 low
   1824    punpckhwd             m6, m4, m5                 ;packed 8, 10, 12, 14 high
   1825    punpcklwd             m4, m5                     ;packed 8, 10, 12, 14 low
   1826    punpckhwd             m5, m4, m3                 ;8, 9, 10, 11, 12, 13, 14, 15(1)
   1827    punpcklwd             m4, m3                     ;8, 9, 10, 11, 12, 13, 14, 15(0)
   1828    punpckhwd             m3, m6, m7                 ;8, 9, 10, 11, 12, 13, 14, 15(3)
   1829    punpcklwd             m6, m7                     ;8, 9, 10, 11, 12, 13, 14, 15(2)
   1830    mova       [coeffq+16*7], m3
   1831    mova                  m3, [coeffq+16*6]
   1832    punpckhwd             m7, m3, m2                 ;packed 1, 3, 5, 7 high
   1833    punpcklwd             m3, m2                     ;packed 1, 3, 5, 7 low
   1834    punpckhwd             m2, m0, m1                 ;packed 0, 2, 4, 6 high
   1835    punpcklwd             m0, m1                     ;packed 0, 2, 4, 6 low
   1836    punpckhwd             m1, m0, m3                 ;0, 1, 2, 3, 4, 5, 6, 7(1)
   1837    punpcklwd             m0, m3                     ;0, 1, 2, 3, 4, 5, 6, 7(0)
   1838    punpckhwd             m3, m2, m7                 ;0, 1, 2, 3, 4, 5, 6, 7(3)
   1839    punpcklwd             m2, m7                     ;0, 1, 2, 3, 4, 5, 6, 7(2)
   1840    jmp                 tx2q
   1841 
   1842 .pass2:
   1843    lea                 tx2q, [o(m(idct_8x4_internal_8bpc).pass2)]
   1844 
   1845 .pass2_end:
           ; Shared by the adst and flipadst 16x4 workers, which arrive with
           ; their own 8x4 .pass2 address in tx2q. Spill the right-half vectors
           ; so that coeffq+16*4 .. coeffq+16*7 hold them contiguously, run tx2q
           ; on the left half in m0-m3, then reload the right half and tail-jump
           ; into tx2q again with dst advanced by 8 bytes.
           ; NOTE(review): assumes the tx2q routine ends in a return and reads
           ; its input from m0-m3; it is outside this excerpt - confirm.
   1846    mova       [coeffq+16*4], m4
   1847    mova       [coeffq+16*5], m5
   1848    mova       [coeffq+16*6], m6
   1849    lea                   r3, [dstq+8]               ; start of the right half, kept across the call
   1850    call                tx2q
   1851 
   1852    add               coeffq, 16*4
   1853    mova                  m0, [coeffq+16*0]
   1854    mova                  m1, [coeffq+16*1]
   1855    mova                  m2, [coeffq+16*2]
   1856    mova                  m3, [coeffq+16*3]
   1857    mov                 dstq, r3
   1858    jmp                 tx2q
   1859 
   1860 ALIGN function_align
   1861 cglobal_label .main
        ; 16-point inverse DCT on packed data; also called by idct_4x16's
        ; second pass.
        ; In:  m0-m6 and the eighth vector at coeffq+16*7.
        ; Out (low / high half, per the trailing comments below):
        ;   m0 out0/out1    m1 out3/out2     m2 out4/out5    m3 out7/out6
        ;   m4 out8/out9    m5 out11/out10   m6 out12/out13
        ;   coeffq+16*7: out15/out14
        ; coeffq+16*4 .. coeffq+16*6 are used as scratch; m7 is clobbered.
   1862    punpckhqdq            m7, m0, m1                 ;low:in1  high:in3
   1863    punpcklqdq            m0, m1
   1864    punpcklqdq            m1, m2, m3
   1865    punpckhqdq            m3, m2                     ;low:in7  high:in5
   1866    mova       [coeffq+16*4], m7
   1867    mova       [coeffq+16*5], m3
   1868    mova                  m7, [coeffq+16*7]
   1869    punpcklqdq            m2, m4, m5
   1870    punpckhqdq            m4, m5                     ;low:in9  high:in11
   1871    punpcklqdq            m3, m6, m7
   1872    punpckhqdq            m7, m6                     ;low:in15 high:in13
   1873    mova       [coeffq+16*6], m4
   1874    IDCT8_1D_PACKED
           ; Swap roles: fetch the odd inputs parked above and park the
           ; IDCT8_1D_PACKED results from m1-m3 in the same three slots.
   1875    mova                  m6, [coeffq+16*4]
   1876    mova                  m4, [coeffq+16*5]
   1877    mova                  m5, [coeffq+16*6]
   1878    mova       [coeffq+16*4], m1
   1879    mova       [coeffq+16*5], m2
   1880    mova       [coeffq+16*6], m3
   1881 
   1882    IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3
   1883 
           ; Final butterflies between the two halves.
   1884    mova                  m1, [coeffq+16*4]
   1885    psubsw                m3, m0, m7                 ;low:out15 high:out14
   1886    paddsw                m0, m7                     ;low:out0  high:out1
   1887    psubsw                m7, m1, m5                 ;low:out12 high:out13
   1888    paddsw                m1, m5                     ;low:out3  high:out2
   1889    mova       [coeffq+16*7], m3
   1890    mova                  m2, [coeffq+16*5]
   1891    mova                  m3, [coeffq+16*6]
   1892    psubsw                m5, m2, m4                 ;low:out11 high:out10
   1893    paddsw                m2, m4                     ;low:out4  high:out5
   1894    psubsw                m4, m3, m6                 ;low:out8  high:out9
   1895    paddsw                m3, m6                     ;low:out7  high:out6
   1896    mova                  m6, m7
   1897    ret
   1898 
   1899 INV_TXFM_16X4_FN adst, dct                         ; 16x4 instantiations whose type1 is adst
   1900 INV_TXFM_16X4_FN adst, adst
   1901 INV_TXFM_16X4_FN adst, flipadst
   1902 INV_TXFM_16X4_FN adst, identity
   1903 
   1904 cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Inverse ADST worker for 16x4 blocks.
        ; Pass 1: load rows 0-6 into m0-m6 (row 7 stays at coeffq+16*7), run the
        ; 16-point ADST kernel (.main, then .main_pass1_end for the last four
        ; output pairs), interleave, scale with sign correction and finish in
        ; idct_16x4's shared transpose tail.
        ; Pass 2: idct_16x4's .pass2_end driver with the 8x4 ADST second pass.
   1905    LOAD_7ROWS        coeffq, 16
   1906    call .main
   1907    call .main_pass1_end
   1908 
   1909    punpckhwd             m6, m7, m0                 ;packed -out11, -out15
   1910    punpcklwd             m0, m7                     ;packed   out0,   out4
   1911    punpcklwd             m7, m3, m4                 ;packed  -out3,  -out7
   1912    punpckhwd             m4, m3                     ;packed   out8,  out12
   1913    mova                  m1, [coeffq+16*6]
   1914    punpcklwd             m3, m1, m5                 ;packed  -out1,  -out5
   1915    punpckhwd             m5, m1                     ;packed  out10,  out14
   1916    mova                  m1, [coeffq+16*7]
   1917    mova       [coeffq+16*6], m3
   1918    mova       [coeffq+16*7], m7
   1919    punpckhwd             m3, m2, m1                 ;packed  -out9,  -out13
   1920    punpcklwd             m1, m2                     ;packed   out2,   out6
   1921 
   1922    mova                  m7, [o(pw_16384)]
   1923 
   1924 .pass1_end:
           ; Also entered from iflipadst_16x4, with the opposite sign pattern
           ; and pw_m16384 in m7. m0/m1/m4/m5 are scaled by m7; m3, m6 and the
           ; two vectors parked at coeffq+16*6 and coeffq+16*7 are scaled by
           ; -m7, which removes their negation. The result layout is what
           ; idct_16x4's .pass1_end3 expects: seven vectors in m0-m6 and one at
           ; coeffq+16*6.
   1925    REPX    {pmulhrsw x, m7}, m0, m1, m4, m5
   1926    pxor                  m2, m2
   1927    psubw                 m2, m7                     ; m2 = -m7
   1928    mova                  m7, [coeffq+16*6]
   1929    REPX    {pmulhrsw x, m2}, m7, m3, m6
   1930    pmulhrsw              m2, [coeffq+16*7]          ; m2 becomes the scaled vector parked at coeffq+16*7
   1931    mova       [coeffq+16*6], m7
   1932    jmp   m(idct_16x4_internal_8bpc).pass1_end3
   1933 
   1934 .pass2:
   1935    lea                 tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)]
   1936    jmp   m(idct_16x4_internal_8bpc).pass2_end
   1937 
   1938 ALIGN function_align
   1939 cglobal_label .main
        ; Main part of the 16-point inverse ADST on packed data; also called by
        ; the 4x16 adst/flipadst second passes and by iflipadst_16x4.
        ; In:  m0-m6 and the eighth vector at coeffq+16*7.
        ; Out (low / high half, per the trailing comments below):
        ;   m0 out0/-out15              m3 -out3/out12
        ;   coeffq+16*6: -out1/out14    coeffq+16*7: out2/-out13
        ;   m1 t14a/t15a   m2 t10/t11   m4 t2a/t3a   m5 t6/t7  (unfinished)
        ;   m6 = pd_2048, which .main_pass1_end relies on.
        ; The four unfinished pairs are completed by .main_pass1_end or
        ; .main_pass2_end. coeffq+16*4 and coeffq+16*5 are used as scratch.
   1940    mova       [coeffq+16*6], m0
   1941    pshufd                m0, m1, q1032
   1942    pshufd                m2, m2, q1032
   1943    punpckhwd             m1, m6, m0                 ;packed in13,  in2
   1944    punpcklwd             m0, m6                     ;packed  in3, in12
   1945    punpckhwd             m7, m5, m2                 ;packed in11,  in4
   1946    punpcklwd             m2, m5                     ;packed  in5, in10
   1947    mova                  m6, [o(pd_2048)]
   1948    ITX_MUL2X_PACK         1, 5, 6,  995, 3973       ;low:t2   high:t3
   1949    ITX_MUL2X_PACK         7, 5, 6, 1751, 3703       ;low:t4   high:t5
   1950    ITX_MUL2X_PACK         2, 5, 6, 3513, 2106       ;low:t10  high:t11
   1951    ITX_MUL2X_PACK         0, 5, 6, 3857, 1380       ;low:t12  high:t13
   1952    psubsw                m5, m1, m2                 ;low:t10a high:t11a
   1953    paddsw                m1, m2                     ;low:t2a  high:t3a
   1954    psubsw                m2, m7, m0                 ;low:t12a high:t13a
   1955    paddsw                m7, m0                     ;low:t4a  high:t5a
   1956    punpcklqdq            m0, m5
   1957    punpckhwd             m0, m5                     ;packed t10a, t11a
   1958    punpcklqdq            m5, m2
   1959    punpckhwd             m2, m5                     ;packed t13a, t12a
   1960    ITX_MUL2X_PACK         0, 5, 6, 3406, 2276       ;low:t10  high:t11
   1961    ITX_MUL2X_PACK         2, 5, 6, 4017,  799, 1    ;low:t12  high:t13
   1962    mova       [coeffq+16*4], m1
   1963    mova       [coeffq+16*5], m7
   1964    mova                  m1, [coeffq+16*6]
   1965    mova                  m7, [coeffq+16*7]
   1966    pshufd                m1, m1, q1032
   1967    pshufd                m3, m3, q1032
   1968    punpckhwd             m5, m7, m1                 ;packed in15,  in0
   1969    punpcklwd             m1, m7                     ;packed  in1, in14
   1970    punpckhwd             m7, m4, m3                 ;packed  in9,  in6
   1971    punpcklwd             m3, m4                     ;packed  in7,  in8
   1972    ITX_MUL2X_PACK         5, 4, 6,  201, 4091       ;low:t0    high:t1
   1973    ITX_MUL2X_PACK         7, 4, 6, 2440, 3290       ;low:t6    high:t7
   1974    ITX_MUL2X_PACK         3, 4, 6, 3035, 2751       ;low:t8    high:t9
   1975    ITX_MUL2X_PACK         1, 4, 6, 4052,  601       ;low:t14   high:t15
   1976    psubsw                m4, m5, m3                 ;low:t8a   high:t9a
   1977    paddsw                m5, m3                     ;low:t0a   high:t1a
   1978    psubsw                m3, m7, m1                 ;low:t14a  high:t15a
   1979    paddsw                m7, m1                     ;low:t6a   high:t7a
   1980    punpcklqdq            m1, m4
   1981    punpckhwd             m1, m4                     ;packed  t8a,  t9a
   1982    punpcklqdq            m4, m3
   1983    punpckhwd             m3, m4                     ;packed t15a, t14a
   1984    ITX_MUL2X_PACK         1, 4, 6,  799, 4017       ;low:t8    high:t9
   1985    ITX_MUL2X_PACK         3, 4, 6, 2276, 3406, 1    ;low:t14   high:t15
   1986    paddsw                m4, m1, m2                 ;low:t8a   high:t9a   (sum; consumed below for t10/t11 and -out1/out14)
   1987    psubsw                m1, m2                     ;low:t12a  high:t13a  (difference; packed just below)
   1988    psubsw                m2, m0, m3                 ;low:t14a  high:t15a
   1989    paddsw                m0, m3                     ;low:t10a  high:t11a
   1990    punpcklqdq            m3, m1
   1991    punpckhwd             m3, m1                     ;packed t12a, t13a
   1992    punpcklqdq            m1, m2
   1993    punpckhwd             m2, m1                     ;packed t15a, t14a
   1994    ITX_MUL2X_PACK         3, 1, 6, 1567, 3784       ;low:t12   high:t13
   1995    ITX_MUL2X_PACK         2, 1, 6, 3784, 1567, 1    ;low:t14   high:t15
   1996    psubsw                m1, m3, m2                 ;low:t14a  high:t15a
   1997    paddsw                m3, m2                     ;low:out2  high:-out13
   1998    psubsw                m2, m4, m0                 ;low:t10   high:t11
   1999    paddsw                m0, m4                     ;low:-out1 high:out14
   2000    mova       [coeffq+16*6], m0
   2001    mova       [coeffq+16*7], m3
   2002    mova                  m0, [coeffq+16*4]
   2003    mova                  m3, [coeffq+16*5]
   2004    psubsw                m4, m5, m3                 ;low:t4    high:t5
   2005    paddsw                m5, m3                     ;low:t0    high:t1
   2006    psubsw                m3, m0, m7                 ;low:t6    high:t7
   2007    paddsw                m0, m7                     ;low:t2    high:t3
   2008    punpcklqdq            m7, m4
   2009    punpckhwd             m7, m4                     ;packed t4, t5
   2010    punpcklqdq            m4, m3
   2011    punpckhwd             m3, m4                     ;packed t7, t6
   2012    ITX_MUL2X_PACK         7, 4, 6, 1567, 3784       ;low:t4a   high:t5a
   2013    ITX_MUL2X_PACK         3, 4, 6, 3784, 1567, 1    ;low:t6a   high:t7a
   2014    psubsw                m4, m5, m0                 ;low:t2a   high:t3a
   2015    paddsw                m0, m5                     ;low:out0  high:-out15
   2016    psubsw                m5, m7, m3                 ;low:t6    high:t7
   2017    paddsw                m3, m7                     ;low:-out3 high:out12
   2018    ret
   2019 ALIGN function_align
   2020 .main_pass1_end:
        ; Finishes the four pairs .main left in m1/m2/m4/m5, using pmaddwd with
        ; the pw_2896_2896 / pw_2896_m2896 tables, a pd_2048 rounding add and an
        ; arithmetic shift right by 12.
        ; In:  .main's exit state, including m6 = pd_2048.
        ; Out: m2 out6/-out9, m4 -out7/out8, m7 out4/-out11, m5 -out5/out10.
        ; m0 and m3 are borrowed for the constants and restored from
        ; coeffq+16*4 / coeffq+16*5 before returning; m1 is clobbered.
   2021    mova                  m7, [o(deint_shuf1)]
   2022    mova       [coeffq+16*4], m0
   2023    mova       [coeffq+16*5], m3
   2024    mova                  m0, [o(pw_2896_m2896)]
   2025    mova                  m3, [o(pw_2896_2896)]
   2026    pshufb                m1, m7                     ;t14a t15a
   2027    pshufb                m2, m7                     ;t10  t11
   2028    pshufb                m4, m7                     ;t2a  t3a
   2029    pshufb                m5, m7                     ;t6   t7
   2030    pmaddwd               m7, m0, m2
   2031    pmaddwd               m2, m3
   2032    paddd                 m7, m6
   2033    paddd                 m2, m6
   2034    psrad                 m7, 12
   2035    psrad                 m2, 12
   2036    packssdw              m2, m7                     ;low:out6  high:-out9
   2037    pmaddwd               m7, m0, m4
   2038    pmaddwd               m4, m3
   2039    paddd                 m7, m6
   2040    paddd                 m4, m6
   2041    psrad                 m7, 12
   2042    psrad                 m4, 12
   2043    packssdw              m4, m7                     ;low:-out7 high:out8
   2044    pmaddwd               m7, m3, m5
   2045    pmaddwd               m5, m0
   2046    paddd                 m7, m6
   2047    paddd                 m5, m6
   2048    psrad                 m7, 12
   2049    psrad                 m5, 12
   2050    packssdw              m7, m5                     ;low:out4  high:-out11
   2051    pmaddwd               m5, m3, m1
   2052    pmaddwd               m1, m0
   2053    paddd                 m5, m6
   2054    paddd                 m1, m6
   2055    psrad                 m5, 12
   2056    psrad                 m1, 12
   2057    packssdw              m5, m1                     ;low:-out5 high:out10
   2058    mova                  m0, [coeffq+16*4]
   2059    mova                  m3, [coeffq+16*5]
   2060    ret
   2061 ALIGN function_align
   2062 cglobal_label .main_pass2_end
        ; Alternative finish for the same four pairs, used by the 4x16
        ; adst/flipadst second passes: saturating add/sub on 16-bit words
        ; followed by pmulhrsw with pw_2896x8 instead of the pmaddwd sequence.
        ; In:  m1 t14a/t15a, m2 t10/t11, m4 t2a/t3a, m5 t6/t7.
        ; Out: m7 out4/-out11, m4 -out7/out8, m5 -out5/out10, m2 out6/-out9.
        ; m1 and m6 are clobbered; m0 and m3 are not touched.
   2063    mova                  m7, [o(pw_2896x8)]
   2064    punpckhqdq            m6, m2, m1                 ;low:t11   high:t15a
   2065    punpcklqdq            m2, m1                     ;low:t10   high:t14a
   2066    psubsw                m1, m2, m6
   2067    paddsw                m2, m6
   2068    punpckhqdq            m6, m4, m5                 ;low:t3a   high:t7
   2069    punpcklqdq            m4, m5                     ;low:t2a   high:t6
   2070    psubsw                m5, m4, m6
   2071    paddsw                m4, m6
   2072    pmulhrsw              m1, m7                     ;low:-out9 high:out10
   2073    pmulhrsw              m2, m7                     ;low:out6  high:-out5
   2074    pmulhrsw              m5, m7                     ;low:out8  high:-out11
   2075    pmulhrsw              m4, m7                     ;low:-out7 high:out4
   2076    punpckhqdq            m7, m4, m5                 ;low:out4  high:-out11
   2077    punpcklqdq            m4, m5                     ;low:-out7 high:out8
   2078    punpckhqdq            m5, m2, m1                 ;low:-out5 high:out10
   2079    punpcklqdq            m2, m1                     ;low:out6  high:-out9
   2080    ret
   2081 
   2082 
   2083 INV_TXFM_16X4_FN flipadst, dct                     ; 16x4 instantiations whose type1 is flipadst
   2084 INV_TXFM_16X4_FN flipadst, adst
   2085 INV_TXFM_16X4_FN flipadst, flipadst
   2086 INV_TXFM_16X4_FN flipadst, identity
   2087 
   2088 cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Inverse flipped-ADST worker for 16x4 blocks.
        ; Pass 1 runs the same kernel as iadst_16x4 but interleaves the opposite
        ; word halves, so every register that holds positive outputs there holds
        ; negated ones here. It therefore enters iadst_16x4's .pass1_end with
        ; pw_m16384 instead of pw_16384.
        ; Pass 2: idct_16x4's .pass2_end driver with the 8x4 flipadst second pass.
   2089    LOAD_7ROWS        coeffq, 16
   2090    call m(iadst_16x4_internal_8bpc).main
   2091    call m(iadst_16x4_internal_8bpc).main_pass1_end
   2092 
   2093    punpcklwd             m6, m7, m0                 ;packed  out11,  out15
   2094    punpckhwd             m0, m7                     ;packed  -out0,  -out4
   2095    punpckhwd             m7, m3, m4                 ;packed   out3,   out7
   2096    punpcklwd             m4, m3                     ;packed  -out8, -out12
   2097    mova                  m1, [coeffq+16*6]
   2098    punpckhwd             m3, m1, m5                 ;packed   out1,   out5
   2099    punpcklwd             m5, m1                     ;packed -out10, -out14
   2100    mova                  m1, [coeffq+16*7]
   2101    mova       [coeffq+16*6], m3
   2102    mova       [coeffq+16*7], m7
   2103    punpcklwd             m3, m2, m1                 ;packed   out9,  out13
   2104    punpckhwd             m1, m2                     ;packed  -out2,  -out6
   2105 
   2106    mova                  m7, [o(pw_m16384)]         ; negated multiplier for the shared scaling tail
   2107    jmp   m(iadst_16x4_internal_8bpc).pass1_end
   2108 
   2109 .pass2:
   2110    lea                 tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)]
   2111    jmp   m(idct_16x4_internal_8bpc).pass2_end
   2112 
   2113 
   2114 INV_TXFM_16X4_FN identity, dct                     ; 16x4 instantiations whose type1 is identity
   2115 INV_TXFM_16X4_FN identity, adst
   2116 INV_TXFM_16X4_FN identity, flipadst
   2117 INV_TXFM_16X4_FN identity, identity
   2118 
   2119 cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
   2120    mova                  m1, [coeffq+16*6]
   2121    mova                  m0, [coeffq+16*5]
   2122    mova                  m2, [coeffq+16*7]
   2123    mova                  m6, [o(pw_1697x16)]
   2124    mova                  m7, [o(pw_16384)]
   2125    pmulhrsw              m4, m6, m1
   2126    pmulhrsw              m3, m6, m0
   2127    pmulhrsw              m5, m6, m2
   2128    pmulhrsw              m4, m7
   2129    pmulhrsw              m3, m7
   2130    pmulhrsw              m5, m7
   2131    paddsw                m1, m4
   2132    paddsw                m0, m3
   2133    paddsw                m5, m2
   2134    mova                  m2, [coeffq+16*2]
   2135    mova                  m3, [coeffq+16*3]
   2136    mova                  m4, [coeffq+16*4]
   2137    mova       [coeffq+16*6], m1
   2138    mova       [coeffq+16*5], m0
   2139    mova       [coeffq+16*7], m5
   2140    pmulhrsw              m0, m6, m2
   2141    pmulhrsw              m1, m6, m3
   2142    pmulhrsw              m5, m6, m4
   2143    pmulhrsw              m0, m7
   2144    pmulhrsw              m1, m7
   2145    pmulhrsw              m5, m7
   2146    paddsw                m2, m0
   2147    paddsw                m3, m1
   2148    paddsw                m4, m5
   2149    mova                  m0, [coeffq+16*0]
   2150    mova                  m1, [coeffq+16*1]
   2151    pmulhrsw              m5, m6, m0
   2152    pmulhrsw              m6, m1
   2153    pmulhrsw              m5, m7
   2154    pmulhrsw              m6, m7
   2155    paddsw                m0, m5
   2156    paddsw                m1, m6
   2157    mova                  m6, [coeffq+16*6]
   2158    mova                  m5, [coeffq+16*5]
   2159    punpckhwd             m7, m0, m2                 ;packed out1,  out5
   2160    punpcklwd             m0, m2                     ;packed out0,  out4
   2161    punpckhwd             m2, m1, m3                 ;packed out3,  out7
   2162    punpcklwd             m1, m3                     ;packed out2,  out6
   2163    mova       [coeffq+16*6], m7
   2164    mova                  m7, [coeffq+16*7]
   2165    punpckhwd             m3, m4, m6                 ;packed out9,  out13
   2166    punpcklwd             m4, m6                     ;packed out8,  out12
   2167    punpckhwd             m6, m5, m7                 ;packed out11, out15
   2168    punpcklwd             m5, m7                     ;packed out10, out14
   2169    jmp   m(idct_16x4_internal_8bpc).pass1_end3
   2170 
   2171 .pass2:
   2172    lea                 tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)]
   2173    jmp   m(idct_16x4_internal_8bpc).pass2_end
   2174 
   2175 
   2176 %macro SAVE_8ROWS 2  ;base address to store to, byte stride between rows
           ; Stores m0..m7 with aligned moves to [%1 + %2*i], i = 0..7.
           ; No register is modified. mova requires every target address to be
           ; 16-byte aligned.
   2177    mova                 [%1+%2*0], m0
   2178    mova                 [%1+%2*1], m1
   2179    mova                 [%1+%2*2], m2
   2180    mova                 [%1+%2*3], m3
   2181    mova                 [%1+%2*4], m4
   2182    mova                 [%1+%2*5], m5
   2183    mova                 [%1+%2*6], m6
   2184    mova                 [%1+%2*7], m7
   2185 %endmacro
   2186 
   2187 %macro INV_TXFM_8X16_FN 2 ; type1, type2
           ; Instantiates the generic INV_TXFM_FN wrapper (defined outside this
           ; excerpt) for the 8x16 block size, passing 8 and 16*16 as its last two
           ; arguments (NOTE(review): these look like the xmm register count and
           ; the stack size in bytes - confirm against INV_TXFM_FN).
           ; For the dct_dct pair only, a DC-only shortcut is emitted after it.
   2188    INV_TXFM_FN          %1, %2, 8x16, 8, 16*16
   2189 %ifidn %1_%2, dct_dct
           ; broadcast coefficient word 0 across the low 8 bytes, then duplicate
           ; those words into all eight lanes of m0
   2190    pshuflw              m0, [coeffq], q0000
   2191    punpcklwd            m0, m0
   2192    mova                 m1, [o(pw_2896x8)]
   2193    pmulhrsw             m0, m1
   2194    mova                 m2, [o(pw_16384)]
           ; overwrites the first dword of the coeff buffer with eobd (presumably
           ; eob is 0 on this path, which would clear the DC coefficient - TODO
           ; confirm in INV_TXFM_FN)
   2195    mov            [coeffq], eobd
           ; scaling chain: pw_2896x8 is applied three times in total, with
           ; pw_16384 and then pw_2048 (= pw_16384 >> 3) as rounding shifts
   2196    pmulhrsw             m0, m1
   2197    pmulhrsw             m0, m2
   2198    psrlw                m2, 3              ; pw_2048
   2199    pmulhrsw             m0, m1
   2200    pmulhrsw             m0, m2
           ; r3d = 4 and tx2q = .end are handed to the 8x8 dct_dct .loop (by the
           ; look of it an iteration count and the address to continue at; the
           ; loop body is outside this excerpt)
   2201    mov                 r3d, 4
   2202    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)]
   2203    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
   2204 .end:
   2205    RET
   2206 %endif
   2207 %endmacro
   2208 
   2209 INV_TXFM_8X16_FN dct, dct
   2210 INV_TXFM_8X16_FN dct, adst
   2211 INV_TXFM_8X16_FN dct, flipadst
   2212 INV_TXFM_8X16_FN dct, identity
   2213 
        ; idct_8x16_internal_8bpc
        ; Pass 1 (.pass1/.pass1_end) runs an 8x8 first-pass routine, whose address
        ; is in r3, twice: first on the odd 16-byte coeff slots (coeffq+16*1,
        ; stride 32), then on the even ones. tx2q is borrowed as the continuation
        ; for the first run (the r3 routine presumably finishes by jumping through
        ; tx2q - it lives outside this excerpt), so the incoming tx2q is parked at
        ; [rsp+gprsize+16*11] meanwhile. The adst and flipadst 8x16 variants in
        ; this file enter at .pass1 with a different r3.
        ;
        ; Pass 2 (.pass2 .. .end1) builds a 16-point DCT from idct_8x8's .main and
        ; idct_16x8's .main and hands the two 8-row halves to idct_8x8's .end.
   2214 cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
   2215    lea                    r3, [o(m(idct_8x8_internal_8bpc).pass1)]
   2216 
   2217 .pass1:
   2218    LOAD_8ROWS    coeffq+16*1, 32, 1
   2219    mov   [rsp+gprsize+16*11], tx2q
   2220    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)]
   2221    jmp                    r3
   2222 
   2223 .pass1_end:
           ; first run done: write its result over the odd slots, load the even
           ; slots, restore the original continuation and run r3 a second time
   2224    SAVE_8ROWS    coeffq+16*1, 32
   2225    LOAD_8ROWS    coeffq+16*0, 32, 1
   2226    mov                  tx2q, [rsp+gprsize+16*11]
   2227    jmp                    r3
   2228 
   2229 .pass2:
   2230    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end)]
   2231 
   2232 .pass2_pre:
           ; park m1/m3/m5/m7 in coeff slots 2/6/10/14, compact m0/m2/m4/m6 into
           ; m0-m3 and fetch slots 1/5/9/13 into m4-m7: the inputs of the 8-point
           ; stage called next
   2233    mova       [coeffq+16*2 ], m1
   2234    mova       [coeffq+16*6 ], m3
   2235    mova       [coeffq+16*10], m5
   2236    mova       [coeffq+16*14], m7
   2237    mova                   m1, m2
   2238    mova                   m2, m4
   2239    mova                   m3, m6
   2240    mova                   m4, [coeffq+16*1 ]
   2241    mova                   m5, [coeffq+16*5 ]
   2242    mova                   m6, [coeffq+16*9 ]
   2243    mova                   m7, [coeffq+16*13]
   2244 
   2245 .pass2_main:
   2246    call m(idct_8x8_internal_8bpc).main
   2247 
           ; keep the first-half result on the stack (slots 3..) and feed coeff
           ; slots 2/6/10/14 and 3/7/11/15 to the second-half kernel
   2248    SAVE_7ROWS   rsp+gprsize+16*3, 16
   2249    mova                   m0, [coeffq+16*2 ]
   2250    mova                   m1, [coeffq+16*6 ]
   2251    mova                   m2, [coeffq+16*10]
   2252    mova                   m3, [coeffq+16*14]
   2253    mova                   m4, [coeffq+16*3 ]
   2254    mova                   m5, [coeffq+16*7 ]
   2255    mova                   m6, [coeffq+16*11]
   2256    mova                   m7, [coeffq+16*15]
   2257    call m(idct_16x8_internal_8bpc).main
   2258 
           ; r3 remembers the original dstq; the rows currently in registers are
           ; handed to idct_8x8's .end with dstq moved 8 rows down
   2259    mov                    r3, dstq
   2260    lea                  dstq, [dstq+strideq*8]
   2261    jmp  m(idct_8x8_internal_8bpc).end
   2262 
   2263 .end:
           ; second half of the output: reload the rows kept on the stack, restore
           ; dstq from r3 and go through idct_8x8's .end again, this time
           ; continuing at .end1
   2264    LOAD_8ROWS   rsp+gprsize+16*3, 16
   2265    mova   [rsp+gprsize+16*0], m7
   2266    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
   2267    mov                  dstq, r3
   2268    jmp  m(idct_8x8_internal_8bpc).end
   2269 
   2270 .end1:
           ; zero all sixteen 16-byte slots (256 bytes) of the coeff buffer
   2271    pxor                   m7, m7
   2272    REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
   2273    ret
   2274 
   2275 INV_TXFM_8X16_FN adst, dct
   2276 INV_TXFM_8X16_FN adst, adst
   2277 INV_TXFM_8X16_FN adst, flipadst
   2278 INV_TXFM_8X16_FN adst, identity
   2279 
        ; iadst_8x16_internal_8bpc
        ; Pass 1 is idct_8x16's two-run driver (.pass1) with r3 pointing at
        ; iadst_8x8's .pass1 instead of the DCT one.
        ; Pass 2 distributes the 16 input rows between m0-m7 and stack slots 3..10
        ; in the layout consumed by iadst_16x8's .main, runs .main and
        ; .main_pass2_end, and hands the result to iadst_8x8's .end in two steps
        ; (first with dstq moved 8 rows down, then with the original dstq restored
        ; from r3).
   2280 cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
   2281    lea                    r3, [o(m(iadst_8x8_internal_8bpc).pass1)]
   2282    jmp  m(idct_8x16_internal_8bpc).pass1
   2283 
   2284 .pass2:
   2285    lea                  tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
   2286 
   2287 .pass2_pre:
           ; m0/m1 -> stack slots 7/8, m6/m7 -> slots 5/6; m2-m5 move down to m0-m3
   2288    mova    [rsp+gprsize+16*7], m0
   2289    mova    [rsp+gprsize+16*8], m1
   2290    mova    [rsp+gprsize+16*5], m6
   2291    mova    [rsp+gprsize+16*6], m7
   2292    mova                    m0, m2
   2293    mova                    m1, m3
   2294    mova                    m2, m4
   2295    mova                    m3, m5
   2296 
   2297 .pass2_main:
           ; coeff slots 1/3/13/15 -> stack slots 3/4/9/10 (32*5 == 16*10),
           ; coeff slots 5/7/9/11 -> m4-m7
   2298    mova                    m4, [coeffq+16*1 ]
   2299    mova                    m5, [coeffq+16*3 ]
   2300    mova                    m6, [coeffq+16*13]
   2301    mova                    m7, [coeffq+16*15]
   2302    mova    [rsp+gprsize+16*3], m4
   2303    mova    [rsp+gprsize+16*4], m5
   2304    mova    [rsp+gprsize+16*9], m6
   2305    mova    [rsp+gprsize+32*5], m7
   2306    mova                    m4, [coeffq+16*5 ]
   2307    mova                    m5, [coeffq+16*7 ]
   2308    mova                    m6, [coeffq+16*9 ]
   2309    mova                    m7, [coeffq+16*11]
   2310 
   2311    call m(iadst_16x8_internal_8bpc).main
   2312    call m(iadst_16x8_internal_8bpc).main_pass2_end
   2313 
           ; keep the original dstq in r3 and hand the rows currently in registers
           ; to iadst_8x8's .end with dstq moved 8 rows down
   2314    mov                    r3, dstq
   2315    lea                  dstq, [dstq+strideq*8]
   2316    jmp m(iadst_8x8_internal_8bpc).end
   2317 
   2318 .end:
           ; remaining rows come from the stack; afterwards idct_8x16's .end1 (which
           ; clears the coeff buffer) is the continuation
   2319    LOAD_8ROWS   rsp+gprsize+16*3, 16
   2320    mova   [rsp+gprsize+16*0], m7
   2321    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
   2322    mov                  dstq, r3
   2323    jmp  m(iadst_8x8_internal_8bpc).end
   2324 
   2325 
   2326 INV_TXFM_8X16_FN flipadst, dct
   2327 INV_TXFM_8X16_FN flipadst, adst
   2328 INV_TXFM_8X16_FN flipadst, flipadst
   2329 INV_TXFM_8X16_FN flipadst, identity
   2330 
        ; iflipadst_8x16_internal_8bpc
        ; Same structure as the adst 8x16 routine: pass 1 goes through idct_8x16's
        ; .pass1 driver (r3 = iflipadst_8x8's .pass1) and pass 2 feeds
        ; iadst_16x8's .main / .main_pass2_end with the same register/stack
        ; layout. The difference is in the pointer handling: here the rows left in
        ; registers are handed to iflipadst_8x8's .end with the original dstq, and
        ; the rows kept on the stack with r3 = dstq + 8*stride (computed up front
        ; in .pass2), i.e. the two halves go to the opposite destinations.
   2331 cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
   2332    lea                    r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)]
   2333    jmp  m(idct_8x16_internal_8bpc).pass1
   2334 
   2335 .pass2:
   2336    lea                   tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)]
   2337    lea                     r3, [dstq+strideq*8]
   2338 
   2339 .pass2_pre:
           ; m0/m1 -> stack slots 7/8, m6/m7 -> slots 5/6; m2-m5 move down to m0-m3
   2340    mova    [rsp+gprsize+16*7], m0
   2341    mova    [rsp+gprsize+16*8], m1
   2342    mova    [rsp+gprsize+16*5], m6
   2343    mova    [rsp+gprsize+16*6], m7
   2344    mova                    m0, m2
   2345    mova                    m1, m3
   2346    mova                    m2, m4
   2347    mova                    m3, m5
   2348 
   2349 .pass2_main:
           ; coeff slots 1/3/13/15 -> stack slots 3/4/9/10 (32*5 == 16*10),
           ; coeff slots 5/7/9/11 -> m4-m7
   2350    mova                    m4, [coeffq+16*1 ]
   2351    mova                    m5, [coeffq+16*3 ]
   2352    mova                    m6, [coeffq+16*13]
   2353    mova                    m7, [coeffq+16*15]
   2354    mova    [rsp+gprsize+16*3], m4
   2355    mova    [rsp+gprsize+16*4], m5
   2356    mova    [rsp+gprsize+16*9], m6
   2357    mova    [rsp+gprsize+32*5], m7
   2358    mova                    m4, [coeffq+16*5 ]
   2359    mova                    m5, [coeffq+16*7 ]
   2360    mova                    m6, [coeffq+16*9 ]
   2361    mova                    m7, [coeffq+16*11]
   2362 
   2363    call m(iadst_16x8_internal_8bpc).main
   2364    call m(iadst_16x8_internal_8bpc).main_pass2_end
   2365    jmp  m(iflipadst_8x8_internal_8bpc).end
   2366 
   2367 .end:
           ; remaining rows come from the stack and use the destination saved in
           ; r3; idct_8x16's .end1 (coeff buffer clear) is the continuation
   2368    LOAD_8ROWS    rsp+gprsize+16*3, 16
   2369    mova    [rsp+gprsize+16*0], m7
   2370    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
   2371    mov                   dstq, r3
   2372    jmp  m(iflipadst_8x8_internal_8bpc).end
   2373 
   2374 
   2375 INV_TXFM_8X16_FN identity, dct
   2376 INV_TXFM_8X16_FN identity, adst
   2377 INV_TXFM_8X16_FN identity, flipadst
   2378 INV_TXFM_8X16_FN identity, identity
   2379 
        ; iidentity_8x16_internal_8bpc
        ; Pass 1 performs no arithmetic of its own beyond what LOAD_8ROWS does with
        ; its third argument (macro defined outside this excerpt): each half of
        ; the coeff buffer (odd slots first, then even slots) is loaded and passed
        ; straight to idct_8x8's .pass1_end3 with m6 also copied to
        ; [rsp+gprsize+16*1]. r3 keeps the caller's tx2q while tx2q is borrowed as
        ; continuation for the first half.
        ;
        ; Pass 2 (.end) applies IDTX16 with the pw_1697x16 constant to all eight
        ; rows, scales them with pmulhrsw by pw_2048 and hands them to idct_8x8's
        ; .end3. It runs twice: once for the rows that arrive in registers, and
        ; once more (via .end1) for the rows kept in the odd coeff slots.
   2380 cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
   2381    LOAD_8ROWS    coeffq+16*1, 32, 1
   2382    mov                    r3, tx2q
   2383    lea                  tx2q, [o(.pass1_end)]
   2384    mova   [rsp+gprsize+16*1], m6
   2385    jmp  m(idct_8x8_internal_8bpc).pass1_end3
   2386 
   2387 .pass1_end:
   2388    SAVE_8ROWS    coeffq+16*1, 32
   2389    LOAD_8ROWS    coeffq+16*0, 32, 1
   2390    mov                  tx2q, r3
   2391    mova   [rsp+gprsize+16*1], m6
   2392    jmp  m(idct_8x8_internal_8bpc).pass1_end3
   2393 
   2394 .pass2:
   2395    lea                  tx2q, [o(.end1)]
   2396 
   2397 .end:
           ; m7 is needed for the constant and one more register is passed to
           ; IDTX16 as second argument (it looks like a scratch register - macro
           ; not visible here), so m7 and m6 are spilled to stack slots 0 and 1
   2398    mova   [rsp+gprsize+16*0], m7
   2399    mova   [rsp+gprsize+16*1], m6
   2400    mova                   m7, [o(pw_1697x16)]
   2401    REPX     {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
           ; row 6: reload it, with m5 (already processed, parked in slot 2) as
           ; second argument
   2402    mova                   m6, [rsp+gprsize+16*1]
   2403    mova   [rsp+gprsize+16*2], m5
   2404    IDTX16                  6, 5, 7
           ; row 7: processed in m5, since m7 still holds the constant
   2405    mova                   m5, [rsp+gprsize+16*0]
   2406    IDTX16                  5, 7, 7
   2407    mova                   m7, [o(pw_2048)]
   2408    REPX     {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
           ; the parked row 5 is scaled straight from memory into m7; from here on
           ; m5 holds what arrived in m7 and m7 holds what arrived in m5
   2409    pmulhrsw               m7, [rsp+gprsize+16*2]
   2410    mova   [rsp+gprsize+16*0], m5
   2411    mova   [rsp+gprsize+16*1], m6
   2412    mova   [rsp+gprsize+16*2], m7
   2413    jmp  m(idct_8x8_internal_8bpc).end3
   2414 
   2415 .end1:
           ; second run: rows from the odd coeff slots, continuation switched to
           ; idct_8x16's .end1 (coeff buffer clear). dstq is advanced by only two
           ; rows here (NOTE(review): presumably .end3 has already moved dstq part
           ; of the way - confirm in idct_8x8).
   2416    LOAD_8ROWS    coeffq+16*1, 32
   2417    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
   2418    lea                  dstq, [dstq+strideq*2]
   2419    jmp .end
   2420 
   2421 
   2422 %macro INV_TXFM_16X8_FN 2 ; type1, type2
           ; Instantiates the generic INV_TXFM_FN wrapper (defined outside this
           ; excerpt) for the 16x8 block size, passing 8 and 16*16 as its last two
           ; arguments. For the dct_dct pair only, a DC-only shortcut is emitted
           ; after it.
   2423    INV_TXFM_FN          %1, %2, 16x8, 8, 16*16
   2424 %ifidn %1_%2, dct_dct
           ; movd fills only the low dword of m1/m2, so only the low lanes of the
           ; products are meaningful at this point
   2425    movd                 m1, [o(pw_2896x8)]
   2426    pmulhrsw             m0, m1, [coeffq]
   2427    movd                 m2, [o(pw_16384)]
           ; overwrites the first dword of the coeff buffer with eobd (presumably
           ; eob is 0 on this path, which would clear the DC coefficient - TODO
           ; confirm in INV_TXFM_FN)
   2428    mov            [coeffq], eobd
   2429    pmulhrsw             m0, m1
           ; r2d = 4 and tx2q = .end are handed to the 16x4 dct_dct .dconly code,
           ; which lives outside this excerpt. Note that r2d is written after
           ; [coeffq] has been used for the last time here.
   2430    mov                 r2d, 4
   2431    lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)]
   2432    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
   2433 .end:
   2434    RET
   2435 %endif
   2436 %endmacro
   2437 
   2438 INV_TXFM_16X8_FN dct, dct
   2439 INV_TXFM_16X8_FN dct, adst
   2440 INV_TXFM_16X8_FN dct, flipadst
   2441 INV_TXFM_16X8_FN dct, identity
   2442 
        ; idct_16x8_internal_8bpc
        ; Pass 1: a 16-point DCT assembled from idct_8x8's .main on the even coeff
        ; slots (result kept on the stack from slot 3 on) and this function's
        ; .main on the odd slots. The rows left in registers and the rows kept on
        ; the stack are then each sent through idct_8x8's .pass1_end; r3 keeps the
        ; caller's tx2q while tx2q serves as continuation for the first of the two.
        ;
        ; Pass 2: the 16-wide block is handled as two 8-wide halves through
        ; idct_8x8's .pass2_main - first the rows in registers at dstq, then the
        ; rows from the odd coeff slots at r3 = dstq + 8 (bytes), with idct_8x16's
        ; .end1 (coeff buffer clear) as final continuation.
   2443 cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
   2444    LOAD_8ROWS    coeffq+16*0, 32, 1
   2445    call m(idct_8x8_internal_8bpc).main
   2446    SAVE_7ROWS   rsp+gprsize+16*3, 16
   2447 
   2448    LOAD_8ROWS    coeffq+16*1, 32, 1
   2449    call  .main
   2450    mov                    r3, tx2q
   2451    lea                  tx2q, [o(.pass1_end)]
   2452    jmp  m(idct_8x8_internal_8bpc).pass1_end
   2453 
   2454 .pass1_end:
           ; park the finished rows in the odd coeff slots, then run the rows kept
           ; on the stack through the same tail with the caller's tx2q restored
   2455    SAVE_8ROWS    coeffq+16*1, 32
   2456    LOAD_8ROWS   rsp+gprsize+16*3, 16
   2457    mova   [rsp+gprsize+16*0], m7
   2458    mov                  tx2q, r3
   2459    jmp  m(idct_8x8_internal_8bpc).pass1_end
   2460 
   2461 .pass2:
   2462    lea                  tx2q, [o(.end)]
   2463    lea                    r3, [dstq+8]
   2464    jmp  m(idct_8x8_internal_8bpc).pass2_main
   2465 
   2466 .end:
   2467    LOAD_8ROWS    coeffq+16*1, 32
   2468    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
   2469    mov                  dstq, r3
   2470    jmp  m(idct_8x8_internal_8bpc).pass2_main
   2471 
   2472 
   2473 ALIGN function_align
        ; .main of idct_16x8_internal_8bpc: the t8..t15 branch of a 16-point
        ; inverse DCT (per the inline labels) followed by the final add/sub
        ; against the first-half rows. All adds/subs are saturating (paddsw/psubsw).
        ;
        ; Stack slots are addressed as rsp+gprsize*2+16*k here, while the callers
        ; in this file use rsp+gprsize+16*k for the same data (the extra gprsize
        ; presumably covers the return address pushed by call - TODO confirm).
        ;
        ; In:   m0-m7       eight input rows
        ;       slots 3..9  seven first-half rows, stored by the callers with
        ;                   SAVE_7ROWS right after idct_8x8's .main
        ;       slot 0      read as the eighth first-half row (it is combined with
        ;                   t8a into out7/out8)
        ; Out:  m0..m7      out8..out15 in order (out15 is also stored to slot 0)
        ;       slots 3..9  out0..out6
        ;       slot 32*5   out7   (32*5 == 16*10)
        ; Slots 1, 2 and 32*5 serve as scratch for spilled registers on the way.
   2474 cglobal_label .main
   2475    mova [rsp+gprsize*2+16*1], m2
   2476    mova [rsp+gprsize*2+16*2], m6
   2477    mova [rsp+gprsize*2+32*5], m5
   2478 
   2479    mova                   m6, [o(pd_2048)]
   2480    ITX_MULSUB_2W           0, 7, 2, 5, 6,  401, 4076   ;t8a, t15a
   2481    ITX_MULSUB_2W           4, 3, 2, 5, 6, 3166, 2598   ;t9a, t14a
   2482    psubsw                 m2, m0, m4                   ;t9
   2483    paddsw                 m0, m4                       ;t8
   2484    psubsw                 m4, m7, m3                   ;t14
   2485    paddsw                 m7, m3                       ;t15
   2486    ITX_MULSUB_2W           4, 2, 3, 5, 6, 1567, 3784   ;t9a, t14a
           ; swap the spilled inputs back in and spill t14a/t9a/t15 in their place
   2487    mova                   m3, [rsp+gprsize*2+16*1]
   2488    mova                   m5, [rsp+gprsize*2+32*5]
   2489    mova [rsp+gprsize*2+16*1], m2
   2490    mova [rsp+gprsize*2+32*5], m4
   2491    mova                   m2, [rsp+gprsize*2+16*2]
   2492    mova [rsp+gprsize*2+16*2], m7
   2493    ITX_MULSUB_2W           3, 5, 7, 4, 6, 1931, 3612   ;t10a, t13a
   2494    ITX_MULSUB_2W           2, 1, 7, 4, 6, 3920, 1189   ;t11a, t12a
   2495    psubsw                 m4, m2, m3                   ;t10
   2496    paddsw                 m2, m3                       ;t11
   2497    psubsw                 m3, m1, m5                   ;t13
   2498    paddsw                 m1, m5                       ;t12
   2499    ITX_MULSUB_2W           3, 4, 7, 5, 6, m3784, 1567  ;t10a, t13a
   2500    mova                   m7, [rsp+gprsize*2+32*5]
   2501    psubsw                 m6, m0, m2                   ;t11a
   2502    paddsw                 m0, m2                       ;t8a
   2503    paddsw                 m2, m7, m3                   ;t9
   2504    psubsw                 m7, m3                       ;t10
           ; t8a and t9 are final: combine them with the first-half rows in
           ; slot 0 and slot 9
   2505    mova                   m5, [rsp+gprsize*2+16*0]
   2506    psubsw                 m3, m5, m0                   ;out8
   2507    paddsw                 m0, m5                       ;out7
   2508    mova [rsp+gprsize*2+32*5], m0
   2509    mova                   m5, [rsp+gprsize*2+16*9]
   2510    psubsw                 m0, m5, m2                   ;out9
   2511    paddsw                 m2, m5                       ;out6
   2512    mova [rsp+gprsize*2+16*0], m0
   2513    mova [rsp+gprsize*2+16*9], m2
   2514    mova                   m0, [rsp+gprsize*2+16*1]
   2515    mova                   m2, [rsp+gprsize*2+16*2]
   2516    mova [rsp+gprsize*2+16*1], m3
   2517    psubsw                 m5, m0, m4                   ;t13
   2518    paddsw                 m0, m4                       ;t14
           ; m6 holds t11a now, so the rounding constant is reloaded into m3
   2519    mova                   m3, [o(pd_2048)]
   2520    psubsw                 m4, m2, m1                   ;t12a
   2521    paddsw                 m1, m2                       ;t15a
   2522    mova [rsp+gprsize*2+16*2], m1
   2523    ITX_MULSUB_2W           5, 7, 1, 2, 3, 2896, 2896   ;t10a, t13a
   2524    ITX_MULSUB_2W           4, 6, 1, 2, 3, 2896, 2896   ;t11,  t12
           ; final butterflies: each first-half row in slots 8 down to 3 yields
           ; one low output (stored back to the same slot) and one high output
           ; (kept in a register)
   2525    mova                   m3, [rsp+gprsize*2+16*8]
   2526    psubsw                 m2, m3, m5                   ;out10
   2527    paddsw                 m3, m5                       ;out5
   2528    mova                   m5, [rsp+gprsize*2+16*7]
   2529    mova [rsp+gprsize*2+16*8], m3
   2530    psubsw                 m3, m5, m4                   ;out11
   2531    paddsw                 m5, m4                       ;out4
   2532    mova                   m4, [rsp+gprsize*2+16*6]
   2533    mova [rsp+gprsize*2+16*7], m5
   2534    paddsw                 m5, m4, m6                   ;out3
   2535    psubsw                 m4, m6                       ;out12
   2536    mova                   m6, [rsp+gprsize*2+16*5]
   2537    mova [rsp+gprsize*2+16*6], m5
   2538    psubsw                 m5, m6, m7                   ;out13
   2539    paddsw                 m6, m7                       ;out2
   2540    mova                   m7, [rsp+gprsize*2+16*4]
   2541    mova [rsp+gprsize*2+16*5], m6
   2542    psubsw                 m6, m7, m0                   ;out14
   2543    paddsw                 m7, m0                       ;out1
   2544    mova                   m1, [rsp+gprsize*2+16*2]
   2545    mova                   m0, [rsp+gprsize*2+16*3]
   2546    mova [rsp+gprsize*2+16*4], m7
   2547    psubsw                 m7, m0, m1                   ;out15
   2548    paddsw                 m0, m1                       ;out0
   2549    mova [rsp+gprsize*2+16*3], m0
           ; fetch out9/out8 from their temporary slots, then reuse slot 0 for out15
   2550    mova                   m1, [rsp+gprsize*2+16*0]
   2551    mova                   m0, [rsp+gprsize*2+16*1]
   2552    mova [rsp+gprsize*2+16*0], m7
   2553    ret
   2554 
   2555 INV_TXFM_16X8_FN adst, dct
   2556 INV_TXFM_16X8_FN adst, adst
   2557 INV_TXFM_16X8_FN adst, flipadst
   2558 INV_TXFM_16X8_FN adst, identity
   2559 
        ; iadst_16x8_internal_8bpc
        ; Pass 1: all sixteen coeff rows are pre-scaled with pmulhrsw by pw_2896x8
        ; and distributed in the layout .main expects:
        ;     coeff rows  0, 1      -> stack slots 7, 8
        ;     coeff rows 14, 15     -> stack slots 9, 10   (32*5 == 16*10)
        ;     coeff rows  6, 7      -> stack slots 5, 6
        ;     coeff rows  8, 9      -> stack slots 3, 4
        ;     coeff rows  2..5      -> m0..m3
        ;     coeff rows 10..13     -> m4..m7
        ; After .main and .main_pass1_end, the rows left in registers and the rows
        ; kept on the stack are each sent through iadst_8x8's .pass1_end; r3 keeps
        ; the caller's tx2q while tx2q serves as continuation for the first one.
        ;
        ; Pass 2: two 8-wide halves through iadst_8x8's .pass2_main - first the
        ; rows in registers at dstq, then the rows from the odd coeff slots at
        ; r3 = dstq + 8 (bytes), with idct_8x16's .end1 as final continuation.
   2560 cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
   2561    mova                    m7, [o(pw_2896x8)]
   2562    pmulhrsw                m0, m7, [coeffq+16*0 ]
   2563    pmulhrsw                m1, m7, [coeffq+16*1 ]
   2564    pmulhrsw                m2, m7, [coeffq+16*14]
   2565    pmulhrsw                m3, m7, [coeffq+16*15]
   2566    mova    [rsp+gprsize+16*7], m0
   2567    mova    [rsp+gprsize+16*8], m1
   2568    mova    [rsp+gprsize+16*9], m2
   2569    mova    [rsp+gprsize+32*5], m3
   2570    pmulhrsw                m0, m7, [coeffq+16*6 ]
   2571    pmulhrsw                m1, m7, [coeffq+16*7 ]
   2572    pmulhrsw                m2, m7, [coeffq+16*8 ]
   2573    pmulhrsw                m3, m7, [coeffq+16*9 ]
   2574    mova    [rsp+gprsize+16*3], m2
   2575    mova    [rsp+gprsize+16*4], m3
   2576    mova    [rsp+gprsize+16*5], m0
   2577    mova    [rsp+gprsize+16*6], m1
   2578    pmulhrsw                m0, m7, [coeffq+16*2 ]
   2579    pmulhrsw                m1, m7, [coeffq+16*3 ]
   2580    pmulhrsw                m2, m7, [coeffq+16*4 ]
   2581    pmulhrsw                m3, m7, [coeffq+16*5 ]
   2582    pmulhrsw                m4, m7, [coeffq+16*10]
   2583    pmulhrsw                m5, m7, [coeffq+16*11]
   2584    pmulhrsw                m6, m7, [coeffq+16*12]
           ; the constant in m7 is consumed last, by the row that stays in m7
   2585    pmulhrsw                m7,     [coeffq+16*13]
   2586 
   2587    call .main
   2588    call .main_pass1_end
   2589    mov                    r3, tx2q
   2590    lea                  tx2q, [o(.pass1_end)]
   2591    jmp m(iadst_8x8_internal_8bpc).pass1_end
   2592 
   2593 .pass1_end:
           ; park the finished rows in the odd coeff slots, then run the rows kept
           ; on the stack through the same tail with the caller's tx2q restored
   2594    SAVE_8ROWS    coeffq+16*1, 32
   2595    LOAD_8ROWS   rsp+gprsize+16*3, 16
   2596    mova   [rsp+gprsize+16*0], m7
   2597    mov                  tx2q, r3
   2598    jmp m(iadst_8x8_internal_8bpc).pass1_end
   2599 
   2600 .pass2:
   2601    lea                  tx2q, [o(.end)]
   2602    lea                    r3, [dstq+8]
   2603    jmp m(iadst_8x8_internal_8bpc).pass2_main
   2604 
   2605 .end:
   2606    LOAD_8ROWS    coeffq+16*1, 32
   2607    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
   2608    mov                  dstq, r3
   2609    jmp m(iadst_8x8_internal_8bpc).pass2_main
   2610 
   2611 ALIGN function_align
        ; .main of iadst_16x8_internal_8bpc: 16-point inverse ADST, everything
        ; except the last 2896 rotation stage (that one is finished by
        ; .main_pass1_end or .main_pass2_end). Adds/subs are saturating.
        ;
        ; Stack slots are addressed as rsp+gprsize*2+16*k here, while the callers
        ; in this file use rsp+gprsize+16*k for the same data (the extra gprsize
        ; presumably covers the return address pushed by call - TODO confirm).
        ;
        ; In (as set up by this function's pass 1 and named by the inline labels):
        ;     m0..m7 = in2, in3, in4, in5, in10, in11, in12, in13
        ;     slot 3 = in8   slot 4 = in9   slot 5 = in6    slot 6 = in7
        ;     slot 7 = in0   slot 8 = in1   slot 9 = in14   slot 32*5 = in15
        ; State at ret:
        ;     finished outputs:  slot 0 = -out15, slot 2 = out12, slot 3 = out0,
        ;                        slot 4 = -out1,  slot 5 = out2 (also in m0),
        ;                        slot 6 = -out3,  m5 = -out13,    m6 = out14
        ;     pairs still awaiting the last rotation:
        ;                        slot 7 / slot 12 = t6   / t7
        ;                        slot 10 / slot 11 = t2a  / t3a
        ;                        slot 9 / slot 14 = t10  / t11
        ;                        m3     / m4      = t14a / t15a
        ; NOTE(review): the ;out4, ;-out7 and ;out6 labels on the stores to slots
        ; 7, 10 and 9 appear to name the output that slot will hold once the last
        ; rotation is done, not the value stored at that moment. The ;-out11
        ; label on the store to slot 1 is attached to a store of m4, which holds
        ; t7a at that point - confirm what it is meant to say.
   2612 cglobal_label .main
   2613    mova  [rsp+gprsize*2+16*0], m1
   2614    mova  [rsp+gprsize*2+16*1], m2
   2615    mova  [rsp+gprsize*2+16*2], m6
   2616 
   2617    mova                    m6, [o(pd_2048)]
   2618    ITX_MULSUB_2W            7, 0, 1, 2, 6,  995, 3973   ;t3,  t2
   2619    ITX_MULSUB_2W            3, 4, 1, 2, 6, 3513, 2106   ;t11, t10
   2620    psubsw                  m1, m0, m4                   ;t10a
   2621    paddsw                  m0, m4                       ;t2a
   2622    psubsw                  m4, m7, m3                   ;t11a
   2623    paddsw                  m3, m7                       ;t3a
   2624    ITX_MULSUB_2W            1, 4, 7, 2, 6, 3406, 2276   ;t11, t10
   2625    mova                    m2, [rsp+gprsize*2+16*0]     ;in3
   2626    mova                    m7, [rsp+gprsize*2+16*1]     ;in4
   2627    mova  [rsp+gprsize*2+16*0], m1                       ;t11
   2628    mova  [rsp+gprsize*2+16*1], m4                       ;t10
   2629    mova                    m1, [rsp+gprsize*2+16*2]     ;in12
   2630    mova  [rsp+gprsize*2+16*2], m0                       ;t2a
   2631    ITX_MULSUB_2W            5, 7, 0, 4, 6, 1751, 3703   ;t5,  t4
   2632    ITX_MULSUB_2W            2, 1, 0, 4, 6, 3857, 1380   ;t13, t12
   2633    psubsw                  m0, m7, m1                   ;t12a
   2634    paddsw                  m1, m7                       ;t4a
   2635    psubsw                  m4, m5, m2                   ;t13a
   2636    paddsw                  m5, m2                       ;t5a
   2637    ITX_MULSUB_2W            4, 0, 7, 2, 6, 4017,  799   ;t12, t13
   2638    mova                    m2, [rsp+gprsize*2+16*8]     ;in1
   2639    mova                    m7, [rsp+gprsize*2+16*9]     ;in14
   2640    mova  [rsp+gprsize*2+16*8], m4                       ;t12
   2641    mova  [rsp+gprsize*2+16*9], m0                       ;t13
   2642    mova                    m4, [rsp+gprsize*2+16*4]     ;in9
   2643    mova                    m0, [rsp+gprsize*2+16*5]     ;in6
   2644    mova  [rsp+gprsize*2+16*4], m1                       ;t4a
   2645    mova  [rsp+gprsize*2+16*5], m5                       ;t5a
   2646    ITX_MULSUB_2W            2, 7, 1, 5, 6, 4052,  601   ;t15, t14
   2647    ITX_MULSUB_2W            4, 0, 1, 5, 6, 2440, 3290   ;t7,  t6
   2648    psubsw                  m1, m0, m7                   ;t14a
   2649    paddsw                  m0, m7                       ;t6a
   2650    psubsw                  m5, m4, m2                   ;t15a
   2651    paddsw                  m4, m2                       ;t7a
   2652    ITX_MULSUB_2W            5, 1, 7, 2, 6, 2276, 3406   ;t14, t15
   2653    mova                    m2, [rsp+gprsize*2+16*2]     ;t2a
   2654    mova  [rsp+gprsize*2+16*2], m5                       ;t14
   2655    psubsw                  m7, m2, m0                   ;t6
   2656    paddsw                  m2, m0                       ;t2
   2657    psubsw                  m0, m3, m4                   ;t7
   2658    paddsw                  m3, m4                       ;t3
   2659    ITX_MULSUB_2W            0, 7, 4, 5, 6, 3784, 1567   ;t6a, t7a
   2660    mova                    m4, [rsp+gprsize*2+16*7]     ;in0
   2661    mova                    m5, [rsp+gprsize*2+32*5]     ;in15
   2662    mova  [rsp+gprsize*2+16*7], m3                       ;t3
   2663    mova  [rsp+gprsize*2+32*5], m1                       ;t15
   2664    mova                    m1, [rsp+gprsize*2+16*6]     ;in7
   2665    mova                    m3, [rsp+gprsize*2+16*3]     ;in8
   2666    mova  [rsp+gprsize*2+16*6], m7                       ;t7a
   2667    mova  [rsp+gprsize*2+16*3], m0                       ;t6a
   2668    ITX_MULSUB_2W            5, 4, 0, 7, 6,  201, 4091   ;t1,  t0
   2669    ITX_MULSUB_2W            1, 3, 0, 7, 6, 3035, 2751   ;t9,  t8
   2670    psubsw                  m0, m4, m3                   ;t8a
   2671    paddsw                  m4, m3                       ;t0a
   2672    psubsw                  m3, m5, m1                   ;t9a
   2673    paddsw                  m5, m1                       ;t1a
   2674    ITX_MULSUB_2W            0, 3, 1, 7, 6,  799, 4017   ;t9,  t8
   2675    mova                    m1, [rsp+gprsize*2+16*4]     ;t4a
   2676    mova                    m7, [rsp+gprsize*2+16*5]     ;t5a
   2677    mova  [rsp+gprsize*2+16*4], m3                       ;t8
   2678    mova  [rsp+gprsize*2+16*5], m0                       ;t9
   2679    psubsw                  m0, m4, m1                   ;t4
   2680    paddsw                  m4, m1                       ;t0
   2681    psubsw                  m3, m5, m7                   ;t5
   2682    paddsw                  m5, m7                       ;t1
   2683    ITX_MULSUB_2W            0, 3, 1, 7, 6, 1567, 3784   ;t5a, t4a
   2684    mova                    m7, [rsp+gprsize*2+16*3]     ;t6a
   2685    psubsw                  m1, m4, m2                   ;t2a
   2686    paddsw                  m4, m2                       ;out0
   2687    mova  [rsp+gprsize*2+16*3], m4                       ;out0
   2688    mova                    m4, [rsp+gprsize*2+16*6]     ;t7a
   2689    psubsw                  m2, m3, m7                   ;t6
   2690    paddsw                  m3, m7                       ;-out3
   2691    mova  [rsp+gprsize*2+16*6], m3                       ;-out3
   2692    psubsw                  m3, m0, m4                   ;t7
   2693    paddsw                  m0, m4                       ;out12
           ; t7 (slot 12) and t6 (slot 7) are left for the last rotation stage
   2694    mova [rsp+gprsize*2+16*12], m3
   2695    mova                    m3, [rsp+gprsize*2+16*7]     ;t3
   2696    mova [rsp+gprsize*2+16* 7], m2                       ;out4
   2697    psubsw                  m2, m5, m3                   ;t3a
   2698    paddsw                  m5, m3                       ;-out15
           ; t3a (slot 11) and t2a (slot 10) are left for the last rotation stage
   2699    mova [rsp+gprsize*2+16*11], m2
   2700    mova                    m2, [rsp+gprsize*2+32*5]     ;t15
   2701    mova [rsp+gprsize*2+16*10], m1                       ;-out7
   2702    mova                    m1, [rsp+gprsize*2+16*0]     ;t11
   2703    mova [rsp+gprsize*2+16*0 ], m5                       ;-out15
   2704    mova                    m3, [rsp+gprsize*2+16*1]     ;t10
   2705    mova [rsp+gprsize*2+16*1 ], m4                       ;-out11
   2706    mova                    m4, [rsp+gprsize*2+16*2]     ;t14
   2707    mova [rsp+gprsize*2+16*2 ], m0                       ;out12
   2708    psubsw                  m0, m3, m4                   ;t14a
   2709    paddsw                  m3, m4                       ;t10a
   2710    psubsw                  m5, m1, m2                   ;t15a
   2711    paddsw                  m1, m2                       ;t11a
   2712    ITX_MULSUB_2W            5, 0, 2, 4, 6, 3784, 1567   ;t14, t15
   2713    mova                    m2, [rsp+gprsize*2+16*4]     ;t8
   2714    mova                    m4, [rsp+gprsize*2+16*5]     ;t9
   2715    mova  [rsp+gprsize*2+16*4], m3                       ;t10a
   2716    mova  [rsp+gprsize*2+16*5], m1                       ;t11a
   2717    mova                    m3, [rsp+gprsize*2+16*8]     ;t12
   2718    mova                    m1, [rsp+gprsize*2+16*9]     ;t13
   2719    mova  [rsp+gprsize*2+16*8], m5                       ;t14
   2720    mova  [rsp+gprsize*2+16*9], m0                       ;t15
   2721    psubsw                  m5, m2, m3                   ;t12a
   2722    paddsw                  m2, m3                       ;t8a
   2723    psubsw                  m0, m4, m1                   ;t13a
   2724    paddsw                  m4, m1                       ;t9a
   2725    ITX_MULSUB_2W            5, 0, 1, 3, 6, 1567, 3784   ;t13, t12
           ; m6 (pd_2048) is not needed as rounding constant any more in this
           ; routine, so it is reused for data from here on
   2726    mova                    m6, [rsp+gprsize*2+16*4]     ;t10a
   2727    mova                    m1, [rsp+gprsize*2+16*5]     ;t11a
   2728    psubsw                  m3, m2, m6                   ;t10
   2729    paddsw                  m2, m6                       ;-out1
   2730    paddsw                  m6, m4, m1                   ;out14
   2731    psubsw                  m4, m1                       ;t11
           ; t11 (slot 14) and t10 (slot 9) are left for the last rotation stage
   2732    mova [rsp+gprsize*2+16*14], m4
   2733    mova [rsp+gprsize*2+16* 4], m2                       ;-out1
   2734    mova                    m4, [rsp+gprsize*2+16*8]     ;t14
   2735    mova                    m2, [rsp+gprsize*2+16*9]     ;t15
   2736    mova [rsp+gprsize*2+16* 9], m3                       ;out6
   2737    psubsw                  m3, m0, m4                   ;t14a
   2738    paddsw                  m0, m4                       ;out2
   2739    psubsw                  m4, m5, m2                   ;t15a
   2740    paddsw                  m5, m2                       ;-out13
   2741    mova [rsp+gprsize*2+16* 5], m0                       ;out2
   2742    ret
   2743 ALIGN function_align
        ; .main_pass1_end of iadst_16x8_internal_8bpc: finishes the four value
        ; pairs that .main leaves unrotated. Each pair (a, b) is word-interleaved
        ; and multiplied with pmaddwd by pw_2896_2896 and pw_2896_m2896, then
        ; rounded with pd_2048, shifted right by 12 and packed back to words with
        ; signed saturation. Assuming the constants hold what their names say,
        ; that yields ((a+b)*2896 + 2048) >> 12 and ((a-b)*2896 + 2048) >> 12.
        ;
        ; In:   m3/m4 = t14a/t15a, m5 = -out13, m6 = out14, and the stack slots
        ;       7/12, 10/11, 9/14 and 2 as left by .main
        ; Out:  m0 = out8   m1 = -out9   m2 = out10   m3 = -out11
        ;       m4 = out12  m5 = -out13  m6 = out14   m7 = pd_2048
        ;       slot 7 = out4, slot 8 = -out5, slot 9 = out6, slot 10 = -out7
        ; Slots 14 and 15 temporarily hold m5/m6 so both registers are free for
        ; the multiplier constants.
   2744 .main_pass1_end:
   2745    mova                    m0, [rsp+gprsize*2+16*14]
   2746    mova [rsp+gprsize*2+16*14], m5
   2747    mova [rsp+gprsize*2+16*15], m6
   2748    mova                    m5, [o(pw_2896_2896)]
   2749    mova                    m6, [o(pw_2896_m2896)]
   2750    mova                    m7, [o(pd_2048)]
           ; pair 1: the two values arriving in m3/m4
   2751    punpcklwd               m2, m3, m4
   2752    punpckhwd               m3, m4
   2753    pmaddwd                 m4, m5, m2
   2754    pmaddwd                 m2, m6
   2755    pmaddwd                 m1, m5, m3
   2756    pmaddwd                 m3, m6
   2757    REPX         {paddd x, m7}, m4, m2, m1, m3
   2758    REPX         {psrad x, 12}, m4, m1, m2, m3
   2759    packssdw                m4, m1                       ;-out5
   2760    packssdw                m2, m3                       ;out10
   2761    mova [rsp+gprsize*2+16* 8], m4
           ; pair 2: slot 9 and the value fetched from slot 14 at entry (m0)
   2762    mova                    m3, [rsp+gprsize*2+16* 9]
   2763    punpcklwd               m1, m3, m0
   2764    punpckhwd               m3, m0
   2765    pmaddwd                 m0, m5, m1
   2766    pmaddwd                 m1, m6
   2767    pmaddwd                 m4, m5, m3
   2768    pmaddwd                 m3, m6
   2769    REPX         {paddd x, m7}, m0, m1, m4, m3
   2770    REPX         {psrad x, 12}, m0, m4, m1, m3
   2771    packssdw                m0, m4                       ;out6
   2772    packssdw                m1, m3                       ;-out9
   2773    mova [rsp+gprsize*2+16* 9], m0
           ; pair 3: slots 7 and 12; m5 is consumed as a multiplier here, which is
           ; why pair 4 below takes pw_2896_2896 from memory
   2774    mova                    m0, [rsp+gprsize*2+16* 7]
   2775    mova                    m4, [rsp+gprsize*2+16*12]
   2776    punpcklwd               m3, m0, m4
   2777    punpckhwd               m0, m4
   2778    pmaddwd                 m4, m5, m3
   2779    pmaddwd                 m3, m6
   2780    pmaddwd                 m5, m0
   2781    pmaddwd                 m0, m6
   2782    REPX         {paddd x, m7}, m4, m3, m5, m0
   2783    REPX         {psrad x, 12}, m4, m5, m3, m0
   2784    packssdw                m4, m5                       ;out4
   2785    packssdw                m3, m0                       ;-out11
   2786    mova [rsp+gprsize*2+16* 7], m4
           ; pair 4: slots 10 and 11; m6 is consumed as a multiplier at the end
   2787    mova                    m4, [rsp+gprsize*2+16*10]
   2788    mova                    m5, [rsp+gprsize*2+16*11]
   2789    punpcklwd               m0, m4, m5
   2790    punpckhwd               m4, m5
   2791    pmaddwd                 m5, m0, [o(pw_2896_2896)]
   2792    pmaddwd                 m0, m6
   2793    pmaddwd                 m6, m4
   2794    pmaddwd                 m4, [o(pw_2896_2896)]
   2795    REPX         {paddd x, m7}, m5, m0, m6, m4
   2796    REPX         {psrad x, 12}, m0, m6, m5, m4
   2797    packssdw                m0, m6                       ;out8
   2798    packssdw                m5, m4                       ;-out7
   2799    mova [rsp+gprsize*2+16*10], m5
           ; restore the outputs that were already final on entry
   2800    mova                    m4, [rsp+gprsize*2+16* 2]    ;out12
   2801    mova                    m5, [rsp+gprsize*2+16*14]    ;-out13
   2802    mova                    m6, [rsp+gprsize*2+16*15]    ;out14
   2803    ret
   2804 ALIGN function_align
   2805 cglobal_label .main_pass2_end
        ; Pass-2 counterpart of .main_pass1_end: the same four input pairs
        ; and the same output placement, but computed in 16 bits as a
        ; saturating add/sub followed by pmulhrsw with pw_2896x8 instead of
        ; the 32-bit pmaddwd/round/shift sequence.
        ; In:  m3/m4 = first pair; the other pairs come from stack slots
        ;      9/14, 7/12 and 10/11.
        ; Out: m0 = out8, m1 = -out9, m2 = out10, m3 = -out11, m4 = out12,
        ;      m7 = pw_2896x8; slots 7/8/9/10 = out4/-out5/out6/-out7.
        ; m5 and m6 are neither read nor written here.
   2806    mova                    m7, [o(pw_2896x8)]
           ; pair: slot 9 with slot 14
   2807    mova                    m1, [rsp+gprsize*2+16* 9]
   2808    mova                    m2, [rsp+gprsize*2+16*14]
   2809    paddsw                  m0, m1, m2
   2810    psubsw                  m1, m2
   2811    pmulhrsw                m0, m7                       ;out6
   2812    pmulhrsw                m1, m7                       ;-out9
   2813    mova [rsp+gprsize*2+16* 9], m0
           ; pair: m3 with m4 (register inputs)
   2814    psubsw                  m2, m3, m4
   2815    paddsw                  m3, m4
   2816    pmulhrsw                m2, m7                       ;out10
   2817    pmulhrsw                m3, m7                       ;-out5
   2818    mova [rsp+gprsize*2+16* 8], m3
           ; pair: slot 7 with slot 12
   2819    mova                    m3, [rsp+gprsize*2+16* 7]
   2820    mova                    m4, [rsp+gprsize*2+16*12]
   2821    paddsw                  m0, m3, m4
   2822    psubsw                  m3, m4
   2823    pmulhrsw                m0, m7                       ;out4
   2824    pmulhrsw                m3, m7                       ;-out11
   2825    mova [rsp+gprsize*2+16* 7], m0
           ; pair: slot 10 with slot 11
   2826    mova                    m0, [rsp+gprsize*2+16*10]
   2827    paddsw                  m4, m0, [rsp+gprsize*2+16*11]
   2828    psubsw                  m0, [rsp+gprsize*2+16*11]
   2829    pmulhrsw                m4, m7                       ;-out7
   2830    pmulhrsw                m0, m7                       ;out8
   2831    mova [rsp+gprsize*2+16*10], m4
   2832    mova                    m4, [rsp+gprsize*2+16*2 ]    ;out12
   2833    ret
   2834 
   2835 INV_TXFM_16X8_FN flipadst, dct
   2836 INV_TXFM_16X8_FN flipadst, adst
   2837 INV_TXFM_16X8_FN flipadst, flipadst
   2838 INV_TXFM_16X8_FN flipadst, identity
        ; The four lines above instantiate INV_TXFM_16X8_FN (defined earlier
        ; in this file) for each second-pass type paired with a flipadst
        ; first pass; presumably each expansion is an entry point that
        ; dispatches into the routine below - confirm against the macro.
   2839 
   2840 cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; 16x8 inverse transform, flipped-ADST first pass.
        ; Fall-through code = pass 1; .pass2 = second-pass entry.
        ; All 16 coefficient vectors are scaled by pw_2896x8 (pmulhrsw) and
        ; arranged in m0-m7 plus stack slots 3-10 before the shared
        ; iadst_16x8 core (.main, .main_pass1_end) is called.
           ; vectors 0, 1, 14, 15 -> stack slots 7, 8, 9, 10 (32*5 == 16*10)
   2841    mova                    m7, [o(pw_2896x8)]
   2842    pmulhrsw                m0, m7, [coeffq+16*0 ]
   2843    pmulhrsw                m1, m7, [coeffq+16*1 ]
   2844    pmulhrsw                m2, m7, [coeffq+16*14]
   2845    pmulhrsw                m3, m7, [coeffq+16*15]
   2846    mova    [rsp+gprsize+16*7], m0
   2847    mova    [rsp+gprsize+16*8], m1
   2848    mova    [rsp+gprsize+16*9], m2
   2849    mova    [rsp+gprsize+32*5], m3
           ; vectors 8, 9 -> slots 3, 4 and vectors 6, 7 -> slots 5, 6
   2850    pmulhrsw                m0, m7, [coeffq+16*6 ]
   2851    pmulhrsw                m1, m7, [coeffq+16*7 ]
   2852    pmulhrsw                m2, m7, [coeffq+16*8 ]
   2853    pmulhrsw                m3, m7, [coeffq+16*9 ]
   2854    mova    [rsp+gprsize+16*3], m2
   2855    mova    [rsp+gprsize+16*4], m3
   2856    mova    [rsp+gprsize+16*5], m0
   2857    mova    [rsp+gprsize+16*6], m1
           ; vectors 2-5 and 10-13 stay in m0-m7; m7 (the scale constant) is
           ; overwritten last
   2858    pmulhrsw                m0, m7, [coeffq+16*2 ]
   2859    pmulhrsw                m1, m7, [coeffq+16*3 ]
   2860    pmulhrsw                m2, m7, [coeffq+16*4 ]
   2861    pmulhrsw                m3, m7, [coeffq+16*5 ]
   2862    pmulhrsw                m4, m7, [coeffq+16*10]
   2863    pmulhrsw                m5, m7, [coeffq+16*11]
   2864    pmulhrsw                m6, m7, [coeffq+16*12]
   2865    pmulhrsw                m7,     [coeffq+16*13]
   2866 
   2867    call m(iadst_16x8_internal_8bpc).main
   2868    call m(iadst_16x8_internal_8bpc).main_pass1_end
   2869 
           ; m7 is refilled from slot 0 (presumably the eighth row left there
           ; by the core - TODO confirm), the register rows are parked in the
           ; coefficient buffer, and the rows kept in slots 3.. are finished
           ; first through the 8x8 flipadst pass-1 epilogue.
   2870    mova                    m7, [rsp+gprsize+16*0]
   2871    SAVE_8ROWS     coeffq+16*0, 32
   2872    LOAD_8ROWS    rsp+gprsize+16*3, 16
   2873    mova    [rsp+gprsize+16*0], m7
           ; keep the caller's continuation in r3 while tx2q is borrowed
   2874    mov                     r3, tx2q
   2875    lea                   tx2q, [o(.pass1_end)]
   2876    jmp m(iflipadst_8x8_internal_8bpc).pass1_end
   2877 
   2878 .pass1_end:
           ; store the first finished half, reload the parked rows, restore
           ; the caller's tx2q and run the same epilogue once more
   2879    SAVE_8ROWS     coeffq+16*1, 32
   2880    LOAD_8ROWS     coeffq+16*0, 32
   2881    mova    [rsp+gprsize+16*0], m7
   2882    mov                   tx2q, r3
   2883    jmp m(iflipadst_8x8_internal_8bpc).pass1_end
   2884 
   2885 .pass2:
           ; first 8-wide half; r3 remembers dst+8 for the second half
   2886    lea                   tx2q, [o(.end)]
   2887    lea                     r3, [dstq+8]
   2888    jmp m(iflipadst_8x8_internal_8bpc).pass2_main
   2889 
   2890 .end:
           ; second half: rows stored at coeffq+16*1 (stride 32), written at
           ; the destination saved in r3
   2891    LOAD_8ROWS     coeffq+16*1, 32
   2892    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
   2893    mov                   dstq, r3
   2894    jmp m(iflipadst_8x8_internal_8bpc).pass2_main
   2895 
   2896 
   2897 INV_TXFM_16X8_FN identity, dct
   2898 INV_TXFM_16X8_FN identity, adst
   2899 INV_TXFM_16X8_FN identity, flipadst
   2900 INV_TXFM_16X8_FN identity, identity
        ; Instantiations of INV_TXFM_16X8_FN (defined earlier in this file)
        ; for an identity first pass with each second-pass type.
   2901 
   2902 cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; 16x8 inverse transform, identity first pass.
        ; .pass1 runs twice on eight 16-byte vectors each time: first on
        ; vectors 8-15 of the coefficient buffer, then (via .pass1_end) on
        ; vectors 0-7. The odd-numbered vectors are preloaded into m4-m7
        ; before .pass1; the even ones are read inside it.
           ; coeffq -> vector 16, so the loads below fetch vectors 9/11/13/15
   2903    add                coeffq, 16*16
   2904    mova                   m4, [coeffq-16*7]
   2905    mova                   m5, [coeffq-16*5]
   2906    mova                   m6, [coeffq-16*3]
   2907    mova                   m7, [coeffq-16*1]
           ; caller's continuation is parked in r3 during the first iteration
   2908    mov                    r3, tx2q
   2909    lea                  tx2q, [o(.pass1_end)]
   2910 
   2911 .pass1:
           ; per row: x = pmulhrsw(x, pw_2896x8), then
           ; x = x + pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384), saturating.
           ; The trailing "; n" comments give the row index of each result.
   2912    mova                   m0, [o(pw_2896x8)]
   2913    mova                   m2, [o(pw_1697x16)]
   2914    mova                   m3, [o(pw_16384)]
   2915    sub                coeffq, 8*16
   2916    REPX     {pmulhrsw x, m0}, m4, m5, m6, m7
   2917    pmulhrsw               m1, m2, m4
   2918    pmulhrsw               m1, m3
   2919    paddsw                 m1, m4 ; 1
   2920    pmulhrsw               m4, m2, m5
   2921    pmulhrsw               m4, m3
   2922    paddsw                 m4, m5 ; 3
   2923    pmulhrsw               m5, m2, m6
   2924    pmulhrsw               m5, m3
   2925    paddsw                 m5, m6 ; 5
   2926    pmulhrsw               m6, m2, m7
   2927    pmulhrsw               m6, m3
   2928    paddsw                 m7, m6 ; 7
   2929    pmulhrsw               m6, m0, [coeffq+16*6]
           ; row 3 waits in slot 0 until m3 (pw_16384) is no longer needed
   2930    mova   [rsp+gprsize+16*0], m4
   2931    pmulhrsw               m4, m2, m6
   2932    pmulhrsw               m4, m3
   2933    paddsw                 m6, m4 ; 6
   2934    pmulhrsw               m4, m0, [coeffq+16*4]
           ; row 6 is handed over in slot 1; m6 is reused as scratch below
   2935    mova   [rsp+gprsize+16*1], m6
   2936    pmulhrsw               m6, m2, m4
   2937    pmulhrsw               m6, m3
   2938    paddsw                 m4, m6 ; 4
   2939    pmulhrsw               m6, m0, [coeffq+16*2]
   2940    pmulhrsw               m0,     [coeffq+16*0]
   2941    pmulhrsw               m2, m6
   2942    pmulhrsw               m2, m3
   2943    paddsw                 m2, m6 ; 2
           ; m2 no longer holds pw_1697x16, so row 0 reads it from memory
   2944    pmulhrsw               m6, m0, [o(pw_1697x16)]
   2945    pmulhrsw               m6, m3
   2946    mova                   m3, [rsp+gprsize+16*0]
   2947    paddsw                 m0, m6
   2948    jmp   m(idct_8x8_internal_8bpc).pass1_end3
   2949 
   2950 .pass1_end:
           ; store m4-m7 over vectors 9/11/13/15, fetch the still-unprocessed
           ; vectors 1/3/5/7 into m4-m7, then overwrite those with m0-m3
   2951    mova        [coeffq+16*1], m4
   2952    mova        [coeffq+16*3], m5
   2953    mova        [coeffq+16*5], m6
   2954    mova        [coeffq+16*7], m7
   2955    mova                   m4, [coeffq-16*7]
   2956    mova                   m5, [coeffq-16*5]
   2957    mova                   m6, [coeffq-16*3]
   2958    mova                   m7, [coeffq-16*1]
   2959    mova        [coeffq-16*7], m0
   2960    mova        [coeffq-16*5], m1
   2961    mova        [coeffq-16*3], m2
   2962    mova        [coeffq-16*1], m3
           ; second iteration continues at the caller's original tx2q
   2963    mov                  tx2q, r3
   2964    jmp .pass1
   2965 
   2966 .pass2:
           ; first 8-wide half; r3 remembers dst+8 for the second half
   2967    lea                  tx2q, [o(.end)]
   2968    lea                    r3, [dstq+8]
   2969    jmp  m(iidentity_8x8_internal_8bpc).end
   2970 
   2971 .end:
           ; second half: the rows stored in the odd-numbered vectors
   2972    LOAD_8ROWS    coeffq+16*1, 32
   2973    lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
   2974    mov                  dstq, r3
   2975    jmp  m(iidentity_8x8_internal_8bpc).end
   2976 
   2977 
   2978 %macro INV_TXFM_16X16_FN 2 ; type1, type2
        ; Instantiates INV_TXFM_FN (defined elsewhere in this file) for a
        ; 16x16 block, passing 8 and 16*16 as the trailing arguments.
        ; NOTE(review): those look like the xmm register count and the stack
        ; size - confirm against INV_TXFM_FN.
        ; For the dct_dct combination only, a DC-only shortcut is appended.
   2979    INV_TXFM_FN          %1, %2, 16x16, 8, 16*16
   2980 %ifidn %1_%2, dct_dct
           ; DC-only path: m0 = first coefficient vector scaled by pw_2896x8,
           ; m2 = pw_8192 for the shared .dconly code.
   2981    movd                   m1, [o(pw_2896x8)]
   2982    pmulhrsw               m0, m1, [coeffq]
   2983    movd                   m2, [o(pw_8192)]
           ; writes eobd over the DC coefficient; presumably eob is zero on
           ; this path, making this a clear - confirm in INV_TXFM_FN
   2984    mov              [coeffq], eobd
           ; r2d = 8 is consumed by the shared 16x4 .dconly code (presumably
           ; an iteration count); tx2q tells it where to continue afterwards
   2985    mov                   r2d, 8
   2986    lea                  tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)]
   2987    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
   2988 .end:
   2989    RET
   2990 %endif
   2991 %endmacro
   2992 
   2993 INV_TXFM_16X16_FN dct, dct
   2994 INV_TXFM_16X16_FN dct, adst
   2995 INV_TXFM_16X16_FN dct, flipadst
   2996 INV_TXFM_16X16_FN dct, identity
        ; Entry points for a DCT first pass with each second-pass type; the
        ; dct/dct instantiation also carries the DC-only shortcut.
   2997 
   2998 cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; 16x16 inverse DCT.
        ; Pass 1 is done in two halves. For each half the two 8-row groups
        ; (stride 64 bytes, starting at offset +16*1 / +16*3 and later
        ; +16*0 / +16*2) go through idct_8x8 .main and idct_16x8 .main.
        ; Both resulting 8-row sets are then finished through the 8x8
        ; .pass1_end1 epilogue with m7 = pw_8192, chained via tx2q.
        ; The caller's tx2q is kept in r3 and restored for the last hop.
   2999    LOAD_8ROWS     coeffq+16*1, 64
   3000    call  m(idct_8x8_internal_8bpc).main
           ; seven rows go to stack slots 3..; idct_16x8 .main consumes them
   3001    SAVE_7ROWS    rsp+gprsize+16*3, 16
   3002    LOAD_8ROWS     coeffq+16*3, 64
   3003    call m(idct_16x8_internal_8bpc).main
   3004    mov                     r3, tx2q
   3005    lea                   tx2q, [o(.pass1_end)]
   3006    mova                    m7, [o(pw_8192)]
   3007    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3008 
   3009 .pass1_end:
           ; first half, register rows: stored at coeffq+16*17 (stride 32);
           ; now finish the rows that were left in stack slots 3..
   3010    SAVE_8ROWS    coeffq+16*17, 32
   3011    LOAD_8ROWS    rsp+gprsize+16*3, 16
   3012    mova    [rsp+gprsize+16*0], m7
   3013    lea                   tx2q, [o(.pass1_end1)]
   3014    mova                    m7, [o(pw_8192)]
   3015    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3016 
   3017 .pass1_end1:
           ; first half, stack rows: stored at coeffq+16*1; start second half
   3018    SAVE_8ROWS     coeffq+16*1, 32
   3019    LOAD_8ROWS     coeffq+16*0, 64
   3020    call  m(idct_8x8_internal_8bpc).main
   3021    SAVE_7ROWS    rsp+gprsize+16*3, 16
   3022    LOAD_8ROWS     coeffq+16*2, 64
   3023    call m(idct_16x8_internal_8bpc).main
   3024    lea                   tx2q, [o(.pass1_end2)]
   3025    mova                    m7, [o(pw_8192)]
   3026    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3027 
   3028 .pass1_end2:
           ; second half, register rows: stored at coeffq+16*16; the final
           ; epilogue run continues at the caller's tx2q (restored from r3)
   3029    SAVE_8ROWS    coeffq+16*16, 32
   3030    LOAD_8ROWS    rsp+gprsize+16*3, 16
   3031    mova    [rsp+gprsize+16*0], m7
   3032    mov                   tx2q, r3
   3033    mova                    m7, [o(pw_8192)]
   3034    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3035 
   3036 .pass2:
           ; second pass reuses the 8x16 column code, first for one 8-wide
           ; half and (from .end1) for the other
   3037    lea                   tx2q, [o(.end)]
   3038    jmp  m(idct_8x16_internal_8bpc).pass2_pre
   3039 
   3040 .end:
           ; output the rows kept in stack slots 3.. at the destination held
           ; in r3, then point r3 at dst+8 for the second half
   3041    LOAD_8ROWS    rsp+gprsize+16*3, 16
   3042    mova    [rsp+gprsize+16*0], m7
   3043    lea                   tx2q, [o(.end1)]
   3044    mov                   dstq, r3
   3045    lea                     r3, [dstq+8]
   3046    jmp   m(idct_8x8_internal_8bpc).end
   3047 
   3048 .end1:
           ; zero the first 16 vectors of the coefficient buffer
   3049    pxor                    m7, m7
   3050    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
   3051 
           ; step to the second 256-byte half of the buffer and the dst+8 half
   3052    add                 coeffq, 32*8
   3053    mov                   dstq, r3
   3054 
           ; register layout expected by idct_8x16 .pass2_main
   3055    mova                    m0, [coeffq+16*0 ]
   3056    mova                    m1, [coeffq+16*4 ]
   3057    mova                    m2, [coeffq+16*8 ]
   3058    mova                    m3, [coeffq+16*12]
   3059    mova                    m4, [coeffq+16*1 ]
   3060    mova                    m5, [coeffq+16*5 ]
   3061    mova                    m6, [coeffq+16*9 ]
   3062    mova                    m7, [coeffq+16*13]
   3063    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end)]
   3064    jmp  m(idct_8x16_internal_8bpc).pass2_main
   3065 
   3066 
   3067 %macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
        ; Loads the 16 odd-numbered 16-byte coefficient vectors (1, 3, ..., 31)
        ; into m0-m7 and stack slots 3-10, the same register/slot layout the
        ; 16x8 ADST routines set up before calling iadst_16x8 .main.
        ; No scaling is applied. Clobbers m0-m7.
           ; vectors 1, 3, 29, 31 -> slots 7, 8, 9, 10 (32*5 == 16*10)
   3068    mova                    m0, [coeffq+16*1 ]
   3069    mova                    m1, [coeffq+16*3 ]
   3070    mova                    m2, [coeffq+16*29]
   3071    mova                    m3, [coeffq+16*31]
   3072    mova    [rsp+gprsize+16*7], m0
   3073    mova    [rsp+gprsize+16*8], m1
   3074    mova    [rsp+gprsize+16*9], m2
   3075    mova    [rsp+gprsize+32*5], m3
           ; vectors 17, 19 -> slots 3, 4 and vectors 13, 15 -> slots 5, 6
   3076    mova                    m0, [coeffq+16*13]
   3077    mova                    m1, [coeffq+16*15]
   3078    mova                    m2, [coeffq+16*17]
   3079    mova                    m3, [coeffq+16*19]
   3080    mova    [rsp+gprsize+16*3], m2
   3081    mova    [rsp+gprsize+16*4], m3
   3082    mova    [rsp+gprsize+16*5], m0
   3083    mova    [rsp+gprsize+16*6], m1
           ; vectors 5-11 and 21-27 (odd) stay in m0-m7
   3084    mova                    m0, [coeffq+16*5 ]
   3085    mova                    m1, [coeffq+16*7 ]
   3086    mova                    m2, [coeffq+16*9 ]
   3087    mova                    m3, [coeffq+16*11]
   3088    mova                    m4, [coeffq+16*21]
   3089    mova                    m5, [coeffq+16*23]
   3090    mova                    m6, [coeffq+16*25]
   3091    mova                    m7, [coeffq+16*27]
   3092 %endmacro
   3093 
   3094 %macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
        ; Loads the 16 even-numbered 16-byte coefficient vectors (0, 2, ..., 30)
        ; into m0-m7 and stack slots 3-10, using the same register/slot
        ; layout as ITX_16X16_ADST_LOAD_ODD_COEFS with every index one lower.
        ; No scaling is applied. Clobbers m0-m7.
           ; vectors 0, 2, 28, 30 -> slots 7, 8, 9, 10 (32*5 == 16*10)
   3095    mova                    m0, [coeffq+16*0 ]
   3096    mova                    m1, [coeffq+16*2 ]
   3097    mova                    m2, [coeffq+16*28]
   3098    mova                    m3, [coeffq+16*30]
   3099    mova    [rsp+gprsize+16*7], m0
   3100    mova    [rsp+gprsize+16*8], m1
   3101    mova    [rsp+gprsize+16*9], m2
   3102    mova    [rsp+gprsize+32*5], m3
           ; vectors 16, 18 -> slots 3, 4 and vectors 12, 14 -> slots 5, 6
   3103    mova                    m0, [coeffq+16*12]
   3104    mova                    m1, [coeffq+16*14]
   3105    mova                    m2, [coeffq+16*16]
   3106    mova                    m3, [coeffq+16*18]
   3107    mova    [rsp+gprsize+16*3], m2
   3108    mova    [rsp+gprsize+16*4], m3
   3109    mova    [rsp+gprsize+16*5], m0
   3110    mova    [rsp+gprsize+16*6], m1
           ; vectors 4-10 and 20-26 (even) stay in m0-m7
   3111    mova                    m0, [coeffq+16*4 ]
   3112    mova                    m1, [coeffq+16*6 ]
   3113    mova                    m2, [coeffq+16*8 ]
   3114    mova                    m3, [coeffq+16*10]
   3115    mova                    m4, [coeffq+16*20]
   3116    mova                    m5, [coeffq+16*22]
   3117    mova                    m6, [coeffq+16*24]
   3118    mova                    m7, [coeffq+16*26]
   3119 %endmacro
   3120 
   3121 INV_TXFM_16X16_FN adst, dct
   3122 INV_TXFM_16X16_FN adst, adst
   3123 INV_TXFM_16X16_FN adst, flipadst
        ; Entry points for an ADST first pass; no adst/identity combination
        ; is instantiated here.
   3124 
   3125 cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; 16x16 inverse ADST.
        ; Pass 1 runs the shared 16-point core (iadst_16x8 .main plus
        ; .main_pass1_end) twice: first on the odd-numbered coefficient
        ; vectors, then on the even-numbered ones. Each resulting pair of
        ; 8-row sets is finished through the 8x8 ADST .pass1_end1 epilogue
        ; with m7 = pw_8192, chained via tx2q (caller's tx2q kept in r3).
   3126    ITX_16X16_ADST_LOAD_ODD_COEFS
   3127    call m(iadst_16x8_internal_8bpc).main
   3128    call m(iadst_16x8_internal_8bpc).main_pass1_end
   3129 
   3130    mov                     r3, tx2q
   3131    lea                   tx2q, [o(.pass1_end)]
   3132    mova                    m7, [o(pw_8192)]
   3133    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
   3134 
   3135 .pass1_end:
           ; odd half, register rows: stored at coeffq+16*17 (stride 32);
           ; now finish the rows that were left in stack slots 3..
   3136    SAVE_8ROWS    coeffq+16*17, 32
   3137    LOAD_8ROWS    rsp+gprsize+16*3, 16
   3138    mova    [rsp+gprsize+16*0], m7
   3139    lea                   tx2q, [o(.pass1_end1)]
   3140    mova                    m7, [o(pw_8192)]
   3141    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
   3142 
   3143 .pass1_end1:
           ; odd half, stack rows: stored at coeffq+16*1; start the even half
   3144    SAVE_8ROWS     coeffq+16*1, 32
   3145    ITX_16X16_ADST_LOAD_EVEN_COEFS
   3146    call m(iadst_16x8_internal_8bpc).main
   3147    call m(iadst_16x8_internal_8bpc).main_pass1_end
   3148 
   3149    lea                   tx2q, [o(.pass1_end2)]
   3150    mova                    m7, [o(pw_8192)]
   3151    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
   3152 
   3153 .pass1_end2:
           ; even half, register rows: stored at coeffq+16*16; the final
           ; epilogue run continues at the caller's tx2q (restored from r3)
   3154    SAVE_8ROWS    coeffq+16*16, 32
   3155    LOAD_8ROWS    rsp+gprsize+16*3, 16
   3156    mova    [rsp+gprsize+16*0], m7
   3157    mov                   tx2q, r3
   3158    mova                    m7, [o(pw_8192)]
   3159    jmp  m(iadst_8x8_internal_8bpc).pass1_end1
   3160 
   3161 .pass2:
           ; second pass reuses the 8x16 ADST column code, one 8-wide half
           ; at a time
   3162    lea                   tx2q, [o(.end)]
   3163    jmp m(iadst_8x16_internal_8bpc).pass2_pre
   3164 
   3165 .end:
           ; output the rows kept in stack slots 3.. at the destination held
           ; in r3, then point r3 at dst+8 for the second half
   3166    LOAD_8ROWS    rsp+gprsize+16*3, 16
   3167    mova    [rsp+gprsize+16*0], m7
   3168    lea                   tx2q, [o(.end1)]
   3169    mov                   dstq, r3
   3170    lea                     r3, [dstq+8]
   3171    jmp  m(iadst_8x8_internal_8bpc).end
   3172 
   3173 .end1:
           ; zero the first 16 vectors of the coefficient buffer
   3174    pxor                    m7, m7
   3175    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
   3176 
           ; step to the second 256-byte half of the buffer and the dst+8 half
   3177    add                 coeffq, 32*8
   3178    mov                   dstq, r3
   3179 
           ; even vectors 4-10 stay in m0-m3; vectors 0/2 go to stack slots
           ; 7/8 and vectors 12/14 to slots 5/6 for iadst_8x16 .pass2_main
   3180    mova                    m4, [coeffq+16*0 ]
   3181    mova                    m5, [coeffq+16*2 ]
   3182    mova                    m0, [coeffq+16*4 ]
   3183    mova                    m1, [coeffq+16*6 ]
   3184    mova                    m2, [coeffq+16*8 ]
   3185    mova                    m3, [coeffq+16*10]
   3186    mova                    m6, [coeffq+16*12]
   3187    mova                    m7, [coeffq+16*14]
   3188    mova    [rsp+gprsize+16*7], m4
   3189    mova    [rsp+gprsize+16*8], m5
   3190    mova    [rsp+gprsize+16*5], m6
   3191    mova    [rsp+gprsize+16*6], m7
   3192    lea                   tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
   3193    jmp m(iadst_8x16_internal_8bpc).pass2_main
   3194 
   3195 
   3196 INV_TXFM_16X16_FN flipadst, dct
   3197 INV_TXFM_16X16_FN flipadst, adst
   3198 INV_TXFM_16X16_FN flipadst, flipadst
        ; Entry points for a flipped-ADST first pass; no flipadst/identity
        ; combination is instantiated here.
   3199 
   3200 cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; 16x16 inverse flipped ADST.
        ; Same structure as iadst_16x16: the shared 16-point ADST core runs
        ; on the odd and then the even coefficient vectors. Differences:
        ; the 8x8 flipadst epilogue is used with m7 = pw_m8192, and the two
        ; 8-row sets of each half swap their storage locations (offset +1
        ; before +17, offset +0 before +16).
   3201    ITX_16X16_ADST_LOAD_ODD_COEFS
   3202    call m(iadst_16x8_internal_8bpc).main
   3203    call m(iadst_16x8_internal_8bpc).main_pass1_end
   3204 
           ; keep the caller's continuation in r3 while tx2q is borrowed
   3205    mov                     r3, tx2q
   3206    lea                   tx2q, [o(.pass1_end)]
   3207    mova                    m7, [o(pw_m8192)]
   3208    jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
   3209 
   3210 .pass1_end:
           ; odd half, register rows: stored at coeffq+16*1 (stride 32)
   3211    SAVE_8ROWS     coeffq+16*1, 32
   3212    LOAD_8ROWS    rsp+gprsize+16*3, 16
   3213    mova    [rsp+gprsize+16*0], m7
   3214    lea                   tx2q, [o(.pass1_end1)]
   3215    mova                    m7, [o(pw_m8192)]
   3216    jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
   3217 
   3218 .pass1_end1:
           ; odd half, stack rows: stored at coeffq+16*17; start the even half
   3219    SAVE_8ROWS    coeffq+16*17, 32
   3220    ITX_16X16_ADST_LOAD_EVEN_COEFS
   3221    call m(iadst_16x8_internal_8bpc).main
   3222    call m(iadst_16x8_internal_8bpc).main_pass1_end
   3223 
           ; park the register rows at coeffq+16*0 (m7 refilled from slot 0
           ; first) and finish the stack rows before them
   3224    mova                    m7, [rsp+gprsize+16*0]
   3225    SAVE_8ROWS     coeffq+16*0, 32
   3226    LOAD_8ROWS    rsp+gprsize+16*3, 16
   3227    mova    [rsp+gprsize+16*0], m7
   3228    lea                   tx2q, [o(.pass1_end2)]
   3229    mova                    m7, [o(pw_m8192)]
   3230    jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
   3231 
   3232 .pass1_end2:
           ; even half, stack rows: stored at coeffq+16*16; reload the parked
           ; rows and finish them, continuing at the caller's tx2q
   3233    SAVE_8ROWS    coeffq+16*16, 32
   3234    LOAD_8ROWS    coeffq+16* 0, 32
   3235    mova    [rsp+gprsize+16*0], m7
   3236    mov                   tx2q, r3
   3237    mova                    m7, [o(pw_m8192)]
   3238    jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
   3239 
   3240 .pass2:
           ; first 8-wide half; r3 remembers dst+8 for the second half
   3241    lea                   tx2q, [o(.end)]
   3242    lea                     r3, [dstq+8]
   3243    jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
   3244 
   3245 .end:
           ; output the rows kept in stack slots 3.. after stepping dstq by
           ; two strides
   3246    LOAD_8ROWS    rsp+gprsize+16*3, 16
   3247    mova    [rsp+gprsize+16*0], m7
   3248    lea                   tx2q, [o(.end1)]
   3249    lea                   dstq, [dstq+strideq*2]
   3250    jmp  m(iflipadst_8x8_internal_8bpc).end
   3251 
   3252 .end1:
           ; zero the first 16 vectors of the coefficient buffer
   3253    pxor                    m7, m7
   3254    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
   3255 
           ; step to the second 256-byte half of the coefficient buffer
   3256    add                 coeffq, 32*8
   3257 
           ; even vectors 4-10 stay in m0-m3; vectors 0/2 go to stack slots
           ; 7/8 and vectors 12/14 to slots 5/6 for the 8x16 .pass2_main
   3258    mova                    m4, [coeffq+16*0 ]
   3259    mova                    m5, [coeffq+16*2 ]
   3260    mova                    m0, [coeffq+16*4 ]
   3261    mova                    m1, [coeffq+16*6 ]
   3262    mova                    m2, [coeffq+16*8 ]
   3263    mova                    m3, [coeffq+16*10]
   3264    mova                    m6, [coeffq+16*12]
   3265    mova                    m7, [coeffq+16*14]
   3266    mova    [rsp+gprsize+16*7], m4
   3267    mova    [rsp+gprsize+16*8], m5
   3268    mova    [rsp+gprsize+16*5], m6
   3269    mova    [rsp+gprsize+16*6], m7
   3270 
           ; second 8-wide half is written at the dst+8 saved in r3
   3271    lea                   tx2q, [o(.end2)]
   3272    mov                   dstq, r3
   3273    jmp m(iflipadst_8x16_internal_8bpc).pass2_main
   3274 
   3275 .end2:
           ; last group of rows; finishes through idct_8x16 .end1
   3276    LOAD_8ROWS    rsp+gprsize+16*3, 16
   3277    mova    [rsp+gprsize+16*0], m7
   3278    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
   3279    lea                   dstq, [dstq+strideq*2]
   3280    jmp  m(iflipadst_8x8_internal_8bpc).end
   3281 
   3282 
   3283 %macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
        ; One identity-16 scaling step on a vector of words.
        ; Arguments are register numbers: source/destination, scratch, and
        ; the register holding pw_1697x16.
        ; tmp = pmulhrsw(const, src); tmp = tmp >> 1 (arithmetic);
        ; src = pavgw(src, tmp), i.e. a rounded-up average.
        ; The scratch and constant registers may be the same one, in which
        ; case the constant is destroyed.
   3284    pmulhrsw            m%2, m%3, m%1
   3285    psraw               m%2, 1
   3286    pavgw               m%1, m%2
   3287 %endmacro
   3288 
   3289 INV_TXFM_16X16_FN identity, dct
   3290 INV_TXFM_16X16_FN identity, identity
        ; Entry points for an identity first pass; only the dct and identity
        ; second-pass combinations are instantiated here.
   3291 
   3292 cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; 16x16 inverse identity transform.
        ; Pass 1 runs .pass1 four times on 8 vectors spaced 32 bytes apart,
        ; with coeffq at vector offsets 17, 16, 1 and 0 in that order.
        ; Each run finishes through idct_8x8 .pass1_end3 and, except the
        ; last, is stored back in place by the .pass1_end* hops.
        ; Pass 2 (.pass2/.end) scales and outputs four groups of 8 rows.
   3293    add                 coeffq, 16*17
           ; keep the caller's continuation in r3 while tx2q is borrowed
   3294    mov                     r3, tx2q
   3295    lea                   tx2q, [o(.pass1_end)]
   3296 
   3297 .pass1:
           ; rows 6, 0, 1, 2, 3, 4 are scaled with m5 as scratch
   3298    mova                    m6, [o(pw_1697x16)]
   3299    mova                    m7, [coeffq+32*6]
   3300    mova                    m0, [coeffq+32*0]
   3301    mova                    m1, [coeffq+32*1]
   3302    mova                    m2, [coeffq+32*2]
   3303    mova                    m3, [coeffq+32*3]
   3304    mova                    m4, [coeffq+32*4]
   3305    REPX     {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
   3306    mova                    m5, [coeffq+32*5]
           ; row 6 is handed over in stack slot 1, freeing m7 for row 7
   3307    mova    [rsp+gprsize+16*1], m7
   3308    IDTX16B                  5, 7, 6
   3309    mova                    m7, [coeffq+32*7]
           ; last use of the constant: m6 doubles as scratch and is clobbered
   3310    IDTX16B                  7, 6, 6
   3311    jmp   m(idct_8x8_internal_8bpc).pass1_end3
   3312 
   3313 .pass1_end:
           ; store run 1 (offset 17), move to offset 16
   3314    SAVE_8ROWS          coeffq, 32
   3315    sub                 coeffq, 16
   3316    lea                   tx2q, [o(.pass1_end1)]
   3317    jmp .pass1
   3318 
   3319 .pass1_end1:
           ; store run 2 (offset 16), move to offset 1
   3320    SAVE_8ROWS          coeffq, 32
   3321    sub                 coeffq, 15*16
   3322    lea                   tx2q, [o(.pass1_end2)]
   3323    jmp .pass1
   3324 
   3325 .pass1_end2:
           ; store run 3 (offset 1), move to offset 0; the last run continues
           ; at the caller's tx2q
   3326    SAVE_8ROWS          coeffq, 32
   3327    sub                 coeffq, 16
   3328    mov                   tx2q, r3
   3329    jmp .pass1
   3330 
   3331 .pass2:
           ; r3 remembers dst+8 for the second 8-wide half
   3332    lea                     r3, [dstq+8]
   3333    lea                   tx2q, [o(.end1)]
   3334 
   3335 .end:
           ; Shared pass-2 body for one group of 8 rows in m0-m7: each row
           ; goes through IDTX16 (macro defined elsewhere in this file) and a
           ; pmulhrsw with pw_2048. Stack slots 0-2 are used for spills and
           ; hand-over because all eight registers are live.
   3336    mova    [rsp+gprsize+16*0], m7
   3337    mova    [rsp+gprsize+16*1], m4
   3338    mova                    m7, [o(pw_1697x16)]
   3339    REPX      {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
   3340    mova                    m4, [o(pw_2048)]
   3341    pmulhrsw                m5, m4
   3342    pmulhrsw                m6, m4
           ; finished rows 5 and 6 go to slots 2 and 1; the spilled row 4 is
           ; picked up into m5 and the spilled row 7 into m6
   3343    mova    [rsp+gprsize+16*2], m5
   3344    mova                    m5, [rsp+gprsize+16*1]
   3345    mova    [rsp+gprsize+16*1], m6
   3346    IDTX16                   5, 6, 7
   3347    mova                    m6, [rsp+gprsize+16*0]
   3348    IDTX16                   6, 7, 7
   3349    REPX      {pmulhrsw x, m4}, m0, m1, m2, m3, m6
           ; m4 = finished row 4 (the pw_2048 constant is consumed here);
           ; finished row 7 goes to slot 0
   3350    pmulhrsw                m4, m5
   3351    mova    [rsp+gprsize+16*0], m6
   3352    jmp   m(idct_8x8_internal_8bpc).end3
   3353 
   3354 .end1:
           ; second group: rows stored at coeffq+16*1 (stride 32)
   3355    LOAD_8ROWS     coeffq+16*1, 32
   3356    lea                   tx2q, [o(.end2)]
   3357    lea                   dstq, [dstq+strideq*2]
   3358    jmp .end
   3359 
   3360 .end2:
           ; zero the first 16 vectors of the coefficient buffer
   3361    pxor                    m7, m7
   3362    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
   3363 
           ; third group: second 256-byte half of the buffer, written at the
           ; dst+8 saved in r3
   3364    add                 coeffq, 32*8
   3365    LOAD_8ROWS          coeffq, 32
   3366    lea                   tx2q, [o(.end3)]
   3367    mov                   dstq, r3
   3368    jmp .end
   3369 
   3370 .end3:
           ; fourth group; finishes through idct_8x16 .end1
   3371    LOAD_8ROWS     coeffq+16*1, 32
   3372    lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
   3373    lea                   dstq, [dstq+strideq*2]
   3374    jmp .end
   3375 
   3376 
   3377 cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
        ; Entry point for the 8x32 DCT/DCT inverse transform + add.
        ; Per the cglobal line: 4 arguments (dst, stride, coeff, eob),
        ; 6 GPRs, 8 XMM registers and 16*36 bytes of stack.
        ; eob == 0 takes the DC-only shortcut; otherwise the work is done
        ; by idct_8x32_internal_8bpc.
   3378 %if ARCH_X86_32
           ; r5 = section base on 32-bit builds; presumably what the o()
           ; constant-addressing macro is relative to - confirm
   3379    LEA                     r5, $$
   3380 %endif
   3381    test                  eobd, eobd
   3382    jz .dconly
   3383    call  m(idct_8x32_internal_8bpc)
   3384    RET
   3385 
   3386 .dconly:
           ; m0 = first coefficient vector, scaled by pw_2896x8, pw_8192,
           ; pw_2896x8 again and pw_2048 (each step a pmulhrsw)
   3387    movd                 m1, [o(pw_2896x8)]
   3388    pmulhrsw             m0, m1, [coeffq]
   3389    movd                 m2, [o(pw_8192)]
           ; eobd is zero on this path, so this clears the DC coefficient
   3390    mov            [coeffq], eobd
   3391    pmulhrsw             m0, m2
   3392    psrlw                m2, 2            ;pw_2048
   3393    pmulhrsw             m0, m1
   3394    pmulhrsw             m0, m2
           ; broadcast word 0 to all eight words of m0
   3395    pshuflw              m0, m0, q0000
   3396    punpcklwd            m0, m0
           ; r3d = 8 is consumed by the shared 8x8 .loop (presumably its
           ; iteration count); tx2q tells it where to continue afterwards
   3397    mov                 r3d, 8
   3398    lea                tx2q, [o(.end)]
   3399    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
   3400 
   3401 .end:
   3402    RET
   3403 
   3404 
   3405 
   3406 cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; 8x32 inverse DCT, both passes.
        ; Pass 1: up to four groups of 8 vectors (stride 64, offsets +16*3,
        ; +16*2, +16*1, +16*0) go through idct_8x8 .main and its
        ; .pass1_end1 epilogue with m7 = pw_8192. The results are scattered
        ; into stack slots as the 32 inputs in0..in31 of pass 2 (see the
        ; per-store comments).
        ; eob <= 106 takes the .fast path: only the +16*1 and +16*0 groups
        ; are processed and the upper inputs are replaced by zeros.
        ; Pass 2: 8-point, 16-point and 32-point stages, then four 8-row
        ; output steps chained through tx2q (.end2 .. .end6).
   3407    cmp                   eobd, 106
   3408    jle .fast
   3409 
   3410    LOAD_8ROWS     coeffq+16*3, 64
   3411    call  m(idct_8x8_internal_8bpc).main
   3412    mova                    m7, [o(pw_8192)]
   3413    lea                   tx2q, [o(.pass1)]
   3414    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3415 
   3416 .pass1:
           ; results of the +16*3 group become in24..in31
   3417    mova   [rsp+gprsize+16*9 ], m0                        ;in24
   3418    mova   [rsp+gprsize+16*10], m4                        ;in28
   3419    mova   [rsp+gprsize+16*17], m2                        ;in26
   3420    mova   [rsp+gprsize+16*18], m6                        ;in30
   3421    mova   [rsp+gprsize+16*31], m1                        ;in25
   3422    mova   [rsp+gprsize+16*30], m3                        ;in27
   3423    mova   [rsp+gprsize+16*27], m5                        ;in29
   3424    mova   [rsp+gprsize+16*34], m7                        ;in31
   3425    LOAD_8ROWS     coeffq+16*2, 64
   3426    call  m(idct_8x8_internal_8bpc).main
   3427    mova                    m7, [o(pw_8192)]
   3428    lea                   tx2q, [o(.pass1_1)]
   3429    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3430 
   3431 .pass1_1:
           ; results of the +16*2 group become in16..in23; falls through
   3432    mova   [rsp+gprsize+16*7 ], m0                        ;in16
   3433    mova   [rsp+gprsize+16*8 ], m4                        ;in20
   3434    mova   [rsp+gprsize+16*15], m2                        ;in18
   3435    mova   [rsp+gprsize+16*16], m6                        ;in22
   3436    mova   [rsp+gprsize+16*33], m1                        ;in17
   3437    mova   [rsp+gprsize+16*28], m3                        ;in19
   3438    mova   [rsp+gprsize+16*29], m5                        ;in21
   3439    mova   [rsp+gprsize+16*32], m7                        ;in23
   3440 
   3441 .fast:
           ; common to both paths: the +16*1 group, then the +16*0 group
   3442    LOAD_8ROWS     coeffq+16*1, 64
   3443    call  m(idct_8x8_internal_8bpc).main
   3444    mova                    m7, [o(pw_8192)]
   3445    lea                   tx2q, [o(.pass1_end)]
   3446    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3447 
   3448 .pass1_end:
           ; results of the +16*1 group become in8..in15
   3449    mova   [rsp+gprsize+16*5 ], m0                        ;in8
   3450    mova   [rsp+gprsize+16*6 ], m4                        ;in12
   3451    mova   [rsp+gprsize+16*13], m2                        ;in10
   3452    mova   [rsp+gprsize+16*14], m6                        ;in14
   3453    mova   [rsp+gprsize+16*21], m1                        ;in9
   3454    mova   [rsp+gprsize+16*24], m3                        ;in11
   3455    mova   [rsp+gprsize+16*25], m5                        ;in13
   3456    mova   [rsp+gprsize+16*20], m7                        ;in15
   3457    LOAD_8ROWS     coeffq+16*0, 64
   3458    call  m(idct_8x8_internal_8bpc).main
   3459    mova                    m7, [o(pw_8192)]
   3460    lea                   tx2q, [o(.pass1_end1)]
   3461    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3462 
   3463 .pass1_end1:
           ; results of the +16*0 group become in0..in7; in0 stays in m0 and
           ; in4/in8/in12 are moved into m1/m2/m3 for the 8-point stage
   3464    mova   [rsp+gprsize+16*11], m2                        ;in2
   3465    mova   [rsp+gprsize+16*12], m6                        ;in6
   3466    mova   [rsp+gprsize+16*19], m1                        ;in1
   3467    mova   [rsp+gprsize+16*26], m3                        ;in3
   3468    mova   [rsp+gprsize+16*23], m5                        ;in5
   3469    mova   [rsp+gprsize+16*22], m7                        ;in7
   3470    mova                    m1, m4                        ;in4
   3471    mova                    m2, [rsp+gprsize+16*5 ]       ;in8
   3472    mova                    m3, [rsp+gprsize+16*6 ]       ;in12
   3473 
   3474    cmp                   eobd, 106
   3475    jg .full
   3476 
           ; fast path: m4-m7 are zeroed in place of in16..in28 (8-point
           ; stage) and of the upper four inputs of the 16-point stage
   3477    pxor                    m4, m4
   3478    REPX          {mova x, m4}, m5, m6, m7
   3479    call  m(idct_8x8_internal_8bpc).main
   3480    SAVE_7ROWS   rsp+gprsize+16*3 , 16
   3481    mova                    m0, [rsp+gprsize+16*11]
   3482    mova                    m1, [rsp+gprsize+16*12]
   3483    mova                    m2, [rsp+gprsize+16*13]
   3484    mova                    m3, [rsp+gprsize+16*14]
   3485    pxor                    m4, m4
   3486    REPX          {mova x, m4}, m5, m6, m7
   3487    call m(idct_16x8_internal_8bpc).main
           ; m7 is refilled from slot 0 before the rows are saved to slots 11..
   3488    mova                    m7, [rsp+gprsize+16*0]
   3489    SAVE_8ROWS   rsp+gprsize+16*11, 16
   3490 
   3491    call .main_fast
   3492    jmp  .pass2
   3493 
   3494 .full:
           ; full path: all inputs are present
   3495    mova                    m4, [rsp+gprsize+16*7 ]       ;in16
   3496    mova                    m5, [rsp+gprsize+16*8 ]       ;in20
   3497    mova                    m6, [rsp+gprsize+16*9 ]       ;in24
   3498    mova                    m7, [rsp+gprsize+16*10]       ;in28
   3499    call  m(idct_8x8_internal_8bpc).main
   3500    SAVE_7ROWS   rsp+gprsize+16*3 , 16
   3501    LOAD_8ROWS   rsp+gprsize+16*11, 16
   3502    call m(idct_16x8_internal_8bpc).main
   3503    mova                    m7, [rsp+gprsize+16*0]
   3504    SAVE_8ROWS   rsp+gprsize+16*11, 16
   3505    call .main
   3506 
   3507 .pass2:
           ; r3 = final continuation, taken after the fourth output step
   3508    lea                     r3, [o(.end6)]
   3509 
   3510 .end:
   3511    mova   [rsp+gprsize+16*0 ], m7
   3512    lea                   tx2q, [o(.end2)]
   3513 
   3514 .end1:
           ; zero all 32 vectors of the coefficient buffer, then continue at
           ; whatever tx2q holds
   3515    pxor                    m7, m7
   3516    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  \
   3517                                     8,  9,  10, 11, 12, 13, 14, 15, \
   3518                                     16, 17, 18, 19, 20, 21, 22, 23, \
   3519                                     24, 25, 26, 27, 28, 29, 30, 31
   3520 
   3521    jmp                   tx2q
   3522 
   3523 .end2:
           ; output step 1: the rows currently held in registers / slot 0
   3524    lea                   tx2q, [o(.end3)]
   3525    jmp   m(idct_8x8_internal_8bpc).end
   3526 
   3527 .end3:
           ; output step 2: rows from stack slots 11..18
   3528    LOAD_8ROWS   rsp+gprsize+16*11, 16
   3529    mova   [rsp+gprsize+16*0 ], m7
   3530    lea                   dstq, [dstq+strideq*2]
   3531    lea                   tx2q, [o(.end4)]
   3532    jmp   m(idct_8x8_internal_8bpc).end
   3533 
   3534 .end4:
           ; output step 3: rows from stack slots 19..26
   3535    LOAD_8ROWS   rsp+gprsize+16*19, 16
   3536    mova   [rsp+gprsize+16*0 ], m7
   3537    lea                   dstq, [dstq+strideq*2]
   3538    lea                   tx2q, [o(.end5)]
   3539    jmp   m(idct_8x8_internal_8bpc).end
   3540 
   3541 .end5:
           ; output step 4: rows from stack slots 27..34, then continue at r3
   3542    LOAD_8ROWS   rsp+gprsize+16*27, 16
   3543    mova   [rsp+gprsize+16*0 ], m7
   3544    lea                   dstq, [dstq+strideq*2]
   3545    mov                   tx2q, r3
   3546    jmp   m(idct_8x8_internal_8bpc).end
   3547 
   3548 .end6:
   3549    ret
   3550 
   3551 ALIGN function_align
        ; .main_veryfast: reduced-input entry into the shared .main2 tail.
        ; Only four source rows are read (annotated in1, in7, in5, in3, taken
        ; from stack slots 19, 22, 23 and 26). Each row is scaled by a pair of
        ; pw_*x8 constants with pmulhrsw, and the resulting terms are written
        ; to slots 19-24 and 29-34.
        ; m7 is loaded with pd_2048 and passed as the fifth ITX_MULSUB_2W
        ; argument; it is still live when control reaches .main2.
        ; NOTE(review): slots are addressed as rsp+gprsize*2+16*N, presumably
        ; because this label is entered through `call` from code that addresses
        ; the same slots as rsp+gprsize+16*N -- confirm against the callers.
   3552 cglobal_label .main_veryfast
   3553    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
   3554    pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t30,t31
   3555    pmulhrsw                m0, [o(pw_201x8)]             ;t16,t17
   3556    mova                    m7, [o(pd_2048)]
   3557    mova [rsp+gprsize*2+16*19], m0                        ;t16
   3558    mova [rsp+gprsize*2+16*34], m3                        ;t31
   3559    ITX_MULSUB_2W            3, 0, 1, 2, 7,  799, 4017    ;t17a, t30a
   3560    mova [rsp+gprsize*2+16*20], m3                        ;t17a
   3561    mova [rsp+gprsize*2+16*33], m0                        ;t30a
   3562    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
   3563    pmulhrsw                m2, m1, [o(pw_3857x8)]        ;t28,t29
   3564    pmulhrsw                m1, [o(pw_m1380x8)]           ;t18,t19
   3565    mova [rsp+gprsize*2+16*22], m1                        ;t19
   3566    mova [rsp+gprsize*2+16*31], m2                        ;t28
   3567    ITX_MULSUB_2W            2, 1, 0, 3, 7, m4017, 799    ;t18a, t29a
   3568    mova [rsp+gprsize*2+16*21], m2                        ;t18a
   3569    mova [rsp+gprsize*2+16*32], m1                        ;t29a
   3570    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
   3571    pmulhrsw                m3, m0, [o(pw_3973x8)]        ;t26, t27
   3572    pmulhrsw                m0, [o(pw_995x8)]             ;t20, t21
   3573    mova [rsp+gprsize*2+16*23], m0                        ;t20
   3574    mova [rsp+gprsize*2+16*30], m3                        ;t27
   3575    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3406, 2276    ;t21a, t26a
   3576    mova [rsp+gprsize*2+16*24], m3                        ;t21a
   3577    mova [rsp+gprsize*2+16*29], m0                        ;t26a
   3578    mova                    m2, [rsp+gprsize*2+16*26]     ;in3
           ; m0 and m3 are cleared here; .main_fast instead fills them with the
           ; two in13 products before taking the same jump.
   3579    pxor                    m0, m0
   3580    mova                    m3, m0
   3581    pmulhrsw                m1, m2, [o(pw_4052x8)]
   3582    pmulhrsw                m2, [o(pw_m601x8)]
   3583    jmp .main2
   3584 
   3585 ALIGN function_align
        ; .main_fast: half-input entry into the shared .main2 tail.
        ; Reads eight source rows from stack slots 19-26 (annotated in1, in15,
        ; in9, in7, in5, in11, in13, in3). Each row is scaled by two pw_*x8
        ; constants with pmulhrsw; the products of neighbouring rows are
        ; combined with saturating add/sub and one ITX_MULSUB_2W per group.
        ; The t16-t21 and t26-t31 terms are stored to slots 19-24 and 29-34.
        ; The in13/in3 products are left in m0-m3 for .main2, and m7 holds
        ; pd_2048 (the fifth ITX_MULSUB_2W argument).
        ; Slots are addressed as rsp+gprsize*2+16*N: one gprsize more than the
        ; rsp+gprsize+16*N used by the code that `call`s this label.
   3586 cglobal_label .main_fast ;bottom half is zero
   3587    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
   3588    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
   3589    pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t31a
   3590    pmulhrsw                m0, [o(pw_201x8)]             ;t16a
   3591    pmulhrsw                m2, m1, [o(pw_3035x8)]        ;t30a
   3592    pmulhrsw                m1, [o(pw_m2751x8)]           ;t17a
   3593    mova                    m7, [o(pd_2048)]
   3594    psubsw                  m4, m0, m1                    ;t17
   3595    paddsw                  m0, m1                        ;t16
   3596    psubsw                  m5, m3, m2                    ;t30
   3597    paddsw                  m3, m2                        ;t31
   3598    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
   3599    mova [rsp+gprsize*2+16*19], m0                        ;t16
   3600    mova [rsp+gprsize*2+16*20], m5                        ;t17a
   3601    mova [rsp+gprsize*2+16*33], m4                        ;t30a
   3602    mova [rsp+gprsize*2+16*34], m3                        ;t31
           ; second group: in9 / in7 -> t18a, t19, t28, t29a
   3603    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
   3604    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
   3605    pmulhrsw                m3, m0, [o(pw_3703x8)]
   3606    pmulhrsw                m0, [o(pw_1751x8)]
   3607    pmulhrsw                m2, m1, [o(pw_3857x8)]
   3608    pmulhrsw                m1, [o(pw_m1380x8)]
   3609    psubsw                  m4, m1, m0                    ;t18
   3610    paddsw                  m0, m1                        ;t19
   3611    psubsw                  m5, m2, m3                    ;t29
   3612    paddsw                  m3, m2                        ;t28
   3613    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a
   3614    mova [rsp+gprsize*2+16*21], m5                        ;t18a
   3615    mova [rsp+gprsize*2+16*22], m0                        ;t19
   3616    mova [rsp+gprsize*2+16*31], m3                        ;t28
   3617    mova [rsp+gprsize*2+16*32], m4                        ;t29a
           ; third group: in5 / in11 -> t20, t21a, t26a, t27
   3618    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
   3619    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
   3620    pmulhrsw                m3, m0, [o(pw_3973x8)]
   3621    pmulhrsw                m0, [o(pw_995x8)]
   3622    pmulhrsw                m2, m1, [o(pw_3513x8)]
   3623    pmulhrsw                m1, [o(pw_m2106x8)]
   3624    psubsw                  m4, m0, m1                    ;t21
   3625    paddsw                  m0, m1                        ;t20
   3626    psubsw                  m5, m3, m2                    ;t26
   3627    paddsw                  m3, m2                        ;t27
   3628    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
   3629    mova [rsp+gprsize*2+16*23], m0                        ;t20
   3630    mova [rsp+gprsize*2+16*24], m5                        ;t21a
   3631    mova [rsp+gprsize*2+16*29], m4                        ;t26a
   3632    mova [rsp+gprsize*2+16*30], m3                        ;t27
           ; fourth group: in13 / in3 products stay in m0-m3 and are combined
           ; at .main2 instead of being stored
   3633    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
   3634    mova                    m2, [rsp+gprsize*2+16*26]     ;in3
   3635    pmulhrsw                m3, m0, [o(pw_3290x8)]
   3636    pmulhrsw                m0, [o(pw_2440x8)]
   3637    pmulhrsw                m1, m2, [o(pw_4052x8)]
   3638    pmulhrsw                m2, [o(pw_m601x8)]
   3639    jmp .main2
   3640 
   3641 ALIGN function_align
        ; .main: full-input entry. Reads sixteen source rows from stack slots
        ; 19-34 in four groups of four (slots 19/20/33/34, 21/22/31/32,
        ; 23/24/29/30 and 25/26/27/28, annotated in1..in31 odd). Every group
        ; goes through two ITX_MULSUB_2W steps, a saturating add/sub butterfly
        ; and, for the first three groups, a third ITX_MULSUB_2W before the
        ; terms are stored back to slots 19-24 and 29-34.
        ; The last group is left in m0-m3 and execution falls through into
        ; .main2. m7 holds pd_2048 throughout (fifth ITX_MULSUB_2W argument).
   3642 cglobal_label .main
   3643    mova                    m7, [o(pd_2048)]
   3644    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
   3645    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
   3646    mova                    m2, [rsp+gprsize*2+16*33]     ;in17
   3647    mova                    m3, [rsp+gprsize*2+16*34]     ;in31
   3648    ITX_MULSUB_2W            0, 3, 4, 5, 7,  201, 4091    ;t16a, t31a
   3649    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3035, 2751    ;t17a, t30a
   3650    psubsw                  m4, m0, m2                    ;t17
   3651    paddsw                  m0, m2                        ;t16
   3652    psubsw                  m5, m3, m1                    ;t30
   3653    paddsw                  m3, m1                        ;t31
   3654    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
   3655    mova [rsp+gprsize*2+16*19], m0                        ;t16
   3656    mova [rsp+gprsize*2+16*20], m5                        ;t17a
   3657    mova [rsp+gprsize*2+16*33], m4                        ;t30a
   3658    mova [rsp+gprsize*2+16*34], m3                        ;t31
           ; second group: in9 / in7 / in25 / in23
   3659    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
   3660    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
   3661    mova                    m2, [rsp+gprsize*2+16*31]     ;in25
   3662    mova                    m3, [rsp+gprsize*2+16*32]     ;in23
   3663    ITX_MULSUB_2W            0, 3, 4, 5, 7, 1751, 3703    ;t18a, t29a
   3664    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3857, 1380    ;t19a, t28a
   3665    psubsw                  m4, m2, m0                    ;t18
   3666    paddsw                  m0, m2                        ;t19
   3667    psubsw                  m5, m1, m3                    ;t29
   3668    paddsw                  m3, m1                        ;t28
   3669    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4017, 799    ;t18a, t29a
   3670    mova [rsp+gprsize*2+16*21], m5                        ;t18a
   3671    mova [rsp+gprsize*2+16*22], m0                        ;t19
   3672    mova [rsp+gprsize*2+16*31], m3                        ;t28
   3673    mova [rsp+gprsize*2+16*32], m4                        ;t29a
           ; third group: in5 / in11 / in21 / in27
   3674    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
   3675    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
   3676    mova                    m2, [rsp+gprsize*2+16*29]     ;in21
   3677    mova                    m3, [rsp+gprsize*2+16*30]     ;in27
   3678    ITX_MULSUB_2W            0, 3, 4, 5, 7,  995, 3973    ;t20a, t27a
   3679    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3513, 2106    ;t21a, t26a
   3680    psubsw                  m4, m0, m2                    ;t21
   3681    paddsw                  m0, m2                        ;t20
   3682    psubsw                  m5, m3, m1                    ;t26
   3683    paddsw                  m3, m1                        ;t27
   3684    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
   3685    mova [rsp+gprsize*2+16*23], m0                        ;t20
   3686    mova [rsp+gprsize*2+16*24], m5                        ;t21a
   3687    mova [rsp+gprsize*2+16*29], m4                        ;t26a
   3688    mova [rsp+gprsize*2+16*30], m3                        ;t27
           ; fourth group: in13 / in3 / in29 / in19; the results stay in m0-m3
           ; for the code that follows
   3689    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
   3690    mova                    m1, [rsp+gprsize*2+16*26]     ;in3
   3691    mova                    m2, [rsp+gprsize*2+16*27]     ;in29
   3692    mova                    m3, [rsp+gprsize*2+16*28]     ;in19
   3693    ITX_MULSUB_2W            0, 3, 4, 5, 7, 2440, 3290    ;t22a, t25a
   3694    ITX_MULSUB_2W            2, 1, 4, 5, 7, 4052,  601    ;t23a, t24a
   3695 
   3696 .main2:
        ; Shared tail reached by jump from .main_veryfast / .main_fast and by
        ; fall-through from .main.
        ; On entry: m0-m3 hold the four terms of the last input group, m7 is
        ; pd_2048, slots 19-24 and 29-34 hold the t16-t21 / t26-t31 terms, and
        ; slots 3-18 hold tmp0-tmp15 (read below as the other half of each
        ; output pair).
        ; On return (per the annotations): m0-m7 = out0-out7, slots 11-18 =
        ; out8-out15, slots 19-34 = out16-out31; out1, out3, out4 and out6
        ; are additionally stored in slots 4, 6, 7 and 9.
   3697    psubsw                  m4, m2, m0                    ;t22
   3698    paddsw                  m0, m2                        ;t23
   3699    psubsw                  m5, m1, m3                    ;t25
   3700    paddsw                  m3, m1                        ;t24
   3701    ITX_MULSUB_2W            5, 4, 1, 2, 7, m2276, 3406   ;t22a, t25a
   3702    mova                    m2, [rsp+gprsize*2+16*24]     ;t21a
   3703    psubsw                  m1, m5, m2                    ;t21
   3704    paddsw                  m5, m2                        ;t22
   3705    mova [rsp+gprsize*2+16*25], m5                        ;t22
   3706    mova                    m2, [rsp+gprsize*2+16*29]     ;t26a
   3707    psubsw                  m5, m4, m2                    ;t26
   3708    paddsw                  m4, m2                        ;t25
   3709    mova [rsp+gprsize*2+16*28], m4                        ;t25
   3710    ITX_MULSUB_2W            5, 1, 2, 4, 7, m3784, 1567   ;t21a, t26a
   3711    mova [rsp+gprsize*2+16*24], m5                        ;t21a
   3712    mova [rsp+gprsize*2+16*29], m1                        ;t26a
   3713 
   3714    mova                    m1, [rsp+gprsize*2+16*23]     ;t20
   3715    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
   3716    psubsw                  m2, m0, m1                    ;t20a
   3717    paddsw                  m0, m1                        ;t23a
   3718    psubsw                  m6, m3, m5                    ;t27a
   3719    paddsw                  m3, m5                        ;t24a
           ; the t20 result stays in m6 until it is consumed further down
   3720    ITX_MULSUB_2W            6, 2, 1, 5, 7, m3784, 1567   ;t20, t27
   3721    mova [rsp+gprsize*2+16*26], m0                        ;t23a
   3722    mova [rsp+gprsize*2+16*27], m3                        ;t24a
   3723    mova [rsp+gprsize*2+16*30], m2                        ;t27
   3724 
   3725    mova                    m0, [rsp+gprsize*2+16*20]     ;t17a
   3726    mova                    m1, [rsp+gprsize*2+16*21]     ;t18a
   3727    mova                    m2, [rsp+gprsize*2+16*32]     ;t29a
   3728    mova                    m3, [rsp+gprsize*2+16*33]     ;t30a
   3729    psubsw                  m4, m0, m1                    ;t18
   3730    paddsw                  m0, m1                        ;t17
   3731    psubsw                  m5, m3, m2                    ;t29
   3732    paddsw                  m3, m2                        ;t30
   3733    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t18a, t29a
   3734    mova [rsp+gprsize*2+16*20], m0                        ;t17
   3735    mova [rsp+gprsize*2+16*21], m5                        ;t18a
   3736    mova [rsp+gprsize*2+16*32], m4                        ;t29a
   3737    mova [rsp+gprsize*2+16*33], m3                        ;t30
   3738    mova                    m0, [rsp+gprsize*2+16*19]     ;t16
   3739    mova                    m1, [rsp+gprsize*2+16*22]     ;t19
   3740    mova                    m2, [rsp+gprsize*2+16*31]     ;t28
   3741    mova                    m3, [rsp+gprsize*2+16*34]     ;t31
   3742    psubsw                  m4, m0, m1                    ;t19a
   3743    paddsw                  m0, m1                        ;t16a
   3744    psubsw                  m5, m3, m2                    ;t28a
   3745    paddsw                  m3, m2                        ;t31a
   3746    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t19, t28
           ; from here on each finished t-term is paired with a tmpN slot: the
           ; sum overwrites that tmp slot (outN) and the difference is written
           ; to the mirrored slot (out(31-N))
   3747    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp12
   3748    psubsw                  m1, m5, m6                    ;t20a
   3749    paddsw                  m5, m6                        ;t19a
   3750    psubsw                  m6, m2, m5                    ;out19
   3751    paddsw                  m2, m5                        ;out12
   3752    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
   3753    mova [rsp+gprsize*2+16*22], m6                        ;out19
   3754    mova [rsp+gprsize*2+16*15], m2                        ;out12
   3755    psubsw                  m6, m4, m5                    ;t27a
   3756    paddsw                  m4, m5                        ;t28a
   3757    ITX_MULSUB_2W            6, 1, 2, 5, 7, 2896, 2896    ;t20, t27
   3758    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp3
   3759    psubsw                  m5, m2, m4                    ;out28
   3760    paddsw                  m2, m4                        ;out3
   3761    mova                    m4, [rsp+gprsize*2+16*14]     ;tmp11
   3762    mova [rsp+gprsize*2+16*31], m5                        ;out28
   3763    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
   3764    psubsw                  m5, m4, m6                    ;out20
   3765    paddsw                  m4, m6                        ;out11
   3766    mova                    m2, [rsp+gprsize*2+16*7 ]     ;tmp4
   3767    mova [rsp+gprsize*2+16*23], m5                        ;out20
   3768    mova [rsp+gprsize*2+16*14], m4                        ;out11
   3769    psubsw                  m5, m2, m1                    ;out27
   3770    paddsw                  m2, m1                        ;out4
   3771    mova                    m1, [rsp+gprsize*2+16*26]     ;t23a
   3772    mova                    m4, [rsp+gprsize*2+16*27]     ;t24a
   3773    mova [rsp+gprsize*2+16*30], m5                        ;out27
   3774    mova [rsp+gprsize*2+16*7 ], m2                        ;out4
   3775    psubsw                  m5, m0, m1                    ;t23
   3776    paddsw                  m0, m1                        ;t16
   3777    psubsw                  m2, m3, m4                    ;t24
   3778    paddsw                  m3, m4                        ;t31
   3779    ITX_MULSUB_2W            2, 5, 4, 6, 7, 2896, 2896    ;t23a, t24a
   3780    mova                    m6, [rsp+gprsize*2+16*18]     ;tmp15
   3781    psubsw                  m4, m6, m0                    ;out16
   3782    paddsw                  m6, m0                        ;out15
   3783    mova                    m0, [rsp+gprsize*2+16*3 ]     ;tmp0
   3784    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp8
   3785    mova [rsp+gprsize*2+16*18], m6                        ;out15
   3786    mova [rsp+gprsize*2+16*19], m4                        ;out16
   3787    psubsw                  m6, m0, m3                    ;out31
           ; out0 remains in m0 until the return
   3788    paddsw                  m0, m3                        ;out0
   3789    psubsw                  m4, m1, m2                    ;out23
   3790    paddsw                  m1, m2                        ;out8
   3791    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp7
   3792    mova [rsp+gprsize*2+16*34], m6                        ;out31
   3793    mova [rsp+gprsize*2+16*11], m1                        ;out8
   3794    mova [rsp+gprsize*2+16*26], m4                        ;out23
           ; out7 is parked in m6 and moved to m7 just before the return
   3795    paddsw                  m6, m3, m5                    ;out7
   3796    psubsw                  m3, m5                        ;out24
   3797    mova                    m1, [rsp+gprsize*2+16*20]     ;t17
   3798    mova                    m5, [rsp+gprsize*2+16*25]     ;t22
   3799    mova                    m2, [rsp+gprsize*2+16*17]     ;tmp14
   3800    mova [rsp+gprsize*2+16*27], m3                        ;out24
   3801    psubsw                  m4, m1, m5                    ;t22a
   3802    paddsw                  m1, m5                        ;t17a
   3803    psubsw                  m3, m2, m1                    ;out17
   3804    paddsw                  m2, m1                        ;out14
   3805    mova                    m5, [rsp+gprsize*2+16*28]     ;t25
   3806    mova                    m1, [rsp+gprsize*2+16*33]     ;t30
   3807    mova [rsp+gprsize*2+16*17], m2                        ;out14
   3808    mova [rsp+gprsize*2+16*20], m3                        ;out17
   3809    psubsw                  m2, m1, m5                    ;t25a
   3810    paddsw                  m1, m5                        ;t30a
   3811    ITX_MULSUB_2W            2, 4, 3, 5, 7, 2896, 2896    ;t22, t25
   3812    mova                    m5, [rsp+gprsize*2+16*4 ]     ;tmp1
   3813    psubsw                  m3, m5, m1                    ;out30
   3814    paddsw                  m5, m1                        ;out1
   3815    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp9
   3816    mova [rsp+gprsize*2+16*33], m3                        ;out30
   3817    mova [rsp+gprsize*2+16*4 ], m5                        ;out1
   3818    psubsw                  m3, m1, m2                    ;out22
   3819    paddsw                  m1, m2                        ;out9
   3820    mova                    m5, [rsp+gprsize*2+16*9 ]     ;tmp6
   3821    mova [rsp+gprsize*2+16*25], m3                        ;out22
   3822    mova [rsp+gprsize*2+16*12], m1                        ;out9
   3823    psubsw                  m3, m5, m4                    ;out25
   3824    paddsw                  m5, m4                        ;out6
   3825    mova                    m4, [rsp+gprsize*2+16*21]     ;t18a
   3826    mova                    m1, [rsp+gprsize*2+16*24]     ;t21a
   3827    mova                    m2, [rsp+gprsize*2+16*16]     ;tmp13
   3828    mova [rsp+gprsize*2+16*28], m3                        ;out25
   3829    mova [rsp+gprsize*2+16*9 ], m5                        ;out6
   3830    paddsw                  m3, m4, m1                    ;t18
   3831    psubsw                  m4, m1                        ;t21
   3832    psubsw                  m5, m2, m3                    ;out18
   3833    paddsw                  m2, m3                        ;out13
   3834    mova                    m1, [rsp+gprsize*2+16*29]     ;t26a
   3835    mova                    m3, [rsp+gprsize*2+16*32]     ;t29a
   3836    mova [rsp+gprsize*2+16*21], m5                        ;out18
   3837    mova [rsp+gprsize*2+16*16], m2                        ;out13
   3838    psubsw                  m5, m3, m1                    ;t26
   3839    paddsw                  m3, m1                        ;t29
           ; last ITX_MULSUB_2W: m7 (pd_2048) is free to be overwritten after it
   3840    ITX_MULSUB_2W            5, 4, 1, 2, 7, 2896, 2896    ;t21a, t26a
   3841    mova                    m2, [rsp+gprsize*2+16*5 ]     ;tmp2
   3842    psubsw                  m1, m2, m3                    ;out29
   3843    paddsw                  m2, m3                        ;out2
   3844    mova                    m3, [rsp+gprsize*2+16*13]     ;tmp10
   3845    mova [rsp+gprsize*2+16*32], m1                        ;out29
   3846    psubsw                  m7, m3, m5                    ;out21
   3847    paddsw                  m3, m5                        ;out10
   3848    mova                    m5, [rsp+gprsize*2+16*8 ]     ;tmp5
   3849    mova [rsp+gprsize*2+16*24], m7                        ;out21
   3850    mova [rsp+gprsize*2+16*13], m3                        ;out10
   3851    psubsw                  m1, m5, m4                    ;out26
   3852    paddsw                  m5, m4                        ;out5
           ; reload the remaining low outputs so m0-m7 end up as out0-out7
   3853    mova                    m7, m6                        ;out7
   3854    mova                    m3, [rsp+gprsize*2+16*6 ]     ;out3
   3855    mova                    m4, [rsp+gprsize*2+16*7 ]     ;out4
   3856    mova [rsp+gprsize*2+16*29], m1                        ;out26
   3857    mova                    m6, [rsp+gprsize*2+16*9 ]     ;out6
   3858    mova                    m1, [rsp+gprsize*2+16*4 ]     ;out1
   3859    ret
   3860 
   3861 
   3862 cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
        ; Top-level entry for the 32x8 dct_dct case.
        ;   eob == 0 : DC-only shortcut (.dconly), which adds one value to
        ;              32 bytes per row of dst for 8 rows.
        ;   eob != 0 : calls idct_32x8_internal_8bpc and returns.
        ; On x86-32 r5 is loaded with $$, presumably as the base for the o()
        ; address macro -- confirm against its definition.
   3863 %if ARCH_X86_32
   3864    LEA                     r5, $$
   3865 %endif
   3866    test                  eobd, eobd
   3867    jz .dconly
   3868    call  m(idct_32x8_internal_8bpc)
   3869    RET
   3870 
   3871 .dconly:
   3872    movd                    m1, [o(pw_2896x8)]
   3873    pmulhrsw                m0, m1, [coeffq]
   3874    movd                    m2, [o(pw_8192)]
           ; eobd is zero on this path, so this store clears the coefficient
           ; that was just read
   3875    mov               [coeffq], eobd
           ; r3d = number of rows processed by .loop
   3876    mov                    r3d, 8
           ; tx2q = address .loop jumps to when it finishes
   3877    lea                   tx2q, [o(.end)]
   3878 
        ; NOTE(review): .body looks like a shared entry for other transform
        ; sizes (inputs m0, m1, m2, r3d, tx2q) -- no such caller is visible here.
   3879 .body:
   3880    pmulhrsw                m0, m2
   3881    movd                    m2, [o(pw_2048)]  ;intentionally rip-relative
   3882    pmulhrsw                m0, m1
   3883    pmulhrsw                m0, m2
           ; replicate word 0 of m0 into all eight word lanes
   3884    pshuflw                 m0, m0, q0000
   3885    punpcklwd               m0, m0
           ; m5 = 0, used to widen bytes to words
   3886    pxor                    m5, m5
   3887 
        ; One row per iteration: widen 32 dst bytes to words, add m0, pack
        ; back with unsigned saturation, store, then step dst by stride.
   3888 .loop:
   3889    mova                    m1, [dstq+16*0]
   3890    mova                    m3, [dstq+16*1]
   3891    punpckhbw               m2, m1, m5
   3892    punpcklbw               m1, m5
   3893    punpckhbw               m4, m3, m5
   3894    punpcklbw               m3, m5
   3895    paddw                   m2, m0
   3896    paddw                   m1, m0
   3897    paddw                   m4, m0
   3898    paddw                   m3, m0
   3899    packuswb                m1, m2
   3900    packuswb                m3, m4
   3901    mova           [dstq+16*0], m1
   3902    mova           [dstq+16*1], m3
   3903    add                   dstq, strideq
   3904    dec                    r3d
   3905    jg .loop
   3906    jmp                   tx2q
   3907 
   3908 .end:
   3909    RET
   3910 
   3911 
   3912 cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Internal 32x8 idct body, entered via `call` from the top-level
        ; function.
        ; Step 1: run idct_8x8 .main on the rows loaded from coeffq+16*0
        ;         (stride 64) and save 7 rows to stack slots 3..; run
        ;         idct_16x8 .main on the rows from coeffq+16*2 (stride 64)
        ;         and save 8 rows to slots 11-18.
        ; Step 2: scatter the rows from coeffq+16*1 (stride 32) into slots
        ;         19-26 as in1..in15, then either call the 8x32 .main_fast
        ;         (eob <= 106) or also load in17..in31 from coeffq+16*17 into
        ;         slots 27-34 and call the 8x32 .main.
        ; Step 3: from .pass2 on, control is threaded through external labels
        ;         that jump back through tx2q: each group of eight rows goes
        ;         through idct_8x8 .pass1_end1 and then .pass2_main, with dstq
        ;         advanced by 8 bytes per group (next pointer kept in r3).
        ; NOTE(review): the exact behaviour of LOAD_8ROWS / SAVE_7ROWS /
        ; SAVE_8ROWS and of the external labels is not visible here.
   3913    LOAD_8ROWS     coeffq+16*0, 64
   3914    call  m(idct_8x8_internal_8bpc).main
   3915    SAVE_7ROWS    rsp+gprsize+16*3, 16
   3916 
   3917    LOAD_8ROWS     coeffq+16*2, 64
   3918    call m(idct_16x8_internal_8bpc).main
   3919    mova                    m7, [rsp+gprsize+16*0]
   3920    SAVE_8ROWS   rsp+gprsize+16*11, 16
   3921 
   3922    LOAD_8ROWS     coeffq+16*1, 32
   3923    mova   [rsp+gprsize+16*19], m0                        ;in1
   3924    mova   [rsp+gprsize+16*26], m1                        ;in3
   3925    mova   [rsp+gprsize+16*23], m2                        ;in5
   3926    mova   [rsp+gprsize+16*22], m3                        ;in7
   3927    mova   [rsp+gprsize+16*21], m4                        ;in9
   3928    mova   [rsp+gprsize+16*24], m5                        ;in11
   3929    mova   [rsp+gprsize+16*25], m6                        ;in13
   3930    mova   [rsp+gprsize+16*20], m7                        ;in15
   3931 
           ; eob <= 106: only in1..in15 are supplied to the 8x32 stage
   3932    cmp                   eobd, 106
   3933    jg  .full
   3934    call m(idct_8x32_internal_8bpc).main_fast
   3935    jmp .pass2
   3936 
   3937 .full:
   3938    LOAD_8ROWS    coeffq+16*17, 32
   3939    mova   [rsp+gprsize+16*33], m0                        ;in17
   3940    mova   [rsp+gprsize+16*28], m1                        ;in19
   3941    mova   [rsp+gprsize+16*29], m2                        ;in21
   3942    mova   [rsp+gprsize+16*32], m3                        ;in23
   3943    mova   [rsp+gprsize+16*31], m4                        ;in25
   3944    mova   [rsp+gprsize+16*30], m5                        ;in27
   3945    mova   [rsp+gprsize+16*27], m6                        ;in29
   3946    mova   [rsp+gprsize+16*34], m7                        ;in31
   3947    call m(idct_8x32_internal_8bpc).main
   3948 
   3949 .pass2:
           ; m7 is parked in slot 0 before jumping to the 8x32 .end1 label,
           ; which zeroes m7 and comes back through tx2q (= .end)
   3950    mova   [rsp+gprsize+16*0 ], m7
   3951    lea                   tx2q, [o(.end)]
   3952    jmp  m(idct_8x32_internal_8bpc).end1
   3953 
        ; first group of eight rows (still in registers / slot 0)
   3954 .end:
   3955    mova                    m7, [o(pw_8192)]
   3956    lea                   tx2q, [o(.end1)]
   3957    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3958 
   3959 .end1:
           ; r3 = dst pointer for the next group, 8 bytes to the right
   3960    lea                     r3, [dstq+8]
   3961    lea                   tx2q, [o(.end2)]
   3962    jmp   m(idct_8x8_internal_8bpc).pass2_main
   3963 
        ; second group: rows saved in slots 11-18
   3964 .end2:
   3965    LOAD_8ROWS   rsp+gprsize+16*11, 16
   3966    mova   [rsp+gprsize+16*0 ], m7
   3967    mova                    m7, [o(pw_8192)]
   3968    lea                   tx2q, [o(.end3)]
   3969    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3970 
   3971 .end3:
   3972    mov                   dstq, r3
   3973    add                     r3, 8
   3974    lea                   tx2q, [o(.end4)]
   3975    jmp   m(idct_8x8_internal_8bpc).pass2_main
   3976 
        ; third group: rows saved in slots 19-26
   3977 .end4:
   3978    LOAD_8ROWS   rsp+gprsize+16*19, 16
   3979    mova   [rsp+gprsize+16*0 ], m7
   3980    mova                    m7, [o(pw_8192)]
   3981    lea                   tx2q, [o(.end5)]
   3982    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3983 
   3984 .end5:
   3985    mov                   dstq, r3
   3986    add                     r3, 8
   3987    lea                   tx2q, [o(.end6)]
   3988    jmp   m(idct_8x8_internal_8bpc).pass2_main
   3989 
        ; fourth group: rows saved in slots 27-34
   3990 .end6:
   3991    LOAD_8ROWS   rsp+gprsize+16*27, 16
   3992    mova   [rsp+gprsize+16*0 ], m7
   3993    mova                    m7, [o(pw_8192)]
   3994    lea                   tx2q, [o(.end7)]
   3995    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   3996 
   3997 .end7:
   3998    mov                   dstq, r3
   3999    lea                   tx2q, [o(.end8)]
   4000    jmp   m(idct_8x8_internal_8bpc).pass2_main
   4001 
        ; .end8 is a bare return; the identity functions in this file also
        ; take its address as a continuation target.
   4002 .end8:
   4003    ret
   4004 
   4005 
   4006 cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
        ; 8x32 identity/identity case, processed as a loop over groups of
        ; eight coefficient vectors (loaded with a 64-byte stride).
        ; The iteration count is 2, or 4 when eob >= 107.
        ; Per iteration: add pw_5 to all eight vectors, run them through
        ; idct_8x8 .pass1_end3, shift right arithmetically by 3, hand them to
        ; idct_8x8 .end3, advance dst by two strides, zero the eight source
        ; vectors and step coeffq by 16 bytes.
        ; NOTE(review): what .pass1_end3 / .end3 do (and how far .end3 itself
        ; advances dstq) is defined outside this view.
   4007    mov                    r5d, 4
   4008    mov                   tx2d, 2
   4009    cmp                   eobd, 107
           ; sign flag clear (eob - 107 >= 0) selects the count of 4
   4010    cmovns                tx2d, r5d
   4011    mov                    r3d, tx2d
   4012 %if ARCH_X86_32
   4013    LEA                     r5, $$
   4014 %endif
           ; tx2q now holds the address of a bare `ret` (idct_32x8 .end8)
   4015    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
   4016 .loop:
   4017    LOAD_8ROWS     coeffq+16*0, 64
           ; m6 gets its bias first and is parked in stack slot 1 so the
           ; register can hold the pw_5 constant for the other seven rows
   4018    paddsw                  m6, [o(pw_5)]
   4019    mova            [rsp+16*1], m6
   4020    mova                    m6, [o(pw_5)]
   4021    REPX        {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
   4022    call  m(idct_8x8_internal_8bpc).pass1_end3
   4023    REPX        {psraw  x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
   4024    mova            [rsp+16*2], m5
   4025    mova            [rsp+16*1], m6
   4026    mova            [rsp+16*0], m7
   4027    call  m(idct_8x8_internal_8bpc).end3
   4028    lea                   dstq, [dstq+strideq*2]
           ; clear the eight coefficient vectors consumed by this iteration
   4029    pxor                    m7, m7
   4030    REPX   {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
   4031    add                 coeffq, 16
   4032    dec                    r3d
   4033    jg .loop
   4034    RET
   4035 
   4036 cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
        ; 32x8 identity/identity case, processed as a loop over groups of
        ; eight consecutive coefficient vectors (16-byte stride), each group
        ; being written 8 bytes further right in dst.
        ; The iteration count is 2, or 4 when eob >= 107.
        ; Per iteration: scale all eight vectors with pmulhrsw by pw_4096, run
        ; them through idct_8x8 .pass1_end3 and .end3, then step coeffq by
        ; 16*8 bytes and dst by 8 bytes.
        ; dstq is saved to stack slot 3 across the .end3 call and restored
        ; afterwards, so the 8-byte step is applied to the pre-call pointer.
        ; NOTE(review): what .pass1_end3 / .end3 do is defined outside this
        ; view; unlike the 8x32 variant no coefficient clearing is done here --
        ; confirm that is intended.
   4037    mov                    r5d, 4
   4038    mov                   tx2d, 2
   4039    cmp                   eobd, 107
           ; sign flag clear (eob - 107 >= 0) selects the count of 4
   4040    cmovns                tx2d, r5d
   4041    mov                    r3d, tx2d
   4042 %if ARCH_X86_32
   4043    LEA                     r5, $$
   4044 %endif
   4045 
   4046 .loop:
   4047    LOAD_8ROWS     coeffq+16*0, 16
           ; m6 is scaled first and parked in stack slot 1 so the register can
           ; hold the pw_4096 constant for the other seven rows
   4048    pmulhrsw                m6, [o(pw_4096)]
   4049    mova            [rsp+16*1], m6
   4050    mova                    m6, [o(pw_4096)]
   4051    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
           ; tx2q = address of a bare `ret` (idct_32x8 .end8)
   4052    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
   4053    call  m(idct_8x8_internal_8bpc).pass1_end3
   4054 
   4055    mov             [rsp+16*3], dstq
   4056    mova            [rsp+16*2], m5
   4057    mova            [rsp+16*1], m6
   4058    mova            [rsp+16*0], m7
   4059    lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
   4060    call  m(idct_8x8_internal_8bpc).end3
   4061 
   4062    add                 coeffq, 16*8
   4063    mov                   dstq, [rsp+16*3]
   4064    lea                   dstq, [dstq+8]
           ; The loop ends when the counter reaches zero. No second branch on
           ; CF may follow: dec leaves CF untouched, so CF here is still the
           ; (clear) carry of the add above, and a `jnc .loop` would always be
           ; taken, making the loop run past its buffers.
   4065    dec                    r3d
   4066    jg .loop
   4068    RET
   4069 
   4070 
   4071 cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
        ; Top-level entry for the 16x32 dct_dct case.
        ;   eob != 0 : calls idct_16x32_internal_8bpc and returns.
        ;   eob == 0 : DC-only shortcut; prepares m0/m1/m2, r2d and tx2q and
        ;              tail-jumps into the 16x4 function's .dconly code.
   4072 %if ARCH_X86_32
   4073    LEA                     r5, $$
   4074 %endif
   4075    test                  eobd, eobd
   4076    jz .dconly
   4077    call  m(idct_16x32_internal_8bpc)
        ; .end doubles as the return target handed to the shared DC-only code
   4078 .end:
   4079    RET
   4080 
   4081 .dconly:
   4082    movd                    m1, [o(pw_2896x8)]
   4083    pmulhrsw                m0, m1, [coeffq]
   4084    movd                    m2, [o(pw_16384)]
           ; eobd is zero on this path, so this store clears the coefficient
           ; that was just read
   4085    mov               [coeffq], eobd
   4086    pmulhrsw                m0, m1
           ; NOTE(review): r2d = 16 is presumably the loop count expected by
           ; the 16x4 .dconly code; with x86inc argument numbering r2 would be
           ; the coeff register, which is not read again here -- confirm.
   4087    mov                    r2d, 16
   4088    lea                   tx2q, [o(.end)]
   4089    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
   4090 
   4091 
   4092 cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Worker for the 16x32 DCT_DCT transform. It is entered by `call`
        ; from inv_txfm_add_dct_dct_16x32_8bpc and declares no arguments,
        ; registers or stack of its own (0, 0, 0); stack slots are addressed
        ; as rsp+gprsize+16*n, i.e. biased past the return address.
        ;
        ; Control flow is continuation-style: each stage loads the address
        ; of the next local label into tx2q (or r3) and tail-jumps into a
        ; shared helper, which is expected to jump back through that
        ; register (the helpers are defined outside this chunk).
        ;
        ; Pass 1 handles two 8-row groups when eob <= 150 and four when
        ; eob > 150 (.full). Pass 2 is then run twice: once directly
        ; (.pass2) and once for dst+8 after advancing coeffq by 16*32 (.end).
        ;
        ; ---- pass 1, group at coeffq+16*1 / +16*5 (stride 128) ----
   4093    LOAD_8ROWS     coeffq+16*1, 128, 1
   4094    call  m(idct_8x8_internal_8bpc).main
   4095    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4096    LOAD_8ROWS     coeffq+16*5, 128, 1
   4097    call m(idct_16x8_internal_8bpc).main
   4098    lea                   tx2q, [o(.pass1_end)]           ; continuation after the helper
   4099    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4100 
   4101 .pass1_end:
        ; Store the eight registers to coeffq+16*33 (stride 64), reload the
        ; rows parked at rsp+gprsize+16*3 and take a second trip through
        ; the helper. m7 is spilled to slot 0 first (presumably where the
        ; helper expects it - confirm in idct_8x8_internal_8bpc).
   4102    SAVE_8ROWS    coeffq+16*33, 64               ;in8~in15
   4103    LOAD_8ROWS    rsp+gprsize+16*3, 16
   4104    mova    [rsp+gprsize+16*0], m7
   4105    lea                   tx2q, [o(.pass1_end1)]
   4106    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4107 
   4108 .pass1_end1:
        ; Scatter this group's registers to the slots from which pass 2
        ; will read them (two go back into the coefficient buffer, six
        ; onto the stack).
   4109    mova        [coeffq+16*1 ], m0                        ;in8
   4110    mova        [coeffq+16*5 ], m4                        ;in12
   4111    mova   [rsp+gprsize+16*13], m2                        ;in10
   4112    mova   [rsp+gprsize+16*14], m6                        ;in14
   4113    mova   [rsp+gprsize+16*21], m1                        ;in9
   4114    mova   [rsp+gprsize+16*24], m3                        ;in11
   4115    mova   [rsp+gprsize+16*25], m5                        ;in13
   4116    mova   [rsp+gprsize+16*20], m7                        ;in15
        ; ---- pass 1, group at coeffq+16*0 / +16*4 (stride 128) ----
   4117    LOAD_8ROWS     coeffq+16*0, 128, 1
   4118    call  m(idct_8x8_internal_8bpc).main
   4119    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4120    LOAD_8ROWS     coeffq+16*4, 128, 1
   4121    call m(idct_16x8_internal_8bpc).main
   4122    lea                   tx2q, [o(.pass1_end2)]
   4123    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4124 
   4125 .pass1_end2:
   4126    SAVE_8ROWS    coeffq+16*32, 64               ;in0~in7
   4127    LOAD_8ROWS    rsp+gprsize+16*3, 16
   4128    mova    [rsp+gprsize+16*0], m7
   4129    lea                   tx2q, [o(.pass1_end3)]
   4130    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4131 
   4132 .pass1_end3:
        ; m0 (in0) and m4 (in4) stay in registers for the paths below.
   4133    mova   [rsp+gprsize+16*11], m2                        ;in2
   4134    mova   [rsp+gprsize+16*12], m6                        ;in6
   4135    mova   [rsp+gprsize+16*19], m1                        ;in1
   4136    mova   [rsp+gprsize+16*26], m3                        ;in3
   4137    mova   [rsp+gprsize+16*23], m5                        ;in5
   4138    mova   [rsp+gprsize+16*22], m7                        ;in7
   4139 
   4140    cmp                   eobd, 150
   4141    jg .full                                              ; eob > 150: process the remaining two groups as well
   4142 
        ; ---- fast path (eob <= 150): inputs 16..31 are taken as zero ----
        ; m0 still holds in0.
   4143    mova                    m1, m4                        ;in4
   4144    mova                    m2, [coeffq+16*1 ]            ;in8
   4145    mova                    m3, [coeffq+16*5 ]            ;in12
   4146    pxor                    m4, m4
   4147    REPX          {mova x, m4}, m5, m6, m7
   4148    call  m(idct_8x8_internal_8bpc).main
   4149    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4150    mova                    m0, [rsp+gprsize+16*11]       ;in2
   4151    mova                    m1, [rsp+gprsize+16*12]       ;in6
   4152    mova                    m2, [rsp+gprsize+16*13]       ;in10
   4153    mova                    m3, [rsp+gprsize+16*14]       ;in14
   4154    pxor                    m4, m4
   4155    REPX          {mova x, m4}, m5, m6, m7
   4156    call m(idct_16x8_internal_8bpc).main
   4157    mova                    m7, [rsp+gprsize+16*0]        ; reload the value the helper left in slot 0
   4158    SAVE_8ROWS   rsp+gprsize+16*11, 16
   4159 
   4160    call m(idct_8x32_internal_8bpc).main_fast
   4161    jmp  .pass2
   4162 
   4163 .full:
        ; ---- full path (eob > 150): park in0/in4, then run the two ----
        ; ---- remaining pass-1 groups                                ----
   4164    mova        [coeffq+16*0 ], m0                        ;in0
   4165    mova        [coeffq+16*4 ], m4                        ;in4
   4166 
   4167    LOAD_8ROWS     coeffq+16*2, 128, 1
   4168    call  m(idct_8x8_internal_8bpc).main
   4169    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4170    LOAD_8ROWS     coeffq+16*6, 128, 1
   4171    call m(idct_16x8_internal_8bpc).main
   4172    lea                   tx2q, [o(.pass1_end4)]
   4173    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4174 
   4175 .pass1_end4:
   4176    SAVE_8ROWS    coeffq+16*34, 64               ;in16~in23
   4177    LOAD_8ROWS    rsp+gprsize+16*3, 16
   4178    mova    [rsp+gprsize+16*0], m7
   4179    lea                   tx2q, [o(.pass1_end5)]
   4180    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4181 
   4182 .pass1_end5:
   4183    mova        [coeffq+16*2 ], m0                        ;in16
   4184    mova        [coeffq+16*6 ], m4                        ;in20
   4185    mova   [rsp+gprsize+16*15], m2                        ;in18
   4186    mova   [rsp+gprsize+16*16], m6                        ;in22
   4187    mova   [rsp+gprsize+16*33], m1                        ;in17
   4188    mova   [rsp+gprsize+16*28], m3                        ;in19
   4189    mova   [rsp+gprsize+16*29], m5                        ;in21
   4190    mova   [rsp+gprsize+16*32], m7                        ;in23
   4191 
   4192    LOAD_8ROWS     coeffq+16*3, 128, 1
   4193    call  m(idct_8x8_internal_8bpc).main
   4194    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4195    LOAD_8ROWS     coeffq+16*7, 128, 1
   4196    call m(idct_16x8_internal_8bpc).main
   4197    lea                   tx2q, [o(.pass1_end6)]
   4198    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4199 
   4200 .pass1_end6:
   4201    SAVE_8ROWS    coeffq+16*35, 64                        ;in24~in31
   4202    LOAD_8ROWS    rsp+gprsize+16*3, 16
   4203    mova    [rsp+gprsize+16*0], m7
   4204    lea                   tx2q, [o(.pass1_end7)]
   4205    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4206 
   4207 .pass1_end7:
   4208    mova   [rsp+gprsize+16*17], m2                        ;in26
   4209    mova   [rsp+gprsize+16*18], m6                        ;in30
   4210    mova   [rsp+gprsize+16*31], m1                        ;in25
   4211    mova   [rsp+gprsize+16*30], m3                        ;in27
   4212    mova   [rsp+gprsize+16*27], m5                        ;in29
   4213    mova   [rsp+gprsize+16*34], m7                        ;in31
   4214 
        ; Gather in0,4,...,28 into m0-m7; in24/in28 are moved out of
        ; m0/m4 before those registers are overwritten.
   4215    mova                    m6, m0                        ;in24
   4216    mova                    m7, m4                        ;in28
   4217    mova                    m0, [coeffq+16*0 ]            ;in0
   4218    mova                    m1, [coeffq+16*4 ]            ;in4
   4219    mova                    m2, [coeffq+16*1 ]            ;in8
   4220    mova                    m3, [coeffq+16*5 ]            ;in12
   4221    mova                    m4, [coeffq+16*2 ]            ;in16
   4222    mova                    m5, [coeffq+16*6 ]            ;in20
   4223    call  m(idct_8x8_internal_8bpc).main
   4224    SAVE_7ROWS   rsp+gprsize+16*3 , 16
   4225    LOAD_8ROWS   rsp+gprsize+16*11, 16                    ; slots 11-18 hold in2,6,...,30 (stored above)
   4226    call m(idct_16x8_internal_8bpc).main
   4227    mova                    m7, [rsp+gprsize+16*0]
   4228    SAVE_8ROWS   rsp+gprsize+16*11, 16
   4229 
   4230    call m(idct_8x32_internal_8bpc).main
   4231 
   4232 .pass2:
        ; Save eob and the second destination (dst+8), then let the .end
        ; stage of idct_8x32 run with r3 as the continuation. eob is
        ; stored first because r3 (its register) is reused as scratch.
   4233    mov  [rsp+gprsize*1+16*35], eobd
   4234    lea                     r3, [dstq+8]
   4235    mov  [rsp+gprsize*2+16*35], r3
   4236    lea                     r3, [o(.end)]
   4237    jmp  m(idct_8x32_internal_8bpc).end
   4238 
   4239 .end:
        ; ---- second destination (dst+8) ----
        ; Restore dst/eob and step coeffq by 16*32, i.e. onto the data that
        ; pass 1 stored at coeffq+16*32 and above.
   4240    mov                   dstq, [rsp+gprsize*2+16*35]
   4241    mov                   eobd, [rsp+gprsize*1+16*35]
   4242    add                 coeffq, 16*32
   4243 
   4244    mova                    m0, [coeffq+16*4 ]            ;in1
   4245    mova                    m1, [coeffq+16*12]            ;in3
   4246    mova                    m2, [coeffq+16*20]            ;in5
   4247    mova                    m3, [coeffq+16*28]            ;in7
   4248    mova                    m4, [coeffq+16*5 ]            ;in9
   4249    mova                    m5, [coeffq+16*13]            ;in11
   4250    mova                    m6, [coeffq+16*21]            ;in13
   4251    mova                    m7, [coeffq+16*29]            ;in15
   4252 
        ; Same stack layout for the odd inputs as in the first half.
   4253    mova   [rsp+gprsize+16*19], m0                        ;in1
   4254    mova   [rsp+gprsize+16*26], m1                        ;in3
   4255    mova   [rsp+gprsize+16*23], m2                        ;in5
   4256    mova   [rsp+gprsize+16*22], m3                        ;in7
   4257    mova   [rsp+gprsize+16*21], m4                        ;in9
   4258    mova   [rsp+gprsize+16*24], m5                        ;in11
   4259    mova   [rsp+gprsize+16*25], m6                        ;in13
   4260    mova   [rsp+gprsize+16*20], m7                        ;in15
   4261 
   4262    mova                    m0, [coeffq+16*0 ]            ;in0
   4263    mova                    m1, [coeffq+16*16]            ;in4
   4264    mova                    m2, [coeffq+16*1 ]            ;in8
   4265    mova                    m3, [coeffq+16*17]            ;in12
   4266 
   4267    cmp                   eobd, 150
   4268    jg .full1                                             ; same eob threshold as in pass 1
   4269 
        ; Fast path: upper inputs zeroed, as in the first half.
   4270    pxor                    m4, m4
   4271    REPX          {mova x, m4}, m5, m6, m7
   4272    call  m(idct_8x8_internal_8bpc).main
   4273    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4274 
   4275    mova                    m0, [coeffq+16*8 ]            ;in2
   4276    mova                    m1, [coeffq+16*24]            ;in6
   4277    mova                    m2, [coeffq+16*9 ]            ;in10
   4278    mova                    m3, [coeffq+16*25]            ;in14
   4279    pxor                    m4, m4
   4280    REPX          {mova x, m4}, m5, m6, m7
   4281    call m(idct_16x8_internal_8bpc).main
   4282    mova                    m7, [rsp+gprsize+16*0]
   4283    SAVE_8ROWS   rsp+gprsize+16*11, 16
   4284 
   4285    call m(idct_8x32_internal_8bpc).main_fast
   4286    jmp m(idct_8x32_internal_8bpc).pass2                  ; tail jump; this routine does not regain control
   4287 
   4288 .full1:
   4289    mova                    m4, [coeffq+16*2 ]            ;in16
   4290    mova                    m5, [coeffq+16*18]            ;in20
   4291    mova                    m6, [coeffq+16*3 ]            ;in24
   4292    mova                    m7, [coeffq+16*19]            ;in28
   4293    call  m(idct_8x8_internal_8bpc).main
   4294    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4295 
   4296    mova                    m0, [coeffq+16*8 ]            ;in2
   4297    mova                    m1, [coeffq+16*24]            ;in6
   4298    mova                    m2, [coeffq+16*9 ]            ;in10
   4299    mova                    m3, [coeffq+16*25]            ;in14
   4300    mova                    m4, [coeffq+16*10]            ;in18
   4301    mova                    m5, [coeffq+16*26]            ;in22
   4302    mova                    m6, [coeffq+16*11]            ;in26
   4303    mova                    m7, [coeffq+16*27]            ;in30
   4304    call m(idct_16x8_internal_8bpc).main
   4305    mova                    m7, [rsp+gprsize+16*0]
   4306    SAVE_8ROWS   rsp+gprsize+16*11, 16
   4307 
   4308    mova                    m0, [coeffq+16*6 ]            ;in17
   4309    mova                    m1, [coeffq+16*14]            ;in19
   4310    mova                    m2, [coeffq+16*22]            ;in21
   4311    mova                    m3, [coeffq+16*30]            ;in23
   4312    mova                    m4, [coeffq+16*7 ]            ;in25
   4313    mova                    m5, [coeffq+16*15]            ;in27
   4314    mova                    m6, [coeffq+16*23]            ;in29
   4315    mova                    m7, [coeffq+16*31]            ;in31
   4316 
   4317    mova   [rsp+gprsize+16*33], m0                        ;in17
   4318    mova   [rsp+gprsize+16*28], m1                        ;in19
   4319    mova   [rsp+gprsize+16*29], m2                        ;in21
   4320    mova   [rsp+gprsize+16*32], m3                        ;in23
   4321    mova   [rsp+gprsize+16*31], m4                        ;in25
   4322    mova   [rsp+gprsize+16*30], m5                        ;in27
   4323    mova   [rsp+gprsize+16*27], m6                        ;in29
   4324    mova   [rsp+gprsize+16*34], m7                        ;in31
   4325 
   4326    call m(idct_8x32_internal_8bpc).main
   4327    jmp m(idct_8x32_internal_8bpc).pass2                  ; tail jump; this routine does not regain control
   4328 
   4329 
   4330 cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
        ; Entry point for the 8bpc DCT_DCT 32x16 inverse transform.
        ; eob == 0 takes the DC-only shortcut. Otherwise
        ; idct_32x16_internal_8bpc runs pass 1, and pass 2 is performed
        ; four times through idct_8x16_internal_8bpc.pass2: once on the
        ; state left by the internal routine, then once per row group
        ; reloaded from stack slots 11, 19 and 27.
        ; Slots are addressed here as rsp+16*n; the same slots appear as
        ; rsp+gprsize+16*n inside the called routine.
   4331 %if ARCH_X86_32
   4332    LEA                     r5, $$                        ; presumably the base used by the o() addressing macro (defined outside this view)
   4333 %endif
   4334    test                  eobd, eobd
   4335    jz .dconly
   4336 
   4337    call m(idct_32x16_internal_8bpc)
   4338    call m(idct_8x16_internal_8bpc).pass2
   4339 
        ; Each of the next three stanzas steps coeffq by 16*16 and sets dst
        ; to r3+8. NOTE(review): r3 is presumably left by the pass2 helper
        ; as the dst it started from - confirm in idct_8x16_internal_8bpc.
        ; tx2q is pointed at idct_32x16_internal's .end, which is a bare
        ; `ret`, so the called pass1_end helper can return here through it.
   4340    add                 coeffq, 16*16
   4341    lea                   dstq, [r3+8]
   4342    LOAD_8ROWS       rsp+16*11, 16
   4343    mova            [rsp+16*0], m7                        ; spill m7 to slot 0 before pass1_end, as the internal routine does
   4344    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
   4345    call  m(idct_8x8_internal_8bpc).pass1_end
   4346    call m(idct_8x16_internal_8bpc).pass2
   4347 
   4348    add                 coeffq, 16*16
   4349    lea                   dstq, [r3+8]
   4350    LOAD_8ROWS       rsp+16*19, 16
   4351    mova            [rsp+16*0], m7
   4352    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
   4353    call  m(idct_8x8_internal_8bpc).pass1_end
   4354    call m(idct_8x16_internal_8bpc).pass2
   4355 
   4356    add                 coeffq, 16*16
   4357    lea                   dstq, [r3+8]
   4358    LOAD_8ROWS       rsp+16*27, 16
   4359    mova            [rsp+16*0], m7
   4360    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]
   4361    call  m(idct_8x8_internal_8bpc).pass1_end
   4362    call m(idct_8x16_internal_8bpc).pass2
   4363    RET
   4364 
   4365 .dconly:
        ; DC-only path: scale the DC value, clear it in memory, and reuse
        ; the body of the 32x8 transform.
   4366    movd                    m1, [o(pw_2896x8)]
   4367    pmulhrsw                m0, m1, [coeffq]              ; m0 = rounded high product of the coefficients at [coeffq] and pw_2896x8
   4368    movd                    m2, [o(pw_16384)]             ; not used in this block; presumably consumed by the shared body
   4369    mov               [coeffq], eobd                      ; eobd is 0 on this path -> zeroes the first dword of the coefficient buffer
   4370    pmulhrsw                m0, m1                        ; second multiply by the same constant
   4371    mov                    r3d, 16                        ; NOTE(review): overwrites eob's register; 16 is presumably the row count expected by the shared body - confirm
   4372    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
   4373    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
   4374 
   4375 
   4376 cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Pass-1 worker for the 32x16 DCT_DCT transform; entered by `call`
        ; and using the caller's stack frame (rsp+gprsize+16*n).
        ;
        ; The .pass1 body is executed twice, with r3 selecting what happens
        ; after it:
        ;   1st run: coeffq is biased by +16 and r3 = .pass1_end1, so the
        ;            four row groups are written back to the coefficient
        ;            buffer by the .pass1_end1-.pass1_end4 chain.
        ;   2nd run: the bias is removed and r3 = .end, so control returns
        ;            to the caller right after the first pass1_end trip;
        ;            the other three groups remain in stack slots 11-34.
        ; r3 is eob's register; eob is not read anywhere in this routine.
   4377    add                 coeffq, 16
   4378    lea                     r3, [o(.pass1_end1)]
   4379 .pass1:
   4380    LOAD_8ROWS     coeffq+16*0, 128, 1
   4381    call  m(idct_8x8_internal_8bpc).main
   4382    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4383 
   4384    LOAD_8ROWS     coeffq+16*4, 128, 1
   4385    call m(idct_16x8_internal_8bpc).main
   4386    mova                    m7, [rsp+gprsize+16*0]        ; reload the value the helper left in slot 0
   4387    SAVE_8ROWS   rsp+gprsize+16*11, 16
   4388 
        ; Odd inputs in1..in15 go to their stack slots.
   4389    LOAD_8ROWS     coeffq+16*2, 64, 1
   4390    mova   [rsp+gprsize+16*19], m0                        ;in1
   4391    mova   [rsp+gprsize+16*26], m1                        ;in3
   4392    mova   [rsp+gprsize+16*23], m2                        ;in5
   4393    mova   [rsp+gprsize+16*22], m3                        ;in7
   4394    mova   [rsp+gprsize+16*21], m4                        ;in9
   4395    mova   [rsp+gprsize+16*24], m5                        ;in11
   4396    mova   [rsp+gprsize+16*25], m6                        ;in13
   4397    mova   [rsp+gprsize+16*20], m7                        ;in15
   4398 
        ; Odd inputs in17..in31.
   4399    LOAD_8ROWS    coeffq+16*34, 64, 1
   4400    mova   [rsp+gprsize+16*33], m0                        ;in17
   4401    mova   [rsp+gprsize+16*28], m1                        ;in19
   4402    mova   [rsp+gprsize+16*29], m2                        ;in21
   4403    mova   [rsp+gprsize+16*32], m3                        ;in23
   4404    mova   [rsp+gprsize+16*31], m4                        ;in25
   4405    mova   [rsp+gprsize+16*30], m5                        ;in27
   4406    mova   [rsp+gprsize+16*27], m6                        ;in29
   4407    mova   [rsp+gprsize+16*34], m7                        ;in31
   4408    call m(idct_8x32_internal_8bpc).main
   4409 
   4410 .pass1_end:
   4411    mova   [rsp+gprsize+16*0 ], m7
   4412    mov                   tx2q, r3                        ; continuation chosen by the current run (.pass1_end1 or .end)
   4413    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4414 
   4415 .pass1_end1:
        ; First run only: write this group back at coeffq+16*0 (stride 32),
        ; then push the group held in slots 11-18 through the helper.
   4416    SAVE_8ROWS     coeffq+16*0, 32
   4417    LOAD_8ROWS   rsp+gprsize+16*11, 16
   4418    mova   [rsp+gprsize+16*0 ], m7
   4419    lea                   tx2q, [o(.pass1_end2)]
   4420    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4421 
   4422 .pass1_end2:
   4423    SAVE_8ROWS    coeffq+16*16, 32
   4424    LOAD_8ROWS   rsp+gprsize+16*19, 16
   4425    mova   [rsp+gprsize+16*0 ], m7
   4426    lea                   tx2q, [o(.pass1_end3)]
   4427    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4428 
   4429 .pass1_end3:
   4430    SAVE_8ROWS    coeffq+16*32, 32
   4431    LOAD_8ROWS   rsp+gprsize+16*27, 16
   4432    mova   [rsp+gprsize+16*0 ], m7
   4433    lea                   tx2q, [o(.pass1_end4)]
   4434    jmp   m(idct_8x8_internal_8bpc).pass1_end
   4435 
   4436 .pass1_end4:
   4437    SAVE_8ROWS    coeffq+16*48, 32
   4438 
        ; Undo the +16 bias and start the second run, which will leave
        ; through .end.
   4439    sub                 coeffq, 16
   4440    lea                     r3, [o(.end)]
   4441    jmp .pass1
   4442 
   4443 .end:
        ; Bare return. inv_txfm_add_dct_dct_32x16_8bpc and the identity
        ; transforms in this file also load this address into tx2q.
   4444    ret
   4445 
   4446 
   4447 cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
        ; 8bpc IDENTITY_IDENTITY 16x32 inverse transform.
        ; The work is split into two runs of the same loop: the first
        ; starts at dst/coeff, the second at dst+8 / coeff+64*8. Each run
        ; performs r3d loop iterations, where (for non-negative eob)
        ;   r3d = 4 - (eob < 43) - (eob < 150) - (eob < 278)
        ; i.e. 1, 2, 3 or 4. Coefficients are zeroed as they are consumed.
        ; eob is copied to r4d because r3 (eob's own register) becomes the
        ; counter.
   4448    mov                    r4d, eobd
   4449    cmp                   eobd, 43                ;CF = (eob < 43)
   4450    sbb                    r3d, r3d               ;  r3d = -CF
   4451    cmp                    r4d, 150               ;CF = (eob < 150)
   4452    sbb                    r3d, 0                 ;  r3d -= CF
   4453    cmp                    r4d, 278               ;CF = (eob < 278)
   4454    sbb                    r3d, -4                ;  r3d += 4 - CF
   4455 
   4456 %if ARCH_X86_32
   4457    LEA                     r5, $$                ; presumably the base used by the o() addressing macro (defined outside this view)
   4458 %endif
        ; Stack bookkeeping: [rsp+16*3]           = dst for the second run
        ;                    [rsp+gprsize+16*3]   = iteration count for the second run
        ;                    [rsp+gprsize*2+16*3] = original coeff pointer
   4459    lea                     r4, [dstq+8]
   4460    mov             [rsp+16*3], r4
   4461    mov     [rsp+gprsize+16*3], r3d
   4462    mov   [rsp+gprsize*2+16*3], coeffq
   4463 
   4464 .loop:
   4465    LOAD_8ROWS          coeffq, 64, 1
   4466    mova            [rsp+16*1], m6
   4467    pxor                    m6, m6
   4468    REPX   {mova [coeffq+64*x], m6}, 0,  1,  2,  3,  4,  5,  6,  7
   4469    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]      ; that label is a bare `ret`
   4470    call  m(idct_8x8_internal_8bpc).pass1_end3
        ; IDTX16 needs scratch/constant registers (m2, m3, m4 here), so
        ; three rows are parked on the stack and processed afterwards.
        ; (IDTX16 itself is defined outside this chunk.)
   4471    mova            [rsp+16*0], m2
   4472    mova            [rsp+16*1], m3
   4473    mova            [rsp+16*2], m4
   4474    mova                    m3, [o(pw_1697x16)]
   4475    mova                    m4, [o(pw_16384)]
   4476    REPX   {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
   4477    mova                    m2, [o(pw_8192)]
   4478    REPX      {pmulhrsw x, m2}, m5, m6, m7, m0, m1
   4479    mova                    m2, [rsp+16*0]
   4480    mova            [rsp+16*0], m7
   4481    IDTX16                   2, 7, 3, 4
   4482    mova                    m7, [rsp+16*2]
   4483    mova            [rsp+16*2], m5
   4484    IDTX16                   7, 5, 3, 4
   4485    mova                    m5, [rsp+16*1]
   4486    mova            [rsp+16*1], m6
        ; The last parked row (now in m5) is processed with explicit
        ; instructions, result in m3; the pw_1697x16 constant in m3 is
        ; consumed in the process.
   4487    pmulhrsw                m3, m5
   4488    pmulhrsw                m3, m4
   4489    psrlw                   m4, 1 ; pw_8192
   4490    paddsw                  m3, m5
   4491    pmulhrsw                m2, m4
   4492    pmulhrsw                m3, m4
   4493    pmulhrsw                m4, m7                        ; m4 = m7 scaled by pw_8192 (the constant in m4 is consumed)
   4494    call  m(idct_8x8_internal_8bpc).end3
   4495    lea                   dstq, [dstq+strideq*2]
   4496    add                 coeffq, 16
   4497    dec                    r3d
   4498    jg .loop
        ; End of a run. Reload the state for the second run and zero the
        ; saved count so that, after the second run, r3d reloads as 0 and
        ; the routine returns.
   4499    mov                 coeffq, [rsp+gprsize*2+16*3]
   4500    add                 coeffq, 64*8
   4501    mov                    r3d, [rsp+gprsize+16*3]
   4502    xor                   dstq, dstq
   4503    mov     [rsp+gprsize+16*3], dstq                      ; stores a pointer-sized zero over the saved count
   4504    mov                   dstq, [rsp+16*3]
   4505    test                   r3d, r3d
   4506    jnz .loop
   4507    RET
   4508 
   4509 
   4510 cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
        ; 8bpc IDENTITY_IDENTITY 32x16 inverse transform.
        ; r3d is a bit schedule that is shifted right by 2 after every loop
        ; iteration (see .loop_end):
        ;   result == 0   -> done
        ;   bit 1 set     -> another iteration, continuing down from the current dst
        ;   bit 1 clear   -> switch to the next dst (saved dst, then +8)
        ; Selected schedule:
        ;   eob <  44        : 12    -> 2 iterations, 1 dst position
        ;   44 <= eob < 151  : 136   -> 4 iterations, 2 dst positions
        ;   eob >= 151       : 34952 -> 8 iterations, 4 dst positions
   4511    mov                    r4d, 12                ;1100b
   4512    mov                    r5d, 136               ;1000 1000b
   4513    cmp                   eobd, 44                ;SF clear when eob >= 44
   4514    cmovns                 r4d, r5d               ;  -> r4d = 136
   4515    cmp                   eobd, 151               ;SF set when eob < 151
   4516    mov                    r3d, 34952             ;1000 1000 1000 1000b
   4517    cmovs                  r3d, r4d               ;  -> keep the shorter schedule in r4d
   4518 
   4519 %if ARCH_X86_32
   4520    LEA                     r5, $$                ; presumably the base used by the o() addressing macro (defined outside this view); r5d was scratch above
   4521 %endif
   4522    lea                     r4, [dstq+8]
   4523    mov             [rsp+16*3], r4                ; dst of the next position
   4524 
   4525 .loop:
   4526    LOAD_8ROWS          coeffq, 32, 1
   4527    REPX         {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
   4528    mova            [rsp+16*1], m6
   4529    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)]      ; that label is a bare `ret`
   4530    call  m(idct_8x8_internal_8bpc).pass1_end3
        ; IDTX16 is given m5 and m6 as scratch/constant registers here, so
        ; those two rows are parked on the stack and processed afterwards.
        ; (IDTX16 itself is defined outside this chunk.)
   4531    mova            [rsp+16*1], m5
   4532    mova            [rsp+16*2], m6
   4533    mova                    m6, [o(pw_1697x16)]
   4534    REPX      {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
   4535    pmulhrsw                m7, [o(pw_2048)]
   4536    mova                    m5, [rsp+16*1]
   4537    mova            [rsp+16*0], m7
   4538    IDTX16                   5, 7, 6
   4539    mova                    m7, [rsp+16*2]
   4540    IDTX16                   7, 6, 6
   4541    mova                    m6, [o(pw_2048)]
   4542    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
   4543    mova            [rsp+16*2], m5
   4544    mova            [rsp+16*1], m7
   4545    call  m(idct_8x8_internal_8bpc).end3
   4546    lea                   dstq, [dstq+strideq*2]
   4547    pxor                    m7, m7
   4548    REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
   4549 
   4550 .loop_end:
   4551    add                 coeffq, 16
   4552    shr                    r3d, 2                         ; consume one schedule step
   4553    jz .ret                                               ; schedule exhausted
   4554    test                   r3d, 2
   4555    jnz .loop                                             ; bit 1 set: continue from the current dst
        ; Bit 1 clear: skip the rest of this 32*8-byte coefficient band and
        ; move to the saved dst, saving dst+8 for the switch after that.
   4556    mov                    r4d, r3d
   4557    and                    r4d, 1                         ; always 0 for the three schedules selected above
   4558    lea                 coeffq, [coeffq+r4*8+32*7]
   4559    mov                   dstq, [rsp+16*3]
   4560    lea                     r4, [dstq+8]
   4561    mov             [rsp+16*3], r4
   4562    jmp .loop
   4563 
   4564 .ret:
   4565    RET
   4566 
   4567 
   4568 cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
        ; Entry point for the 8bpc DCT_DCT 32x32 inverse transform.
        ; eob == 0 takes the DC-only shortcut (.dconly); any other eob runs
        ; both passes in idct_32x32_internal_8bpc.
   4569 %if ARCH_X86_32
   4570    LEA                     r5, $$                        ; presumably the base used by the o() addressing macro (defined outside this view)
   4571 %endif
   4572    test                  eobd, eobd
   4573    jz .dconly
   4574 
   4575    call m(idct_32x32_internal_8bpc)
   4576    RET
   4577 
   4578 .dconly:
        ; DC-only path: scale the DC value once (the 16x32 and 32x16 entry
        ; points in this file multiply twice), clear it in memory, and
        ; reuse the body of the 32x8 transform.
   4579    movd                    m1, [o(pw_2896x8)]
   4580    pmulhrsw                m0, m1, [coeffq]              ; m0 = rounded high product of the coefficients at [coeffq] and pw_2896x8
   4581    movd                    m2, [o(pw_8192)]              ; not used in this block; presumably consumed by the shared body
   4582    mov               [coeffq], eobd                      ; eobd is 0 on this path -> zeroes the first dword of the coefficient buffer
   4583    mov                    r3d, 32                        ; NOTE(review): overwrites eob's register; 32 is presumably the row count expected by the shared body - confirm
   4584    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
   4585    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
   4586 
   4587 
   4588 cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Worker for the 32x32 DCT_DCT transform; entered by `call` and
        ; using the caller's stack frame (rsp+gprsize+16*n).
        ;
        ; eob - 136 is kept in a stack slot and its sign selects the path
        ; in both passes: negative -> fast (upper inputs treated as zero),
        ; otherwise full. Pass 1 iterates r3d times (2 when eob < 136, else
        ; 4), stepping coeffq by 16; pass 2 always iterates 4 times,
        ; stepping coeffq by 16*32 and dst by 8.
        ;
        ; Stack slots above the 16*35 data area:
        ;   gprsize*1 : eob - 136
        ;   gprsize*2 : original coeff pointer (pass 1) / next dst (pass 2)
        ;   gprsize*3 : pass-2 loop counter
   4589    mov                    r4d, 2
   4590    sub                   eobd, 136
   4591    mov  [rsp+gprsize*1+16*35], eobd                      ; mov leaves the flags from `sub` intact for the cmovs below
   4592    mov                    r3d, 4
   4593    cmovs                  r3d, r4d                       ; eob < 136 -> 2 pass-1 iterations
   4594 
   4595 %if ARCH_X86_32
   4596    LEA                     r5, $$                        ; presumably the base used by the o() addressing macro (defined outside this view)
   4597 %endif
   4598 
   4599    mov  [rsp+gprsize*2+16*35], coeffq
   4600 
   4601 .pass1_loop:
        ; Odd inputs in1..in15 go to their stack slots.
   4602    LOAD_8ROWS     coeffq+64*1, 64*2
   4603    mova   [rsp+gprsize+16*19], m0                        ;in1
   4604    mova   [rsp+gprsize+16*26], m1                        ;in3
   4605    mova   [rsp+gprsize+16*23], m2                        ;in5
   4606    mova   [rsp+gprsize+16*22], m3                        ;in7
   4607    mova   [rsp+gprsize+16*21], m4                        ;in9
   4608    mova   [rsp+gprsize+16*24], m5                        ;in11
   4609    mova   [rsp+gprsize+16*25], m6                        ;in13
   4610    mova   [rsp+gprsize+16*20], m7                        ;in15
   4611 
   4612    mov                   tx2d, [rsp+gprsize*1+16*35]     ; eob - 136 (r3, eob's register, is the loop counter here)
   4613    test                  tx2d, tx2d
   4614    jl .fast
   4615 
   4616 .full:
   4617    LOAD_8ROWS     coeffq+64*0, 64*4
   4618    call  m(idct_8x8_internal_8bpc).main
   4619    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4620    LOAD_8ROWS     coeffq+64*2, 64*4
   4621    call m(idct_16x8_internal_8bpc).main
   4622    mova                    m7, [rsp+gprsize+16*0]        ; reload the value the helper left in slot 0
   4623    SAVE_8ROWS   rsp+gprsize+16*11, 16
   4624 
   4625    LOAD_8ROWS    coeffq+64*17, 64*2
   4626    mova   [rsp+gprsize+16*33], m0                        ;in17
   4627    mova   [rsp+gprsize+16*28], m1                        ;in19
   4628    mova   [rsp+gprsize+16*29], m2                        ;in21
   4629    mova   [rsp+gprsize+16*32], m3                        ;in23
   4630    mova   [rsp+gprsize+16*31], m4                        ;in25
   4631    mova   [rsp+gprsize+16*30], m5                        ;in27
   4632    mova   [rsp+gprsize+16*27], m6                        ;in29
   4633    mova   [rsp+gprsize+16*34], m7                        ;in31
   4634 
   4635    call m(idct_8x32_internal_8bpc).main
   4636    jmp .pass1_end
   4637 
   4638 .fast:
        ; Only four inputs per helper call are loaded; m4-m7 are zeroed.
   4639    mova                    m0, [coeffq+256*0]
   4640    mova                    m1, [coeffq+256*1]
   4641    mova                    m2, [coeffq+256*2]
   4642    mova                    m3, [coeffq+256*3]
   4643    pxor                    m4, m4
   4644    REPX          {mova x, m4}, m5, m6, m7
   4645    call  m(idct_8x8_internal_8bpc).main
   4646 
   4647    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4648    mova                    m0, [coeffq+128*1]
   4649    mova                    m1, [coeffq+128*3]
   4650    mova                    m2, [coeffq+128*5]
   4651    mova                    m3, [coeffq+128*7]
   4652    pxor                    m4, m4
   4653    REPX          {mova x, m4}, m5, m6, m7
   4654    call m(idct_16x8_internal_8bpc).main
   4655    mova                    m7, [rsp+gprsize+16*0]
   4656    SAVE_8ROWS   rsp+gprsize+16*11, 16
   4657 
   4658    call m(idct_8x32_internal_8bpc).main_fast
   4659 
   4660 .pass1_end:
        ; Four trips through the idct_8x8 helper, one per row group, each
        ; entering at .pass1_end1 with m7 = pw_8192 preloaded (the original
        ; m7 is spilled to slot 0 first). NOTE(review): entering at
        ; .pass1_end1 presumably bypasses the helper's own constant load -
        ; confirm in idct_8x8_internal_8bpc. Results are written back to
        ; the coefficient buffer at coeffq+64*{0,8,16,24}, stride 64.
   4661    mova    [rsp+gprsize+16*0], m7
   4662    mova                    m7, [o(pw_8192)]
   4663    lea                   tx2q, [o(.pass1_end1)]
   4664    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   4665 
   4666 .pass1_end1:
   4667    SAVE_8ROWS     coeffq+64*0, 64
   4668    LOAD_8ROWS   rsp+gprsize+16*11, 16
   4669    mova    [rsp+gprsize+16*0], m7
   4670    mova                    m7, [o(pw_8192)]
   4671    lea                   tx2q, [o(.pass1_end2)]
   4672    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   4673 
   4674 .pass1_end2:
   4675    SAVE_8ROWS     coeffq+64*8, 64
   4676    LOAD_8ROWS   rsp+gprsize+16*19, 16
   4677    mova    [rsp+gprsize+16*0], m7
   4678    mova                    m7, [o(pw_8192)]
   4679    lea                   tx2q, [o(.pass1_end3)]
   4680    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   4681 
   4682 .pass1_end3:
   4683    SAVE_8ROWS    coeffq+64*16, 64
   4684    LOAD_8ROWS   rsp+gprsize+16*27, 16
   4685    mova    [rsp+gprsize+16*0], m7
   4686    mova                    m7, [o(pw_8192)]
   4687    lea                   tx2q, [o(.pass1_end4)]
   4688    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   4689 
   4690 .pass1_end4:
   4691    SAVE_8ROWS    coeffq+64*24, 64
   4692 
   4693    add                 coeffq, 16
   4694    dec                    r3d
   4695    jg .pass1_loop
   4696 
   4697 
   4698 .pass2:
        ; Rewind coeffq to the pointer saved on entry; pass 2 always runs
        ; four iterations. tx2q holds the continuation used by the
        ; `jmp tx2q` at the end of .full1 / .fast1.
   4699    mov                 coeffq, [rsp+gprsize*2+16*35]
   4700    mov                    r3d, 4
   4701    lea                   tx2q, [o(.pass2_end)]
   4702 
   4703 .pass2_loop:
   4704    mov  [rsp+gprsize*3+16*35], r3d                       ; save the counter; r3 is reused as scratch/continuation below
   4705    lea                     r3, [dstq+8]
   4706    mov  [rsp+gprsize*2+16*35], r3                        ; dst for the next iteration (reuses the coeff-pointer slot)
   4707 
   4708    mova                    m0, [coeffq+16*4 ]
   4709    mova                    m1, [coeffq+16*12]
   4710    mova                    m2, [coeffq+16*20]
   4711    mova                    m3, [coeffq+16*28]
   4712    mova                    m4, [coeffq+16*5 ]
   4713    mova                    m5, [coeffq+16*13]
   4714    mova                    m6, [coeffq+16*21]
   4715    mova                    m7, [coeffq+16*29]
   4716    mova   [rsp+gprsize+16*19], m0                        ;in1
   4717    mova   [rsp+gprsize+16*26], m1                        ;in3
   4718    mova   [rsp+gprsize+16*23], m2                        ;in5
   4719    mova   [rsp+gprsize+16*22], m3                        ;in7
   4720    mova   [rsp+gprsize+16*21], m4                        ;in9
   4721    mova   [rsp+gprsize+16*24], m5                        ;in11
   4722    mova   [rsp+gprsize+16*25], m6                        ;in13
   4723    mova   [rsp+gprsize+16*20], m7                        ;in15
   4724 
   4725    mov                   eobd, [rsp+gprsize*1+16*35]     ; eob - 136, as stored on entry
   4726    test                  eobd, eobd
   4727    jl .fast1
   4728 
   4729 .full1:
   4730    mova                    m0, [coeffq+16*0 ]
   4731    mova                    m1, [coeffq+16*16]
   4732    mova                    m2, [coeffq+16*1 ]
   4733    mova                    m3, [coeffq+16*17]
   4734    mova                    m4, [coeffq+16*2 ]
   4735    mova                    m5, [coeffq+16*18]
   4736    mova                    m6, [coeffq+16*3 ]
   4737    mova                    m7, [coeffq+16*19]
   4738    call  m(idct_8x8_internal_8bpc).main
   4739    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4740 
   4741    mova                    m0, [coeffq+16*8 ]
   4742    mova                    m1, [coeffq+16*24]
   4743    mova                    m2, [coeffq+16*9 ]
   4744    mova                    m3, [coeffq+16*25]
   4745    mova                    m4, [coeffq+16*10]
   4746    mova                    m5, [coeffq+16*26]
   4747    mova                    m6, [coeffq+16*11]
   4748    mova                    m7, [coeffq+16*27]
   4749    call m(idct_16x8_internal_8bpc).main
   4750    mova                    m7, [rsp+gprsize+16*0]
   4751    SAVE_8ROWS   rsp+gprsize+16*11, 16
   4752 
   4753    mova                    m0, [coeffq+16*6 ]
   4754    mova                    m1, [coeffq+16*14]
   4755    mova                    m2, [coeffq+16*22]
   4756    mova                    m3, [coeffq+16*30]
   4757    mova                    m4, [coeffq+16*7 ]
   4758    mova                    m5, [coeffq+16*15]
   4759    mova                    m6, [coeffq+16*23]
   4760    mova                    m7, [coeffq+16*31]
   4761    mova   [rsp+gprsize+16*33], m0                        ;in17
   4762    mova   [rsp+gprsize+16*28], m1                        ;in19
   4763    mova   [rsp+gprsize+16*29], m2                        ;in21
   4764    mova   [rsp+gprsize+16*32], m3                        ;in23
   4765    mova   [rsp+gprsize+16*31], m4                        ;in25
   4766    mova   [rsp+gprsize+16*30], m5                        ;in27
   4767    mova   [rsp+gprsize+16*27], m6                        ;in29
   4768    mova   [rsp+gprsize+16*34], m7                        ;in31
   4769 
   4770    call m(idct_8x32_internal_8bpc).main
   4771    jmp                   tx2q                            ; -> .pass2_end when reached via .pass2 in this routine
   4772 
   4773 .fast1:
        ; Only four inputs per helper call are loaded; m4-m7 are zeroed.
   4774    mova                    m0, [coeffq+16*0 ]
   4775    mova                    m1, [coeffq+16*16]
   4776    mova                    m2, [coeffq+16*1 ]
   4777    mova                    m3, [coeffq+16*17]
   4778    pxor                    m4, m4
   4779    REPX          {mova x, m4}, m5, m6, m7
   4780    call  m(idct_8x8_internal_8bpc).main
   4781    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4782 
   4783    mova                    m0, [coeffq+16*8 ]
   4784    mova                    m1, [coeffq+16*24]
   4785    mova                    m2, [coeffq+16*9 ]
   4786    mova                    m3, [coeffq+16*25]
   4787    pxor                    m4, m4
   4788    REPX          {mova x, m4}, m5, m6, m7
   4789    call m(idct_16x8_internal_8bpc).main
   4790    mova                    m7, [rsp+gprsize+16*0]
   4791    SAVE_8ROWS   rsp+gprsize+16*11, 16
   4792 
   4793    call m(idct_8x32_internal_8bpc).main_fast
   4794    jmp                   tx2q                            ; -> .pass2_end when reached via .pass2 in this routine
   4795 
   4796 .pass2_end:
        ; Let the .end stage of idct_8x32 run, with r3 as the continuation.
   4797    lea                     r3, [o(.pass2_end1)]
   4798    jmp  m(idct_8x32_internal_8bpc).end
   4799 
   4800 .pass2_end1:
        ; tx2q is re-established here (presumably clobbered by the helper),
        ; then coeff/dst/counter are stepped for the next iteration.
   4801    lea                   tx2q, [o(.pass2_end)]
   4802    add                 coeffq, 16*32
   4803    mov                   dstq, [rsp+gprsize*2+16*35]
   4804    mov                    r3d, [rsp+gprsize*3+16*35]
   4805    dec                    r3d
   4806    jg .pass2_loop
   4807 
   4808    ret
   4809 
   4810 
   ; inv_txfm_add_identity_identity_32x32_8bpc(dst, stride, coeff, eob)
   ;
   ; Walks the coefficient buffer in tiles of 8 rows x 16 bytes (rows are 64
   ; bytes apart). Each tile is loaded, handed to the idct_8x8 helpers
   ; .pass1_end3 and .end3 with a pw_8192 scaling in between, and is then
   ; zeroed in coeff. Both helpers are defined outside this block; presumably
   ; they transpose the tile and add it to dst -- TODO confirm.
   ;
   ; The tile grid is N x N, with N = 2 when eob < 136 and N = 4 otherwise.
   ;
   ; Bookkeeping slots (gprsize apart, based at rsp+16*3):
   ;   slot 0: dst for the next outer iteration (current column base + 8)
   ;   slot 1: N, reload value for the inner counter
   ;   slot 2: outer iterations remaining
   ;   slot 3: coeff base of the current outer iteration
   ; rsp+16*0 .. rsp+16*2 serve as xmm scratch.
   4811 cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
   4812    mov                    r4d, 2
   4813    cmp                   eobd, 136
   4814    mov                    r3d, 4                         ; eob is argument 3 (r3): dead from here; mov keeps the cmp flags
   4815    cmovs                  r3d, r4d                       ; r3d = (eob < 136) ? 2 : 4
   4816 
   4817 %if ARCH_X86_32
   4818    LEA                     r5, $$
   4819 %endif
   4820 
   4821    lea                     r4, [dstq+8]
   4822    mov   [rsp+gprsize*0+16*3], r4                        ; slot 0: dst of the next outer iteration
   4823    mov   [rsp+gprsize*1+16*3], r3d                       ; slot 1: inner count reload value
   4824    mov   [rsp+gprsize*2+16*3], r3d                       ; slot 2: outer count
   4825    mov   [rsp+gprsize*3+16*3], coeffq                    ; slot 3: coeff base
   4826 
   4827 .loop:
   4828    LOAD_8ROWS          coeffq, 64
   4829    mova            [rsp+16*1], m6
   4830    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).end)] ; NOTE(review): presumably the continuation used by the helpers below -- confirm
   4831    call  m(idct_8x8_internal_8bpc).pass1_end3
   4832    pmulhrsw                m7, [o(pw_8192)]
   4833    mova            [rsp+16*0], m7                        ; park scaled m7 so the register can hold the constant
   4834    mova                    m7, [o(pw_8192)]
   4835    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
   4836    mova            [rsp+16*1], m6
   4837    mova            [rsp+16*2], m5
   4838    call  m(idct_8x8_internal_8bpc).end3
   4839    lea                   dstq, [dstq+strideq*2]
   4840 
   4841    pxor                    m7, m7
   4842    REPX   {mova [coeffq+64*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7 ; clear the 8 rows of the tile just consumed
   4843 
   4844    add                 coeffq, 16                        ; next tile within the current outer iteration
   4845    dec                    r3d
   4846    jg .loop
   4847 
   4848    mov                    r4d, [rsp+gprsize*2+16*3]
   4849    dec                    r4d
   4850    jle .ret                                              ; no outer iterations left
   4851 
   4852    mov                   dstq, [rsp+gprsize*0+16*3]      ; dst saved for this outer iteration
   4853    mov                 coeffq, [rsp+gprsize*3+16*3]
   4854    mov   [rsp+gprsize*2+16*3], r4
   4855    lea                     r3, [dstq+8]
   4856    add                 coeffq, 64*8                      ; coeff base advances by 8 rows of 64 bytes
   4857    mov   [rsp+gprsize*0+16*3], r3
   4858    mov                    r3d, [rsp+gprsize*1+16*3]      ; reload the inner counter
   4859    mov   [rsp+gprsize*3+16*3], coeffq
   4860    jmp .loop
   4861 
   4862 .ret:
   4863    RET
   4864 
   4865 
   ; inv_txfm_add_dct_dct_16x64_8bpc(dst, stride, coeff, eob)
   ;
   ; Entry point for the 16x64 DCT/DCT inverse transform. It reserves 16*68
   ; bytes of stack (4th cglobal operand) and dispatches on eob:
   ;   eob == 0 -> .dconly, which sets up registers and tail-jumps into the
   ;               .dconly code of inv_txfm_add_dct_dct_16x4_8bpc
   ;   eob != 0 -> idct_16x64_internal_8bpc, then RET
   4866 cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
   4867 %if ARCH_X86_32
   4868    LEA                     r5, $$
   4869 %endif
   4870    test                  eobd, eobd
   4871    jz .dconly
   4872    call m(idct_16x64_internal_8bpc)
   4873 .end:
   4874    RET
   4875 
   4876 .dconly:
   4877    movd                    m1, [o(pw_2896x8)]
   4878    pmulhrsw                m0, m1, [coeffq]              ; m0 = first coeff dword scaled by pw_2896x8
   4879    movd                    m2, [o(pw_8192)]
   4880    mov               [coeffq], eobd                      ; eobd is 0 on this path, so this clears the coefficient just read
   4881    mov                    r2d, 32                        ; NOTE(review): presumably the iteration count for the shared code (coeff pointer is dead) -- confirm
   4882    lea                   tx2q, [o(.end)]                 ; NOTE(review): presumably where the shared code resumes, i.e. the RET at .end -- confirm
   4883    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
   4884 
   4885 
   ; idct_16x64_internal_8bpc
   ;
   ; Worker called from inv_txfm_add_dct_dct_16x64_8bpc. It is declared with
   ; no stack size of its own and addresses its scratch as rsp+gprsize+...,
   ; i.e. presumably the frame reserved by its caller, one return address
   ; deeper -- TODO confirm for any other callers.
   ;
   ; Bookkeeping slots (gprsize apart, based at rsp+16*67):
   ;   gprsize*1: eob - 151 (negative selects the reduced paths)
   ;   gprsize*2: coeff base during pass 1, then dst+8 for the next pass-2 iteration
   ;   gprsize*3: pass-2 iterations remaining
   ;
   ; Pass 1 runs 4 times (2 when eob < 151), advancing coeff by 16 bytes each
   ; time, and writes its results back into coeff. Pass 2 runs twice, with dst
   ; advanced by 8 and coeff by 16*32 bytes between runs. Each pass-2 run
   ; stages inputs in stack slots, calls the idct_8x8 / idct_16x8 / idct_8x32
   ; helpers and .main (or their reduced variants), then emits the rows via
   ; idct_8x32's .end2 and the local .write.
   4886 cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
   4887    mov                    r4d, 2
   4888    sub                   eobd, 151
   4889    mov  [rsp+gprsize*1+16*67], eobd                      ; keep eob-151 for the pass-2 fast/full decision
   4890    mov                    r3d, 4                         ; mov leaves the flags from the sub intact
   4891    cmovs                  r3d, r4d                       ; pass-1 count: 2 if eob < 151, else 4
   4892 
   4893 %if ARCH_X86_32
   4894    LEA                     r5, $$
   4895 %endif
   4896 
   4897    mov  [rsp+gprsize*2+16*67], coeffq                    ; remember coeff base; pass 1 advances coeffq
   4898 
   4899 .pass1_loop:
   4900    LOAD_8ROWS     coeffq+64*0, 64*2
   4901    call  m(idct_8x8_internal_8bpc).main
   4902    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4903    LOAD_8ROWS     coeffq+64*1, 64*2
   4904    call m(idct_16x8_internal_8bpc).main
   4905    mova                    m7, [o(pw_8192)]
   4906    lea                   tx2q, [o(.pass1_end)]           ; NOTE(review): pass1_end1 is expected to resume at tx2q -- confirm
   4907    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   4908 
   4909 .pass1_end:
   4910    SAVE_8ROWS     coeffq+64*8, 64
   4911    LOAD_8ROWS    rsp+gprsize+16*3, 16
   4912    mova    [rsp+gprsize+16*0], m7
   4913    mova                    m7, [o(pw_8192)]
   4914    lea                   tx2q, [o(.pass1_end1)]
   4915    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   4916 
   4917 .pass1_end1:
   4918    SAVE_8ROWS     coeffq+64*0, 64
   4919 
   4920    add                 coeffq, 16
   4921    dec                    r3d
   4922    jg .pass1_loop
   4923 
   4924    mov                 coeffq, [rsp+gprsize*2+16*67]     ; rewind coeff to its base for pass 2
   4925    mov                    r3d, 2                         ; pass 2 runs twice
   4926    lea                     r4, [dstq+8]
   4927    mov  [rsp+gprsize*2+16*67], r4                        ; slot now holds dst for the next pass-2 iteration
   4928    lea                     r4, [o(.end1)]                ; r4 = continuation that .end passes on in r3
   4929 
   4930 .pass2_loop:
   4931    mov  [rsp+gprsize*3+16*67], r3d                       ; save the loop counter; r3 is reused below
   4932    mov                   eobd, [rsp+gprsize*1+16*67]     ; eobd = eob - 151
   4933 
   4934    mova                    m0, [coeffq+16*4 ]            ;in1
   4935    mova                    m1, [coeffq+16*12]            ;in3
   4936    mova                    m2, [coeffq+16*20]            ;in5
   4937    mova                    m3, [coeffq+16*28]            ;in7
   4938    mova                    m4, [coeffq+16*5 ]            ;in9
   4939    mova                    m5, [coeffq+16*13]            ;in11
   4940    mova                    m6, [coeffq+16*21]            ;in13
   4941    mova                    m7, [coeffq+16*29]            ;in15
   4942    mova   [rsp+gprsize+16*35], m0                        ;in1
   4943    mova   [rsp+gprsize+16*49], m1                        ;in3
   4944    mova   [rsp+gprsize+16*43], m2                        ;in5
   4945    mova   [rsp+gprsize+16*41], m3                        ;in7
   4946    mova   [rsp+gprsize+16*39], m4                        ;in9
   4947    mova   [rsp+gprsize+16*45], m5                        ;in11
   4948    mova   [rsp+gprsize+16*47], m6                        ;in13
   4949    mova   [rsp+gprsize+16*37], m7                        ;in15
   4950 
   4951    pxor                    m4, m4
   4952    mova                    m0, [coeffq+16*0]
   4953    mova                    m1, [coeffq+16*1]
   4954 
   4955    test                  eobd, eobd
   4956    jl .fast                                              ; eob < 151: reduced-input path
   4957 
   4958 .full:
   4959    mova                    m2, [coeffq+16*2]
   4960    mova                    m3, [coeffq+16*3]
   4961 
   4962    REPX          {mova x, m4}, m5, m6, m7                ; m4 is zero, so inputs m4-m7 are all zero
   4963    call  m(idct_8x8_internal_8bpc).main
   4964    SAVE_7ROWS    rsp+gprsize+16*3, 16
   4965 
   4966    pxor                    m4, m4
   4967    mova                    m0, [coeffq+16*16]
   4968    mova                    m1, [coeffq+16*17]
   4969    mova                    m2, [coeffq+16*18]
   4970    mova                    m3, [coeffq+16*19]
   4971 
   4972    REPX          {mova x, m4}, m5, m6, m7
   4973    call m(idct_16x8_internal_8bpc).main
   4974    mova                    m7, [rsp+gprsize+16*0]
   4975    SAVE_8ROWS   rsp+gprsize+16*11, 16
   4976 
   4977    mova                    m0, [coeffq+16*8 ]
   4978    mova                    m1, [coeffq+16*24]
   4979    mova                    m2, [coeffq+16*9 ]
   4980    mova                    m3, [coeffq+16*25]
   4981    mova                    m4, [coeffq+16*10]
   4982    mova                    m5, [coeffq+16*26]
   4983    mova                    m6, [coeffq+16*11]
   4984    mova                    m7, [coeffq+16*27]
   4985    mova   [rsp+gprsize+16*19], m0
   4986    mova   [rsp+gprsize+16*26], m1
   4987    mova   [rsp+gprsize+16*23], m2
   4988    mova   [rsp+gprsize+16*22], m3
   4989    mova   [rsp+gprsize+16*21], m4
   4990    mova   [rsp+gprsize+16*24], m5
   4991    mova   [rsp+gprsize+16*25], m6
   4992    mova   [rsp+gprsize+16*20], m7
   4993 
   4994    call m(idct_8x32_internal_8bpc).main_fast
   4995    SAVE_8ROWS    rsp+gprsize+16*3, 16
   4996 
   4997    mova                    m0, [coeffq+16*6 ]            ;in17
   4998    mova                    m1, [coeffq+16*14]            ;in19
   4999    mova                    m2, [coeffq+16*22]            ;in21
   5000    mova                    m3, [coeffq+16*30]            ;in23
   5001    mova                    m4, [coeffq+16*7 ]            ;in25
   5002    mova                    m5, [coeffq+16*15]            ;in27
   5003    mova                    m6, [coeffq+16*23]            ;in29
   5004    mova                    m7, [coeffq+16*31]            ;in31
   5005    mova   [rsp+gprsize+16*63], m0                        ;in17
   5006    mova   [rsp+gprsize+16*53], m1                        ;in19
   5007    mova   [rsp+gprsize+16*55], m2                        ;in21
   5008    mova   [rsp+gprsize+16*61], m3                        ;in23
   5009    mova   [rsp+gprsize+16*59], m4                        ;in25
   5010    mova   [rsp+gprsize+16*57], m5                        ;in27
   5011    mova   [rsp+gprsize+16*51], m6                        ;in29
   5012    mova   [rsp+gprsize+16*65], m7                        ;in31
   5013 
   5014    call .main                                            ; consumes in1..in31 from slots 35..65
   5015    jmp  .end
   5016 
   5017 .fast:
   5018    REPX          {mova x, m4}, m2, m3, m5, m6, m7        ; only m0/m1 carry input; the rest are zero
   5019    call  m(idct_8x8_internal_8bpc).main
   5020    SAVE_7ROWS    rsp+gprsize+16*3, 16
   5021 
   5022    pxor                    m4, m4
   5023    mova                    m0, [coeffq+16*16]
   5024    mova                    m1, [coeffq+16*17]
   5025 
   5026    REPX          {mova x, m4}, m2, m3, m5, m6, m7
   5027    call m(idct_16x8_internal_8bpc).main
   5028    mova                    m7, [rsp+gprsize+16*0]
   5029    SAVE_8ROWS   rsp+gprsize+16*11, 16
   5030 
   5031    mova                    m0, [coeffq+16*8 ]
   5032    mova                    m1, [coeffq+16*24]
   5033    mova                    m2, [coeffq+16*9 ]
   5034    mova                    m3, [coeffq+16*25]
   5035    mova   [rsp+gprsize+16*19], m0                        ;in1
   5036    mova   [rsp+gprsize+16*26], m1                        ;in3
   5037    mova   [rsp+gprsize+16*23], m2                        ;in5
   5038    mova   [rsp+gprsize+16*22], m3                        ;in7
   5039 
   5040    call m(idct_8x32_internal_8bpc).main_veryfast
   5041    SAVE_8ROWS    rsp+gprsize+16*3, 16
   5042 
   5043    call .main_fast                                       ; reads only the in1..in15 slots stored above
   5044 
   5045 .end:
   5046    LOAD_8ROWS   rsp+gprsize+16*3, 16
   5047    mova    [rsp+gprsize+16*0], m7
   5048    mov                     r3, r4                        ; r3 = address of .end1 (set up in r4)
   5049    jmp  m(idct_8x32_internal_8bpc).end2                  ; NOTE(review): expected to resume at r3 (.end1) -- confirm
   5050 
   5051 .end1:
   5052    LOAD_8ROWS   rsp+gprsize+16*35, 16
   5053    lea                   dstq, [dstq+strideq*2]
   5054    lea                     r3, [rsp+16*32+gprsize]       ; fixed base, so .write needs no extra return-address offsets
   5055    call .write
   5056    mov                   dstq, [rsp+gprsize*2+16*67]     ; dst for the next pass-2 iteration
   5057    mov                    r3d, [rsp+gprsize*3+16*67]     ; pass-2 loop counter
   5058    lea                     r4, [dstq+8]
   5059    mov  [rsp+gprsize*2+16*67], r4
   5060    lea                     r4, [o(.end1)]                ; .write clobbered r4; restore the continuation
   5061 
   5062    dec                    r3d
   5063    jg .pass2_loop
   5064    ret
   ; .write: in: m0-m7 = 8 rows, r3 = scratch base (row groups at r3+16*11,
   ; +16*19 and +16*27). Zeroes the 16*32 bytes at coeffq, leaving coeffq
   ; advanced by 16*32, then emits four groups of 8 rows through
   ; .write_main2 / .write_main. Clobbers r4.
   5065 .write:
   5066    mova             [r3+16*0], m7                        ; free m7 for use as the zero register
   5067    mov                     r4, -16*32
   5068    pxor                    m7, m7
   5069    sub                 coeffq, r4                        ; coeffq += 16*32; r4 counts up from -16*32 to 0
   5070 .zero_loop:
   5071    mova      [coeffq+r4+16*0], m7
   5072    mova      [coeffq+r4+16*1], m7
   5073    add                     r4, 16*2
   5074    jl .zero_loop
   5075    call .write_main2                                     ; row 7 is already parked at [r3+16*0]
   5076    LOAD_8ROWS        r3+16*11, 16
   5077    call .write_main
   5078    LOAD_8ROWS        r3+16*19, 16
   5079    call .write_main
   5080    LOAD_8ROWS        r3+16*27, 16                        ; last group falls through; its ret returns to .write's caller
   5081 .write_main:
   5082    mova             [r3+16*0], m7
   5083 .write_main2:
   5084    mova                    m7, [o(pw_2048)]
   5085    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
   5086    pmulhrsw                m7, [r3+16*0]                 ; scale the parked row 7 as well
   5087    mova             [r3+16*2], m5
   5088    mova             [r3+16*1], m6
   5089    mova             [r3+16*0], m7                        ; rows 5-7 go to memory; m5-m7 are passed to WRITE_8X4 below
   5090    WRITE_8X4                0, 1, 2, 3, 5, 6, 7
   5091    lea                   dstq, [dstq+strideq*2]
   5092    WRITE_8X4                4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7
   5093    lea                   dstq, [dstq+strideq*2]
   5094    ret
   5095 
   5096 
   ; .main_fast: reduced first stage of the t32..t63 computation.
   ;
   ; Reads only in1, in3, in5, in7, in9, in11, in13 and in15 from the stack
   ; slots; the in17..in31 slots read by .main are not touched (the .fast
   ; caller does not store them). Each input is multiplied by two constants,
   ; and both products are used directly where .main would first combine them
   ; with a partner term -- presumably because that partner is zero here.
   ;
   ; Slots are addressed with gprsize*2 where the caller used gprsize: the
   ; same slots, seen one return address deeper.
   ;
   ; On exit: m0 and m5 hold t46/t47, m3 and m4 hold t48/t49, m6 holds
   ; t44/t45, m7 = pd_2048; control transfers to .main2.
   5097 ALIGN function_align
   5098 cglobal_label .main_fast
   5099    mova                    m0, [rsp+gprsize*2+16*35]     ;in1
   5100    pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t62,t63
   5101    pmulhrsw                m0, [o(pw_101x8)]             ;t32,t33
   5102    mova                    m7, [o(pd_2048)]
   5103    mova [rsp+gprsize*2+16*35], m0                        ;t32
   5104    mova [rsp+gprsize*2+16*66], m3                        ;t63
   5105    ITX_MULSUB_2W            3, 0, 1, 2, 7,  401, 4076    ;t33a, t62a
   5106    mova [rsp+gprsize*2+16*36], m3                        ;t33a
   5107    mova [rsp+gprsize*2+16*65], m0                        ;t62a
   5108 
   5109    mova                    m1, [rsp+gprsize*2+16*37]     ;in15
   5110    pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60,t61
   5111    pmulhrsw                m1, [o(pw_m1474x8)]           ;t34,t35
   5112    mova [rsp+gprsize*2+16*38], m1                        ;t35
   5113    mova [rsp+gprsize*2+16*63], m2                        ;t60
   5114    ITX_MULSUB_2W            2, 1, 0, 3, 7, m4076, 401    ;t34a, t61a
   5115    mova [rsp+gprsize*2+16*37], m2                        ;t34a
   5116    mova [rsp+gprsize*2+16*64], m1                        ;t61a
   5117 
   5118    mova                    m0, [rsp+gprsize*2+16*39]     ;in9
   5119    pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t58,t59
   5120    pmulhrsw                m0, [o(pw_897x8)]             ;t36,t37
   5121    mova [rsp+gprsize*2+16*39], m0                        ;t36
   5122    mova [rsp+gprsize*2+16*62], m3                        ;t59
   5123    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3166, 2598    ;t37a, t58a
   5124    mova [rsp+gprsize*2+16*40], m3                        ;t37a
   5125    mova [rsp+gprsize*2+16*61], m0                        ;t58a
   5126 
   5127    mova                    m1, [rsp+gprsize*2+16*41]     ;in7
   5128    pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56,t57
   5129    pmulhrsw                m1, [o(pw_m700x8)]            ;t38,t39
   5130    mova [rsp+gprsize*2+16*42], m1                        ;t39
   5131    mova [rsp+gprsize*2+16*59], m2                        ;t56
   5132    ITX_MULSUB_2W            2, 1, 0, 3, 7, m2598, 3166   ;t38a, t57a
   5133    mova [rsp+gprsize*2+16*41], m2                        ;t38a
   5134    mova [rsp+gprsize*2+16*60], m1                        ;t57a
   5135 
   5136    mova                    m0, [rsp+gprsize*2+16*43]     ;in5
   5137    pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t54,t55
   5138    pmulhrsw                m0, [o(pw_501x8)]             ;t40,t41
   5139    mova [rsp+gprsize*2+16*43], m0                        ;t40
   5140    mova [rsp+gprsize*2+16*58], m3                        ;t55
   5141    ITX_MULSUB_2W            3, 0, 1, 2, 7, 1931, 3612    ;t41a, t54a
   5142    mova [rsp+gprsize*2+16*44], m3                        ;t41a
   5143    mova [rsp+gprsize*2+16*57], m0                        ;t54a
   5144 
   5145    mova                    m1, [rsp+gprsize*2+16*45]     ;in11
   5146    pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52,t53
   5147    pmulhrsw                m1, [o(pw_m1092x8)]           ;t42,t43
   5148    mova [rsp+gprsize*2+16*46], m1                        ;t43
   5149    mova [rsp+gprsize*2+16*55], m2                        ;t52
   5150    ITX_MULSUB_2W            2, 1, 0, 3, 7, m3612, 1931   ;t42a, t53a
   5151    mova [rsp+gprsize*2+16*45], m2                        ;t42a
   5152    mova [rsp+gprsize*2+16*56], m1                        ;t53a
   5153 
   ; From here on, results that .main2 expects in registers stay in registers:
   ; t44 is kept in m6 rather than being stored to a slot.
   5154    mova                    m0, [rsp+gprsize*2+16*47]     ;in13
   5155    pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t50,t51
   5156    pmulhrsw                m0, [o(pw_1285x8)]            ;t44,t45
   5157    mova                    m6, m0
   5158    mova [rsp+gprsize*2+16*54], m3                        ;t51
   5159    ITX_MULSUB_2W            3, 0, 1, 2, 7, 3920, 1189    ;t45a, t50a
   5160    mova [rsp+gprsize*2+16*48], m3                        ;t45a
   5161    mova [rsp+gprsize*2+16*53], m0                        ;t50a
   5162 
   5163    mova                    m0, [rsp+gprsize*2+16*49]     ;in3
   5164    pmulhrsw                m3, m0, [o(pw_4085x8)]        ;t48,t49
   5165    pmulhrsw                m0, [o(pw_m301x8)]            ;t46,t47
   5166    mova                    m4, m3
   5167    mova                    m5, m0
   5168 
   5169    jmp .main2
   5170 
   ; .main: full first stage of the t32..t63 computation.
   ;
   ; Consumes the 16 odd inputs in1..in31 that the caller staged in stack
   ; slots 35..65 and overwrites those slots with intermediate t-values.
   ; Inputs are handled in pairs; for each pair:
   ;   1. four pmulhrsw products by pw_*x8 constants (the "a" terms),
   ;   2. saturating sub/add (psubsw/paddsw) forming two sums and two differences,
   ;   3. ITX_MULSUB_2W on the two differences, with m7 = pd_2048 passed as
   ;      its fifth operand (presumably the rounding constant -- the macro is
   ;      defined elsewhere),
   ;   4. results stored back to the slots.
   ;
   ; Slots are addressed with gprsize*2 where the caller used gprsize: the
   ; same slots, seen one return address deeper.
   ;
   ; The last two pairs leave part of their results in registers and the
   ; code falls through into .main2 with m0 = t47, m3 = t48, m4 = t49,
   ; m5 = t46, m6 = t44, m7 = pd_2048.
   5171 ALIGN function_align
   5172 cglobal_label .main
   5173    mova                    m0, [rsp+gprsize*2+16*35]     ;in1
   5174    mova                    m1, [rsp+gprsize*2+16*65]     ;in31
   5175    pmulhrsw                m3, m0, [o(pw_4095x8)]        ;t63a
   5176    pmulhrsw                m0, [o(pw_101x8)]             ;t32a
   5177    pmulhrsw                m2, m1, [o(pw_2967x8)]        ;t62a
   5178    pmulhrsw                m1, [o(pw_m2824x8)]           ;t33a
   5179    mova                    m7, [o(pd_2048)]
   5180    psubsw                  m4, m0, m1                    ;t33
   5181    paddsw                  m0, m1                        ;t32
   5182    psubsw                  m5, m3, m2                    ;t62
   5183    paddsw                  m3, m2                        ;t63
   5184    ITX_MULSUB_2W            5, 4, 1, 2, 7,  401, 4076    ;t33a, t62a
   5185    mova [rsp+gprsize*2+16*35], m0                        ;t32
   5186    mova [rsp+gprsize*2+16*36], m5                        ;t33a
   5187    mova [rsp+gprsize*2+16*65], m4                        ;t62a
   5188    mova [rsp+gprsize*2+16*66], m3                        ;t63
   5189 
   5190    mova                    m0, [rsp+gprsize*2+16*63]     ;in17
   5191    mova                    m1, [rsp+gprsize*2+16*37]     ;in15
   5192    pmulhrsw                m3, m0, [o(pw_3745x8)]        ;t61a
   5193    pmulhrsw                m0, [o(pw_1660x8)]            ;t34a
   5194    pmulhrsw                m2, m1, [o(pw_3822x8)]        ;t60a
   5195    pmulhrsw                m1, [o(pw_m1474x8)]           ;t35a
   5196    psubsw                  m4, m1, m0                    ;t34
   5197    paddsw                  m0, m1                        ;t35
   5198    psubsw                  m5, m2, m3                    ;t61
   5199    paddsw                  m3, m2                        ;t60
   5200    ITX_MULSUB_2W            5, 4, 1, 2, 7, m4076, 401    ;t34a, t61a
   5201    mova [rsp+gprsize*2+16*37], m5                        ;t34a
   5202    mova [rsp+gprsize*2+16*38], m0                        ;t35
   5203    mova [rsp+gprsize*2+16*63], m3                        ;t60
   5204    mova [rsp+gprsize*2+16*64], m4                        ;t61a
   5205 
   5206    mova                    m0, [rsp+gprsize*2+16*39]     ;in9
   5207    mova                    m1, [rsp+gprsize*2+16*61]     ;in23
   5208    pmulhrsw                m3, m0, [o(pw_3996x8)]        ;t59a
   5209    pmulhrsw                m0, [o(pw_897x8)]             ;t36a
   5210    pmulhrsw                m2, m1, [o(pw_3461x8)]        ;t58a
   5211    pmulhrsw                m1, [o(pw_m2191x8)]           ;t37a
   5212    psubsw                  m4, m0, m1                    ;t37
   5213    paddsw                  m0, m1                        ;t36
   5214    psubsw                  m5, m3, m2                    ;t58
   5215    paddsw                  m3, m2                        ;t59
   5216    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3166, 2598    ;t37a, t58a
   5217    mova [rsp+gprsize*2+16*39], m0                        ;t36
   5218    mova [rsp+gprsize*2+16*40], m5                        ;t37a
   5219    mova [rsp+gprsize*2+16*61], m4                        ;t58a
   5220    mova [rsp+gprsize*2+16*62], m3                        ;t59
   5221 
   5222    mova                    m0, [rsp+gprsize*2+16*59]     ;in25
   5223    mova                    m1, [rsp+gprsize*2+16*41]     ;in7
   5224    pmulhrsw                m3, m0, [o(pw_3349x8)]        ;t57a
   5225    pmulhrsw                m0, [o(pw_2359x8)]            ;t38a
   5226    pmulhrsw                m2, m1, [o(pw_4036x8)]        ;t56a
   5227    pmulhrsw                m1, [o(pw_m700x8)]            ;t39a
   5228    psubsw                  m4, m1, m0                    ;t38
   5229    paddsw                  m0, m1                        ;t39
   5230    psubsw                  m5, m2, m3                    ;t57
   5231    paddsw                  m3, m2                        ;t56
   5232    ITX_MULSUB_2W            5, 4, 1, 2, 7, m2598, 3166   ;t38a, t57a
   5233    mova [rsp+gprsize*2+16*41], m5                        ;t38a
   5234    mova [rsp+gprsize*2+16*42], m0                        ;t39
   5235    mova [rsp+gprsize*2+16*59], m3                        ;t56
   5236    mova [rsp+gprsize*2+16*60], m4                        ;t57a
   5237 
   5238    mova                    m0, [rsp+gprsize*2+16*43]     ;in5
   5239    mova                    m1, [rsp+gprsize*2+16*57]     ;in27
   5240    pmulhrsw                m3, m0, [o(pw_4065x8)]        ;t55a
   5241    pmulhrsw                m0, [o(pw_501x8)]             ;t40a
   5242    pmulhrsw                m2, m1, [o(pw_3229x8)]        ;t54a
   5243    pmulhrsw                m1, [o(pw_m2520x8)]           ;t41a
   5244    psubsw                  m4, m0, m1                    ;t41
   5245    paddsw                  m0, m1                        ;t40
   5246    psubsw                  m5, m3, m2                    ;t54
   5247    paddsw                  m3, m2                        ;t55
   5248    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1931, 3612    ;t41a, t54a
   5249    mova [rsp+gprsize*2+16*43], m0                        ;t40
   5250    mova [rsp+gprsize*2+16*44], m5                        ;t41a
   5251    mova [rsp+gprsize*2+16*57], m4                        ;t54a
   5252    mova [rsp+gprsize*2+16*58], m3                        ;t55
   5253 
   5254    mova                    m0, [rsp+gprsize*2+16*55]     ;in21
   5255    mova                    m1, [rsp+gprsize*2+16*45]     ;in11
   5256    pmulhrsw                m3, m0, [o(pw_3564x8)]        ;t53a
   5257    pmulhrsw                m0, [o(pw_2019x8)]            ;t42a
   5258    pmulhrsw                m2, m1, [o(pw_3948x8)]        ;t52a
   5259    pmulhrsw                m1, [o(pw_m1092x8)]           ;t43a
   5260    psubsw                  m4, m1, m0                    ;t42
   5261    paddsw                  m0, m1                        ;t43
   5262    psubsw                  m5, m2, m3                    ;t53
   5263    paddsw                  m3, m2                        ;t52
   5264    ITX_MULSUB_2W            5, 4, 1, 2, 7, m3612, 1931   ;t42a, t53a
   5265    mova [rsp+gprsize*2+16*45], m5                        ;t42a
   5266    mova [rsp+gprsize*2+16*46], m0                        ;t43
   5267    mova [rsp+gprsize*2+16*55], m3                        ;t52
   5268    mova [rsp+gprsize*2+16*56], m4                        ;t53a
   5269 
   ; t44 is kept in m6 (not stored) because .main2 reads it from there.
   5270    mova                    m0, [rsp+gprsize*2+16*47]     ;in13
   5271    mova                    m1, [rsp+gprsize*2+16*53]     ;in19
   5272    pmulhrsw                m3, m0, [o(pw_3889x8)]        ;t51a
   5273    pmulhrsw                m0, [o(pw_1285x8)]            ;t44a
   5274    pmulhrsw                m2, m1, [o(pw_3659x8)]        ;t50a
   5275    pmulhrsw                m1, [o(pw_m1842x8)]           ;t45a
   5276    psubsw                  m4, m0, m1                    ;t45
   5277    paddsw                  m0, m1                        ;t44
   5278    psubsw                  m5, m3, m2                    ;t50
   5279    paddsw                  m3, m2                        ;t51
   5280    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3920, 1189    ;t45a, t50a
   5281    mova                    m6, m0
   5282    mova [rsp+gprsize*2+16*48], m5                        ;t45a
   5283    mova [rsp+gprsize*2+16*53], m4                        ;t50a
   5284    mova [rsp+gprsize*2+16*54], m3                        ;t51
   5285 
   ; Last pair: nothing is stored; t46..t49 stay in m5, m0, m3, m4 for .main2.
   5286    mova                    m0, [rsp+gprsize*2+16*51]     ;in29
   5287    mova                    m1, [rsp+gprsize*2+16*49]     ;in3
   5288    pmulhrsw                m3, m0, [o(pw_3102x8)]        ;t49a
   5289    pmulhrsw                m0, [o(pw_2675x8)]            ;t46a
   5290    pmulhrsw                m2, m1, [o(pw_4085x8)]        ;t48a
   5291    pmulhrsw                m1, [o(pw_m301x8)]            ;t47a
   5292    psubsw                  m5, m1, m0                    ;t46
   5293    paddsw                  m0, m1                        ;t47
   5294    psubsw                  m4, m2, m3                    ;t49
   5295    paddsw                  m3, m2                        ;t48
   5296 
   5297 ALIGN function_align
   5298 .main2:
   5299    ITX_MULSUB_2W            4, 5, 1, 2, 7, m1189, 3920   ;t46a, t49a
   5300    mova                    m1, [rsp+gprsize*2+16*54]     ;t51
   5301    psubsw                  m2, m0, m6                    ;t44a
   5302    paddsw                  m0, m6                        ;t47a
   5303    psubsw                  m6, m3, m1                    ;t51a
   5304    paddsw                  m3, m1                        ;t48a
   5305    mova [rsp+gprsize*2+16*50], m0                        ;t47a
   5306    mova [rsp+gprsize*2+16*51], m3                        ;t48a
   5307    ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t44, t51
   5308    mova [rsp+gprsize*2+16*47], m6                        ;t44
   5309    mova [rsp+gprsize*2+16*54], m2                        ;t51
   5310 
   5311    mova                    m0, [rsp+gprsize*2+16*48]     ;t45a
   5312    mova                    m3, [rsp+gprsize*2+16*53]     ;t50a
   5313    psubsw                  m2, m4, m0                    ;t45
   5314    paddsw                  m4, m0                        ;t46
   5315    psubsw                  m6, m5, m3                    ;t50
   5316    paddsw                  m5, m3                        ;t49
   5317    ITX_MULSUB_2W            6, 2, 0, 3, 7, m2276, 3406   ;t45a, t50a
   5318    mova [rsp+gprsize*2+16*48], m6                        ;t45a
   5319    mova [rsp+gprsize*2+16*49], m4                        ;t46
   5320    mova [rsp+gprsize*2+16*52], m5                        ;t49
   5321    mova [rsp+gprsize*2+16*53], m2                        ;t50a
   5322 
   5323    mova                    m0, [rsp+gprsize*2+16*43]     ;t40
   5324    mova                    m2, [rsp+gprsize*2+16*46]     ;t43
   5325    mova                    m3, [rsp+gprsize*2+16*55]     ;t52
   5326    mova                    m1, [rsp+gprsize*2+16*58]     ;t55
   5327    psubsw                  m4, m0, m2                    ;t43a
   5328    paddsw                  m0, m2                        ;t40a
   5329    psubsw                  m5, m1, m3                    ;t52a
   5330    paddsw                  m1, m3                        ;t55a
   5331    ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t43, t52
   5332    mova [rsp+gprsize*2+16*43], m0                        ;t40a
   5333    mova [rsp+gprsize*2+16*46], m5                        ;t43
   5334    mova [rsp+gprsize*2+16*55], m4                        ;t52
   5335    mova [rsp+gprsize*2+16*58], m1                        ;t55a
   5336 
   5337    mova                    m0, [rsp+gprsize*2+16*44]     ;t41a
   5338    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
   5339    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
   5340    mova                    m1, [rsp+gprsize*2+16*57]     ;t54a
   5341    psubsw                  m4, m0, m2                    ;t42
   5342    paddsw                  m0, m2                        ;t41
   5343    psubsw                  m5, m1, m3                    ;t53
   5344    paddsw                  m1, m3                        ;t54
   5345    ITX_MULSUB_2W            5, 4, 2, 3, 7, 3406, 2276    ;t42a, t53a
   5346    mova [rsp+gprsize*2+16*44], m0                        ;t41
   5347    mova [rsp+gprsize*2+16*45], m5                        ;t42a
   5348    mova [rsp+gprsize*2+16*56], m4                        ;t53a
   5349    mova [rsp+gprsize*2+16*57], m1                        ;t54
   5350 
   5351    mova                    m0, [rsp+gprsize*2+16*41]     ;t38a
   5352    mova                    m2, [rsp+gprsize*2+16*40]     ;t37a
   5353    mova                    m3, [rsp+gprsize*2+16*61]     ;t58a
   5354    mova                    m1, [rsp+gprsize*2+16*60]     ;t57a
   5355    psubsw                  m4, m0, m2                    ;t37
   5356    paddsw                  m0, m2                        ;t38
   5357    psubsw                  m5, m1, m3                    ;t58
   5358    paddsw                  m1, m3                        ;t57
   5359    ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t37a, t58a
   5360    mova [rsp+gprsize*2+16*41], m0                        ;t38
   5361    mova [rsp+gprsize*2+16*40], m5                        ;t37a
   5362    mova [rsp+gprsize*2+16*61], m4                        ;t58a
   5363    mova [rsp+gprsize*2+16*60], m1                        ;t57
   5364 
   5365    mova                    m0, [rsp+gprsize*2+16*42]     ;t39
   5366    mova                    m2, [rsp+gprsize*2+16*39]     ;t36
   5367    mova                    m3, [rsp+gprsize*2+16*62]     ;t59
   5368    mova                    m1, [rsp+gprsize*2+16*59]     ;t56
   5369    psubsw                  m4, m0, m2                    ;t36a
   5370    paddsw                  m0, m2                        ;t39a
   5371    psubsw                  m5, m1, m3                    ;t59a
   5372    paddsw                  m1, m3                        ;t56a
   5373    ITX_MULSUB_2W            5, 4, 2, 3, 7, m4017, 799    ;t36, t59
   5374    mova [rsp+gprsize*2+16*42], m0                        ;t39a
   5375    mova [rsp+gprsize*2+16*39], m5                        ;t36
   5376    mova [rsp+gprsize*2+16*62], m4                        ;t59
   5377    mova [rsp+gprsize*2+16*59], m1                        ;t56a
   5378 
   5379    mova                    m0, [rsp+gprsize*2+16*35]     ;t32
   5380    mova                    m2, [rsp+gprsize*2+16*38]     ;t35
   5381    mova                    m3, [rsp+gprsize*2+16*63]     ;t60
   5382    mova                    m1, [rsp+gprsize*2+16*66]     ;t63
   5383    psubsw                  m4, m0, m2                    ;t35a
   5384    paddsw                  m0, m2                        ;t32a
   5385    psubsw                  m5, m1, m3                    ;t60a
   5386    paddsw                  m1, m3                        ;t63a
   5387    ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t35, t60
   5388    mova [rsp+gprsize*2+16*35], m0                        ;t32a
   5389    mova [rsp+gprsize*2+16*38], m5                        ;t35
   5390    mova [rsp+gprsize*2+16*63], m4                        ;t60
   5391    mova [rsp+gprsize*2+16*66], m1                        ;t63a
   5392 
   5393    mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
   5394    mova                    m2, [rsp+gprsize*2+16*37]     ;t34a
   5395    mova                    m3, [rsp+gprsize*2+16*64]     ;t61a
   5396    mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
   5397    psubsw                  m4, m0, m2                    ;t34
   5398    paddsw                  m0, m2                        ;t33
   5399    psubsw                  m5, m1, m3                    ;t61
   5400    paddsw                  m1, m3                        ;t62
   5401    ITX_MULSUB_2W            5, 4, 2, 3, 7,  799, 4017    ;t34a, t61a
   5402 
   5403    mova                    m2, [rsp+gprsize*2+16*41]     ;t38
   5404    mova                    m3, [rsp+gprsize*2+16*60]     ;t57
   5405    psubsw                  m6, m0, m2                    ;t38a
   5406    paddsw                  m0, m2                        ;t33a
   5407    psubsw                  m2, m1, m3                    ;t57a
   5408    paddsw                  m1, m3                        ;t62a
   5409    mova [rsp+gprsize*2+16*36], m0                        ;t33a
   5410    mova [rsp+gprsize*2+16*65], m1                        ;t62a
   5411    ITX_MULSUB_2W            2, 6, 0, 3, 7, 1567, 3784    ;t38, t57
   5412    mova [rsp+gprsize*2+16*41], m2                        ;t38
   5413    mova [rsp+gprsize*2+16*60], m6                        ;t57
   5414 
   5415    mova                    m2, [rsp+gprsize*2+16*40]     ;t37
   5416    mova                    m3, [rsp+gprsize*2+16*61]     ;t58
   5417    psubsw                  m0, m5, m2                    ;t37
   5418    paddsw                  m5, m2                        ;t34
   5419    psubsw                  m1, m4, m3                    ;t58
   5420    paddsw                  m4, m3                        ;t61
   5421    ITX_MULSUB_2W            1, 0, 2, 3, 7, 1567, 3784    ;t37a, t58a
   5422    mova [rsp+gprsize*2+16*37], m5                        ;t34
   5423    mova [rsp+gprsize*2+16*64], m4                        ;t61
   5424    mova [rsp+gprsize*2+16*40], m1                        ;t37a
   5425    mova [rsp+gprsize*2+16*61], m0                        ;t58a
   5426 
   5427    mova                    m0, [rsp+gprsize*2+16*38]     ;t35
   5428    mova                    m2, [rsp+gprsize*2+16*39]     ;t36
   5429    mova                    m3, [rsp+gprsize*2+16*62]     ;t59
   5430    mova                    m1, [rsp+gprsize*2+16*63]     ;t60
   5431    psubsw                  m4, m0, m2                    ;t36a
   5432    paddsw                  m0, m2                        ;t35a
   5433    psubsw                  m5, m1, m3                    ;t59a
   5434    paddsw                  m1, m3                        ;t60a
   5435    ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t36, t59
   5436    mova [rsp+gprsize*2+16*38], m0                        ;t35a
   5437    mova [rsp+gprsize*2+16*39], m5                        ;t36
   5438    mova [rsp+gprsize*2+16*62], m4                        ;t59
   5439    mova [rsp+gprsize*2+16*63], m1                        ;t60a
   5440 
   5441    mova                    m0, [rsp+gprsize*2+16*35]     ;t32a
   5442    mova                    m2, [rsp+gprsize*2+16*42]     ;t39a
   5443    mova                    m3, [rsp+gprsize*2+16*59]     ;t56a
   5444    mova                    m1, [rsp+gprsize*2+16*66]     ;t63a
   5445    psubsw                  m4, m0, m2                    ;t39
   5446    paddsw                  m0, m2                        ;t32
   5447    psubsw                  m5, m1, m3                    ;t56
   5448    paddsw                  m1, m3                        ;t63
   5449    ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t39a, t56a
   5450    mova [rsp+gprsize*2+16*35], m0                        ;t32
   5451    mova [rsp+gprsize*2+16*42], m5                        ;t39a
   5452    mova [rsp+gprsize*2+16*59], m4                        ;t56a
   5453    mova [rsp+gprsize*2+16*66], m1                        ;t63
   5454 
   5455    mova                    m0, [rsp+gprsize*2+16*50]     ;t47a
   5456    mova                    m2, [rsp+gprsize*2+16*43]     ;t40a
   5457    mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
   5458    mova                    m1, [rsp+gprsize*2+16*51]     ;t48a
   5459    psubsw                  m4, m0, m2                    ;t40
   5460    paddsw                  m0, m2                        ;t47
   5461    psubsw                  m5, m1, m3                    ;t55
   5462    paddsw                  m1, m3                        ;t48
   5463    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t40a, t55a
   5464    mova [rsp+gprsize*2+16*50], m0                        ;t47
   5465    mova [rsp+gprsize*2+16*43], m5                        ;t40a
   5466    mova [rsp+gprsize*2+16*58], m4                        ;t55a
   5467    mova [rsp+gprsize*2+16*51], m1                        ;t48
   5468 
   5469    mova                    m0, [rsp+gprsize*2+16*49]     ;t46
   5470    mova                    m2, [rsp+gprsize*2+16*44]     ;t41
   5471    mova                    m3, [rsp+gprsize*2+16*57]     ;t54
   5472    mova                    m1, [rsp+gprsize*2+16*52]     ;t49
   5473    psubsw                  m4, m0, m2                    ;t41a
   5474    paddsw                  m0, m2                        ;t46a
   5475    psubsw                  m5, m1, m3                    ;t54a
   5476    paddsw                  m1, m3                        ;t49a
   5477    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t41, t54
   5478    mova [rsp+gprsize*2+16*49], m0                        ;t46a
   5479    mova [rsp+gprsize*2+16*44], m5                        ;t41
   5480    mova [rsp+gprsize*2+16*57], m4                        ;t54
   5481    mova [rsp+gprsize*2+16*52], m1                        ;t49a
   5482 
   5483    mova                    m0, [rsp+gprsize*2+16*48]     ;t45a
   5484    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
   5485    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
   5486    mova                    m1, [rsp+gprsize*2+16*53]     ;t50a
   5487    psubsw                  m4, m0, m2                    ;t42
   5488    paddsw                  m0, m2                        ;t45
   5489    psubsw                  m5, m1, m3                    ;t53
   5490    paddsw                  m1, m3                        ;t50
   5491    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t42a, t53a
   5492    mova [rsp+gprsize*2+16*48], m0                        ;t45
   5493    mova [rsp+gprsize*2+16*45], m5                        ;t42a
   5494    mova [rsp+gprsize*2+16*56], m4                        ;t53a
   5495    mova [rsp+gprsize*2+16*53], m1                        ;t50
   5496 
   5497    mova                    m0, [rsp+gprsize*2+16*47]     ;t44
   5498    mova                    m2, [rsp+gprsize*2+16*46]     ;t43
   5499    mova                    m3, [rsp+gprsize*2+16*55]     ;t52
   5500    mova                    m1, [rsp+gprsize*2+16*54]     ;t51
   5501    psubsw                  m4, m0, m2                    ;t43a
   5502    paddsw                  m0, m2                        ;t44a
   5503    psubsw                  m5, m1, m3                    ;t52a
   5504    paddsw                  m1, m3                        ;t51a
   5505    ITX_MULSUB_2W            5, 4, 2, 3, 7, m3784, 1567   ;t43, t52
   5506 
   5507    mova                    m2, [rsp+gprsize*2+16*38]     ;t35a
   5508    mova                    m3, [rsp+gprsize*2+16*31]     ;tmp[28]
   5509    psubsw                  m6, m2, m0                    ;t44
   5510    paddsw                  m2, m0                        ;t35
   5511    psubsw                  m0, m3, m2                    ;out35
   5512    paddsw                  m2, m3                        ;out28
   5513    mova                    m3, [rsp+gprsize*2+16*63]     ;t60a
   5514    mova [rsp+gprsize*2+16*38], m0                        ;out35
   5515    mova [rsp+gprsize*2+16*31], m2                        ;out28
   5516    psubsw                  m0, m3, m1                    ;t51
   5517    paddsw                  m3, m1                        ;t60
   5518    ITX_MULSUB_2W            0, 6, 1, 2, 7, 2896, 2896    ;t44a, t51a
   5519    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp[3]
   5520    psubsw                  m1, m2, m3                    ;out60
   5521    paddsw                  m2, m3                        ;out3
   5522    mova                    m3, [rsp+gprsize*2+16*22]     ;tmp[19]
   5523    mova [rsp+gprsize*2+16*63], m1                        ;out60
   5524    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
   5525    psubsw                  m1, m3, m0                    ;out44
   5526    paddsw                  m3, m0                        ;out19
   5527    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp[12]
   5528 
   5529    mova                    m0, [rsp+gprsize*2+16*39]     ;t36
   5530    mova [rsp+gprsize*2+16*47], m1                        ;out44
   5531    mova [rsp+gprsize*2+16*22], m3                        ;out19
   5532    mova                    m1, [rsp+gprsize*2+16*62]     ;t59
   5533    psubsw                  m3, m2, m6                    ;out51
   5534    paddsw                  m2, m6                        ;out12
   5535    mova [rsp+gprsize*2+16*54], m3                        ;out51
   5536    mova [rsp+gprsize*2+16*15], m2                        ;out12
   5537    psubsw                  m2, m0, m5                    ;t43a
   5538    paddsw                  m0, m5                        ;t36a
   5539    mova                    m5, [rsp+gprsize*2+16*30]     ;tmp[27]
   5540    psubsw                  m3, m1, m4                    ;t52a
   5541    paddsw                  m1, m4                        ;t59a
   5542    ITX_MULSUB_2W            3, 2, 4, 6, 7, 2896, 2896    ;t43, t52
   5543    mova                    m4, [rsp+gprsize*2+16*7 ]     ;tmp[4 ]
   5544    psubsw                  m6, m5, m0                    ;out36
   5545    paddsw                  m5, m0                        ;out27
   5546    psubsw                  m0, m4, m1                    ;out59
   5547    paddsw                  m4, m1                        ;out4
   5548    mova [rsp+gprsize*2+16*39], m6                        ;out36
   5549    mova [rsp+gprsize*2+16*30], m5                        ;out27
   5550    mova [rsp+gprsize*2+16*62], m0                        ;out59
   5551    mova [rsp+gprsize*2+16*7 ], m4                        ;out4
   5552    mova                    m0, [rsp+gprsize*2+16*23]     ;tmp[20]
   5553    mova                    m5, [rsp+gprsize*2+16*14]     ;tmp[11]
   5554    psubsw                  m4, m0, m3                    ;out43
   5555    paddsw                  m0, m3                        ;out20
   5556    psubsw                  m6, m5, m2                    ;out52
   5557    paddsw                  m5, m2                        ;out11
   5558    mova [rsp+gprsize*2+16*46], m4                        ;out43
   5559    mova [rsp+gprsize*2+16*23], m0                        ;out20
   5560    mova [rsp+gprsize*2+16*55], m6                        ;out52
   5561    mova [rsp+gprsize*2+16*14], m5                        ;out11
   5562 
   5563    mova                    m0, [rsp+gprsize*2+16*40]     ;t37a
   5564    mova                    m5, [rsp+gprsize*2+16*45]     ;t42a
   5565    mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
   5566    mova                    m1, [rsp+gprsize*2+16*61]     ;t58a
   5567    mova                    m2, [rsp+gprsize*2+16*29]     ;tmp[26]
   5568    psubsw                  m4, m0, m5                    ;t42
   5569    paddsw                  m0, m5                        ;t37
   5570    psubsw                  m5, m1, m3                    ;t53
   5571    paddsw                  m1, m3                        ;t58
   5572    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t43, t52
   5573    mova                    m3, [rsp+gprsize*2+16*8 ]     ;tmp[5 ]
   5574    psubsw                  m6, m2, m0                    ;out37
   5575    paddsw                  m2, m0                        ;out26
   5576    psubsw                  m0, m3, m1                    ;out58
   5577    paddsw                  m3, m1                        ;out5
   5578    mova [rsp+gprsize*2+16*40], m6                        ;out37
   5579    mova [rsp+gprsize*2+16*29], m2                        ;out26
   5580    mova [rsp+gprsize*2+16*61], m0                        ;out58
   5581    mova [rsp+gprsize*2+16*8 ], m3                        ;out5
   5582    mova                    m0, [rsp+gprsize*2+16*24]     ;tmp[21]
   5583    mova                    m1, [rsp+gprsize*2+16*13]     ;tmp[10]
   5584    psubsw                  m2, m0, m5                    ;out42
   5585    paddsw                  m0, m5                        ;out21
   5586    psubsw                  m3, m1, m4                    ;out53
   5587    paddsw                  m1, m4                        ;out10
   5588    mova [rsp+gprsize*2+16*45], m2                        ;out42
   5589    mova [rsp+gprsize*2+16*24], m0                        ;out21
   5590    mova [rsp+gprsize*2+16*56], m3                        ;out53
   5591    mova [rsp+gprsize*2+16*13], m1                        ;out10
   5592 
   5593    mova                    m0, [rsp+gprsize*2+16*41]     ;t38
   5594    mova                    m5, [rsp+gprsize*2+16*44]     ;t41
   5595    mova                    m3, [rsp+gprsize*2+16*57]     ;t54
   5596    mova                    m1, [rsp+gprsize*2+16*60]     ;t57
   5597    mova                    m2, [rsp+gprsize*2+16*28]     ;tmp[25]
   5598    psubsw                  m4, m0, m5                    ;t41a
   5599    paddsw                  m0, m5                        ;t38a
   5600    psubsw                  m5, m1, m3                    ;t54a
   5601    paddsw                  m1, m3                        ;t57a
   5602    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t41a, t54a
   5603    mova                    m3, [rsp+gprsize*2+16*9 ]     ;tmp[6 ]
   5604    psubsw                  m6, m2, m0                    ;out38
   5605    paddsw                  m2, m0                        ;out25
   5606    psubsw                  m0, m3, m1                    ;out57
   5607    paddsw                  m3, m1                        ;out6
   5608    mova [rsp+gprsize*2+16*41], m6                        ;out38
   5609    mova [rsp+gprsize*2+16*28], m2                        ;out25
   5610    mova [rsp+gprsize*2+16*60], m0                        ;out57
   5611    mova [rsp+gprsize*2+16*9 ], m3                        ;out6
   5612    mova                    m0, [rsp+gprsize*2+16*25]     ;tmp[22]
   5613    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp[9 ]
   5614    psubsw                  m2, m0, m5                    ;out41
   5615    paddsw                  m0, m5                        ;out22
   5616    psubsw                  m3, m1, m4                    ;out54
   5617    paddsw                  m1, m4                        ;out9
   5618    mova [rsp+gprsize*2+16*44], m2                        ;out41
   5619    mova [rsp+gprsize*2+16*25], m0                        ;out22
   5620    mova [rsp+gprsize*2+16*57], m3                        ;out54
   5621    mova [rsp+gprsize*2+16*12], m1                        ;out9
   5622 
   5623    mova                    m0, [rsp+gprsize*2+16*42]     ;t39a
   5624    mova                    m5, [rsp+gprsize*2+16*43]     ;t40a
   5625    mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
   5626    mova                    m1, [rsp+gprsize*2+16*59]     ;t56a
   5627    mova                    m2, [rsp+gprsize*2+16*27]     ;tmp[24]
   5628    psubsw                  m4, m0, m5                    ;t40
   5629    paddsw                  m0, m5                        ;t39
   5630    psubsw                  m5, m1, m3                    ;t55
   5631    paddsw                  m1, m3                        ;t56
   5632    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t40a, t55a
   5633    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp[7 ]
   5634    psubsw                  m6, m2, m0                    ;out39
   5635    paddsw                  m2, m0                        ;out24
   5636    psubsw                  m0, m3, m1                    ;out56
   5637    paddsw                  m3, m1                        ;out7
   5638    mova [rsp+gprsize*2+16*42], m6                        ;out39
   5639    mova [rsp+gprsize*2+16*27], m2                        ;out24
   5640    mova [rsp+gprsize*2+16*59], m0                        ;out56
   5641    mova [rsp+gprsize*2+16*10], m3                        ;out7
   5642    mova                    m0, [rsp+gprsize*2+16*26]     ;tmp[23]
   5643    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp[8 ]
   5644    psubsw                  m2, m0, m5                    ;out40
   5645    paddsw                  m0, m5                        ;out23
   5646    psubsw                  m3, m1, m4                    ;out55
   5647    paddsw                  m1, m4                        ;out8
   5648    mova [rsp+gprsize*2+16*43], m2                        ;out40
   5649    mova [rsp+gprsize*2+16*26], m0                        ;out23
   5650    mova [rsp+gprsize*2+16*58], m3                        ;out55
   5651    mova [rsp+gprsize*2+16*11], m1                        ;out8
   5652 
   5653    mova                    m0, [rsp+gprsize*2+16*37]     ;t34
   5654    mova                    m5, [rsp+gprsize*2+16*48]     ;t45
   5655    mova                    m3, [rsp+gprsize*2+16*53]     ;t50
   5656    mova                    m1, [rsp+gprsize*2+16*64]     ;t61
   5657    mova                    m2, [rsp+gprsize*2+16*32]     ;tmp[29]
   5658    psubsw                  m4, m0, m5                    ;t45a
   5659    paddsw                  m0, m5                        ;t34a
   5660    psubsw                  m5, m1, m3                    ;t50a
   5661    paddsw                  m1, m3                        ;t61a
   5662    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50
   5663    mova                    m3, [rsp+gprsize*2+16*5 ]     ;tmp[2 ]
   5664    psubsw                  m6, m2, m0                    ;out34
   5665    paddsw                  m2, m0                        ;out29
   5666    psubsw                  m0, m3, m1                    ;out61
   5667    paddsw                  m3, m1                        ;out2
   5668    mova [rsp+gprsize*2+16*37], m6                        ;out34
   5669    mova [rsp+gprsize*2+16*32], m2                        ;out29
   5670    mova [rsp+gprsize*2+16*64], m0                        ;out61
   5671    mova [rsp+gprsize*2+16*5 ], m3                        ;out2
   5672    mova                    m0, [rsp+gprsize*2+16*21]     ;tmp[18]
   5673    mova                    m1, [rsp+gprsize*2+16*16]     ;tmp[13]
   5674    psubsw                  m2, m0, m5                    ;out45
   5675    paddsw                  m0, m5                        ;out18
   5676    psubsw                  m3, m1, m4                    ;out50
   5677    paddsw                  m1, m4                        ;out13
   5678    mova [rsp+gprsize*2+16*48], m2                        ;out45
   5679    mova [rsp+gprsize*2+16*21], m0                        ;out18
   5680    mova [rsp+gprsize*2+16*53], m3                        ;out50
   5681    mova [rsp+gprsize*2+16*16], m1                        ;out13
   5682 
   5683    mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
   5684    mova                    m5, [rsp+gprsize*2+16*49]     ;t46a
   5685    mova                    m3, [rsp+gprsize*2+16*52]     ;t49a
   5686    mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
   5687    mova                    m2, [rsp+gprsize*2+16*33]     ;tmp[30]
   5688    psubsw                  m4, m0, m5                    ;t46
   5689    paddsw                  m0, m5                        ;t33
   5690    psubsw                  m5, m1, m3                    ;t49
   5691    paddsw                  m1, m3                        ;t62
   5692    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50
   5693    mova                    m3, [rsp+gprsize*2+16*4 ]     ;tmp[1 ]
   5694    psubsw                  m6, m2, m0                    ;out33
   5695    paddsw                  m2, m0                        ;out30
   5696    psubsw                  m0, m3, m1                    ;out62
   5697    paddsw                  m3, m1                        ;out1
   5698    mova [rsp+gprsize*2+16*36], m6                        ;out33
   5699    mova [rsp+gprsize*2+16*33], m2                        ;out30
   5700    mova [rsp+gprsize*2+16*65], m0                        ;out62
   5701    mova [rsp+gprsize*2+16*4 ], m3                        ;out1
   5702    mova                    m0, [rsp+gprsize*2+16*20]     ;tmp[17]
   5703    mova                    m1, [rsp+gprsize*2+16*17]     ;tmp[14]
   5704    psubsw                  m2, m0, m5                    ;out46
   5705    paddsw                  m0, m5                        ;out17
   5706    psubsw                  m3, m1, m4                    ;out49
   5707    paddsw                  m1, m4                        ;out14
   5708    mova [rsp+gprsize*2+16*49], m2                        ;out46
   5709    mova [rsp+gprsize*2+16*20], m0                        ;out17
   5710    mova [rsp+gprsize*2+16*52], m3                        ;out49
   5711    mova [rsp+gprsize*2+16*17], m1                        ;out14
   5712 
   5713    mova                    m0, [rsp+gprsize*2+16*35]     ;t32
   5714    mova                    m5, [rsp+gprsize*2+16*50]     ;t47
   5715    mova                    m3, [rsp+gprsize*2+16*51]     ;t48
   5716    mova                    m1, [rsp+gprsize*2+16*66]     ;t63
   5717    mova                    m2, [rsp+gprsize*2+16*34]     ;tmp[31]
   5718    psubsw                  m4, m0, m5                    ;t47a
   5719    paddsw                  m0, m5                        ;t32a
   5720    psubsw                  m5, m1, m3                    ;t48a
   5721    paddsw                  m1, m3                        ;t63a
   5722    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t47, t48
   5723    mova                    m3, [rsp+gprsize*2+16*3 ]     ;tmp[0 ]
   5724    psubsw                  m6, m2, m0                    ;out32
   5725    paddsw                  m2, m0                        ;out31
   5726    psubsw                  m0, m3, m1                    ;out63
   5727    paddsw                  m3, m1                        ;out0
   5728    mova [rsp+gprsize*2+16*35], m6                        ;out32
   5729    mova [rsp+gprsize*2+16*34], m2                        ;out31
   5730    mova [rsp+gprsize*2+16*66], m0                        ;out63
   5731    mova [rsp+gprsize*2+16*3 ], m3                        ;out0
   5732    mova                    m0, [rsp+gprsize*2+16*19]     ;tmp[16]
   5733    mova                    m1, [rsp+gprsize*2+16*18]     ;tmp[15]
   5734    psubsw                  m2, m0, m5                    ;out47
   5735    paddsw                  m0, m5                        ;out16
   5736    psubsw                  m3, m1, m4                    ;out48
   5737    paddsw                  m1, m4                        ;out15
   5738    mova [rsp+gprsize*2+16*50], m2                        ;out47
   5739    mova [rsp+gprsize*2+16*19], m0                        ;out16
   5740    mova [rsp+gprsize*2+16*51], m3                        ;out48
   5741    mova [rsp+gprsize*2+16*18], m1                        ;out15
   5742    ret
   5743 
   5744 
   5745 cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
        ; Entry point for the 64x16 DCT_DCT inverse transform that adds its result
        ; into dst (8 bpc, going by the symbol name).
        ; cglobal operands, read per the x86inc convention (TODO confirm against
        ; x86inc.asm): 4 arguments (dst, stride, coeff, eob), 6 GPRs, 8 vector
        ; registers, 16*132 bytes of stack. tx2 names a fifth GPR; in this function
        ; it only holds the address to continue at once .loop has finished.
        ;   eob == 0 -> .dconly: one constant, derived from the first coefficient,
        ;               is added to a 64-byte x 16-row area of dst.
        ;   eob != 0 -> the full transform in idct_64x16_internal_8bpc.
        ; NOTE(review): nothing inside this function jumps to .body; it looks like
        ; a shared entry for other block sizes that preset m0/m1/m2, r3d (row
        ; count) and tx2q (continuation) themselves -- confirm against callers.
   5746 %if ARCH_X86_32
           ; r5 = start of the current section ($$). Presumably the base that o()
           ; addresses constants from on x86-32; o() is defined outside this view.
   5747    LEA                     r5, $$
   5748 %endif
   5749    test                  eobd, eobd
   5750    jz .dconly
   5751 
   5752    call m(idct_64x16_internal_8bpc)
   5753    RET
   5754 
   5755 .dconly:
           ; m1 = low dword of pw_2896x8, m0 = coefficients * m1 (pmulhrsw), and
           ; m2 = low dword of pw_8192. Only word 0 of m0 matters afterwards if
           ; q0000 in .body selects word 0 (see the hedge there).
   5756    movd                    m1, [o(pw_2896x8)]
   5757    pmulhrsw                m0, m1, [coeffq]
   5758    movd                    m2, [o(pw_8192)]
           ; eobd is known to be zero on this path, so this store clears the first
           ; four bytes of the coefficient buffer.
   5759    mov               [coeffq], eobd
           ; r3d = number of rows .loop will process; tx2q = where .loop jumps
           ; when it is done.
   5760    mov                    r3d, 16
   5761    lea                   tx2q, [o(.end)]
   5762 
   5763 .body:
           ; Expects m0 = value to scale, m1/m2 = multipliers, r3d = row count and
           ; tx2q = continuation address. m0 is multiplied in turn by m2, m1 and
           ; pw_2048. Going by the constant names, pmulhrsw by 8192 and by 2048
           ; act as rounding right shifts by 2 and 4 -- the constant tables are
           ; outside this view, TODO confirm.
   5764    pmulhrsw                m0, m2
   5765    movd                    m2, [o(pw_2048)]  ;intentionally rip-relative
   5766    pmulhrsw                m0, m1
   5767    pmulhrsw                m0, m2
           ; Replicate the result into all eight word lanes of m0 (assuming q0000
           ; selects word 0 for every position -- defined outside this view).
   5768    pshuflw                 m0, m0, q0000
   5769    punpcklwd               m0, m0
           ; m7 = 0, the second operand of the byte -> word unpacks below.
   5770    pxor                    m7, m7
   5771 
   5772 .loop:
           ; One iteration handles one row: 4 x 16 destination bytes.
   5773    mova                    m1, [dstq+16*0]
   5774    mova                    m3, [dstq+16*1]
   5775    mova                    m5, [dstq+16*2]
   5776    mova                    m6, [dstq+16*3]
           ; First 32 bytes: widen each byte to a word by interleaving with zero,
           ; add m0, then pack back to bytes with unsigned saturation.
   5777    punpckhbw               m2, m1, m7
   5778    punpcklbw               m1, m7
   5779    punpckhbw               m4, m3, m7
   5780    punpcklbw               m3, m7
   5781    paddw                   m2, m0
   5782    paddw                   m1, m0
   5783    paddw                   m4, m0
   5784    paddw                   m3, m0
   5785    packuswb                m1, m2
   5786    packuswb                m3, m4
           ; Remaining 32 bytes: same sequence, reusing m2/m4 as the high halves.
   5787    punpckhbw               m2, m5, m7
   5788    punpcklbw               m5, m7
   5789    punpckhbw               m4, m6, m7
   5790    punpcklbw               m6, m7
   5791    paddw                   m2, m0
   5792    paddw                   m5, m0
   5793    paddw                   m4, m0
   5794    paddw                   m6, m0
   5795    packuswb                m5, m2
   5796    packuswb                m6, m4
   5797    mova           [dstq+16*0], m1
   5798    mova           [dstq+16*1], m3
   5799    mova           [dstq+16*2], m5
   5800    mova           [dstq+16*3], m6
           ; Step to the next row; repeat while the row counter is still positive.
   5801    add                   dstq, strideq
   5802    dec                    r3d
   5803    jg .loop
           ; Continue at the address supplied in tx2q (.end on the .dconly path).
   5804    jmp                   tx2q
   5805 
   5806 .end:
   5807    RET
   5808 
   5809 
   5810 %macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2
        ; Loads four rows into m0-m3 from [src + stride*0 .. src + stride*3].
        ;   %1 src      - address expression of row 0
        ;   %2 stride   - distance between rows (multiplied by the row index)
        ;   %3 is_rect2 - optional flag, defaults to 0
        ; With is_rect2 non-zero each row is multiplied by pw_2896x8 (pmulhrsw)
        ; while it is loaded. NOTE(review): by its name the constant is 2896*8,
        ; i.e. a scale of 2896/4096 -- the table is outside this view, confirm.
   5811 
   5812 %if %3
           ; m3 carries the multiplier for rows 0-2, so row 3 has to come last:
           ; the final pmulhrsw overwrites the multiplier with the scaled row.
   5813    mova                 m3, [o(pw_2896x8)]
   5814    pmulhrsw             m0, m3, [%1+%2*0]
   5815    pmulhrsw             m1, m3, [%1+%2*1]
   5816    pmulhrsw             m2, m3, [%1+%2*2]
   5817    pmulhrsw             m3, [%1+%2*3]
   5818 %else
           ; Plain copy of the four rows, no scaling.
   5819    mova                 m0, [%1+%2*0]
   5820    mova                 m1, [%1+%2*1]
   5821    mova                 m2, [%1+%2*2]
   5822    mova                 m3, [%1+%2*3]
   5823 %endif
   5824 %endmacro
   5825 
   5826 %macro LOAD_4ROWS_H 2 ;src, stride
        ; Loads four rows into m4-m7 from [src + stride*0 .. src + stride*3].
        ;   %1 src    - address expression of row 0
        ;   %2 stride - distance between rows (multiplied by the row index)
        ; Used directly after LOAD_4ROWS in this file so that m0-m7 are all
        ; populated before a transform .main routine is called. There is no
        ; scaling variant; the rows are copied unchanged.
   5827    mova                 m4, [%1+%2*0]
   5828    mova                 m5, [%1+%2*1]
   5829    mova                 m6, [%1+%2*2]
   5830    mova                 m7, [%1+%2*3]
   5831 %endmacro
   5832 
   5833 cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
   5834    mov                    r3d, 2
   5835    mov  [rsp+gprsize*2+16*67], dstq
   5836    lea                   dstq, [rsp+gprsize+16*68]
   5837 
   5838 .pass1_loop:
   5839    LOAD_4ROWS     coeffq+32*0, 32*8
   5840    pxor                    m4, m4
   5841    REPX          {mova x, m4}, m5, m6, m7
   5842    call  m(idct_8x8_internal_8bpc).main
   5843    SAVE_7ROWS    rsp+gprsize+16*3, 16
   5844 
   5845    pxor                    m4, m4
   5846    LOAD_4ROWS     coeffq+32*4, 32*8
   5847 
   5848    REPX          {mova x, m4}, m5, m6, m7
   5849    call m(idct_16x8_internal_8bpc).main
   5850    mova                    m7, [rsp+gprsize+16*0]
   5851    SAVE_8ROWS   rsp+gprsize+16*11, 16
   5852 
   5853    LOAD_8ROWS     coeffq+32*2, 32*4
   5854    mova   [rsp+gprsize+16*19], m0
   5855    mova   [rsp+gprsize+16*26], m1
   5856    mova   [rsp+gprsize+16*23], m2
   5857    mova   [rsp+gprsize+16*22], m3
   5858    mova   [rsp+gprsize+16*21], m4
   5859    mova   [rsp+gprsize+16*24], m5
   5860    mova   [rsp+gprsize+16*25], m6
   5861    mova   [rsp+gprsize+16*20], m7
   5862 
   5863    call m(idct_8x32_internal_8bpc).main_fast
   5864    SAVE_8ROWS    rsp+gprsize+16*3, 16
   5865 
   5866    LOAD_8ROWS     coeffq+32*1, 32*2
   5867    mova   [rsp+gprsize+16*35], m0                        ;in1
   5868    mova   [rsp+gprsize+16*49], m1                        ;in3
   5869    mova   [rsp+gprsize+16*43], m2                        ;in5
   5870    mova   [rsp+gprsize+16*41], m3                        ;in7
   5871    mova   [rsp+gprsize+16*39], m4                        ;in9
   5872    mova   [rsp+gprsize+16*45], m5                        ;in11
   5873    mova   [rsp+gprsize+16*47], m6                        ;in13
   5874    mova   [rsp+gprsize+16*37], m7                        ;in15
   5875 
   5876    LOAD_8ROWS    coeffq+32*17, 32*2
   5877    mova   [rsp+gprsize+16*63], m0                        ;in17
   5878    mova   [rsp+gprsize+16*53], m1                        ;in19
   5879    mova   [rsp+gprsize+16*55], m2                        ;in21
   5880    mova   [rsp+gprsize+16*61], m3                        ;in23
   5881    mova   [rsp+gprsize+16*59], m4                        ;in25
   5882    mova   [rsp+gprsize+16*57], m5                        ;in27
   5883    mova   [rsp+gprsize+16*51], m6                        ;in29
   5884    mova   [rsp+gprsize+16*65], m7                        ;in31
   5885 
   5886    call m(idct_16x64_internal_8bpc).main
   5887 
   5888    LOAD_8ROWS    rsp+gprsize+16*3, 16
   5889    mova    [rsp+gprsize+16*0], m7
   5890    mova                    m7, [o(pw_8192)]
   5891    lea                   tx2q, [o(.pass1_end)]
   5892    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   5893 
   5894 .pass1_end:
   5895    SAVE_8ROWS     coeffq+32*0, 32
   5896    LOAD_8ROWS   rsp+gprsize+16*11, 16
   5897    mova    [rsp+gprsize+16*0], m7
   5898    mova                    m7, [o(pw_8192)]
   5899    lea                   tx2q, [o(.pass1_end1)]
   5900    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   5901 
   5902 .pass1_end1:
   5903    SAVE_8ROWS     coeffq+32*8, 32
   5904    LOAD_8ROWS   rsp+gprsize+16*19, 16
   5905    mova    [rsp+gprsize+16*0], m7
   5906    mova                    m7, [o(pw_8192)]
   5907    lea                   tx2q, [o(.pass1_end2)]
   5908    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   5909 
   5910 .pass1_end2:
   5911    SAVE_8ROWS    coeffq+32*16, 32
   5912    LOAD_8ROWS   rsp+gprsize+16*27, 16
   5913    mova    [rsp+gprsize+16*0], m7
   5914    mova                    m7, [o(pw_8192)]
   5915    lea                   tx2q, [o(.pass1_end3)]
   5916    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   5917 
   5918 .pass1_end3:
   5919    SAVE_8ROWS    coeffq+32*24, 32
   5920    LOAD_8ROWS   rsp+gprsize+16*35, 16
   5921    mova    [rsp+gprsize+16*0], m7
   5922    mova                    m7, [o(pw_8192)]
   5923    lea                   tx2q, [o(.pass1_end4)]
   5924    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   5925 
   5926 .pass1_end4:
   5927    SAVE_8ROWS       dstq+32*0, 32
   5928    LOAD_8ROWS   rsp+gprsize+16*43, 16
   5929    mova    [rsp+gprsize+16*0], m7
   5930    mova                    m7, [o(pw_8192)]
   5931    lea                   tx2q, [o(.pass1_end5)]
   5932    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   5933 
   5934 .pass1_end5:
   5935    SAVE_8ROWS       dstq+32*8, 32
   5936    LOAD_8ROWS   rsp+gprsize+16*51, 16
   5937    mova    [rsp+gprsize+16*0], m7
   5938    mova                    m7, [o(pw_8192)]
   5939    lea                   tx2q, [o(.pass1_end6)]
   5940    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   5941 
   5942 .pass1_end6:
   5943    SAVE_8ROWS      dstq+32*16, 32
   5944    LOAD_8ROWS   rsp+gprsize+16*59, 16
   5945    mova    [rsp+gprsize+16*0], m7
   5946    mova                    m7, [o(pw_8192)]
   5947    lea                   tx2q, [o(.pass1_end7)]
   5948    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   5949 
   5950 .pass1_end7:
   5951    SAVE_8ROWS      dstq+32*24, 32
   5952 
   5953    add                 coeffq, 16
   5954    add                   dstq, 16
   5955    dec                    r3d
   5956    jg .pass1_loop
   5957 
   5958 .pass2:
   5959    mov                   dstq, [rsp+gprsize*2+16*67]
   5960    sub                 coeffq, 32
   5961    mov                    r3d, 4
   5962 
   5963 .pass2_loop:
   5964    mov  [rsp+gprsize*1+16*67], r3d
   5965 
   5966    LOAD_4ROWS     coeffq+16*0, 32*2
   5967    LOAD_4ROWS_H   coeffq+16*1, 32*2
   5968    call  m(idct_8x8_internal_8bpc).main
   5969    SAVE_7ROWS    rsp+gprsize+16*3, 16
   5970    LOAD_4ROWS     coeffq+16*2, 32*2
   5971    LOAD_4ROWS_H   coeffq+16*3, 32*2
   5972    call m(idct_16x8_internal_8bpc).main
   5973 
   5974    mov                    r3, dstq
   5975    lea                  tx2q, [o(.end)]
   5976    lea                  dstq, [dstq+strideq*8]
   5977    jmp  m(idct_8x8_internal_8bpc).end
   5978 
   5979 .end:
   5980    LOAD_8ROWS   rsp+gprsize+16*3, 16
   5981    mova   [rsp+gprsize+16*0], m7
   5982    lea                  tx2q, [o(.end1)]
   5983    mov                  dstq, r3
   5984    jmp  m(idct_8x8_internal_8bpc).end
   5985 
   5986 .end1:
   5987    pxor                   m7, m7
   5988    REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
   5989 
   5990    add                 coeffq, 16*16
   5991    mov                    r3d, [rsp+gprsize*1+16*67]
   5992    mov                   dstq, [rsp+gprsize*2+16*67]
   5993    add                   dstq, 8
   5994    mov  [rsp+gprsize*2+16*67], dstq
   5995    dec                    r3d
   5996    jg .pass2_loop
   5997 
   5998    mov                    r3d, 4
   5999    lea                 coeffq, [rsp+gprsize+16*68]
   6000 .pass2_loop2:
   6001    mov  [rsp+gprsize*1+16*67], r3d
   6002 
   6003    LOAD_4ROWS     coeffq+16*0, 32*2
   6004    LOAD_4ROWS_H   coeffq+16*1, 32*2
   6005    call  m(idct_8x8_internal_8bpc).main
   6006    SAVE_7ROWS    rsp+gprsize+16*3, 16
   6007    LOAD_4ROWS     coeffq+16*2, 32*2
   6008    LOAD_4ROWS_H   coeffq+16*3, 32*2
   6009    call m(idct_16x8_internal_8bpc).main
   6010 
   6011    mov                    r3, dstq
   6012    lea                  tx2q, [o(.end2)]
   6013    lea                  dstq, [dstq+strideq*8]
   6014    jmp  m(idct_8x8_internal_8bpc).end
   6015 
   6016 .end2:
   6017    LOAD_8ROWS   rsp+gprsize+16*3, 16
   6018    mova   [rsp+gprsize+16*0], m7
   6019    lea                  tx2q, [o(.end3)]
   6020    mov                  dstq, r3
   6021    jmp  m(idct_8x8_internal_8bpc).end
   6022 
   6023 .end3:
   6024 
   6025    add                 coeffq, 16*16
   6026    mov                    r3d, [rsp+gprsize*1+16*67]
   6027    mov                   dstq, [rsp+gprsize*2+16*67]
   6028    add                   dstq, 8
   6029    mov  [rsp+gprsize*2+16*67], dstq
   6030    dec                    r3d
   6031    jg .pass2_loop2
   6032    ret
   6033 
   6034 
   6035 cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
        ; Entry point for the 32x64 DCT_DCT case (8bpc).
        ; cglobal operands (x86inc): 4 arguments loaded, 6 GPRs, 8 XMM registers,
        ; 16*68 bytes of stack, followed by the register names.
        ; eob == 0 takes the .dconly shortcut; any other value runs the two-pass
        ; transform in idct_32x64_internal_8bpc and returns through .end.
   6036 %if ARCH_X86_32
   6037    LEA                     r5, $$  ; x86-32 only: r5 = section start ($$); presumably the base used by o() - confirm
   6038 %endif
   6039    test                  eobd, eobd
   6040    jz .dconly  ; eob == 0 -> DC-only path
   6041    call m(idct_32x64_internal_8bpc)
   6042 .end:  ; also handed to the shared DC-only code below via tx2q
   6043    RET
   6044 
   6045 .dconly:
   6046    movd                    m1, [o(pw_2896x8)]  ; movd: only the low 32 bits of m1 are loaded, the rest is zero
   6047    pmulhrsw                m0, m1, [coeffq]  ; m0 = pmulhrsw(m1, coefficients at coeffq); low lanes carry the scaled first coefficient
   6048    movd                    m2, [o(pw_16384)]  ; NOTE(review): presumably the rounding/scale constant consumed by .body - confirm
   6049    mov               [coeffq], eobd  ; eobd is zero on this path, so this zeroes the first 32 bits of the coefficient buffer
   6050    pmulhrsw                m0, m1  ; second multiply by the same constant
   6051    mov                    r3d, 64  ; NOTE(review): presumably the row count expected by .body - confirm
   6052    lea                   tx2q, [o(.end)]  ; address of .end passed in tx2q, presumably as the continuation after .body
   6053    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body  ; tail-jump into the 32x8 function's shared DC-only body
   6054 
   6055 
   6056 cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Worker for the 32x64 inverse DCT, entered by `call` from
        ; inv_txfm_add_dct_dct_32x64_8bpc. All stack references are written as
        ; rsp+gprsize+..., i.e. shifted by one GPR slot (presumably the return
        ; address pushed by that call).
        ;   pass 1: loops over the coefficient buffer in 16-byte steps and stores
        ;           the results back through coeffq at a 64-byte pitch.
        ;   pass 2: tail-jumps into idct_16x64_internal_8bpc.pass2_loop.
        ; Stack bookkeeping used here:
        ;   [rsp+gprsize*1+16*67] = eob - 136
        ;   [rsp+gprsize*2+16*67] = initial coeffq during pass 1, dstq+8 from .pass2 on
        ; NOTE(review): eob/tx2 are presumably x86inc names for r3/r4 (4th/5th
        ; register names), which is why eob is spilled before r3d is reused - confirm.
   6057    mov                    r4d, 2
   6058    sub                   eobd, 136  ; sign of (eob - 136) selects the fast or the full pass 1
   6059    mov  [rsp+gprsize*1+16*67], eobd  ; keep eob-136 for the per-iteration test in .pass1_loop
   6060    mov                    r3d, 4
   6061    cmovs                  r3d, r4d  ; pass-1 iteration count: 2 if eob < 136, else 4 (mov leaves the flags from sub intact)
   6062 
   6063 %if ARCH_X86_32
   6064    LEA                     r5, $$  ; x86-32 only: r5 = section start; presumably the base used by o() - confirm
   6065 %endif
   6066 
   6067    mov  [rsp+gprsize*2+16*67], coeffq  ; remember the start of the coefficient buffer for pass 2
   6068 
   6069 .pass1_loop:
        ; Inputs 1,3,...,15 (base coeffq+64*1, step 64*2) are spilled to stack
        ; slots 19..26 in a permuted order, presumably the layout expected by
        ; idct_8x32_internal_8bpc.main/.main_fast. The trailing macro argument 1
        ; is interpreted by LOAD_8ROWS, which is defined outside this view.
   6070    LOAD_8ROWS     coeffq+64*1, 64*2, 1
   6071    mova   [rsp+gprsize+16*19], m0                        ;in1
   6072    mova   [rsp+gprsize+16*26], m1                        ;in3
   6073    mova   [rsp+gprsize+16*23], m2                        ;in5
   6074    mova   [rsp+gprsize+16*22], m3                        ;in7
   6075    mova   [rsp+gprsize+16*21], m4                        ;in9
   6076    mova   [rsp+gprsize+16*24], m5                        ;in11
   6077    mova   [rsp+gprsize+16*25], m6                        ;in13
   6078    mova   [rsp+gprsize+16*20], m7                        ;in15
   6079 
   6080    mov                   tx2d, [rsp+gprsize*1+16*67]  ; tx2d is used as a scratch register for eob-136
   6081    test                  tx2d, tx2d
   6082    jl .fast  ; eob < 136 -> reduced-input path
   6083 
   6084 .full:
   6085    LOAD_8ROWS     coeffq+64*0, 64*4, 1  ; inputs 0,4,...,28
   6086    call  m(idct_8x8_internal_8bpc).main
   6087    SAVE_7ROWS    rsp+gprsize+16*3, 16  ; park the 8x8 stage output in stack slots 3..
   6088    LOAD_8ROWS     coeffq+64*2, 64*4, 1  ; inputs 2,6,...,30
   6089    call m(idct_16x8_internal_8bpc).main
   6090    mova                    m7, [rsp+gprsize+16*0]  ; reload m7 from stack slot 0 before saving all 8 rows
   6091    SAVE_8ROWS   rsp+gprsize+16*11, 16
   6092 
   6093    LOAD_8ROWS    coeffq+64*17, 64*2, 1  ; inputs 17,19,...,31, spilled to slots 27..34
   6094    mova   [rsp+gprsize+16*33], m0                        ;in17
   6095    mova   [rsp+gprsize+16*28], m1                        ;in19
   6096    mova   [rsp+gprsize+16*29], m2                        ;in21
   6097    mova   [rsp+gprsize+16*32], m3                        ;in23
   6098    mova   [rsp+gprsize+16*31], m4                        ;in25
   6099    mova   [rsp+gprsize+16*30], m5                        ;in27
   6100    mova   [rsp+gprsize+16*27], m6                        ;in29
   6101    mova   [rsp+gprsize+16*34], m7                        ;in31
   6102 
   6103    call m(idct_8x32_internal_8bpc).main
   6104    jmp .pass1_end
   6105 
   6106 .fast:
   6107    LOAD_4ROWS          coeffq, 256, 1  ; only inputs 0,4,8,12 (256 = 64*4) are read
   6108    pxor                    m4, m4
   6109    REPX          {mova x, m4}, m5, m6, m7  ; m4..m7 = 0 stand in for the inputs that are not loaded
   6110    call  m(idct_8x8_internal_8bpc).main
   6111 
   6112    SAVE_7ROWS    rsp+gprsize+16*3, 16
   6113    LOAD_4ROWS    coeffq+128*1, 256, 1  ; inputs 2,6,10,14 (128 = 64*2)
   6114    pxor                    m4, m4
   6115    REPX          {mova x, m4}, m5, m6, m7
   6116    call m(idct_16x8_internal_8bpc).main
   6117    mova                    m7, [rsp+gprsize+16*0]
   6118    SAVE_8ROWS   rsp+gprsize+16*11, 16
   6119 
   6120    call m(idct_8x32_internal_8bpc).main_fast  ; no in17..in31 were spilled on this path
   6121 
   6122 .pass1_end:
        ; Four rounds of idct_8x8_internal_8bpc.pass1_end follow. Each round is
        ; entered by jmp with the next local label in tx2q (presumably the address
        ; that routine jumps back to - confirm) and its 8 result rows are written
        ; back to the coefficient buffer at a 64-byte pitch.
   6123    mova    [rsp+gprsize+16*0], m7
   6124    lea                   tx2q, [o(.pass1_end1)]
   6125    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6126 
   6127 .pass1_end1:
   6128    SAVE_8ROWS     coeffq+64*0, 64
   6129    LOAD_8ROWS   rsp+gprsize+16*11, 16
   6130    mova    [rsp+gprsize+16*0], m7
   6131    lea                   tx2q, [o(.pass1_end2)]
   6132    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6133 
   6134 .pass1_end2:
   6135    SAVE_8ROWS     coeffq+64*8, 64
   6136    LOAD_8ROWS   rsp+gprsize+16*19, 16
   6137    mova    [rsp+gprsize+16*0], m7
   6138    lea                   tx2q, [o(.pass1_end3)]
   6139    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6140 
   6141 .pass1_end3:
   6142    SAVE_8ROWS    coeffq+64*16, 64
   6143    LOAD_8ROWS   rsp+gprsize+16*27, 16
   6144    mova    [rsp+gprsize+16*0], m7
   6145    lea                   tx2q, [o(.pass1_end4)]
   6146    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6147 
   6148 .pass1_end4:
   6149    SAVE_8ROWS    coeffq+64*24, 64
   6150 
   6151    add                 coeffq, 16  ; step to the next 16 bytes of each row
   6152    dec                    r3d
   6153    jg .pass1_loop
   6154 
   6155 .pass2:
   6156    mov                 coeffq, [rsp+gprsize*2+16*67]  ; rewind to the start of the coefficient buffer
   6157    mov                    r3d, 4  ; pass-2 iteration count
   6158    lea                     r4, [dstq+8]
   6159    mov  [rsp+gprsize*2+16*67], r4  ; slot now holds dstq+8, presumably the destination for the next pass-2 iteration
   6160    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)]  ; NOTE(review): r4 presumably selects the per-iteration continuation inside the 16x64 code - confirm
   6161    jmp m(idct_16x64_internal_8bpc).pass2_loop  ; pass 2 is shared with the 16x64 transform
   6162 
   6163 
   6164 cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
        ; Entry point for the 64x32 DCT_DCT case (8bpc).
        ; cglobal operands (x86inc): 4 arguments loaded, 6 GPRs, 8 XMM registers,
        ; 16*197 bytes of stack, followed by the register names.
        ; eob == 0 takes the .dconly shortcut; any other value runs the two-pass
        ; transform in idct_64x32_internal_8bpc. The .end label is also used as the
        ; continuation by inv_txfm_add_dct_dct_64x64_8bpc's DC-only path.
   6165 %if ARCH_X86_32
   6166    LEA                     r5, $$  ; x86-32 only: r5 = section start ($$); presumably the base used by o() - confirm
   6167 %endif
   6168    test                  eobd, eobd
   6169    jz .dconly  ; eob == 0 -> DC-only path
   6170    call m(idct_64x32_internal_8bpc)
   6171 .end:
   6172    RET
   6173 
   6174 .dconly:
   6175    movd                    m1, [o(pw_2896x8)]  ; movd: only the low 32 bits of m1 are loaded, the rest is zero
   6176    pmulhrsw                m0, m1, [coeffq]  ; m0 = pmulhrsw(m1, coefficients at coeffq); low lanes carry the scaled first coefficient
   6177    movd                    m2, [o(pw_16384)]  ; NOTE(review): presumably the rounding/scale constant consumed by .body - confirm
   6178    pmulhrsw                m0, m1  ; second multiply by the same constant
   6179    mov               [coeffq], eobd  ; eobd is zero on this path, so this zeroes the first 32 bits of the coefficient buffer
   6180    mov                    r3d, 32  ; NOTE(review): presumably the row count expected by .body - confirm
   6181    lea                   tx2q, [o(.end)]  ; address of .end passed in tx2q, presumably as the continuation after .body
   6182    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body  ; tail-jump into the 64x16 function's shared DC-only body
   6183 
   6184 
   6185 cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Worker for the 64x32 inverse DCT, entered by `call` from
        ; inv_txfm_add_dct_dct_64x32_8bpc (stack references are shifted by
        ; gprsize, presumably for the return address).
        ;   pass 1: per iteration produces 64 result rows; the first 32 are written
        ;           back through coeffq and the other 32 through dstq, which during
        ;           pass 1 points at a scratch area on the stack.
        ;   pass 2: runs idct_32x32_internal_8bpc.pass2_loop twice - first on the
        ;           stack scratch area with dst+32, then on the coefficient buffer
        ;           with the original dst.
        ; Stack bookkeeping used here:
        ;   [rsp+gprsize*1+16*67] = eob - 136
        ;   [rsp+gprsize*2+16*67] = original coeffq
        ;   [rsp+gprsize*3+16*67] = original dstq
        ;   [rsp+gprsize*4+16*67] = address of the scratch area (rsp+gprsize+16*69)
   6186    mov                    r4d, 2
   6187    sub                   eobd, 136
   6188    mov  [rsp+gprsize*1+16*67], eobd  ; keep eob-136; it is forwarded to pass 2 in .pass2
   6189    mov                    r3d, 4
   6190    cmovs                  r3d, r4d  ; pass-1 iteration count: 2 if eob < 136, else 4 (mov leaves the flags from sub intact)
   6191 
   6192 %if ARCH_X86_32
   6193    LEA                     r5, $$  ; x86-32 only: r5 = section start; presumably the base used by o() - confirm
   6194 %endif
   6195 
   6196    mov  [rsp+gprsize*2+16*67], coeffq
   6197    mov  [rsp+gprsize*3+16*67], dstq
   6198    lea                   dstq, [rsp+gprsize+16*69]  ; from here until .pass2, dstq is a stack scratch pointer, not the picture
   6199    mov  [rsp+gprsize*4+16*67], dstq
   6200 
   6201 .pass1_loop:
   6202    LOAD_4ROWS     coeffq+64*0, 64*8, 1  ; inputs 0,8,16,24; trailing argument 1 is interpreted by the macro (defined outside this view)
   6203    pxor                    m4, m4
   6204    REPX          {mova x, m4}, m5, m6, m7  ; m4..m7 = 0 stand in for inputs that are not loaded
   6205    call  m(idct_8x8_internal_8bpc).main
   6206    SAVE_7ROWS    rsp+gprsize+16*3, 16
   6207 
   6208    pxor                    m4, m4
   6209    LOAD_4ROWS     coeffq+64*4, 64*8, 1  ; inputs 4,12,20,28
   6210 
   6211    REPX          {mova x, m4}, m5, m6, m7
   6212    call m(idct_16x8_internal_8bpc).main
   6213    mova                    m7, [rsp+gprsize+16*0]  ; reload m7 from stack slot 0 before saving all 8 rows
   6214    SAVE_8ROWS   rsp+gprsize+16*11, 16
   6215 
   6216    LOAD_8ROWS     coeffq+64*2, 64*4, 1  ; inputs 2,6,...,30, spilled to slots 19..26 in permuted order
   6217    mova   [rsp+gprsize+16*19], m0
   6218    mova   [rsp+gprsize+16*26], m1
   6219    mova   [rsp+gprsize+16*23], m2
   6220    mova   [rsp+gprsize+16*22], m3
   6221    mova   [rsp+gprsize+16*21], m4
   6222    mova   [rsp+gprsize+16*24], m5
   6223    mova   [rsp+gprsize+16*25], m6
   6224    mova   [rsp+gprsize+16*20], m7
   6225 
   6226    call m(idct_8x32_internal_8bpc).main_fast
   6227    SAVE_8ROWS    rsp+gprsize+16*3, 16
   6228 
   6229    LOAD_8ROWS     coeffq+64*1, 64*2, 1  ; odd inputs 1..15, spilled to odd-numbered slots 35..49
   6230    mova   [rsp+gprsize+16*35], m0                        ;in1
   6231    mova   [rsp+gprsize+16*49], m1                        ;in3
   6232    mova   [rsp+gprsize+16*43], m2                        ;in5
   6233    mova   [rsp+gprsize+16*41], m3                        ;in7
   6234    mova   [rsp+gprsize+16*39], m4                        ;in9
   6235    mova   [rsp+gprsize+16*45], m5                        ;in11
   6236    mova   [rsp+gprsize+16*47], m6                        ;in13
   6237    mova   [rsp+gprsize+16*37], m7                        ;in15
   6238 
   6239    LOAD_8ROWS    coeffq+64*17, 64*2, 1  ; odd inputs 17..31, spilled to odd-numbered slots 51..65
   6240    mova   [rsp+gprsize+16*63], m0                        ;in17
   6241    mova   [rsp+gprsize+16*53], m1                        ;in19
   6242    mova   [rsp+gprsize+16*55], m2                        ;in21
   6243    mova   [rsp+gprsize+16*61], m3                        ;in23
   6244    mova   [rsp+gprsize+16*59], m4                        ;in25
   6245    mova   [rsp+gprsize+16*57], m5                        ;in27
   6246    mova   [rsp+gprsize+16*51], m6                        ;in29
   6247    mova   [rsp+gprsize+16*65], m7                        ;in31
   6248 
   6249    call m(idct_16x64_internal_8bpc).main
   6250 
        ; Eight rounds of idct_8x8_internal_8bpc.pass1_end follow, one per group of
        ; 8 rows held in stack slots 3,11,...,59. Each round is entered by jmp with
        ; the next local label in tx2q (presumably the address that routine jumps
        ; back to - confirm).
   6251    LOAD_8ROWS    rsp+gprsize+16*3, 16
   6252    mova    [rsp+gprsize+16*0], m7
   6253    lea                   tx2q, [o(.pass1_end)]
   6254    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6255 
   6256 .pass1_end:
   6257    SAVE_8ROWS     coeffq+64*0, 64
   6258    LOAD_8ROWS   rsp+gprsize+16*11, 16
   6259    mova    [rsp+gprsize+16*0], m7
   6260    lea                   tx2q, [o(.pass1_end1)]
   6261    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6262 
   6263 .pass1_end1:
   6264    SAVE_8ROWS     coeffq+64*8, 64
   6265    LOAD_8ROWS   rsp+gprsize+16*19, 16
   6266    mova    [rsp+gprsize+16*0], m7
   6267    lea                   tx2q, [o(.pass1_end2)]
   6268    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6269 
   6270 .pass1_end2:
   6271    SAVE_8ROWS    coeffq+64*16, 64
   6272    LOAD_8ROWS   rsp+gprsize+16*27, 16
   6273    mova    [rsp+gprsize+16*0], m7
   6274    lea                   tx2q, [o(.pass1_end3)]
   6275    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6276 
   6277 .pass1_end3:
   6278    SAVE_8ROWS    coeffq+64*24, 64
   6279    LOAD_8ROWS   rsp+gprsize+16*35, 16
   6280    mova    [rsp+gprsize+16*0], m7
   6281    lea                   tx2q, [o(.pass1_end4)]
   6282    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6283 
   6284 .pass1_end4:
   6285    SAVE_8ROWS       dstq+64*0, 64  ; from here on the rows go to the stack scratch area addressed by dstq
   6286    LOAD_8ROWS   rsp+gprsize+16*43, 16
   6287    mova    [rsp+gprsize+16*0], m7
   6288    lea                   tx2q, [o(.pass1_end5)]
   6289    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6290 
   6291 .pass1_end5:
   6292    SAVE_8ROWS       dstq+64*8, 64
   6293    LOAD_8ROWS   rsp+gprsize+16*51, 16
   6294    mova    [rsp+gprsize+16*0], m7
   6295    lea                   tx2q, [o(.pass1_end6)]
   6296    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6297 
   6298 .pass1_end6:
   6299    SAVE_8ROWS      dstq+64*16, 64
   6300    LOAD_8ROWS   rsp+gprsize+16*59, 16
   6301    mova    [rsp+gprsize+16*0], m7
   6302    lea                   tx2q, [o(.pass1_end7)]
   6303    jmp   m(idct_8x8_internal_8bpc).pass1_end
   6304 
   6305 .pass1_end7:
   6306    SAVE_8ROWS      dstq+64*24, 64
   6307 
   6308    add                 coeffq, 16  ; step both output areas to the next 16 bytes of each row
   6309    add                   dstq, 16
   6310    dec                    r3d
   6311    jg .pass1_loop
   6312 
   6313 .pass2:
   6314    mov                 coeffq, [rsp+gprsize*4+16*67]  ; first pass-2 run reads the stack scratch area
   6315    mov                   dstq, [rsp+gprsize*3+16*67]  ; restore the real destination pointer
   6316    mov                   eobd, [rsp+gprsize*1+16*67]
   6317    lea                   dstq, [dstq+32]  ; ...offset by 32 bytes
   6318    mov  [rsp+gprsize*1+16*35], eobd  ; NOTE(review): presumably the slot where the 32x32 pass-2 code expects eob-136 - confirm
   6319    lea                   tx2q, [o(.pass2_end)]
   6320    mov                    r3d, 4  ; pass-2 iteration count
   6321    jmp m(idct_32x32_internal_8bpc).pass2_loop
   6322 
   6323 .pass2_end:
   6324    mova    [rsp+gprsize+16*0], m7
   6325    lea                     r3, [o(.pass2_end1)]  ; continuation address handed to idct_8x32's .end2 in r3
   6326    jmp  m(idct_8x32_internal_8bpc).end2
   6327 
   6328 .pass2_end1:
   6329    lea                   tx2q, [o(.pass2_end)]
   6330    add                 coeffq, 16*32  ; advance the pass-2 source by 512 bytes
   6331    mov                   dstq, [rsp+gprsize*2+16*35]  ; NOTE(review): dstq and the loop counter are reloaded from the 16*35 slots,
   6332    mov                    r3d, [rsp+gprsize*3+16*35]  ; presumably stored there by the 32x32 pass2_loop - confirm
   6333    dec                    r3d
   6334    jg m(idct_32x32_internal_8bpc).pass2_loop
   6335 
   6336 .pass2_end2:
   6337    mov                   dstq, [rsp+gprsize*3+16*67]  ; second run: original dst ...
   6338    mov                 coeffq, [rsp+gprsize*2+16*67]  ; ... and the coefficient buffer
   6339    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]  ; this time the 32x32 function's own .pass2_end is used, so control does not come back here
   6340    mov                    r3d, 4
   6341    jmp m(idct_32x32_internal_8bpc).pass2_loop
   6342 
   6343 
   6344 cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
        ; Entry point for the 64x64 DCT_DCT case (8bpc).
        ; cglobal operands (x86inc): 4 arguments loaded, 6 GPRs, 8 XMM registers,
        ; 16*197 bytes of stack, followed by the register names.
        ; eob == 0 takes the .dconly shortcut; any other value runs the two-pass
        ; transform in idct_64x64_internal_8bpc.
   6345 %if ARCH_X86_32
   6346    LEA                     r5, $$  ; x86-32 only: r5 = section start ($$); presumably the base used by o() - confirm
   6347 %endif
   6348    test                  eobd, eobd
   6349    jz .dconly  ; eob == 0 -> DC-only path
   6350 
   6351    call m(idct_64x64_internal_8bpc)
   6352    RET
   6353 
   6354 .dconly:
   6355    movd                    m1, [o(pw_2896x8)]  ; movd: only the low 32 bits of m1 are loaded, the rest is zero
   6356    pmulhrsw                m0, m1, [coeffq]  ; single multiply here; the 32x64 and 64x32 entry points apply it twice
   6357    movd                    m2, [o(pw_8192)]  ; pw_8192 here versus pw_16384 in the 32x64/64x32 entry points
   6358    mov               [coeffq], eobd  ; eobd is zero on this path, so this zeroes the first 32 bits of the coefficient buffer
   6359    mov                    r3d, 64  ; NOTE(review): presumably the row count expected by .body - confirm
   6360    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]  ; reuses the 64x32 function's RET; both functions are declared with the same 4, 6, 8, 16*197 parameters
   6361    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body  ; tail-jump into the 64x16 function's shared DC-only body
   6362 
   6363 cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
        ; Worker for the 64x64 inverse DCT, entered by `call` from
        ; inv_txfm_add_dct_dct_64x64_8bpc (stack references are shifted by
        ; gprsize, presumably for the return address).
        ;   pass 1: per iteration produces 64 result rows; the first 32 are written
        ;           back through coeffq and the other 32 through dstq, which during
        ;           pass 1 points at a scratch area on the stack. Unlike the 64x32
        ;           worker, the LOAD_* macros are used without a third argument and
        ;           the rounds go through .pass1_end1 with m7 = pw_8192.
        ;   pass 2: runs idct_16x64_internal_8bpc.pass2_loop twice - first on the
        ;           stack scratch area with dst+32, then on the coefficient buffer
        ;           with the original dst.
        ; Stack bookkeeping used here:
        ;   [rsp+gprsize*1+16*67] = eob - 136
        ;   [rsp+gprsize*2+16*67] = scratch-area address in pass 1, next dst in pass 2
        ;   [rsp+gprsize*3+16*67] = original dstq (until pass 2 starts)
        ;   [rsp+gprsize*4+16*67] = original coeffq
   6364    mov                    r5d, 4  ; r5d is a temporary here; on x86-32 r5 is reloaded with $$ right below
   6365    mov                    r4d, 2
   6366    sub                   eobd, 136
   6367    cmovns                 r4d, r5d  ; r4d = 4 if eob >= 136, else 2 (mov leaves the flags from sub intact)
   6368 
   6369 %if ARCH_X86_32
   6370    LEA                     r5, $$  ; x86-32 only: r5 = section start; presumably the base used by o() - confirm
   6371 %endif
   6372 
   6373    mov  [rsp+gprsize*1+16*67], eobd  ; eob-136 is stored before r3d is overwritten with the loop count
   6374    mov                    r3d, r4d  ; pass-1 iteration count
   6375    mov  [rsp+gprsize*4+16*67], coeffq
   6376    mov  [rsp+gprsize*3+16*67], dstq
   6377    lea                   dstq, [rsp+gprsize+16*69]  ; from here until .pass2, dstq is a stack scratch pointer, not the picture
   6378    mov  [rsp+gprsize*2+16*67], dstq
   6379 
   6380 .pass1_loop:
   6381    LOAD_4ROWS     coeffq+64*0, 64*8  ; inputs 0,8,16,24
   6382    pxor                    m4, m4
   6383    REPX          {mova x, m4}, m5, m6, m7  ; m4..m7 = 0 stand in for inputs that are not loaded
   6384    call  m(idct_8x8_internal_8bpc).main
   6385    SAVE_7ROWS    rsp+gprsize+16*3, 16
   6386 
   6387    pxor                    m4, m4
   6388    LOAD_4ROWS     coeffq+64*4, 64*8  ; inputs 4,12,20,28
   6389 
   6390    REPX          {mova x, m4}, m5, m6, m7
   6391    call m(idct_16x8_internal_8bpc).main
   6392    mova                    m7, [rsp+gprsize+16*0]  ; reload m7 from stack slot 0 before saving all 8 rows
   6393    SAVE_8ROWS   rsp+gprsize+16*11, 16
   6394 
   6395    LOAD_8ROWS     coeffq+64*2, 64*4  ; inputs 2,6,...,30, spilled to slots 19..26 in permuted order
   6396    mova   [rsp+gprsize+16*19], m0
   6397    mova   [rsp+gprsize+16*26], m1
   6398    mova   [rsp+gprsize+16*23], m2
   6399    mova   [rsp+gprsize+16*22], m3
   6400    mova   [rsp+gprsize+16*21], m4
   6401    mova   [rsp+gprsize+16*24], m5
   6402    mova   [rsp+gprsize+16*25], m6
   6403    mova   [rsp+gprsize+16*20], m7
   6404 
   6405    call m(idct_8x32_internal_8bpc).main_fast
   6406    SAVE_8ROWS    rsp+gprsize+16*3, 16
   6407 
   6408    LOAD_8ROWS     coeffq+64*1, 64*2  ; odd inputs 1..15, spilled to odd-numbered slots 35..49
   6409    mova   [rsp+gprsize+16*35], m0                        ;in1
   6410    mova   [rsp+gprsize+16*49], m1                        ;in3
   6411    mova   [rsp+gprsize+16*43], m2                        ;in5
   6412    mova   [rsp+gprsize+16*41], m3                        ;in7
   6413    mova   [rsp+gprsize+16*39], m4                        ;in9
   6414    mova   [rsp+gprsize+16*45], m5                        ;in11
   6415    mova   [rsp+gprsize+16*47], m6                        ;in13
   6416    mova   [rsp+gprsize+16*37], m7                        ;in15
   6417 
   6418    LOAD_8ROWS    coeffq+64*17, 64*2  ; odd inputs 17..31, spilled to odd-numbered slots 51..65
   6419    mova   [rsp+gprsize+16*63], m0                        ;in17
   6420    mova   [rsp+gprsize+16*53], m1                        ;in19
   6421    mova   [rsp+gprsize+16*55], m2                        ;in21
   6422    mova   [rsp+gprsize+16*61], m3                        ;in23
   6423    mova   [rsp+gprsize+16*59], m4                        ;in25
   6424    mova   [rsp+gprsize+16*57], m5                        ;in27
   6425    mova   [rsp+gprsize+16*51], m6                        ;in29
   6426    mova   [rsp+gprsize+16*65], m7                        ;in31
   6427 
   6428    call m(idct_16x64_internal_8bpc).main
   6429 
        ; Eight rounds of idct_8x8_internal_8bpc.pass1_end1 follow, one per group
        ; of 8 rows held in stack slots 3,11,...,59. Before each round m7 is
        ; spilled to slot 0 and replaced by pw_8192, and tx2q receives the next
        ; local label (presumably the address that routine jumps back to - confirm).
   6430    LOAD_8ROWS    rsp+gprsize+16*3, 16
   6431    mova    [rsp+gprsize+16*0], m7
   6432    mova                    m7, [o(pw_8192)]
   6433    lea                   tx2q, [o(.pass1_end)]
   6434    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   6435 
   6436 .pass1_end:
   6437    SAVE_8ROWS     coeffq+64*0, 64
   6438    LOAD_8ROWS   rsp+gprsize+16*11, 16
   6439    mova    [rsp+gprsize+16*0], m7
   6440    mova                    m7, [o(pw_8192)]
   6441    lea                   tx2q, [o(.pass1_end1)]
   6442    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   6443 
   6444 .pass1_end1:
   6445    SAVE_8ROWS     coeffq+64*8, 64
   6446    LOAD_8ROWS   rsp+gprsize+16*19, 16
   6447    mova    [rsp+gprsize+16*0], m7
   6448    mova                    m7, [o(pw_8192)]
   6449    lea                   tx2q, [o(.pass1_end2)]
   6450    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   6451 
   6452 .pass1_end2:
   6453    SAVE_8ROWS    coeffq+64*16, 64
   6454    LOAD_8ROWS   rsp+gprsize+16*27, 16
   6455    mova    [rsp+gprsize+16*0], m7
   6456    mova                    m7, [o(pw_8192)]
   6457    lea                   tx2q, [o(.pass1_end3)]
   6458    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   6459 
   6460 .pass1_end3:
   6461    SAVE_8ROWS    coeffq+64*24, 64
   6462    LOAD_8ROWS   rsp+gprsize+16*35, 16
   6463    mova    [rsp+gprsize+16*0], m7
   6464    mova                    m7, [o(pw_8192)]
   6465    lea                   tx2q, [o(.pass1_end4)]
   6466    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   6467 
   6468 .pass1_end4:
   6469    SAVE_8ROWS       dstq+64*0, 64  ; from here on the rows go to the stack scratch area addressed by dstq
   6470    LOAD_8ROWS   rsp+gprsize+16*43, 16
   6471    mova    [rsp+gprsize+16*0], m7
   6472    mova                    m7, [o(pw_8192)]
   6473    lea                   tx2q, [o(.pass1_end5)]
   6474    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   6475 
   6476 .pass1_end5:
   6477    SAVE_8ROWS       dstq+64*8, 64
   6478    LOAD_8ROWS   rsp+gprsize+16*51, 16
   6479    mova    [rsp+gprsize+16*0], m7
   6480    mova                    m7, [o(pw_8192)]
   6481    lea                   tx2q, [o(.pass1_end6)]
   6482    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   6483 
   6484 .pass1_end6:
   6485    SAVE_8ROWS      dstq+64*16, 64
   6486    LOAD_8ROWS   rsp+gprsize+16*59, 16
   6487    mova    [rsp+gprsize+16*0], m7
   6488    mova                    m7, [o(pw_8192)]
   6489    lea                   tx2q, [o(.pass1_end7)]
   6490    jmp   m(idct_8x8_internal_8bpc).pass1_end1
   6491 
   6492 .pass1_end7:
   6493    SAVE_8ROWS      dstq+64*24, 64
   6494 
   6495    add                 coeffq, 16  ; step both output areas to the next 16 bytes of each row
   6496    add                   dstq, 16
   6497    dec                    r3d
   6498    jg .pass1_loop
   6499 
   6500 .pass2:
   6501    mov                   dstq, [rsp+gprsize*3+16*67]  ; restore the real destination pointer
   6502    mov                 coeffq, [rsp+gprsize*2+16*67]  ; first pass-2 run reads the stack scratch area
   6503    lea                   dstq, [dstq+32]  ; ...and writes at dst + 32 bytes
   6504    mov                    r3d, 4  ; pass-2 iteration count
   6505    lea                     r4, [dstq+8]
   6506    mov  [rsp+gprsize*2+16*67], r4  ; slot now tracks the destination of the next iteration (current + 8)
   6507    lea                     r4, [o(.pass2_end)]  ; NOTE(review): r4 presumably selects the per-iteration continuation inside the 16x64 code - confirm
   6508    jmp m(idct_16x64_internal_8bpc).pass2_loop
   6509 
   6510 .pass2_end:
   6511    LOAD_8ROWS   rsp+gprsize+16*35, 16
   6512    lea                   dstq, [dstq+strideq*2]  ; move down two picture rows
   6513    lea                     r3, [rsp+16*32+gprsize]  ; r3 = address of stack slot 32, presumably an argument of .write - confirm
   6514    mova    [rsp+gprsize+16*0], m7
   6515    call m(idct_16x64_internal_8bpc).write
   6516    mov                   dstq, [rsp+gprsize*2+16*67]  ; destination for the next iteration
   6517    mov                    r3d, [rsp+gprsize*3+16*67]  ; NOTE(review): loop counter, presumably stored in this slot by the 16x64 pass2_loop (replacing the dst saved at entry) - confirm
   6518    lea                     r4, [dstq+8]
   6519    mov  [rsp+gprsize*2+16*67], r4
   6520    lea                     r4, [o(.pass2_end)]
   6521 
   6522    dec                    r3d
   6523    jg  m(idct_16x64_internal_8bpc).pass2_loop
   6524 
   6525 .pass2_end2:
   6526    mov                 coeffq, [rsp+gprsize*4+16*67]  ; second run reads the coefficient buffer
   6527    mov                   dstq, [rsp+gprsize*2+16*67]
   6528    mov                    r3d, 4
   6529    sub                   dstq, 72  ; slot held dst+32+8*5 = dst+72 if all four iterations came back through .pass2_end, so this recovers the original dst
   6530    lea                     r4, [dstq+8]
   6531    mov  [rsp+gprsize*2+16*67], r4
   6532    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)]  ; this time the 16x64 function's own .end1 is used, so control does not come back here
   6533    jmp m(idct_16x64_internal_8bpc).pass2_loop