tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

itx_avx2.asm (207527B)


      1 ; Copyright © 2018-2021, VideoLAN and dav1d authors
      2 ; Copyright © 2018, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 %if ARCH_X86_64
     30 
     31 SECTION_RODATA 16
     32 
     33 ; Note: The order of (at least some of) those constants matter!
     34 
        ; Word shuffle mask: gathers the even words first, then the odd words
        ; (0 1 4 5 8 9 12 13 | 2 3 6 7 10 11 14 15); used to deinterleave the
        ; packed results produced by the *_PACKED transform macros below.
     35 const deint_shuf, db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
     36 
        ; Emits both packed-word layouts of a butterfly coefficient pair %1/%2
        ; as needed by the pmaddwd-based macros below:
        ;   pw_%1_%2  = { %1,  %2}
        ;   pw_m%2_%1 = {-%2,  %1}
     37 %macro COEF_PAIR 2
     38 pw_%1_%2:  dw  %1, %2
     39 pw_m%2_%1: dw -%2, %1
     40 %endmacro
     41 
     42 ; ADST-only
     43 pw_3803_1321:   dw  3803,  1321
     44 pw_m1321_2482:  dw -1321,  2482
     45 pw_2482_3344:   dw  2482,  3344
     46 pw_m3344_3344:  dw -3344,  3344
     47 pw_m3803_3344:  dw -3803,  3344
     48 pw_m3803_m6688: dw -3803, -6688
     49 pw_2896_m2896:  dw  2896, -2896
     50 
        ; Broadcastable rounding/scaling constants (word values replicated so a
        ; single dword vpbroadcastd fills a whole register).
     51 const pw_5,       times 2 dw 5
     52 const pw_2048,    times 2 dw 2048
     53 const pw_4096,    times 2 dw 4096
     54 const pw_8192,    times 2 dw 8192
     55 const pw_16384,   times 2 dw 16384
     56 const pw_1697x16, times 2 dw 1697*16
     57 const pw_1697x8,  times 2 dw 1697*8
     58 const pw_2896x8,  times 2 dw 2896*8
     59 const pd_2048,    dd 2048
     60 
        ; Butterfly coefficient pairs; COEF_PAIR emits both sign layouts for
        ; each pair, the explicit pw_* labels below cover layouts COEF_PAIR
        ; does not generate.
     61 const pw_2896_2896,  dw  2896, 2896
     62 const pw_m2896_2896, dw -2896, 2896
     63 const pw_1567_3784,  dw  1567, 3784
     64 const pw_m3784_1567, dw -3784, 1567
     65 COEF_PAIR 3784, 1567
     66 COEF_PAIR  201, 4091
     67 COEF_PAIR  995, 3973
     68 COEF_PAIR 1751, 3703
     69 COEF_PAIR 2440, 3290
     70 COEF_PAIR 3035, 2751
     71 COEF_PAIR 3513, 2106
     72 COEF_PAIR 3857, 1380
     73 COEF_PAIR 4052,  601
     74 COEF_PAIR  401, 4076
     75 COEF_PAIR 1931, 3612
     76 COEF_PAIR 3166, 2598
     77 COEF_PAIR 3920, 1189
     78 COEF_PAIR  799, 4017
     79 COEF_PAIR 3406, 2276
     80 pw_m799_m4017:  dw  -799, -4017
     81 const pw_m1567_m3784, dw -1567, -3784
     82 pw_m3406_m2276: dw -3406, -2276
     83 pw_m401_m4076:  dw  -401, -4076
     84 pw_m3166_m2598: dw -3166, -2598
     85 pw_m1931_m3612: dw -1931, -3612
     86 pw_m3920_m1189: dw -3920, -1189
     87 COEF_PAIR 2276, 3406
     88 COEF_PAIR 4017,  799
     90 %macro COEF_X8 1-*
     91 %rep %0
     92    dw %1*8, %1*8
     93    %rotate 1
     94 %endrep
     95 %endmacro
     96 
     97 pw_3703x8:  COEF_X8  3703
     98 pw_1751x8:  COEF_X8  1751
     99 pw_m1380x8: COEF_X8 -1380
    100 pw_3857x8:  COEF_X8  3857
    101 pw_3973x8:  COEF_X8  3973
    102 pw_995x8:   COEF_X8   995
    103 pw_m2106x8: COEF_X8 -2106
    104 pw_3513x8:  COEF_X8  3513
    105 pw_3290x8:  COEF_X8  3290
    106 pw_2440x8:  COEF_X8  2440
    107 pw_m601x8:  COEF_X8  -601
    108 pw_4052x8:  COEF_X8  4052
    109 
        ; Multiplier table for the 64-point iDCT, stored as *8-scaled word
        ; pairs (see COEF_X8); addressed relative to the shared base pointer
        ; via o_idct64_offset below.
    110 const idct64_mul
    111 COEF_X8  4095,   101,  4065,   501,  2967, -2824,  3229, -2520
    112 COEF_X8  3745,  1660,  3564,  2019,  3822, -1474,  3948, -1092
    113 COEF_X8  3996,   897,  3889,  1285,  3461, -2191,  3659, -1842
    114 COEF_X8  3349,  2359,  3102,  2675,  4036,  -700,  4085,  -301
    115 
        ; Pre-scaled (*8) coefficient pairs for the 16-point odd-half stage.
    116 pw_201_4091x8:   dw   201*8, 4091*8
    117 pw_m601_4052x8:  dw  -601*8, 4052*8
    118 pw_995_3973x8:   dw   995*8, 3973*8
    119 pw_m1380_3857x8: dw -1380*8, 3857*8
    120 pw_1751_3703x8:  dw  1751*8, 3703*8
    121 pw_m2106_3513x8: dw -2106*8, 3513*8
    122 pw_2440_3290x8:  dw  2440*8, 3290*8
    123 pw_m2751_3035x8: dw -2751*8, 3035*8
    124 
    125 %define o_idct64_offset idct64_mul - (o_base) - 8  ; idct64_mul relative to the r6 base pointer
    126 
    127 SECTION .text
    128 
    129 ; Code size reduction trickery: Instead of using rip-relative loads with
    130 ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
    131 ; single rip-relative lea and then address things relative from that with
    132 ; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
    133 %define o_base deint_shuf + 128
    134 %define o(x) (r6 - (o_base) + (x))  ; address of constant x relative to the r6 base pointer
    135 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)  ; mangled symbol name of another cglobal function
    137 ; flags: 1 = swap, 2 = interleave, 4: coef_regs
    138 %macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
    139 %if %7 & 4
    140    pmaddwd             m%2, m%5, m%1
    141    pmaddwd             m%1, m%6
    142 %else
    143 %if %7 & 1
    144    vpbroadcastd        m%2, [o(pw_%5_%6)]
    145    vpbroadcastd        m%3, [o(pw_m%6_%5)]
    146 %else
    147    vpbroadcastd        m%2, [o(pw_m%6_%5)]
    148    vpbroadcastd        m%3, [o(pw_%5_%6)]
    149 %endif
    150    pmaddwd             m%2, m%1
    151    pmaddwd             m%1, m%3
    152 %endif
    153    paddd               m%2, m%4
    154    paddd               m%1, m%4
    155 %if %7 & 2
    156    pslld               m%2, 4
    157    psrld               m%1, 12
    158    pblendw             m%1, m%2, 0xaa
    159 %else
    160    psrad               m%2, 12
    161    psrad               m%1, 12
    162    packssdw            m%1, m%2
    163 %endif
    164 %endmacro
    165 
        ; Like ITX_MUL2X_PACK but with two independent coefficient pairs:
        ; builds ymm coefficient registers whose low xmm half holds pair
        ; %6/%7 and whose high half holds pair %8/%9 (vpblendd 0xf0), then
        ; defers to ITX_MUL2X_PACK in register-coefficient mode (flag 4).
    166 ; flags: 1 = swap, 2 = interleave, 4 = coef_regs
    167 %macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags
    168 %if %10 & 1
    169    vpbroadcastd        m%3, [o(pw_%8_%9)]
    170    vpbroadcastd        m%4, [o(pw_m%9_%8)]
    171    vpbroadcastd       xm%2, [o(pw_%6_%7)]
    172    vpblendd            m%2, m%3, 0xf0
    173    vpbroadcastd       xm%3, [o(pw_m%7_%6)]
    174 %else
    175    vpbroadcastd        m%3, [o(pw_m%9_%8)]
    176    vpbroadcastd        m%4, [o(pw_%8_%9)]
    177    vpbroadcastd       xm%2, [o(pw_m%7_%6)]
    178    vpblendd            m%2, m%3, 0xf0
    179    vpbroadcastd       xm%3, [o(pw_%6_%7)]
    180 %endif
    181    vpblendd            m%3, m%4, 0xf0
    182    ITX_MUL2X_PACK       %1, %4, _, %5, %2, %3, (4|%10)
    183 %endmacro
    184 
        ; Full-width butterfly on two separate source registers:
    185 ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
    186 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
        ; Coefficient arguments < 32 are taken as register numbers already
        ; holding the packed coefficients; otherwise they are immediates and
        ; the pw_* constants are broadcast on the fly. The optional 8th
        ; argument names a separate destination for the second result.
    187 %macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
    188    punpckhwd           m%3, m%2, m%1
    189    punpcklwd           m%2, m%1
    190 %if %7 < 32
    191    pmaddwd             m%1, m%7, m%2
    192    pmaddwd             m%4, m%7, m%3
    193 %else
    194    vpbroadcastd        m%1, [o(pw_m%7_%6)]
    195    pmaddwd             m%4, m%3, m%1
    196    pmaddwd             m%1, m%2
    197 %endif
    198    paddd               m%4, m%5
    199    paddd               m%1, m%5
    200    psrad               m%4, 12
    201    psrad               m%1, 12
    202    packssdw            m%1, m%4
    203 %if %7 < 32
    204    pmaddwd             m%3, m%6
    205    pmaddwd             m%2, m%6
    206 %else
    207    vpbroadcastd        m%4, [o(pw_%6_%7)]
    208    pmaddwd             m%3, m%4
    209    pmaddwd             m%2, m%4
    210 %endif
    211    paddd               m%3, m%5
    212    paddd               m%2, m%5
    213    psrad               m%3, 12
    214    psrad               m%2, 12
    215 %if %0 == 8
    216    packssdw            m%8, m%2, m%3
    217 %else
    218    packssdw            m%2, m%3
    219 %endif
    220 %endmacro
    221 
        ; 4-point inverse DCT, one full-width register per coefficient row.
        ; Results are returned in src[1-4] as out0..out3.
    222 %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
    223    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3
    224    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0
    225    psubsw              m%3, m%1, m%2  ; t1 - t2 -> out2
    226    paddsw              m%2, m%1       ; t1 + t2 -> out1
    227    paddsw              m%1, m%4, m%5  ; t0 + t3 -> out0
    228    psubsw              m%4, m%5       ; t0 - t3 -> out3
    229 %endmacro
    230 
        ; 8-point inverse DCT, one full-width register per row (non-packed).
        ; src[1-8] = in0..in7; results returned in src regs as out0..out7.
    231 %macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
    232    ITX_MULSUB_2W        %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
    233    ITX_MULSUB_2W        %2, %8, %9, %10, %11,  799, 4017 ; t4a, t7a
    234    ITX_MULSUB_2W        %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
    235    paddsw              m%9, m%2, m%6  ; t4
    236    psubsw              m%2, m%6       ; t5a
    237    paddsw             m%10, m%8, m%4  ; t7
    238    psubsw              m%8, m%4       ; t6a
    239    ITX_MULSUB_2W        %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0
    240    ITX_MULSUB_2W        %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6
    241    psubsw              m%6, m%1, m%3  ; dct4 out2
    242    paddsw              m%3, m%1       ; dct4 out1
    243    paddsw              m%1, m%5, m%7  ; dct4 out0
    244    psubsw              m%5, m%7       ; dct4 out3
    245    psubsw              m%7, m%3, m%2  ; out6
    246    paddsw              m%2, m%3       ; out1
    247    paddsw              m%3, m%6, m%8  ; out2
    248    psubsw              m%6, m%8       ; out5
    249    psubsw              m%8, m%1, m%10 ; out7
    250    paddsw              m%1, m%10      ; out0
    251    paddsw              m%4, m%5, m%9  ; out3
    252    psubsw              m%5, m%9       ; out4
    253 %endmacro
    254 
        ; Odd-input half of the 16-point inverse DCT (the t8..t15 stage);
        ; the even inputs go through IDCT8_1D separately.
    255 ; in1 = %1, in3  = %2, in5  = %3, in7  = %4
    256 ; in9 = %5, in11 = %6, in13 = %7, in15 = %8
    257 %macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
    258    ITX_MULSUB_2W        %1, %8, %9, %10, %11,  401, 4076 ; t8a,  t15a
    259    ITX_MULSUB_2W        %5, %4, %9, %10, %11, 3166, 2598 ; t9a,  t14a
    260    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
    261    ITX_MULSUB_2W        %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
    262    psubsw              m%9, m%2, m%6 ; t13
    263    paddsw              m%6, m%2      ; t12
    264    psubsw              m%2, m%8, m%4 ; t14
    265    paddsw              m%8, m%4      ; t15
    266    psubsw              m%4, m%7, m%3 ; t10
    267    paddsw              m%3, m%7      ; t11
    268    psubsw              m%7, m%1, m%5 ; t9
    269    paddsw              m%1, m%5      ; t8
        ; m3784 selects the negated-coefficient constant (pw_m3784_1567)
    270    ITX_MULSUB_2W        %2, %7, %5, %10, %11,  1567, 3784 ; t9a,  t14a
    271    ITX_MULSUB_2W        %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
    272    psubsw              m%5, m%1, m%3 ; t11a
    273    paddsw              m%1, m%3      ; t8a
    274    psubsw              m%3, m%7, m%4 ; t13
    275    paddsw              m%7, m%4      ; t14
    276    psubsw              m%4, m%8, m%6 ; t12a
    277    paddsw              m%8, m%6      ; t15a
    278    psubsw              m%6, m%2, m%9 ; t10
    279    paddsw              m%2, m%9      ; t9
    280    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a
    281    ITX_MULSUB_2W        %4, %5, %9, %10, %11, 2896, 2896 ; t11,  t12
    282 %endmacro
    283 
        ; Expands the given macro invocation with XMM (128-bit) register
        ; semantics, then switches back to YMM; lets the packed 1-D
        ; transforms be reused at half width.
    284 %macro WRAP_XMM 1+
    285    INIT_XMM cpuname
    286    %1
    287    INIT_YMM cpuname
    288 %endmacro
    289 
        ; Final 4x4 store: optionally rounds m0/m1 by pw_%5 (pmulhrsw), adds
        ; the two packed coefficient rows to the destination pixels and
        ; writes them back. row[1-4] give the destination row order, which
        ; lets flipped transforms reuse the same tail. Ends with ret (the
        ; caller tail-jumps here).
    290 %macro ITX4_END 4-5 2048 ; row[1-4], rnd
    291 %if %5
    292    vpbroadcastd         m2, [o(pw_%5)]
    293    pmulhrsw             m0, m2
    294    pmulhrsw             m1, m2
    295 %endif
    296    lea                  r2, [dstq+strideq*2]
        ; compute the four row addresses: bit 1 of the row index selects the
        ; r2 (dst+2 rows) base, bit 0 adds one stride
    297 %assign %%i 1
    298 %rep 4
    299    %if %1 & 2
    300        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
    301    %else
    302        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
    303    %endif
    304    %assign %%i %%i + 1
    305    %rotate 1
    306 %endrep
    307    movd                 m2, [%%row_adr1]
    308    pinsrd               m2, [%%row_adr2], 1
    309    movd                 m3, [%%row_adr3]
    310    pinsrd               m3, [%%row_adr4], 1
    311    pmovzxbw             m2, m2
    312    pmovzxbw             m3, m3
    313    paddw                m0, m2
    314    paddw                m1, m3
    315    packuswb             m0, m1
    316    movd       [%%row_adr1], m0
    317    pextrd     [%%row_adr2], m0, 1
    318    pextrd     [%%row_adr3], m0, 2
    319    pextrd     [%%row_adr4], m0, 3
    320    ret
    321 %endmacro
    322 
        ; 4-point inverse Walsh-Hadamard transform, two rows packed per
        ; register (m0 = in0/in2, m1 = in1/in3 after the unpacks).
    323 %macro IWHT4_1D_PACKED 0
    324    punpckhqdq           m3, m0, m1 ; in1 in3
    325    punpcklqdq           m0, m1     ; in0 in2
    326    psubw                m2, m0, m3
    327    paddw                m0, m3
    328    punpckhqdq           m2, m2     ; t2 t2
    329    punpcklqdq           m0, m0     ; t0 t0
    330    psubw                m1, m0, m2
    331    psraw                m1, 1
    332    psubw                m1, m3     ; t1 t3
    333    psubw                m0, m1     ; ____ out0
    334    paddw                m2, m1     ; out3 ____
    335 %endmacro
    336 
    337 INIT_XMM avx2
        ; Lossless 4x4 WHT+WHT inverse with destination accumulation.
        ; Input coefficients are pre-shifted right by 2 per the WHT spec;
        ; the coefficient buffer is cleared for the next block.
    338 cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c
    339    mova                 m0, [cq+16*0]
    340    mova                 m1, [cq+16*1]
    341    pxor                 m2, m2
    342    mova          [cq+16*0], m2 ; zero the coefficients
    343    mova          [cq+16*1], m2
    344    psraw                m0, 2
    345    psraw                m1, 2
    346    IWHT4_1D_PACKED
        ; transpose between the two 1-D passes
    347    punpckhwd            m0, m1
    348    punpcklwd            m3, m1, m2
    349    punpckhdq            m1, m0, m3
    350    punpckldq            m0, m3
    351    IWHT4_1D_PACKED
    352    vpblendd             m0, m2, 0x03
    353    ITX4_END              3, 0, 2, 1, 0 ; rnd=0: WHT output needs no rounding
    354 
        ; Emits the public entry point for a %1 (1st pass) / %2 (2nd pass)
        ; transform of the given size: sets up the r6 constant base pointer,
        ; points tx2q at the 2nd transform's .pass2 label, and for dct_dct
        ; falls through to the caller-supplied dc-only fast path when
        ; eob == 0.
    355 %macro INV_TXFM_FN 3 ; type1, type2, size
    356 cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2
    357    %define %%p1 m(i%1_%3_internal_8bpc)
    358    lea                  r6, [o_base]
    359    ; Jump to the 1st txfm function if we're not taking the fast path, which
    360    ; in turn performs an indirect jump to the 2nd txfm function.
    361    lea                tx2q, [m(i%2_%3_internal_8bpc).pass2]
    362 %ifidn %1_%2, dct_dct
    363    test               eobd, eobd
    364    jnz %%p1
    365 %else
    366    ; jump to the 1st txfm function unless it's located directly after this
    367    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
    368 ALIGN function_align
    369 %%end:
    370 %endif
    371 %endmacro
    372 
        ; 4x4 entry-point wrapper; for dct_dct it appends the dc-only fast
        ; path: broadcast the single dc coefficient, scale it twice by
        ; 2896*8 (pmulhrsw), and reuse the shared iadst end to store.
    373 %macro INV_TXFM_4X4_FN 2 ; type1, type2
    374    INV_TXFM_FN          %1, %2, 4x4
    375 %ifidn %1_%2, dct_dct
    376    vpbroadcastw         m0, [cq]
    377    vpbroadcastd         m1, [o(pw_2896x8)]
    378    pmulhrsw             m0, m1
    379    mov                [cq], eobd ; 0
    380    pmulhrsw             m0, m1
    381    mova                 m1, m0
    382    jmp m(iadst_4x4_internal_8bpc).end2
    383 %endif
    384 %endmacro
    385 
        ; 4-point inverse DCT with two rows packed per register; outputs
        ; land as m0 = out0|out1 and m1 = out3|out2 (note the swapped
        ; second pair, accounted for by the callers' store order).
    386 %macro IDCT4_1D_PACKED 0
    387    vpbroadcastd         m4, [o(pd_2048)]
    388    punpckhwd            m2, m1, m0
    389    punpcklwd            m1, m0
    390    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784
    391    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896
    392    paddsw               m0, m1, m2 ; out0 out1
    393    psubsw               m1, m2     ; out3 out2
    394 %endmacro
    395 
        ; 4-point inverse ADST, two rows packed per register, built from the
        ; 1321/2482/3344/3803 basis multipliers; outputs packed as
        ; m0 = out0|out1, m1 = out2|out3.
    396 %macro IADST4_1D_PACKED 0
    397    punpcklwd            m2, m1, m0
    398    punpckhwd            m3, m1, m0
    399    vpbroadcastd         m5, [o(pw_m3344_3344)]
    400    vpbroadcastd         m0, [o(pw_3803_1321)]
    401    vpbroadcastd         m4, [o(pw_m1321_2482)]
    402    pmaddwd              m1, m5, m2 ; 3344*in3 - 3344*in2
    403    psrld                m5, 16     ; {3344, 0} for isolating in0
    404    pmaddwd              m0, m2
    405    pmaddwd              m2, m4
    406    pmaddwd              m5, m3 ; 3344*in0
    407    paddd                m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
    408    vpbroadcastd         m4, [o(pw_2482_3344)]
    409    vpbroadcastd         m5, [o(pw_m3803_3344)]
    410    pmaddwd              m4, m3
    411    pmaddwd              m5, m3
    412    paddd                m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
    413    vpbroadcastd         m0, [o(pw_m3803_m6688)]
    414    pmaddwd              m3, m0
    415    vpbroadcastd         m0, [o(pd_2048)]
    416    paddd                m2, m0
    417    paddd                m1, m0
    418    paddd                m0, m4
    419    paddd                m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
    420    paddd                m2, m4
    421    paddd                m2, m3
    422    REPX      {psrad x, 12}, m1, m2, m0, m5
    423    packssdw             m0, m5 ; out0 out1
    424    packssdw             m1, m2 ; out2 out3
    425 %endmacro
    426 
    427 INV_TXFM_4X4_FN dct, dct
    428 INV_TXFM_4X4_FN dct, adst
    429 INV_TXFM_4X4_FN dct, flipadst
    430 INV_TXFM_4X4_FN dct, identity
    431 
        ; 4x4 iDCT: pass 1 transforms the rows and transposes, then jumps
        ; to the 2nd-pass (column) code installed in tx2q by INV_TXFM_FN.
    432 cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
    433    mova                 m0, [cq+16*0]
    434    mova                 m1, [cq+16*1]
    435    IDCT4_1D_PACKED
        ; transpose the packed rows (IDCT4 emits out3|out2 in m1)
    436    mova                 m2, [o(deint_shuf)]
    437    shufps               m3, m0, m1, q1331
    438    shufps               m0, m1, q0220
    439    pshufb               m0, m2
    440    pshufb               m1, m3, m2
    441    jmp                tx2q
    442 .pass2:
    443    IDCT4_1D_PACKED
    444    pxor                 m2, m2
    445    mova          [cq+16*0], m2 ; zero the coefficients
    446    mova          [cq+16*1], m2
    447    ITX4_END              0, 1, 3, 2 ; row order compensates for out3|out2 packing
    448 
    449 INV_TXFM_4X4_FN adst, dct
    450 INV_TXFM_4X4_FN adst, adst
    451 INV_TXFM_4X4_FN adst, flipadst
    452 INV_TXFM_4X4_FN adst, identity
    453 
        ; 4x4 iADST; .end/.end2 are shared store tails also used by the
        ; dct_dct dc-only path and the identity 2nd pass.
    454 cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
    455    mova                 m0, [cq+16*0]
    456    mova                 m1, [cq+16*1]
    457    call .main
        ; transpose between passes
    458    punpckhwd            m3, m0, m1
    459    punpcklwd            m0, m1
    460    punpckhwd            m1, m0, m3
    461    punpcklwd            m0, m3
    462    jmp                tx2q
    463 .pass2:
    464    call .main
    465 .end:
    466    pxor                 m2, m2
    467    mova          [cq+16*0], m2 ; zero the coefficients
    468    mova          [cq+16*1], m2
    469 .end2:
    470    ITX4_END              0, 1, 2, 3
    471 ALIGN function_align
    472 cglobal_label .main
    473    IADST4_1D_PACKED
    474    ret
    475 
    476 INV_TXFM_4X4_FN flipadst, dct
    477 INV_TXFM_4X4_FN flipadst, adst
    478 INV_TXFM_4X4_FN flipadst, flipadst
    479 INV_TXFM_4X4_FN flipadst, identity
    480 
        ; 4x4 flipped iADST: reuses the iadst main body; the flip is
        ; realized by the reversed transpose here and the reversed row
        ; order (3,2,1,0) in the final store.
    481 cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
    482    mova                 m0, [cq+16*0]
    483    mova                 m1, [cq+16*1]
    484    call m(iadst_4x4_internal_8bpc).main
    485    punpcklwd            m2, m1, m0
    486    punpckhwd            m1, m0
    487    punpcklwd            m0, m1, m2
    488    punpckhwd            m1, m2
    489    jmp                tx2q
    490 .pass2:
    491    call m(iadst_4x4_internal_8bpc).main
    492 .end:
    493    pxor                 m2, m2
    494    mova          [cq+16*0], m2 ; zero the coefficients
    495    mova          [cq+16*1], m2
    496 .end2:
    497    ITX4_END              3, 2, 1, 0 ; store rows bottom-up
    498 
    499 INV_TXFM_4X4_FN identity, dct
    500 INV_TXFM_4X4_FN identity, adst
    501 INV_TXFM_4X4_FN identity, flipadst
    502 INV_TXFM_4X4_FN identity, identity
    503 
        ; 4x4 identity: each pass scales by x + round(x*1697*8/32768),
        ; i.e. x*(1 + 1697/2048) ~= sqrt(2)*x, the Identity4 scale factor.
    504 cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
    505    mova                 m0, [cq+16*0]
    506    mova                 m1, [cq+16*1]
    507    vpbroadcastd         m3, [o(pw_1697x8)]
    508    pmulhrsw             m2, m3, m0
    509    pmulhrsw             m3, m1
    510    paddsw               m0, m2
    511    paddsw               m1, m3
        ; transpose between passes
    512    punpckhwd            m2, m0, m1
    513    punpcklwd            m0, m1
    514    punpckhwd            m1, m0, m2
    515    punpcklwd            m0, m2
    516    jmp                tx2q
    517 .pass2:
    518    vpbroadcastd         m3, [o(pw_1697x8)]
    519    pmulhrsw             m2, m3, m0
    520    pmulhrsw             m3, m1
    521    paddsw               m0, m2
    522    paddsw               m1, m3
    523    jmp m(iadst_4x4_internal_8bpc).end
    524 
        ; Adds two ymm coefficient registers (8 rows x 4 pixels) to the
        ; destination and stores the result. Caller must set
        ; r2 = dst + 4 rows and r3 = stride*3 beforehand.
    525 %macro WRITE_4X8 2 ; coefs[1-2]
    526    movd                xm4, [dstq+strideq*0]
    527    pinsrd              xm4, [dstq+strideq*1], 1
    528    movd                xm5, [dstq+strideq*2]
    529    pinsrd              xm5, [dstq+r3       ], 1
    530    pinsrd              xm4, [r2  +strideq*0], 2
    531    pinsrd              xm4, [r2  +strideq*1], 3
    532    pinsrd              xm5, [r2  +strideq*2], 2
    533    pinsrd              xm5, [r2  +r3       ], 3
    534    pmovzxbw             m4, xm4
    535    pmovzxbw             m5, xm5
    536    paddw                m4, m%1
    537    paddw                m5, m%2
    538    packuswb             m4, m5
    539    vextracti128        xm5, m4, 1
    540    movd   [dstq+strideq*0], xm4
    541    pextrd [dstq+strideq*1], xm4, 1
    542    pextrd [dstq+strideq*2], xm4, 2
    543    pextrd [dstq+r3       ], xm4, 3
    544    movd   [r2  +strideq*0], xm5
    545    pextrd [r2  +strideq*1], xm5, 1
    546    pextrd [r2  +strideq*2], xm5, 2
    547    pextrd [r2  +r3       ], xm5, 3
    548 %endmacro
    549 
        ; 4x8 entry-point wrapper; the dct_dct dc-only path scales the
        ; single dc coefficient three times by 2896*8 (row pass, column
        ; pass and rectangular-size scale), rounds by pw_2048, broadcasts
        ; the result to all 32 pixels and stores via the shared iadst tail.
    550 %macro INV_TXFM_4X8_FN 2 ; type1, type2
    551    INV_TXFM_FN          %1, %2, 4x8
    552 %ifidn %1_%2, dct_dct
    553    movd                xm1, [o(pw_2896x8)]
    554    pmulhrsw            xm0, xm1, [cq]
    555    movd                xm2, [o(pw_2048)]
    556    mov                [cq], eobd
    557    pmulhrsw            xm0, xm1
    558    pmulhrsw            xm0, xm1
    559    pmulhrsw            xm0, xm2
    560    vpbroadcastw         m0, xm0
    561    mova                 m1, m0
    562    jmp m(iadst_4x8_internal_8bpc).end3
    563 %endif
    564 %endmacro
    565 
        ; 8-point inverse DCT with two rows packed per register
        ; (m0..m3 = in7|in1, in3|in5, in6|in2, in4|in0 after the unpacks);
        ; outputs m0..m3 = out0|out1, out3|out2, out4|out5, out7|out6.
    566 %macro IDCT8_1D_PACKED 0
    567    vpbroadcastd         m6, [o(pd_2048)]
    568    punpckhwd            m5, m3, m0 ; in7 in1
    569    punpckhwd            m4, m1, m2 ; in3 in5
    570    punpcklwd            m3, m1     ; in6 in2
    571    punpcklwd            m2, m0     ; in4 in0
    572    ITX_MUL2X_PACK        5, 0, 1, 6,  799, 4017, 3 ; t4a t7a
    573    ITX_MUL2X_PACK        4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
    574    ITX_MUL2X_PACK        3, 0, 1, 6, 1567, 3784    ; t3 t2
    575    psubsw               m0, m5, m4 ; t5a t6a (interleaved)
    576    paddsw               m4, m5     ; t4  t7  (interleaved)
    577    ITX_MUL2X_PACK        2, 1, 5, 6, 2896, 2896    ; t0 t1
    578    vpbroadcastd         m1, [o(pw_m2896_2896)]
    579    ITX_MUL2X_PACK        0, 1, _, 6, 1, 5, 4 ; t6 t5
    580 %if mmsize > 16
    581    vbroadcasti128       m1, [o(deint_shuf)]
    582    pshufb               m4, m1
    583 %else
    584    pshufb               m4, [o(deint_shuf)]
    585 %endif
    586    psubsw               m1, m2, m3 ; tmp3 tmp2
    587    paddsw               m3, m2     ; tmp0 tmp1
    588    shufps               m2, m4, m0, q1032 ; t7 t6
    589    vpblendd             m4, m0, 0xcc      ; t4 t5
    590    paddsw               m0, m3, m2 ; out0 out1
    591    psubsw               m3, m2     ; out7 out6
    592    psubsw               m2, m1, m4 ; out4 out5
    593    paddsw               m1, m4     ; out3 out2
    594 %endmacro
    595 
        ; 8-point inverse ADST, two rows packed per register. %1 selects the
        ; pass-specific variant: pass 1 emits a transpose-friendly packed
        ; layout, pass 2 the final column order. Outputs with odd indices
        ; are produced negated (-out1/-out3/-out5/-out7); callers undo this
        ; by blending a negated rounding constant before pmulhrsw.
    596 %macro IADST8_1D_PACKED 1 ; pass
    597    vpbroadcastd         m6, [o(pd_2048)]
    598    punpckhwd            m0, m4, m3 ; 0 7
    599    punpckhwd            m1, m5, m2 ; 2 5
    600    punpcklwd            m2, m5     ; 4 3
    601    punpcklwd            m3, m4     ; 6 1
    602 %if %1 == 1
    603    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076, 3 ; t1a t0a
    604    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
    605    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
    606    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
    607    psubsw               m4, m0, m2 ; t5 t4
    608    paddsw               m0, m2     ; t1 t0
    609    psubsw               m5, m1, m3 ; t6 t7
    610    paddsw               m1, m3     ; t2 t3
    611    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
    612    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
    613 %if mmsize > 16
    614    vbroadcasti128       m2, [o(deint_shuf)]
    615 %else
    616    mova                 m2, [o(deint_shuf)]
    617 %endif
        ; swap the words within each pair of m1 (t2 t3 -> t3 t2)
    618    pshuflw              m1, m1, q2301
    619    pshufhw              m1, m1, q2301
    620    psubsw               m3, m0, m1        ; t3 t2
    621    paddsw               m0, m1            ; -out7  out0
    622    psubsw               m1, m4, m5        ; t7 t6
    623    paddsw               m4, m5            ;  out6 -out1
    624    pshufb               m0, m2
    625    pshufb               m4, m2
        ; final +-2896/sqrt2 stage for the middle outputs
    626    vpbroadcastd         m5, [o(pw_m2896_2896)]
    627    pmaddwd              m2, m5, m3
    628    pmaddwd              m5, m1
    629    paddd                m2, m6
    630    paddd                m5, m6
    631    psrad                m2, 12
    632    psrad                m5, 12
    633    packssdw             m2, m5            ; out4 -out5
    634    vpbroadcastd         m5, [o(pw_2896_2896)]
    635    pmaddwd              m3, m5
    636    pmaddwd              m1, m5
    637    paddd                m3, m6
    638    paddd                m1, m6
    639    psrad                m3, 12
    640    psrad                m1, 12
    641    packssdw             m1, m3            ; out2 -out3
    642    punpcklqdq           m3, m4, m0        ; out6 -out7
    643    punpckhqdq           m0, m4            ; out0 -out1
    644 %else
    645    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
    646    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
    647    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
    648    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
    649    psubsw               m4, m0, m2 ; t4 t5
    650    paddsw               m0, m2     ; t0 t1
    651    psubsw               m5, m1, m3 ; t6 t7
    652    paddsw               m1, m3     ; t2 t3
    653    shufps               m2, m5, m4, q1032
    654    punpckhwd            m4, m2
    655    punpcklwd            m5, m2
    656    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
    657    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567    ; t7a t6a
    658    psubsw               m2, m0, m1        ; t2 t3
    659    paddsw               m0, m1            ; out0 -out7
    660    psubsw               m1, m4, m5        ; t7 t6
    661    paddsw               m4, m5            ; out6 -out1
    662    vpbroadcastd         m5, [o(pw_2896x8)]
    663    vpblendd             m3, m0, m4, 0x33  ; out6 -out7
    664    vpblendd             m0, m4, 0xcc      ; out0 -out1
    665    shufps               m4, m2, m1, q1032 ; t3 t7
    666    vpblendd             m1, m2, 0x33      ; t2 t6
    667    psubsw               m2, m1, m4        ; t2-t3 t6-t7
    668    paddsw               m1, m4            ; t2+t3 t6+t7
    669    pmulhrsw             m2, m5            ; out4 -out5
    670    pshufd               m1, m1, q1032
    671    pmulhrsw             m1, m5            ; out2 -out3
    672 %endif
    673 %endmacro
    674 
    675 INIT_YMM avx2
    676 INV_TXFM_4X8_FN dct, dct
    677 INV_TXFM_4X8_FN dct, adst
    678 INV_TXFM_4X8_FN dct, flipadst
    679 INV_TXFM_4X8_FN dct, identity
    680 
        ; 4x8 iDCT: pass 1 pre-scales by 2896/4096 (the rectangular-size
        ; scale factor) and runs two packed 4-pt row transforms in one ymm;
        ; pass 2 splits into xmm halves for the 8-pt packed column iDCT.
    681 cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    682    vpermq               m0, [cq+32*0], q3120
    683    vpermq               m1, [cq+32*1], q3120
    684    vpbroadcastd         m2, [o(pw_2896x8)]
    685    pmulhrsw             m0, m2
    686    pmulhrsw             m1, m2
    687    IDCT4_1D_PACKED
        ; transpose the packed rows
    688    vbroadcasti128       m2, [o(deint_shuf)]
    689    shufps               m3, m0, m1, q1331
    690    shufps               m0, m1, q0220
    691    pshufb               m0, m2
    692    pshufb               m1, m3, m2
    693    jmp                tx2q
    694 .pass2:
    695    vextracti128        xm2, m0, 1
    696    vextracti128        xm3, m1, 1
    697    call .main
    698    vpbroadcastd         m4, [o(pw_2048)]
    699    vinserti128          m0, xm2, 1
    700    vinserti128          m1, xm3, 1
    701    pshufd               m1, m1, q1032
    702    jmp m(iadst_4x8_internal_8bpc).end2
    703 ALIGN function_align
    704 cglobal_label .main
    705    WRAP_XMM IDCT8_1D_PACKED
    706    ret
    707 
    708 INV_TXFM_4X8_FN adst, dct
    709 INV_TXFM_4X8_FN adst, adst
    710 INV_TXFM_4X8_FN adst, flipadst
    711 INV_TXFM_4X8_FN adst, identity
    712 
        ; 4x8 iADST: rows via the shared 8x4 adst main, columns via the
        ; packed 8-pt adst. The .end* tails (rounding, coefficient clear,
        ; final store) are shared with the other 4x8 transforms.
    713 cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    714    vpermq               m0, [cq+32*0], q3120
    715    vpermq               m1, [cq+32*1], q3120
    716    vpbroadcastd         m2, [o(pw_2896x8)]
    717    pmulhrsw             m0, m2
    718    pmulhrsw             m1, m2
    719    call m(iadst_8x4_internal_8bpc).main
        ; transpose between passes
    720    punpckhwd            m3, m0, m1
    721    punpcklwd            m0, m1
    722    punpckhwd            m1, m0, m3
    723    punpcklwd            m0, m3
    724    jmp                tx2q
    725 .pass2:
    726    vextracti128        xm2, m0, 1
    727    vextracti128        xm3, m1, 1
    728    pshufd              xm4, xm0, q1032
    729    pshufd              xm5, xm1, q1032
    730    call .main_pass2
    731    vpbroadcastd         m4, [o(pw_2048)]
    732    vinserti128          m0, xm2, 1
    733    vinserti128          m1, xm3, 1
    734    pxor                 m5, m5
    735    psubw                m5, m4 ; m5 = -2048
    736 .end:
        ; blend +-2048 so the negated odd adst outputs get the right sign
    737    vpblendd             m4, m5, 0xcc
    738 .end2:
    739    pmulhrsw             m0, m4
    740    pmulhrsw             m1, m4
    741    WIN64_RESTORE_XMM
    742    pxor                 m2, m2
    743    mova          [cq+32*0], m2 ; zero the coefficients
    744    mova          [cq+32*1], m2
    745 .end3:
    746    lea                  r2, [dstq+strideq*4]
    747    lea                  r3, [strideq*3]
    748    WRITE_4X8             0, 1
    749    RET
    750 ALIGN function_align
    751 .main_pass1:
    752    WRAP_XMM IADST8_1D_PACKED 1
    753    ret
    754 ALIGN function_align
    755 cglobal_label .main_pass2
    756    WRAP_XMM IADST8_1D_PACKED 2
    757    ret
    758 
    759 INV_TXFM_4X8_FN flipadst, dct
    760 INV_TXFM_4X8_FN flipadst, adst
    761 INV_TXFM_4X8_FN flipadst, flipadst
    762 INV_TXFM_4X8_FN flipadst, identity
    763 
        ; 4x8 flipped iADST: same math as iadst_4x8; the flip is realized
        ; by the reversed transpose in pass 1 and by swapping/reversing the
        ; output registers (and the +-2048 blend polarity) in pass 2.
    764 cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    765    vpermq               m0, [cq+32*0], q3120
    766    vpermq               m1, [cq+32*1], q3120
    767    vpbroadcastd         m2, [o(pw_2896x8)]
    768    pmulhrsw             m0, m2
    769    pmulhrsw             m1, m2
    770    call m(iadst_8x4_internal_8bpc).main
    771    punpcklwd            m3, m1, m0
    772    punpckhwd            m1, m0
    773    punpcklwd            m0, m1, m3
    774    punpckhwd            m1, m3
    775    jmp                tx2q
    776 .pass2:
    777    vextracti128        xm2, m0, 1
    778    vextracti128        xm3, m1, 1
    779    pshufd              xm4, xm0, q1032
    780    pshufd              xm5, xm1, q1032
    781    call m(iadst_4x8_internal_8bpc).main_pass2
    782    vpbroadcastd         m5, [o(pw_2048)]
    783    vinserti128          m3, xm1, 1
    784    vinserti128          m2, xm0, 1
    785    pxor                 m4, m4
    786    psubw                m4, m5 ; m4 = -2048
    787    pshufd               m0, m3, q1032
    788    pshufd               m1, m2, q1032
    789    jmp m(iadst_4x8_internal_8bpc).end
    790 
; Instantiate the four 4x8 inverse-transform entry points whose first
; (row) transform is identity, one per second (column) transform type.
    791 INV_TXFM_4X8_FN identity, dct
    792 INV_TXFM_4X8_FN identity, adst
    793 INV_TXFM_4X8_FN identity, flipadst
    794 INV_TXFM_4X8_FN identity, identity
    795 
;---------------------------------------------------------------------
; 4x8 inverse identity transform, 8 bpc, AVX2.
; Pass 1: transpose while applying the rectangular 2896/4096 pre-scale
; and the identity4 scale (x + x*1697/4096 = x*5793/4096 ~= x*sqrt(2)).
; Pass 2: only a final scale/round (pw_4096) through the shared
; iadst_4x8 store tail.
;---------------------------------------------------------------------
    796 cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    797    vpermq               m2, [cq+32*0], q3120
    798    vpermq               m0, [cq+32*1], q3120
    799    vpbroadcastd         m3, [o(pw_2896x8)]
    800    vpbroadcastd         m4, [o(pw_1697x8)]
    801    punpcklwd            m1, m2, m0
    802    punpckhwd            m2, m0
    803    pmulhrsw             m1, m3 ; * 2896/4096 rectangular scale
    804    pmulhrsw             m2, m3
    805    punpcklwd            m0, m1, m2
    806    punpckhwd            m1, m2
    807    pmulhrsw             m2, m4, m0 ; x*1697/4096, added below for the identity4 scale
    808    pmulhrsw             m4, m1
    809    paddsw               m0, m2
    810    paddsw               m1, m4
    811    jmp                tx2q
    812 .pass2:
    813    vpbroadcastd         m4, [o(pw_4096)]
    814    jmp m(iadst_4x8_internal_8bpc).end2 ; shared round + store + coeff-clear tail
    815 
; Declare a 4x16 inverse-transform entry point via INV_TXFM_FN.
; For dct_dct only, emit a DC-only fast path: scale the single DC
; coefficient (2896/4096 rect scale twice, 16384 inter-pass round,
; 2048 final round), broadcast it to all 4x16 output words and jump
; to the shared 4x16 store tail. eobd is written over the DC coeff,
; which also clears the only nonzero coefficient on this path.
    816 %macro INV_TXFM_4X16_FN 2 ; type1, type2
    817    INV_TXFM_FN          %1, %2, 4x16
    818 %ifidn %1_%2, dct_dct
    819    movd                xm1, [o(pw_2896x8)]
    820    pmulhrsw            xm0, xm1, [cq]
    821    movd                xm2, [o(pw_16384)]
    822    movd                xm3, [o(pw_2048)]
    823    mov                [cq], eobd
    824    pmulhrsw            xm0, xm2
    825    pmulhrsw            xm0, xm1
    826    pmulhrsw            xm0, xm3
    827    vpbroadcastw         m0, xm0
    828    mova                 m1, m0
    829    mova                 m2, m0
    830    mova                 m3, m0
    831    jmp m(iadst_4x16_internal_8bpc).end3
    832 %endif
    833 %endmacro
    834 
; One-dimensional 16-point inverse DCT on packed 16-bit words: each
; register holds two transform points interleaved (see the in*/out*
; annotations per line).
; In:   m0-m7 = packed input coefficients
; Out:  m0-m7 = packed outputs (out0..out15 as annotated)
; Tmp:  m8/m9 butterfly accumulators, m2/m4/m5 scratch inside the
;       ITX_MUL*_PACK rotations, m10 = pd_2048 rounding constant.
; .main2 is an alternate entry for callers that already set m10.
    835 %macro IDCT16_1D_PACKED 0
    836    vpbroadcastd        m10, [o(pd_2048)]
    837 .main2:
; pair up inputs so each ITX_MUL2X_PACK performs one 2-point rotation
    838    punpckhwd            m8, m7, m0 ; dct16 in15 in1
    839    punpcklwd            m9, m4, m0 ; dct4  in2  in0
    840    punpckhwd            m0, m3, m4 ; dct16 in7  in9
    841    punpcklwd            m7, m1     ; dct8  in7  in1
    842    punpckhwd            m1, m6     ; dct16 in3  in13
    843    punpcklwd            m3, m5     ; dct8  in3  in5
    844    punpckhwd            m5, m2     ; dct16 in11 in5
    845    punpcklwd            m6, m2     ; dct4  in3  in1
    846    ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 3 ; t8a  t15a
    847    ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 3 ; t9a  t14a
    848    ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
    849    ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
    850    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 3 ; t4a  t7a
    851    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 3 ; t5a  t6a
    852    ITX_MUL2X_PACK        6, 2, 4, 10, 1567, 3784    ; t3   t2
    853    psubsw               m2, m8, m0 ; t9  t14
    854    paddsw               m8, m0     ; t8  t15
    855    psubsw               m0, m1, m5 ; t10 t13
    856    paddsw               m1, m5     ; t11 t12
    857    vpbroadcastd         m5, [o(pw_m3784_1567)]  ; reuse pw_1567_3784
    858    ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 6   ; t9a  t14a
    859    vpbroadcastd         m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
    860    ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 6   ; t10a t13a
    861    psubsw               m4, m8, m1 ; t11a t12a
    862    paddsw               m8, m1     ; t8a  t15a
    863    psubsw               m1, m7, m3 ; t5a  t6a
    864    paddsw               m7, m3     ; t4   t7
    865    paddsw               m3, m2, m0 ; t9   t14
    866    psubsw               m2, m0     ; t10  t13
; deint_shuf reorders word pairs back to deinterleaved form; a plain
; mova suffices for the 128-bit (WRAP_XMM) instantiation
    867 %if mmsize > 16
    868    vbroadcasti128       m0, [o(deint_shuf)]
    869 %else
    870    mova                 m0, [o(deint_shuf)]
    871 %endif
    872    pshufb               m8, m0
    873    pshufb               m7, m0
    874    pshufb               m3, m0
    875    ITX_MUL2X_PACK        9, 0, 5, 10, 2896, 2896 ; t0   t1
    876    vpbroadcastd         m0, [o(pw_m2896_2896)]
    877    ITX_MUL2X_PACK        4, 5, _, 10, 5, 0, 4    ; t11  t12
    878    vpbroadcastd         m5, [o(pw_2896_2896)]
    879    ITX_MUL2X_PACK        1, 0, _, 10, 0, 5, 4    ; t6   t5
    880    vpbroadcastd         m0, [o(pw_m2896_2896)]
    881    ITX_MUL2X_PACK        2, 0, _, 10, 0, 5, 4    ; t13a t10a
    882    punpckhqdq           m0, m8, m3        ; t15a t14
    883    punpcklqdq           m8, m3            ; t8a  t9
    884    shufps               m5, m4, m2, q1032 ; t12  t13a
    885    vpblendd             m4, m2, 0xcc      ; t11  t10a
    886    shufps               m2, m7, m1, q1032 ; t7 t6
    887    vpblendd             m7, m1, 0xcc      ; t4 t5
; final butterflies: dct4 -> dct8 -> dct16 recombination
    888    psubsw               m1, m9, m6 ; dct4 out3 out2
    889    paddsw               m9, m6     ; dct4 out0 out1
    890    psubsw               m3, m9, m2 ; dct8 out7 out6
    891    paddsw               m9, m2     ; dct8 out0 out1
    892    psubsw               m2, m1, m7 ; dct8 out4 out5
    893    paddsw               m1, m7     ; dct8 out3 out2
    894    psubsw               m7, m9, m0 ; out15 out14
    895    paddsw               m0, m9     ; out0  out1
    896    psubsw               m6, m1, m5 ; out12 out13
    897    paddsw               m1, m5     ; out3  out2
    898    psubsw               m5, m2, m4 ; out11 out10
    899    paddsw               m2, m4     ; out4  out5
    900    psubsw               m4, m3, m8 ; out8  out9
    901    paddsw               m3, m8     ; out7  out6
    902 %endmacro
    903 
; Instantiate the 4x16 entry points with dct as the first transform.
    904 INV_TXFM_4X16_FN dct, dct
    905 INV_TXFM_4X16_FN dct, adst
    906 INV_TXFM_4X16_FN dct, flipadst
    907 INV_TXFM_4X16_FN dct, identity
    908 
;---------------------------------------------------------------------
; 4x16 inverse DCT, 8 bpc, AVX2.
; Pass 1: reuse the 16x4 dct main (same 1-D transform, transposed
; layout), rescale by pw_16384 and transpose, then jump to tx2q.
; Pass 2 (.pass2): split each ymm into its two xmm halves, run the
; packed 16-point DCT on xmm regs (.main = WRAP_XMM IDCT16_1D_PACKED),
; reassemble and tail into the shared iadst_4x16 round/store path.
;---------------------------------------------------------------------
    909 cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
    910    mova                 m0, [cq+32*0]
    911    mova                 m1, [cq+32*1]
    912    mova                 m2, [cq+32*2]
    913    mova                 m3, [cq+32*3]
    914    call m(idct_16x4_internal_8bpc).main
    915    vpbroadcastd         m5, [o(pw_16384)]
    916    punpckhwd            m4, m2, m3
    917    punpcklwd            m2, m3
    918    punpckhwd            m3, m0, m1
    919    punpcklwd            m0, m1
    920    REPX   {pmulhrsw x, m5}, m0, m4, m2, m3 ; inter-pass rescale (x/2, rounded)
    921    punpckhdq            m1, m0, m2
    922    punpckldq            m0, m2
    923    punpckldq            m2, m3, m4
    924    punpckhdq            m3, m4
    925    jmp                tx2q
    926 .pass2:
    927    vextracti128        xm4, m0, 1
    928    vextracti128        xm5, m1, 1
    929    vextracti128        xm6, m2, 1
    930    vextracti128        xm7, m3, 1
    931    call .main ; 16-point DCT on the low xmm halves; WRAP_XMM runs it 128-bit wide
    932    vinserti128          m0, xm4, 1
    933    vinserti128          m1, xm5, 1
    934    vpbroadcastd         m5, [o(pw_2048)]
    935    vinserti128          m2, xm6, 1
    936    vinserti128          m3, xm7, 1
    937    pshufd               m1, m1, q1032
    938    pshufd               m3, m3, q1032
    939    jmp m(iadst_4x16_internal_8bpc).end2 ; shared round + store + coeff-clear tail
    940 ALIGN function_align
    941 cglobal_label .main
    942    WRAP_XMM IDCT16_1D_PACKED
    943    ret
    944 
; Instantiate the 4x16 entry points with adst as the first transform.
    945 INV_TXFM_4X16_FN adst, dct
    946 INV_TXFM_4X16_FN adst, adst
    947 INV_TXFM_4X16_FN adst, flipadst
    948 INV_TXFM_4X16_FN adst, identity
    949 
;---------------------------------------------------------------------
; 4x16 inverse ADST, 8 bpc, AVX2.
; Pass 1: reuse the 16x4 adst main, rescale by pw_16384, transpose,
; jump to tx2q.
; Pass 2 (.pass2): packed 16-point adst (.main/.main2), finish the
; out4-out11 terms with a 2896 multiply, permute into row order, and
; round with mixed-sign +-2048 (some outputs are produced negated).
; Shared tails (also jumped to from the dct/flipadst/identity 4x16
; functions and the dct_dct DC path):
;   .end  - build the +-2048 rounding vector from m5/m6
;   .end2 - apply rounding, clear the coefficient buffer
;   .end3 - store the 4x16 result via two WRITE_4X8 calls
;---------------------------------------------------------------------
    950 cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
    951    mova                 m0, [cq+32*0]
    952    mova                 m1, [cq+32*1]
    953    mova                 m2, [cq+32*2]
    954    mova                 m3, [cq+32*3]
    955    call m(iadst_16x4_internal_8bpc).main
    956    vpbroadcastd         m5, [o(pw_16384)]
    957    punpckhwd            m4, m2, m3
    958    punpcklwd            m2, m3
    959    punpckhwd            m3, m0, m1
    960    punpcklwd            m0, m1
    961    REPX   {pmulhrsw x, m5}, m4, m2, m3, m0 ; inter-pass rescale (x/2, rounded)
    962    punpckhdq            m1, m0, m2
    963    punpckldq            m0, m2
    964    punpckldq            m2, m3, m4
    965    punpckhdq            m3, m4
    966    jmp                tx2q
    967 .pass2:
    968    call .main
    969    vpbroadcastd         m5, [o(pw_2896x8)]
    970    paddsw               m1, m2, m4
    971    psubsw               m2, m4
    972    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
    973    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
    974    vpbroadcastd         m5, [o(pw_2048)]
    975    pshufd               m1, m1, q1032
    976    vpblendd             m4, m1, m0, 0x33
    977    vpblendd             m0, m2, 0x33
    978    vpblendd             m2, m3, 0x33
    979    vpblendd             m3, m1, 0x33
    980    vpermq               m0, m0, q2031
    981    vpermq               m1, m2, q1302
    982    vpermq               m2, m3, q3120
    983    vpermq               m3, m4, q0213
    984    psubw                m6, m7, m5 ; m7 == 0 after .main, so m6 = -2048
    985 .end:
    986    vpblendd             m5, m6, 0xcc ; mix +2048/-2048 to undo the negated outputs while rounding
    987 .end2:
    988    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
    989    WIN64_RESTORE_XMM
    990    pxor                 m4, m4
; clear the coefficient buffer, as required after a decoded block
    991    mova          [cq+32*0], m4
    992    mova          [cq+32*1], m4
    993    mova          [cq+32*2], m4
    994    mova          [cq+32*3], m4
    995 .end3:
    996    lea                  r2, [dstq+strideq*8]
    997    lea                  r3, [strideq*3]
    998    WRITE_4X8             0, 1
    999    lea                dstq, [dstq+strideq*4]
   1000    lea                  r2, [r2  +strideq*4]
   1001    WRITE_4X8             2, 3
   1002    RET
   1003 ALIGN function_align
   1004 .main:
; repack the transposed pass-1 rows into the input order that the
; packed 16-point adst (.main2) expects
   1005    vpblendd             m4, m1, m0, 0xcc
   1006    vpblendd             m1, m0, 0x33
   1007    vpblendd             m5, m2, m3, 0xcc
   1008    vpblendd             m2, m3, 0x33
   1009    vperm2i128           m3, m5, m2, 0x31
   1010    vinserti128          m0, m1, xm4, 1 ; in0  in3  in2  in1
   1011    vperm2i128           m4, m1, m4, 0x31
   1012    vinserti128          m1, m5, xm2, 1 ; in4  in7  in6  in5
   1013    pshufd               m3, m3, q1032  ; in15 in12 in13 in14
   1014    pshufd               m2, m4, q1032  ; in11 in8  in9  in10
; packed 16-point adst body; leaves m7 = 0 for callers
   1015 cglobal_label .main2
   1016    vpbroadcastd         m8, [o(pd_2048)]
   1017    pxor                 m7, m7
   1018    punpckhwd            m4, m3, m0 ; in12 in3  in14 in1
   1019    punpcklwd            m0, m3     ; in0  in15 in2  in13
   1020    punpckhwd            m3, m2, m1 ; in8  in7  in10 in5
   1021    punpcklwd            m1, m2     ; in4  in11 in6  in9
   1022    ITX_MUL4X_PACK        0, 2, 5, 6, 8,  201, 4091,  995, 3973, 3
   1023    ITX_MUL4X_PACK        1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
   1024    ITX_MUL4X_PACK        3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
   1025    ITX_MUL4X_PACK        4, 2, 5, 6, 8, 3857, 1380, 4052,  601, 3
   1026    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
   1027    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
   1028    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
   1029    paddsw               m1, m4     ; t5a  t4a  t7a  t6a
   1030    ITX_MUL4X_PACK        2, 4, 5, 6, 8,  799, 4017, 3406, 2276, 3
   1031    psubw                m6, m7, m5 ; negate the coefficient vector (m7 = 0)
   1032    ITX_MUL2X_PACK        3, 5, _, 8, 6, 4, 6
   1033    vpbroadcastd         m6, [o(pw_m3784_1567)]
   1034    vpbroadcastd         m5, [o(pw_1567_3784)]
   1035    psubsw               m4, m0, m1 ; t5   t4   t7   t6
   1036    paddsw               m0, m1     ; t1   t0   t3   t2
   1037    psubsw               m1, m2, m3 ; t13a t12a t15a t14a
   1038    paddsw               m2, m3     ; t9a  t8a  t11a t10a
   1039    psubw                m3, m7, m6 ; pw_3784_m1567
   1040    vpblendd             m6, m3, 0xf0
   1041    ITX_MUL2X_PACK        4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
   1042    ITX_MUL2X_PACK        1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
   1043    vbroadcasti128       m5, [o(deint_shuf)]
   1044    pshufb               m0, m5
   1045    pshufb               m2, m5
   1046    vperm2i128           m3, m0, m2, 0x31  ; t3   t2   t11a t10a
   1047    vinserti128          m0, xm2, 1        ; t1   t0   t9a  t8a
   1048    vperm2i128           m2, m4, m1, 0x31  ; t7a  t6a  t15  t14
   1049    vinserti128          m4, xm1, 1        ; t4a  t5a  t12  t13
   1050    pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
   1051    psubsw               m1, m0, m3        ; t3a t2a t11 t10
   1052    paddsw               m0, m3     ; -out15  out0   out14 -out1
   1053    paddsw               m3, m4, m2 ; -out3   out12  out2  -out13
   1054    psubsw               m4, m2            ; t6 t7 t14a t15a
   1055    shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
   1056    vpblendd             m4, m1, 0x33      ; t3a t7  t11 t15a
   1057    ret
   1058 ALIGN function_align
; pass-1 variant of the final 2896 stage: done with 32-bit pmaddwd
; precision instead of pmulhrsw (used by the 16x4 callers)
   1059 .main_pass1_end:
   1060    vpbroadcastd         m5, [o(pw_m2896_2896)]
   1061    vpbroadcastd         m6, [o(pw_2896_2896)]
   1062    punpcklwd            m1, m4, m2
   1063    punpckhwd            m4, m2
   1064    pmaddwd              m2, m5, m4
   1065    pmaddwd              m4, m6
   1066    pmaddwd              m5, m1
   1067    pmaddwd              m1, m6
   1068    REPX      {paddd x, m8}, m5, m1, m2, m4 ; + pd_2048 rounding
   1069    REPX      {psrad x, 12}, m5, m2, m1, m4
   1070    packssdw             m2, m5     ; -out11  out8   out10 -out9
   1071    packssdw             m1, m4     ; -out7   out4   out6  -out5
   1072    ret
   1073 
; Instantiate the 4x16 entry points with flipadst as the first transform.
   1074 INV_TXFM_4X16_FN flipadst, dct
   1075 INV_TXFM_4X16_FN flipadst, adst
   1076 INV_TXFM_4X16_FN flipadst, flipadst
   1077 INV_TXFM_4X16_FN flipadst, identity
   1078 
;---------------------------------------------------------------------
; 4x16 inverse flipadst, 8 bpc, AVX2.
; Same structure as iadst_4x16 (whose helpers it calls), but the
; pass-1 transpose and the pass-2 blends/permutes produce the rows in
; reversed order, and the rounding-sign vector is built with the
; opposite polarity before tailing into the shared iadst end path.
;---------------------------------------------------------------------
   1079 cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
   1080    mova                 m0, [cq+32*0]
   1081    mova                 m1, [cq+32*1]
   1082    mova                 m2, [cq+32*2]
   1083    mova                 m3, [cq+32*3]
   1084    call m(iadst_16x4_internal_8bpc).main
   1085    vpbroadcastd         m5, [o(pw_16384)]
; transpose with swapped operand order to flip the output rows
   1086    punpcklwd            m4, m1, m0
   1087    punpckhwd            m1, m0
   1088    punpcklwd            m0, m3, m2
   1089    punpckhwd            m3, m2
   1090    REPX   {pmulhrsw x, m5}, m4, m1, m0, m3 ; inter-pass rescale (x/2, rounded)
   1091    punpckldq            m2, m3, m1
   1092    punpckhdq            m3, m1
   1093    punpckhdq            m1, m0, m4
   1094    punpckldq            m0, m4
   1095    jmp                tx2q
   1096 .pass2:
   1097    call m(iadst_4x16_internal_8bpc).main
   1098    vpbroadcastd         m5, [o(pw_2896x8)]
   1099    paddsw               m1, m2, m4
   1100    psubsw               m2, m4
   1101    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
   1102    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
   1103    vpbroadcastd         m6, [o(pw_2048)]
   1104    pshufd               m1, m1, q1032
   1105    vpblendd             m4, m0, m2, 0x33
   1106    vpblendd             m0, m1, 0xcc
   1107    vpblendd             m1, m3, 0xcc
   1108    vpblendd             m2, m3, 0x33
   1109    vpermq               m0, m0, q3120
   1110    vpermq               m1, m1, q0213
   1111    vpermq               m2, m2, q2031
   1112    vpermq               m3, m4, q1302
   1113    psubw                m5, m7, m6 ; m7 == 0 after .main, so m5 = -2048
   1114    jmp m(iadst_4x16_internal_8bpc).end ; shared round + store + coeff-clear tail
   1115 
; Instantiate the 4x16 entry points with identity as the first transform.
   1116 INV_TXFM_4X16_FN identity, dct
   1117 INV_TXFM_4X16_FN identity, adst
   1118 INV_TXFM_4X16_FN identity, flipadst
   1119 INV_TXFM_4X16_FN identity, identity
   1120 
;---------------------------------------------------------------------
; 4x16 inverse identity transform, 8 bpc, AVX2.
; Pass 1: transpose while scaling; the identity4 scale and the
; inter-pass /2 are fused into one pavgw:
;   avg(x, x*1697/4096) = x*5793/8192 ~= x*sqrt(2)/2,
; with the -1 input special case handled explicitly (see comments).
; Pass 2: identity16 scale (x*2 + x*1697*2/4096 via pw_1697x16 and
; paddsw x,x), then the shared iadst round/store tail.
;---------------------------------------------------------------------
   1121 cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
   1122    mova                 m3, [cq+32*0]
   1123    mova                 m2, [cq+32*1]
   1124    mova                 m4, [cq+32*2]
   1125    mova                 m5, [cq+32*3]
   1126    vpbroadcastd         m8, [o(pw_1697x8)]
   1127    pcmpeqw              m0, m0 ; -1
   1128    punpcklwd            m1, m3, m2
   1129    punpckhwd            m3, m2
   1130    punpcklwd            m2, m4, m5
   1131    punpckhwd            m4, m5
   1132    pmulhrsw             m5, m8, m1 ; x*1697/4096, averaged with x below
   1133    pmulhrsw             m6, m8, m2
   1134    pmulhrsw             m7, m8, m3
   1135    pmulhrsw             m8, m4
   1136    pcmpeqw              m9, m0, m1 ; we want to do a signed avg, but pavgw is
   1137    pxor                 m1, m9     ; unsigned. as long as both signs are equal
   1138    pcmpeqw              m9, m0, m2 ; it still works, but if the input is -1 the
   1139    pxor                 m2, m9     ; pmulhrsw result will become 0 which causes
   1140    pcmpeqw              m9, m0, m3 ; pavgw to output -32768 instead of 0 unless
   1141    pxor                 m3, m9     ; we explicitly deal with that case here.
   1142    pcmpeqw              m0, m4
   1143    pxor                 m4, m0
   1144    pavgw                m1, m5
   1145    pavgw                m2, m6
   1146    pavgw                m3, m7
   1147    pavgw                m4, m8
   1148    punpckldq            m0, m1, m2
   1149    punpckhdq            m1, m2
   1150    punpckldq            m2, m3, m4
   1151    punpckhdq            m3, m4
   1152    jmp                tx2q
   1153 .pass2:
   1154    vpbroadcastd         m8, [o(pw_1697x16)]
   1155    vpbroadcastd         m5, [o(pw_2048)]
   1156    pmulhrsw             m4, m8, m0
   1157    pmulhrsw             m6, m8, m1
   1158    pmulhrsw             m7, m8, m2
   1159    pmulhrsw             m8, m3
   1160    REPX      {paddsw x, x}, m0, m1, m2, m3 ; identity16: double, then add the 1697 term
   1161    paddsw               m0, m4
   1162    paddsw               m1, m6
   1163    paddsw               m2, m7
   1164    paddsw               m3, m8
   1165    jmp m(iadst_4x16_internal_8bpc).end2 ; shared round + store + coeff-clear tail
   1166 
; Add an 8x4 block of 16-bit residuals to the 8-bit destination.
; %1-%2: residual registers (or full operands if not plain numbers),
;        each holding two 8-wide rows; %3-%4: temp register numbers;
; %5-%7: byte offsets of rows 1-3 (defaults stride*1, stride*2, r3).
; Loads four 8-pixel rows, zero-extends to words, adds the residuals,
; packs back with unsigned saturation and stores the four rows.
   1167 %macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
   1168    movq               xm%3, [dstq   ]
   1169    movhps             xm%3, [dstq+%5]
   1170    movq               xm%4, [dstq+%6]
   1171    movhps             xm%4, [dstq+%7]
   1172    pmovzxbw            m%3, xm%3
   1173    pmovzxbw            m%4, xm%4
   1174 %ifnum %1
   1175    paddw               m%3, m%1
   1176 %else
   1177    paddw               m%3, %1
   1178 %endif
   1179 %ifnum %2
   1180    paddw               m%4, m%2
   1181 %else
   1182    paddw               m%4, %2
   1183 %endif
   1184    packuswb            m%3, m%4 ; clamp to [0, 255]
; rows 0/2 end up in the low xmm, rows 1/3 in the high xmm
   1185    vextracti128       xm%4, m%3, 1
   1186    movq          [dstq   ], xm%3
   1187    movhps        [dstq+%6], xm%3
   1188    movq          [dstq+%5], xm%4
   1189    movhps        [dstq+%7], xm%4
%endmacro
   1191 
; Declare an 8x4 inverse-transform entry point via INV_TXFM_FN.
; For dct_dct only, emit a DC-only fast path: apply the rectangular
; 2896/4096 scale twice to the DC coefficient, overwrite it with eobd
; (clearing the only nonzero coefficient on this path), and reuse the
; 8x8 DC-only broadcast/store tail.
   1192 %macro INV_TXFM_8X4_FN 2 ; type1, type2
   1193    INV_TXFM_FN          %1, %2, 8x4
   1194 %ifidn %1_%2, dct_dct
   1195    movd                xm1, [o(pw_2896x8)]
   1196    pmulhrsw            xm0, xm1, [cq]
   1197    mov                [cq], eobd
   1198    pmulhrsw            xm0, xm1
   1199    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
   1200 %endif
   1201 %endmacro
   1202 
; Instantiate the 8x4 entry points with dct as the first transform.
   1203 INV_TXFM_8X4_FN dct, dct
   1204 INV_TXFM_8X4_FN dct, adst
   1205 INV_TXFM_8X4_FN dct, flipadst
   1206 INV_TXFM_8X4_FN dct, identity
   1207 
;---------------------------------------------------------------------
; 8x4 inverse DCT, 8 bpc, AVX2.
; Pass 1: rectangular 2896/4096 pre-scale, 8-point row DCT via the
; 4x8 dct main (same 1-D transform, transposed layout), transpose.
; Pass 2 (.pass2): 4-point column DCT (IDCT4_1D_PACKED), then the
; shared iadst_8x4 round/store tail.
;---------------------------------------------------------------------
   1208 cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
   1209    vpbroadcastd        xm3, [o(pw_2896x8)]
   1210    pmulhrsw            xm0, xm3, [cq+16*0]
   1211    pmulhrsw            xm1, xm3, [cq+16*1]
   1212    pmulhrsw            xm2, xm3, [cq+16*2]
   1213    pmulhrsw            xm3,      [cq+16*3]
   1214    call m(idct_4x8_internal_8bpc).main
   1215    vbroadcasti128       m4, [o(deint_shuf)]
   1216    vinserti128          m3, m1, xm3, 1
   1217    vinserti128          m1, m0, xm2, 1
   1218    shufps               m0, m1, m3, q0220
   1219    shufps               m1, m3, q1331
   1220    pshufb               m0, m4
   1221    pshufb               m1, m4
   1222    jmp                tx2q
   1223 .pass2:
   1224    IDCT4_1D_PACKED
   1225    vpermq               m0, m0, q3120
   1226    vpermq               m1, m1, q2031
   1227    jmp m(iadst_8x4_internal_8bpc).end2 ; shared round + store + coeff-clear tail
   1228 
; Instantiate the 8x4 entry points with adst as the first transform.
   1229 INV_TXFM_8X4_FN adst, dct
   1230 INV_TXFM_8X4_FN adst, adst
   1231 INV_TXFM_8X4_FN adst, flipadst
   1232 INV_TXFM_8X4_FN adst, identity
   1233 
;---------------------------------------------------------------------
; 8x4 inverse ADST, 8 bpc, AVX2.
; Pass 1: rectangular 2896/4096 pre-scale, 8-point row adst via the
; 4x8 adst pass-1 helper, transpose (with the odd half negated via
; psubsw, matching the helper's negated outputs).
; Pass 2 (.pass2): 4-point column adst (.main = IADST4_1D_PACKED).
; Shared tails (also used by the dct/flipadst/identity 8x4 functions):
;   .end  - permute rows to memory order
;   .end2 - pw_2048 round
;   .end3 - clear the coefficient buffer, store via WRITE_8X4
;---------------------------------------------------------------------
   1234 cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
   1235    vpbroadcastd        xm0, [o(pw_2896x8)]
   1236    pshufd              xm4,      [cq+16*0], q1032
   1237    pmulhrsw            xm3, xm0, [cq+16*3]
   1238    pshufd              xm5,      [cq+16*1], q1032
   1239    pmulhrsw            xm2, xm0, [cq+16*2]
   1240    pmulhrsw            xm4, xm0
   1241    pmulhrsw            xm5, xm0
   1242    call m(iadst_4x8_internal_8bpc).main_pass1
   1243    vinserti128        m0, xm2, 1
   1244    vinserti128        m1, xm3, 1
   1245    punpckhwd          m2, m0, m1
   1246    punpcklwd          m0, m1
   1247    pxor               m3, m3
   1248    psubsw             m3, m2 ; negate the odd outputs produced negated by main_pass1
   1249    punpckhwd          m1, m0, m3
   1250    punpcklwd          m0, m3
   1251    jmp              tx2q
   1252 .pass2:
   1253    call .main
   1254 .end:
   1255    vpermq               m0, m0, q3120
   1256    vpermq               m1, m1, q3120
   1257 .end2:
   1258    vpbroadcastd         m2, [o(pw_2048)]
   1259    pmulhrsw             m0, m2
   1260    pmulhrsw             m1, m2
   1261    WIN64_RESTORE_XMM
   1262 .end3:
   1263    pxor                 m2, m2
; clear the coefficient buffer, as required after a decoded block
   1264    mova          [cq+32*0], m2
   1265    mova          [cq+32*1], m2
   1266    lea                  r3, [strideq*3]
   1267    WRITE_8X4             0, 1, 4, 5
   1268    RET
   1269 ALIGN function_align
   1270 cglobal_label .main
   1271    IADST4_1D_PACKED
   1272    ret
   1273 
; Instantiate the 8x4 entry points with flipadst as the first transform.
   1274 INV_TXFM_8X4_FN flipadst, dct
   1275 INV_TXFM_8X4_FN flipadst, adst
   1276 INV_TXFM_8X4_FN flipadst, flipadst
   1277 INV_TXFM_8X4_FN flipadst, identity
   1278 
;---------------------------------------------------------------------
; 8x4 inverse flipadst, 8 bpc, AVX2.
; Same as iadst_8x4 (whose helpers it reuses) with the outputs
; reversed: pass 1 transposes with swapped operand order, pass 2
; swaps/reverses the result rows (q2031 permutes) before the shared
; iadst round/store tail.
;---------------------------------------------------------------------
   1279 cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
   1280    vpbroadcastd        xm0, [o(pw_2896x8)]
   1281    pshufd              xm4,      [cq+16*0], q1032
   1282    pmulhrsw            xm3, xm0, [cq+16*3]
   1283    pshufd              xm5,      [cq+16*1], q1032
   1284    pmulhrsw            xm2, xm0, [cq+16*2]
   1285    pmulhrsw            xm4, xm0
   1286    pmulhrsw            xm5, xm0
   1287    call m(iadst_4x8_internal_8bpc).main_pass1
   1288    vinserti128          m3, xm1, 1
   1289    vinserti128          m2, xm0, 1
   1290    punpckhwd            m1, m3, m2
   1291    punpcklwd            m3, m2
   1292    pxor                 m0, m0
   1293    psubsw               m0, m1 ; negate the odd outputs produced negated by main_pass1
   1294    punpckhwd            m1, m0, m3
   1295    punpcklwd            m0, m3
   1296    jmp                tx2q
   1297 .pass2:
   1298    call m(iadst_8x4_internal_8bpc).main
   1299    mova                 m2, m1
   1300    vpermq               m1, m0, q2031
   1301    vpermq               m0, m2, q2031
   1302    jmp m(iadst_8x4_internal_8bpc).end2 ; shared round + store + coeff-clear tail
   1303 
; Instantiate the 8x4 entry points with identity as the first transform.
   1304 INV_TXFM_8X4_FN identity, dct
   1305 INV_TXFM_8X4_FN identity, adst
   1306 INV_TXFM_8X4_FN identity, flipadst
   1307 INV_TXFM_8X4_FN identity, identity
   1308 
;---------------------------------------------------------------------
; 8x4 inverse identity transform, 8 bpc, AVX2.
; Pass 1: transpose while applying the rectangular 2896/4096 scale,
; then the identity8 scale (x*2 via paddsw x,x).
; Pass 2: identity4 scale (x + x*1697/4096), then the shared
; iadst_8x4 permute/round/store tail.
;---------------------------------------------------------------------
   1309 cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
   1310    mova                xm2, [cq+16*0]
   1311    mova                xm0, [cq+16*1]
   1312    vinserti128          m2, [cq+16*2], 1
   1313    vinserti128          m0, [cq+16*3], 1
   1314    vpbroadcastd         m3, [o(pw_2896x8)]
   1315    punpcklwd            m1, m2, m0
   1316    punpckhwd            m2, m0
   1317    pmulhrsw             m1, m3 ; * 2896/4096 rectangular scale
   1318    pmulhrsw             m2, m3
   1319    punpcklwd            m0, m1, m2
   1320    punpckhwd            m1, m2
   1321    paddsw               m0, m0 ; identity8 scale: x*2
   1322    paddsw               m1, m1
   1323    jmp                tx2q
   1324 .pass2:
   1325    vpbroadcastd         m3, [o(pw_1697x8)]
   1326    pmulhrsw             m2, m3, m0 ; x*1697/4096, added below for the identity4 scale
   1327    pmulhrsw             m3, m1
   1328    paddsw               m0, m2
   1329    paddsw               m1, m3
   1330    jmp m(iadst_8x4_internal_8bpc).end ; shared permute + round + store tail
   1331 
; Declare an 8x8 inverse-transform entry point via INV_TXFM_FN.
; For dct_dct only, emit the DC-only path shared with other 8-wide
; sizes: .dconly / .dconly2 scale the single DC coefficient and
; broadcast it, .dconly_loop adds it to r3d rows of the destination,
; four rows per iteration via WRITE_8X4.
; r3d = row count (8x4 jumps to .dconly2 with its own count; 8x16
; jumps to .dconly with r3d or'd to 16). eobd is written over the DC
; coefficient, clearing the only nonzero coefficient on this path.
   1332 %macro INV_TXFM_8X8_FN 2 ; type1, type2
   1333    INV_TXFM_FN          %1, %2, 8x8
   1334 %ifidn %1_%2, dct_dct
   1335    movd                xm1, [o(pw_2896x8)]
   1336    pmulhrsw            xm0, xm1, [cq]
   1337    movd                xm2, [o(pw_16384)]
   1338    mov                [cq], eobd
   1339    or                  r3d, 8 ; r3d = number of rows to write
   1340 .dconly:
   1341    pmulhrsw            xm0, xm2
   1342 .dconly2:
   1343    movd                xm2, [pw_2048]
   1344    pmulhrsw            xm0, xm1
   1345    lea                  r2, [strideq*3]
   1346    pmulhrsw            xm0, xm2
   1347    vpbroadcastw         m0, xm0 ; broadcast the rounded DC value to all lanes
   1348 .dconly_loop:
   1349    WRITE_8X4             0, 0, 1, 2, strideq*1, strideq*2, r2
   1350    lea                dstq, [dstq+strideq*4]
   1351    sub                 r3d, 4
   1352    jg .dconly_loop
   1353    RET
   1354 %endif
   1355 %endmacro
   1356 
; Instantiate the 8x8 entry points with dct as the first transform.
   1357 INV_TXFM_8X8_FN dct, dct
   1358 INV_TXFM_8X8_FN dct, adst
   1359 INV_TXFM_8X8_FN dct, flipadst
   1360 INV_TXFM_8X8_FN dct, identity
   1361 
;---------------------------------------------------------------------
; 8x8 inverse DCT, 8 bpc, AVX2.
; Pass 1: 8-point row DCT (.main = IDCT8_1D_PACKED, two rows per
; register), pw_16384 inter-pass rescale, transpose.
; Pass 2 (.pass2): 8-point column DCT, then the shared iadst_8x8
; round/store tail with pw_2048.
; No rectangular pre-scale: the block is square.
;---------------------------------------------------------------------
   1362 cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
   1363    vpermq               m0, [cq+32*0], q3120 ; 0 1
   1364    vpermq               m3, [cq+32*3], q3120 ; 6 7
   1365    vpermq               m2, [cq+32*2], q3120 ; 4 5
   1366    vpermq               m1, [cq+32*1], q3120 ; 2 3
   1367    call .main
   1368    shufps               m4, m0, m1, q0220
   1369    shufps               m5, m0, m1, q1331
   1370    shufps               m1, m2, m3, q0220
   1371    shufps               m3, m2, m3, q1331
   1372    vbroadcasti128       m0, [o(deint_shuf)]
   1373    vpbroadcastd         m2, [o(pw_16384)]
   1374    REPX   {pshufb   x, m0}, m4, m5, m1, m3
   1375    REPX   {pmulhrsw x, m2}, m4, m5, m1, m3 ; inter-pass rescale (x/2, rounded)
   1376    vinserti128          m0, m4, xm1, 1
   1377    vperm2i128           m2, m4, m1, 0x31
   1378    vinserti128          m1, m5, xm3, 1
   1379    vperm2i128           m3, m5, m3, 0x31
   1380    jmp                tx2q
   1381 .pass2:
   1382    call .main
   1383    vpbroadcastd         m4, [o(pw_2048)]
   1384    vpermq               m0, m0, q3120
   1385    vpermq               m1, m1, q2031
   1386    vpermq               m2, m2, q3120
   1387    vpermq               m3, m3, q2031
   1388    jmp m(iadst_8x8_internal_8bpc).end2 ; shared round + store + coeff-clear tail
   1389 ALIGN function_align
   1390 cglobal_label .main
   1391    IDCT8_1D_PACKED
   1392    ret
   1393 
; Instantiate the 8x8 entry points with adst as the first transform.
   1394 INV_TXFM_8X8_FN adst, dct
   1395 INV_TXFM_8X8_FN adst, adst
   1396 INV_TXFM_8X8_FN adst, flipadst
   1397 INV_TXFM_8X8_FN adst, identity
   1398 
;---------------------------------------------------------------------
; 8x8 inverse ADST, 8 bpc, AVX2.
; Pass 1: 8-point row adst (.main_pass1 = IADST8_1D_PACKED 1), then
; rescale by +-16384 — the odd outputs come out of the packed adst
; negated, so rounding with -16384 restores their sign — and
; transpose.
; Pass 2 (.pass2): 8-point column adst (.main_pass2), then rounding
; with a half-2048 / half--2048 vector for the same reason.
; Shared tails (also used by the dct/flipadst/identity 8x8 functions):
;   .end  - row permute, .end2/.end3 - apply rounding vector m4,
;   .end4 - clear the coefficient buffer and store via WRITE_8X4.
;---------------------------------------------------------------------
   1399 cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
   1400    vpermq               m4, [cq+32*0], q1302 ; 1 0
   1401    vpermq               m3, [cq+32*3], q3120 ; 6 7
   1402    vpermq               m5, [cq+32*1], q1302 ; 3 2
   1403    vpermq               m2, [cq+32*2], q3120 ; 4 5
   1404    call .main_pass1
   1405    vpbroadcastd         m5, [o(pw_16384)]
   1406    punpcklwd            m4, m0, m1
   1407    punpckhwd            m0, m1
   1408    punpcklwd            m1, m2, m3
   1409    punpckhwd            m2, m3
   1410    pxor                 m3, m3
   1411    psubw                m3, m5 ; negate odd elements during rounding
   1412    pmulhrsw             m4, m5
   1413    pmulhrsw             m0, m3
   1414    pmulhrsw             m1, m5
   1415    pmulhrsw             m2, m3
   1416    punpcklwd            m3, m4, m0
   1417    punpckhwd            m4, m0
   1418    punpcklwd            m0, m1, m2
   1419    punpckhwd            m1, m2
   1420    vperm2i128           m2, m3, m0, 0x31
   1421    vinserti128          m0, m3, xm0, 1
   1422    vperm2i128           m3, m4, m1, 0x31
   1423    vinserti128          m1, m4, xm1, 1
   1424    jmp                tx2q
   1425 .pass2:
   1426    pshufd               m4, m0, q1032
   1427    pshufd               m5, m1, q1032
   1428    call .main_pass2
   1429    vpbroadcastd         m5, [o(pw_2048)]
   1430    vpbroadcastd        xm4, [o(pw_4096)]
   1431    psubw                m4, m5 ; lower half = 2048, upper half = -2048
   1432 .end:
   1433    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
   1434 .end2:
   1435    pmulhrsw             m0, m4
   1436    pmulhrsw             m1, m4
   1437 .end3:
   1438    pmulhrsw             m2, m4
   1439    pmulhrsw             m3, m4
   1440    WIN64_RESTORE_XMM
   1441 .end4:
   1442    pxor                 m4, m4
; clear the coefficient buffer, as required after a decoded block
   1443    mova          [cq+32*0], m4
   1444    mova          [cq+32*1], m4
   1445    mova          [cq+32*2], m4
   1446    mova          [cq+32*3], m4
   1447    lea                  r3, [strideq*3]
   1448    WRITE_8X4             0, 1, 4, 5
   1449    lea                dstq, [dstq+strideq*4]
   1450    WRITE_8X4             2, 3, 4, 5
   1451    RET
   1452 ALIGN function_align
   1453 .main_pass1:
   1454    IADST8_1D_PACKED 1
   1455    ret
   1456 ALIGN function_align
   1457 cglobal_label .main_pass2
   1458    IADST8_1D_PACKED 2
   1459    ret
   1460 
; Instantiate the 8x8 entry points with flipadst as the first transform.
   1461 INV_TXFM_8X8_FN flipadst, dct
   1462 INV_TXFM_8X8_FN flipadst, adst
   1463 INV_TXFM_8X8_FN flipadst, flipadst
   1464 INV_TXFM_8X8_FN flipadst, identity
   1465 
;---------------------------------------------------------------------
; 8x8 inverse flipadst, 8 bpc, AVX2.
; Same as iadst_8x8 (whose .main_pass1/.main_pass2 it calls) with the
; outputs reversed: pass 1 transposes with swapped/negated operand
; order, pass 2 reverses the rows with q2031 permutes and uses the
; opposite-polarity +-2048 rounding vector before the shared iadst
; store tail.
;---------------------------------------------------------------------
   1466 cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
   1467    vpermq               m4, [cq+32*0], q1302 ; 1 0
   1468    vpermq               m3, [cq+32*3], q3120 ; 6 7
   1469    vpermq               m5, [cq+32*1], q1302 ; 3 2
   1470    vpermq               m2, [cq+32*2], q3120 ; 4 5
   1471    call m(iadst_8x8_internal_8bpc).main_pass1
   1472    vpbroadcastd         m5, [o(pw_16384)]
   1473    punpckhwd            m4, m3, m2
   1474    punpcklwd            m3, m2
   1475    punpckhwd            m2, m1, m0
   1476    punpcklwd            m1, m0
   1477    pxor                 m0, m0
   1478    psubw                m0, m5 ; -16384: negate odd elements during rounding
   1479    pmulhrsw             m4, m0
   1480    pmulhrsw             m3, m5
   1481    pmulhrsw             m2, m0
   1482    pmulhrsw             m1, m5
   1483    punpckhwd            m0, m4, m3
   1484    punpcklwd            m4, m3
   1485    punpckhwd            m3, m2, m1
   1486    punpcklwd            m2, m1
   1487    vinserti128          m1, m0, xm3, 1
   1488    vperm2i128           m3, m0, m3, 0x31
   1489    vinserti128          m0, m4, xm2, 1
   1490    vperm2i128           m2, m4, m2, 0x31
   1491    jmp                tx2q
   1492 .pass2:
   1493    pshufd               m4, m0, q1032
   1494    pshufd               m5, m1, q1032
   1495    call m(iadst_8x8_internal_8bpc).main_pass2
   1496    vpbroadcastd         m4, [o(pw_2048)]
   1497    vpbroadcastd        xm5, [o(pw_4096)]
   1498    psubw                m4, m5 ; lower half = -2048, upper half = 2048
   1499    vpermq               m5, m3, q2031
   1500    vpermq               m3, m0, q2031
   1501    vpermq               m0, m2, q2031
   1502    vpermq               m2, m1, q2031
   1503    pmulhrsw             m1, m0, m4
   1504    pmulhrsw             m0, m5, m4
   1505    jmp m(iadst_8x8_internal_8bpc).end3 ; shared round + store + coeff-clear tail
   1506 
; Instantiate the 8x8 inverse-transform entry points pairing identity with
; each supported second transform type.
   1507 INV_TXFM_8X8_FN identity, dct
   1508 INV_TXFM_8X8_FN identity, adst
   1509 INV_TXFM_8X8_FN identity, flipadst
   1510 INV_TXFM_8X8_FN identity, identity
   1511 
; 8x8 inverse identity transform, 8 bits-per-component.
; Pass 1 is a pure load + 8x8 word transpose (no butterflies are needed for
; identity at this size); pass 2 only scales by pw_4096 and reuses the
; shared iadst 8x8 store path.
   1512 cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
   1513    mova                xm3, [cq+16*0]
   1514    mova                xm2, [cq+16*1]
   1515    vinserti128          m3, [cq+16*4], 1
   1516    vinserti128          m2, [cq+16*5], 1
   1517    mova                xm4, [cq+16*2]
   1518    mova                xm0, [cq+16*3]
   1519    vinserti128          m4, [cq+16*6], 1
   1520    vinserti128          m0, [cq+16*7], 1
; Interleave words then dwords to complete the transpose.
   1521    punpcklwd            m1, m3, m2
   1522    punpckhwd            m3, m2
   1523    punpcklwd            m2, m4, m0
   1524    punpckhwd            m4, m0
   1525    punpckldq            m0, m1, m2
   1526    punpckhdq            m1, m2
   1527    punpckldq            m2, m3, m4
   1528    punpckhdq            m3, m4
   1529    jmp                tx2q
   1530 .pass2:
   1531    vpbroadcastd         m4, [o(pw_4096)]
   1532    jmp m(iadst_8x8_internal_8bpc).end
   1533 
; Declare an 8x16 inverse-transform entry point for the (type1, type2) pair.
; For dct_dct only, a DC-only fast path is emitted: it scales the single DC
; coefficient, stores eobd over the coefficient slot (clearing it), sets the
; row count to 16 and tail-calls the shared 8x8 dconly loop.
   1534 %macro INV_TXFM_8X16_FN 2 ; type1, type2
   1535    INV_TXFM_FN          %1, %2, 8x16
   1536 %ifidn %1_%2, dct_dct
   1537    movd                xm1, [o(pw_2896x8)]
   1538    pmulhrsw            xm0, xm1, [cq]
   1539    movd                xm2, [o(pw_16384)]
   1540    mov                [cq], eobd
   1541    pmulhrsw            xm0, xm1
   1542    or                  r3d, 16
   1543    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
   1544 %endif
   1545 %endmacro
   1546 
; Load the eight 32-byte coefficient rows into m0-m7, pre-scaling each with
; pw_2896x8 via pmulhrsw. cq is advanced by 128 mid-load so the remaining
; rows are addressed with signed offsets (and so later clears can use them).
   1547 %macro ITX_8X16_LOAD_COEFS 0
   1548    vpbroadcastd         m4, [o(pw_2896x8)]
   1549    pmulhrsw             m0, m4, [cq+32*0]
   1550    add                  cq, 32*4
   1551    pmulhrsw             m7, m4, [cq+32*3]
   1552    pmulhrsw             m1, m4, [cq-32*3]
   1553    pmulhrsw             m6, m4, [cq+32*2]
   1554    pmulhrsw             m2, m4, [cq-32*2]
   1555    pmulhrsw             m5, m4, [cq+32*1]
   1556    pmulhrsw             m3, m4, [cq-32*1]
   1557    pmulhrsw             m4,     [cq+32*0]
   1558 %endmacro
   1559 
; Instantiate the 8x16 inverse-transform entry points pairing dct with each
; supported second transform type.
   1560 INV_TXFM_8X16_FN dct, dct
   1561 INV_TXFM_8X16_FN dct, adst
   1562 INV_TXFM_8X16_FN dct, flipadst
   1563 INV_TXFM_8X16_FN dct, identity
   1564 
; 8x16 inverse DCT, 8 bits-per-component.
; Pass 1 runs the shared 16x8 IDCT core on the transposed view, then the
; .pass1_end / .pass1_end2 labels (also jumped to by the adst/flipadst 8x16
; variants) perform the lane shuffles, the pw_16384-style scaling held in
; m10, and the word/dword transpose. Pass 2 runs the packed 16-point IDCT
; (.main) and the .end/.end2/.end3 labels scale, clear the coefficient
; buffer and store four 8x4 tiles.
   1565 cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
   1566    ITX_8X16_LOAD_COEFS
   1567    call m(idct_16x8_internal_8bpc).main
   1568    vpbroadcastd        m10, [o(pw_16384)]
; Shared pass-1 tail: m10 holds the scaling multiplier (callers may supply a
; mixed-sign variant).
   1569 .pass1_end:
   1570    vperm2i128           m9, m3, m7, 0x31
   1571    vinserti128          m3, xm7, 1
   1572    vperm2i128           m8, m2, m6, 0x31
   1573    vinserti128          m2, xm6, 1
   1574    vperm2i128           m6, m1, m5, 0x31
   1575    vinserti128          m1, xm5, 1
   1576    vperm2i128           m5, m0, m4, 0x31
   1577    vinserti128          m0, xm4, 1
   1578    punpckhwd            m4, m2, m3
   1579    punpcklwd            m2, m3
   1580    punpckhwd            m3, m0, m1
   1581    punpcklwd            m0, m1
   1582 .pass1_end2:
   1583    punpckhwd            m7, m5, m6
   1584    punpcklwd            m5, m6
   1585    punpcklwd            m6, m8, m9
   1586    punpckhwd            m8, m9
   1587    REPX  {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
   1588    punpckhdq            m1, m0, m2
   1589    punpckldq            m0, m2
   1590    punpckldq            m2, m3, m4
   1591    punpckhdq            m3, m4
   1592    punpckldq            m4, m5, m6
   1593    punpckhdq            m5, m6
   1594    punpckldq            m6, m7, m8
   1595    punpckhdq            m7, m8
   1596    jmp                tx2q
   1597 .pass2:
   1598    call .main
   1599    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
   1600    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
; Shared store tail: .end loads the pw_2048 round/scale factor, .end2
; applies the factor in m8, .end3 clears the coefficient buffer and writes
; the sixteen output rows as four 8x4 groups.
   1601 .end:
   1602    vpbroadcastd         m8, [o(pw_2048)]
   1603 .end2:
   1604    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
   1605 .end3:
   1606    pxor                 m8, m8
   1607    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
   1608    lea                  r3, [strideq*3]
   1609    WRITE_8X4             0, 1, 8, 9
   1610    lea                dstq, [dstq+strideq*4]
   1611    WRITE_8X4             2, 3, 0, 1
   1612    lea                dstq, [dstq+strideq*4]
   1613    WRITE_8X4             4, 5, 0, 1
   1614    lea                dstq, [dstq+strideq*4]
   1615    WRITE_8X4             6, 7, 0, 1
   1616    RET
   1617 ALIGN function_align
; Packed 16-point 1-D IDCT core, also called by other transform sizes.
   1618 cglobal_label .main
   1619    IDCT16_1D_PACKED
   1620    ret
   1621 
; Instantiate the 8x16 inverse-transform entry points pairing adst with each
; supported second transform type.
   1622 INV_TXFM_8X16_FN adst, dct
   1623 INV_TXFM_8X16_FN adst, adst
   1624 INV_TXFM_8X16_FN adst, flipadst
   1625 INV_TXFM_8X16_FN adst, identity
   1626 
; 8x16 inverse ADST, 8 bits-per-component.
; Pass 1 uses the shared 16x8 ADST main + pass-1 tail, builds a mixed-sign
; (16384, -16384) multiplier in m10 (some outputs are produced negated by
; the core, see the -outN comments below), and reuses the idct 8x16
; transpose tail. Pass 2 runs .main/.main_pass2_end and applies a
; (-2048, 2048) multiplier before the shared store path.
   1627 cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
   1628    ITX_8X16_LOAD_COEFS
   1629    call m(iadst_16x8_internal_8bpc).main
   1630    call m(iadst_16x8_internal_8bpc).main_pass1_end
   1631    vpbroadcastd        m10, [o(pw_16384)]
; pslld by 17 of the broadcast word pair yields 16384<<1 in the high word
; position only; the subtract leaves (16384, -16384) per dword.
   1632    pslld                m9, m10, 17
   1633    psubw               m10, m9 ; 16384, -16384
   1634    jmp m(idct_8x16_internal_8bpc).pass1_end
ALIGN function_align
   1636 .pass2:
   1637    call .main
   1638    call .main_pass2_end
   1639    vpbroadcastd         m9, [o(pw_2048)]
   1640    vpbroadcastd        xm8, [o(pw_4096)]
   1641    psubw                m8, m9
   1642    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
   1643    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
   1644    jmp m(idct_8x16_internal_8bpc).end2
ALIGN function_align
; Packed 16-point 1-D inverse ADST core. Inputs in0..in15 arrive packed two
; per register (see the punpck comments); outputs are produced partly
; negated (-outN), which callers compensate for with mixed-sign multipliers.
; Clobbers m4-m12; m10 holds pd_2048, m9 is zeroed for sign flips.
   1646 cglobal_label .main
   1647    REPX {pshufd x, x, q1032}, m7, m1, m5, m3
   1648 .main2:
   1649    vpbroadcastd        m10, [o(pd_2048)]
   1650    punpckhwd            m8, m7, m0 ; in14 in1
   1651    punpcklwd            m0, m7     ; in0  in15
   1652    punpcklwd            m7, m6, m1 ; in12 in3
   1653    punpckhwd            m1, m6     ; in2  in13
   1654    punpckhwd            m6, m5, m2 ; in10 in5
   1655    punpcklwd            m2, m5     ; in4  in11
   1656    punpcklwd            m5, m4, m3 ; in8  in7
   1657    punpckhwd            m3, m4     ; in6  in9
; First butterfly stage: packed rotations by the odd cosine/sine constants.
   1658    ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 3 ; t0  t1
   1659    ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 3 ; t2  t3
   1660    ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 3 ; t4  t5
   1661    ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 3 ; t6  t7
   1662    ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 3 ; t8  t9
   1663    ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
   1664    ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
   1665    ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 3 ; t14 t15
   1666    psubsw               m4, m0, m5 ; t9a  t8a
   1667    paddsw               m0, m5     ; t1a  t0a
   1668    psubsw               m5, m1, m6 ; t11a t10a
   1669    paddsw               m1, m6     ; t3a  t2a
   1670    psubsw               m6, m2, m7 ; t13a t12a
   1671    paddsw               m2, m7     ; t5a  t4a
   1672    psubsw               m7, m3, m8 ; t15a t14a
   1673    paddsw               m3, m8     ; t7a  t6a
; Second stage: 799/4017 and 3406/2276 rotations; negated constant vectors
; are derived by subtracting from zero (m9) instead of extra table loads.
   1674    vpbroadcastd        m11, [o(pw_m4017_799)]
   1675    vpbroadcastd        m12, [o(pw_799_4017)]
   1676    pxor                 m9, m9
   1677    ITX_MUL2X_PACK        4, 8, _, 10, 11, 12, 6 ; t8  t9
   1678    psubw                m8, m9, m11 ; pw_4017_m799
   1679    ITX_MUL2X_PACK        6, 12, _, 10, 12, 8, 6 ; t12 t13
   1680    vpbroadcastd        m11, [o(pw_m2276_3406)]
   1681    vpbroadcastd        m12, [o(pw_3406_2276)]
   1682    ITX_MUL2X_PACK        5, 8, _, 10, 11, 12, 6 ; t10 t11
   1683    psubw                m8, m9, m11 ; pw_2276_m3406
   1684    ITX_MUL2X_PACK        7, 12, _, 10, 12, 8, 6 ; t14 t15
   1685    psubsw               m8, m1, m3 ; t7   t6
   1686    paddsw               m1, m3     ; t3   t2
   1687    psubsw               m3, m0, m2 ; t5   t4
   1688    paddsw               m0, m2     ; t1   t0
   1689    psubsw               m2, m5, m7 ; t14a t15a
   1690    paddsw               m7, m5     ; t10a t11a
   1691    psubsw               m5, m4, m6 ; t12a t13a
   1692    paddsw               m4, m6     ; t8a  t9a
; Third stage: 1567/3784 rotations in both orientations.
   1693    vpbroadcastd        m11, [o(pw_m3784_1567)]
   1694    vpbroadcastd        m12, [o(pw_1567_3784)]
   1695    ITX_MUL2X_PACK        3, 6, _, 10, 12, 11, 6 ; t5a t4a
   1696    psubw                m6, m9, m11 ; pw_3784_m1567
   1697    ITX_MUL2X_PACK        8, 6, _, 10, 6, 12, 6  ; t7a t6a
   1698    vpbroadcastd        m11, [o(pw_m1567_3784)]
   1699    vpbroadcastd        m12, [o(pw_3784_1567)]
   1700    ITX_MUL2X_PACK        2, 6, _, 10, 11, 12, 6 ; t15 t14
   1701    psubw                m6, m9, m11 ; pw_1567_m3784
   1702    ITX_MUL2X_PACK        5, 12, _, 10, 12, 6, 6 ; t13 t12
; Final add/sub stage plus deinterleave; the remaining tN terms are finished
; by main_pass1_end / main_pass2_end.
   1703    vbroadcasti128      m12, [o(deint_shuf)]
   1704    paddsw               m6, m4, m7        ; -out1  out14
   1705    psubsw               m4, m7            ;  t10    t11
   1706    psubsw              m11, m3, m8        ;  t7     t6
   1707    paddsw               m8, m3            ;  out12 -out3
   1708    psubsw               m3, m0, m1        ;  t3a    t2a
   1709    paddsw               m0, m1            ; -out15  out0
   1710    paddsw               m1, m2, m5        ; -out13  out2
   1711    psubsw               m5, m2            ;  t15a   t14a
   1712    pshufb               m0, m12
   1713    pshufb               m6, m12
   1714    pshufb               m8, m12
   1715    pshufb               m1, m12
   1716    shufps               m7, m6, m0, q1032 ;  out14 -out15
   1717    vpblendd             m0, m6, 0x33      ; -out1   out0
   1718    punpcklqdq           m6, m8, m1        ;  out12 -out13
   1719    punpckhqdq           m1, m8, m1        ; -out3   out2
   1720    ret
ALIGN function_align
; Finish the middle outputs for pass 1 with full 32-bit precision
; (pmaddwd + psrad 12) using +/-2896 constant pairs; also preloads m10 with
; pw_16384 and zeroes m9 for the caller's scaling step.
   1722 .main_pass1_end:
   1723    vpbroadcastd         m8, [o(pw_m2896_2896)]
   1724    vpbroadcastd        m12, [o(pw_2896_2896)]
   1725    pmaddwd              m9, m8, m11       ; -out11
   1726    pmaddwd              m2, m12, m5       ; -out5
   1727    pmaddwd              m5, m8            ;  out10
   1728    pmaddwd             m11, m12           ;  out4
   1729    REPX     {paddd x, m10}, m9, m5, m2, m11
   1730    REPX     {psrad x, 12 }, m9, m5, m2, m11
   1731    packssdw             m5, m9            ;  out10 -out11
   1732    packssdw             m2, m11           ; -out5   out4
   1733    pmaddwd             m11, m8, m3        ;  out8
   1734    vpbroadcastd         m8, [o(pw_2896_m2896)]
   1735    pmaddwd              m3, m12           ; -out7
   1736    pmaddwd              m8, m4            ; -out9
   1737    pmaddwd              m4, m12           ;  out6
   1738    REPX     {paddd x, m10}, m11, m3, m8, m4
   1739    REPX     {psrad x, 12 }, m11, m3, m8, m4
   1740    packssdw             m3, m4            ; -out7   out6
   1741    packssdw             m4, m11, m8       ;  out8  -out9
   1742    vpbroadcastd        m10, [o(pw_16384)]
   1743    pxor                 m9, m9
   1744    ret
ALIGN function_align
; Finish the middle outputs for pass 2 using the cheaper 16-bit path:
; add/sub of the shuffled t-terms followed by a single pw_2896x8 pmulhrsw.
   1746 cglobal_label .main_pass2_end
   1747    vpbroadcastd         m8, [o(pw_2896x8)]
   1748    pshufb               m2, m11, m12
   1749    pshufb               m5, m12
   1750    pshufb               m3, m12
   1751    pshufb               m4, m12
   1752    punpcklqdq          m11, m5, m2        ;  t15a   t7
   1753    punpckhqdq           m5, m2            ;  t14a   t6
   1754    shufps               m2, m3, m4, q1032 ;  t2a    t10
   1755    vpblendd             m3, m4, 0xcc      ;  t3a    t11
   1756    psubsw               m4, m2, m3        ;  out8  -out9
   1757    paddsw               m3, m2            ; -out7   out6
   1758    paddsw               m2, m5, m11       ; -out5   out4
   1759    psubsw               m5, m11           ;  out10 -out11
   1760    REPX   {pmulhrsw x, m8}, m2, m3, m4, m5
   1761    ret
   1762 
; Instantiate the 8x16 inverse-transform entry points pairing flipadst with
; each supported second transform type.
   1763 INV_TXFM_8X16_FN flipadst, dct
   1764 INV_TXFM_8X16_FN flipadst, adst
   1765 INV_TXFM_8X16_FN flipadst, flipadst
   1766 INV_TXFM_8X16_FN flipadst, identity
   1767 
; 8x16 inverse flip-adst, 8 bits-per-component.
; Pass 1: same ADST core as iadst_8x16, but builds the multiplier with the
; opposite sign order (-16384, 16384) and swaps register pairs to reverse
; the row order before jumping into the shared transpose tail.
; Pass 2: ADST core + pass-2 tail, then the outputs are reversed (m7..m0)
; with alternating q3120/q2031 permutes and scaled before the shared store.
   1768 cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
   1769    ITX_8X16_LOAD_COEFS
   1770    call m(iadst_16x8_internal_8bpc).main
   1771    call m(iadst_16x8_internal_8bpc).main_pass1_end
   1772    vpbroadcastd         m9, [o(pw_16384)]
   1773    pslld               m10, m9, 17
   1774    psubw               m10, m9 ; -16384, 16384
   1775    vperm2i128           m9, m4, m0, 0x31
   1776    vinserti128          m0, m4, xm0, 1
   1777    vperm2i128           m8, m5, m1, 0x31
   1778    vinserti128          m4, m5, xm1, 1
   1779    vperm2i128           m5, m7, m3, 0x31
   1780    vinserti128          m3, m7, xm3, 1
   1781    vinserti128          m1, m6, xm2, 1
   1782    vperm2i128           m6, m6, m2, 0x31
   1783    punpcklwd            m2, m4, m0
   1784    punpckhwd            m4, m0
   1785    punpcklwd            m0, m3, m1
   1786    punpckhwd            m3, m1
   1787    jmp m(idct_8x16_internal_8bpc).pass1_end2
   1788 .pass2:
   1789    call m(iadst_8x16_internal_8bpc).main
   1790    call m(iadst_8x16_internal_8bpc).main_pass2_end
   1791    vpbroadcastd         m8, [o(pw_2048)]
   1792    vpbroadcastd        xm9, [o(pw_4096)]
   1793    psubw                m8, m9
; Reverse row order: rotate m0..m7 through m9 while permuting lanes.
   1794    vpermq               m9, m0, q3120
   1795    vpermq               m0, m7, q2031
   1796    vpermq               m7, m1, q3120
   1797    vpermq               m1, m6, q2031
   1798    vpermq               m6, m2, q3120
   1799    vpermq               m2, m5, q2031
   1800    vpermq               m5, m3, q3120
   1801    vpermq               m3, m4, q2031
   1802    pmulhrsw             m0, m8
   1803    pmulhrsw             m1, m8
   1804    pmulhrsw             m2, m8
   1805    pmulhrsw             m3, m8
   1806    pmulhrsw             m4, m5, m8
   1807    pmulhrsw             m5, m6, m8
   1808    pmulhrsw             m6, m7, m8
   1809    pmulhrsw             m7, m9, m8
   1810    jmp m(idct_8x16_internal_8bpc).end3
   1811 
; Instantiate the 8x16 inverse-transform entry points pairing identity with
; each supported second transform type.
   1812 INV_TXFM_8X16_FN identity, dct
   1813 INV_TXFM_8X16_FN identity, adst
   1814 INV_TXFM_8X16_FN identity, flipadst
   1815 INV_TXFM_8X16_FN identity, identity
   1816 
; 16-point identity scaling step: m%1 += pmulhrsw(m%1, pw_1697x16), with the
; 4-argument form additionally halving the product via a pw_16384 rounding
; multiply instead of doubling the source.
; (The original 4th-argument note read "pw_16394" — a typo for pw_16384.)
   1817 %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
   1818    pmulhrsw            m%2, m%3, m%1
   1819 %if %0 == 4 ; if downshifting by 1
   1820    pmulhrsw            m%2, m%4
   1821 %else
   1822    paddsw              m%1, m%1
   1823 %endif
   1824    paddsw              m%1, m%2
   1825 %endmacro
   1826 
; 8x16 inverse identity transform, 8 bits-per-component.
; Pass 1: loads the sixteen 16-byte half-rows, scales by pw_2896x8 and
; performs the word/dword transpose. Pass 2: applies the IDTX16 identity
; scaling (pw_1697x16) per register and reuses the idct 8x16 store path.
   1827 cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
   1828    mova                xm3, [cq+16*0]
   1829    mova                xm2, [cq+16*2]
   1830    add                  cq, 16*8
   1831    vinserti128          m3, [cq+16*0], 1
   1832    vinserti128          m2, [cq+16*2], 1
   1833    vpbroadcastd         m9, [o(pw_2896x8)]
   1834    mova                xm4, [cq-16*4]
   1835    mova                xm5, [cq-16*2]
   1836    vinserti128          m4, [cq+16*4], 1
   1837    vinserti128          m5, [cq+16*6], 1
   1838    mova                xm7, [cq-16*7]
   1839    mova                xm6, [cq-16*5]
   1840    vinserti128          m7, [cq+16*1], 1
   1841    vinserti128          m6, [cq+16*3], 1
   1842    mova                xm8, [cq-16*3]
   1843    mova                xm0, [cq-16*1]
   1844    vinserti128          m8, [cq+16*5], 1
   1845    vinserti128          m0, [cq+16*7], 1
   1846    punpcklwd            m1, m3, m2
   1847    punpckhwd            m3, m2
   1848    punpcklwd            m2, m4, m5
   1849    punpckhwd            m4, m5
   1850    punpcklwd            m5, m7, m6
   1851    punpckhwd            m7, m6
   1852    punpcklwd            m6, m8, m0
   1853    punpckhwd            m8, m0
   1854    REPX   {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
   1855    punpckldq            m0, m1, m2
   1856    punpckhdq            m1, m2
   1857    punpckldq            m2, m3, m4
   1858    punpckhdq            m3, m4
   1859    punpckldq            m4, m5, m6
   1860    punpckhdq            m5, m6
   1861    punpckldq            m6, m7, m8
   1862    punpckhdq            m7, m8
   1863    jmp                tx2q
   1864 .pass2:
   1865    vpbroadcastd         m8, [o(pw_1697x16)]
   1866    REPX {vpermq   x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
   1867    REPX {IDTX16   x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
   1868    jmp m(idct_8x16_internal_8bpc).end
   1869 
; Add two 16-pixel residual rows (%1, %2 — either register numbers or full
; memory/register operands, distinguished via %ifnum) to the destination
; rows at byte offsets %5 and %6, saturate-pack to u8 and store. %3/%4 are
; scratch register numbers; vpermq q3120 undoes the lane order left by the
; cross-lane packuswb.
   1870 %macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
   1871    pmovzxbw            m%3, [dstq+%5]
   1872 %ifnum %1
   1873    paddw               m%3, m%1
   1874 %else
   1875    paddw               m%3, %1
   1876 %endif
   1877    pmovzxbw            m%4, [dstq+%6]
   1878 %ifnum %2
   1879    paddw               m%4, m%2
   1880 %else
   1881    paddw               m%4, %2
   1882 %endif
   1883    packuswb            m%3, m%4
   1884    vpermq              m%3, m%3, q3120
   1885    mova          [dstq+%5], xm%3
   1886    vextracti128  [dstq+%6], m%3, 1
   1887 %endmacro
   1888 
; Declare a 16x4 inverse-transform entry point for the (type1, type2) pair.
; For dct_dct, emits a DC-only path: scale the lone DC coefficient, clear
; the eob slot, and splat the result over r3d rows of 16 pixels. The
; .dconly/.dconly_loop labels are shared (jumped to) by other transform
; sizes, which set r3d to their own row count first.
   1889 %macro INV_TXFM_16X4_FN 2 ; type1, type2
   1890    INV_TXFM_FN          %1, %2, 16x4
   1891 %ifidn %1_%2, dct_dct
   1892    movd                xm1, [o(pw_2896x8)]
   1893    pmulhrsw            xm0, xm1, [cq]
   1894    movd                xm2, [o(pw_16384)]
   1895    mov                [cq], eobd
   1896    or                  r3d, 4
   1897 .dconly:
   1898    pmulhrsw            xm0, xm2
   1899    movd                xm2, [pw_2048] ; intentionally rip-relative
   1900    pmulhrsw            xm0, xm1
   1901    pmulhrsw            xm0, xm2
   1902    vpbroadcastw         m0, xm0
   1903    pxor                 m3, m3
; Add the broadcast DC value to two 16-pixel destination rows per iteration.
   1904 .dconly_loop:
   1905    mova                xm1, [dstq+strideq*0]
   1906    vinserti128          m1, [dstq+strideq*1], 1
   1907    punpckhbw            m2, m1, m3
   1908    punpcklbw            m1, m3
   1909    paddw                m2, m0
   1910    paddw                m1, m0
   1911    packuswb             m1, m2
   1912    mova         [dstq+strideq*0], xm1
   1913    vextracti128 [dstq+strideq*1], m1, 1
   1914    lea                dstq, [dstq+strideq*2]
   1915    sub                 r3d, 2
   1916    jg .dconly_loop
   1917    RET
   1918 %endif
   1919 %endmacro
   1920 
; Instantiate the 16x4 inverse-transform entry points pairing dct with each
; supported second transform type.
   1921 INV_TXFM_16X4_FN dct, dct
   1922 INV_TXFM_16X4_FN dct, adst
   1923 INV_TXFM_16X4_FN dct, flipadst
   1924 INV_TXFM_16X4_FN dct, identity
   1925 
; 16x4 inverse DCT, 8 bits-per-component.
; Pass 1 reuses the 4x16 IDCT main routine on the transposed coefficients,
; then re-packs lanes and jumps into the shared iadst 16x4 pass-1 tail with
; both multipliers (m1, m6) set to +pw_16384. Pass 2 is a 4-point packed
; IDCT (.main) followed by the shared scale/clear/store tail.
   1926 cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
   1927    mova                xm0, [cq+16*0]
   1928    mova                xm1, [cq+16*1]
   1929    mova                xm2, [cq+16*2]
   1930    mova                xm3, [cq+16*3]
   1931    mova                xm4, [cq+16*4]
   1932    mova                xm5, [cq+16*5]
   1933    mova                xm6, [cq+16*6]
   1934    mova                xm7, [cq+16*7]
   1935    call m(idct_4x16_internal_8bpc).main
   1936    vinserti128          m6, m2, xm6, 1
   1937    vinserti128          m2, m0, xm4, 1
   1938    vinserti128          m0, m1, xm5, 1
   1939    vinserti128          m1, m3, xm7, 1
   1940    punpcklwd            m3, m2, m6
   1941    punpckhwd            m2, m6
   1942    vpbroadcastd         m6, [o(pw_16384)]
   1943    punpckhwd            m4, m0, m1
   1944    punpcklwd            m0, m1
   1945    mova                 m1, m6
   1946    jmp m(iadst_16x4_internal_8bpc).pass1_end
   1947 .pass2:
   1948    call .main
   1949    jmp m(iadst_16x4_internal_8bpc).end
   1950 ALIGN function_align
; Packed 4-point 1-D IDCT core, also callable from other sizes.
   1951 cglobal_label .main
   1952    vpbroadcastd         m6, [o(pd_2048)]
   1953    IDCT4_1D              0, 1, 2, 3, 4, 5, 6
   1954    ret
   1955 
; Instantiate the 16x4 inverse-transform entry points pairing adst with each
; supported second transform type.
   1956 INV_TXFM_16X4_FN adst, dct
   1957 INV_TXFM_16X4_FN adst, adst
   1958 INV_TXFM_16X4_FN adst, flipadst
   1959 INV_TXFM_16X4_FN adst, identity
   1960 
; 16x4 inverse ADST, 8 bits-per-component.
; Pass 1 reuses the 4x16 ADST core on permuted coefficient loads; the
; .pass1_end tail (shared with idct/iflipadst 16x4) applies the two
; multipliers in m1/m6 and transposes. Pass 2 runs the packed 4-point ADST
; (.main); the .end/.end2/.end3 tail scales by pw_2048, clears the
; coefficient buffer, and stores the four 16-pixel rows.
   1961 cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
   1962    vpermq               m0, [cq+32*0], q1230
   1963    vpermq               m3, [cq+32*3], q2103
   1964    vpermq               m1, [cq+32*1], q1230
   1965    vpermq               m2, [cq+32*2], q2103
   1966    call m(iadst_4x16_internal_8bpc).main2
   1967    call m(iadst_4x16_internal_8bpc).main_pass1_end
   1968    punpcklwd            m4, m3, m1
   1969    punpcklwd            m5, m2, m0
   1970    punpckhwd            m0, m1
   1971    punpckhwd            m2, m3
   1972    vpbroadcastd         m1, [o(pw_16384)]
   1973    vinserti128          m3, m0, xm2, 1
   1974    vperm2i128           m2, m0, m2, 0x31
   1975    vinserti128          m0, m4, xm5, 1
   1976    vperm2i128           m4, m4, m5, 0x31
   1977    psubw                m6, m7, m1
; Shared pass-1 tail: scale with the caller-provided multipliers in m1/m6,
; then finish the transpose.
   1978 .pass1_end:
   1979    pmulhrsw             m3, m1
   1980    pmulhrsw             m2, m6
   1981    pmulhrsw             m4, m1
   1982    pmulhrsw             m0, m6
   1983    punpcklwd            m1, m3, m2
   1984    punpckhwd            m3, m2
   1985    punpcklwd            m2, m4, m0
   1986    punpckhwd            m4, m0
   1987    punpckldq            m0, m1, m2
   1988    punpckhdq            m1, m2
   1989    punpckldq            m2, m3, m4
   1990    punpckhdq            m3, m4
   1991    jmp                tx2q
   1992 .pass2:
   1993    call .main
; Shared store tail: round/scale, clear coefficients, write 16x4 output.
   1994 .end:
   1995    vpbroadcastd         m4, [o(pw_2048)]
   1996    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
   1997    WIN64_RESTORE_XMM
   1998 .end2:
   1999    pxor                 m4, m4
   2000    mova          [cq+32*0], m4
   2001    mova          [cq+32*1], m4
   2002    mova          [cq+32*2], m4
   2003    mova          [cq+32*3], m4
   2004 .end3:
   2005    WRITE_16X2            0, 1, 4, 5, strideq*0, strideq*1
   2006    lea                dstq, [dstq+strideq*2]
   2007    WRITE_16X2            2, 3, 4, 5, strideq*0, strideq*1
   2008    RET
ALIGN function_align
; Packed 4-point 1-D inverse ADST core using 32-bit pmaddwd arithmetic on
; interleaved (in2,in0)/(in3,in1) word pairs; rounds with pd_2048 and packs
; back to words. Clobbers m4-m10.
   2010 cglobal_label .main
   2011    vpbroadcastd         m6, [o(pw_m3344_3344)]
   2012    vpbroadcastd         m7, [o(pw_3803_1321)]
   2013    vpbroadcastd         m8, [o(pw_m1321_2482)]
   2014    vpbroadcastd         m9, [o(pw_2482_3344)]
   2015    punpcklwd            m4, m2, m0 ; in2 in0 l
   2016    punpckhwd            m2, m0     ; in2 in0 h
   2017    psrld                m5, m6, 16
   2018    pmaddwd             m10, m6, m4 ; t2:02 l
   2019    pmaddwd              m6, m2     ; t2:02 h
   2020    pmaddwd              m0, m7, m4 ; t0:02 l
   2021    pmaddwd              m7, m2     ; t0:02 h
   2022    pmaddwd              m4, m8     ; t1:02 l
   2023    pmaddwd              m8, m2     ; t1:02 h
   2024    punpckhwd            m2, m3, m1 ; in3 in1 h
   2025    punpcklwd            m3, m1     ; in3 in1 l
   2026    pmaddwd              m1, m5, m2 ; t2:3 h
   2027    pmaddwd              m5, m3     ; t2:3 l
   2028    paddd                m6, m1
   2029    vpbroadcastd         m1, [o(pd_2048)]
   2030    paddd               m10, m5
   2031    pmaddwd              m5, m9, m3
   2032    pmaddwd              m9, m2
   2033    paddd                m0, m1
   2034    paddd                m7, m1
   2035    paddd                m0, m5     ; t0 + t3 + 2048 l
   2036    paddd                m7, m9     ; t0 + t3 + 2048 h
   2037    vpbroadcastd         m9, [o(pw_m3803_3344)]
   2038    pmaddwd              m5, m9, m2
   2039    pmaddwd              m9, m3
   2040    paddd               m10, m1     ; t2 + 2048 l
   2041    paddd                m6, m1     ; t2 + 2048 h
   2042    paddd                m5, m1     ; t1:13 + 2048 h
   2043    paddd                m1, m9     ; t1:13 + 2048 l
   2044    vpbroadcastd         m9, [o(pw_m3803_m6688)]
   2045    pmaddwd              m2, m9
   2046    pmaddwd              m3, m9
   2047    paddd                m5, m8     ; t1 + t3 + 2048 h
   2048    paddd                m1, m4     ; t1 + t3 + 2048 l
   2049    paddd                m8, m7
   2050    paddd                m4, m0
   2051    paddd                m2, m8     ; t0 + t1 - t3 + 2048 h
   2052    paddd                m3, m4     ; t0 + t1 - t3 + 2048 l
   2053    REPX      {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
   2054    packssdw             m0, m7
   2055    packssdw             m1, m5
   2056    packssdw             m3, m2
   2057    packssdw             m2, m10, m6
   2058    ret
   2059 
; Instantiate the 16x4 inverse-transform entry points pairing flipadst with
; each supported second transform type.
   2060 INV_TXFM_16X4_FN flipadst, dct
   2061 INV_TXFM_16X4_FN flipadst, adst
   2062 INV_TXFM_16X4_FN flipadst, flipadst
   2063 INV_TXFM_16X4_FN flipadst, identity
   2064 
; 16x4 inverse flip-adst, 8 bits-per-component.
; Pass 1: same ADST core as iadst_16x4, with the unpack operands swapped
; and the multiplier sign order reversed (m1 negative, m6 positive) before
; sharing the iadst pass-1 tail. Pass 2: runs the iadst 16x4 core, then
; writes the rows in reversed order (m3,m2 then m1,m0).
   2065 cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
   2066    vpermq               m0, [cq+32*0], q1230
   2067    vpermq               m3, [cq+32*3], q2103
   2068    vpermq               m1, [cq+32*1], q1230
   2069    vpermq               m2, [cq+32*2], q2103
   2070    call m(iadst_4x16_internal_8bpc).main2
   2071    call m(iadst_4x16_internal_8bpc).main_pass1_end
   2072    punpckhwd            m4, m3, m2
   2073    punpckhwd            m5, m1, m0
   2074    punpcklwd            m0, m2
   2075    punpcklwd            m1, m3
   2076    vpbroadcastd         m6, [o(pw_16384)]
   2077    vinserti128          m3, m0, xm1, 1
   2078    vperm2i128           m2, m0, m1, 0x31
   2079    vinserti128          m0, m4, xm5, 1
   2080    vperm2i128           m4, m4, m5, 0x31
   2081    psubw                m1, m7, m6
   2082    jmp m(iadst_16x4_internal_8bpc).pass1_end
ALIGN function_align
   2084 .pass2:
   2085    call m(iadst_16x4_internal_8bpc).main
   2086    vpbroadcastd         m4, [o(pw_2048)]
   2087    REPX   {pmulhrsw x, m4}, m3, m2, m1, m0
   2088    pxor                 m4, m4
   2089    mova          [cq+32*0], m4
   2090    mova          [cq+32*1], m4
   2091    mova          [cq+32*2], m4
   2092    mova          [cq+32*3], m4
; Flip: store rows in reverse order relative to the iadst path.
   2093    WRITE_16X2            3, 2, 4, 5, strideq*0, strideq*1
   2094    lea                dstq, [dstq+strideq*2]
   2095    WRITE_16X2            1, 0, 4, 5, strideq*0, strideq*1
   2096    RET
   2097 
; Instantiate the 16x4 inverse-transform entry points pairing identity with
; each supported second transform type.
   2098 INV_TXFM_16X4_FN identity, dct
   2099 INV_TXFM_16X4_FN identity, adst
   2100 INV_TXFM_16X4_FN identity, flipadst
   2101 INV_TXFM_16X4_FN identity, identity
   2102 
; 16x4 inverse identity transform, 8 bits-per-component.
; Pass 1: transpose plus the identity scaling x += mulhrs(x,1697x16)*16384,
; done inline rather than via IDTX16. Pass 2: second identity scaling with
; pw_1697x8, then the shared iadst 16x4 store tail.
   2103 cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
   2104    mova                xm2, [cq+16*0]
   2105    mova                xm4, [cq+16*1]
   2106    vinserti128          m2, [cq+16*4], 1
   2107    vinserti128          m4, [cq+16*5], 1
   2108    mova                xm0, [cq+16*2]
   2109    mova                xm1, [cq+16*3]
   2110    vinserti128          m0, [cq+16*6], 1
   2111    vinserti128          m1, [cq+16*7], 1
   2112    vpbroadcastd         m7, [o(pw_1697x16)]
   2113    vpbroadcastd         m8, [o(pw_16384)]
   2114    punpcklwd            m3, m2, m4
   2115    punpckhwd            m2, m4
   2116    punpcklwd            m4, m0, m1
   2117    punpckhwd            m0, m1
   2118    punpcklwd            m1, m3, m2
   2119    punpckhwd            m3, m2
   2120    punpcklwd            m2, m4, m0
   2121    punpckhwd            m4, m0
   2122    pmulhrsw             m0, m7, m1
   2123    pmulhrsw             m5, m7, m2
   2124    pmulhrsw             m6, m7, m3
   2125    pmulhrsw             m7, m4
   2126    REPX   {pmulhrsw x, m8}, m0, m5, m6, m7
   2127    paddsw               m1, m0
   2128    paddsw               m2, m5
   2129    paddsw               m3, m6
   2130    paddsw               m4, m7
   2131    punpcklqdq           m0, m1, m2
   2132    punpckhqdq           m1, m2
   2133    punpcklqdq           m2, m3, m4
   2134    punpckhqdq           m3, m4
   2135    jmp                tx2q
   2136 .pass2:
   2137    vpbroadcastd         m7, [o(pw_1697x8)]
   2138    pmulhrsw             m4, m7, m0
   2139    pmulhrsw             m5, m7, m1
   2140    pmulhrsw             m6, m7, m2
   2141    pmulhrsw             m7, m3
   2142    paddsw               m0, m4
   2143    paddsw               m1, m5
   2144    paddsw               m2, m6
   2145    paddsw               m3, m7
   2146    jmp m(iadst_16x4_internal_8bpc).end
   2147 
; Declare a 16x8 inverse-transform entry point for the (type1, type2) pair.
; For dct_dct, emits a DC-only path that scales the DC coefficient, clears
; the eob slot, sets the row count to 8 and tail-calls the shared 16x4
; dconly loop.
   2148 %macro INV_TXFM_16X8_FN 2 ; type1, type2
   2149    INV_TXFM_FN          %1, %2, 16x8
   2150 %ifidn %1_%2, dct_dct
   2151    movd                xm1, [o(pw_2896x8)]
   2152    pmulhrsw            xm0, xm1, [cq]
   2153    movd                xm2, [o(pw_16384)]
   2154    mov                [cq], eobd
   2155    pmulhrsw            xm0, xm1
   2156    or                  r3d, 8
   2157    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
   2158 %endif
   2159 %endmacro
   2160 
; Load eight 32-byte coefficient rows into m0-m7 with a per-row qword
; permute (even rows always q3120; odd rows use the caller-supplied order
; %1, letting adst variants pre-swap lanes), then pre-scale everything by
; pw_2896x8. cq is advanced by 128 mid-load for signed addressing.
   2161 %macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
   2162    vpbroadcastd         m8, [o(pw_2896x8)]
   2163    vpermq               m0, [cq+32*0], q3120
   2164    add                  cq, 32*4
   2165    vpermq               m7, [cq+32*3], q%1
   2166    vpermq               m1, [cq-32*3], q%1
   2167    vpermq               m6, [cq+32*2], q3120
   2168    vpermq               m2, [cq-32*2], q3120
   2169    vpermq               m5, [cq+32*1], q%1
   2170    vpermq               m3, [cq-32*1], q%1
   2171    vpermq               m4, [cq+32*0], q3120
   2172    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
   2173 %endmacro
   2174 
; Instantiate the 16x8 inverse-transform entry points pairing dct with each
; supported second transform type.
   2175 INV_TXFM_16X8_FN dct, dct
   2176 INV_TXFM_16X8_FN dct, adst
   2177 INV_TXFM_16X8_FN dct, flipadst
   2178 INV_TXFM_16X8_FN dct, identity
   2179 
; 16x8 inverse DCT, 8 bits-per-component.
; Pass 1 runs the 8x16 packed IDCT core on the pre-shuffled loads, scales
; by pw_16384 (m10; the adst variant enters .pass1_end with a mixed-sign
; multiplier already applied to half the registers) and transposes.
; Pass 2 is an 8-point IDCT (.main); the .end/.end2/.end3/.end4 labels
; scale, store the first four 16-pixel rows, clear the coefficient buffer
; and store the remaining four rows — they are shared with the other 16x8
; variants.
   2180 cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
   2181    ITX_16X8_LOAD_COEFS 3120
   2182    call m(idct_8x16_internal_8bpc).main
   2183    vpbroadcastd        m10, [o(pw_16384)]
   2184    punpckhwd            m8, m0, m2
   2185    punpcklwd            m0, m2
   2186    punpckhwd            m2, m1, m3
   2187    punpcklwd            m1, m3
   2188    punpcklwd            m9, m4, m6
   2189    punpckhwd            m4, m6
   2190    punpcklwd            m6, m5, m7
   2191    punpckhwd            m5, m7
   2192    REPX  {pmulhrsw x, m10}, m8, m1, m4, m6
; Shared pass-1 tail: finish scaling the remaining registers and complete
; the word/dword/lane transpose.
   2193 .pass1_end:
   2194    REPX  {pmulhrsw x, m10}, m0, m2, m9, m5
   2195    punpckhwd            m3, m0, m8
   2196    punpcklwd            m0, m8
   2197    punpckhwd            m8, m2, m1
   2198    punpcklwd            m2, m1
   2199    punpcklwd            m7, m9, m4
   2200    punpckhwd            m9, m4
   2201    punpcklwd            m4, m5, m6
   2202    punpckhwd            m5, m6
   2203    punpckhdq            m1, m0, m2
   2204    punpckldq            m0, m2
   2205    punpckldq            m2, m3, m8
   2206    punpckhdq            m3, m8
   2207    punpckldq            m6, m7, m4
   2208    punpckhdq            m7, m4
   2209    punpckldq            m8, m9, m5
   2210    punpckhdq            m9, m5
   2211    vperm2i128           m4, m0, m6, 0x31
   2212    vinserti128          m0, xm6, 1
   2213    vperm2i128           m5, m1, m7, 0x31
   2214    vinserti128          m1, xm7, 1
   2215    vperm2i128           m6, m2, m8, 0x31
   2216    vinserti128          m2, xm8, 1
   2217    vperm2i128           m7, m3, m9, 0x31
   2218    vinserti128          m3, xm9, 1
   2219    jmp                tx2q
   2220 .pass2:
   2221    call .main
   2222    vpbroadcastd         m8, [o(pw_2048)]
; Shared store tail: callers may enter at .end with a custom multiplier in
; m8, or at .end2 with the even registers already scaled.
   2223 .end:
   2224    REPX   {pmulhrsw x, m8}, m0, m2, m4, m6
   2225 .end2:
   2226    REPX   {pmulhrsw x, m8}, m1, m3, m5, m7
   2227    lea                  r3, [strideq*3]
   2228    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
   2229    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
   2230 .end3:
   2231    pxor                 m0, m0
   2232    REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
   2233 .end4:
   2234    lea                dstq, [dstq+strideq*4]
   2235    WRITE_16X2            4, 5, 0, 1, strideq*0, strideq*1
   2236    WRITE_16X2            6, 7, 0, 1, strideq*2, r3
   2237    RET
ALIGN function_align
; 8-point 1-D IDCT core (two entry points: .main loads pd_2048 into m10,
; .main2 assumes the caller already set it).
   2239 cglobal_label .main
   2240    vpbroadcastd        m10, [o(pd_2048)]
   2241 .main2:
   2242    IDCT8_1D              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
   2243    ret
   2244 
   2245 INV_TXFM_16X8_FN adst, dct
   2246 INV_TXFM_16X8_FN adst, adst
   2247 INV_TXFM_16X8_FN adst, flipadst
   2248 INV_TXFM_16X8_FN adst, identity
   2249
        ; 16x8 inverse ADST, 8bpc. Pass 1 (rows): reuses the 8x16 ADST
        ; kernels (main2 + main_pass1_end), then interleaves the word lanes
        ; and tail-jumps into idct_16x8's shared .pass1_end transpose/scale.
        ; m9/m10 arrive from main_pass1_end holding rounding factors;
        ; m11 = m9 - m10 is applied to m8/m1/m4/m6 (presumably the
        ; opposite-sign rounding pair — confirm in iadst_8x16's pass1 end).
        ; The 1302 argument selects the coefficient-row load order.
   2250 cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
   2251    ITX_16X8_LOAD_COEFS 1302
   2252    call m(iadst_8x16_internal_8bpc).main2
   2253    call m(iadst_8x16_internal_8bpc).main_pass1_end
   2254    psubw               m11, m9, m10
        ; Word-interleave pairs of rows in preparation for the dword/qdq
        ; transpose steps performed at idct_16x8's .pass1_end.
   2255    punpcklwd            m8, m0, m2
   2256    punpckhwd            m0, m2
   2257    punpckhwd            m2, m1, m3
   2258    punpcklwd            m1, m3
   2259    punpcklwd            m9, m4, m6
   2260    punpckhwd            m4, m6
   2261    punpckhwd            m6, m5, m7
   2262    punpcklwd            m5, m7
   2263    REPX  {pmulhrsw x, m11}, m8, m1, m4, m6
   2264    jmp m(idct_16x8_internal_8bpc).pass1_end
   2265 ALIGN function_align
        ; Pass 2 (columns): run the 16-wide ADST main plus its cheap pass-2
        ; finish, then join idct_16x8's store path at .end2.
        ; main_pass2_end leaves m9 = pw_2048; the even-index outputs are
        ; rounded by +pw_2048 here, the odd ones (which the ADST produces
        ; negated) by -pw_2048 (m8) inside .end2.
   2266 .pass2:
   2267    call .main
   2268    call .main_pass2_end
   2269    pxor                 m8, m8
   2270    psubw                m8, m9 ; m8 = -pw_2048
   2271    REPX   {pmulhrsw x, m9}, m0, m2, m4, m6
   2272    jmp m(idct_16x8_internal_8bpc).end2
   2273 ALIGN function_align
        ; 8-point ADST main butterfly over 16 lanes (stage constants per the
        ; AV1 spec). In: m0-m7 input rows, out: m0 = out0, m6 = out6,
        ; m1 = -out1, m7 = -out7; the remaining terms t2/t3/t6/t7 are left
        ; in m5/m3/m2/m9 for main_pass1_end / main_pass2_end to finish.
        ; m10 = pd_2048 is the ITX_MULSUB_2W rounding constant.
   2274 cglobal_label .main
   2275    vpbroadcastd        m10, [o(pd_2048)]
   2276    ITX_MULSUB_2W         7, 0, 8, 9, 10,  401, 4076 ; t1a, t0a
   2277    ITX_MULSUB_2W         3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
   2278    ITX_MULSUB_2W         1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
   2279    ITX_MULSUB_2W         5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
   2280    psubsw               m8, m2, m6 ; t6
   2281    paddsw               m2, m6     ; t2
   2282    psubsw               m6, m0, m4 ; t4
   2283    paddsw               m0, m4     ; t0
   2284    psubsw               m4, m5, m1 ; t7
   2285    paddsw               m5, m1     ; t3
   2286    psubsw               m1, m7, m3 ; t5
   2287    paddsw               m7, m3     ; t1
   2288    ITX_MULSUB_2W         6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
   2289    ITX_MULSUB_2W         4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
   2290    psubsw               m9, m6, m8 ;  t7
   2291    paddsw               m6, m8     ;  out6
   2292    psubsw               m3, m7, m5 ;  t3
   2293    paddsw               m7, m5     ; -out7
   2294    psubsw               m5, m0, m2 ;  t2
   2295    paddsw               m0, m2     ;  out0
   2296    psubsw               m2, m1, m4 ;  t6
   2297    paddsw               m1, m4     ; -out1
   2298    ret
   2299 ALIGN function_align
        ; Pass-1 finish: produce out2..out5 at full 32-bit intermediate
        ; precision. Each word pair is multiplied by the packed
        ; (±2896, 2896) constants with pmaddwd, rounded with pd_2048 (m10)
        ; and shifted by 12, then re-packed to words.
        ; Out: m2 = out2, m3 = -out3, m4 = out4, m5 = -out5.
   2300 .main_pass1_end:
   2301    vpbroadcastd        m11, [o(pw_m2896_2896)]
   2302    vpbroadcastd        m12, [o(pw_2896_2896)]
   2303    punpckhwd            m4, m3, m5
   2304    punpcklwd            m3, m5
   2305    pmaddwd              m5, m11, m4
   2306    pmaddwd              m4, m12
   2307    pmaddwd              m8, m11, m3
   2308    pmaddwd              m3, m12
   2309    REPX     {paddd x, m10}, m5, m4, m8, m3
   2310    REPX     {psrad x, 12 }, m5, m8, m4, m3
   2311    packssdw             m3, m4     ; -out3
   2312    packssdw             m4, m8, m5 ;  out4
   2313    punpcklwd            m5, m9, m2
   2314    punpckhwd            m9, m2
   2315    pmaddwd              m2, m12, m5
   2316    pmaddwd              m5, m11
   2317    pmaddwd             m12, m9
   2318    pmaddwd             m11, m9
   2319    REPX     {paddd x, m10}, m2, m5, m12, m11
   2320    REPX     {psrad x, 12 }, m2, m12, m5, m11
   2321    packssdw             m2, m12    ;  out2
   2322    packssdw             m5, m11    ; -out5
   2323    ret
   2324 ALIGN function_align
        ; Pass-2 finish: same outputs as main_pass1_end but computed with
        ; cheap 16-bit saturating adds and a single pmulhrsw by pw_2896x8 —
        ; pass 2 clips to pixels afterwards, so 16-bit precision suffices.
        ; Also preloads m9 = pw_2048 for the caller's final rounding.
   2325 cglobal_label .main_pass2_end
   2326    vpbroadcastd         m8, [o(pw_2896x8)]
   2327    psubsw               m4, m5, m3
   2328    paddsw               m3, m5
   2329    psubsw               m5, m2, m9
   2330    paddsw               m2, m9
   2331    pmulhrsw             m2, m8     ;  out2
   2332    pmulhrsw             m3, m8     ; -out3
   2333    pmulhrsw             m4, m8     ;  out4
   2334    pmulhrsw             m5, m8     ; -out5
   2335    vpbroadcastd         m9, [o(pw_2048)]
   2336    ret
   2337 
   2338 INV_TXFM_16X8_FN flipadst, dct
   2339 INV_TXFM_16X8_FN flipadst, adst
   2340 INV_TXFM_16X8_FN flipadst, flipadst
   2341 INV_TXFM_16X8_FN flipadst, identity
   2342
        ; 16x8 inverse flip-ADST, 8bpc. Pass 1: identical ADST math to
        ; iadst_16x8, but the transpose below consumes the rows in reversed
        ; order so that the output ends up flipped. Unlike iadst_16x8 this
        ; does its own full transpose instead of sharing idct_16x8's.
        ; m9/m10 are the two rounding factors from main_pass1_end;
        ; m9 -= m10 gives the factor for the other sign group.
   2343 cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
   2344    ITX_16X8_LOAD_COEFS 1302
   2345    call m(iadst_8x16_internal_8bpc).main2
   2346    call m(iadst_8x16_internal_8bpc).main_pass1_end
   2347    psubw                m9, m10
        ; Word interleave, with row pairs taken high-to-low for the flip.
   2348    punpcklwd            m8, m6, m4
   2349    punpckhwd            m6, m4
   2350    punpcklwd            m4, m7, m5
   2351    punpckhwd            m7, m5
   2352    punpckhwd            m5, m3, m1
   2353    punpcklwd            m3, m1
   2354    punpckhwd            m1, m2, m0
   2355    punpcklwd            m2, m0
   2356    REPX  {pmulhrsw x, m10}, m8, m4, m5, m1
   2357    REPX  {pmulhrsw x, m9 }, m6, m7, m3, m2
        ; Dword / 128-bit lane stages of the 16x8 word transpose.
   2358    punpcklwd            m0, m7, m4
   2359    punpckhwd            m7, m4
   2360    punpckhwd            m4, m6, m8
   2361    punpcklwd            m6, m8
   2362    punpckhwd            m8, m3, m5
   2363    punpcklwd            m3, m5
   2364    punpcklwd            m5, m2, m1
   2365    punpckhwd            m2, m1
   2366    punpckhdq            m1, m0, m6
   2367    punpckldq            m0, m6
   2368    punpckldq            m6, m7, m4
   2369    punpckhdq            m7, m4
   2370    punpckhdq            m4, m3, m5
   2371    punpckldq            m3, m5
   2372    punpckldq            m5, m8, m2
   2373    punpckhdq            m8, m2
   2374    vinserti128          m2, m6, xm5, 1
   2375    vperm2i128           m6, m5, 0x31
   2376    vperm2i128           m5, m1, m4, 0x31
   2377    vinserti128          m1, xm4, 1
   2378    vperm2i128           m4, m0, m3, 0x31
   2379    vinserti128          m0, xm3, 1
   2380    vinserti128          m3, m7, xm8, 1
   2381    vperm2i128           m7, m8, 0x31
   2382    jmp                tx2q
        ; Pass 2: ADST columns, then scale and store the eight 16-wide rows
        ; in reversed (flipped) order. m9 = pw_2048 (from main_pass2_end),
        ; m8 = -pw_2048 for the negated outputs; the pmulhrsw chain also
        ; renames registers so the rows come out bottom-to-top.
   2383 .pass2:
   2384    call m(iadst_16x8_internal_8bpc).main
   2385    call m(iadst_16x8_internal_8bpc).main_pass2_end
   2386    pxor                 m8, m8
   2387    psubw                m8, m9 ; m8 = -pw_2048
   2388    pmulhrsw            m10, m7, m8
   2389    pmulhrsw             m7, m0, m9
   2390    pmulhrsw             m0, m6, m9
   2391    pmulhrsw             m6, m1, m8
   2392    pmulhrsw             m1, m5, m8
   2393    pmulhrsw             m5, m2, m9
   2394    pmulhrsw             m2, m4, m9
   2395    pmulhrsw             m4, m3, m8
   2396    lea                  r3, [strideq*3]
   2397    WRITE_16X2           10, 0, 8, 9, strideq*0, strideq*1
   2398    WRITE_16X2            1, 2, 0, 1, strideq*2, r3
   2399    jmp m(idct_16x8_internal_8bpc).end3
   2400 
   2401 INV_TXFM_16X8_FN identity, dct
   2402 INV_TXFM_16X8_FN identity, adst
   2403 INV_TXFM_16X8_FN identity, flipadst
   2404 INV_TXFM_16X8_FN identity, identity
   2405
        ; 16x8 identity transform, 8bpc. Pass 1: load the 16 coefficient
        ; half-rows as xmm pairs merged into ymm registers, apply the
        ; rectangular-transform scale (pmulhrsw by pw_2896x8, i.e. about
        ; 1/sqrt(2)), transpose, and apply the identity-16 scale
        ; (IDTX16 with pw_1697x16 / pw_16384).
   2406 cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
   2407    mova                xm7, [cq+16*0]
   2408    mova                xm2, [cq+16*1]
   2409    add                  cq, 16*8
   2410    vpbroadcastd         m3, [o(pw_2896x8)]
   2411    vinserti128          m7, [cq+16*0], 1
   2412    vinserti128          m2, [cq+16*1], 1
   2413    mova                xm6, [cq-16*6]
   2414    mova                xm4, [cq-16*5]
   2415    vinserti128          m6, [cq+16*2], 1
   2416    vinserti128          m4, [cq+16*3], 1
   2417    mova                xm8, [cq-16*4]
   2418    mova                xm5, [cq-16*3]
   2419    vinserti128          m8, [cq+16*4], 1
   2420    vinserti128          m5, [cq+16*5], 1
   2421    mova                xm0, [cq-16*2]
   2422    mova                xm1, [cq-16*1]
   2423    vinserti128          m0, [cq+16*6], 1
   2424    vinserti128          m1, [cq+16*7], 1
   2425    vpbroadcastd        m10, [o(pw_1697x16)]
   2426    vpbroadcastd        m11, [o(pw_16384)]
   2427    REPX   {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
        ; 16x8 word transpose: wd -> dq -> qdq interleave stages.
   2428    punpcklwd            m3, m7, m2
   2429    punpckhwd            m7, m2
   2430    punpcklwd            m2, m6, m4
   2431    punpckhwd            m6, m4
   2432    punpcklwd            m4, m8, m5
   2433    punpckhwd            m8, m5
   2434    punpcklwd            m5, m0, m1
   2435    punpckhwd            m0, m1
   2436    punpckldq            m1, m3, m2
   2437    punpckhdq            m3, m2
   2438    punpckldq            m2, m4, m5
   2439    punpckhdq            m4, m5
   2440    punpckldq            m5, m7, m6
   2441    punpckhdq            m7, m6
   2442    punpckldq            m6, m8, m0
   2443    punpckhdq            m8, m0
   2444    REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8
   2445    punpcklqdq           m0, m1, m2
   2446    punpckhqdq           m1, m2
   2447    punpcklqdq           m2, m3, m4
   2448    punpckhqdq           m3, m4
   2449    punpcklqdq           m4, m5, m6
   2450    punpckhqdq           m5, m6
   2451    punpcklqdq           m6, m7, m8
   2452    punpckhqdq           m7, m8
   2453    jmp                tx2q
        ; Pass 2: the column identity is a plain doubling, folded into the
        ; final rounding as a single pmulhrsw by pw_4096 (2 * pw_2048)
        ; inside idct_16x8's shared .end store path.
   2454 .pass2:
   2455    vpbroadcastd         m8, [o(pw_4096)]
   2456    jmp m(idct_16x8_internal_8bpc).end
   2457 
   2458 %define o_base pw_5 + 128
   2459 
        ; Declare a 16x16 inverse-transform entry for type1/type2. For
        ; dct_dct, emit the DC-only shortcut: scale the single DC coeff by
        ; 2896/32768 (pw_2896x8), stash eobd over it to clear it, set
        ; r3d = 16 rows and reuse the 16x4 dconly tail with pw_8192 rounding.
   2460 %macro INV_TXFM_16X16_FN 2 ; type1, type2
   2461    INV_TXFM_FN          %1, %2, 16x16
   2462 %ifidn %1_%2, dct_dct
   2463    movd                xm1, [o(pw_2896x8)]
   2464    pmulhrsw            xm0, xm1, [cq]
   2465    movd                xm2, [o(pw_8192)]
   2466    mov                [cq], eobd
   2467    or                  r3d, 16 ; row count; NOTE(review): or, not mov — assumes r3d bits are compatible here
   2468    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
   2469 %endif
   2470 %endmacro
   2471 
        ; Load all 16 coefficient rows into m0-m15 (cq advanced by 8 rows so
        ; the remaining offsets fit in disp8). Row 15 is additionally copied
        ; to the stack scratch slot: the 16-point kernels need m15/m14 as
        ; temporaries and reload in15 from [rsp].
   2472 %macro ITX_16X16_LOAD_COEFS 0
   2473    mova                 m0, [cq+32*0]
   2474    mova                 m1, [cq+32*1]
   2475    mova                 m2, [cq+32*2]
   2476    mova                 m3, [cq+32*3]
   2477    add                  cq, 32*8
   2478    mova                 m4, [cq-32*4]
   2479    mova                 m5, [cq-32*3]
   2480    mova                 m6, [cq-32*2]
   2481    mova                 m7, [cq-32*1]
   2482    mova                 m8, [cq+32*0]
   2483    mova                 m9, [cq+32*1]
   2484    mova                m10, [cq+32*2]
   2485    mova                m11, [cq+32*3]
   2486    mova                m12, [cq+32*4]
   2487    mova                m13, [cq+32*5]
   2488    mova                m14, [cq+32*6]
   2489    mova                m15, [cq+32*7]
   2490    mova              [rsp], m15
   2491 %endmacro
   2492 
   2493 INV_TXFM_16X16_FN dct, dct
   2494 INV_TXFM_16X16_FN dct, adst
   2495 INV_TXFM_16X16_FN dct, flipadst
   2496 INV_TXFM_16X16_FN dct, identity
   2497
        ; 16x16 inverse DCT, 8bpc. Pass 1 (rows): 16-point IDCT (.main),
        ; downscale by pw_8192, then a full 16x16 word transpose done as two
        ; 8x16 halves, spilling through the 32*3-byte stack frame.
        ; .pass1_end / .pass1_end2 / .pass1_end3 are shared tail entries
        ; also jumped into by the adst/flipadst/identity 16x16 routines.
        ; Ends by jumping to tx2q (pass-2 entry of the column transform).
   2498 cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
   2499    ITX_16X16_LOAD_COEFS
   2500    call .main
   2501 .pass1_end:
   2502    vpbroadcastd         m1, [o(pw_8192)]
   2503    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
   2504    vextracti128 [rsp+16*5], m8, 1
   2505    mova         [rsp+16*1], xm8
   2506 .pass1_end2:
   2507    vextracti128 [rsp+16*4], m0, 1
   2508    mova         [rsp+16*0], xm0
   2509    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
   2510    pmulhrsw             m1, [rsp+32*1]
        ; Swap 128-bit lanes so each ymm holds matching row halves before
        ; the word/dword/qword interleave stages below.
   2511    vperm2i128           m8, m1, m9, 0x31
   2512    vinserti128          m1, xm9, 1
   2513    vperm2i128           m9, m2, m10, 0x31
   2514    vinserti128          m2, xm10, 1
   2515    vperm2i128          m10, m3, m11, 0x31
   2516    vinserti128          m3, xm11, 1
   2517    vperm2i128          m11, m4, m12, 0x31
   2518    vinserti128          m4, xm12, 1
   2519    vperm2i128          m12, m5, m13, 0x31
   2520    vinserti128          m5, xm13, 1
   2521    vperm2i128          m13, m6, m14, 0x31
   2522    vinserti128          m6, xm14, 1
   2523    vperm2i128          m14, m7, m15, 0x31
   2524    vinserti128          m7, xm15, 1
   2525    mova                m15, [rsp+32*2]
   2526 .pass1_end3:
        ; Transpose the high 8x16 half (m8-m15) ...
   2527    punpcklwd            m0, m9, m10
   2528    punpckhwd            m9, m10
   2529    punpcklwd           m10, m15, m8
   2530    punpckhwd           m15, m8
   2531    punpckhwd            m8, m11, m12
   2532    punpcklwd           m11, m12
   2533    punpckhwd           m12, m13, m14
   2534    punpcklwd           m13, m14
   2535    punpckhdq           m14, m11, m13
   2536    punpckldq           m11, m13
   2537    punpckldq           m13, m15, m9
   2538    punpckhdq           m15, m9
   2539    punpckldq            m9, m10, m0
   2540    punpckhdq           m10, m0
   2541    punpckhdq            m0, m8, m12
   2542    punpckldq            m8, m12
   2543    punpcklqdq          m12, m13, m8
   2544    punpckhqdq          m13, m8
   2545    punpcklqdq           m8, m9, m11
   2546    punpckhqdq           m9, m11
   2547    punpckhqdq          m11, m10, m14
   2548    punpcklqdq          m10, m14
   2549    punpcklqdq          m14, m15, m0
   2550    punpckhqdq          m15, m0
   2551    mova                 m0, [rsp]
   2552    mova              [rsp], m15
        ; ... then the low 8x16 half (m0-m7).
   2553    punpckhwd           m15, m4, m5
   2554    punpcklwd            m4, m5
   2555    punpckhwd            m5, m0, m1
   2556    punpcklwd            m0, m1
   2557    punpckhwd            m1, m6, m7
   2558    punpcklwd            m6, m7
   2559    punpckhwd            m7, m2, m3
   2560    punpcklwd            m2, m3
   2561    punpckhdq            m3, m0, m2
   2562    punpckldq            m0, m2
   2563    punpckldq            m2, m4, m6
   2564    punpckhdq            m4, m6
   2565    punpckhdq            m6, m5, m7
   2566    punpckldq            m5, m7
   2567    punpckldq            m7, m15, m1
   2568    punpckhdq           m15, m1
   2569    punpckhqdq           m1, m0, m2
   2570    punpcklqdq           m0, m2
   2571    punpcklqdq           m2, m3, m4
   2572    punpckhqdq           m3, m4
   2573    punpcklqdq           m4, m5, m7
   2574    punpckhqdq           m5, m7
   2575    punpckhqdq           m7, m6, m15
   2576    punpcklqdq           m6, m15
   2577    jmp                tx2q
        ; Pass 2 (columns) + store: round by pw_2048 and write the 16x16
        ; block four rows at a time with WRITE_16X2, zeroing the coefficient
        ; buffer in between. .end/.end2/.end3 are shared store entries used
        ; by the other 16x16 routines as well; m6 is parked on the stack
        ; because WRITE_16X2 needs scratch registers.
   2578 .pass2:
   2579    call .main
   2580 .end:
   2581    vpbroadcastd         m1, [o(pw_2048)]
   2582    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
   2583    mova              [rsp], m6
   2584 .end2:
   2585    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
   2586    pmulhrsw             m1, [rsp+32*1]
   2587    lea                  r3, [strideq*3]
   2588    WRITE_16X2            0,  1,  6,  0, strideq*0, strideq*1
   2589    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
   2590    lea                dstq, [dstq+strideq*4]
   2591    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
   2592    WRITE_16X2        [rsp],  7,  0,  1, strideq*2, r3
   2593 .end3:
   2594    pxor                 m2, m2
   2595    REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
   2596    lea                dstq, [dstq+strideq*4]
   2597    WRITE_16X2            8,  9,  0,  1, strideq*0, strideq*1
   2598    WRITE_16X2           10, 11,  0,  1, strideq*2, r3
   2599    REPX {mova [cq+32*x], m2},  0,  1,  2,  3,  4,  5,  6,  7
   2600    lea                dstq, [dstq+strideq*4]
   2601    WRITE_16X2           12, 13,  0,  1, strideq*0, strideq*1
   2602    WRITE_16X2           14, 15,  0,  1, strideq*2, r3
   2603    RET
   2604 ALIGN function_align
        ; 16-point IDCT: IDCT8_1D over the even inputs (m0,m2,...,m14) plus
        ; IDCT16_1D_ODDHALF over the odd inputs, then the final
        ; add/sub butterfly pairing tmpN with the odd-half results to form
        ; out0-out15. Register pressure forces three stack spill slots;
        ; the gprsize offset skips the return address pushed by call.
   2605 cglobal_label .main
   2606    vpbroadcastd        m15, [o(pd_2048)]
   2607    mova [rsp+gprsize+32*1], m1
   2608    mova [rsp+gprsize+32*2], m9
   2609    IDCT8_1D              0,  2,  4,  6,  8, 10, 12, 14,  1,  9, 15
   2610    mova                 m1, [rsp+gprsize+32*2] ; in9
   2611    mova [rsp+gprsize+32*2], m14 ; tmp7
   2612    mova                 m9, [rsp+gprsize+32*1] ; in1
   2613    mova [rsp+gprsize+32*1], m10 ; tmp5
   2614    mova                m14, [rsp+gprsize+32*0] ; in15
   2615    mova [rsp+gprsize+32*0], m6  ; tmp3
   2616    IDCT16_1D_ODDHALF     9,  3,  5,  7,  1, 11, 13, 14,  6, 10, 15
   2617    mova                 m6, [rsp+gprsize+32*1] ; tmp5
   2618    psubsw              m15, m0, m14  ; out15
   2619    paddsw               m0, m14      ; out0
   2620    psubsw              m14, m2, m13  ; out14
   2621    paddsw               m2, m13      ; out1
   2622    mova [rsp+gprsize+32*1], m2
   2623    psubsw              m13, m4, m11  ; out13
   2624    paddsw               m2, m4, m11  ; out2
   2625    psubsw              m11, m8, m7   ; out11
   2626    paddsw               m4, m8, m7   ; out4
   2627    mova                 m7, [rsp+gprsize+32*2] ; tmp7
   2628    psubsw              m10, m6, m5   ; out10
   2629    paddsw               m5, m6       ; out5
   2630    psubsw               m8, m7, m9   ; out8
   2631    paddsw               m7, m9       ; out7
   2632    psubsw               m9, m12, m3  ; out9
   2633    paddsw               m6, m12, m3  ; out6
   2634    mova                 m3, [rsp+gprsize+32*0] ; tmp3
   2635    psubsw              m12, m3, m1   ; out12
   2636    paddsw               m3, m1       ; out3
   2637    ret
   2638 
   2639 INV_TXFM_16X16_FN adst, dct
   2640 INV_TXFM_16X16_FN adst, adst
   2641 INV_TXFM_16X16_FN adst, flipadst
   2642
        ; 16x16 inverse ADST, 8bpc (no identity pairing in AV1 for this
        ; size). Pass 1: main + main_pass1_end leave m1 = pw_8192 and four
        ; outputs parked in the (already consumed) coefficient buffer;
        ; scale the positive-sign outputs here, negate m1 for the others,
        ; and join idct_16x16's shared transpose at .pass1_end2.
   2643 cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
   2644    ITX_16X16_LOAD_COEFS
   2645    call .main
   2646    call .main_pass1_end
   2647    pmulhrsw             m0, m1, [cq+32*0]
   2648    pmulhrsw             m2, m1, [cq+32*1]
   2649    REPX   {pmulhrsw x, m1}, m4, m6, m8, m10
   2650    pmulhrsw            m12, m1, [cq+32*2]
   2651    pmulhrsw            m14, m1, [cq+32*3]
   2652    vextracti128 [rsp+16*5], m8, 1
   2653    mova         [rsp+16*1], xm8
   2654    pxor                 m8, m8
   2655    psubw                m1, m8, m1 ; m1 = -pw_8192 for the negated outputs
   2656    jmp m(idct_16x16_internal_8bpc).pass1_end2
   2657 ALIGN function_align
        ; Pass 2: main + cheap 16-bit finish (main_pass2_end leaves
        ; m1 = pw_2048); scale the positive outputs, negate m1 for the
        ; rest, and join idct_16x16's store path at .end2.
   2658 .pass2:
   2659    call .main
   2660    call .main_pass2_end
   2661    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
   2662    mova         [rsp+32*0], m6
   2663    pxor                 m6, m6
   2664    psubw                m1, m6, m1 ; m1 = -pw_2048
   2665    jmp m(idct_16x16_internal_8bpc).end2
   2666 ALIGN function_align
        ; 16-point ADST main: four ITX_MULSUB_2W rotation stages with the
        ; AV1 basis constants, interleaved add/sub butterflies. in15 comes
        ; from the stack slot written by ITX_16X16_LOAD_COEFS; three stack
        ; slots spill intermediates across the stages. Output signs
        ; alternate as annotated (outN vs -outN); the tN terms left in
        ; registers are finished by main_pass1_end / main_pass2_end.
   2667 cglobal_label .main
   2668    vpbroadcastd        m15, [o(pd_2048)]
   2669    mova [rsp+gprsize+32*1], m0
   2670    mova [rsp+gprsize+32*2], m4
   2671    ITX_MULSUB_2W        13,  2,  0,  4, 15,  995, 3973 ; t3,  t2
   2672    ITX_MULSUB_2W         9,  6,  0,  4, 15, 2440, 3290 ; t7,  t6
   2673    ITX_MULSUB_2W         5, 10,  0,  4, 15, 3513, 2106 ; t11, t10
   2674    ITX_MULSUB_2W         1, 14,  0,  4, 15, 4052,  601 ; t15, t14
   2675    psubsw               m0, m2, m10  ; t10a
   2676    paddsw               m2, m10      ; t2a
   2677    psubsw              m10, m13, m5  ; t11a
   2678    paddsw              m13, m5       ; t3a
   2679    psubsw               m5, m6, m14  ; t14a
   2680    paddsw               m6, m14      ; t6a
   2681    psubsw              m14, m9, m1   ; t15a
   2682    paddsw               m9, m1       ; t7a
   2683    ITX_MULSUB_2W         0, 10,  1,  4, 15, 3406, 2276 ; t11, t10
   2684    ITX_MULSUB_2W        14,  5,  1,  4, 15, 2276, 3406 ; t14, t15
   2685    psubsw               m1, m10, m14 ; t14a
   2686    paddsw              m10, m14      ; t10a
   2687    psubsw              m14, m0, m5   ; t15a
   2688    paddsw               m0, m5       ; t11a
   2689    psubsw               m5, m2, m6   ; t6
   2690    paddsw               m2, m6       ; t2
   2691    psubsw               m6, m13, m9  ; t7
   2692    paddsw              m13, m9       ; t3
   2693    ITX_MULSUB_2W         6,  5,  4,  9, 15, 3784, 1567 ; t6a, t7a
   2694    ITX_MULSUB_2W        14,  1,  4,  9, 15, 3784, 1567 ; t14, t15
   2695    mova                 m9, [rsp+gprsize+32*0] ; in15
   2696    mova [rsp+gprsize+32*0], m10 ; t10a
   2697    mova                 m4, [rsp+gprsize+32*1] ; in0
   2698    mova [rsp+gprsize+32*1], m6  ; t6a
   2699    mova                 m6, [rsp+gprsize+32*2] ; in4
   2700    mova [rsp+gprsize+32*2], m2  ; t2
   2701    ITX_MULSUB_2W         9,  4,  2, 10, 15,  201, 4091 ; t1,  t0
   2702    ITX_MULSUB_2W        11,  6,  2, 10, 15, 1751, 3703 ; t5,  t4
   2703    ITX_MULSUB_2W         7,  8,  2, 10, 15, 3035, 2751 ; t9,  t8
   2704    ITX_MULSUB_2W         3, 12,  2, 10, 15, 3857, 1380 ; t13, t12
   2705    psubsw              m10, m4, m8  ; t8a
   2706    paddsw               m8, m4      ; t0a
   2707    psubsw               m4, m9, m7  ; t9a
   2708    paddsw               m9, m7      ; t1a
   2709    psubsw               m7, m6, m12 ; t12a
   2710    paddsw               m6, m12     ; t4a
   2711    psubsw              m12, m11, m3 ; t13a
   2712    paddsw              m11, m3      ; t5a
   2713    ITX_MULSUB_2W        10,  4,  2,  3, 15,  799, 4017 ; t9,  t8
   2714    ITX_MULSUB_2W        12,  7,  2,  3, 15, 4017,  799 ; t12, t13
   2715    psubsw               m3, m9, m11 ; t5
   2716    paddsw               m9, m11     ; t1
   2717    psubsw              m11, m4, m12 ; t12a
   2718    paddsw               m4, m12     ; t8a
   2719    paddsw              m12, m8, m6  ; t0
   2720    psubsw               m8, m6      ; t4
   2721    paddsw               m6, m10, m7 ; t9a
   2722    psubsw              m10, m7      ; t13a
   2723    ITX_MULSUB_2W         8,  3,  2,  7, 15, 1567, 3784 ; t5a, t4a
   2724    ITX_MULSUB_2W        11, 10,  2,  7, 15, 1567, 3784 ; t13, t12
   2725    mova                 m7, [rsp+gprsize+32*0] ; t10a
   2726    mova                 m2, [rsp+gprsize+32*1] ; t6a
   2727    paddsw              m15, m9, m13  ; -out15
   2728    psubsw               m9, m13      ;  t3a
   2729    paddsw              m13, m11, m1  ; -out13
   2730    psubsw              m11, m1       ;  t15a
   2731    psubsw               m1, m4, m7   ;  t10
   2732    paddsw               m7, m4       ; -out1
   2733    psubsw               m4, m3, m2   ;  t6
   2734    paddsw               m3, m2       ; -out3
   2735    paddsw               m2, m10, m14 ;  out2
   2736    psubsw              m10, m14      ;  t14a
   2737    paddsw              m14, m6, m0   ;  out14
   2738    psubsw               m6, m0       ;  t11
   2739    mova                 m0, [rsp+gprsize+32*2] ; t2
   2740    mova [rsp+gprsize+32*1], m7
   2741    psubsw               m7, m12, m0  ;  t2a
   2742    paddsw               m0, m12      ;  out0
   2743    paddsw              m12, m8, m5   ;  out12
   2744    psubsw               m8, m5       ;  t7
   2745    ret
   2746 ALIGN function_align
        ; Pass-1 finish: compute out4..out11 at full 32-bit precision via
        ; pmaddwd against the packed (2896,2896)/(-2896,2896) constants,
        ; rounding with pd_2048 and >>12. The four finished even outputs
        ; (m0/m2/m12/m14) are parked in the coefficient buffer to free
        ; registers; the caller reloads them. Leaves m1 = pw_8192.
        ; (Output labels below match .main_pass2_end: m9 = -out9, m6 = out6;
        ; the last two annotations previously duplicated -out7/out8.)
   2747 .main_pass1_end:
   2748    mova          [cq+32*0], m0
   2749    mova          [cq+32*1], m2
   2750    mova          [cq+32*2], m12
   2751    mova          [cq+32*3], m14
   2752    vpbroadcastd        m14, [pw_m2896_2896]
   2753    vpbroadcastd        m12, [pw_2896_2896]
   2754    vpbroadcastd         m2, [pd_2048]
   2755    punpcklwd            m5, m11, m10
   2756    punpckhwd           m11, m10
   2757    pmaddwd             m10, m14, m5
   2758    pmaddwd              m0, m14, m11
   2759    pmaddwd              m5, m12
   2760    pmaddwd             m11, m12
   2761    REPX      {paddd x, m2}, m10, m0, m5, m11
   2762    REPX      {psrad x, 12}, m10, m0, m5, m11
   2763    packssdw            m10, m0  ;  out10
   2764    packssdw             m5, m11 ; -out5
   2765    punpcklwd           m11, m8, m4
   2766    punpckhwd            m8, m4
   2767    pmaddwd              m4, m12, m11
   2768    pmaddwd              m0, m12, m8
   2769    pmaddwd             m11, m14
   2770    pmaddwd              m8, m14
   2771    REPX      {paddd x, m2}, m4, m0, m11, m8
   2772    REPX      {psrad x, 12}, m4, m0, m11, m8
   2773    packssdw             m4, m0  ;  out4
   2774    packssdw            m11, m8  ; -out11
   2775    punpcklwd            m8, m9, m7
   2776    punpckhwd            m9, m7
   2777    pmaddwd              m7, m12, m8
   2778    pmaddwd              m0, m12, m9
   2779    pmaddwd              m8, m14
   2780    pmaddwd              m9, m14
   2781    REPX      {paddd x, m2}, m7, m0, m8, m9
   2782    REPX      {psrad x, 12}, m7, m0, m8, m9
   2783    packssdw             m7, m0  ; -out7
   2784    packssdw             m8, m9  ;  out8
   2785    punpckhwd            m0, m6, m1
   2786    punpcklwd            m6, m1
   2787    pmaddwd              m1, m14, m0
   2788    pmaddwd              m9, m14, m6
   2789    pmaddwd              m0, m12
   2790    pmaddwd              m6, m12
   2791    REPX      {paddd x, m2}, m1, m9, m0, m6
   2792    REPX      {psrad x, 12}, m1, m9, m0, m6
   2793    packssdw             m9, m1  ; -out9
   2794    packssdw             m6, m0  ;  out6
   2795    vpbroadcastd         m1, [o(pw_8192)]
   2796    ret
   2797 ALIGN function_align
        ; Pass-2 finish: 16-bit saturating version of main_pass1_end.
        ; Leaves m1 = pw_2048 for the caller's final rounding.
   2798 cglobal_label .main_pass2_end
   2799    ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
   2800    ; 16-bit here will produce the same result as using 32-bit intermediates.
   2801    paddsw               m5, m10, m11 ; -out5
   2802    psubsw              m10, m11      ;  out10
   2803    psubsw              m11, m4, m8   ; -out11
   2804    paddsw               m4, m8       ;  out4
   2805    psubsw               m8, m7, m9   ;  out8
   2806    paddsw               m7, m9       ; -out7
   2807    psubsw               m9, m1, m6   ; -out9
   2808    paddsw               m6, m1       ;  out6
   2809    vpbroadcastd         m1, [o(pw_2896x8)]
   2810    REPX   {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
   2811    vpbroadcastd         m1, [o(pw_2048)]
   2812    ret
   2813 
   2814 INV_TXFM_16X16_FN flipadst, dct
   2815 INV_TXFM_16X16_FN flipadst, adst
   2816 INV_TXFM_16X16_FN flipadst, flipadst
   2817
        ; 16x16 inverse flip-ADST, 8bpc. Pass 1: same ADST math as
        ; iadst_16x16, but rows are scaled and lane-swapped into reversed
        ; order (m0 = -pw_8192 applies the sign flip to the negated
        ; outputs) before joining idct_16x16's transpose at .pass1_end3.
   2818 cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
   2819    ITX_16X16_LOAD_COEFS
   2820    call m(iadst_16x16_internal_8bpc).main
   2821    call m(iadst_16x16_internal_8bpc).main_pass1_end
   2822    pmulhrsw             m6, m1
   2823    pmulhrsw             m2, m1, m8
   2824    mova         [rsp+32*2], m6
   2825    pmulhrsw             m6, m1, m4
   2826    pmulhrsw             m4, m1, m10
   2827    pmulhrsw             m8, m1, [cq+32*3]
   2828    pmulhrsw            m10, m1, [cq+32*2]
   2829    pmulhrsw            m12, m1, [cq+32*1]
   2830    pmulhrsw            m14, m1, [cq+32*0]
   2831    pxor                 m0, m0
   2832    psubw                m0, m1 ; m0 = -pw_8192
   2833    REPX   {pmulhrsw x, m0}, m3, m5, m7, m11, m15
   2834    pmulhrsw             m1, m0, m9
   2835    pmulhrsw             m9, m0, m13
   2836    pmulhrsw             m0, [rsp+32*1]
   2837    mova         [rsp+16*0], xm15
   2838    mova         [rsp+16*1], xm7
        ; Lane swaps pair the rows in reversed order for the flip.
   2839    vperm2i128          m15, m15, m7, 0x31
   2840    vinserti128          m7, m2, xm14, 1
   2841    vperm2i128          m14, m2, m14, 0x31
   2842    vinserti128          m2, m9, xm5, 1
   2843    vperm2i128           m9, m9, m5, 0x31
   2844    vinserti128          m5, m4, xm12, 1
   2845    vperm2i128          m12, m4, m12, 0x31
   2846    vinserti128          m4, m11, xm3, 1
   2847    vperm2i128          m11, m11, m3, 0x31
   2848    vinserti128          m3, m10, xm6, 1
   2849    vperm2i128          m10, m10, m6, 0x31
   2850    vinserti128          m6, m1, xm0, 1
   2851    vperm2i128          m13, m1, m0, 0x31
   2852    vinserti128          m1, m8, [rsp+32*2], 1
   2853    vperm2i128           m8, m8, [rsp+32*2], 0x31
   2854    jmp m(idct_16x16_internal_8bpc).pass1_end3
        ; Pass 2: ADST columns, then scale (m1 = pw_2048, m0 = -pw_2048)
        ; while renaming registers so the 16 rows store bottom-to-top; the
        ; first eight rows are written here, the rest via idct_16x16's
        ; shared .end3 store/zeroing tail.
   2855 .pass2:
   2856    call m(iadst_16x16_internal_8bpc).main
   2857    call m(iadst_16x16_internal_8bpc).main_pass2_end
   2858    pmulhrsw             m0, m1
   2859    pmulhrsw             m8, m1
   2860    mova         [rsp+32*0], m0
   2861    mova         [rsp+32*2], m8
   2862    pxor                 m0, m0
   2863    psubw                m0, m1 ; m0 = -pw_2048
   2864    pmulhrsw             m8, m0, m7
   2865    pmulhrsw             m7, m0, m9
   2866    pmulhrsw             m9, m1, m6
   2867    pmulhrsw             m6, m1, m10
   2868    pmulhrsw            m10, m0, m5
   2869    pmulhrsw             m5, m0, m11
   2870    pmulhrsw            m11, m1, m4
   2871    pmulhrsw             m4, m1, m12
   2872    pmulhrsw            m12, m0, m3
   2873    pmulhrsw             m3, m0, m13
   2874    pmulhrsw            m13, m1, m2
   2875    pmulhrsw             m1, m14
   2876    pmulhrsw            m14, m0, [rsp+32*1]
   2877    pmulhrsw             m0, m15
   2878    lea                  r3, [strideq*3]
   2879    WRITE_16X2            0,  1,  2,  0, strideq*0, strideq*1
   2880    mova                m15, [rsp+32*0]
   2881    WRITE_16X2            3,  4,  0,  1, strideq*2, r3
   2882    lea                dstq, [dstq+strideq*4]
   2883    WRITE_16X2            5,  6,  0,  1, strideq*0, strideq*1
   2884    WRITE_16X2            7, [rsp+32*2],  0,  1, strideq*2, r3
   2885    jmp m(idct_16x16_internal_8bpc).end3
   2886 
        ; Identity-16 scale with the pass-1 pw_8192 rounding folded in:
        ; equivalent to IDTX16(x)/4, i.e. x*(1 + 1697/4096)/2 computed as
        ; pavgw(x, (x*pw_1697x16) >> 1). pavgw rounds the halving; it is
        ; only correct because both halves have the same sign.
   2887 %macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
   2888    pmulhrsw            m%2, m%3, m%1
   2889    psraw               m%2, 1
   2890    pavgw               m%1, m%2 ; signs are guaranteed to be equal
   2891 %endmacro
   2892 
   2893 INV_TXFM_16X16_FN identity, dct
   2894 INV_TXFM_16X16_FN identity, identity
   2895
        ; 16x16 identity transform, 8bpc. Pass 1: load rows with their two
        ; 8-column halves merged into one ymm each, apply IDTX16B (identity
        ; scale + pw_8192 rounding folded in), and join idct_16x16's
        ; transpose at .pass1_end3. Rows [cq-16*4] and [cq-16*2] are
        ; processed after the REPX because m6/m7 are needed as scratch for
        ; the first batch; the last row expands IDTX16B manually since its
        ; destination register (m7) doubles as the constant holder.
   2896 cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
   2897    vpbroadcastd         m7, [o(pw_1697x16)]
   2898    mova                xm0, [cq+16* 0]
   2899    vinserti128          m0, [cq+16*16], 1
   2900    mova               xm15, [cq+16* 1]
   2901    vinserti128         m15, [cq+16*17], 1
   2902    mova                xm1, [cq+16* 2]
   2903    vinserti128          m1, [cq+16*18], 1
   2904    mova                xm8, [cq+16* 3]
   2905    vinserti128          m8, [cq+16*19], 1
   2906    mova                xm2, [cq+16* 4]
   2907    vinserti128          m2, [cq+16*20], 1
   2908    mova                xm9, [cq+16* 5]
   2909    vinserti128          m9, [cq+16*21], 1
   2910    mova                xm3, [cq+16* 6]
   2911    vinserti128          m3, [cq+16*22], 1
   2912    mova               xm10, [cq+16* 7]
   2913    add                  cq, 16*16
   2914    vinserti128         m10, [cq+16* 7], 1
   2915    mova                xm4, [cq-16* 8]
   2916    vinserti128          m4, [cq+16* 8], 1
   2917    mova               xm11, [cq-16* 7]
   2918    vinserti128         m11, [cq+16* 9], 1
   2919    mova                xm5, [cq-16* 6]
   2920    vinserti128          m5, [cq+16*10], 1
   2921    mova               xm12, [cq-16* 5]
   2922    vinserti128         m12, [cq+16*11], 1
   2923    mova               xm13, [cq-16* 3]
   2924    vinserti128         m13, [cq+16*13], 1
   2925    mova               xm14, [cq-16* 1]
   2926    vinserti128         m14, [cq+16*15], 1
   2927    REPX  {IDTX16B x, 6, 7},  0, 15,  1,  8,  2,  9,  3, \
   2928                             10,  4, 11,  5, 12, 13, 14
   2929    mova                xm6, [cq-16* 4]
   2930    vinserti128          m6, [cq+16*12], 1
   2931    mova              [rsp], m0
   2932    IDTX16B               6, 0, 7
   2933    mova                xm0, [cq-16* 2]
   2934    vinserti128          m0, [cq+16*14], 1
   2935    pmulhrsw             m7, m0 ; IDTX16B inlined: m7 still holds pw_1697x16
   2936    psraw                m7, 1
   2937    pavgw                m7, m0
   2938    jmp m(idct_16x16_internal_8bpc).pass1_end3
   2939 ALIGN function_align
        ; Pass 2: plain IDTX16 on all 16 rows; m0/m1 rotate through the two
        ; stack slots to free a temp register. The final row applies the
        ; identity scale with explicit paddsw (2x + x*1697/2048) and falls
        ; into idct_16x16's .end for pw_2048 rounding and store.
   2940 .pass2:
   2941    vpbroadcastd        m15, [o(pw_1697x16)]
   2942    mova         [rsp+32*1], m0
   2943    REPX  {IDTX16 x, 0, 15},  1,  2,  3,  4,  5,  6,  7, \
   2944                              8,  9, 10, 11, 12, 13, 14
   2945    mova                 m0, [rsp+32*1]
   2946    mova         [rsp+32*1], m1
   2947    IDTX16                0, 1, 15
   2948    mova                 m1, [rsp+32*0]
   2949    pmulhrsw            m15, m1
   2950    paddsw               m1, m1
   2951    paddsw              m15, m1
   2952    jmp m(idct_16x16_internal_8bpc).end
   2953 
   2954 %define o_base deint_shuf + 128
   2955 
   2956 %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
   2957 %if %3
   2958    vpbroadcastd        m15, [o(pw_2896x8)]
   2959    pmulhrsw             m0, m15, [%1+%2*0]
   2960    pmulhrsw             m1, m15, [%1+%2*1]
   2961    pmulhrsw             m2, m15, [%1+%2*2]
   2962    pmulhrsw             m3, m15, [%1+%2*3]
   2963    pmulhrsw             m4, m15, [%1+%2*4]
   2964    pmulhrsw             m5, m15, [%1+%2*5]
   2965    pmulhrsw             m6, m15, [%1+%2*6]
   2966    pmulhrsw             m7, m15, [%1+%2*7]
   2967 %else
   2968    mova                 m0, [%1+%2*0]
   2969    mova                 m1, [%1+%2*1]
   2970    mova                 m2, [%1+%2*2]
   2971    mova                 m3, [%1+%2*3]
   2972    mova                 m4, [%1+%2*4]
   2973    mova                 m5, [%1+%2*5]
   2974    mova                 m6, [%1+%2*6]
   2975    mova                 m7, [%1+%2*7]
   2976 %endif
   2977 %endmacro
   2978 
   2979 %macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
   2980 %if %3
   2981 %if %3 == 1
   2982    vpbroadcastd        m15, [o(pw_2896x8)]
   2983 %endif
   2984    pmulhrsw             m8, m15, [%1+%2*0]
   2985    pmulhrsw             m9, m15, [%1+%2*1]
   2986    pmulhrsw            m10, m15, [%1+%2*2]
   2987    pmulhrsw            m11, m15, [%1+%2*3]
   2988    pmulhrsw            m12, m15, [%1+%2*4]
   2989    pmulhrsw            m13, m15, [%1+%2*5]
   2990    pmulhrsw            m14, m15, [%1+%2*6]
   2991    pmulhrsw            m15,      [%1+%2*7]
   2992 %else
   2993    mova                 m8, [%1+%2*0]
   2994    mova                 m9, [%1+%2*1]
   2995    mova                m10, [%1+%2*2]
   2996    mova                m11, [%1+%2*3]
   2997    mova                m12, [%1+%2*4]
   2998    mova                m13, [%1+%2*5]
   2999    mova                m14, [%1+%2*6]
   3000    mova                m15, [%1+%2*7]
   3001 %endif
   3002 %endmacro
   3003 
   3004 %macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
   3005    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
   3006    punpcklwd           m%1, m%2, m%2
   3007    pmulhrsw            m%1, m%3
   3008    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
   3009    punpckhwd           m%2, m%2
   3010    pmulhrsw            m%2, m%3
   3011 %endmacro
   3012 
   3013 cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
   3014    lea                  r6, [o_base]
   3015    test               eobd, eobd
   3016    jz .dconly
   3017    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
   3018    %undef cmp
   3019    cmp                eobd, 106
   3020    jle .fast
   3021    LOAD_8ROWS      cq+32*1, 32*2
   3022    call m(idct_16x8_internal_8bpc).main
   3023    vperm2i128          m11, m0, m4, 0x31
   3024    vinserti128          m0, xm4, 1
   3025    vperm2i128           m4, m1, m5, 0x31
   3026    vinserti128          m1, xm5, 1
   3027    vperm2i128           m5, m2, m6, 0x31
   3028    vinserti128          m2, xm6, 1
   3029    vperm2i128           m6, m3, m7, 0x31
   3030    vinserti128          m3, xm7, 1
   3031    pxor                 m7, m7
   3032    REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
   3033    punpckhwd            m7, m0, m1
   3034    punpcklwd            m0, m1
   3035    punpckhwd            m1, m2, m3
   3036    punpcklwd            m2, m3
   3037    punpcklwd            m3, m11, m4
   3038    punpckhwd           m11, m4
   3039    punpckhwd            m4, m5, m6
   3040    punpcklwd            m5, m6
   3041    punpckhdq            m6, m0, m2
   3042    punpckldq            m0, m2
   3043    punpckldq            m2, m3, m5
   3044    punpckhdq            m3, m5
   3045    punpckhdq            m5, m11, m4
   3046    punpckldq           m11, m4
   3047    punpckldq            m4, m7, m1
   3048    punpckhdq            m7, m1
   3049    punpckhqdq          m12, m6, m0
   3050    punpcklqdq           m0, m6     ; out4
   3051    punpckhqdq          m13, m7, m4
   3052    punpcklqdq           m4, m7     ; out5
   3053    punpckhqdq          m14, m3, m2
   3054    punpcklqdq           m2, m3     ; out6
   3055    punpckhqdq          m15, m5, m11
   3056    punpcklqdq          m11, m5     ; out7
   3057    mova         [rsp+32*0], m0
   3058    mova         [rsp+32*1], m4
   3059    mova         [rsp+32*2], m2
   3060 .fast:
   3061    LOAD_8ROWS      cq+32*0, 32*2
   3062    call m(idct_16x8_internal_8bpc).main
   3063    vperm2i128           m8, m0, m4, 0x31
   3064    vinserti128          m0, xm4, 1
   3065    vperm2i128           m4, m1, m5, 0x31
   3066    vinserti128          m1, xm5, 1
   3067    vperm2i128           m5, m2, m6, 0x31
   3068    vinserti128          m2, xm6, 1
   3069    vperm2i128           m6, m3, m7, 0x31
   3070    vinserti128          m3, xm7, 1
   3071    vpbroadcastd         m9, [o(pw_8192)]
   3072    pxor                 m7, m7
   3073    REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
   3074    punpckhwd            m7, m0, m1
   3075    punpcklwd            m0, m1
   3076    punpckhwd            m1, m2, m3
   3077    punpcklwd            m2, m3
   3078    punpckhwd            m3, m8, m4
   3079    punpcklwd            m8, m4
   3080    punpckhwd            m4, m5, m6
   3081    punpcklwd            m5, m6
   3082    punpckhdq            m6, m0, m2
   3083    punpckldq            m0, m2
   3084    punpckldq            m2, m8, m5
   3085    punpckhdq            m8, m5
   3086    punpckhdq            m5, m3, m4
   3087    punpckldq            m3, m4
   3088    punpckhdq            m4, m7, m1
   3089    punpckldq            m7, m1
   3090    punpcklqdq           m1, m7, m4
   3091    punpckhqdq           m7, m4     ; out9
   3092    punpckhqdq           m4, m2, m8 ; out10
   3093    punpcklqdq           m2, m8
   3094    punpckhqdq           m8, m3, m5
   3095    punpcklqdq           m3, m5
   3096    punpckhqdq           m5, m0, m6 ; out8
   3097    punpcklqdq           m0, m6
   3098    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7
   3099    cmp                eobd, 106
   3100    jg .full
   3101    mova         [rsp+32*0], m5
   3102    mova         [rsp+32*1], m7
   3103    mova         [rsp+32*2], m4
   3104    pmulhrsw            m11, m9, m8
   3105    pxor                 m4, m4
   3106    REPX       {mova x, m4}, m5, m6, m7
   3107    call .main_fast
   3108    jmp .pass2
   3109 .dconly:
   3110    movd                xm1, [o(pw_2896x8)]
   3111    pmulhrsw            xm0, xm1, [cq]
   3112    movd                xm2, [o(pw_8192)]
   3113    mov                [cq], eobd
   3114    or                  r3d, 32
   3115    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
   3116 .full:
   3117    REPX   {pmulhrsw x, m9}, m12, m13, m14, m15
   3118    pmulhrsw             m6, m9, [rsp+32*2]
   3119    mova         [rsp+32*2], m4
   3120    pmulhrsw             m4, m9, [rsp+32*0]
   3121    mova         [rsp+32*0], m5
   3122    pmulhrsw             m5, m9, [rsp+32*1]
   3123    mova         [rsp+32*1], m7
   3124    pmulhrsw             m7, m9, m11
   3125    pmulhrsw            m11, m9, m8
   3126    call .main
   3127 .pass2:
   3128    vpbroadcastd        m12, [o(pw_2048)]
   3129    REPX  {pmulhrsw x, m12}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   3130                             m8,  m9,  m10, m11,      m13, m14, m15
   3131    pmulhrsw            m12, [rsp]
   3132    REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14
   3133    REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
   3134    mova         [rsp+32*0], m4
   3135    mova         [rsp+32*1], m6
   3136    lea                  r3, [strideq*3]
   3137    WRITE_8X4             0,  1,  4,  6
   3138    lea                dstq, [dstq+strideq*4]
   3139    WRITE_8X4             2,  3,  4,  6
   3140    lea                dstq, [dstq+strideq*4]
   3141    WRITE_8X4    [rsp+32*0],  5,  4,  6
   3142    lea                dstq, [dstq+strideq*4]
   3143    WRITE_8X4    [rsp+32*1],  7,  4,  6
   3144    lea                dstq, [dstq+strideq*4]
   3145    WRITE_8X4             8,  9,  4,  6
   3146    lea                dstq, [dstq+strideq*4]
   3147    WRITE_8X4            10, 11,  4,  6
   3148    lea                dstq, [dstq+strideq*4]
   3149    WRITE_8X4            12, 13,  4,  6
   3150    lea                dstq, [dstq+strideq*4]
   3151    WRITE_8X4            14, 15,  4,  6
   3152    RET
   3153 ALIGN function_align
   3154 cglobal_label .main_fast ; bottom half is zero
   3155    call m(idct_8x16_internal_8bpc).main
   3156    mova                 m8, [rsp+gprsize+0*32]
   3157    mova [rsp+gprsize+0*32], m0
   3158    mova                 m9, [rsp+gprsize+1*32]
   3159    mova [rsp+gprsize+1*32], m1
   3160    mova                 m0, [rsp+gprsize+2*32]
   3161    mova [rsp+gprsize+2*32], m6
   3162    lea                  r5, [r6-(o_base)+pw_201_4091x8]
   3163    ITX_UNPACK_MULHRSW    1,  8,  6,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
   3164    ITX_UNPACK_MULHRSW   15,  9,  6,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
   3165    ITX_UNPACK_MULHRSW   14,  0,  6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
   3166    ITX_UNPACK_MULHRSW   13, 11,  6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
   3167    jmp .main2
   3168 ALIGN function_align
   3169 cglobal_label .main
   3170    call m(idct_8x16_internal_8bpc).main
   3171    mova                 m8, [rsp+gprsize+0*32]
   3172    mova [rsp+gprsize+0*32], m0
   3173    mova                 m9, [rsp+gprsize+1*32]
   3174    mova [rsp+gprsize+1*32], m1
   3175    mova                 m0, [rsp+gprsize+2*32]
   3176    mova [rsp+gprsize+2*32], m6
   3177    punpcklwd            m1, m15, m8  ; in31 in1
   3178    punpckhwd            m8, m15      ; in3  in29
   3179    punpcklwd           m15, m14, m9  ; in27 in5
   3180    punpckhwd            m9, m14      ; in7  in25
   3181    punpcklwd           m14, m13, m0  ; in23 in9
   3182    punpckhwd            m0, m13      ; in11 in21
   3183    punpcklwd           m13, m12, m11 ; in19 in13
   3184    punpckhwd           m11, m12      ; in15 in17
   3185    ITX_MUL2X_PACK        1,  6, 12, 10,  201, 4091, 3 ; t16a, t31a
   3186    ITX_MUL2X_PACK        8,  6, 12, 10, 4052,  601, 3 ; t23a, t24a
   3187    ITX_MUL2X_PACK       15,  6, 12, 10,  995, 3973, 3 ; t20a, t27a
   3188    ITX_MUL2X_PACK        9,  6, 12, 10, 3857, 1380, 3 ; t19a, t28a
   3189    ITX_MUL2X_PACK       14,  6, 12, 10, 1751, 3703, 3 ; t18a, t29a
   3190    ITX_MUL2X_PACK        0,  6, 12, 10, 3513, 2106, 3 ; t21a, t26a
   3191    ITX_MUL2X_PACK       13,  6, 12, 10, 2440, 3290, 3 ; t22a, t25a
   3192    ITX_MUL2X_PACK       11,  6, 12, 10, 3035, 2751, 3 ; t17a, t30a
   3193 .main2:
   3194    psubsw               m6, m1, m11  ; t17 t30
   3195    paddsw               m1, m11      ; t16 t31
   3196    psubsw              m11, m9, m14  ; t18 t29
   3197    paddsw               m9, m14      ; t19 t28
   3198    psubsw              m14, m15, m0  ; t21 t26
   3199    paddsw              m15, m0       ; t20 t27
   3200    psubsw               m0, m8, m13  ; t22 t25
   3201    paddsw               m8, m13      ; t23 t24
   3202    ITX_MUL2X_PACK        6, 12, 13, 10,   799, 4017, 3 ; t17a t30a
   3203    ITX_MUL2X_PACK       11, 12, 13, 10, m4017,  799, 3 ; t18a t29a
   3204    ITX_MUL2X_PACK       14, 12, 13, 10,  3406, 2276, 3 ; t21a t26a
   3205    ITX_MUL2X_PACK        0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
   3206    psubsw              m13, m1, m9   ; t19a t28a
   3207    paddsw               m1, m9       ; t16a t31a
   3208    psubsw               m9, m8, m15  ; t20a t27a
   3209    paddsw               m8, m15      ; t23a t24a
   3210    psubsw              m15, m6, m11  ; t18  t29
   3211    paddsw               m6, m11      ; t17  t30
   3212    psubsw              m11, m0, m14  ; t21  t26
   3213    paddsw               m0, m14      ; t22  t25
   3214    ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 3 ; t18a t29a
   3215    ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 3 ; t19  t28
   3216    ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 3 ; t20  t27
   3217    ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
   3218    vbroadcasti128      m12, [o(deint_shuf)]
   3219    psubsw              m14, m1, m8   ; t23  t24
   3220    paddsw               m1, m8       ; t16  t31
   3221    psubsw               m8, m6, m0   ; t22a t25a
   3222    paddsw               m6, m0       ; t17a t30a
   3223    psubsw               m0, m15, m11 ; t21  t26
   3224    paddsw              m15, m11      ; t18  t29
   3225    psubsw              m11, m13, m9  ; t20a t27a
   3226    paddsw              m13, m9       ; t19a t28a
   3227    REPX    {pshufb x, m12}, m1, m6, m15, m13
   3228    ITX_MUL2X_PACK       14,  9, 12, 10, 2896, 2896 ; t24a t23a
   3229    vpbroadcastd         m9, [o(pw_m2896_2896)]
   3230    ITX_MUL2X_PACK        8, 12,  _, 10, 12,  9, 4  ; t22  t25
   3231    vpbroadcastd        m12, [o(pw_2896_2896)]
   3232    ITX_MUL2X_PACK        0, 12,  _, 10, 12,  9, 4  ; t21a t26a
   3233    vpbroadcastd        m12, [o(pw_2896_2896)]
   3234    ITX_MUL2X_PACK       11,  9,  _, 10,  9, 12, 4  ; t27  t20
   3235    shufps               m9, m14, m8, q1032 ; t23a t22
   3236    vpblendd            m14, m8, 0xcc       ; t24a t25
   3237    shufps               m8, m11, m0, q1032 ; t20  t21a
   3238    vpblendd            m11, m0, 0xcc       ; t27  t26a
   3239    punpcklqdq           m0, m1, m6   ; t16  t17a
   3240    punpckhqdq           m1, m6       ; t31  t30a
   3241    psubsw              m10, m5, m8   ; out20 out21
   3242    paddsw               m5, m8       ; out11 out10
   3243    psubsw               m6, m3, m14  ; out24 out25
   3244    paddsw               m3, m14      ; out7  out6
   3245    psubsw               m8, m7, m0   ; out16 out17
   3246    paddsw               m7, m0       ; out15 out14
   3247    mova                 m0, [rsp+gprsize+0*32]
   3248    punpcklqdq          m12, m13, m15 ; t19a t18
   3249    punpckhqdq          m13, m15      ; t28a t29
   3250    psubsw              m15, m0, m1   ; out31 out30
   3251    paddsw               m0, m1       ; out0  out1
   3252    mova                 m1, [rsp+gprsize+1*32]
   3253    mova [rsp+gprsize+0*32], m6
   3254    mova                 m6, [rsp+gprsize+2*32]
   3255    psubsw              m14, m1, m13  ; out28 out29
   3256    paddsw               m1, m13      ; out3  out2
   3257    psubsw              m13, m2, m11  ; out27 out26
   3258    paddsw               m2, m11      ; out4  out5
   3259    psubsw              m11, m4, m9   ; out23 out22
   3260    paddsw               m4, m9       ; out8  out9
   3261    psubsw               m9, m6, m12  ; out19 out18
   3262    paddsw               m6, m12      ; out12 out13
   3263    ret
   3264 
   3265 %macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
   3266    vbroadcasti128      m%1, [cq+16*%3]
   3267    vbroadcasti128      m%2, [cq+16*%4]
   3268    shufpd              m%1, m%2, 0x0c
   3269 %endmacro
   3270 
   3271 cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
   3272    lea                  r6, [o_base]
   3273    test               eobd, eobd
   3274    jnz .normal
   3275    movd                xm1, [o(pw_2896x8)]
   3276    pmulhrsw            xm0, xm1, [cq]
   3277    movd                xm2, [o(pw_8192)]
   3278    mov                [cq], eobd
   3279    or                  r3d, 8
   3280 .dconly:
   3281    pmulhrsw            xm0, xm2
   3282    movd                xm2, [pw_2048] ; intentionally rip-relative
   3283    pmulhrsw            xm0, xm1
   3284    pmulhrsw            xm0, xm2
   3285    vpbroadcastw         m0, xm0
   3286    pxor                 m3, m3
   3287 .dconly_loop:
   3288    mova                 m1, [dstq]
   3289    punpckhbw            m2, m1, m3
   3290    punpcklbw            m1, m3
   3291    paddw                m2, m0
   3292    paddw                m1, m0
   3293    packuswb             m1, m2
   3294    mova             [dstq], m1
   3295    add                dstq, strideq
   3296    dec                 r3d
   3297    jg .dconly_loop
   3298    RET
   3299 .normal:
   3300    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
   3301    %undef cmp
   3302    LOAD_PACKED_16X2      0,  7,  0,  2 ; in0  in2
   3303    LOAD_PACKED_16X2      4,  7,  1,  3 ; in1  in3
   3304    LOAD_PACKED_16X2      1,  7,  4,  6 ; in4  in6
   3305    LOAD_PACKED_16X2      5,  7,  5,  7 ; in5  in7
   3306    pxor                 m8, m8
   3307    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
   3308    add                  cq, 16*16
   3309    LOAD_PACKED_16X2      2,  7, -8, -6 ; in8  in10
   3310    LOAD_PACKED_16X2      6,  7, -7, -5 ; in9  in11
   3311    LOAD_PACKED_16X2      3,  7, -4, -2 ; in12 in14
   3312    LOAD_PACKED_16X2     11,  7, -3, -1 ; in13 in15
   3313    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
   3314    mova         [rsp+32*0], m4
   3315    mova         [rsp+32*1], m5
   3316    mova         [rsp+32*2], m6
   3317    cmp                eobd, 106
   3318    jg .full
   3319    pxor                 m4, m4
   3320    REPX       {mova x, m4}, m5, m6, m7
   3321    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
   3322    jmp .pass2
   3323 .full:
   3324    LOAD_PACKED_16X2      4,  7,  0,  2 ; in16 in18
   3325    LOAD_PACKED_16X2     12,  7,  3,  1 ; in19 in17
   3326    LOAD_PACKED_16X2      5,  7,  4,  6 ; in20 in22
   3327    LOAD_PACKED_16X2     13,  7,  7,  5 ; in23 in21
   3328    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
   3329    add                  cq, 16*8
   3330    LOAD_PACKED_16X2      6,  7,  0,  2 ; in24 in26
   3331    LOAD_PACKED_16X2     14,  7,  3,  1 ; in27 in25
   3332    LOAD_PACKED_16X2      7,  8,  4,  6 ; in28 in30
   3333    LOAD_PACKED_16X2     15,  8,  7,  5 ; in31 in29
   3334    pxor                 m8, m8
   3335    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
   3336    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
   3337 .pass2:
   3338    vpbroadcastd        m12, [o(pw_8192)]
   3339    REPX  {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
   3340    mova         [rsp+32*1], m9
   3341    mova         [rsp+32*2], m10
   3342    punpckhwd            m9, m0, m2
   3343    punpcklwd            m0, m2
   3344    punpckhwd            m2, m1, m3
   3345    punpcklwd            m1, m3
   3346    punpcklwd           m10, m4, m6
   3347    punpckhwd            m4, m6
   3348    punpcklwd            m6, m5, m7
   3349    punpckhwd            m5, m7
   3350    punpckhwd            m3, m0, m9
   3351    punpcklwd            m0, m9
   3352    punpckhwd            m9, m2, m1
   3353    punpcklwd            m2, m1
   3354    punpcklwd            m7, m10, m4
   3355    punpckhwd           m10, m4
   3356    punpcklwd            m4, m5, m6
   3357    punpckhwd            m5, m6
   3358    punpckhdq            m1, m0, m2
   3359    punpckldq            m0, m2
   3360    punpckldq            m2, m3, m9
   3361    punpckhdq            m3, m9
   3362    punpckldq            m6, m7, m4
   3363    punpckhdq            m7, m4
   3364    punpckldq            m9, m10, m5
   3365    punpckhdq           m10, m5
   3366    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
   3367    pmulhrsw            m12, [rsp+32*0]
   3368    mova         [rsp+32*0], m8
   3369    vperm2i128           m4, m0, m6, 0x31
   3370    vinserti128          m0, xm6, 1
   3371    vperm2i128           m5, m1, m7, 0x31
   3372    vinserti128          m1, xm7, 1
   3373    vperm2i128           m6, m2, m9, 0x31
   3374    vinserti128          m2, xm9, 1
   3375    vperm2i128           m7, m3, m10, 0x31
   3376    vinserti128          m3, xm10, 1
   3377    call m(idct_16x8_internal_8bpc).main
   3378    vpbroadcastd         m8, [o(pw_2048)]
   3379    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
   3380    lea                  r2, [strideq*3]
   3381    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
   3382    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
   3383    lea                  r3, [dstq+strideq*4]
   3384    %define dstq r3
   3385    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
   3386    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
   3387    mova                 m0, [rsp+32*0]
   3388    mova                 m1, [rsp+32*1]
   3389    mova                 m2, [rsp+32*2]
   3390    punpckhwd            m7, m0, m2
   3391    punpcklwd            m0, m2
   3392    punpckhwd            m2, m1, m11
   3393    punpcklwd            m1, m11
   3394    punpckhwd            m4, m12, m14
   3395    punpcklwd           m12, m14
   3396    punpckhwd            m5, m13, m15
   3397    punpcklwd           m13, m15
   3398    punpckhwd            m3, m0, m7
   3399    punpcklwd            m0, m7
   3400    punpckhwd            m9, m2, m1
   3401    punpcklwd            m2, m1
   3402    punpcklwd            m7, m12, m4
   3403    punpckhwd           m12, m4
   3404    punpcklwd            m4, m5, m13
   3405    punpckhwd            m5, m13
   3406    punpckhdq            m1, m0, m2
   3407    punpckldq            m0, m2
   3408    punpckldq            m2, m3, m9
   3409    punpckhdq            m3, m9
   3410    punpckldq            m6, m7, m4
   3411    punpckhdq            m7, m4
   3412    punpckldq            m9, m12, m5
   3413    punpckhdq           m12, m5
   3414    vperm2i128           m4, m0, m6, 0x31
   3415    vinserti128          m0, xm6, 1
   3416    vperm2i128           m5, m1, m7, 0x31
   3417    vinserti128          m1, xm7, 1
   3418    vperm2i128           m6, m2, m9, 0x31
   3419    vinserti128          m2, xm9, 1
   3420    vperm2i128           m7, m3, m12, 0x31
   3421    vinserti128          m3, xm12, 1
   3422    call m(idct_16x8_internal_8bpc).main2
   3423    vpbroadcastd         m8, [o(pw_2048)]
   3424    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
   3425    add                  r0, 16
   3426    add                  r3, 16
   3427    %define dstq r0
   3428    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
   3429    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
   3430    %define dstq r3
   3431    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
   3432    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
   3433    RET
   3434 
   3435 cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob
   3436    vpbroadcastd         m9, [pw_5]
   3437    lea                  r4, [strideq*3]
   3438    sub                eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
   3439 .loop:
   3440    mova                xm0,[cq+16* 0]
   3441    mova                xm1, [cq+16* 4]
   3442    vinserti128          m0, [cq+16* 1], 1
   3443    vinserti128          m1, [cq+16* 5], 1
   3444    pxor                 m8, m8
   3445    mova          [cq+32*0], m8
   3446    mova          [cq+32*2], m8
   3447    add                  cq, 16*16
   3448    mova                xm2, [cq-16* 8]
   3449    mova                xm3, [cq-16* 4]
   3450    vinserti128          m2, [cq-16* 7], 1
   3451    vinserti128          m3, [cq-16* 3], 1
   3452    mova                xm4, [cq+16* 0]
   3453    mova                xm5, [cq+16* 4]
   3454    vinserti128          m4, [cq+16* 1], 1
   3455    vinserti128          m5, [cq+16* 5], 1
   3456    mova                xm6, [cq+16* 8]
   3457    mova                xm7, [cq+16*12]
   3458    vinserti128          m6, [cq+16* 9], 1
   3459    vinserti128          m7, [cq+16*13], 1
   3460    REPX {mova [cq+32*x], m8}, -4, -2,  0,  2,  4,  6
   3461    REPX  {paddsw    x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
   3462    call .transpose8x8
   3463    REPX  {psraw     x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
   3464    WRITE_8X4             0,  4,  8, 10, strideq*8, strideq*4, r4*4
   3465    add                dstq, strideq
   3466    WRITE_8X4             1,  5,  0,  4, strideq*8, strideq*4, r4*4
   3467    add                dstq, strideq
   3468    WRITE_8X4             2,  6,  0,  4, strideq*8, strideq*4, r4*4
   3469    add                dstq, strideq
   3470    WRITE_8X4             3,  7,  0,  4, strideq*8, strideq*4, r4*4
   3471    add                dstq, strideq
   3472    sub                  cq, 16*16-32
   3473    lea                dstq, [dstq+r4*4]
   3474    add                eobd, 0x80000000
   3475    jnc .loop
   3476    RET
   3477 ALIGN function_align
   3478 .transpose8x8:
   3479    punpckhwd            m8, m4, m5
   3480    punpcklwd            m4, m5
   3481    punpckhwd            m5, m0, m1
   3482    punpcklwd            m0, m1
   3483    punpckhwd            m1, m6, m7
   3484    punpcklwd            m6, m7
   3485    punpckhwd            m7, m2, m3
   3486    punpcklwd            m2, m3
   3487    punpckhdq            m3, m0, m2
   3488    punpckldq            m0, m2
   3489    punpckldq            m2, m4, m6
   3490    punpckhdq            m4, m6
   3491    punpckhdq            m6, m5, m7
   3492    punpckldq            m5, m7
   3493    punpckldq            m7, m8, m1
   3494    punpckhdq            m8, m1
   3495    punpckhqdq           m1, m0, m2
   3496    punpcklqdq           m0, m2
   3497    punpcklqdq           m2, m3, m4
   3498    punpckhqdq           m3, m4
   3499    punpcklqdq           m4, m5, m7
   3500    punpckhqdq           m5, m7
   3501    punpckhqdq           m7, m6, m8
   3502    punpcklqdq           m6, m8
   3503    ret
   3504 
   3505 cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob
   3506    add                  cq, 16*8
   3507    vpbroadcastd         m9, [pw_4096]
   3508    lea                  r4, [strideq*3]
   3509    lea                  r5, [dstq+strideq*4]
   3510    sub                eobd, 107
   3511 .loop:
   3512    mova                xm0, [cq-16*8]
   3513    mova                xm1, [cq-16*7]
   3514    vinserti128          m0, [cq+16*0], 1
   3515    vinserti128          m1, [cq+16*1], 1
   3516    mova                xm2, [cq-16*6]
   3517    mova                xm3, [cq-16*5]
   3518    vinserti128          m2, [cq+16*2], 1
   3519    vinserti128          m3, [cq+16*3], 1
   3520    mova                xm4, [cq-16*4]
   3521    mova                xm5, [cq-16*3]
   3522    vinserti128          m4, [cq+16*4], 1
   3523    vinserti128          m5, [cq+16*5], 1
   3524    mova                xm6, [cq-16*2]
   3525    mova                xm7, [cq-16*1]
   3526    vinserti128          m6, [cq+16*6], 1
   3527    vinserti128          m7, [cq+16*7], 1
   3528    pxor                 m8, m8
   3529    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1,  0,  1,  2,  3
   3530    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
   3531    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
   3532    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
   3533    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
   3534    %define dstq r5
   3535    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
   3536    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
   3537    add                  cq, 16*16
   3538    add                  r0, 16
   3539    add                  r5, 16
   3540    add                eobd, 0x80000000
   3541    jnc .loop
   3542    RET
   3543 
   3544 %define o_base pw_5 + 128
   3545 
   3546 %macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
   3547 %if %3
   3548    vpbroadcastd        m15, [o(pw_2896x8)]
   3549    pmulhrsw             m0, m15, [%1+%2* 0]
   3550    pmulhrsw             m1, m15, [%1+%2* 1]
   3551    pmulhrsw             m2, m15, [%1+%2* 2]
   3552    pmulhrsw             m3, m15, [%1+%2* 3]
   3553    pmulhrsw             m4, m15, [%1+%2* 4]
   3554    pmulhrsw             m5, m15, [%1+%2* 5]
   3555    pmulhrsw             m6, m15, [%1+%2* 6]
   3556    pmulhrsw             m7, m15, [%1+%2* 7]
   3557    pmulhrsw             m8, m15, [%1+%2* 8]
   3558    pmulhrsw             m9, m15, [%1+%2* 9]
   3559    pmulhrsw            m10, m15, [%1+%2*10]
   3560    pmulhrsw            m11, m15, [%1+%2*11]
   3561    pmulhrsw            m12, m15, [%1+%2*12]
   3562    pmulhrsw            m13, m15, [%1+%2*13]
   3563    pmulhrsw            m14, m15, [%1+%2*14]
   3564    pmulhrsw            m15,      [%1+%2*15]
   3565 %else
   3566    mova                 m0, [%1+%2* 0]
   3567    mova                 m1, [%1+%2* 1]
   3568    mova                 m2, [%1+%2* 2]
   3569    mova                 m3, [%1+%2* 3]
   3570    mova                 m4, [%1+%2* 4]
   3571    mova                 m5, [%1+%2* 5]
   3572    mova                 m6, [%1+%2* 6]
   3573    mova                 m7, [%1+%2* 7]
   3574    mova                 m8, [%1+%2* 8]
   3575    mova                 m9, [%1+%2* 9]
   3576    mova                m10, [%1+%2*10]
   3577    mova                m11, [%1+%2*11]
   3578    mova                m12, [%1+%2*12]
   3579    mova                m13, [%1+%2*13]
   3580    mova                m14, [%1+%2*14]
   3581    mova                m15, [%1+%2*15]
   3582 %endif
   3583    mova              [rsp], m15
   3584 %if %4
   3585    pxor                m15, m15
   3586    REPX {mova [%1+%2*x], m15}, 0,  1,  2,  3,  4,  5,  6,  7, \
   3587                                8,  9, 10, 11, 12, 13, 14, 15
   3588 %endif
   3589 %endmacro
   3590 
   3591 %macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
   3592    mova                m%4, [%2]
   3593    paddsw              m%3, m%1, m%4
   3594    psubsw              m%1, m%4
   3595    pmovzxbw            m%4, [dstq+%6]
   3596    pmulhrsw            m%3, m%5
   3597    pmulhrsw            m%1, m%5
   3598    paddw               m%3, m%4
   3599    pmovzxbw            m%4, [r2+%7]
   3600    paddw               m%1, m%4
   3601    packuswb            m%3, m%1
   3602    vpermq              m%3, m%3, q3120
   3603    mova          [dstq+%6], xm%3
   3604    vextracti128    [r2+%7], m%3, 1
   3605 %endmacro
   3606 
   3607 cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob
        ; 16x32 inverse DCT-DCT add, 8 bpc. Fast paths: eob == 0 -> DC-only;
        ; eob <= 150 -> only the first 16 coefficient rows are processed
        ; (.main_oddhalf_fast with the upper half assumed zero).
   3608    lea                  r6, [o_base]
   3609    test               eobd, eobd
   3610    jz .dconly
        ; Re-enter with full register/stack frame: 16 XMM/YMM regs, 32*35 bytes
        ; of scratch, and extra named GPRs tmp1..tmp3.
   3611    PROLOGUE              0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
   3612                                           base, tmp3
   3613    %undef cmp
        ; Pass 1 on even input rows: 16x16 IDCT, then transpose and round.
   3614    LOAD_16ROWS          cq, 64, 1
   3615    call m(idct_16x16_internal_8bpc).main
   3616    lea               tmp1q, [rsp+32*7]
   3617    lea               tmp2q, [tmp1q+32*8]
   3618    lea               tmp3q, [tmp1q+32*16]
   3619    mova                 m1, [rsp+32*1]
   3620    mova         [rsp+32*0], m6
   3621    mova         [rsp+32*1], m7
   3622    vpbroadcastd         m7, [o(pw_16384)]
   3623    call .transpose_2x8x8_round
   3624    mova                m15, [rsp+32*0]
        ; Spill the even-register halves of the transposed block to tmp3q,
        ; 128 bits at a time (low half via mova, high half via vextracti128).
   3625    mova         [tmp3q-32*4+ 0], xm0
   3626    vextracti128 [tmp3q+32*0+ 0], m0, 1
   3627    mova         [tmp3q-32*3+ 0], xm2
   3628    vextracti128 [tmp3q+32*1+ 0], m2, 1
   3629    mova         [tmp3q-32*2+ 0], xm4
   3630    vextracti128 [tmp3q+32*2+ 0], m4, 1
   3631    mova         [tmp3q-32*1+ 0], xm6
   3632    vextracti128 [tmp3q+32*3+ 0], m6, 1
   3633    mova         [tmp3q-32*4+16], xm8
   3634    vextracti128 [tmp3q+32*0+16], m8, 1
   3635    mova         [tmp3q-32*3+16], xm10
   3636    vextracti128 [tmp3q+32*1+16], m10, 1
   3637    mova         [tmp3q-32*2+16], xm12
   3638    vextracti128 [tmp3q+32*2+16], m12, 1
   3639    mova         [tmp3q-32*1+16], xm14
   3640    vextracti128 [tmp3q+32*3+16], m14, 1
   3641    cmp                eobd, 150
   3642    jg .full
        ; eob <= 150: odd-numbered registers are recombined across 128-bit
        ; lanes and fed to the fast odd-half (upper 16 rows treated as zero).
   3643    vinserti128          m0, m1, xm9, 1
   3644    vperm2i128           m4, m1, m9, 0x31
   3645    vinserti128          m2, m5, xm13, 1
   3646    vperm2i128           m6, m5, m13, 0x31
   3647    vinserti128          m1, m3, xm11, 1
   3648    vperm2i128           m5, m3, m11, 0x31
   3649    vinserti128          m3, m7, xm15, 1
   3650    vperm2i128           m7, m7, m15, 0x31
   3651    call .main_oddhalf_fast
   3652    pxor                 m8, m8
   3653    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 ; upper half = 0
   3654    jmp .idct16
   3655 .dconly:
        ; DC-only path: scale the single DC coefficient twice by 2896/32768,
        ; clear the stored eob, and tail-call the shared 16-wide dconly writer
        ; with r3d = 32 rows.
   3656    movd                xm1, [o(pw_2896x8)]
   3657    pmulhrsw            xm0, xm1, [cq]
   3658    movd                xm2, [o(pw_16384)]
   3659    mov                [cq], eobd
   3660    pmulhrsw            xm0, xm1
   3661    or                  r3d, 32
   3662    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
   3663 .full:
        ; eob > 150: keep the odd registers of the first pass in tmp1q, run a
        ; second 16x16 IDCT on the remaining coefficient columns (cq+32).
   3664    mova       [tmp1q-32*4], m1
   3665    mova       [tmp1q-32*3], m3
   3666    mova       [tmp1q-32*2], m5
   3667    mova       [tmp1q-32*1], m7
   3668    mova       [tmp1q+32*0], m9
   3669    mova       [tmp1q+32*1], m11
   3670    mova       [tmp1q+32*2], m13
   3671    mova       [tmp1q+32*3], m15
   3672    LOAD_16ROWS       cq+32, 64, 1
   3673    call m(idct_16x16_internal_8bpc).main
   3674    lea                  r2, [tmp3q+32*8]
   3675    mova                 m1, [rsp+32*1]
   3676    mova         [rsp+32*0], m6
   3677    mova         [rsp+32*1], m7
   3678    vpbroadcastd         m7, [o(pw_16384)]
   3679    call .transpose_2x8x8_round
   3680    mova                m15, [rsp+32*0]
        ; Spill the even halves of the second transpose to r2, same layout as
        ; the tmp3q spill above.
   3681    mova         [r2-32*4+ 0], xm0
   3682    vextracti128 [r2+32*0+ 0], m0, 1
   3683    mova         [r2-32*3+ 0], xm2
   3684    vextracti128 [r2+32*1+ 0], m2, 1
   3685    mova         [r2-32*2+ 0], xm4
   3686    vextracti128 [r2+32*2+ 0], m4, 1
   3687    mova         [r2-32*1+ 0], xm6
   3688    vextracti128 [r2+32*3+ 0], m6, 1
   3689    mova         [r2-32*4+16], xm8
   3690    vextracti128 [r2+32*0+16], m8, 1
   3691    mova         [r2-32*3+16], xm10
   3692    vextracti128 [r2+32*1+16], m10, 1
   3693    mova         [r2-32*2+16], xm12
   3694    vextracti128 [r2+32*2+16], m12, 1
   3695    mova         [r2-32*1+16], xm14
   3696    vextracti128 [r2+32*3+16], m14, 1
        ; Interleave: build the 16 odd-half inputs (m0-m15) by pairing the
        ; first-pass rows reloaded from tmp1q with the live second-pass rows,
        ; merging 128-bit lanes via vinserti128/vperm2i128.
   3697    vinserti128          m8, m1, xm9, 1
   3698    vperm2i128          m12, m1, m9, 0x31
   3699    mova                xm0, [tmp1q-32*4]
   3700    mova                xm1, [tmp1q-32*3]
   3701    vinserti128          m0, [tmp1q+32*0], 1
   3702    vinserti128          m1, [tmp1q+32*1], 1
   3703    vinserti128         m10, m5, xm13, 1
   3704    vperm2i128          m14, m5, m13, 0x31
   3705    mova                xm4, [tmp1q-32*4+16]
   3706    mova                xm5, [tmp1q-32*3+16]
   3707    vinserti128          m4, [tmp1q+32*0+16], 1
   3708    vinserti128          m5, [tmp1q+32*1+16], 1
   3709    vinserti128          m9, m3, xm11, 1
   3710    vperm2i128          m13, m3, m11, 0x31
   3711    mova                xm2, [tmp1q-32*2]
   3712    mova                xm3, [tmp1q-32*1]
   3713    vinserti128          m2, [tmp1q+32*2], 1
   3714    vinserti128          m3, [tmp1q+32*3], 1
   3715    vinserti128         m11, m7, xm15, 1
   3716    vperm2i128          m15, m7, m15, 0x31
   3717    mova                xm6, [tmp1q-32*2+16]
   3718    mova                xm7, [tmp1q-32*1+16]
   3719    vinserti128          m6, [tmp1q+32*2+16], 1
   3720    vinserti128          m7, [tmp1q+32*3+16], 1
   3721    call .main_oddhalf
   3722    LOAD_8ROWS_H    r2-32*4, 32
   3723 .idct16:
        ; Pass 2: 16x16 IDCT over the even half, then write out combined with
        ; the odd half via .pass2_end. r2 = dstq + 19*stride addresses the
        ; mirrored bottom rows; r3 = 3*stride.
   3724    LOAD_8ROWS   tmp3q-32*4, 32
   3725    mova              [rsp], m15
   3726    call m(idct_16x16_internal_8bpc).main
   3727    imul                 r2, strideq, 19
   3728    lea                  r3, [strideq*3]
   3729    add                  r2, dstq
   3730    call .pass2_end
   3731    RET
   3732 ALIGN function_align
   3733 cglobal_label .main_oddhalf_fast ; lower half is zero
        ; Odd-half (t16..t31) of the IDCT32, fast variant: inputs are the first
        ; eight odd rows in m0-m6 plus spills; with the lower (high-frequency)
        ; half zero, each initial butterfly degenerates to a single pmulhrsw by
        ; a broadcast pw_*x8 cosine constant instead of a full ITX_MULSUB_2W.
        ; Falls through to the common .main2 tail. Results are stored to
        ; [tmp1q-32*4..+32*3] and [tmp2q-32*4..+32*3].
   3734    mova [rsp+gprsize+32*1], m7
   3735    pxor                 m7, m7
   3736    mova [rsp+gprsize+32*0], m7 ; in31 = 0
   3737    mova [rsp+gprsize+32*2], m7 ; in17 = 0
   3738    vpbroadcastd        m11, [o(pw_3703x8)]
   3739    vpbroadcastd         m7, [o(pw_1751x8)]
   3740    vpbroadcastd        m12, [o(pw_m1380x8)]
   3741    vpbroadcastd         m8, [o(pw_3857x8)]
   3742    vpbroadcastd        m13, [o(pw_3973x8)]
   3743    vpbroadcastd        m15, [o(pw_995x8)]
   3744    pmulhrsw            m11, m4  ; t29a
   3745    pmulhrsw             m4, m7  ; t18a
   3746    pmulhrsw            m12, m3  ; t19a
   3747    pmulhrsw             m3, m8  ; t28a
   3748    pmulhrsw            m13, m2  ; t27a
   3749    pmulhrsw             m2, m15 ; t20a
   3750    vpbroadcastd        m10, [o(pw_m2106x8)]
   3751    vpbroadcastd         m7, [o(pw_3513x8)]
   3752    vpbroadcastd         m9, [o(pw_3290x8)]
   3753    vpbroadcastd         m8, [o(pw_2440x8)]
   3754    vpbroadcastd        m14, [o(pw_m601x8)]
   3755    vpbroadcastd        m15, [o(pw_4052x8)]
   3756    pmulhrsw            m10, m5  ; t21a
   3757    pmulhrsw             m5, m7  ; t26a
   3758    pmulhrsw             m9, m6  ; t25a
   3759    pmulhrsw             m6, m8  ; t22a
   3760    pmulhrsw            m14, m1  ; t23a
   3761    pmulhrsw             m1, m15 ; t24a
   3762    vpbroadcastd        m15, [o(pd_2048)]
   3763    jmp .main2
   3764 ALIGN function_align
   3765 cglobal_label .main_oddhalf
        ; Full odd-half variant: all 16 odd input rows present (m0-m14 plus
        ; in31/in15/in17 spilled to the stack). First stage uses full rotation
        ; butterflies (ITX_MULSUB_2W) with pd_2048 rounding in m15.
   3766    mova [rsp+gprsize+32*0], m15 ; in31
   3767    mova [rsp+gprsize+32*1], m7  ; in15
   3768    mova [rsp+gprsize+32*2], m8  ; in17
   3769    vpbroadcastd        m15, [o(pd_2048)]
   3770    ITX_MULSUB_2W         4, 11,  7,  8, 15, 1751, 3703 ; t18a, t29a
   3771    ITX_MULSUB_2W        12,  3,  7,  8, 15, 3857, 1380 ; t19a, t28a
   3772    ITX_MULSUB_2W         2, 13,  7,  8, 15,  995, 3973 ; t20a, t27a
   3773    ITX_MULSUB_2W        10,  5,  7,  8, 15, 3513, 2106 ; t21a, t26a
   3774    ITX_MULSUB_2W         6,  9,  7,  8, 15, 2440, 3290 ; t22a, t25a
   3775    ITX_MULSUB_2W        14,  1,  7,  8, 15, 4052,  601 ; t23a, t24a
   3776 .main2:
        ; Shared tail: remaining butterfly stages of the odd half.
   3777    psubsw               m7, m12, m4  ; t18
   3778    paddsw              m12, m4       ; t19
   3779    psubsw               m4, m2, m10  ; t21
   3780    paddsw               m2, m10      ; t20
   3781    psubsw              m10, m14, m6  ; t22
   3782    paddsw              m14, m6       ; t23
   3783    psubsw               m6, m1, m9   ; t25
   3784    paddsw               m1, m9       ; t24
   3785    psubsw               m9, m13, m5  ; t26
   3786    paddsw              m13, m5       ; t27
   3787    psubsw               m5, m3, m11  ; t29
   3788    paddsw               m3, m11      ; t28
   3789    ITX_MULSUB_2W         5,  7,  8, 11, 15, m4017,  799 ; t18a, t29a
   3790    ITX_MULSUB_2W         9,  4,  8, 11, 15,  3406, 2276 ; t21a, t26a
   3791    ITX_MULSUB_2W         6, 10,  8, 11, 15, m2276, 3406 ; t22a, t25a
   3792    psubsw               m8, m14, m2  ; t20a
   3793    paddsw              m14, m2       ; t23a
   3794    psubsw               m2, m1, m13  ; t27a
   3795    paddsw               m1, m13      ; t24a
   3796    psubsw              m13, m6, m9   ; t21
   3797    paddsw               m6, m9       ; t22
   3798    psubsw               m9, m10, m4  ; t26
   3799    paddsw              m10, m4       ; t25
   3800    ITX_MULSUB_2W         2,  8,  4, 11, 15, m3784, 1567 ; t20,  t27
   3801    ITX_MULSUB_2W         9, 13,  4, 11, 15, m3784, 1567 ; t21a, t26a
        ; Swap intermediates with the stack slots: reload in31/in15/in17 and
        ; park t22/t23a/t24a in their place for later.
   3802    mova                 m4, [rsp+gprsize+32*0] ; in31
   3803    mova [rsp+gprsize+32*0], m6  ; t22
   3804    mova                 m6, [rsp+gprsize+32*1] ; in15
   3805    mova [rsp+gprsize+32*1], m14 ; t23a
   3806    mova                m14, [rsp+gprsize+32*2] ; in17
   3807    mova [rsp+gprsize+32*2], m1  ; t24a
   3808    ITX_MULSUB_2W         0,  4,  1, 11, 15,  201, 4091 ; t16a, t31a
   3809    ITX_MULSUB_2W        14,  6,  1, 11, 15, 3035, 2751 ; t17a, t30a
   3810    psubsw               m1, m0, m14  ; t17
   3811    paddsw               m0, m14      ; t16
   3812    psubsw              m14, m4, m6   ; t30
   3813    paddsw               m4, m6       ; t31
   3814    ITX_MULSUB_2W        14,  1,  6, 11, 15,  799, 4017 ; t17a, t30a
   3815    psubsw               m6, m0, m12  ; t19a
   3816    paddsw               m0, m12      ; t16a
   3817    psubsw              m12, m4, m3   ; t28a
   3818    paddsw               m4, m3       ; t31a
   3819    psubsw               m3, m14, m5  ; t18
   3820    paddsw              m14, m5       ; t17
   3821    psubsw               m5, m1, m7   ; t29
   3822    paddsw               m1, m7       ; t30
   3823    ITX_MULSUB_2W         5,  3,  7, 11, 15, 1567, 3784 ; t18a, t29a
   3824    ITX_MULSUB_2W        12,  6,  7, 11, 15, 1567, 3784 ; t19,  t28
   3825    psubsw               m7, m1, m10  ; t25a
   3826    paddsw               m1, m10      ; t30a
   3827    psubsw              m10, m5, m9   ; t21
   3828    paddsw               m5, m9       ; t18
   3829    psubsw               m9, m12, m2  ; t20a
   3830    paddsw              m12, m2       ; t19a
   3831    psubsw               m2, m3, m13  ; t26
   3832    paddsw               m3, m13      ; t29
   3833    psubsw              m13, m6, m8   ; t27a
   3834    paddsw               m6, m8       ; t28a
        ; Store the finished upper terms; t17..t31 land in tmp1q/tmp2q slots.
   3835    mova       [tmp1q-32*2], m5
   3836    mova       [tmp1q-32*1], m12
   3837    mova       [tmp2q+32*0], m6
   3838    mova       [tmp2q+32*1], m3
   3839    mova       [tmp2q+32*2], m1
   3840    mova                 m5, [rsp+gprsize+32*0] ; t22
   3841    mova                 m6, [rsp+gprsize+32*1] ; t23
   3842    mova                 m3, [rsp+gprsize+32*2] ; t24a
   3843    psubsw               m1, m14, m5  ; t22a
   3844    paddsw              m14, m5       ; t17a
   3845    psubsw               m5, m0, m6   ; t23
   3846    paddsw               m0, m6       ; t16
   3847    psubsw               m6, m4, m3   ; t24
   3848    paddsw               m4, m3       ; t31
        ; Last stage: sqrt(2)/2 rotations on the middle terms.
   3849    vpbroadcastd         m8, [o(pw_m2896_2896)]
   3850    vpbroadcastd         m3, [o(pw_2896_2896)]
   3851    mova       [tmp1q-32*4], m0
   3852    mova       [tmp1q-32*3], m14
   3853    mova       [tmp2q+32*3], m4
   3854    ITX_MULSUB_2W        13,  9,  0,  4, 15,  3,  8 ; t20,  t27
   3855    ITX_MULSUB_2W         2, 10,  0,  4, 15,  3,  8 ; t21a, t26a
   3856    ITX_MULSUB_2W         7,  1,  0,  4, 15,  3,  8 ; t22,  t25
   3857    ITX_MULSUB_2W         6,  5,  0,  4, 15,  3,  8 ; t23a, t24a
   3858    mova       [tmp1q+32*0], m13
   3859    mova       [tmp1q+32*1], m2
   3860    mova       [tmp1q+32*2], m7
   3861    mova       [tmp1q+32*3], m6
   3862    mova       [tmp2q-32*4], m5
   3863    mova       [tmp2q-32*3], m1
   3864    mova       [tmp2q-32*2], m10
   3865    mova       [tmp2q-32*1], m9
   3866    ret
   3867 ALIGN function_align
   3868 .transpose_2x8x8_round:
        ; Transposes two 8x8 blocks of 16-bit words (m8-m15, then m0-m7) via the
        ; standard punpck word->dword->qword ladder, multiplying every row by
        ; the rounding constant in m7 (pmulhrsw) along the way. Two stack slots
        ; ([rsp+gprsize+32*0/1]) are used to juggle m6/m7/m15 since m7 holds
        ; the constant. Clobbers m0-m15.
   3869    punpckhwd            m6, m12, m13
   3870    punpcklwd           m12, m13
   3871    punpckhwd           m13, m8, m9
   3872    punpcklwd            m8, m9
   3873    punpckhwd            m9, m14, m15
   3874    punpcklwd           m14, m15
   3875    punpckhwd           m15, m10, m11
   3876    punpcklwd           m10, m11
   3877    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 ; round lower block early
   3878    punpckhdq           m11, m8, m10
   3879    punpckldq            m8, m10
   3880    punpckldq           m10, m12, m14
   3881    punpckhdq           m12, m14
   3882    punpckhdq           m14, m13, m15
   3883    punpckldq           m13, m15
   3884    punpckldq           m15, m6, m9
   3885    punpckhdq            m6, m9
   3886    punpckhqdq           m9, m8, m10
   3887    punpcklqdq           m8, m10
   3888    punpcklqdq          m10, m11, m12
   3889    punpckhqdq          m11, m12
   3890    punpcklqdq          m12, m13, m15
   3891    punpckhqdq          m13, m15
   3892    punpckhqdq          m15, m14, m6
   3893    punpcklqdq          m14, m6
   3894    pmulhrsw             m6, m7, [rsp+gprsize+32*0] ; round spilled m6
   3895    REPX   {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15
   3896    pmulhrsw             m7, [rsp+gprsize+32*1]     ; round spilled m7
   3897    mova [rsp+gprsize+32*0], m15 ; free m15 for the second transpose
        ; Second 8x8 transpose (rows m0-m7, already rounded above).
   3898    punpckhwd           m15, m4, m5
   3899    punpcklwd            m4, m5
   3900    punpckhwd            m5, m0, m1
   3901    punpcklwd            m0, m1
   3902    punpckhwd            m1, m6, m7
   3903    punpcklwd            m6, m7
   3904    punpckhwd            m7, m2, m3
   3905    punpcklwd            m2, m3
   3906    punpckhdq            m3, m0, m2
   3907    punpckldq            m0, m2
   3908    punpckldq            m2, m4, m6
   3909    punpckhdq            m4, m6
   3910    punpckhdq            m6, m5, m7
   3911    punpckldq            m5, m7
   3912    punpckldq            m7, m15, m1
   3913    punpckhdq           m15, m1
   3914    punpckhqdq           m1, m0, m2
   3915    punpcklqdq           m0, m2
   3916    punpcklqdq           m2, m3, m4
   3917    punpckhqdq           m3, m4
   3918    punpcklqdq           m4, m5, m7
   3919    punpckhqdq           m5, m7
   3920    punpckhqdq           m7, m6, m15
   3921    punpcklqdq           m6, m15
   3922    ret
   3923 ALIGN function_align
   3924 .pass2_end:
        ; Writes all 32 output rows of pass 2. Each IDCT32_PASS2_END invocation
        ; merges one even-half row (register) with one odd-half row (memory at
        ; tmp1q/tmp2q), rounds by pw_2048 (m15), and adds/stores to two
        ; mirrored destination rows: top half advancing from dstq, bottom half
        ; retreating from r2 (dstq + 19*stride on entry; r3 = 3*stride).
        ; m7 and m15 are spilled up front so their slots can serve as scratch.
   3925    mova [rsp+gprsize+32*0], m7
   3926    mova [rsp+gprsize+32*2], m15
   3927    vpbroadcastd        m15, [o(pw_2048)]
   3928    IDCT32_PASS2_END      0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4
   3929    IDCT32_PASS2_END      4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8
   3930    IDCT32_PASS2_END      8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4
   3931    IDCT32_PASS2_END     12, tmp1q-32*1, 0, 4, 15, r3*4,      strideq*0
   3932    add                dstq, strideq
   3933    sub                  r2, strideq
   3934    mova                 m1, [rsp+gprsize+32*1]
   3935    IDCT32_PASS2_END      1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4
   3936    IDCT32_PASS2_END      5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
   3937    IDCT32_PASS2_END      9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
   3938    IDCT32_PASS2_END     13, tmp1q-32*2, 0, 4, 15, r3*4,      strideq*0
   3939    add                dstq, strideq
   3940    sub                  r2, strideq
   3941    IDCT32_PASS2_END      2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4
   3942    IDCT32_PASS2_END      6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
   3943    IDCT32_PASS2_END     10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
   3944    IDCT32_PASS2_END     14, tmp1q-32*3, 0, 4, 15, r3*4,      strideq*0
   3945    add                dstq, strideq
   3946    sub                  r2, strideq
   3947    mova                 m7, [rsp+gprsize+32*0] ; restore spilled rows
   3948    mova                 m1, [rsp+gprsize+32*2]
   3949    IDCT32_PASS2_END      3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4
   3950    IDCT32_PASS2_END      7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
   3951    IDCT32_PASS2_END     11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
   3952    IDCT32_PASS2_END      1, tmp1q-32*4, 0, 4, 15, r3*4,      strideq*0
   3953    ret
   3954 
   3955 ; Perform the final sumsub step and YMM lane shuffling
   3956 %macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
        ; Pass-1 sumsub for an even/odd row pair of the IDCT32: m%1/m%2 are
        ; combined with the odd-half rows stored at tmp2q/tmp1q, the 16-byte
        ; difference halves are written back to those slots, and the two sum
        ; rows are re-packed across 128-bit lanes into m%1/m%2 for the next
        ; pass. m%3/m%4 are temporaries.
   3957    mova                m%3, [tmp2q+32*( 3-%1)]
   3958    psubsw              m%4, m%1, m%3
   3959    paddsw              m%1, m%3
   3960    mova                m%3, [tmp1q+32*(11-%2)]
   3961    mova         [tmp1q+32*(11-%2)+16], xm%4
   3962    vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
   3963    paddsw              m%4, m%2, m%3
   3964    psubsw              m%2, m%3
   3965    mova         [tmp1q+32*(11-%2)], xm%2
   3966    vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
   3967    vperm2i128          m%2, m%1, m%4, 0x31 ; high lanes of both sums
   3968    vinserti128         m%1, xm%4, 1        ; low lanes of both sums
   3969 %endmacro
   3970 
   3971 cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob
        ; 32x16 inverse DCT-DCT add, 8 bpc. eob == 0 takes the DC-only path
        ; via the shared 32-wide dconly writer (r3d = 16 rows).
   3972    lea                  r6, [o_base]
   3973    test               eobd, eobd
   3974    jnz .normal
   3975    movd                xm1, [o(pw_2896x8)]
   3976    pmulhrsw            xm0, xm1, [cq]
   3977    movd                xm2, [o(pw_16384)]
   3978    mov                [cq], eobd
   3979    pmulhrsw            xm0, xm1
   3980    or                  r3d, 16
   3981    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
   3982 .normal:
   3983    PROLOGUE              0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
        ; Load the 16 odd coefficient rows pre-scaled by 2896/32768 and run the
        ; shared IDCT32 odd-half; results go to tmp1q/tmp2q scratch.
   3984    vpbroadcastd        m15, [o(pw_2896x8)]
   3985    pmulhrsw             m0, m15, [cq+32* 1]
   3986    pmulhrsw             m1, m15, [cq+32* 3]
   3987    pmulhrsw             m2, m15, [cq+32* 5]
   3988    pmulhrsw             m3, m15, [cq+32* 7]
   3989    pmulhrsw             m4, m15, [cq+32* 9]
   3990    pmulhrsw             m5, m15, [cq+32*11]
   3991    pmulhrsw             m6, m15, [cq+32*13]
   3992    pmulhrsw             m7, m15, [cq+32*15]
   3993    pmulhrsw             m8, m15, [cq+32*17]
   3994    pmulhrsw             m9, m15, [cq+32*19]
   3995    pmulhrsw            m10, m15, [cq+32*21]
   3996    pmulhrsw            m11, m15, [cq+32*23]
   3997    pmulhrsw            m12, m15, [cq+32*25]
   3998    pmulhrsw            m13, m15, [cq+32*27]
   3999    pmulhrsw            m14, m15, [cq+32*29]
   4000    pmulhrsw            m15,      [cq+32*31]
   4001    lea               tmp1q, [rsp+32*7]
   4002    lea               tmp2q, [tmp1q+32*8]
   4003    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
        ; Even rows -> 16x16 IDCT; the coefficient buffer is zeroed as we go
        ; (32*4 bytes per iteration, 8 iterations).
   4004    LOAD_16ROWS     cq+32*0, 32*2, 1, 0
   4005    pxor                m15, m15
   4006    mov                 r3d, 8
   4007 .zero_loop:
   4008    mova          [cq+32*0], m15
   4009    mova          [cq+32*1], m15
   4010    mova          [cq+32*2], m15
   4011    mova          [cq+32*3], m15
   4012    add                  cq, 32*4
   4013    dec                 r3d
   4014    jg .zero_loop
   4015    call m(idct_16x16_internal_8bpc).main
   4016    call .pass1_end
   4017    lea                  r2, [strideq*3]
   4018    mov                  r3, dstq ; r3 != 0 marks "left half, right half pending"
   4019 .pass2:
        ; Pass 2 for one 16-wide half: transpose + round, 16x16 IDCT, then
        ; round by pw_2048 and add/store 16x2 pixel strips.
   4020    vpbroadcastd         m7, [o(pw_16384)]
   4021    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
   4022    call m(idct_16x16_internal_8bpc).main
   4023    mova         [rsp+32*2], m15
   4024    vpbroadcastd        m15, [o(pw_2048)]
   4025    REPX  {pmulhrsw x, m15}, m2, m3, m0
   4026    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
   4027    pmulhrsw             m1, m15, [rsp+32*1]
   4028    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
   4029    lea                dstq, [dstq+strideq*4]
   4030    REPX  {pmulhrsw x, m15}, m4, m5, m6, m7
   4031    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
   4032    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
   4033    lea                dstq, [dstq+strideq*4]
   4034    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11
   4035    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
   4036    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
   4037    lea                dstq, [dstq+strideq*4]
   4038    REPX  {pmulhrsw x, m15}, m11, m12, m13, m14
   4039    pmulhrsw            m15, [rsp+32*2]
   4040    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
   4041    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
   4042    test                 r3, r3
   4043    jnz .right_half
   4044    RET
   4045 .right_half:
        ; Second iteration of .pass2 for the right 16 columns: reload rows
        ; from tmp1q/tmp2q, point dstq 16 pixels right, clear the half flag.
   4046    LOAD_8ROWS   tmp1q-32*4, 32
   4047    LOAD_8ROWS_H tmp2q-32*4, 32
   4048    lea                dstq, [r3+16]
   4049    xor                 r3d, r3d
   4050    mova         [rsp+32*0], m6
   4051    mova         [rsp+32*1], m7
   4052    jmp .pass2
   4053 ALIGN function_align
   4054 .pass1_end:
        ; Applies IDCT32_PASS1_END to all eight even/odd row pairs, spilling
        ; m9 (and later m6/m7) to stack slots so the macro has free scratch
        ; registers for the final (1,9) pair.
   4055    mova [rsp+gprsize+32*0], m9
   4056    IDCT32_PASS1_END      0,  8,  1,  9
   4057    IDCT32_PASS1_END      2, 10,  1,  9
   4058    IDCT32_PASS1_END      3, 11,  1,  9
   4059    IDCT32_PASS1_END      4, 12,  1,  9
   4060    IDCT32_PASS1_END      5, 13,  1,  9
   4061    IDCT32_PASS1_END      6, 14,  1,  9
   4062    IDCT32_PASS1_END      7, 15,  1,  9
   4063    mova                 m1, [rsp+gprsize+32*1]
   4064    mova                 m9, [rsp+gprsize+32*0]
   4065    mova [rsp+gprsize+32*0], m6
   4066    mova [rsp+gprsize+32*1], m7
   4067    IDCT32_PASS1_END      1,  9,  6,  7
   4068    ret
   4069 
   4070 cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob
        ; 16x32 identity-identity transform add, 8 bpc. The eob thresholds
        ; (43/150/278) select how many 16-column strips need processing; each
        ; loop iteration handles one 16x16 strip (8 rows as ymm pairs), scaled
        ; by 2896/32768, the x32 identity multiplier (IDTX16 with pw_1697x16),
        ; and a final pw_8192 rounding. Afterwards the coefficient buffer is
        ; zeroed, with a half-width variant when only part was consumed.
   4071 %undef cmp
   4072    lea                  r6, [o_base]
   4073    vpbroadcastd         m9, [o(pw_2896x8)]
   4074    vpbroadcastd        m10, [o(pw_1697x16)]
   4075    vpbroadcastd        m12, [o(pw_8192)]
   4076    cmp                eobd, 43   ; if (eob > 43)
   4077    setg                r4b       ;   iteration_count++
   4078    cmp                eobd, 150  ; if (eob > 150)
   4079    setg                 al       ;   iteration_count++
   4080    add                eobd, -279 ; if (eob > 278)
   4081    adc                 r4b, al   ;   iteration_count++
   4082    lea                  r3, [strideq*3]
   4083    mov                  r6, cq   ; keep buffer start for the zeroing pass
   4084    paddw               m11, m12, m12 ; pw_16384
   4085 .loop:
        ; Gather 8 ymm rows: low 128 bits from rows 0-7, high from rows 8-15.
   4086    mova                xm0, [cq+64* 0]
   4087    mova                xm1, [cq+64* 1]
   4088    vinserti128          m0, [cq+64* 8], 1
   4089    vinserti128          m1, [cq+64* 9], 1
   4090    mova                xm2, [cq+64* 2]
   4091    mova                xm3, [cq+64* 3]
   4092    vinserti128          m2, [cq+64*10], 1
   4093    vinserti128          m3, [cq+64*11], 1
   4094    mova                xm4, [cq+64* 4]
   4095    mova                xm5, [cq+64* 5]
   4096    vinserti128          m4, [cq+64*12], 1
   4097    vinserti128          m5, [cq+64*13], 1
   4098    mova                xm6, [cq+64* 6]
   4099    mova                xm7, [cq+64* 7]
   4100    vinserti128          m6, [cq+64*14], 1
   4101    vinserti128          m7, [cq+64*15], 1
   4102    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
   4103    REPX  {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
   4104    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
   4105    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   4106    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
   4107    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
   4108    lea                dstq, [dstq+strideq*4]
   4109    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
   4110    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
   4111    lea                dstq, [dstq+strideq*4]
   4112    add                  cq, 16
   4113    dec                 r4b
   4114    jge .loop
        ; Zero the consumed coefficients: if cq advanced past the saved start
        ; (r6) use the full-width loop, otherwise only half of each 64-byte
        ; row was touched.
   4115    sub                  cq, 32
   4116    pxor                 m0, m0
   4117    mov                 r0d, 8
   4118    cmp                  cq, r6
   4119    ja .zero_loop
   4120 .zero_loop_half:
   4121    mova          [r6+64*0], m0
   4122    mova          [r6+64*1], m0
   4123    add                  r6, 64*4
   4124    mova          [r6-64*2], m0
   4125    mova          [r6-64*1], m0
   4126    sub                 r0d, 2
   4127    jg .zero_loop_half
   4128    RET
   4129 .zero_loop:
   4130    mova          [r6+32*0], m0
   4131    mova          [r6+32*1], m0
   4132    mova          [r6+32*2], m0
   4133    mova          [r6+32*3], m0
   4134    add                  r6, 32*4
   4135    dec                 r0d
   4136    jg .zero_loop
   4137    RET
   4138 
   4139 cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob
        ; 32x16 identity-identity transform add, 8 bpc. Iteration count from
        ; eob thresholds (35/150); odd iterations continue down the left 16
        ; columns, then the code hops to the right 16 columns (dstq = r5+16,
        ; cq += 32*15). Scaling: 2896/32768, then doubling (identity x2 for
        ; the 16-row dimension), IDTX16 for the 32 dimension, pw_2048 round.
   4140 %undef cmp
   4141    lea                  r6, [o_base]
   4142    vpbroadcastd         m9, [o(pw_2896x8)]
   4143    vpbroadcastd        m10, [o(pw_1697x16)]
   4144    vpbroadcastd        m11, [o(pw_2048)]
   4145    cmp                eobd, 35  ; if (eob > 35)
   4146    setg                r4b      ;   iteration_count++
   4147    cmp                eobd, 150 ; if (eob > 150)
   4148    setg                r3b      ;   iteration_count += 2
   4149    lea                 r4d, [r4+r3*2]
   4150    lea                  r3, [strideq*3]
   4151    mov                  r5, dstq ; saved for the right-half hop
   4152    mov                  r6, cq   ; saved for the zeroing pass
   4153 .loop:
        ; One 16x16 strip: low lanes from rows 0-7, high lanes from rows 8-15.
   4154    mova                xm0, [cq+32* 0]
   4155    mova                xm1, [cq+32* 1]
   4156    vinserti128          m0, [cq+32* 8], 1
   4157    vinserti128          m1, [cq+32* 9], 1
   4158    mova                xm2, [cq+32* 2]
   4159    mova                xm3, [cq+32* 3]
   4160    vinserti128          m2, [cq+32*10], 1
   4161    vinserti128          m3, [cq+32*11], 1
   4162    mova                xm4, [cq+32* 4]
   4163    mova                xm5, [cq+32* 5]
   4164    vinserti128          m4, [cq+32*12], 1
   4165    vinserti128          m5, [cq+32*13], 1
   4166    mova                xm6, [cq+32* 6]
   4167    mova                xm7, [cq+32* 7]
   4168    vinserti128          m6, [cq+32*14], 1
   4169    vinserti128          m7, [cq+32*15], 1
   4170    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
   4171    REPX  {paddsw   x, x  }, m0, m1, m2, m3, m4, m5, m6, m7 ; identity *2
   4172    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
   4173    REPX  {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7
   4174    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   4175    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
   4176    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
   4177    lea                dstq, [dstq+strideq*4]
   4178    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
   4179    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
   4180    lea                dstq, [dstq+strideq*4]
   4181    add                  cq, 16
   4182    dec                 r4b
   4183    jl .ret
   4184    test                r4b, 1
   4185    jz .loop
        ; Odd remaining count: switch to the right 16 columns.
   4186    add                  cq, 32*15
   4187    lea                dstq, [r5+16]
   4188    jmp .loop
   4189 .ret:
        ; Zero the consumed part of the coefficient buffer. The byte count is
        ; derived from how far cq advanced relative to eax; NOTE(review): the
        ; exact eax contents here come from the setg/counter arithmetic above —
        ; verify against the upstream dav1d source when modifying.
   4190    sub                  cd, eax
   4191    pxor                 m0, m0
   4192    add                  cd, 384
   4193 .zero_loop:
   4194    mova          [r6+32*0], m0
   4195    mova          [r6+32*1], m0
   4196    mova          [r6+32*2], m0
   4197    mova          [r6+32*3], m0
   4198    add                  r6, 32*4
   4199    sub                  cd, 128
   4200    jge .zero_loop
   4201    RET
   4202 
   4203 cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob
        ; 32x32 inverse DCT-DCT add, 8 bpc. eob == 0 -> DC-only. Otherwise
        ; eob is biased by -136 and kept in tmp4d: negative means only the
        ; top-left 16x16 coefficients are present, enabling the fast odd-half
        ; in both passes. The sign bit of the biased eob also sequences the
        ; two pass-1 iterations (left/right coefficient halves).
   4204    lea                  r6, [o_base]
   4205    test               eobd, eobd
   4206    jnz .normal
   4207    movd                xm1, [o(pw_2896x8)]
   4208    pmulhrsw            xm0, xm1, [cq]
   4209    movd                xm2, [o(pw_8192)]
   4210    mov                [cq], eobd
   4211    or                  r3d, 32
   4212    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
   4213 .normal:
   4214    PROLOGUE              0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
   4215                                           base, tmp3, tmp4
   4216    %undef cmp
   4217    lea               tmp1q, [rsp+32*7]
   4218    lea               tmp2q, [tmp1q+32*8]
   4219    sub                eobd, 136
   4220    mov               tmp4d, eobd ; tmp4d < 0 => fast (top-left 16x16 only)
   4221 .pass1_loop:
        ; Pass 1, one 16-column half per iteration. Odd rows first (zeroed in
        ; cq as they are read), then the odd half of the IDCT32.
   4222    LOAD_8ROWS      cq+64*1, 64*2
   4223    pxor                 m8, m8
   4224    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
   4225    test              tmp4d, tmp4d
   4226    jl .fast
   4227    LOAD_8ROWS_H   cq+64*17, 64*2
   4228    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
   4229    LOAD_8ROWS_H   cq+64*16, 64*2
   4230    pxor                 m0, m0
   4231    REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
   4232                               24, 25, 26, 27, 28, 29, 30, 31
   4233    mova              [rsp], m15
   4234    jmp .idct16
   4235 .fast:
   4236    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   4237    pxor                 m8, m8
   4238    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14 ; high rows = 0
   4239    mova              [rsp], m8
   4240 .idct16:
        ; Even half: 16x16 IDCT (zeroing even rows of cq), pass-1 sumsub, then
        ; transpose/round everything with pw_8192.
   4241    LOAD_8ROWS      cq+64*0, 64*2
   4242    pxor                m15, m15
   4243    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
   4244    call m(idct_16x16_internal_8bpc).main
   4245    call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
   4246    vpbroadcastd         m7, [o(pw_8192)]
   4247    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
   4248    lea               tmp3q, [tmp1q+32*32]
   4249    mova                m15, [rsp]
        ; Store even registers at tmp3q, odd registers at tmp3q+32*8.
   4250    mova       [tmp3q-32*4], m0
   4251    mova       [tmp3q-32*3], m2
   4252    mova       [tmp3q-32*2], m4
   4253    mova       [tmp3q-32*1], m6
   4254    mova       [tmp3q+32*0], m8
   4255    mova       [tmp3q+32*1], m10
   4256    mova       [tmp3q+32*2], m12
   4257    mova       [tmp3q+32*3], m14
   4258    add               tmp3q, 32*8
   4259    mova       [tmp3q-32*4], m1
   4260    mova       [tmp3q-32*3], m3
   4261    mova       [tmp3q-32*2], m5
   4262    mova       [tmp3q-32*1], m7
   4263    mova       [tmp3q+32*0], m9
   4264    mova       [tmp3q+32*1], m11
   4265    mova       [tmp3q+32*2], m13
   4266    mova       [tmp3q+32*3], m15
        ; Round and 8x8-transpose the odd-half rows held in tmp1q/tmp2q,
        ; writing the transposed blocks back in place.
   4267    vpbroadcastd         m9, [o(pw_8192)]
   4268    pmulhrsw             m0, m9, [tmp1q-32*4]
   4269    pmulhrsw             m1, m9, [tmp1q-32*3]
   4270    pmulhrsw             m2, m9, [tmp1q-32*2]
   4271    pmulhrsw             m3, m9, [tmp1q-32*1]
   4272    pmulhrsw             m4, m9, [tmp1q+32*0]
   4273    pmulhrsw             m5, m9, [tmp1q+32*1]
   4274    pmulhrsw             m6, m9, [tmp1q+32*2]
   4275    pmulhrsw             m7, m9, [tmp1q+32*3]
   4276    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
   4277    mova       [tmp1q-32*4], m0
   4278    pmulhrsw             m0, m9, [tmp2q-32*4]
   4279    mova       [tmp2q-32*4], m1
   4280    pmulhrsw             m1, m9, [tmp2q-32*3]
   4281    mova       [tmp1q-32*3], m2
   4282    pmulhrsw             m2, m9, [tmp2q-32*2]
   4283    mova       [tmp2q-32*3], m3
   4284    pmulhrsw             m3, m9, [tmp2q-32*1]
   4285    mova       [tmp1q-32*2], m4
   4286    pmulhrsw             m4, m9, [tmp2q+32*0]
   4287    mova       [tmp2q-32*2], m5
   4288    pmulhrsw             m5, m9, [tmp2q+32*1]
   4289    mova       [tmp1q-32*1], m6
   4290    pmulhrsw             m6, m9, [tmp2q+32*2]
   4291    mova       [tmp2q-32*1], m7
   4292    pmulhrsw             m7, m9, [tmp2q+32*3]
   4293    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
   4294    mova       [tmp1q+32*0], m0
   4295    mova       [tmp2q+32*0], m1
   4296    mova       [tmp1q+32*1], m2
   4297    mova       [tmp2q+32*1], m3
   4298    mova       [tmp1q+32*2], m4
   4299    mova       [tmp2q+32*2], m5
   4300    mova       [tmp1q+32*3], m6
   4301    mova       [tmp2q+32*3], m7
        ; Advance to the right coefficient half; the 0x80000000 add flips the
        ; biased-eob sign bit, so the loop runs exactly twice.
   4302    add                  cq, 32
   4303    add               tmp1q, 32*16
   4304    add               tmp2q, 32*16
   4305    add                eobd, 0x80000000
   4306    jnc .pass1_loop
   4307    add               tmp1q, 32*24
   4308    imul                 r2, strideq, 19 ; r2 -> mirrored bottom rows
   4309    lea                  r3, [strideq*3]
   4310    add                  r2, dstq
   4311    test              tmp4d, tmp4d
   4312    jge .pass2_loop
   4313    add               tmp1q, 32*16
   4314    add               tmp2q, 32*16
   4315    add               tmp3q, 32*16
   4316 .pass2_loop:
        ; Pass 2 per 16-column half: odd half (full or fast depending on
        ; tmp4d), then 16x16 IDCT on the even half, then pass2_end writes the
        ; 32 output rows.
   4317    LOAD_8ROWS   tmp2q-32*4, 32
   4318    test              tmp4d, tmp4d
   4319    jl .fast2
   4320    LOAD_8ROWS_H tmp3q-32*4, 32
   4321    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
   4322    sub               tmp3q, 32*8
   4323    LOAD_8ROWS_H tmp3q-32*4, 32
   4324    sub               tmp3q, 32*16
   4325    jmp .pass2_loop_end
   4326 .fast2:
   4327    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   4328    sub               tmp3q, 32*24
   4329    pxor                 m8, m8
   4330    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
   4331 .pass2_loop_end:
   4332    LOAD_8ROWS   tmp3q-32*4, 32
   4333    mova              [rsp], m15
   4334    call m(idct_16x16_internal_8bpc).main
   4335    call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
        ; Loop back for the second 16-column half (dst/r2 shifted 16 right,
        ; tmp pointers rewound); exit once tmp2q passes below tmp1q-32*32.
   4336    lea               tmp3q, [tmp1q-32*32]
   4337    cmp               tmp2q, tmp3q
   4338    jb .ret
   4339    sub               tmp2q, 32*32
   4340    sub                dstq, r3
   4341    lea                  r2, [r2+r3+16]
   4342    add                dstq, 16
   4343    jmp .pass2_loop
   4344 .ret:
   4345    RET
   4346 
   ; 32x32 identity/identity inverse transform + add, 8bpc.
   ; The identity transform is a pure scale: coefficients are transposed in
   ; 8x8 sub-blocks, multiplied by pw_8192 (0.25 in .15 fixed point via
   ; pmulhrsw) and added to dst. Each .loop iteration produces one 16x8
   ; output tile; 4 tiles make a 16-wide column strip of 32 rows.
   4347 cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob
   4348    %undef cmp
   4349    vpbroadcastd         m9, [pw_8192]
   4350    sub                eobd, 136 ; if (eob < 136)
   4351    shr                eobd, 30  ;     topleft 16x16 only
   4352    lea                eobd, [eobq*2-8] ; tile counter: -8 full 32x32, -2 topleft 16x16
   4353    lea                  r4, [strideq*3]
   4354    mov                  r5, dstq  ; remember dst column 0 for the second strip
   4355    lea                  r6, [cq+32] ; base used later to tell which zeroing path to take
   4356 .loop:
   ; gather 16 rows of 8 coefficients (low/high lanes = rows n / n+8)
   4357    mova                xm0, [cq+64* 0]
   4358    mova                xm1, [cq+64* 1]
   4359    vinserti128          m0, [cq+64* 8], 1
   4360    vinserti128          m1, [cq+64* 9], 1
   4361    mova                xm2, [cq+64* 2]
   4362    mova                xm3, [cq+64* 3]
   4363    vinserti128          m2, [cq+64*10], 1
   4364    vinserti128          m3, [cq+64*11], 1
   4365    mova                xm4, [cq+64* 4]
   4366    mova                xm5, [cq+64* 5]
   4367    vinserti128          m4, [cq+64*12], 1
   4368    vinserti128          m5, [cq+64*13], 1
   4369    mova                xm6, [cq+64* 6]
   4370    mova                xm7, [cq+64* 7]
   4371    vinserti128          m6, [cq+64*14], 1
   4372    vinserti128          m7, [cq+64*15], 1
   4373    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
   4374    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
   4375    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
   4376    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
   4377    lea                dstq, [dstq+strideq*4]
   4378    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
   4379    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
   4380    lea                dstq, [dstq+strideq*4]
   4381    add                  cq, 16
   4382    inc                eobd     ; tile counter hits 0 when all tiles are done
   4383    jz .ret
   4384    test               eobd, 3  ; every 4 tiles: advance to the next 16-wide strip
   4385    jnz .loop
   4386    add                  cq, 64*15
   4387    lea                dstq, [r5+16]
   4388    jmp .loop
   4389 .ret:
   ; zero out the consumed coefficients; which loop runs depends on whether
   ; only the top-left 16x16 quadrant was present (cq still == r6-16+16*?).
   4390    pxor                 m0, m0
   4391    mov                 r0d, 16
   4392    cmp                  cq, r6
   4393    jne .zero_loop
   4394 .zero_loop_topleft:
   ; topleft-only case: clear just the left 16 columns of every row pair
   4395    mova          [r6-32*1], m0
   4396    mova          [r6+32*1], m0
   4397    mova          [r6+32*3], m0
   4398    mova          [r6+32*5], m0
   4399    add                  r6, 64*4
   4400    sub                 r0d, 4
   4401    jg .zero_loop_topleft
   4402    RET
   4403 .zero_loop:
   ; full case: clear contiguous 128-byte chunks
   4404    mova          [r6-32*1], m0
   4405    mova          [r6+32*0], m0
   4406    mova          [r6+32*1], m0
   4407    mova          [r6+32*2], m0
   4408    add                  r6, 32*4
   4409    dec                 r0d
   4410    jg .zero_loop
   4411    RET
   4412 
   ; Final sum/sub stage of the 64-point idct: combines a previously spilled
   ; idct16 output with the matching idct32 output to form four results
   ; (out 0+n, out31-n, out32+n, out63-n).
   ;  - 6-arg form (pass 1): results are stored back into the tmp1q/tmp2q
   ;    scratch buffers for a later pass.
   ;  - 10-arg form (pass 2): results are scaled by m14, added to the pixel
   ;    rows at dst/r2 plus the four given byte offsets, and repacked to u8.
   ; The %1&1 parity selects which of the two interleaved buffer layouts
   ; (and which of dst/r2) this output pair lives in.
   4413 %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
   4414 %if %1 & 1
   4415    mova                m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
   4416    mova                m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
   4417 %else
   4418    mova                m%5, [tmp1q-32*(45-%1)]
   4419    mova                m%4, [tmp2q-32*(20+%1)]
   4420 %endif
   4421    psubsw              m%6, m%5, m%4 ; idct32 out31-n
   4422    paddsw              m%5, m%4      ; idct32 out 0+n
   4423    psubsw              m%4, m%6, m%3 ; out32+n
   4424    paddsw              m%6, m%3      ; out31-n
   4425    psubsw              m%3, m%5, m%2 ; out63-n
   4426    paddsw              m%5, m%2      ; out 0+n
   4427 %if %0 == 6 ; pass 1
   4428 %if %1 & 1
   4429    mova [tmp2q-32*(19-%1)], m%4
   4430    mova [tmp1q-32*(14+%1)], m%6
   4431    mova [tmp1q+32*(18-%1)], m%3
   4432    mova [tmp2q-32*(51-%1)], m%5
   4433 %else
   4434    mova [tmp1q-32*(13-%1)], m%4
   4435    mova [tmp2q-32*(20+%1)], m%6
   4436    mova [tmp2q+32*(12-%1)], m%3
   4437    mova [tmp1q-32*(45-%1)], m%5
   4438 %endif
   4439 %else ; pass 2
   ; m14 holds the output rounding factor (callers broadcast pw_2048)
   4440    REPX  {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
   4441 %if %1 & 1
   4442    %define %%d0 r2
   4443    %define %%d1 dstq
   4444 %else
   4445    %define %%d0 dstq
   4446    %define %%d1 r2
   4447 %endif
   ; widen dst pixels to u16, add residual, and saturate-pack back to u8
   4448    pmovzxbw            m%2, [%%d0+%9 ]
   4449    paddw               m%2, m%4
   4450    pmovzxbw            m%4, [%%d1+%8 ]
   4451    paddw               m%4, m%6
   4452    pmovzxbw            m%6, [%%d1+%10]
   4453    paddw               m%3, m%6
   4454    pmovzxbw            m%6, [%%d0+%7 ]
   4455    paddw               m%5, m%6
   4456    packuswb            m%2, m%4
   4457    packuswb            m%3, m%5
   ; undo the lane interleave introduced by packuswb on 256-bit regs
   4458    vpermq              m%2, m%2, q3120
   4459    vpermq              m%3, m%3, q3120
   4460    mova         [%%d0+%9 ], xm%2
   4461    vextracti128 [%%d1+%8 ], m%2, 1
   4462    mova         [%%d1+%10], xm%3
   4463    vextracti128 [%%d0+%7 ], m%3, 1
   4464 %endif
   4465 %endmacro
   4466 
   ; 16x64 inverse DCT-DCT + add, 8bpc.
   ; eob == 0: only the DC coefficient is non-zero -> flat-add shortcut via
   ; the shared 16-wide .dconly tail. Otherwise: first pass runs one or two
   ; 16x16 idcts over the coefficient halves (second half only if
   ; eob >= 151), spilling transposed results to the stack; second pass
   ; assembles the 64-point idct from those halves using .main_part1 /
   ; .main_part2_pass2 below.
   4467 cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob
   4468    lea                  r6, [o_base]
   4469    test               eobd, eobd
   4470    jnz .normal
   4471    movd                xm1, [o(pw_2896x8)]
   4472    pmulhrsw            xm0, xm1, [cq]
   4473    movd                xm2, [o(pw_8192)]
   4474    mov                [cq], eobd  ; clear the DC coefficient
   4475    or                  r3d, 64    ; r3d (= eobd) is 0 here, so this is "mov r3d, 64" rows
   4476    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
   4477 .normal:
   4478    PROLOGUE              0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
   4479    %undef cmp
   4480    lea               tmp1q, [rsp+32*23]
   4481    lea               tmp2q, [tmp1q+32*24]
   4482    sub                eobd, 151
   4483    mov                 r7d, eobd  ; keep sign of eob-151 for the .fast* branches
   4484 .pass1_loop:
   4485    LOAD_16ROWS          cq, 64
   4486    call m(idct_16x16_internal_8bpc).main
   4487    mova                 m1, [rsp+32*1]
   4488    mova         [rsp+32*0], m6
   4489    mova         [rsp+32*1], m7
   4490    vpbroadcastd         m7, [o(pw_8192)]
   4491    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
   4492    mova                m15, [rsp+32*0]
   ; spill the transposed 16x16 result (even regs -> tmp1, odd -> tmp2)
   4493    mova       [tmp1q-32*4], m0
   4494    mova       [tmp1q-32*3], m2
   4495    mova       [tmp1q-32*2], m4
   4496    mova       [tmp1q-32*1], m6
   4497    mova       [tmp1q+32*0], m8
   4498    mova       [tmp1q+32*1], m10
   4499    mova       [tmp1q+32*2], m12
   4500    mova       [tmp1q+32*3], m14
   4501    mova       [tmp2q-32*4], m1
   4502    mova       [tmp2q-32*3], m3
   4503    mova       [tmp2q-32*2], m5
   4504    mova       [tmp2q-32*1], m7
   4505    mova       [tmp2q+32*0], m9
   4506    mova       [tmp2q+32*1], m11
   4507    mova       [tmp2q+32*2], m13
   4508    mova       [tmp2q+32*3], m15
   4509    add                  cq, 32
   4510    add               tmp1q, 32*8
   4511    add               tmp2q, 32*8
   ; run pass 1 twice if eob >= 151 (second add carries), else once
   4512    add                eobd, 0x80000000
   4513    jnc .pass1_loop
   ; ---- pass 2: reload the spilled halves as interleaved 16-wide rows ----
   4514    lea                  r2, [rsp+32*23]
   4515    mova                xm0, [r2-32*4+ 0]
   4516    mova                xm1, [r2-32*2+ 0]
   4517    vinserti128          m0, [r2+32*0+ 0], 1
   4518    vinserti128          m1, [r2+32*2+ 0], 1
   4519    mova                xm2, [r2-32*4+16]
   4520    mova                xm3, [r2-32*2+16]
   4521    vinserti128          m2, [r2+32*0+16], 1
   4522    vinserti128          m3, [r2+32*2+16], 1
   4523    pxor                 m4, m4
   4524    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
   4525    test                r7d, r7d
   4526    jl .fast          ; eob < 151: upper-half inputs are all zero
   4527    lea                  r3, [r2+32*8]
   4528    mova                xm4, [r3-32*4+ 0]
   4529    mova                xm5, [r3-32*2+ 0]
   4530    vinserti128          m4, [r3+32*0+ 0], 1
   4531    vinserti128          m5, [r3+32*2+ 0], 1
   4532    mova                xm6, [r3-32*4+16]
   4533    mova                xm7, [r3-32*2+16]
   4534    vinserti128          m6, [r3+32*0+16], 1
   4535    vinserti128          m7, [r3+32*2+16], 1
   4536 .fast:
   4537    mova              [rsp], m8
   4538    lea               tmp1q, [rsp+32*7]
   4539    call m(idct_16x16_internal_8bpc).main
   4540    mova                 m1, [rsp+32*1]
   4541    mova       [tmp1q-32*4], m0
   4542    mova       [tmp1q-32*3], m1
   4543    mova       [tmp1q-32*2], m2
   4544    mova       [tmp1q-32*1], m3
   4545    mova       [tmp1q+32*0], m4
   4546    mova       [tmp1q+32*1], m5
   4547    mova       [tmp1q+32*2], m6
   4548    mova       [tmp1q+32*3], m7
   4549    add               tmp1q, 32*8
   4550    mova       [tmp1q-32*4], m8
   4551    mova       [tmp1q-32*3], m9
   4552    mova       [tmp1q-32*2], m10
   4553    mova       [tmp1q-32*1], m11
   4554    mova       [tmp1q+32*0], m12
   4555    mova       [tmp1q+32*1], m13
   4556    mova       [tmp1q+32*2], m14
   4557    mova       [tmp1q+32*3], m15
   ; odd rows -> 16x32 odd-half idct
   4558    mova                xm0, [r2-32*3+ 0]
   4559    mova                xm1, [r2-32*1+ 0]
   4560    vinserti128          m0, [r2+32*1+ 0], 1
   4561    vinserti128          m1, [r2+32*3+ 0], 1
   4562    mova                xm2, [r2-32*3+16]
   4563    mova                xm3, [r2-32*1+16]
   4564    vinserti128          m2, [r2+32*1+16], 1
   4565    vinserti128          m3, [r2+32*3+16], 1
   4566    pxor                 m4, m4
   4567    REPX       {mova x, m4}, m5, m6, m7
   4568    test                r7d, r7d
   4569    jl .fast2
   4570    mova                xm4, [r3-32*3+ 0]
   4571    mova                xm5, [r3-32*1+ 0]
   4572    vinserti128          m4, [r3+32*1+ 0], 1
   4573    vinserti128          m5, [r3+32*3+ 0], 1
   4574    mova                xm6, [r3-32*3+16]
   4575    mova                xm7, [r3-32*1+16]
   4576    vinserti128          m6, [r3+32*1+16], 1
   4577    vinserti128          m7, [r3+32*3+16], 1
   4578 .fast2:
   4579    add               tmp1q, 32*8
   4580    lea               tmp2q, [tmp1q+32*8]
   4581    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   ; first group of odd-indexed rows -> idct64 steps 1-5 (main_part1)
   4582    add                  r2, 32*24
   4583    vpbroadcastd        m15, [o(pd_2048)]  ; rounding constant used by main_part1
   4584    add               tmp1q, 32*16
   4585    add               tmp2q, 32*32
   4586    mova                xm0, [r2-32*4+ 0]
   4587    mova                xm3, [r2-32*1+16]
   4588    vinserti128          m0, [r2+32*0+ 0], 1
   4589    vinserti128          m3, [r2+32*3+16], 1
   4590    mova                xm4, [r2-32*4+16]
   4591    mova                xm7, [r2-32*1+ 0]
   4592    vinserti128          m4, [r2+32*0+16], 1
   4593    vinserti128          m7, [r2+32*3+ 0], 1
   4594    pxor                 m1, m1
   4595    REPX       {mova x, m1}, m2, m5, m6
   4596    test                r7d, r7d
   4597    jl .fast3
   4598    add                  r3, 32*24
   4599    mova                xm1, [r3-32*1+16]
   4600    mova                xm2, [r3-32*4+ 0]
   4601    vinserti128          m1, [r3+32*3+16], 1
   4602    vinserti128          m2, [r3+32*0+ 0], 1
   4603    mova                xm5, [r3-32*1+ 0]
   4604    mova                xm6, [r3-32*4+16]
   4605    vinserti128          m5, [r3+32*3+ 0], 1
   4606    vinserti128          m6, [r3+32*0+16], 1
   4607 .fast3:
   4608    add                  r6, o_idct64_offset  ; switch constant base to the idct64 table
   4609    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   ; second group of odd-indexed rows -> main_part1 again
   4610    add                  r6, 8
   4611    add               tmp1q, 32*8
   4612    sub               tmp2q, 32*8
   4613    mova                xm0, [r2-32*2+ 0]
   4614    mova                xm3, [r2-32*3+16]
   4615    vinserti128          m0, [r2+32*2+ 0], 1
   4616    vinserti128          m3, [r2+32*1+16], 1
   4617    mova                xm4, [r2-32*2+16]
   4618    mova                xm7, [r2-32*3+ 0]
   4619    vinserti128          m4, [r2+32*2+16], 1
   4620    vinserti128          m7, [r2+32*1+ 0], 1
   4621    pxor                 m1, m1
   4622    REPX       {mova x, m1}, m2, m5, m6
   4623    test                r7d, r7d
   4624    jl .fast4
   4625    mova                xm1, [r3-32*3+16]
   4626    mova                xm2, [r3-32*2+ 0]
   4627    vinserti128          m1, [r3+32*1+16], 1
   4628    vinserti128          m2, [r3+32*2+ 0], 1
   4629    mova                xm5, [r3-32*3+ 0]
   4630    mova                xm6, [r3-32*2+16]
   4631    vinserti128          m5, [r3+32*1+ 0], 1
   4632    vinserti128          m6, [r3+32*2+16], 1
   4633 .fast4:
   4634    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   ; steps 6-9 + final sumsub, writing the reconstructed pixels to dst
   4635    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
   4636    RET
   4637 ALIGN function_align
   4638 %define o_base idct64_mul - 8
   ; idct64 steps 1-5 for one group of 8 odd-coefficient rows.
   ; In:  m0-m7 = the 8 input rows, m15 = pd_2048 (broadcast by the caller,
   ;      used as the rounding term inside ITX_MULSUB_2W).
   ; Out: 16 intermediate t-vectors spilled to [tmp1q-32*4 .. +32*3] and
   ;      [tmp2q-32*4 .. +32*3]. Clobbers m0-m14.
   4639 cglobal_label .main_part1
   4640    ; idct64 steps 1-5:
   4641    ; in1/31/17/15/ 9/23/25/ 7 ->
   4642    ;     t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
   4643    ; in5/27/21/11/13/19/29/ 3 ->
   4644    ;     t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
   4645    vpbroadcastd        m11, [o(idct64_mul+4* 0)]
   4646    vpbroadcastd        m13, [o(idct64_mul+4* 1)]
   4647    vpbroadcastd        m10, [o(idct64_mul+4* 4)]
   4648    vpbroadcastd        m12, [o(idct64_mul+4* 5)]
   4649    pmulhrsw            m11, m0  ; t63a
   4650    pmulhrsw             m0, m13 ; t32a
   4651    pmulhrsw            m10, m1  ; t62a
   4652    pmulhrsw             m1, m12 ; t33a
   4653    vpbroadcastd         m9, [o(idct64_mul+4* 8)]
   4654    vpbroadcastd        m13, [o(idct64_mul+4* 9)]
   4655    vpbroadcastd         m8, [o(idct64_mul+4*12)]
   4656    vpbroadcastd        m12, [o(idct64_mul+4*13)]
   4657    pmulhrsw             m9, m2  ; t61a
   4658    pmulhrsw             m2, m13 ; t34a
   4659    pmulhrsw             m8, m3  ; t60a
   4660    pmulhrsw             m3, m12 ; t35a
   4661    psubsw              m12, m0, m1   ; t33
   4662    paddsw               m0, m1       ; t32
   4663    psubsw               m1, m3, m2   ; t34
   4664    paddsw               m3, m2       ; t35
   4665    psubsw               m2, m8, m9   ; t61
   4666    paddsw               m8, m9       ; t60
   4667    psubsw               m9, m11, m10 ; t62
   4668    paddsw              m11, m10      ; t63
   4669    ITX_MULSUB_2W         2,  1, 10, 13, 15, m4076, 401 ; t34a, t61a
   4670    vpbroadcastd        m14, [o(pw_401_4076)]
   4671    ITX_MULSUB_2W         9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
   4672    psubsw              m10, m0, m3  ; t35a
   4673    paddsw               m0, m3      ; t32a
   4674    psubsw               m3, m11, m8 ; t60a
   4675    paddsw              m11, m8      ; t63a
   4676    psubsw               m8, m9, m2  ; t34
   4677    paddsw               m9, m2      ; t33
   4678    psubsw               m2, m12, m1 ; t61
   4679    paddsw              m12, m1      ; t62
   4680    mova       [tmp1q-32*4], m0
   4681    mova       [tmp1q-32*3], m9
   4682    mova       [tmp2q+32*2], m12
   4683    mova       [tmp2q+32*3], m11
   4684    vpbroadcastd        m13, [o(pw_m4017_799)]
   4685    vpbroadcastd        m14, [o(pw_799_4017)]
   4686    ITX_MULSUB_2W         2,  8,  0,  1, 15, 14, 13 ; t34a, t61a
   4687    ITX_MULSUB_2W         3, 10,  0,  1, 15, 14, 13 ; t35,  t60
   4688    mova       [tmp1q-32*2], m2
   4689    mova       [tmp1q-32*1], m3
   4690    mova       [tmp2q+32*0], m10
   4691    mova       [tmp2q+32*1], m8
   ; second half of the group: t36..t39 / t56..t59 chain
   4692    vpbroadcastd         m3, [o(idct64_mul+4*16)]
   4693    vpbroadcastd        m11, [o(idct64_mul+4*17)]
   4694    vpbroadcastd         m2, [o(idct64_mul+4*20)]
   4695    vpbroadcastd        m10, [o(idct64_mul+4*21)]
   4696    vpbroadcastd         m1, [o(idct64_mul+4*24)]
   4697    vpbroadcastd         m9, [o(idct64_mul+4*25)]
   4698    vpbroadcastd         m0, [o(idct64_mul+4*28)]
   4699    vpbroadcastd         m8, [o(idct64_mul+4*29)]
   4700    pmulhrsw             m3, m4  ; t59a
   4701    pmulhrsw             m4, m11 ; t36a
   4702    pmulhrsw             m2, m5  ; t58a
   4703    pmulhrsw             m5, m10 ; t37a
   4704    pmulhrsw             m1, m6  ; t57a
   4705    pmulhrsw             m6, m9  ; t38a
   4706    pmulhrsw             m0, m7  ; t56a
   4707    pmulhrsw             m7, m8  ; t39a
   4708    psubsw               m8, m4, m5 ; t37
   4709    paddsw               m4, m5     ; t36
   4710    psubsw               m5, m7, m6 ; t38
   4711    paddsw               m7, m6     ; t39
   4712    psubsw               m6, m0, m1 ; t57
   4713    paddsw               m0, m1     ; t56
   4714    psubsw               m1, m3, m2 ; t58
   4715    paddsw               m3, m2     ; t59
   4716    ITX_MULSUB_2W         6,  5,  2,  9, 15, m2598, 3166 ; t38a, t57a
   4717    vpbroadcastd        m10, [o(pw_3166_2598)]
   4718    ITX_MULSUB_2W         1,  8,  2,  9, 15, 10,  9 ; t37a, t58a
   4719    psubsw               m2, m7, m4 ; t36a
   4720    paddsw               m7, m4     ; t39a
   4721    psubsw               m4, m0, m3 ; t59a
   4722    paddsw               m0, m3     ; t56a
   4723    psubsw               m3, m6, m1 ; t37
   4724    paddsw               m6, m1     ; t38
   4725    psubsw               m1, m5, m8 ; t58
   4726    paddsw               m5, m8     ; t57
   4727    mova       [tmp1q+32*2], m6
   4728    mova       [tmp1q+32*3], m7
   4729    mova       [tmp2q-32*4], m0
   4730    mova       [tmp2q-32*3], m5
   4731    vpbroadcastd         m6, [o(pw_m799_m4017)]
   4732    vpbroadcastd         m7, [o(pw_m4017_799)]
   4733    ITX_MULSUB_2W         4,  2,  0,  5, 15,  7,  6 ; t36,  t59
   4734    ITX_MULSUB_2W         1,  3,  0,  5, 15,  7,  6 ; t37a, t58a
   4735    mova       [tmp1q+32*0], m4
   4736    mova       [tmp1q+32*1], m1
   4737    mova       [tmp2q-32*2], m3
   4738    mova       [tmp2q-32*1], m2
   4739    ret
   4740 %define o_base pw_5 + 128
   ; Pass-1 driver for idct64 steps 6-9: loads the rotation/scale constants
   ; once, then repeatedly runs .main_part2_internal plus the store-only
   ; (6-arg) IDCT64_PART2_END until the tmp pointers, which move towards
   ; each other inside .main_part2_internal, meet.
   4741 .main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
   4742    sub                  r6, o_idct64_offset + 8  ; restore the default constant base
   4743    vpbroadcastd        m11, [o(pw_1567_3784)]
   4744    vpbroadcastd        m12, [o(pw_m3784_1567)]
   4745    vpbroadcastd        m13, [o(pw_2896_2896)]
   4746    vpbroadcastd        m14, [o(pw_m2896_2896)]
   4747 .main_part2_pass1_loop:
   4748    call .main_part2_internal
   4749    IDCT64_PART2_END      0,  7,  0,  6,  9, 10
   4750    IDCT64_PART2_END      7,  8,  5,  0,  6,  7
   4751    IDCT64_PART2_END      8,  2,  1,  0,  6,  7
   4752    IDCT64_PART2_END     15,  3,  4,  0,  6,  7
   4753    cmp               tmp1q, tmp2q
   4754    jne .main_part2_pass1_loop
   4755    ret
   ; One butterfly group of idct64 steps 6-9.
   ; Loads eight spilled t-vectors around tmp1q/tmp2q, advances tmp1q by +32
   ; and tmp2q by -32 (so repeated calls walk the two buffers towards each
   ; other), and leaves the combined results in registers for
   ; IDCT64_PART2_END. m11-m14 must hold the rotation constants and m15 the
   ; rounding term, all set up by the pass drivers.
   4756 cglobal_label .main_part2_internal
   4757    mova                 m0, [tmp1q-32*12] ; t32a
   4758    mova                 m6, [tmp2q-32*13] ; t39a
   4759    mova                 m1, [tmp1q-32* 4] ; t40a
   4760    mova                 m5, [tmp2q+32* 3] ; t55a
   4761    add               tmp1q, 32
   4762    sub               tmp2q, 32
   4763    mova                 m2, [tmp1q+32* 3] ; t48a
   4764    mova                 m4, [tmp2q-32* 4] ; t47a
   4765    mova                 m3, [tmp1q+32*11] ; t56a
   4766    mova                 m7, [tmp2q+32*12] ; t63a
   4767    psubsw               m8, m0, m6 ; t39
   4768    paddsw               m0, m6     ; t32
   4769    psubsw               m6, m4, m1 ; t40
   4770    paddsw               m4, m1     ; t47
   4771    psubsw               m1, m2, m5 ; t55
   4772    paddsw               m2, m5     ; t48
   4773    psubsw               m5, m7, m3 ; t56
   4774    paddsw               m7, m3     ; t63
   4775    ITX_MULSUB_2W         5,  8,  3,  9, 15, 11, 12 ; t39a, t56a
   4776    vpbroadcastd         m9, [o(pw_m1567_m3784)]
   4777    ITX_MULSUB_2W         1,  6,  3,  9, 15, 12,  9 ; t40a, t55a
   4778    psubsw               m3, m0, m4 ; t47a
   4779    paddsw               m0, m4     ; t32a
   4780    psubsw               m4, m7, m2 ; t48a
   4781    paddsw               m7, m2     ; t63a
   4782    psubsw               m2, m5, m1 ; t40
   4783    paddsw               m5, m1     ; t39
   4784    psubsw               m1, m8, m6 ; t55
   4785    paddsw               m8, m6     ; t56
   4786    ITX_MULSUB_2W         4,  3,  6,  9, 15, 13, 14 ; t47,  t48
   4787    ITX_MULSUB_2W         1,  2,  6,  9, 15, 13, 14 ; t40a, t55a
   4788    ret
   ; Pass-2 driver for idct64 steps 6-9: same butterfly loop as pass 1, but
   ; the 10-arg IDCT64_PART2_END adds the scaled results straight into the
   ; destination pixels. dstq walks forward one row per iteration while r2
   ; (initialized to dst + 7*stride) walks backward, mirroring the
   ; out 0+n / out63-n symmetry of the idct64 outputs.
   4789 .main_part2_pass2:
   4790    sub                  r6, o_idct64_offset + 8
   4791    vpbroadcastd        m11, [o(pw_1567_3784)]
   4792    vpbroadcastd        m12, [o(pw_m3784_1567)]
   4793    vpbroadcastd        m13, [o(pw_2896_2896)]
   4794    lea                  r9, [strideq*5]    ; stride*5
   4795    lea                  r3, [r9+strideq*1] ; stride*6
   4796    lea                  r7, [r9+strideq*2] ; stride*7
   4797    lea                  r8, [r3+strideq*2] ; stride*8
   4798    lea                  r2, [dstq+r7]
   4799 .main_part2_pass2_loop:
   ; m14 is dual-purpose: rotation constant for the butterflies, then
   ; pw_2048 output rounding for IDCT64_PART2_END - reload each iteration
   4800    vpbroadcastd        m14, [o(pw_m2896_2896)]
   4801    call .main_part2_internal
   4802    vpbroadcastd        m14, [o(pw_2048)]
   4803    IDCT64_PART2_END      0,  7,  0,  6,  9, 10, strideq*0, r3*4, r8*4, r7*8
   4804    IDCT64_PART2_END      7,  8,  5,  0,  6,  7, strideq*0, r3*4, r8*4, r7*8
   4805    IDCT64_PART2_END      8,  2,  1,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
   4806    IDCT64_PART2_END     15,  3,  4,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
   4807    add                dstq, strideq
   4808    sub                  r2, strideq
   4809    cmp               tmp1q, tmp2q
   4810    jne .main_part2_pass2_loop
   4811    ret
   4812 
   ; 64x16 inverse DCT-DCT + add, 8bpc.
   ; eob == 0: DC-only shortcut; the .dconly/.dconly_loop tail (also jumped
   ; to by other 64-wide transforms, r3d = row count) adds a flat value to
   ; 64 pixels per row. Otherwise: pass 1 runs the full 64-point idct over
   ; the coefficient rows (idct16 even part, 16x32 odd half, then the
   ; shared 16x64 main_part1/main_part2_pass1 for the odd quarters),
   ; spilling to the stack; pass 2 runs four transposed 16x16 idcts and
   ; writes each 16-wide strip of the 64x16 output.
   4813 cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
   4814    lea                  r6, [o_base]
   4815    test               eobd, eobd
   4816    jnz .normal
   4817    movd                xm1, [o(pw_2896x8)]
   4818    pmulhrsw            xm0, xm1, [cq]
   4819    movd                xm2, [o(pw_8192)]
   4820    mov                [cq], eobd  ; clear the DC coefficient
   4821    or                  r3d, 16    ; r3d (= eobd) is 0 here, so this sets the row count
   4822 .dconly:
   4823    pmulhrsw            xm0, xm2
   4824    movd                xm2, [o(pw_2048)]
   4825    pmulhrsw            xm0, xm1
   4826    pmulhrsw            xm0, xm2
   4827    vpbroadcastw         m0, xm0  ; m0 = DC offset in every word lane
   4828    pxor                 m1, m1
   4829 .dconly_loop:
   ; unpack 64 dst bytes to words, add the offset, saturate-pack back
   4830    mova                 m2, [dstq+32*0]
   4831    mova                 m3, [dstq+32*1]
   4832    punpckhbw            m4, m2, m1
   4833    punpcklbw            m2, m1
   4834    punpckhbw            m5, m3, m1
   4835    punpcklbw            m3, m1
   4836    paddw                m4, m0
   4837    paddw                m2, m0
   4838    paddw                m5, m0
   4839    paddw                m3, m0
   4840    packuswb             m2, m4
   4841    packuswb             m3, m5
   4842    mova        [dstq+32*0], m2
   4843    mova        [dstq+32*1], m3
   4844    add                dstq, strideq
   4845    dec                 r3d
   4846    jg .dconly_loop
   4847    RET
   4848 .normal:
   4849    PROLOGUE              0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
   ; even rows (idct16 part); consumed coefficients are zeroed as we go
   4850    LOAD_8ROWS      cq+32*0, 32*4
   4851    pxor                 m8, m8
   4852    REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
   4853    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
   4854    mova              [rsp], m8
   4855    lea               tmp1q, [rsp+32*7]
   4856    call m(idct_16x16_internal_8bpc).main
   4857    mova                 m1, [rsp+32*1]
   4858    mova       [tmp1q-32*4], m0
   4859    mova       [tmp1q-32*3], m1
   4860    mova       [tmp1q-32*2], m2
   4861    mova       [tmp1q-32*1], m3
   4862    mova       [tmp1q+32*0], m4
   4863    mova       [tmp1q+32*1], m5
   4864    mova       [tmp1q+32*2], m6
   4865    mova       [tmp1q+32*3], m7
   4866    add               tmp1q, 32*8
   4867    mova       [tmp1q-32*4], m8
   4868    mova       [tmp1q-32*3], m9
   4869    mova       [tmp1q-32*2], m10
   4870    mova       [tmp1q-32*1], m11
   4871    mova       [tmp1q+32*0], m12
   4872    mova       [tmp1q+32*1], m13
   4873    mova       [tmp1q+32*2], m14
   4874    mova       [tmp1q+32*3], m15
   ; rows 2,6,10,... -> 16x32 odd-half idct
   4875    LOAD_8ROWS      cq+32*2, 32*4
   4876    pxor                 m8, m8
   4877    REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
   4878    add               tmp1q, 32*8
   4879    lea               tmp2q, [tmp1q+32*8]
   4880    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   4881    vpbroadcastd        m15, [o(pd_2048)]  ; rounding constant for main_part1
   4882    add               tmp1q, 32*16
   4883    add               tmp2q, 32*32
   ; first odd-quarter row group for idct64 steps 1-5
   4884    mova                 m0, [cq+32* 1]
   4885    mova                 m1, [cq+32*31]
   4886    mova                 m2, [cq+32*17]
   4887    mova                 m3, [cq+32*15]
   4888    mova                 m4, [cq+32* 9]
   4889    mova                 m5, [cq+32*23]
   4890    mova                 m6, [cq+32*25]
   4891    mova                 m7, [cq+32* 7]
   4892    pxor                 m8, m8
   4893    REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
   4894    add                  r6, o_idct64_offset
   4895    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   ; second odd-quarter row group
   4896    add                  r6, 8
   4897    add               tmp1q, 32*8
   4898    sub               tmp2q, 32*8
   4899    mova                 m0, [cq+32* 5]
   4900    mova                 m1, [cq+32*27]
   4901    mova                 m2, [cq+32*21]
   4902    mova                 m3, [cq+32*11]
   4903    mova                 m4, [cq+32*13]
   4904    mova                 m5, [cq+32*19]
   4905    mova                 m6, [cq+32*29]
   4906    mova                 m7, [cq+32* 3]
   4907    pxor                 m8, m8
   4908    REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
   4909    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   4910    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
   4911    sub               tmp1q, 32*36
   4912    lea                  r2, [strideq*3]
   4913    mov               tmp2d, 4  ; four 16-wide output strips
   4914 .pass2_loop:
   ; rebuild one 16x16 block (low/high lanes = adjacent spilled groups)
   4915    lea                  r3, [tmp1q-32*8]
   4916    mova                xm0, [r3   -32*4]
   4917    mova                xm1, [r3   -32*3]
   4918    vinserti128          m0, [tmp1q-32*4], 1
   4919    vinserti128          m1, [tmp1q-32*3], 1
   4920    mova                xm2, [r3   -32*2]
   4921    mova                xm3, [r3   -32*1]
   4922    vinserti128          m2, [tmp1q-32*2], 1
   4923    vinserti128          m3, [tmp1q-32*1], 1
   4924    mova                xm4, [r3   +32*0]
   4925    mova                xm5, [r3   +32*1]
   4926    vinserti128          m4, [tmp1q+32*0], 1
   4927    vinserti128          m5, [tmp1q+32*1], 1
   4928    mova                xm6, [r3   +32*2]
   4929    mova                xm7, [r3   +32*3]
   4930    vinserti128          m6, [tmp1q+32*2], 1
   4931    vinserti128          m7, [tmp1q+32*3], 1
   4932    mova                xm8, [r3   -32*4+16]
   4933    mova                xm9, [r3   -32*3+16]
   4934    vinserti128          m8, [tmp1q-32*4+16], 1
   4935    vinserti128          m9, [tmp1q-32*3+16], 1
   4936    mova               xm10, [r3   -32*2+16]
   4937    mova               xm11, [r3   -32*1+16]
   4938    vinserti128         m10, [tmp1q-32*2+16], 1
   4939    vinserti128         m11, [tmp1q-32*1+16], 1
   4940    mova               xm12, [r3   +32*0+16]
   4941    mova               xm13, [r3   +32*1+16]
   4942    vinserti128         m12, [tmp1q+32*0+16], 1
   4943    vinserti128         m13, [tmp1q+32*1+16], 1
   4944    mova               xm14, [r3   +32*2+16]
   4945    mova               xm15, [r3   +32*3+16]
   4946    vinserti128         m14, [tmp1q+32*2+16], 1
   4947    vinserti128         m15, [tmp1q+32*3+16], 1
   4948    mova         [rsp+32*0], m6
   4949    mova         [rsp+32*1], m7
   4950    vpbroadcastd         m7, [o(pw_8192)]
   4951    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
   4952    call m(idct_16x16_internal_8bpc).main
   4953    mova         [rsp+32*0], m15
   4954    vpbroadcastd        m15, [o(pw_2048)]  ; output rounding
   4955    REPX  {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
   4956    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
   4957    pmulhrsw             m1, m15, [rsp+32*1]
   4958    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
   4959    lea                  r3, [dstq+strideq*4]
   ; alias dstq to r3 for the remaining row groups; the real dst pointer
   ; (r0) is only advanced by 16 columns at the end of the strip loop
   4960    %define dstq r3
   4961    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
   4962    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
   4963    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
   4964    lea                  r3, [r3+strideq*4]
   4965    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
   4966    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
   4967    pmulhrsw            m15, [rsp+32*0]
   4968    lea                  r3, [r3+strideq*4]
   4969    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
   4970    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
   4971    add               tmp1q, 32*16
   4972    add                  r0, 16
   4973    dec               tmp2d
   4974    jg .pass2_loop
   4975    RET
   4976 
   4977 cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob
        ; Inverse 32x64 DCT-DCT transform + add, 8 bpc, AVX2.
        ; eob == 0 takes a DC-only shortcut through the shared 32-wide dconly
        ; tail; otherwise pass 1 runs 16-point row transforms over the 32
        ; input columns and pass 2 runs 64-point column transforms over two
        ; 16-column strips.
   4978    lea                  r6, [o_base]
   4979    test               eobd, eobd
   4980    jnz .normal
        ; DC-only: scale the DC coefficient by 2896*8/32768 twice (once per
        ; transform pass), then hand off to the 32x8 dconly tail.
   4981    movd                xm1, [o(pw_2896x8)]
   4982    pmulhrsw            xm0, xm1, [cq]
   4983    movd                xm2, [o(pw_16384)]  ; rounding factor passed to the dconly tail in xm2
   4984    mov                [cq], eobd           ; eobd == 0 on this path: clears the DC coeff
   4985    pmulhrsw            xm0, xm1
   4986    or                  r3d, 64             ; r3d aliases eobd (known 0), so this sets row count = 64
   4987    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
   4988 .normal:
        ; Full path: re-prologue with 11 gprs, 16 vector regs and 32*99 bytes
        ; of stack scratch for the intermediate coefficient buffer.
   4989    PROLOGUE              0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
   4990    lea               tmp1q, [rsp+32*7]
        ; r10d = -1 if eob < 136 (only low-frequency coeffs present -> "fast"
        ; kernels), else 0.  The sign bit also acts as the pass-1 trip counter
        ; via the add-0x80000000 carry trick below (1 iteration fast, 2 full).
   4991    lea                r10d, [eobq-136]
   4992    sar                r10d, 31
   4993 .pass1_loop:
   4994    lea               tmp2q, [tmp1q+32*16]
   4995    LOAD_8ROWS      cq+64*1, 64*2, 1       ; odd rows 1, 3, ..., 15
   4996    pxor                 m8, m8
   4997    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15  ; zero consumed coeffs
   4998    test               r10b, r10b
   4999    jnz .fast
   5000    LOAD_8ROWS_H   cq+64*17, 64*2, 2       ; odd rows 17, 19, ..., 31
   5001    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
   5002    LOAD_8ROWS_H   cq+64*16, 64*2, 1       ; even rows 16, 18, ..., 30
   5003    mova              [rsp], m15            ; spill m15; restored after .idct16
   5004    pxor                m15, m15
   5005    REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
   5006                                24, 25, 26, 27, 28, 29, 30, 31
   5007    jmp .idct16
   5008 .fast:
        ; Upper coefficient half is all zero: cheap odd-half kernel, and the
        ; registers the full kernel would have produced are simply zeroed.
   5009    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   5010    pxor                 m8, m8
   5011    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
   5012    mova              [rsp], m8
   5013 .idct16:
   5014    LOAD_8ROWS      cq+64*0, 64*2, 1       ; even rows 0, 2, ..., 14
   5015    pxor                m15, m15
   5016    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
   5017    call m(idct_16x16_internal_8bpc).main
   5018    call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
   5019    vpbroadcastd         m7, [o(pw_16384)]  ; pass-1 intermediate rounding
   5020    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
   5021    lea                  r3, [tmp1q+32*48]
   5022    mova                m15, [rsp]          ; restore spilled m15
        ; Store one interleaved half here, the other half 32*24 bytes further.
   5023    mova          [r3-32*4], m0
   5024    mova          [r3-32*3], m2
   5025    mova          [r3-32*2], m4
   5026    mova          [r3-32*1], m6
   5027    mova          [r3+32*0], m8
   5028    mova          [r3+32*1], m10
   5029    mova          [r3+32*2], m12
   5030    mova          [r3+32*3], m14
   5031    add                  r3, 32*24
   5032    mova          [r3-32*4], m1
   5033    mova          [r3-32*3], m3
   5034    mova          [r3-32*2], m5
   5035    mova          [r3-32*1], m7
   5036    mova          [r3+32*0], m9
   5037    mova          [r3+32*1], m11
   5038    mova          [r3+32*2], m13
   5039    mova          [r3+32*3], m15
        ; Round (pw_16384) and transpose the two 8x8 tile pairs stored at
        ; tmp1q/tmp2q in place.
   5040    vpbroadcastd         m9, [o(pw_16384)]
   5041    pmulhrsw             m0, m9, [tmp1q-32*4]
   5042    pmulhrsw             m1, m9, [tmp1q-32*3]
   5043    pmulhrsw             m2, m9, [tmp1q-32*2]
   5044    pmulhrsw             m3, m9, [tmp1q-32*1]
   5045    pmulhrsw             m4, m9, [tmp1q+32*0]
   5046    pmulhrsw             m5, m9, [tmp1q+32*1]
   5047    pmulhrsw             m6, m9, [tmp1q+32*2]
   5048    pmulhrsw             m7, m9, [tmp1q+32*3]
   5049    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
        ; Interleave stores of the transposed tile with loads of the next one
        ; to hide latency.
   5050    mova       [tmp1q-32*4], m0
   5051    pmulhrsw             m0, m9, [tmp2q-32*4]
   5052    mova       [tmp2q-32*4], m1
   5053    pmulhrsw             m1, m9, [tmp2q-32*3]
   5054    mova       [tmp1q-32*3], m2
   5055    pmulhrsw             m2, m9, [tmp2q-32*2]
   5056    mova       [tmp2q-32*3], m3
   5057    pmulhrsw             m3, m9, [tmp2q-32*1]
   5058    mova       [tmp1q-32*2], m4
   5059    pmulhrsw             m4, m9, [tmp2q+32*0]
   5060    mova       [tmp2q-32*2], m5
   5061    pmulhrsw             m5, m9, [tmp2q+32*1]
   5062    mova       [tmp1q-32*1], m6
   5063    pmulhrsw             m6, m9, [tmp2q+32*2]
   5064    mova       [tmp2q-32*1], m7
   5065    pmulhrsw             m7, m9, [tmp2q+32*3]
   5066    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
   5067    mova       [tmp1q+32*0], m0
   5068    mova       [tmp2q+32*0], m1
   5069    mova       [tmp1q+32*1], m2
   5070    mova       [tmp2q+32*1], m3
   5071    mova       [tmp1q+32*2], m4
   5072    mova       [tmp2q+32*2], m5
   5073    mova       [tmp1q+32*3], m6
   5074    mova       [tmp2q+32*3], m7
   5075    add                  cq, 32
   5076    add               tmp1q, 32*8
   5077    add                r10d, 0x80000000    ; carry after 1 iteration (fast) / 2 (full)
   5078    jnc .pass1_loop
   5079    lea                  r2, [rsp+32*55]
   5080    lea                  r7, [r2+32*24]
   5081 .pass2_loop:
        ; Pass 2: 64-point column IDCT over one 16-column strip per iteration;
        ; always two iterations, counted by the same add-0x80000000 carry trick.
   5082    lea                  r3, [r2+32*8]
   5083    lea                  r8, [r7+32*8]
   5084    mova                 m0, [r2-32*4]
   5085    mova                 m1, [r2-32*2]
   5086    mova                 m2, [r2+32*0]
   5087    mova                 m3, [r2+32*2]
   5088    pxor                 m4, m4
   5089    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
   5090    test               r10b, r10b
   5091    jnz .fast2                             ; fast path: upper-half inputs are known zero
   5092    mova                 m4, [r3-32*4]
   5093    mova                 m5, [r3-32*2]
   5094    mova                 m6, [r3+32*0]
   5095    mova                 m7, [r3+32*2]
   5096 .fast2:
   5097    mova              [rsp], m8
   5098    lea               tmp1q, [rsp+32*39]
   5099    call m(idct_16x16_internal_8bpc).main
   5100    mova                 m1, [rsp+32*1]
   5101    mova       [tmp1q-32*4], m0
   5102    mova       [tmp1q-32*3], m1
   5103    mova       [tmp1q-32*2], m2
   5104    mova       [tmp1q-32*1], m3
   5105    mova       [tmp1q+32*0], m4
   5106    mova       [tmp1q+32*1], m5
   5107    mova       [tmp1q+32*2], m6
   5108    mova       [tmp1q+32*3], m7
   5109    add               tmp1q, 32*8
   5110    mova       [tmp1q-32*4], m8
   5111    mova       [tmp1q-32*3], m9
   5112    mova       [tmp1q-32*2], m10
   5113    mova       [tmp1q-32*1], m11
   5114    mova       [tmp1q+32*0], m12
   5115    mova       [tmp1q+32*1], m13
   5116    mova       [tmp1q+32*2], m14
   5117    mova       [tmp1q+32*3], m15
   5118    mova                 m0, [r2-32*3]
   5119    mova                 m1, [r2-32*1]
   5120    mova                 m2, [r2+32*1]
   5121    mova                 m3, [r2+32*3]
   5122    pxor                 m4, m4
   5123    REPX       {mova x, m4}, m5, m6, m7
   5124    test               r10b, r10b
   5125    jnz .fast3
   5126    mova                 m4, [r3-32*3]
   5127    mova                 m5, [r3-32*1]
   5128    mova                 m6, [r3+32*1]
   5129    mova                 m7, [r3+32*3]
   5130 .fast3:
   5131    add               tmp1q, 32*8
   5132    lea               tmp2q, [tmp1q+32*8]
   5133    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   5134    vpbroadcastd        m15, [o(pd_2048)]
   5135    add               tmp1q, 32*16
   5136    add               tmp2q, 32*32
   5137    mova                 m0, [r7-32*4]
   5138    mova                 m3, [r7+32*3]
   5139    mova                 m4, [r7+32*0]
   5140    mova                 m7, [r7-32*1]
   5141    pxor                 m1, m1
   5142    REPX       {mova x, m1}, m2, m5, m6
   5143    test               r10b, r10b
   5144    jnz .fast4
   5145    mova                 m1, [r8+32*3]
   5146    mova                 m2, [r8-32*4]
   5147    mova                 m5, [r8-32*1]
   5148    mova                 m6, [r8+32*0]
   5149 .fast4:
   5150    add                  r6, o_idct64_offset  ; point the o() base at the idct64 constant tables
   5151    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   5152    add                  r6, 8                ; step to the next idct64 coefficient set
   5153    add               tmp1q, 32*8
   5154    sub               tmp2q, 32*8
   5155    mova                 m0, [r7-32*2]
   5156    mova                 m3, [r7+32*1]
   5157    mova                 m4, [r7+32*2]
   5158    mova                 m7, [r7-32*3]
   5159    pxor                 m1, m1
   5160    REPX       {mova x, m1}, m2, m5, m6
   5161    test               r10b, r10b
   5162    jnz .fast5
   5163    mova                 m1, [r8+32*1]
   5164    mova                 m2, [r8-32*2]
   5165    mova                 m5, [r8-32*3]
   5166    mova                 m6, [r8+32*2]
   5167 .fast5:
   5168    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   5169    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
   5170    add                r10d, 0x80000000
   5171    jc .ret                                ; carry on the second iteration -> done
   5172    lea                  r2, [rsp+32*7]
   5173    lea                  r7, [r2+32*16]
        ; NOTE(review): r8 is presumably rewritten by main_part2_pass2 with
        ; dst's total advance (it no longer holds the stack pointer set at the
        ; top of .pass2_loop) -- confirm against that routine.
   5174    sub                dstq, r8
   5175    lea                dstq, [dstq+strideq*4+16]  ; step to the next 16-column strip
   5176    jmp .pass2_loop
   5177 .ret:
   5178    RET
   5179 
   5180 cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob
        ; Inverse 64x32 DCT-DCT transform + add, 8 bpc, AVX2.
        ; eob == 0 takes a DC-only shortcut through the 64-wide dconly tail;
        ; otherwise pass 1 performs the 64-point row transforms one 16-column
        ; input slice at a time, and pass 2 performs 32-point column
        ; transforms over four 16-column strips.
   5181    lea                  r6, [o_base]
   5182    test               eobd, eobd
   5183    jnz .normal
        ; DC-only: scale DC by 2896*8/32768 twice, then defer to the shared
        ; 64-wide dconly tail with the pw_16384 rounding factor in xm2.
   5184    movd                xm1, [o(pw_2896x8)]
   5185    pmulhrsw            xm0, xm1, [cq]
   5186    movd                xm2, [o(pw_16384)]
   5187    mov                [cq], eobd           ; eobd == 0 here: clears the DC coeff
   5188    pmulhrsw            xm0, xm1
   5189    or                  r3d, 32             ; r3d aliases eobd (known 0): row count = 32
   5190    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
   5191 .normal:
   5192    PROLOGUE              0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
   5193                                            base, tmp3, tmp4
   5194    lea               tmp1q, [rsp+32*7]
        ; tmp4d = eob-136: the sign bit is the pass-1 trip counter (carry
        ; trick below: 1 iteration if eob < 136, else 2), and bit 30 survives
        ; the +0x80000000 as the "fast path" flag tested during pass 2.
   5195    lea               tmp4d, [eobq-136]
   5196 .pass1_loop:
   5197    LOAD_8ROWS      cq+64*0, 64*4, 1       ; rows 0, 4, ..., 28
   5198    pxor                 m8, m8
   5199    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28  ; zero consumed coeffs
   5200    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
   5201    mova              [rsp], m8
   5202    call m(idct_16x16_internal_8bpc).main
   5203    mova                 m1, [rsp+32*1]
   5204    mova       [tmp1q-32*4], m0
   5205    mova       [tmp1q-32*3], m1
   5206    mova       [tmp1q-32*2], m2
   5207    mova       [tmp1q-32*1], m3
   5208    mova       [tmp1q+32*0], m4
   5209    mova       [tmp1q+32*1], m5
   5210    mova       [tmp1q+32*2], m6
   5211    mova       [tmp1q+32*3], m7
   5212    add               tmp1q, 32*8
   5213    mova       [tmp1q-32*4], m8
   5214    mova       [tmp1q-32*3], m9
   5215    mova       [tmp1q-32*2], m10
   5216    mova       [tmp1q-32*1], m11
   5217    mova       [tmp1q+32*0], m12
   5218    mova       [tmp1q+32*1], m13
   5219    mova       [tmp1q+32*2], m14
   5220    mova       [tmp1q+32*3], m15
   5221    LOAD_8ROWS      cq+64*2, 64*4, 1       ; rows 2, 6, ..., 30
   5222    pxor                 m8, m8
   5223    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
   5224    add               tmp1q, 32*8
   5225    lea               tmp2q, [tmp1q+32*8]
   5226    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   5227    vpbroadcastd        m15, [o(pd_2048)]
   5228    add               tmp1q, 32*16
   5229    add               tmp2q, 32*32
        ; Odd rows for the 64-point kernel, pre-scaled by 2896*8/32768 (the
        ; extra scale applied to rectangular 64-wide transforms).
   5230    vpbroadcastd         m7, [o(pw_2896x8)]
   5231    pmulhrsw             m0, m7, [cq+64* 1]
   5232    pmulhrsw             m1, m7, [cq+64*31]
   5233    pmulhrsw             m2, m7, [cq+64*17]
   5234    pmulhrsw             m3, m7, [cq+64*15]
   5235    pmulhrsw             m4, m7, [cq+64* 9]
   5236    pmulhrsw             m5, m7, [cq+64*23]
   5237    pmulhrsw             m6, m7, [cq+64*25]
   5238    pmulhrsw             m7,     [cq+64* 7]
   5239    pxor                 m8, m8
   5240    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
   5241    add                  r6, o_idct64_offset  ; switch the o() base to the idct64 tables
   5242    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   5243    vpbroadcastd         m7, [o(pw_2896x8-(o_idct64_offset))]  ; r6 is offset; compensate in the disp
   5244    add                  r6, 8                ; next idct64 coefficient set
   5245    add               tmp1q, 32*8
   5246    sub               tmp2q, 32*8
   5247    pmulhrsw             m0, m7, [cq+64* 5]
   5248    pmulhrsw             m1, m7, [cq+64*27]
   5249    pmulhrsw             m2, m7, [cq+64*21]
   5250    pmulhrsw             m3, m7, [cq+64*11]
   5251    pmulhrsw             m4, m7, [cq+64*13]
   5252    pmulhrsw             m5, m7, [cq+64*19]
   5253    pmulhrsw             m6, m7, [cq+64*29]
   5254    pmulhrsw             m7,     [cq+64* 3]
   5255    pxor                 m8, m8
   5256    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
   5257    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   5258    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
   5259    sub               tmp1q, 32*44
   5260    vpbroadcastd        m10, [o(pw_16384)]  ; rounding factor for the transpose helper
   5261    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
   5262    add                  cq, 32              ; next 16-column input slice
   5263    add               tmp4d, 0x80000000     ; carry after 1 iteration (eob < 136) / 2 (full)
   5264    jnc .pass1_loop
   5265    lea               tmp1q, [rsp+32*15]
        ; NOTE(review): r2 = dstq + 19*strideq is a second destination pointer
        ; consumed by pass2_end -- presumably for the lower output rows; confirm.
   5266    imul                 r2, strideq, 19
   5267    lea                  r3, [strideq*3]
   5268    add                  r2, dstq
   5269    mov               tmp4b, 4              ; 4 pass-2 strips; byte write keeps bit 30 of tmp4d intact
   5270 .pass2_loop:
   5271    lea               tmp2q, [tmp1q+32*64]
   5272    LOAD_8ROWS   tmp1q-32*4, 32
   5273    test              tmp4d, 0x40000000     ; bit 30 set <=> eob < 136 (fast path)
   5274    jnz .fast
   5275    LOAD_8ROWS_H tmp2q-32*4, 32
   5276    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
   5277    lea               tmp3q, [tmp2q-32*8]
   5278    LOAD_8ROWS_H tmp3q-32*4, 32
   5279    mova              [rsp], m15            ; spill m15 across the 16x16 main
   5280    jmp .idct16
   5281 .fast:
        ; Upper-half inputs are known zero: cheap kernel plus zeroed registers.
   5282    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   5283    pxor                 m8, m8
   5284    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
   5285    mova              [rsp], m8
   5286 .idct16:
   5287    lea               tmp3q, [tmp1q-32*8]
   5288    LOAD_8ROWS   tmp3q-32*4, 32
   5289    call m(idct_16x16_internal_8bpc).main
   5290    call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
   5291    add               tmp1q, 32*16
        ; Advance both destination pointers to the next 16-column strip.
   5292    sub                dstq, r3
   5293    lea                  r2, [r2+r3+16]
   5294    add                dstq, 16
   5295    dec               tmp4b
   5296    jg .pass2_loop
   5297    RET
   5298 ALIGN function_align
        ; transpose_round_interleave: for 4 chunks, fetch two 8x16 halves from
        ; tmp1q/tmp2q (low/high xmm lanes merged with vinserti128), scale by
        ; the multiplier the caller broadcast into m10 (pw_16384 or pw_8192),
        ; transpose each 8x8 and store back in place.
        ; In: tmp1q = buffer, m10 = rounding multiplier.  Clobbers tmp2q/tmp3q.
   5299 .transpose_round_interleave:
   5300    mov               tmp3d, 4
   5301 .loop:
   5302    lea               tmp2q, [tmp1q+32*8]
   5303    mova                xm0, [tmp1q-32*4]
   5304    mova                xm1, [tmp1q-32*3]
   5305    vinserti128          m0, [tmp2q-32*4], 1
   5306    vinserti128          m1, [tmp2q-32*3], 1
   5307    mova                xm2, [tmp1q-32*2]
   5308    mova                xm3, [tmp1q-32*1]
   5309    vinserti128          m2, [tmp2q-32*2], 1
   5310    vinserti128          m3, [tmp2q-32*1], 1
   5311    mova                xm4, [tmp1q+32*0]
   5312    mova                xm5, [tmp1q+32*1]
   5313    vinserti128          m4, [tmp2q+32*0], 1
   5314    vinserti128          m5, [tmp2q+32*1], 1
   5315    mova                xm6, [tmp1q+32*2]
   5316    mova                xm7, [tmp1q+32*3]
   5317    vinserti128          m6, [tmp2q+32*2], 1
   5318    vinserti128          m7, [tmp2q+32*3], 1
   5319    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
   5320    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
        ; Load the upper 16-byte halves (+16) while storing the finished tile.
   5321    mova                xm8, [tmp1q-32*4+16]
   5322    mova                xm9, [tmp1q-32*3+16]
   5323    vinserti128          m8, [tmp2q-32*4+16], 1
   5324    vinserti128          m9, [tmp2q-32*3+16], 1
   5325    mova       [tmp1q-32*4], m0
   5326    mova       [tmp2q-32*4], m1
   5327    mova       [tmp1q-32*3], m2
   5328    mova       [tmp2q-32*3], m3
   5329    mova                xm2, [tmp1q-32*2+16]
   5330    mova                xm3, [tmp1q-32*1+16]
   5331    vinserti128          m2, [tmp2q-32*2+16], 1
   5332    vinserti128          m3, [tmp2q-32*1+16], 1
   5333    mova       [tmp1q-32*2], m4
   5334    mova       [tmp2q-32*2], m5
   5335    mova       [tmp1q-32*1], m6
   5336    mova       [tmp2q-32*1], m7
   5337    mova                xm4, [tmp1q+32*0+16]
   5338    mova                xm5, [tmp1q+32*1+16]
   5339    vinserti128          m4, [tmp2q+32*0+16], 1
   5340    vinserti128          m5, [tmp2q+32*1+16], 1
   5341    mova                xm6, [tmp1q+32*2+16]
   5342    mova                xm7, [tmp1q+32*3+16]
   5343    vinserti128          m6, [tmp2q+32*2+16], 1
   5344    vinserti128          m7, [tmp2q+32*3+16], 1
   5345    pmulhrsw             m0, m8, m10
   5346    pmulhrsw             m1, m9, m10
   5347    REPX  {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
   5348    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
   5349    mova       [tmp1q+32*0], m0
   5350    mova       [tmp2q+32*0], m1
   5351    mova       [tmp1q+32*1], m2
   5352    mova       [tmp2q+32*1], m3
   5353    mova       [tmp1q+32*2], m4
   5354    mova       [tmp2q+32*2], m5
   5355    mova       [tmp1q+32*3], m6
   5356    mova       [tmp2q+32*3], m7
   5357    add               tmp1q, 32*16
   5358    dec               tmp3d
   5359    jg .loop
   5360    ret
   5361 
   5362 cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob
        ; Inverse 64x64 DCT-DCT transform + add, 8 bpc, AVX2.
        ; eob == 0 takes a DC-only shortcut through the 64-wide dconly tail;
        ; otherwise pass 1 performs the 64-point row transforms one 16-column
        ; input slice at a time, and pass 2 performs 64-point column
        ; transforms over four 16-column strips.
   5363    lea                  r6, [o_base]
   5364    test               eobd, eobd
   5365    jnz .normal
        ; DC-only: a single 2896*8/32768 scale here (square transform), with
        ; the pw_8192 rounding factor handed to the dconly tail in xm2.
   5366    movd                xm1, [o(pw_2896x8)]
   5367    pmulhrsw            xm0, xm1, [cq]
   5368    movd                xm2, [o(pw_8192)]
   5369    mov                [cq], eobd           ; eobd == 0 here: clears the DC coeff
   5370    or                  r3d, 64             ; r3d aliases eobd (known 0): row count = 64
   5371    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
   5372 .normal:
   5373    PROLOGUE              0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
   5374    lea               tmp1q, [rsp+32*71]
        ; r10d = eob-136: sign bit = pass-1 trip counter (carry trick: 1
        ; iteration if eob < 136, else 2); bit 30 survives the +0x80000000 as
        ; the "fast path" flag tested throughout pass 2.
   5375    lea                r10d, [eobq-136]
   5376 .pass1_loop:
   5377    LOAD_8ROWS      cq+64*0, 64*4
   5378    pxor                 m8, m8
   5379    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28  ; zero consumed coeffs
   5380    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
   5381    mova              [rsp], m8
   5382    call m(idct_16x16_internal_8bpc).main
   5383    mova                 m1, [rsp+32*1]
   5384    mova       [tmp1q-32*4], m0
   5385    mova       [tmp1q-32*3], m1
   5386    mova       [tmp1q-32*2], m2
   5387    mova       [tmp1q-32*1], m3
   5388    mova       [tmp1q+32*0], m4
   5389    mova       [tmp1q+32*1], m5
   5390    mova       [tmp1q+32*2], m6
   5391    mova       [tmp1q+32*3], m7
   5392    add               tmp1q, 32*8
   5393    mova       [tmp1q-32*4], m8
   5394    mova       [tmp1q-32*3], m9
   5395    mova       [tmp1q-32*2], m10
   5396    mova       [tmp1q-32*1], m11
   5397    mova       [tmp1q+32*0], m12
   5398    mova       [tmp1q+32*1], m13
   5399    mova       [tmp1q+32*2], m14
   5400    mova       [tmp1q+32*3], m15
   5401    LOAD_8ROWS      cq+64*2, 64*4
   5402    pxor                 m8, m8
   5403    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
   5404    add               tmp1q, 32*8
   5405    lea               tmp2q, [tmp1q+32*8]
   5406    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   5407    vpbroadcastd        m15, [o(pd_2048)]
   5408    add               tmp1q, 32*16
   5409    add               tmp2q, 32*32
        ; Odd rows for the 64-point kernel (no pre-scale needed for the
        ; square 64x64 transform, unlike the 64x32 variant).
   5410    mova                 m0, [cq+64* 1]
   5411    mova                 m1, [cq+64*31]
   5412    mova                 m2, [cq+64*17]
   5413    mova                 m3, [cq+64*15]
   5414    mova                 m4, [cq+64* 9]
   5415    mova                 m5, [cq+64*23]
   5416    mova                 m6, [cq+64*25]
   5417    mova                 m7, [cq+64* 7]
   5418    pxor                 m8, m8
   5419    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
   5420    add                  r6, o_idct64_offset  ; switch the o() base to the idct64 tables
   5421    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   5422    add                  r6, 8                ; next idct64 coefficient set
   5423    add               tmp1q, 32*8
   5424    sub               tmp2q, 32*8
   5425    mova                 m0, [cq+64* 5]
   5426    mova                 m1, [cq+64*27]
   5427    mova                 m2, [cq+64*21]
   5428    mova                 m3, [cq+64*11]
   5429    mova                 m4, [cq+64*13]
   5430    mova                 m5, [cq+64*19]
   5431    mova                 m6, [cq+64*29]
   5432    mova                 m7, [cq+64* 3]
   5433    pxor                 m8, m8
   5434    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
   5435    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   5436    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
   5437    sub               tmp1q, 32*44
   5438    vpbroadcastd        m10, [o(pw_8192)]    ; rounding factor for the transpose helper
   5439    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
   5440    add                  cq, 32              ; next 16-column input slice
   5441    add                r10d, 0x80000000      ; carry after 1 iteration (eob < 136) / 2 (full)
   5442    jnc .pass1_loop
   5443    lea               tmp1q, [rsp+32*7]
   5444    mov                r10b, 4               ; 4 pass-2 strips; byte write keeps bit 30 of r10d intact
   5445 .pass2_loop:
   5446    lea                  r2, [tmp1q+32*64]
   5447    mova                 m0, [r2-32*4]
   5448    mova                 m1, [r2-32*2]
   5449    mova                 m2, [r2+32*0]
   5450    mova                 m3, [r2+32*2]
   5451    pxor                 m4, m4
   5452    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
   5453    mova              [rsp], m4
   5454    test               r10d, 0x40000000      ; bit 30 set <=> eob < 136 (fast path)
   5455    jnz .fast
   5456    lea                  r3, [r2+32*64]
   5457    mova                 m4, [r3-32*4]
   5458    mova                 m5, [r3-32*2]
   5459    mova                 m6, [r3+32*0]
   5460    mova                 m7, [r3+32*2]
   5461 .fast:
   5462    call m(idct_16x16_internal_8bpc).main
   5463    mova                 m1, [rsp+32*1]
   5464    mova       [tmp1q-32*4], m0
   5465    mova       [tmp1q-32*3], m1
   5466    mova       [tmp1q-32*2], m2
   5467    mova       [tmp1q-32*1], m3
   5468    mova       [tmp1q+32*0], m4
   5469    mova       [tmp1q+32*1], m5
   5470    mova       [tmp1q+32*2], m6
   5471    mova       [tmp1q+32*3], m7
   5472    add               tmp1q, 32*8
   5473    mova       [tmp1q-32*4], m8
   5474    mova       [tmp1q-32*3], m9
   5475    mova       [tmp1q-32*2], m10
   5476    mova       [tmp1q-32*1], m11
   5477    mova       [tmp1q+32*0], m12
   5478    mova       [tmp1q+32*1], m13
   5479    mova       [tmp1q+32*2], m14
   5480    mova       [tmp1q+32*3], m15
   5481    mova                 m0, [r2-32*3]
   5482    mova                 m1, [r2-32*1]
   5483    mova                 m2, [r2+32*1]
   5484    mova                 m3, [r2+32*3]
   5485    pxor                 m4, m4
   5486    REPX       {mova x, m4}, m5, m6, m7
   5487    test               r10d, 0x40000000
   5488    jnz .fast2
   5489    mova                 m4, [r3-32*3]
   5490    mova                 m5, [r3-32*1]
   5491    mova                 m6, [r3+32*1]
   5492    mova                 m7, [r3+32*3]
   5493 .fast2:
   5494    add               tmp1q, 32*8
   5495    lea               tmp2q, [tmp1q+32*8]
   5496    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
   5497    vpbroadcastd        m15, [o(pd_2048)]
   5498    add                  r2, 32*8
   5499    add                  r3, 32*8
   5500    add               tmp1q, 32*16
   5501    add               tmp2q, 32*32
        ; Odd inputs for the 64-point column kernel; trailing comments give
        ; the original coefficient row index of each load.
   5502    mova                 m0, [r2-32*4] ;  1
   5503    mova                 m3, [r2+32*3] ; 15
   5504    mova                 m4, [r2+32*0] ;  9
   5505    mova                 m7, [r2-32*1] ;  7
   5506    pxor                 m1, m1
   5507    REPX       {mova x, m1}, m2, m5, m6
   5508    test               r10d, 0x40000000
   5509    jnz .fast3
   5510    mova                 m1, [r3+32*3] ; 31
   5511    mova                 m2, [r3-32*4] ; 17
   5512    mova                 m5, [r3-32*1] ; 23
   5513    mova                 m6, [r3+32*0] ; 25
   5514 .fast3:
   5515    add                  r6, o_idct64_offset  ; switch the o() base to the idct64 tables
   5516    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   5517    add                  r6, 8                ; next idct64 coefficient set
   5518    add               tmp1q, 32*8
   5519    sub               tmp2q, 32*8
   5520    mova                 m0, [r2-32*2] ;  5
   5521    mova                 m3, [r2+32*1] ; 11
   5522    mova                 m4, [r2+32*2] ; 13
   5523    mova                 m7, [r2-32*3] ;  3
   5524    pxor                 m1, m1
   5525    REPX       {mova x, m1}, m2, m5, m6
   5526    test               r10d, 0x40000000
   5527    jnz .fast4
   5528    mova                 m1, [r3+32*1] ; 27
   5529    mova                 m2, [r3-32*2] ; 21
   5530    mova                 m5, [r3-32*3] ; 19
   5531    mova                 m6, [r3+32*2] ; 29
   5532 .fast4:
   5533    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
   5534    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
   5535    sub               tmp1q, 32*28          ; rewind tmp1q for the next 16-column strip
        ; NOTE(review): r8 is presumably set by main_part2_pass2 to dst's
        ; total advance -- confirm against that routine.
   5536    sub                dstq, r8
   5537    lea                dstq, [dstq+strideq*4+16]  ; step to the next 16-column strip
   5538    dec                r10b
   5539    jg .pass2_loop
   5540    RET
   5541 
   5542 %endif ; ARCH_X86_64