tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

itx16_sse.asm (285869B)


      1 ; Copyright © 2021, VideoLAN and dav1d authors
      2 ; Copyright © 2021, Two Orioles, LLC
      3 ; Copyright © 2017-2021, The rav1e contributors
      4 ; Copyright © 2020, Nathan Egge
      5 ; Copyright © 2021, Matthias Dressel
      6 ; All rights reserved.
      7 ;
      8 ; Redistribution and use in source and binary forms, with or without
      9 ; modification, are permitted provided that the following conditions are met:
     10 ;
     11 ; 1. Redistributions of source code must retain the above copyright notice, this
     12 ;    list of conditions and the following disclaimer.
     13 ;
     14 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     15 ;    this list of conditions and the following disclaimer in the documentation
     16 ;    and/or other materials provided with the distribution.
     17 ;
     18 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     19 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     20 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     21 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     22 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     23 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     24 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     25 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     27 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     28 
     29 %include "config.asm"
     30 %include "ext/x86/x86inc.asm"
     31 
     32 SECTION_RODATA
; COEF n [, flag] : emits pd_n (4 packed dwords of n); when a 2nd argument is
; present, also emits the negated constant pd_mn (4 packed dwords of -n).
     33 %macro COEF 1-2
     34 pd_%1: times 4 dd %1
     35 %if %0 == 2
     36 pd_m%1: times 4 dd -%1
     37 %endif
     38 %endmacro
     39 
; 12-bit-scale transform coefficients (cos/sin constants used by the idct/iadst
; butterflies below; each also gets a negated twin where the code needs it).
     40 COEF  201
     41 COEF  401
     42 COEF  601, 1
     43 COEF  799
     44 COEF  995
     45 COEF 1189, 1
     46 COEF 1380, 1
     47 COEF 1567
     48 COEF 1751
     49 COEF 1931
     50 COEF 2106, 1
     51 COEF 2276, 1
     52 COEF 2440
     53 COEF 2598, 1
     54 COEF 2751, 1
     55 COEF 2896
     56 COEF 3035
     57 COEF 3166
     58 COEF 3290
     59 COEF 3406
     60 COEF 3513
     61 COEF 3612
     62 COEF 3703
     63 COEF 3784
     64 COEF 3857
     65 COEF 3920
     66 COEF 3973
     67 COEF 4017
     68 COEF 4052
     69 COEF 4076
     70 COEF 4091
     71 
; pshufb mask: deinterleaves even/odd words within a 16-byte lane
     72 deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
     73 
     74 %if ARCH_X86_32
     75 pd_1:            times 4 dd     1
     76 %endif
     77 pd_2:            times 4 dd     2
     78 pw_5:            times 8 dw     5
     79 pd_1321:         times 4 dd  1321
     80 pd_2482:         times 4 dd  2482
     81 pd_m3344:        times 4 dd -3344
     82 pd_2048:         times 4 dd  2048
; mixed-sign rounding constants for flipadst output (low half +, high half -)
     83 pw_4x2048_4xm2048: times 4 dw 2048
     84                   times 4 dw -2048
     85 pw_4xm2048_4x2048: times 4 dw -2048
     86                   times 4 dw 2048
     87 pw_2048:         times 8 dw  2048
     88 pw_m2048:        times 8 dw  -2048
     89 pd_3803:         times 4 dd  3803
     90 pw_4096:         times 8 dw  4096
     91 pd_5793:         times 4 dd  5793
     92 pd_6144:         times 4 dd  6144
     93 pw_8192:         times 8 dw  8192
     94 pd_10240:        times 4 dd 10240
     95 pd_11586:        times 4 dd 11586
     96 pw_1697x8:       times 8 dw  1697*8
     97 pw_2896x8:       times 8 dw  2896*8
     98 pw_1697x16:      times 8 dw  1697*16
     99 pw_16384:        times 8 dw 16384
; clamp limit for 10-bit pixels (2^10 - 1)
    100 pixel_10bpc_max: times 8 dw  0x03ff
    101 
; interleaved coefficient pairs for pmaddwd-based butterflies
    102 pw_1567_3784:    times 4 dw  1567,  3784
    103 pw_m3784_1567:   times 4 dw -3784,  1567
    104 pw_2896_2896:    times 4 dw  2896,  2896
    105 pw_m2896_2896:   times 4 dw -2896,  2896
    106 
; 18-bit signed clamp bounds for intermediate transform values
    107 clip_18b_min: times 4 dd -0x20000
    108 clip_18b_max: times 4 dd  0x1ffff
    109 
; constant table consumed by the idct64 multiply stages (12-bit-scale values)
    110 idct64_mul_16bpc:
    111 dd 4095,  101, 2967, -2824,  3745, 1660, 3822, -1474,   401,  4076,   799,  4017
    112 dd -700, 4036, 2359,  3349, -2191, 3461,  897,  3996, -2598, -3166, -4017,  -799
    113 dd 4065,  501, 3229, -2520,  3564, 2019, 3948, -1092,  1931,  3612,  3406,  2276
    114 dd -301, 4085, 2675,  3102, -1842, 3659, 1285,  3889, -1189, -3920, -2276, -3406
    115 
; 8bpc ssse3 routines reused for the second pass of several 16bpc transforms
    116 cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
    117 cextern iadst_4x4_internal_8bpc_ssse3.main
    118 cextern idct_4x8_internal_8bpc_ssse3.main
    119 cextern iadst_4x8_internal_8bpc_ssse3.main
    120 cextern idct_16x4_internal_8bpc_ssse3.main
    121 cextern iadst_16x4_internal_8bpc_ssse3.main
    122 cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
    123 cextern idct_8x4_internal_8bpc_ssse3.main
    124 cextern iadst_8x4_internal_8bpc_ssse3.main
    125 cextern idct_8x8_internal_8bpc_ssse3.main
    126 cextern idct_8x8_internal_8bpc_ssse3.pass1_end3
    127 cextern iadst_8x8_internal_8bpc_ssse3.main
    128 cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
    129 cextern idct_16x8_internal_8bpc_ssse3.main
    130 cextern iadst_16x8_internal_8bpc_ssse3.main
    131 cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
    132 cextern idct_8x32_internal_8bpc_ssse3.main
    133 cextern idct_8x32_internal_8bpc_ssse3.main_fast
    134 cextern idct_8x32_internal_8bpc_ssse3.main_veryfast
    135 cextern idct_16x64_internal_8bpc_ssse3.main
    136 cextern idct_16x64_internal_8bpc_ssse3.main_fast
    137 
; NOTE(review): the tbl_* entries appear to be eob thresholds / byte offsets
; used by the larger transforms' fast-path selection — confirm against callers.
    138 tbl_4x16_2d: db 0, 13, 29, 45
    139 tbl_4x16_h: db 0, 16, 32, 48
    140 tbl_4x16_v: db 0, 4, 8, 12
    141 
    142 tbl_8x16_2d: db 0, 14, 30, 46
    143 tbl_8x16_v: db 0, 4, 8, 12
    144 tbl_8x16_h: db 0, 32, 64, 96
    145 
    146 tbl_16x16_2d: db 0, 10, 36, 78
    147 tbl_16x16_v: db 0, 4, 8, 12
    148 tbl_16x16_h: db 0, 64, 128, 192
    149 
    150 tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203
    151 
    152 tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343
    153 
    154 tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
    155 tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406
    156 
; word offsets (hence the 2* scale) into scratch rows for the Nx32/Nx64 cases
    157 tbl_Nx32_odd_offset: db 2*16, 2*23
    158                     db 2*20, 2*19
    159                     db 2*18, 2*21
    160                     db 2*22, 2*17
    161                     db 2*30, 2*25
    162                     db 2*26, 2*29
    163                     db 2*28, 2*27
    164                     db 2*24, 2*31
    165 
    166 tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
    167                 db 2* 8, 2*40, 2*23, 2*38
    168                 db 2* 1, 2*36, 2*20, 2*42
    169                 db 2* 9, 2*44, 2*19, 2*34
    170                 db 2* 2, 2*60, 2*18, 2*50
    171                 db 2*10, 2*52, 2*21, 2*58
    172                 db 2* 3, 2*56, 2*22, 2*54
    173                 db 2*11, 2*48, 2*17, 2*62
    174 
    175 SECTION .text
    176 
; m(x) expands to the mangled, SUFFIX-qualified symbol name for function x
    177 %define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
    178 %define m(x) m_suffix(x, SUFFIX)
    179 
    180 ; This refers to the first function in itx_sse i.e. the start of the text section
    181 ; which is needed as a base pointer for constants.
    182 %define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3)
    183 
; o(x): constant address; direct on x86-64, r6-relative (PIC) on x86-32,
; where r6 must hold $$ (see the LEA r6, $$ emitted by INV_TXFM_FN).
    184 %if ARCH_X86_64
    185 %define o(x) x
    186 %else
    187 %define o(x) r6-$$+x ; PIC
    188 %endif
    189 
; One 4-point inverse Walsh-Hadamard transform pass on packed dwords.
; In: m0-m3 = in0-in3.  Out: out0 in m0, out1 in m5, out2 in m2, out3 in m4
; (m1/m3 are left holding stale inputs; m4/m5 are clobbered as temporaries).
    190 %macro IWHT4_1D 0
    191    ; m0 = in0,  m1 = in1,  m2 = in2,  m3 = in3
    192    paddd                m0, m1      ; in0 += in1
    193    psubd                m4, m2, m3  ; tmp0 = in2 - in3
    194    psubd                m5, m0, m4  ; tmp1 = (in0 - tmp0) >> 1
    195    psrad                m5, 1
    196    psubd                m2, m5, m1  ; in2 = tmp1 - in1
    197    psubd                m5, m3      ; in1 = tmp1 - in3
    198    psubd                m0, m5      ; in0 -= in1
    199    paddd                m4, m2      ; in3 = tmp0 + in2
    200    ; m0 = out0,  m1 = in1,  m2 = out2,  m3 = in3
    201    ; m4 = out3,  m5 = out1
    202 %endmacro
    203 
    204 INIT_XMM sse2
; 4x4 inverse WHT+WHT with add to dst, 16bpc pixels.
; Args: dst, stride, c (coeff buffer, zeroed on exit), eob (unused beyond the
; named-arg slot), bdmax (pixel max, read via bdmaxm from the stack).
; Two IWHT4_1D passes with a dword transpose between them, then clamp to
; [0, bdmax] and store; output row order is the reason for the swapped
; movq/movhps stores at the end (see "write outN" comments).
    205 cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax
    206    mova                 m0, [cq+16*0]
    207    mova                 m1, [cq+16*1]
    208    mova                 m2, [cq+16*2]
    209    mova                 m3, [cq+16*3]
    210    REPX       {psrad x, 2}, m0, m1, m2, m3  ; coeffs are stored <<2
    211    IWHT4_1D
; transpose the pass-1 output (m0,m5,m2,m4) back into row order
    212    punpckldq            m1, m0, m5
    213    punpckhdq            m3, m0, m5
    214    punpckldq            m5, m2, m4
    215    punpckhdq            m2, m4
    216    punpcklqdq           m0, m1, m5
    217    punpckhqdq           m1, m5
    218    punpcklqdq           m4, m3, m2
    219    punpckhqdq           m3, m2
    220    mova                 m2, m4
    221    IWHT4_1D
    222    packssdw             m0, m4 ; low: out3,  high: out0
    223    packssdw             m2, m5 ; low: out2,  high: out1
; zero the coefficient buffer (required by the decoder contract in this file)
    224    pxor                 m4, m4
    225    mova          [cq+16*0], m4
    226    mova          [cq+16*1], m4
    227    mova          [cq+16*2], m4
    228    mova          [cq+16*3], m4
    229    lea                  r2, [dstq+strideq*2]
; load dst rows paired to match the out3/out0 and out2/out1 packing above
    230    movq                 m1, [dstq+strideq*0]
    231    movhps               m1, [r2  +strideq*1]
    232    movq                 m3, [r2  +strideq*0]
    233    movhps               m3, [dstq+strideq*1]
    234    movd                 m5, bdmaxm
    235    pshuflw              m5, m5, q0000  ; broadcast
    236    punpcklqdq           m5, m5         ; broadcast
    237    paddsw               m0, m1
    238    paddsw               m2, m3
; clamp result to [0, bdmax]
    239    pmaxsw               m0, m4
    240    pmaxsw               m2, m4
    241    pminsw               m0, m5
    242    pminsw               m2, m5
    243    movhps [r2  +strideq*1], m0 ; write out0
    244    movhps [dstq+strideq*1], m2 ; write out1
    245    movq   [r2  +strideq*0], m2 ; write out2
    246    movq   [dstq+strideq*0], m0 ; write out3
    247    RET
    248 
    249 ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
    250 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
    251 ; flags: 2 = inv_dst1, 4 = inv_dst2
    252 ; skip round/shift if rnd is not a number
; A coef argument < 32 is an mN register index holding the preloaded constant;
; otherwise it is a literal value and the matching pd_* constant is loaded.
    253 %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
    254 ; %1 dst/src[1]
    255 ; %2 dst/src[2]
    256 ; %3 tmp[1]
    257 ; %4 tmp[2]
    258 ; %5 tmp[3]
    259 ; %6 rnd
    260 ; %7 coef[1]
    261 ; %8 coef[2]
    262 ; %9 flags
    263 %ifnidn %7,%8   ; optimize when coef1 == coef2
    264 %if %8 < 32
    265    pmulld              m%4, m%1, m%8
    266    pmulld              m%3, m%2, m%8
    267 %else
    268    mova                m%3, [o(pd_%8)]
    269    pmulld              m%4, m%1, m%3
    270    pmulld              m%3, m%2
    271 %endif
    272 %endif
    273 %if %7 < 32
    274    pmulld              m%1, m%7
    275    pmulld              m%2, m%7
    276 %else
    277    mova                m%5, [o(pd_%7)]
    278    pmulld              m%1, m%5
    279    pmulld              m%2, m%5
    280 %endif
    281 %if %9 & 4  ; invert dst2
    282    paddd               m%4, m%2
    283    psubd               m%2, m%6, m%4
    284 %else
    285 %ifnum %6   ; fold the rounding constant in only when rnd is numeric
    286 %ifnidn %7,%8
    287    paddd               m%4, m%6
    288 %else
    289    paddd               m%1, m%6
    290 %endif
    291 %endif
    292 %ifnidn %7,%8
    293    paddd               m%2, m%4
    294 %else
    295    mova                m%3, m%2
    296    paddd               m%2, m%1
    297 %endif
    298 %endif
    299 %if %9 & 2  ; invert dst1
    300    psubd               m%3, m%1
    301    paddd               m%1, m%3, m%6
    302 %else
    303 %ifnum %6
    304 %ifnidn %7,%8
    305    paddd               m%1, m%6
    306 %endif
    307 %endif
    308    psubd               m%1, m%3
    309 %endif
    310 %ifnum %6   ; final >> 12 only when rounding was requested
    311    psrad               m%2, 12
    312    psrad               m%1, 12
    313 %endif
    314 %endmacro
    315 
; Emit the public inv_txfm_add_<type1>_<type2>_<size>_16bpc entry point.
; It sets tx2q to the type2 .pass2 label, optionally biases eobd by an eob
; offset (%3 numeric) or loads a pointer table into r5 (%3 symbolic), and
; dispatches to the type1 internal function. For dct_dct, eob == 0 falls
; through to the macro invoker's dconly code (emitted right after %%end).
    316 %macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack
    317 cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
    318    %define %%p1 m(i%1_%4_internal_16bpc)
    319 %if ARCH_X86_32
    320    LEA                  r6, $$        ; base pointer for o() PIC addressing
    321 %endif
    322 %if has_epilogue
    323 %ifidn %1_%2, dct_dct
    324    test               eobd, eobd
    325    jz %%end                           ; eob == 0: take the dc-only fast path
    326 %endif
    327    lea                tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
    328 %ifnum %3
    329 %if %3
    330    add                eobd, %3
    331 %endif
    332 %else
    333    lea                  r5, [o(%3)]
    334 %endif
    335    call %%p1
    336    RET
    337 %%end:
    338 %else
    339    ; Jump to the 1st txfm function if we're not taking the fast path, which
    340    ; in turn performs an indirect jump to the 2nd txfm function.
    341    lea                tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
    342 %ifnum %3
    343 %if %3
    344    add                eobd, %3
    345 %endif
    346 %else
    347    lea                  r5, [o(%3)]
    348 %endif
    349 %ifidn %1_%2, dct_dct
    350    test               eobd, eobd
    351    jnz %%p1
    352 %else
    353    ; jump to the 1st txfm function unless it's located directly after this
    354    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
    355 ALIGN function_align
    356 %%end:
    357 %endif
    358 %endif
    359 %endmacro
    360 
; 4x4 wrapper around INV_TXFM_FN; for dct_dct it also emits the shared
; dc-only path (.dconly/.dconly2), which scales the single DC coefficient
; and adds the resulting constant to every 4x2 pixel pair.
    361 %macro INV_TXFM_4X4_FN 2 ; type1, type2
    362    INV_TXFM_FN          %1, %2, 0, 4x4
    363 %ifidn %1_%2, dct_dct
    364    imul                r5d, [cq], 181  ; 181 ~= 128*sqrt(2); dc * sqrt(2) in Q7
    365    mov                [cq], eobd ; 0
    366    mov                 r3d, 4          ; 4 rows to process
    367 .dconly:
    368    add                 r5d, 128        ; round the Q7 scale
    369    sar                 r5d, 8
    370 .dconly2:
    371    imul                r5d, 2896
    372    mova                 m2, [o(pixel_10bpc_max)]
    373    add                 r5d, 34816      ; NOTE(review): rounding + output bias — confirm derivation
    374    movd                 m0, r5d
    375    pshuflw              m0, m0, q1111   ; broadcast the high word of the product
    376    pxor                 m3, m3
    377    punpcklqdq           m0, m0
    378 .dconly_loop:
; add the dc value to two rows at a time, clamping to [0, 0x3ff]
    379    movq                 m1, [dstq+strideq*0]
    380    movhps               m1, [dstq+strideq*1]
    381    paddw                m1, m0
    382    pminsw               m1, m2
    383    pmaxsw               m1, m3
    384    movq   [dstq+strideq*0], m1
    385    movhps [dstq+strideq*1], m1
    386    lea                dstq, [dstq+strideq*2]
    387    sub                 r3d, 2
    388    jg .dconly_loop
    389    RET
    390 %endif
    391 %endmacro
    392 
; One 4-point inverse DCT pass on packed dwords (coefficients at 12-bit scale).
    393 %macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
    394    ; butterfly rotation
    395    ITX_MULSUB_2D        %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1  %3 out0
    396    ITX_MULSUB_2D        %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2  %4 out3
    397    ; Hadamard rotation
    398    psubd               m%5, m%1, m%2
    399    paddd               m%2, m%1
    400    paddd               m%1, m%3, m%4
    401    psubd               m%3, m%4
    402    ; %1 (src1) = out0
    403    ; %2 (src2) = out1
    404    ; %3 (src3) = out3
    405    ; %5 (tmp1) = out2
    406 %endmacro
    407 
    408 INIT_XMM sse4
    409 
; 4x4 entry points: pass1 = dct, pass2 = each supported second transform
    410 INV_TXFM_4X4_FN dct, dct
    411 INV_TXFM_4X4_FN dct, identity
    412 INV_TXFM_4X4_FN dct, adst
    413 INV_TXFM_4X4_FN dct, flipadst
    414 
; 4x4 idct first pass (dword precision) + shared .pass2 (word precision).
; Pass 1 leaves m0/m1 packed+transposed and m5 = pd_2048, then tail-jumps to
; the second transform through tx2q.
    415 cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    416    mova                 m0, [cq+16*0]
    417    mova                 m1, [cq+16*1]
    418    mova                 m2, [cq+16*2]
    419    mova                 m3, [cq+16*3]
    420    mova                 m5, [o(pd_2048)]
    421    call .pass1_main
    422    packssdw             m0, m1     ; out0 out1
    423    packssdw             m4, m2     ; out2 out3
    424    ; transpose
    425    punpckhwd            m2, m0, m4
    426    punpcklwd            m0, m4
    427    punpckhwd            m1, m0, m2
    428    punpcklwd            m0, m2
    429    ; m0 = out0 out1
    430    ; m1 = out2 out3
    431    ; m5 = pd_2048
    432    jmp                tx2q
    433 .pass1_main:
    434    IDCT4_1D              0, 1, 2, 3, 4, 6, 7, 5
    435    ret
    436 .pass2:
    437    ; m0 = in0 in1
    438    ; m1 = in2 in3
    439    ; m5 = pd_2048
; 4-point idct via pmaddwd on interleaved word pairs
    440    punpckhwd            m2, m1, m0
    441    punpcklwd            m1, m0
    442    pmaddwd              m4, m2, [o(pw_m3784_1567)]
    443    pmaddwd              m2, [o(pw_1567_3784)]
    444    pmaddwd              m0, m1, [o(pw_m2896_2896)]
    445    pmaddwd              m1, [o(pw_2896_2896)]
    446    REPX      {paddd x, m5}, m4, m2, m0, m1
    447    packssdw             m5, m5     ; pw_2048
    448    REPX      {psrad x, 12}, m4, m2, m0, m1
    449    packssdw             m2, m4     ; t3 t2
    450    packssdw             m1, m0     ; t0 t1
    451    paddsw               m0, m1, m2 ; out0 out1
    452    psubsw               m1, m2     ; out3 out2
    453    pmulhrsw             m0, m5     ; final >> 4 with rounding
    454    pmulhrsw             m1, m5
; load dst rows (rows 2/3 swapped to match the out3/out2 packing of m1)
    455    movq                 m2, [dstq+strideq*0]
    456    movhps               m2, [dstq+strideq*1]
    457    lea                  r5, [dstq+strideq*2]
    458    movq                 m3, [r5  +strideq*1]
    459    movhps               m3, [r5  +strideq*0]
    460    mova                 m5, [o(pixel_10bpc_max)]
; zero the coefficient buffer before writing pixels
    461    pxor                 m4, m4
    462    mova          [cq+16*0], m4
    463    mova          [cq+16*1], m4
    464    mova          [cq+16*2], m4
    465    mova          [cq+16*3], m4
    466    paddw                m0, m2
    467    paddw                m1, m3
; clamp to [0, 0x3ff] and store
    468    pmaxsw               m0, m4
    469    pmaxsw               m1, m4
    470    pminsw               m0, m5
    471    pminsw               m1, m5
    472    movq   [dstq+strideq*0], m0
    473    movhps [dstq+strideq*1], m0
    474    movhps [r5  +strideq*0], m1
    475    movq   [r5  +strideq*1], m1
    476    RET
    477 
    478 INV_TXFM_4X4_FN adst, dct
    479 INV_TXFM_4X4_FN adst, adst
    480 INV_TXFM_4X4_FN adst, flipadst
    481 INV_TXFM_4X4_FN adst, identity
    482 
; 4x4 iadst first pass; .main/.main2 implement the 4-point adst in dword
; precision and are shared with iflipadst (pass 1) and iadst_4x8 (via .main2).
    483 cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    484    call .main
    485    packssdw             m0, m2            ; out0 out1
    486    packssdw             m1, m4            ; out2 out3
    487    ; transpose
    488    punpckhwd            m2, m0, m1
    489    punpcklwd            m0, m1
    490    punpckhwd            m1, m0, m2
    491    punpcklwd            m0, m2
    492    ; m0 = out0 out1
    493    ; m1 = out2 out3
    494    ; m5 = pd_2048
    495    jmp                tx2q
    496 .pass2:
    497    ; m0 = in0 in1
    498    ; m1 = in2 in3
    499 %if ARCH_X86_32
    500    lea                  r5, [o(itx8_start)]  ; base ptr expected by the 8bpc code
    501 %endif
; reuse the 8bpc ssse3 adst kernel for the word-precision second pass
    502    call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
    503 .end:
    504    mova                 m4, [o(pw_2048)]
    505    movq                 m2, [dstq+strideq*0]
    506    movhps               m2, [dstq+strideq*1]
    507    lea                  r5, [dstq+strideq*2]
    508    movq                 m3, [r5  +strideq*0]
    509    movhps               m3, [r5  +strideq*1]
    510    mova                 m5, [o(pixel_10bpc_max)]
    511    pmulhrsw             m0, m4
    512    pmulhrsw             m1, m4
; zero the coefficient buffer
    513    pxor                 m4, m4
    514    mova          [cq+16*0], m4
    515    mova          [cq+16*1], m4
    516    mova          [cq+16*2], m4
    517    mova          [cq+16*3], m4
    518    paddw                m0, m2
    519    paddw                m1, m3
; clamp to [0, 0x3ff] and store
    520    pmaxsw               m0, m4
    521    pmaxsw               m1, m4
    522    pminsw               m0, m5
    523    pminsw               m1, m5
    524    movq   [dstq+strideq*0], m0
    525    movhps [dstq+strideq*1], m0
    526    movq   [r5  +strideq*0], m1
    527    movhps [r5  +strideq*1], m1
    528    RET
ALIGN function_align
; .main: load T[0..3]; .main2 entry expects m5=T[0], m1=T[2], m3=T[3] and
; r3 -> T[1] in memory. Outputs (before packing) land in m0, m2, m1, m4.
.main:
    530    mova                 m1, [cq+16*2]
    531    mova                 m3, [cq+16*3]
    532    mova                 m5, [cq+16*0]
    533    lea                  r3, [cq+16*1]
    534 .main2:
    535    mova                 m0, [o(pd_1321)]  ; SINPI_1_9
    536    mova                 m2, [o(pd_2482)]  ; SINPI_2_9
    537    mova                 m6, [o(pd_3803)]  ; SINPI_4_9
    538    pmulld               m4, m0, m1        ; s[4] = SINPI_1_9 * T[2]
    539    pmulld               m7, m3, m6        ; s[6] = SINPI_4_9 * T[3]
    540    pmulld               m6, m1            ; s[3] = SINPI_4_9 * T[2]
    541    pmulld               m0, m5            ; s[0] = SINPI_1_9 * T[0]
    542    psubd                m1, m3            ; T[2] - T[3]
    543    pmulld               m3, m2            ; s[5] = SINPI_2_9 * T[3]
    544    pmulld               m2, m5            ; s[1] = SINPI_2_9 * T[0]
    545    paddd                m0, m6            ; s[0] += s[3]
    546    paddd                m0, m3            ; s[0] += s[5]
    547    mova                 m3, [o(pd_m3344)] ; -SINPI_3_9
    548    psubd                m2, m4            ; s[1] -= s[4]
    549    psubd                m2, m7            ; s[1] -= s[6]
    550    psubd                m1, m5            ; -b7 = (T[2] -T[3]) - T[0]
    551    pmulld               m1, m3            ; s[2]  = -SINPI_3_9 * -b7
    552    pmulld               m3, [r3]          ; -s[3] = -SINPI_3_9 * T[1]
    553    mova                 m5, [o(pd_2048)]
    554    REPX      {paddd x, m5}, m0, m1        ; {s[0], s[2]} + 2048
    555    paddd                m4, m0, m2        ; x[3]  = s[0] + s[1]
    556    psubd                m2, m3            ; x[1]  = s[1] + s[3]
    557    psubd                m0, m3            ; x[0]  = s[0] + s[3]
    558    paddd                m4, m3            ; x[3] -= s[3]
    559    paddd                m2, m5            ; x[1] + 2048
    560    REPX      {psrad x, 12}, m0, m2, m1, m4
    561    ret
    562 
    563 
    564 
    565 INV_TXFM_4X4_FN flipadst, dct
    566 INV_TXFM_4X4_FN flipadst, adst
    567 INV_TXFM_4X4_FN flipadst, flipadst
    568 INV_TXFM_4X4_FN flipadst, identity
    569 
; 4x4 iflipadst: same adst kernel as iadst, with the output row order
; reversed — pass 1 transposes in flipped order, pass 2 stores rows swapped.
    570 cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    571    call m(iadst_4x4_internal_16bpc).main
    572    packssdw             m0, m2            ; out0 out1
    573    packssdw             m1, m4            ; out2 out3
    574    ; transpose (flipped operand order reverses the output rows)
    575    punpcklwd            m2, m1, m0
    576    punpckhwd            m1, m0
    577    punpcklwd            m0, m1, m2
    578    punpckhwd            m1, m2
    579    ; m0 = out0 out1
    580    ; m1 = out2 out3
    581    ; m5 = pd_2048
    582    jmp                tx2q
    583 .pass2:
    584    ; m0 = in0 in1
    585    ; m1 = in2 in3
    586 %if ARCH_X86_32
    587    lea                 r5, [o(itx8_start)]  ; base ptr expected by the 8bpc code
    588 %endif
    589    call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
    590    mova                 m4, [o(pw_2048)]
; dst rows loaded in flipped order relative to iadst's .end
    591    movq                 m3, [dstq+strideq*1]
    592    movhps               m3, [dstq+strideq*0]
    593    lea                  r5, [dstq+strideq*2]
    594    movq                 m2, [r5  +strideq*1]
    595    movhps               m2, [r5  +strideq*0]
    596    mova                 m5, [o(pixel_10bpc_max)]
    597    pmulhrsw             m0, m4
    598    pmulhrsw             m1, m4
; zero the coefficient buffer
    599    pxor                 m4, m4
    600    mova          [cq+16*0], m4
    601    mova          [cq+16*1], m4
    602    mova          [cq+16*2], m4
    603    mova          [cq+16*3], m4
    604    paddw                m0, m2
    605    paddw                m1, m3
; clamp to [0, 0x3ff] and store in reversed row order
    606    pmaxsw               m0, m4
    607    pmaxsw               m1, m4
    608    pminsw               m0, m5
    609    pminsw               m1, m5
    610    movhps [dstq+strideq*0], m1
    611    movq   [dstq+strideq*1], m1
    612    movhps [r5  +strideq*0], m0
    613    movq   [r5  +strideq*1], m0
    614    RET
    615 
    616 INV_TXFM_4X4_FN identity, dct
    617 INV_TXFM_4X4_FN identity, adst
    618 INV_TXFM_4X4_FN identity, flipadst
    619 INV_TXFM_4X4_FN identity, identity
    620 
; 4x4 identity transform: pass 1 scales by 5793 (>>12), pass 2 scales by
; 1697/2048 via pmulhrsw-and-add, then rounds, clamps and adds to dst.
    621 cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    622    mova                 m3, [o(pd_5793)]
    623    pmulld               m0, m3, [cq+16*0]
    624    pmulld               m1, m3, [cq+16*1]
    625    pmulld               m2, m3, [cq+16*2]
    626    pmulld               m3,     [cq+16*3]
    627    mova                 m5, [o(pd_2048)]
    628    REPX      {paddd x, m5}, m0, m1, m2, m3  ; + rounding
    629    REPX      {psrad x, 12}, m0, m1, m2, m3  ; >> 12
    630    packssdw             m0, m1
    631    packssdw             m2, m3
    632    ; transpose
    633    punpckhwd            m3, m0, m2
    634    punpcklwd            m0, m2
    635    punpckhwd            m1, m0, m3
    636    punpcklwd            m0, m3
    637    ; m0 = out0 out1
    638    ; m1 = out2 out3
    639    ; m5 = pd_2048
    640    jmp                tx2q
    641 .pass2:
    642    ; m0 = in0 in1
    643    ; m1 = in2 in3
    644    ; m5 = pd_2048
    645    mova                 m4, [o(pw_1697x8)]
    646    movq                 m2, [dstq+strideq*0]
    647    movhps               m2, [dstq+strideq*1]
    648    lea                  r5, [dstq+strideq*2]
; x += (x * 1697*8 + 2^14) >> 15, i.e. the identity4 scale in word precision
    649    pmulhrsw             m3, m4, m0
    650    pmulhrsw             m4, m1
    651    paddsw               m0, m3
    652    paddsw               m1, m4
    653    movq                 m3, [r5  +strideq*0]
    654    movhps               m3, [r5  +strideq*1]
    655    mova                 m4, [o(pixel_10bpc_max)]
    656    packssdw             m5, m5 ; pw_2048
    657    pmulhrsw             m0, m5
    658    pmulhrsw             m1, m5
; zero the coefficient buffer
    659    pxor                 m5, m5
    660    mova          [cq+16*0], m5
    661    mova          [cq+16*1], m5
    662    mova          [cq+16*2], m5
    663    mova          [cq+16*3], m5
    664    paddw                m0, m2
    665    paddw                m1, m3
; clamp to [0, 0x3ff] and store
    666    pmaxsw               m0, m5
    667    pmaxsw               m1, m5
    668    pminsw               m0, m4
    669    pminsw               m1, m4
    670    movq   [dstq+strideq*0], m0
    671    movhps [dstq+strideq*1], m0
    672    movq   [r5  +strideq*0], m1
    673    movhps [r5  +strideq*1], m1
    674    RET
    675 
; 4x8 wrapper around INV_TXFM_FN; the dct_dct dc-only path applies the extra
; sqrt(2) scale twice (181 ~= 128*sqrt(2)) and reuses the 4x4 dconly loop
; with r3d = 8 rows.
    676 %macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
    677    INV_TXFM_FN          %1, %2, %3, 4x8
    678 %ifidn %1_%2, dct_dct
    679    imul                r5d, [cq], 181
    680    mov                [cq], eobd ; 0
    681    mov                 r3d, 8
    682    add                 r5d, 128
    683    sar                 r5d, 8
    684    imul                r5d, 181
    685    jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
    686 %endif
    687 %endmacro
    688 
; 4x8 entry points with dct as the first pass
    689 INV_TXFM_4X8_FN dct, dct
    690 INV_TXFM_4X8_FN dct, identity, 9
    691 INV_TXFM_4X8_FN dct, adst
    692 INV_TXFM_4X8_FN dct, flipadst
    693 
    693 
; 4x8 idct: pass 1 runs the 4-point dct on each 4x4 half (with the extra
; sqrt(2)*2048 = 2896 pre-scale for rect transforms), spilling the upper
; half into the coefficient buffer; .end is the shared 4x8 store tail.
    694 cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    695 %undef cmp
    696    mova                 m5, [o(pd_2048)]
; r5d = 16 if eobd >= 13 (process the 2nd 4x4 half first), else 0
    697 %if ARCH_X86_64
    698    xor                 r5d, r5d
    699    cmp                eobd, 13
    700    setge               r5b
    701 %else
    702    mov                 r5d, 1
    703    cmp                eobd, 13
    704    sbb                 r5d, 0
    705 %endif
    706    shl                 r5d, 4
    707 .loop_pass1:
    708    mova                 m3, [o(pd_2896)]
; rect2 pre-scale: x = (x * 2896 + 2048) >> 12
    709    pmulld               m0, m3, [cq+32*0+r5]
    710    pmulld               m1, m3, [cq+32*1+r5]
    711    pmulld               m2, m3, [cq+32*2+r5]
    712    pmulld               m3, [cq+32*3+r5]
    713    REPX      {paddd x, m5}, m0, m1, m2, m3
    714    REPX      {psrad x, 12}, m0, m1, m2, m3
    715    call m(idct_4x4_internal_16bpc).pass1_main
    716    packssdw             m0, m1     ; out0 out1
    717    packssdw             m4, m2     ; out2 out3
    718    test                r5d, r5d
    719    jz .end_pass1
; spill the 2nd half's result, then redo the loop for the 1st half
    720    mova       [cq+32*0+16], m0
    721    mova       [cq+32*1+16], m4
    722    xor                 r5d, r5d
    723    jmp .loop_pass1
    724 .end_pass1:
; transpose both halves into m0-m3
    725    punpckhwd            m2, m0, m4
    726    punpcklwd            m0, m4
    727    punpckhwd            m1, m0, m2
    728    punpcklwd            m0, m2
    729    mova                 m2, [cq+32*0+16]
    730    mova                 m6, [cq+32*1+16]
    731    punpckhwd            m4, m2, m6
    732    punpcklwd            m2, m6
    733    punpckhwd            m3, m2, m4
    734    punpcklwd            m2, m4
    735    ; m0-3 = packed & transposed output
    736    jmp                tx2q
    737 .pass2:
    738 %if ARCH_X86_32
    739    lea                  r5, [o(itx8_start)]  ; base ptr expected by the 8bpc code
    740 %endif
    741    call m_suffix(idct_4x8_internal_8bpc, _ssse3).main
    742    ; m0-3 is now out0/1,3/2,4/5,7/6
    743    mova                 m4, [o(pw_2048)]
; swap the halves of m1/m3 so rows come out in order
    744    shufps               m1, m1, q1032
    745    shufps               m3, m3, q1032
    746 .end:
; shared 4x8 tail: round (m4), zero coeffs, add to dst, clamp, store
    747    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    748    pxor                 m4, m4
    749    REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
    750    mova                 m7, [o(pixel_10bpc_max)]
    751    lea                  r2, [strideq*3]
    752    movq                 m5, [dstq+strideq*0]
    753    movq                 m6, [dstq+strideq*2]
    754    movhps               m5, [dstq+strideq*1]
    755    movhps               m6, [dstq+r2]
    756    lea                  r4, [dstq+strideq*4]
    757    paddw                m0, m5
    758    paddw                m1, m6
    759    movq                 m5, [r4+strideq*0]
    760    movq                 m6, [r4+strideq*2]
    761    movhps               m5, [r4+strideq*1]
    762    movhps               m6, [r4+r2]
    763    paddw                m2, m5
    764    paddw                m3, m6
    765    REPX     {pminsw x, m7}, m0, m1, m2, m3
    766    REPX     {pmaxsw x, m4}, m0, m1, m2, m3
    767    movq   [dstq+strideq*0], m0
    768    movhps [dstq+strideq*1], m0
    769    movq   [dstq+strideq*2], m1
    770    movhps [dstq+r2       ], m1
    771    movq   [r4  +strideq*0], m2
    772    movhps [r4  +strideq*1], m2
    773    movq   [r4  +strideq*2], m3
    774    movhps [r4  +r2       ], m3
    775    RET
    776 
    777 INV_TXFM_4X8_FN adst, dct
    778 INV_TXFM_4X8_FN adst, adst
    779 INV_TXFM_4X8_FN adst, flipadst
    780 INV_TXFM_4X8_FN adst, identity, 9
    781 
; 4x8 iadst: .pass1_main (also used by iflipadst_4x8) runs the 4-point adst
; on each 4x4 half with the rect2 pre-scale, spilling the 2nd half's result
; to [cq+32*2+16]/[cq+32*3+16]; pass 2 reuses the 8bpc adst kernel.
    782 cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    783    call .pass1_main
    784    punpckhwd            m2, m0, m1
    785    punpcklwd            m0, m1
    786    punpckhwd            m1, m0, m2
    787    punpcklwd            m0, m2
    788    mova                 m2, [cq+32*2+16]
    789    mova                 m6, [cq+32*3+16]
    790    punpckhwd            m4, m2, m6
    791    punpcklwd            m2, m6
    792    punpckhwd            m3, m2, m4
    793    punpcklwd            m2, m4
    794    ; m0-3 = packed & transposed output
    795    jmp                tx2q
    796 .pass1_main:
    797 %undef cmp
; r5d = 16 if eobd >= 13 (process the 2nd 4x4 half first), else 0
    798 %if ARCH_X86_64
    799    xor                 r5d, r5d
    800    cmp                eobd, 13
    801    setge               r5b
    802 %else
    803    mov                 r5d, 1
    804    cmp                eobd, 13
    805    sbb                 r5d, 0
    806 %endif
    807    shl                 r5d, 4
    808    lea                  r3, [cq+32*1+16]   ; T[1] slot read by .main2 via [r3]
    809 .loop_pass1:
    810    mova                 m0, [o(pd_2048)]
    811    mova                 m3, [o(pd_2896)]
; rect2 pre-scale: x = (x * 2896 + 2048) >> 12
    812    pmulld               m5, m3, [cq+32*0+r5]
    813    pmulld               m2, m3, [cq+32*1+r5]
    814    pmulld               m1, m3, [cq+32*2+r5]
    815    pmulld               m3, [cq+32*3+r5]
    816    REPX      {paddd x, m0}, m5, m2, m1, m3
    817    REPX      {psrad x, 12}, m5, m2, m1, m3
    818    mova               [r3], m2          ; stash T[1] where .main2 expects it
    819    call m(iadst_4x4_internal_16bpc).main2
    820    packssdw             m0, m2            ; out0 out1
    821    packssdw             m1, m4            ; out2 out3
    822    test                r5d, r5d
    823    jz .end_pass1
    824    mova       [cq+32*2+16], m0
    825    mova       [cq+32*3+16], m1
    826    xor                 r5d, r5d
    827    jmp .loop_pass1
    828 .end_pass1:
    829    ret
    830 .pass2:
    831    shufps               m0, m0, q1032
    832    shufps               m1, m1, q1032
    833 %if ARCH_X86_32
    834    lea                  r5, [o(itx8_start)]  ; base ptr expected by the 8bpc code
    835 %endif
    836    call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
; mixed-sign rounding constant, then reuse the idct_4x8 store tail
    837    mova                 m4, [o(pw_4x2048_4xm2048)]
    838    jmp m(idct_4x8_internal_16bpc).end
    839 
    839 
    840 INV_TXFM_4X8_FN flipadst, dct
    841 INV_TXFM_4X8_FN flipadst, adst
    842 INV_TXFM_4X8_FN flipadst, flipadst
    843 INV_TXFM_4X8_FN flipadst, identity, 9
    844 
        ; 4x8 inverse flip-ADST (column pass), 16 bpc. Shares the ADST
        ; pass-1 worker; only the transpose order differs (rows reversed),
        ; and pass 2 reverses/negates the output ordering.
    845 cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    846    call m(iadst_4x8_internal_16bpc).pass1_main
        ; transpose with reversed row order relative to the plain ADST
    847    punpcklwd            m2, m1, m0
    848    punpckhwd            m1, m0
    849    punpcklwd            m0, m1, m2
    850    punpckhwd            m1, m2
        ; second half from the scratch area written by .pass1_main,
        ; loaded in swapped register order for the flipped transpose
    851    mova                 m6, [cq+32*2+16]
    852    mova                 m2, [cq+32*3+16]
    853    punpcklwd            m4, m2, m6
    854    punpckhwd            m2, m6
    855    punpckhwd            m3, m2, m4
    856    punpcklwd            m2, m4
    857    ; m0-3 = packed & transposed output
    858    jmp                tx2q
    859 .pass2:
        ; swap qword halves so row order matches the 8bpc kernel's layout
    860    shufps               m0, m0, q1032
    861    shufps               m1, m1, q1032
    862 %if ARCH_X86_32
        ; x86-32 needs the 8bpc code's base pointer in r5
    863    lea                  r5, [o(itx8_start)]
    864 %endif
    865    call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
        ; reverse register order and swap qword halves to flip the output
    866    mova                 m4, m0
    867    mova                 m5, m1
    868    pshufd               m0, m3, q1032
    869    pshufd               m1, m2, q1032
    870    pshufd               m2, m5, q1032
    871    pshufd               m3, m4, q1032
        ; mirrored +/-2048 rounding multiplier (opposite sign pattern to
        ; the ADST path)
    872    mova                 m4, [o(pw_4xm2048_4x2048)]
    873    jmp m(idct_4x8_internal_16bpc).end
    874 
    875 INV_TXFM_4X8_FN identity, dct
    876 INV_TXFM_4X8_FN identity, adst
    877 INV_TXFM_4X8_FN identity, flipadst
    878 INV_TXFM_4X8_FN identity, identity, 3
    879 
        ; 4x8 inverse identity transform (column pass), 16 bpc.
        ; Pass 1: rect2 scale (x*2896>>12) then identity4 scale
        ; (x*5793>>12, 5793/4096 ~= sqrt(2)), run on one or both 4x4
        ; coefficient columns depending on eob.
    880 cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    881 %undef cmp
    882    mova                 m5, [o(pd_2048)]
    883    mova                 m4, [o(pd_2896)]
    884    mova                 m6, [o(pd_5793)]
    885    ; clear m7 in case we skip the bottom square
    886    pxor                 m7, m7
    887 %if ARCH_X86_64
        ; r5d = (eob >= 16) ? 1 : 0
    888    xor                 r5d, r5d
    889    cmp                eobd, 16
    890    setge               r5b
    891 %else
        ; same predicate without setcc: r5d = 1 - (eob < 16)
    892    mov                 r5d, 1
    893    cmp                eobd, 16
    894    sbb                 r5d, 0
    895 %endif
        ; 0/16 byte offset selecting which coefficient column to process
    896    shl                 r5d, 4
    897 .loop_pass1:
        ; rect2 scaling: x = (x * 2896 + 2048) >> 12
    898    pmulld               m0, m4, [cq+32*0+r5]
    899    pmulld               m1, m4, [cq+32*1+r5]
    900    pmulld               m2, m4, [cq+32*2+r5]
    901    pmulld               m3, m4, [cq+32*3+r5]
    902    REPX      {paddd x, m5}, m0, m1, m2, m3
    903    REPX      {psrad x, 12}, m0, m1, m2, m3
        ; identity4 scaling: x = (x * 5793 + 2048) >> 12
    904    REPX     {pmulld x, m6}, m0, m1, m2, m3
    905    REPX      {paddd x, m5}, m0, m1, m2, m3
    906    REPX      {psrad x, 12}, m0, m1, m2, m3
    907    packssdw             m0, m1
    908    packssdw             m2, m3
    909    test                r5d, r5d
    910    jz .end_pass1
        ; stash second-column results (m0 to scratch, m2 in m7) and loop
        ; again for column 0
    911    mova       [cq+32*0+16], m0
    912    mova                 m7, m2
    913    xor                 r5d, r5d
    914    jmp .loop_pass1
    915 .end_pass1:
        ; 4x4 word transpose of first half into m0/m1
    916    punpckhwd            m4, m0, m2
    917    punpcklwd            m0, m2
    918    punpckhwd            m1, m0, m4
    919    punpcklwd            m0, m4
        ; transpose second half (scratch + m7, zero if skipped) into m2/m3
    920    mova                 m2, [cq+32*0+16]
    921    punpckhwd            m4, m2, m7
    922    punpcklwd            m2, m7
    923    punpckhwd            m3, m2, m4
    924    punpcklwd            m2, m4
    925    ; m0-3 = packed & transposed output
    926    jmp                tx2q
    927 .pass2:
        ; identity8 row pass reduces to a plain *8 scaling, folded into
        ; the rounding multiplier (4096/32768 via pmulhrsw in .end)
    928    mova                 m4, [o(pw_4096)]
    929    jmp m(idct_4x8_internal_16bpc).end
    930 
        ; Dispatch-function generator for 4x16 transforms. Args: type1,
        ; type2, and an optional eob-threshold-table suffix (default 2d).
        ; For dct_dct it also emits the inline DC-only fast path:
        ; dc = (dc * 181 + 384) >> 9 (181/256 ~= 1/sqrt(2) with an extra
        ; >>1 folded in), then tail-calls the shared 4-column dconly
        ; writeout with r3d = 16 rows.
    931 %macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
    932    INV_TXFM_FN          %1, %2, tbl_4x16_%3, 4x16
    933 %ifidn %1_%2, dct_dct
    934    imul                r5d, [cq], 181
    935    mov                [cq], eobd ; 0
    936    mov                 r3d, 16
    937    add                 r5d, 384
    938    sar                 r5d, 9
    939    jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
    940 %endif
    941 %endmacro
    942 
    943 INV_TXFM_4X16_FN dct, dct
    944 INV_TXFM_4X16_FN dct, identity, v
    945 INV_TXFM_4X16_FN dct, adst
    946 INV_TXFM_4X16_FN dct, flipadst
    947 
        ; 4x16 inverse DCT (column pass), 16 bpc. Pass 1 runs the 4-pt DCT
        ; on only the 4x4 coefficient sub-blocks that eob says are nonzero,
        ; from the last live sub-block down to the first. Pass 2 reuses the
        ; 8bpc SSSE3 16x4 row kernel; .end is shared by the other 4x16
        ; transform types for rounding, clamping and writeout.
    948 cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    949 %undef cmp
    950 %if ARCH_X86_32
        ; save pic-ptr (r6) to a stack slot; r6 is reused as loop counter
    951    mov                 r5m, r6d
    952 %endif
        ; scan the per-sub-block eob threshold table (r5, set up by
        ; INV_TXFM_FN) backwards to find the last sub-block to process
    953    mov                 r6d, 4
    954 .zero_loop:
    955    dec                 r6d
    956    cmp                eobb, byte [r5+r6]
    957    jl .zero_loop
        ; r5d = 16 * (index of last live 4x4 sub-block)
    958    mov                 r5d, r6d
    959    shl                 r5d, 4
    960 %if ARCH_X86_32
    961    ; restore pic-ptr
    962    mov                  r6, r5m
    963 %endif
    964    mova                 m5, [o(pd_2048)]
    965 .loop_pass1:
    966    mova                 m0, [cq+64*0+r5]
    967    mova                 m1, [cq+64*1+r5]
    968    mova                 m2, [cq+64*2+r5]
    969    mova                 m3, [cq+64*3+r5]
        ; shared 4-pt DCT kernel; results land in m0/m1/m4/m2
    970    call m(idct_4x4_internal_16bpc).pass1_main
        ; rounded >>1: pcmpeqd gives -1, psubd x,-1 adds 1, then shift
    971    pcmpeqd              m3, m3
    972    REPX      {psubd x, m3}, m0, m1, m4, m2
    973    REPX       {psrad x, 1}, m0, m1, m4, m2
    974    packssdw             m0, m1     ; out0 out1
    975    packssdw             m4, m2     ; out2 out3
        ; 4x4 word transpose into m0/m1
    976    punpckhwd            m2, m0, m4
    977    punpcklwd            m0, m4
    978    punpckhwd            m1, m0, m2
    979    punpcklwd            m0, m2
    980    test                r5d, r5d
    981    jz .end_pass1
        ; store this sub-block's result in-place and step to the previous
    982    mova       [cq+64*0+r5], m0
    983    mova       [cq+64*1+r5], m1
    984    sub                 r5d, 16
    985    jmp .loop_pass1
        ; reload the stored sub-block results; also reached from the other
        ; 4x16 pass-1 implementations via jz
    986 .end_pass1:
    987    mova                 m2, [cq+64*0+16]
    988    mova                 m3, [cq+64*1+16]
    989    mova                 m4, [cq+64*0+32]
    990    mova                 m5, [cq+64*1+32]
    991    mova                 m6, [cq+64*0+48]
    992    mova                 m7, [cq+64*1+48]
    993    ; m0-7 = packed & transposed output
    994    jmp                tx2q
    995 .pass2:
    996 %if ARCH_X86_32
        ; x86-32 needs the 8bpc code's base pointer in r5
    997    lea                  r5, [o(itx8_start)]
    998 %endif
    999    call m_suffix(idct_16x4_internal_8bpc, _ssse3).main
   1000    ; m0-6 is out0-13 [with odd registers having inversed output]
   1001    ; [coeffq+16*7] has out15/14
        ; round by 2048/32768 (i.e. >>4 with rounding) and fix row order
   1002    mova                 m7, [o(pw_2048)]
   1003    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
   1004    pmulhrsw             m7, [cq+16*7]
   1005    REPX {shufps x, x, q1032}, m1, m3, m5, m7
        ; out8-15 parked in cq for the second writeout iteration
   1006    mova          [cq+16*0], m4
   1007    mova          [cq+16*1], m5
   1008    mova          [cq+16*2], m6
   1009    mova          [cq+16*3], m7
        ; shared writeout: adds m0-m3 to dst, clamps to [0, pixel_max],
        ; runs twice (8 rows per iteration), and clears cq
   1010 .end:
   1011    pxor                 m4, m4
   1012    REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1013    mova                 m7, [o(pixel_10bpc_max)]
   1014    mov                 r5d, 2
   1015    lea                  r3, [strideq*3]
   1016 .loop:
   1017    movq                 m5, [dstq+strideq*0]
   1018    movq                 m6, [dstq+strideq*2]
   1019    movhps               m5, [dstq+strideq*1]
   1020    movhps               m6, [dstq+r3]
   1021    lea                  r4, [dstq+strideq*4]
   1022    paddw                m0, m5
   1023    paddw                m1, m6
   1024    movq                 m5, [r4+strideq*0]
   1025    movq                 m6, [r4+strideq*2]
   1026    movhps               m5, [r4+strideq*1]
   1027    movhps               m6, [r4+r3]
   1028    paddw                m2, m5
   1029    paddw                m3, m6
        ; clamp to the valid pixel range (m4 = 0)
   1030    REPX     {pminsw x, m7}, m0, m1, m2, m3
   1031    REPX     {pmaxsw x, m4}, m0, m1, m2, m3
   1032    movq   [dstq+strideq*0], m0
   1033    movhps [dstq+strideq*1], m0
   1034    movq   [dstq+strideq*2], m1
   1035    movhps [dstq+r3       ], m1
   1036    movq   [r4  +strideq*0], m2
   1037    movhps [r4  +strideq*1], m2
   1038    movq   [r4  +strideq*2], m3
   1039    movhps [r4  +r3       ], m3
   1040    dec                 r5d
   1041    jz .end2
        ; second iteration: rows 8-15, parked earlier in cq+0..3*16
   1042    lea                dstq, [dstq+strideq*8]
   1043    mova                 m0, [cq+0*16]
   1044    mova                 m1, [cq+1*16]
   1045    mova                 m2, [cq+2*16]
   1046    mova                 m3, [cq+3*16]
   1047    REPX {mova [cq+x*16], m4}, 0, 1, 2, 3
   1048    jmp .loop
   1049 .end2:
   1050    RET
   1051 
   1052 INV_TXFM_4X16_FN adst, dct
   1053 INV_TXFM_4X16_FN adst, adst
   1054 INV_TXFM_4X16_FN adst, flipadst
   1055 INV_TXFM_4X16_FN adst, identity, v
   1056 
        ; 4x16 inverse ADST (column pass), 16 bpc. Same eob-driven
        ; sub-block loop as idct_4x16; shares its .end_pass1 reload and
        ; the common .end writeout.
   1057 cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   1058 %undef cmp
   1059 %if ARCH_X86_32
        ; save pic-ptr (r6); r6 is reused as loop counter
   1060    mov                 r5m, r6d
   1061 %endif
        ; find last live 4x4 sub-block via the eob threshold table in r5
   1062    mov                 r6d, 4
   1063 .zero_loop:
   1064    dec                 r6d
   1065    cmp                eobb, byte [r6+r5]
   1066    jl .zero_loop
   1067    mov                 r5d, r6d
   1068    shl                 r5d, 4
   1069 %if ARCH_X86_32
   1070    ; restore pic-ptr
   1071    mov                  r6, r5m
   1072 %endif
   1073 .loop_pass1:
        ; 4-pt ADST kernel inputs: m5, [r3], m1, m3
   1074    mova                 m5, [cq+64*0+r5]
   1075    lea                  r3, [cq+64*1+r5]
   1076    mova                 m1, [cq+64*2+r5]
   1077    mova                 m3, [cq+64*3+r5]
   1078    call m(iadst_4x4_internal_16bpc).main2
        ; rounded >>1: add 1 (psubd of all-ones) then arithmetic shift
   1079    pcmpeqd              m3, m3
   1080    REPX      {psubd x, m3}, m0, m2, m1, m4
   1081    REPX       {psrad x, 1}, m0, m2, m1, m4
   1082    packssdw             m0, m2            ; out0 out1
   1083    packssdw             m1, m4            ; out2 out3
        ; 4x4 word transpose into m0/m1
   1084    punpckhwd            m2, m0, m1
   1085    punpcklwd            m0, m1
   1086    punpckhwd            m1, m0, m2
   1087    punpcklwd            m0, m2
   1088    test                r5d, r5d
        ; done: reuse the idct 4x16 reload of all stored sub-blocks
   1089    jz m(idct_4x16_internal_16bpc).end_pass1
   1090    mova       [cq+64*0+r5], m0
   1091    mova       [cq+64*1+r5], m1
   1092    sub                 r5d, 16
   1093    jmp .loop_pass1
   1094 .pass2:
   1095 %if ARCH_X86_32
        ; x86-32 needs the 8bpc code's base pointer in r5
   1096    lea                  r5, [o(itx8_start)]
   1097 %endif
   1098    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
   1099    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
   1100    ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8
   1101    ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13
        ; round with +/-2048 multipliers; the sign pattern also undoes
        ; the negated outputs noted above
   1102    mova                 m1, [o(pw_4x2048_4xm2048)]
   1103    REPX   {pmulhrsw x, m1}, m7, m2, m0
   1104    pshufd               m6, m1, q1032  ; 4x-2048,4x2048
   1105    pmulhrsw             m1, [cq+16*7]
   1106    REPX   {pmulhrsw x, m6}, m5, m4, m3
   1107    pmulhrsw             m6, [cq+16*6]
   1108    ; m7/5/2/4 = out4/11,5/10,6/9,7/8
   1109    ; m0/3/6/1 = out15/0,3/12,1/14,2/13
   1110    ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15
        ; high qwords (out8-15) go to cq, low qwords are repacked in m0-3
   1111    movhps         [cq+0*8], m4
   1112    movhps         [cq+1*8], m2
   1113    movhps         [cq+2*8], m5
   1114    movhps         [cq+3*8], m7
   1115    movhps         [cq+4*8], m3
   1116    movhps         [cq+5*8], m1
   1117    movhps         [cq+6*8], m6
   1118    movhps         [cq+7*8], m0
   1119    punpcklqdq           m0, m6
   1120    punpcklqdq           m1, m3
   1121    punpcklqdq           m3, m2, m4
   1122    punpcklqdq           m2, m7, m5
   1123    jmp m(idct_4x16_internal_16bpc).end
   1124 
   1125 INV_TXFM_4X16_FN flipadst, dct
   1126 INV_TXFM_4X16_FN flipadst, adst
   1127 INV_TXFM_4X16_FN flipadst, flipadst
   1128 INV_TXFM_4X16_FN flipadst, identity, v
   1129 
        ; 4x16 inverse flip-ADST (column pass), 16 bpc. Identical pass-1
        ; structure to iadst_4x16 but transposes in reversed row order;
        ; pass 2 mirrors the output via movq/punpckhqdq instead of
        ; movhps/punpcklqdq.
   1130 cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   1131 %undef cmp
   1132 %if ARCH_X86_32
        ; save pic-ptr (r6); r6 is reused as loop counter
   1133    mov                 r5m, r6d
   1134 %endif
        ; find last live 4x4 sub-block via the eob threshold table in r5
   1135    mov                 r6d, 4
   1136 .zero_loop:
   1137    dec                 r6d
   1138    cmp                eobb, byte [r5+r6]
   1139    jl .zero_loop
   1140    mov                 r5d, r6d
   1141    shl                 r5d, 4
   1142 %if ARCH_X86_32
   1143    ; restore pic-ptr
   1144    mov                  r6, r5m
   1145 %endif
   1146 .loop_pass1:
        ; 4-pt ADST kernel inputs: m5, [r3], m1, m3
   1147    mova                 m5, [cq+64*0+r5]
   1148    lea                  r3, [cq+64*1+r5]
   1149    mova                 m1, [cq+64*2+r5]
   1150    mova                 m3, [cq+64*3+r5]
   1151    call m(iadst_4x4_internal_16bpc).main2
        ; rounded >>1: add 1 (psubd of all-ones) then arithmetic shift
   1152    pcmpeqd              m3, m3
   1153    REPX      {psubd x, m3}, m0, m2, m1, m4
   1154    REPX       {psrad x, 1}, m0, m2, m1, m4
   1155    packssdw             m0, m2            ; out3 out2
   1156    packssdw             m1, m4            ; out1 out0
        ; transpose with reversed row order (flip)
   1157    punpcklwd            m2, m1, m0
   1158    punpckhwd            m1, m0
   1159    punpcklwd            m0, m1, m2
   1160    punpckhwd            m1, m2
   1161    test                r5d, r5d
        ; done: reuse the idct 4x16 reload of all stored sub-blocks
   1162    jz m(idct_4x16_internal_16bpc).end_pass1
   1163    mova       [cq+64*0+r5], m0
   1164    mova       [cq+64*1+r5], m1
   1165    sub                 r5d, 16
   1166    jmp .loop_pass1
   1167 .pass2:
   1168 %if ARCH_X86_32
        ; x86-32 needs the 8bpc code's base pointer in r5
   1169    lea                  r5, [o(itx8_start)]
   1170 %endif
   1171    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
   1172    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
   1173    ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7
   1174    ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2
        ; round with +/-2048 multipliers; sign pattern undoes negations
   1175    mova                 m1, [o(pw_4x2048_4xm2048)]
   1176    REPX   {pmulhrsw x, m1}, m7, m2, m0
   1177    pshufd               m6, m1, q1032  ; 4x-2048,4x2048
   1178    pmulhrsw             m1, [cq+16*7]
   1179    REPX   {pmulhrsw x, m6}, m5, m4, m3
   1180    pmulhrsw             m6, [cq+16*6]
   1181    ; m7/5/2/4 = out11/4,10/5,9/6,8/7
   1182    ; m0/3/6/1 = out15/0,12/3,14/1,13/2
   1183    ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15
        ; low qwords (out8-15) go to cq, high qwords repacked in m0-3 —
        ; the opposite halves from the non-flipped ADST, giving the mirror
   1184    movq           [cq+0*8], m4
   1185    movq           [cq+1*8], m2
   1186    movq           [cq+2*8], m5
   1187    movq           [cq+3*8], m7
   1188    movq           [cq+4*8], m3
   1189    movq           [cq+5*8], m1
   1190    movq           [cq+6*8], m6
   1191    movq           [cq+7*8], m0
   1192    punpckhqdq           m0, m6
   1193    punpckhqdq           m1, m3
   1194    punpckhqdq           m3, m2, m4
   1195    punpckhqdq           m2, m7, m5
   1196    jmp m(idct_4x16_internal_16bpc).end
   1197 
   1198 INV_TXFM_4X16_FN identity, dct, h
   1199 INV_TXFM_4X16_FN identity, adst, h
   1200 INV_TXFM_4X16_FN identity, flipadst, h
   1201 INV_TXFM_4X16_FN identity, identity
   1202 
        ; 4x16 inverse identity transform, 16 bpc. Pass 1: identity4
        ; scaling x = (x * 5793 + 6144) >> 13 (extra >>1 folded into the
        ; rounding constant). Pass 2: identity16 scaling computed as
        ; 2*x + (x * 1697*16) via pmulhrsw, then pw_2048 rounding.
   1203 cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   1204 %undef cmp
   1205 %if ARCH_X86_32
        ; save pic-ptr (r6); r6 is reused as loop counter
   1206    mov                 r5m, r6d
   1207 %endif
        ; find last live 4x4 sub-block via the eob threshold table in r5
   1208    mov                 r6d, 4
   1209 .zero_loop:
   1210    dec                 r6d
   1211    cmp                eobb, byte [r5+r6]
   1212    jl .zero_loop
   1213    mov                 r5d, r6d
   1214    shl                 r5d, 4
   1215 %if ARCH_X86_32
   1216    ; restore pic-ptr
   1217    mov                  r6, r5m
   1218 %endif
   1219    mova                 m5, [o(pd_6144)]
   1220    mova                 m4, [o(pd_5793)]
   1221 .loop_pass1:
   1222    pmulld               m0, m4, [cq+64*0+r5]
   1223    pmulld               m1, m4, [cq+64*1+r5]
   1224    pmulld               m2, m4, [cq+64*2+r5]
   1225    pmulld               m3, m4, [cq+64*3+r5]
   1226    REPX      {paddd x, m5}, m0, m1, m2, m3
   1227    REPX      {psrad x, 13}, m0, m1, m2, m3
   1228    packssdw             m0, m1
   1229    packssdw             m2, m3
        ; 4x4 word transpose into m0/m1
   1230    punpckhwd            m3, m0, m2
   1231    punpcklwd            m0, m2
   1232    punpckhwd            m1, m0, m3
   1233    punpcklwd            m0, m3
   1234    test                r5d, r5d
        ; done: reuse the idct 4x16 reload of all stored sub-blocks
   1235    jz m(idct_4x16_internal_16bpc).end_pass1
   1236    mova       [cq+64*0+r5], m0
   1237    mova       [cq+64*1+r5], m1
   1238    sub                 r5d, 16
   1239    jmp .loop_pass1
   1240 .pass2:
        ; spill m0-m2/m7 so all 8 row vectors can be scaled with only
        ; 8 xmm registers available
   1241    mova          [cq+16*4], m0
   1242    mova          [cq+16*5], m1
   1243    mova          [cq+16*6], m2
   1244    mova          [cq+16*7], m7
   1245    mova                 m0, [o(pw_1697x16)]
   1246    mova                 m7, [o(pw_2048)]
        ; rows 8/9 (m4/m5): x = 2*x + mulhrs(x, 1697*16), then round;
        ; results for out8-15 are parked in cq+0..3*16 for .end's 2nd pass
   1247    pmulhrsw             m1, m0, m4
   1248    pmulhrsw             m2, m0, m5
   1249    REPX      {paddsw x, x}, m4, m5
   1250    paddsw               m4, m1
   1251    paddsw               m5, m2
   1252    REPX   {pmulhrsw x, m7}, m4, m5
   1253    mova          [cq+16*0], m4
   1254    mova          [cq+16*1], m5
        ; rows 10/11 (m6 and spilled m7)
   1255    mova                 m4, [cq+16*7]
   1256    pmulhrsw             m1, m0, m6
   1257    pmulhrsw             m2, m0, m4
   1258    REPX      {paddsw x, x}, m6, m4
   1259    paddsw               m6, m1
   1260    paddsw               m4, m2
   1261    REPX   {pmulhrsw x, m7}, m6, m4
   1262    mova          [cq+16*2], m6
   1263    mova          [cq+16*3], m4
        ; rows 0-7: reload spilled m0-m2 and scale m0-m3 in place
   1264    mova                 m4, [cq+16*4]
   1265    mova                 m1, [cq+16*5]
   1266    mova                 m2, [cq+16*6]
   1267    pmulhrsw             m5, m0, m2
   1268    pmulhrsw             m6, m0, m3
   1269    REPX      {paddsw x, x}, m2, m3
   1270    paddsw               m2, m5
   1271    paddsw               m3, m6
   1272    pmulhrsw             m6, m0, m1
   1273    pmulhrsw             m0, m4
   1274    REPX      {paddsw x, x}, m1, m4
   1275    paddsw               m1, m6
   1276    paddsw               m0, m4
   1277    REPX   {pmulhrsw x, m7}, m2, m3, m1, m0
   1278    jmp m(idct_4x16_internal_16bpc).end
   1279 
        ; Dispatch-function generator for 8x4 transforms (type1, type2).
        ; x86-32 additionally reserves 4*16 bytes of stack scratch used by
        ; the pass-1 kernels. For dct_dct it emits an inline DC-only path:
        ; two (dc*181+128)>>8 steps (181/256 ~= 1/sqrt(2), applied twice:
        ; one rect2 scale plus one per-pass 1/sqrt(2)), then dc*2896+34816
        ; with pshuflw q1111 broadcasting the high word, i.e. the final
        ; rounded pixel offset, added/clamped across all 4 rows of 8.
   1280 %macro INV_TXFM_8X4_FN 2 ; type1, type2
   1281 %if ARCH_X86_64
   1282    INV_TXFM_FN          %1, %2, 0, 8x4, 15
   1283 %else
   1284    INV_TXFM_FN          %1, %2, 0, 8x4, 8, 0-4*16
   1285 %endif
   1286 %ifidn %1_%2, dct_dct
   1287    imul                r5d, [cq], 181
   1288    mov                [cq], eobd ; 0
   1289    add                 r5d, 128
   1290    sar                 r5d, 8
   1291    imul                r5d, 181
   1292    add                 r5d, 128
   1293    sar                 r5d, 8
   1294    imul                r5d, 2896
   1295    add                 r5d, 34816
   1296    movd                 m0, r5d
   1297    pshuflw              m0, m0, q1111
   1298    punpcklqdq           m0, m0
   1299    mova                 m6, [o(pixel_10bpc_max)]
   1300    pxor                 m5, m5
   1301    lea                  r2, [strideq*3]
   1302    mova                 m1, [dstq+strideq*0]
   1303    mova                 m2, [dstq+strideq*1]
   1304    mova                 m3, [dstq+strideq*2]
   1305    mova                 m4, [dstq+r2]
        ; add DC offset, clamp to [0, pixel_max], store 4 rows
   1306    REPX      {paddw x, m0}, m1, m2, m3, m4
   1307    REPX     {pmaxsw x, m5}, m1, m2, m3, m4
   1308    REPX     {pminsw x, m6}, m1, m2, m3, m4
   1309    mova   [dstq+strideq*0], m1
   1310    mova   [dstq+strideq*1], m2
   1311    mova   [dstq+strideq*2], m3
   1312    mova   [dstq+r2       ], m4
   1313    RET
   1314 %endif
   1315 %endmacro
   1316 
   1317 INV_TXFM_8X4_FN dct, dct
   1318 INV_TXFM_8X4_FN dct, identity
   1319 INV_TXFM_8X4_FN dct, adst
   1320 INV_TXFM_8X4_FN dct, flipadst
   1321 
        ; 8x4 inverse DCT, 16 bpc. .pass1_entry is shared by the other 8x4
        ; transform types (they load their own kernel address into r5).
        ; On x86-64 m11-m14 hold loop-invariant constants; on x86-32 r3
        ; points at 4*16 bytes of stack scratch (reserved by the wrapper)
        ; and constants are reloaded as needed.
   1322 cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   1323    lea                  r5, [o(.main)]
   1324 .pass1_entry:
   1325 %if ARCH_X86_32
   1326    lea                  r3, [rsp+gprsize]
   1327 %else
   1328    mova                m11, [o(pd_2048)]
   1329    mova                m12, [o(clip_18b_min)]
   1330    mova                m13, [o(clip_18b_max)]
   1331    mova                m14, [o(pd_2896)]
   1332 %endif
   1333    mova                 m0, [cq+0*16]
   1334    mova                 m1, [cq+1*16]
   1335    mova                 m2, [cq+2*16]
   1336    mova                 m3, [cq+3*16]
   1337    mova                 m4, [cq+4*16]
   1338    mova                 m5, [cq+5*16]
   1339    mova                 m6, [cq+6*16]
   1340    mova                 m7, [cq+7*16]
   1341    call .rect2_mul
   1342    call                 r5
   1343    call .transpose4x8packed
   1344    ; m0-3 = packed & transposed output
   1345    jmp                tx2q
        ; transpose 8 packed rows of 4 words into 4 registers of 8 words
   1346 .transpose4x8packed:
   1347    ; transpose
   1348    punpcklwd            m1, m2, m6
   1349    punpckhwd            m2, m6
   1350    punpckhwd            m6, m0, m4
   1351    punpcklwd            m0, m4
   1352 
   1353    punpckhwd            m3, m0, m1
   1354    punpcklwd            m0, m1
   1355    punpckhwd            m4, m6, m2
   1356    punpcklwd            m6, m2
   1357 
   1358    punpcklwd            m2, m3, m4
   1359    punpckhwd            m3, m4
   1360    punpckhwd            m1, m0, m6
   1361    punpcklwd            m0, m6
   1362    ret
        ; full 8-pt DCT column pass + pack to words
   1363 .main:
   1364    call .main_pass1
   1365    call .round
   1366    packssdw             m0, m1
   1367    packssdw             m2, m3
   1368    packssdw             m4, m5
   1369    packssdw             m6, m7
   1370    ret
        ; rectangular-block prescale: x = (x * 2896 + 2048) >> 12 on m0-m7
        ; (2896/4096 ~= 1/sqrt(2)); x86-32 spills m7 to make room for the
        ; constants
   1371 .rect2_mul:
   1372 %if ARCH_X86_64
   1373    REPX    {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
   1374    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   1375 %else
   1376    mova               [r3], m7
   1377    mova                 m7, [o(pd_2896)]
   1378    REPX     {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
   1379    pmulld               m7, [r3]
   1380    mova               [r3], m7
   1381    mova                 m7, [o(pd_2048)]
   1382    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
   1383    paddd                m7, [r3]
   1384 %endif
   1385    REPX      {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
   1386    ret
   1387 %if ARCH_X86_64
        ; fast path: odd coefficients known zero, so each rotation
        ; degenerates to two plain multiplies
   1388 .main_pass1_fast:
   1389    pmulld               m5, m3, [o(pd_m2276)]
   1390    pmulld               m3, [o(pd_3406)]
   1391    pmulld               m7, m1, [o(pd_4017)]
   1392    pmulld               m1, [o(pd_799)]
   1393    pmulld               m6, m2, [o(pd_3784)]
   1394    pmulld               m2, [o(pd_1567)]
   1395    pmulld               m0, m14
   1396    pxor                 m4, m4
   1397    jmp .main_pass1_fast2
        ; 8-pt DCT butterflies; intermediates clamped to the 18-bit range
        ; (clip_18b_min/max in m12/m13) between stages
   1398 .main_pass1:
   1399    ITX_MULSUB_2D         5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
   1400    ITX_MULSUB_2D         1, 7, 8, 9, 10, _,  799, 4017 ; t4a t7a
   1401    ITX_MULSUB_2D         2, 6, 8, 9, 10, _, 1567, 3784 ; t2  t3
   1402    REPX    {pmulld x, m14}, m0, m4
   1403 .main_pass1_fast2:
   1404    REPX     {paddd x, m11}, m1, m2, m3, m5, m6, m7
   1405    REPX     {psrad x, 12 }, m1, m2, m3, m5, m6, m7
   1406    paddd                m8, m1, m5 ; t4
   1407    psubd                m1, m5     ; t5a
   1408    paddd                m9, m7, m3 ; t7
   1409    psubd                m7, m3     ; t6a
   1410    REPX    {pmaxsd x, m12}, m1, m8, m7, m9
   1411    REPX    {pminsd x, m13}, m1, m8, m7, m9
        ; t6 = (t6a - t5a) * 2896 >> 12, t5 = (t6a + t5a) * 2896 >> 12
   1412    REPX    {pmulld x, m14}, m7, m1
   1413    paddd                m0, m11
   1414    paddd                m7, m11
   1415    psubd                m5, m0, m4
   1416    paddd                m0, m4
   1417    psubd                m4, m7, m1
   1418    paddd                m7, m1
   1419    REPX    {psrad  x, 12 }, m5, m0, m4, m7
   1420    psubd                m3, m0, m6 ; dct4 out3
   1421    paddd                m0, m6     ; dct4 out0
   1422    paddd                m6, m5, m2 ; dct4 out1
   1423    psubd                m5, m2     ; dct4 out2
   1424    REPX    {pmaxsd x, m12}, m0, m6, m5, m3
   1425    REPX    {pminsd x, m13}, m0, m6, m5, m3
   1426    ret
        ; final butterfly: combine dct4 outputs (m0/m6/m5/m3) with the odd
        ; half (m8/m9/m4/m7) into out0-7
   1427 .round:
   1428    paddd                m1, m6, m7 ; out1
   1429    psubd                m6, m7     ; out6
   1430    psubd                m7, m0, m9 ; out7
   1431    paddd                m0, m9     ; out0
   1432    paddd                m2, m5, m4 ; out2
   1433    psubd                m5, m4     ; out5
   1434    psubd                m4, m3, m8 ; out4
   1435    paddd                m3, m8     ; out3
   1436 %else
        ; x86-32 versions of the above: only 8 xmm registers, so constants
        ; are reloaded and intermediates spilled to the stack scratch (r3)
   1437 .main_pass1_fast:
   1438    pmulld               m5, m3, [o(pd_m2276)]
   1439    pmulld               m3, [o(pd_3406)]
   1440    pmulld               m7, m1, [o(pd_4017)]
   1441    pmulld               m1, [o(pd_799)]
   1442    pmulld               m6, m2, [o(pd_3784)]
   1443    pmulld               m2, [o(pd_1567)]
   1444    mova                 m4, [o(pd_2048)]
   1445    mova          [r3+0*16], m2
   1446    REPX      {paddd x, m4}, m5, m3, m7, m1
   1447    REPX      {psrad x, 12}, m5, m3, m7, m1
   1448    paddd                m2, m1, m5 ; t4
   1449    psubd                m1, m5     ; t5a
   1450    pmulld               m5, m0, [o(pd_2896)]
   1451    mova                 m0, m4
   1452    paddd                m4, m7, m3 ; t7
   1453    psubd                m7, m3     ; t6a
   1454    mova                 m3, [o(clip_18b_min)]
   1455    REPX    {pmaxsd x, m3 }, m1, m2, m7, m4
   1456    mova                 m3, [o(clip_18b_max)]
   1457    REPX    {pminsd x, m3 }, m1, m2, m7, m4
   1458    mova          [r3+3*16], m2
   1459    mova          [r3+1*16], m4
   1460    pxor                 m4, m4
   1461    mova                 m2, [r3+0*16]
   1462    mova                 m3, [o(pd_2896)]
   1463    jmp .main_pass1_fast2
   1464 .main_pass1:
   1465    mova          [r3+0*16], m0
   1466    mova          [r3+1*16], m2
   1467    mova          [r3+2*16], m4
   1468    mova          [r3+3*16], m6
   1469    mova                 m0, [o(pd_2048)]
   1470    ITX_MULSUB_2D         5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a
   1471    ITX_MULSUB_2D         1, 7, 2, 4, 6, 0,  799, 4017 ; t4a t7a
   1472    paddd                m2, m1, m5 ; t4
   1473    psubd                m1, m5     ; t5a
   1474    paddd                m4, m7, m3 ; t7
   1475    psubd                m7, m3     ; t6a
   1476    mova                 m6, [o(clip_18b_min)]
   1477    REPX    {pmaxsd x, m6 }, m1, m2, m7, m4
   1478    mova                 m6, [o(clip_18b_max)]
   1479    REPX    {pminsd x, m6 }, m1, m2, m7, m4
   1480    mova                 m6, [r3+3*16]
   1481    mova          [r3+3*16], m2
   1482    mova                 m2, [r3+1*16]
   1483    mova          [r3+1*16], m4
   1484 
   1485    ITX_MULSUB_2D         2, 6, 4, 3, 5, _, 1567, 3784 ; t2  t3
   1486    mova                 m3, [o(pd_2896)]
   1487    mova                 m5, [r3+0*16]
   1488    mova                 m4, [r3+2*16]
   1489    REPX    {pmulld x, m3 }, m5, m4
   1490 .main_pass1_fast2:
   1491    REPX    {paddd  x, m0 }, m2, m6
   1492    REPX    {psrad  x, 12 }, m2, m6
   1493    REPX    {pmulld x, m3 }, m7, m1
   1494    paddd                m7, m0
   1495    paddd                m0, m5
   1496 
   1497    psubd                m5, m0, m4
   1498    paddd                m0, m4
   1499    psubd                m4, m7, m1
   1500    paddd                m7, m1
   1501    REPX    {psrad  x, 12 }, m5, m0, m4, m7
   1502    psubd                m3, m0, m6 ; dct4 out3
   1503    paddd                m0, m6     ; dct4 out0
   1504    paddd                m6, m5, m2 ; dct4 out1
   1505    psubd                m5, m2     ; dct4 out2
   1506 
   1507    mova                 m1, [o(clip_18b_min)]
   1508    REPX    {pmaxsd x, m1 }, m0, m6, m5, m3
   1509    mova                 m1, [o(clip_18b_max)]
   1510    REPX    {pminsd x, m1 }, m0, m6, m5, m3
   1511    ret
        ; final butterfly, x86-32: odd-half terms come from the stack
        ; scratch slots filled by .main_pass1/_fast
   1512 .round:
   1513    paddd                m1, m6, m7 ; out1
   1514    psubd                m6, m7     ; out6
   1515    mova          [r3+0*16], m6
   1516    mova                 m6, [r3+1*16]
   1517    psubd                m7, m0, m6 ; out7
   1518    paddd                m0, m6     ; out0
   1519    paddd                m2, m5, m4 ; out2
   1520    psubd                m5, m4     ; out5
   1521    mova                 m6, [r3+3*16]
   1522    psubd                m4, m3, m6 ; out4
   1523    paddd                m3, m6     ; out3
   1524    mova                 m6, [r3+0*16]
   1525 %endif
   1526    ret
   1527 
   1528 .pass2:
   1529 %if ARCH_X86_32
        ; x86-32 needs the 8bpc code's base pointer in r5
   1530    lea                  r5, [o(itx8_start)]
   1531 %endif
   1532    call m_suffix(idct_8x4_internal_8bpc, _ssse3).main
        ; shared tail for all 8x4 types: round, add to dst, clamp, clear cq
   1533 .end:
   1534    lea                  r3, [strideq*3]
   1535    call .round2_and_write_8x4
   1536    REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
   1537    RET
        ; writeout helpers with three entry points:
        ;   round2: load zero/pixel-max/pw_2048 constants, then round1
        ;   round1: pmulhrsw rounding of m0-m3 (m4 = multiplier)
        ;   write : add rows to dst, clamp to [m6, m5], store
   1538 .round2_and_write_8x4:
   1539    pxor                 m6, m6
   1540    mova                 m5, [o(pixel_10bpc_max)]
   1541    mova                 m4, [o(pw_2048)]
   1542 .round1_and_write_8x4:
   1543    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
   1544 .write_8x4:
   1545    paddw                m0, [dstq+strideq*0]
   1546    paddw                m1, [dstq+strideq*1]
   1547    paddw                m2, [dstq+strideq*2]
   1548    paddw                m3, [dstq+r3]
   1549    REPX     {pminsw x, m5}, m0, m1, m2, m3
   1550    REPX     {pmaxsw x, m6}, m0, m1, m2, m3
   1551    mova   [dstq+strideq*0], m0
   1552    mova   [dstq+strideq*1], m1
   1553    mova   [dstq+strideq*2], m2
   1554    mova   [dstq+r3       ], m3
   1555    ret
   1556 
        ; instantiate the 8x4 ADST dispatch wrappers for each row-transform
        ; pairing
   1557 INV_TXFM_8X4_FN adst, dct
   1558 INV_TXFM_8X4_FN adst, adst
   1559 INV_TXFM_8X4_FN adst, flipadst
   1560 INV_TXFM_8X4_FN adst, identity
   1561 
   1562 cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   1563    lea                  r5, [o(.main)]
   1564    jmp m(idct_8x4_internal_16bpc).pass1_entry
   1565 .main:
   1566    call .main_pass1
   1567    call .round
   1568    packssdw             m0, m1
   1569    packssdw             m2, m3
   1570    packssdw             m4, m5
   1571    packssdw             m6, m7
   1572    ret
   1573 .main_pass1:
   1574 %if ARCH_X86_64
   1575    ITX_MULSUB_2D         7, 0, 8, 9, 10, 11,  401, 4076 ; t1a, t0a
   1576    ITX_MULSUB_2D         1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a
   1577    ITX_MULSUB_2D         5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a
   1578    ITX_MULSUB_2D         3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a
   1579    psubd                m8, m2, m6 ; t6
   1580    paddd                m2, m6     ; t2
   1581    psubd                m6, m0, m4 ; t4
   1582    paddd                m0, m4     ; t0
   1583    psubd                m4, m5, m1 ; t7
   1584    paddd                m5, m1     ; t3
   1585    psubd                m1, m7, m3 ; t5
   1586    paddd                m7, m3     ; t1
   1587    REPX    {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7
   1588    REPX    {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7
   1589    ITX_MULSUB_2D         6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a
   1590    ITX_MULSUB_2D         4, 8, 3, 9, 10, 11, 3784, 10   ; t6a, t7a
   1591    psubd                m9, m6, m8 ;  t7
   1592    paddd                m6, m8     ;  out6
   1593    mova                 m8, [o(pd_2896)]
   1594    psubd                m3, m7, m5 ;  t3
   1595    paddd                m7, m5     ; -out7
   1596    psubd                m5, m0, m2 ;  t2
   1597    paddd                m0, m2     ;  out0
   1598    psubd                m2, m1, m4 ;  t6
   1599    paddd                m1, m4     ; -out1
   1600    REPX    {pmaxsd x, m12}, m5, m3, m2, m9
   1601    REPX    {pminsd x, m13}, m5, m3, m2, m9
   1602    REPX    {pmulld x, m14}, m5, m3, m2, m9
   1603    psubd               m4, m5, m3 ; (t2 - t3) * 2896
   1604    paddd               m3, m5     ; (t2 + t3) * 2896
   1605    psubd               m5, m2, m9 ; (t6 - t7) * 2896
   1606    paddd               m2, m9     ; (t6 + t7) * 2896
   1607    ret
   1608 .round:
   1609 
   1610    ; m0=out0,m1=-out1,m6=out6,m7=-out7
   1611 
   1612    pcmpeqd              m8, m8
   1613    REPX     {pxor  x, m8 }, m1, m7, m3, m5
   1614    REPX     {psubd x, m8 }, m1, m7
   1615    REPX     {paddd x, m11}, m2, m3, m4, m5
   1616    REPX     {psrad x, 12 }, m2, m3, m4, m5
   1617 %else
   1618    mova          [r3+0*16], m2
   1619    mova          [r3+1*16], m3
   1620    mova          [r3+2*16], m4
   1621    mova          [r3+3*16], m5
   1622    mova                 m5, [o(pd_2048)]
   1623 
   1624    ITX_MULSUB_2D         7, 0, 2, 3, 4, 5,  401, 4076 ; t1a, t0a
   1625    ITX_MULSUB_2D         1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a
   1626    mova                 m2, [r3+0*16]
   1627    mova                 m3, [r3+1*16]
   1628    mova                 m4, [r3+2*16]
   1629    mova          [r3+0*16], m0
   1630    mova          [r3+1*16], m1
   1631    mova          [r3+2*16], m6
   1632    mova                 m1, [r3+3*16]
   1633    mova          [r3+3*16], m7
   1634    ITX_MULSUB_2D         1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a
   1635    ITX_MULSUB_2D         3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a
   1636    mova                 m0, [r3+0*16]
   1637    mova                 m6, [r3+2*16]
   1638    psubd                m7, m2, m6 ; t6
   1639    paddd                m2, m6     ; t2
   1640    psubd                m6, m0, m4 ; t4
   1641    paddd                m0, m4     ; t0
   1642    mova          [r3+0*16], m7
   1643    mova                 m5, [r3+1*16]
   1644    mova                 m7, [r3+3*16]
   1645    psubd                m4, m1, m5 ; t7
   1646    paddd                m5, m1     ; t3
   1647    psubd                m1, m7, m3 ; t5
   1648    paddd                m7, m3     ; t1
   1649    mova                 m3, [o(clip_18b_min)]
   1650    REPX    {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7
   1651    mova          [r3+1*16], m7
   1652    mova                 m7, [o(clip_18b_max)]
   1653    pmaxsd               m3, [r3+0*16]
   1654    REPX    {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5
   1655    pminsd               m7, [r3+1*16]
   1656    mova          [r3+0*16], m0
   1657    mova          [r3+1*16], m2
   1658    mova          [r3+2*16], m5
   1659    mova          [r3+3*16], m7
   1660    mova                 m0, [o(pd_2048)]
   1661    ITX_MULSUB_2D         6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a
   1662    ITX_MULSUB_2D         4, 3, 2, 5, 7, 0, 3784, 7    ; t6a, t7a
   1663    mova                 m5, [r3+2*16]
   1664    mova                 m7, [r3+3*16]
   1665    psubd                m2, m6, m3 ;  t7
   1666    paddd                m6, m3     ;  out6
   1667    mova          [r3+3*16], m6
   1668    mova                 m0, [r3+0*16]
   1669    mova                 m6, [r3+1*16]
   1670    psubd                m3, m7, m5 ;  t3
   1671    paddd                m7, m5     ; -out7
   1672    psubd                m5, m0, m6 ;  t2
   1673    paddd                m0, m6     ;  out0
   1674    psubd                m6, m1, m4 ;  t6
   1675    paddd                m1, m4     ; -out1
   1676    mova                 m4, [o(clip_18b_min)]
   1677    REPX    {pmaxsd x, m4 }, m5, m3, m6, m2
   1678    mova                 m4, [o(clip_18b_max)]
   1679    REPX    {pminsd x, m4 }, m5, m3, m6, m2
   1680    mova                 m4, [o(pd_2896)]
   1681    REPX    {pmulld x, m4 }, m5, m3, m6, m2
   1682    psubd               m4, m5, m3 ; (t2 - t3) * 2896
   1683    paddd               m3, m5     ; (t2 + t3) * 2896
   1684    psubd               m5, m6, m2 ; (t6 - t7) * 2896
   1685    paddd               m2, m6     ; (t6 + t7) * 2896
   1686    ret
   1687 .round:
   1688    mova          [r3+2*16], m0
   1689 
   1690    pcmpeqd              m0, m0
   1691    mova                 m6, [o(pd_2048)]
   1692    REPX     {pxor  x, m0 }, m1, m7, m3, m5
   1693    REPX     {psubd x, m0 }, m1, m7
   1694    REPX     {paddd x, m6 }, m2, m3, m4, m5
   1695    REPX     {psrad x, 12 }, m2, m3, m4, m5
   1696 
   1697    mova                 m6, [r3+3*16]
   1698    mova                 m0, [r3+2*16]
   1699 %endif
   1700    ret
   1701 
   1702 .pass2:
   1703 %if ARCH_X86_32
   1704    lea                  r5, [o(itx8_start)]
   1705 %endif
   1706    call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
   1707    jmp m(idct_8x4_internal_16bpc).end
   1708 
; 8x4 inverse flip-ADST, 16 bpc: the ADST kernel with outputs reversed.
; Pass 1 reuses iadst_8x4 .main_pass1/.round, then packs in reversed
; register order; pass 2 runs the 8 bpc ADST rows and flips vertically
; by walking dst with a negated stride.
   1709 INV_TXFM_8X4_FN flipadst, dct
   1710 INV_TXFM_8X4_FN flipadst, adst
   1711 INV_TXFM_8X4_FN flipadst, flipadst
   1712 INV_TXFM_8X4_FN flipadst, identity
   1713 
   1714 cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   1715    lea                  r5, [o(.main)]
   1716    jmp m(idct_8x4_internal_16bpc).pass1_entry
   1717 .main:
   1718    call m(iadst_8x4_internal_16bpc).main_pass1
   1719    call m(iadst_8x4_internal_16bpc).round
; pack with swapped source order (7..0 instead of 0..7) = output flip
   1720    packssdw             m7, m6
   1721    packssdw             m5, m4
   1722    packssdw             m3, m2
   1723    packssdw             m1, m0
   1724    mova                 m0, m7
   1725    mova                 m2, m5
   1726    mova                 m4, m3
   1727    mova                 m6, m1
   1728    ret
   1729 .pass2:
   1730 %if ARCH_X86_32
   1731    lea                  r5, [o(itx8_start)]
   1732 %endif
   1733    call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
; point dst at the last row and negate stride so the shared store
; routine writes the rows bottom-up
   1734    lea                  r3, [strideq*3]
   1735    add                dstq, r3
   1736    neg             strideq
   1737    jmp m(idct_8x4_internal_16bpc).end
   1738 
; 8x4 inverse identity transform, 16 bpc.
; Pass 1 is a plain *2 (paddd x,x) plus pack to 16-bit; pass 2 applies
; the 4-point identity scale x + pmulhrsw(x, 1697*8) before the shared
; store tail.
   1739 INV_TXFM_8X4_FN identity, dct
   1740 INV_TXFM_8X4_FN identity, adst
   1741 INV_TXFM_8X4_FN identity, flipadst
   1742 INV_TXFM_8X4_FN identity, identity
   1743 
   1744 cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   1745    lea                  r5, [o(.main)]
   1746    jmp m(idct_8x4_internal_16bpc).pass1_entry
   1747 .main:
   1748    REPX       {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7
   1749    packssdw             m0, m1
   1750    packssdw             m2, m3
   1751    packssdw             m4, m5
   1752    packssdw             m6, m7
   1753    ret
   1754 .pass2:
   1755    mova                 m7, [o(pw_1697x8)]
   1756    pmulhrsw             m4, m7, m0
   1757    pmulhrsw             m5, m7, m1
   1758    pmulhrsw             m6, m7, m2
   1759    pmulhrsw             m7, m3
   1760    paddsw               m0, m4
   1761    paddsw               m1, m5
   1762    paddsw               m2, m6
   1763    paddsw               m3, m7
   1764    jmp m(idct_8x4_internal_16bpc).end
   1765 
; INV_TXFM_8X8_FN type1, type2[, eob_offset]
; Declares an 8x8 inverse-transform entry via INV_TXFM_FN (15 xmm regs /
; 3*16 stack on x86-64, 8 regs / 5*16 stack on x86-32). For the dct_dct
; pair it additionally emits the DC-only fast path: broadcast the single
; rounded DC value (two *181 ~= *2896>>12 scalings, +34816 folds in the
; 10 bpc pixel bias) and add it to all 8x8 destination pixels with
; clipping to [0, pixel_10bpc_max]. The .end/.end2/.loop labels are
; reused by other block sizes (e.g. 8x16 tail-jumps to .end).
   1766 %macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
   1767 %if ARCH_X86_64
   1768    INV_TXFM_FN          %1, %2, %3, 8x8, 15, 0-3*16
   1769 %else
   1770    INV_TXFM_FN          %1, %2, %3, 8x8, 8, 0-5*16
   1771 %endif
   1772 %ifidn %1_%2, dct_dct
   1773    imul                r5d, [cq], 181
   1774    mov                [cq], eobd ; 0
   1775    mov                 r3d, 2
   1776 .end:
   1777    add                 r5d, 384
   1778    sar                 r5d, 9
   1779 .end2:
   1780    imul                r5d, 2896
   1781    add                 r5d, 34816
   1782    movd                 m0, r5d
   1783    pshuflw              m0, m0, q1111
   1784    punpcklqdq           m0, m0
   1785    mova                 m6, [o(pixel_10bpc_max)]
   1786    pxor                 m5, m5
   1787    lea                  r2, [strideq*3]
; r3d = number of 4-row groups to process (2 for 8x8)
   1788 .loop:
   1789    mova                 m1, [dstq+strideq*0]
   1790    mova                 m2, [dstq+strideq*1]
   1791    mova                 m3, [dstq+strideq*2]
   1792    mova                 m4, [dstq+r2]
   1793    REPX      {paddw x, m0}, m1, m2, m3, m4
   1794    REPX     {pmaxsw x, m5}, m1, m2, m3, m4
   1795    REPX     {pminsw x, m6}, m1, m2, m3, m4
   1796    mova   [dstq+strideq*0], m1
   1797    mova   [dstq+strideq*1], m2
   1798    mova   [dstq+strideq*2], m3
   1799    mova   [dstq+r2       ], m4
   1800    lea                dstq, [dstq+strideq*4]
   1801    dec                 r3d
   1802    jg .loop
   1803    RET
   1804 %endif
   1805 %endmacro
   1806 
; 8x8 inverse DCT, 16 bpc. Also hosts the shared .pass1_full driver
; (used by the adst/flipadst/identity 8x8 and 8x16 kernels via t0) and
; the shared round-and-write helpers for 8x8 stores.
   1807 INV_TXFM_8X8_FN dct, dct
   1808 INV_TXFM_8X8_FN dct, identity, 6
   1809 INV_TXFM_8X8_FN dct, adst
   1810 INV_TXFM_8X8_FN dct, flipadst
   1811 
   1812 cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   1813 %if ARCH_X86_32
   1814    DECLARE_REG_TMP 1
   1815    mov [rsp+4*16+1*gprsize], r1
   1816 %else
   1817    DECLARE_REG_TMP 6
   1818 %endif
   1819    lea                  t0, [o(.pass1_main)]
   1820 
; .pass1_full: run the pass-1 routine in t0 over one or two 4-column
; halves of cq, depending on eob (eob >= 10 -> both halves).
   1821 .pass1_full:
   1822 %if ARCH_X86_64
   1823    mova                m11, [o(pd_2048)]
   1824    mova                m12, [o(clip_18b_min)]
   1825    mova                m13, [o(clip_18b_max)]
   1826    mova                m14, [o(pd_2896)]
   1827 %endif
   1828 %undef cmp
; r5d = 16 if the second (high) half must be processed first, else 0
   1829 %if ARCH_X86_64
   1830    xor                 r5d, r5d
   1831    cmp                eobd, 10
   1832    setge               r5b
   1833 %else
   1834    mov                 r5d, 1
   1835    cmp                eobd, 10
   1836    sbb                 r5d, 0
   1837 %endif
   1838    shl                 r5d, 4
   1839 %if ARCH_X86_32
   1840    lea                  r3, [rsp+gprsize]
   1841 %endif
   1842 .loop_pass1:
   1843    mova                 m0, [cq+0*32+r5]
   1844    mova                 m1, [cq+1*32+r5]
   1845    mova                 m2, [cq+2*32+r5]
   1846    mova                 m3, [cq+3*32+r5]
   1847    mova                 m4, [cq+4*32+r5]
   1848    mova                 m5, [cq+5*32+r5]
   1849    mova                 m6, [cq+6*32+r5]
   1850    mova                 m7, [cq+7*32+r5]
   1851    call                 t0
   1852 
   1853    test                r5d, r5d
   1854    jz .end_pass1
   1855 
; stash the high half's transposed output in cq, then redo the low half
   1856    mova       [cq+0*32+16], m0
   1857    mova       [cq+1*32+16], m1
   1858    mova       [cq+2*32+16], m2
   1859    mova       [cq+3*32+16], m3
   1860 
   1861    sub                 r5d, 16
   1862    jmp .loop_pass1
   1863 .end_pass1:
   1864    mova                 m4, [cq+0*32+16]
   1865    mova                 m5, [cq+1*32+16]
   1866    mova                 m6, [cq+2*32+16]
   1867    mova                 m7, [cq+3*32+16]
   1868 %if ARCH_X86_32
   1869    mov                  r1, [rsp+4*16+1*gprsize]
   1870 %endif
   1871    jmp                tx2q
; .pass1_main: one 8x4 DCT column pass with +1 rounding bias and >>1,
; then pack/transpose via the shared 8x4 helpers.
   1872 .pass1_main:
   1873    call m(idct_8x4_internal_16bpc).main_pass1
   1874    pcmpeqd              m1, m1
   1875    REPX      {psubd x, m1}, m0, m6, m5, m3
   1876    call m(idct_8x4_internal_16bpc).round
   1877    REPX      {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
   1878 .pack_and_transpose:
   1879    packssdw             m2, m3
   1880    packssdw             m6, m7
   1881    packssdw             m0, m1
   1882    packssdw             m4, m5
   1883    jmp m(idct_8x4_internal_16bpc).transpose4x8packed
   1884 
   1885 .pass2:
   1886 %if ARCH_X86_32
   1887    lea                  r5, [o(itx8_start)]
   1888 %endif
   1889    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
   1890    lea                  r3, [strideq*3]
   1891 %if ARCH_X86_64
   1892    mova                m10, [o(pixel_10bpc_max)]
   1893    pxor                 m9, m9
   1894 %endif
   1895    call .round3_and_write_8x8
; .zero: clear all 16 coefficient rows in cq before returning
   1896 .zero:
   1897 %if ARCH_X86_64
   1898 %define mzero m9
   1899 %else
   1900 %define mzero m7
   1901    pxor                 m7, m7
   1902 %endif
   1903    REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1904 %undef mzero
   1905    RET
   1906 
   1907    ; round (rounded right-shift by 5) before writing
   1908    ; data in m0-7
   1909    ; on x86-64, pw_2048 is in m8
   1910    ; .round1 is for m0-7
   1911    ; .round2 is for m0-6 & [rsp+gprsize*2]
   1912    ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
   1913    ; .round4 is x86-32-only, it is similar to .round2 but with constant already in m7
   1914 %if ARCH_X86_32
   1915 .round1_and_write_8x8:
   1916    mova    [rsp+gprsize*2], m7
   1917 .round2_and_write_8x8:
   1918 %endif
   1919 .round3_and_write_8x8:
   1920    mova                 m7, [o(pw_2048)]
   1921 %if ARCH_X86_32
   1922 .round4_and_write_8x8:
   1923 %endif
   1924    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
   1925    pmulhrsw             m7, [rsp+gprsize*2]
   1926 %if ARCH_X86_64
   1927    jmp .write_8x8
   1928 .round2_and_write_8x8:
   1929    mova                 m7, [rsp+gprsize*2]
   1930 .round1_and_write_8x8:
   1931    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
   1932 %endif
   1933 
   1934    ; m0-7 have to-be-written data [pre-rounded]
   1935    ; on x86-64, m9-10 contain a zero/pixel_max
   1936    ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch
   1937    ; r0,1,3 contain dstq/strideq/stride3q
   1938    ; r5 is a scratch register
; .write_8x8: add residual to 8 destination rows, clip to
; [0, pixel_10bpc_max], and store.
   1939 .write_8x8:
   1940    lea                  r5, [dstq+strideq*4]
   1941    paddw                m0, [dstq+strideq*0]
   1942    paddw                m1, [dstq+strideq*1]
   1943    paddw                m2, [dstq+strideq*2]
   1944    paddw                m3, [dstq+r3]
   1945    paddw                m4, [r5  +strideq*0]
   1946    paddw                m5, [r5  +strideq*1]
   1947    paddw                m6, [r5  +strideq*2]
   1948    paddw                m7, [r5  +r3]
   1949 %if ARCH_X86_64
   1950    REPX    {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
   1951    REPX    {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
   1952 %else
   1953    mova    [rsp+gprsize*2], m7
   1954    pxor                 m7, m7
   1955    REPX     {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6
   1956    pmaxsw               m7, [rsp+gprsize*2]
   1957    mova    [rsp+gprsize*2], m7
   1958    mova                 m7, [o(pixel_10bpc_max)]
   1959    REPX     {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6
   1960    pminsw               m7, [rsp+gprsize*2]
   1961 %endif
   1962    mova   [dstq+strideq*0], m0
   1963    mova   [dstq+strideq*1], m1
   1964    mova   [dstq+strideq*2], m2
   1965    mova   [dstq+r3       ], m3
   1966    mova   [r5  +strideq*0], m4
   1967    mova   [r5  +strideq*1], m5
   1968    mova   [r5  +strideq*2], m6
   1969    mova   [r5  +r3       ], m7
   1970    ret
   1971 
; 8x8 inverse ADST, 16 bpc. Pass 1 reuses the 8x4 ADST dword kernel via
; the shared .pass1_full driver; .round folds the pass-1 >>1 scaling
; into the sign fixup (pd_6144/>>13 instead of pd_2048/>>12). Also
; provides the sign-aware round-and-write helpers (odd rows negated)
; used by the adst/flipadst pass-2 paths.
   1972 INV_TXFM_8X8_FN adst, dct
   1973 INV_TXFM_8X8_FN adst, adst
   1974 INV_TXFM_8X8_FN adst, flipadst
   1975 INV_TXFM_8X8_FN adst, identity, 6
   1976 
   1977 cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   1978 %if ARCH_X86_32
   1979    mov [rsp+4*16+1*gprsize], r1
   1980 %endif
   1981    lea                  t0, [o(.pass1_main)]
   1982    jmp m(idct_8x8_internal_16bpc).pass1_full
   1983 .pass1_main:
   1984    call m(iadst_8x4_internal_16bpc).main_pass1
   1985    call .round
   1986    jmp m(idct_8x8_internal_16bpc).pack_and_transpose
   1987 .round:
   1988 %if ARCH_X86_64
   1989    pcmpeqd              m8, m8         ; -1
   1990    REPX     {psubd x, m8 }, m0, m6
   1991    REPX     {pxor  x, m8 }, m1, m7, m3, m5
   1992    REPX     {psrad x, 1  }, m0, m1, m6, m7
   1993    REPX     {psubd x, m8 }, m1, m7
   1994    mova                 m8, [o(pd_6144)]
   1995    REPX     {paddd x, m8 }, m2, m3, m4, m5
   1996    REPX     {psrad x, 13 }, m2, m3, m4, m5
   1997 %else
; x86-32: m0/m6 handled after the shared part, via scratch slots
   1998    mova          [r3+2*16], m0
   1999 
   2000    pcmpeqd              m0, m0         ; -1
   2001    mova                 m6, [o(pd_6144)]
   2002    REPX     {pxor  x, m0 }, m1, m7, m3, m5
   2003    REPX     {psrad x, 1  }, m1, m7
   2004    REPX     {psubd x, m0 }, m1, m7
   2005    REPX     {paddd x, m6 }, m2, m3, m4, m5
   2006    REPX     {psrad x, 13 }, m2, m3, m4, m5
   2007 
   2008    mova                 m0, [r3+2*16]
   2009    psrld                m6, 12         ; +1
   2010    paddd                m0, m6
   2011    paddd                m6, [r3+3*16]
   2012    REPX     {psrad x, 1  }, m0, m6
   2013 %endif
   2014    ret
   2015 
   2016 .pass2:
   2017 %if ARCH_X86_32
   2018    lea                  r5, [o(itx8_start)]
   2019 %endif
   2020    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
   2021    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
   2022    lea                  r3, [strideq*3]
   2023 %if ARCH_X86_64
   2024    mova                m10, [o(pixel_10bpc_max)]
   2025    pxor                 m9, m9
   2026 %endif
   2027    call .round3_and_write_8x8
   2028    jmp m(idct_8x8_internal_16bpc).zero
   2029 
   2030    ; round (rounded right-shift by 5) before writing; odd registers are negated
   2031    ; data in m0-7
   2032    ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11
   2033    ; .round1 is for m0-7
   2034    ; .round2 is for m0-6 & [rsp+gprsize*2]
   2035    ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
   2036 %if ARCH_X86_64
   2037 .round2_and_write_8x8:
   2038    mova                 m7, [rsp+gprsize*2]
   2039 .round1_and_write_8x8:
   2040    REPX  {pmulhrsw x, m8 }, m0, m2, m4, m6
   2041    REPX  {pmulhrsw x, m11}, m1, m3, m5, m7
   2042    jmp m(idct_8x8_internal_16bpc).write_8x8
   2043 %else
   2044 .round1_and_write_8x8:
   2045    mova    [rsp+gprsize*2], m7
   2046 .round2_and_write_8x8:
   2047 %endif
   2048 .round3_and_write_8x8:
   2049    mova                 m7, [o(pw_2048)]
   2050    REPX   {pmulhrsw x, m7}, m0, m2, m4, m6
   2051    mova                 m7, [o(pw_m2048)]
   2052    REPX   {pmulhrsw x, m7}, m1, m3, m5
   2053    pmulhrsw             m7, [rsp+gprsize*2]
   2054    jmp m(idct_8x8_internal_16bpc).write_8x8
   2055 
; 8x8 inverse flip-ADST, 16 bpc: ADST with reversed output order.
; Pass 1 = iadst_8x8 pass 1 packed in reversed register order; pass 2
; jumps into iadst_8x8 .pass2 after repointing dst at the last row with
; a negated stride (vertical flip).
   2056 INV_TXFM_8X8_FN flipadst, dct
   2057 INV_TXFM_8X8_FN flipadst, adst
   2058 INV_TXFM_8X8_FN flipadst, flipadst
   2059 INV_TXFM_8X8_FN flipadst, identity, 6
   2060 
   2061 cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   2062 %if ARCH_X86_32
   2063    mov [rsp+4*16+1*gprsize], r1
   2064 %endif
   2065    lea                  t0, [o(.pass1_main)]
   2066    jmp m(idct_8x8_internal_16bpc).pass1_full
   2067 .pass1_main:
   2068    call m(iadst_8x4_internal_16bpc).main_pass1
   2069    call m(iadst_8x8_internal_16bpc).round
   2070    ; invert registers
   2071    packssdw             m7, m6
   2072    packssdw             m5, m4
   2073    packssdw             m3, m2
   2074    packssdw             m1, m0
   2075    mova                 m0, m7
   2076    mova                 m2, m5
   2077    mova                 m4, m3
   2078    mova                 m6, m1
   2079    jmp m(idct_8x4_internal_16bpc).transpose4x8packed
   2080 
   2081 .pass2:
; dst += stride*7, stride = -stride: rows are then written bottom-up
   2082    lea                dstq, [dstq+strideq*8]
   2083    sub                dstq, strideq
   2084    neg             strideq
   2085    jmp m(iadst_8x8_internal_16bpc).pass2
   2086 
; 8x8 inverse identity transform, 16 bpc.
; Pass 1 needs no arithmetic: just pack each 32-bit row pair to 16-bit
; and transpose via the 8 bpc ssse3 helper. Pass 2 rounds with pw_4096
; (identity scale folded into the shift) and stores via the shared
; write helpers.
   2087 INV_TXFM_8X8_FN identity, dct
   2088 INV_TXFM_8X8_FN identity, adst
   2089 INV_TXFM_8X8_FN identity, flipadst
   2090 INV_TXFM_8X8_FN identity, identity
   2091 
   2092 cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   2093    mova                 m0, [cq+0*32]
   2094    mova                 m1, [cq+1*32]
   2095    mova                 m2, [cq+2*32]
   2096    mova                 m3, [cq+3*32]
   2097    mova                 m4, [cq+4*32]
   2098    mova                 m5, [cq+5*32]
   2099    mova                 m6, [cq+6*32]
   2100    mova                 m7, [cq+7*32]
   2101    packssdw             m0, [cq+0*32+16]
   2102    packssdw             m1, [cq+1*32+16]
   2103    packssdw             m2, [cq+2*32+16]
   2104    packssdw             m3, [cq+3*32+16]
   2105    packssdw             m4, [cq+4*32+16]
   2106    packssdw             m5, [cq+5*32+16]
   2107    packssdw             m6, [cq+6*32+16]
   2108    packssdw             m7, [cq+7*32+16]
   2109    mova [rsp+gprsize+16*1], m6
   2110    jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3
   2111 
   2112 .pass2:
   2113 %if ARCH_X86_32
   2114    lea                  r5, [o(itx8_start)]
   2115 %endif
   2116    lea                  r3, [strideq*3]
   2117 %if ARCH_X86_64
   2118    mova                m10, [o(pixel_10bpc_max)]
   2119    pxor                 m9, m9
   2120    mova                 m8, [o(pw_4096)]
   2121    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
   2122 %else
   2123    mova      [rsp+gprsize], m7
   2124    mova                 m7, [o(pw_4096)]
   2125    call m(idct_8x8_internal_16bpc).round4_and_write_8x8
   2126 %endif
   2127    jmp m(idct_8x8_internal_16bpc).zero
   2128 
; INV_TXFM_8X16_FN type1, type2[, eob_tbl_suffix]
; Declares an 8x16 inverse-transform entry (16*16 / 17*16 bytes of
; stack on x86-64 / x86-32). The dct_dct pair gets a DC-only path:
; scale DC twice by 181 (~2896/2^12, the extra factor covers the
; rect2 downscale), set r3d=4 row groups, pop this size's larger stack
; allocation, and tail-call the 8x8 DC .end helper to do the final
; scale/broadcast/add.
   2129 %macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
   2130 %if ARCH_X86_64
   2131    INV_TXFM_FN          %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16
   2132 %else
   2133    INV_TXFM_FN          %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
   2134 %endif
   2135 %ifidn %1_%2, dct_dct
   2136    imul                r5d, [cq], 181
   2137    mov                [cq], eobd ; 0
   2138    add                 r5d, 128
   2139    sar                 r5d, 8
   2140    imul                r5d, 181
   2141    mov                 r3d, 4
   2142 %if stack_size_padded > 0
   2143    ; adjust to caller's stack allocation
   2144    add                 rsp, (12+ARCH_X86_64)*16
   2145 %endif
   2146    jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end
   2147 %endif
   2148 %endmacro
   2149 
; 8x16 inverse DCT, 16 bpc. Pass 1 (.pass1_full, shared with the other
; 8x16 kernels via t0) runs the column transform over up to four 4-row
; strips selected by an eob threshold table at r5, applying the rect2
; (1/sqrt(2)) pre-scale. Pass 2 runs the 16-point ssse3 row kernel in
; two 8-row halves and writes both.
   2150 INV_TXFM_8X16_FN dct, dct
   2151 INV_TXFM_8X16_FN dct, identity, v
   2152 INV_TXFM_8X16_FN dct, adst
   2153 INV_TXFM_8X16_FN dct, flipadst
   2154 
   2155 %if ARCH_X86_64
   2156 DECLARE_REG_TMP 7
   2157 %endif
   2158 
   2159 cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   2160 %if WIN64
   2161    PUSH                 r7
   2162 %elif ARCH_X86_32
   2163    mov [rsp+16*16+gprsize*1], r1
   2164    mov [rsp+16*16+gprsize*2], r6
   2165 %endif
   2166    lea                  t0, [o(m(idct_8x8_internal_16bpc).pass1_main)]
   2167 .pass1_full:
   2168 %if ARCH_X86_64
   2169    mova                m11, [o(pd_2048)]
   2170    mova                m12, [o(clip_18b_min)]
   2171    mova                m13, [o(clip_18b_max)]
   2172    mova                m14, [o(pd_2896)]
   2173 %endif
   2174 %undef cmp
; scan the 4-entry eob table backwards to find how many 16-byte column
; strips are non-zero; r5d = byte offset of the last strip to process
   2175    mov                 r6d, 4
   2176 .zero_loop:
   2177    dec                 r6d
   2178    cmp                eobb, byte [r5+r6]
   2179    jl .zero_loop
   2180    mov                 r5d, r6d
   2181    shl                 r5d, 4
   2182 %if ARCH_X86_32
   2183    ; restore pic-ptr
   2184    mov                  r6, [rsp+16*16+2*gprsize]
   2185    ; setup stack pointer
   2186    lea                  r3, [rsp+gprsize]
   2187 %endif
   2188 .loop_pass1:
   2189    mova                 m0, [cq+0*64+r5]
   2190    mova                 m1, [cq+1*64+r5]
   2191    mova                 m2, [cq+2*64+r5]
   2192    mova                 m3, [cq+3*64+r5]
   2193    mova                 m4, [cq+4*64+r5]
   2194    mova                 m5, [cq+5*64+r5]
   2195    mova                 m6, [cq+6*64+r5]
   2196    mova                 m7, [cq+7*64+r5]
   2197    call m(idct_8x4_internal_16bpc).rect2_mul
   2198    call                 t0
   2199 
   2200    mova       [cq+0*64+r5], m0
   2201    mova       [cq+1*64+r5], m1
   2202    mova       [cq+2*64+r5], m2
   2203    mova       [cq+3*64+r5], m3
   2204    sub                 r5d, 16
   2205    jge .loop_pass1
   2206 %if WIN64
   2207    POP                  r7
   2208 %elif ARCH_X86_32
   2209    mov                  r1, [rsp+16*16+1*gprsize]
   2210 %endif
   2211    jmp                tx2q
   2212 
   2213 .pass2:
   2214 %if ARCH_X86_32
   2215    lea                  r5, [o(itx8_start)]
   2216 %endif
   2217 
   2218    ; input is in cqN*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15
   2219    ; some are still pre-loaded from the final loop iteration in pass=1
   2220 
   2221    mova                 m1, m2
   2222    mova                 m2, [cq+ 1*16]
   2223    mova                 m3, [cq+ 9*16]
   2224    mova                 m4, [cq+ 2*16]
   2225    mova                 m5, [cq+10*16]
   2226    mova                 m6, [cq+ 3*16]
   2227    mova                 m7, [cq+11*16]
   2228    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
   2229    mova [rsp+gprsize+3*16], m0
   2230    mova [rsp+gprsize+4*16], m1
   2231    mova [rsp+gprsize+5*16], m2
   2232    mova [rsp+gprsize+6*16], m3
   2233    mova [rsp+gprsize+7*16], m4
   2234    mova [rsp+gprsize+8*16], m5
   2235    mova [rsp+gprsize+9*16], m6
   2236    ; m7 is already stored in [rsp+gprsize+0*16]
   2237    mova                 m0, [cq+ 4*16]
   2238    mova                 m1, [cq+12*16]
   2239    mova                 m2, [cq+ 5*16]
   2240    mova                 m3, [cq+13*16]
   2241    mova                 m4, [cq+ 6*16]
   2242    mova                 m5, [cq+14*16]
   2243    mova                 m6, [cq+ 7*16]
   2244    mova                 m7, [cq+15*16]
   2245    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
   2246 
   2247    ; out0-7 is in rsp+gprsize+3-10*mmsize
   2248    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
   2249 
; write the lower 8 rows first (they are live in registers), then clear
; cq and come back for the upper 8 rows saved on the stack
   2250 %if ARCH_X86_64
   2251    mova                 m8, [o(pw_2048)]
   2252    mova                m10, [o(pixel_10bpc_max)]
   2253    pxor                 m9, m9
   2254    mov                  r6, dstq
   2255 %else
   2256    mov [rsp+16*16+gprsize*1], dstq
   2257 %endif
   2258    lea                  r3, [strideq*3]
   2259    lea                dstq, [dstq+strideq*8]
   2260    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
   2261 %if ARCH_X86_64
   2262 %define mzero m9
   2263 %else
   2264 %define mzero m7
   2265    pxor                 m7, m7
   2266 %endif
   2267    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
   2268                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   2269 %undef mzero
   2270    mova                 m0, [rsp+gprsize+ 3*16]
   2271    mova                 m1, [rsp+gprsize+ 4*16]
   2272    mova                 m2, [rsp+gprsize+ 5*16]
   2273    mova                 m3, [rsp+gprsize+ 6*16]
   2274    mova                 m4, [rsp+gprsize+ 7*16]
   2275    mova                 m5, [rsp+gprsize+ 8*16]
   2276    mova                 m6, [rsp+gprsize+ 9*16]
   2277    mova                 m7, [rsp+gprsize+10*16]
   2278 %if ARCH_X86_64
   2279    mov                dstq, r6
   2280 %else
   2281    mov                dstq, [rsp+16*16+gprsize*1]
   2282 %endif
   2283    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
   2284    RET
   2285 
; 8x16 inverse ADST, 16 bpc. Pass 1 reuses the shared 8x16 driver with
; the 8x8 ADST pass-1 kernel in t0. Pass 2 marshals the interleaved
; cq layout into the operand order the 16-point ssse3 ADST row kernel
; expects (partly via stack slots), runs it, then writes both 8-row
; halves with the sign-aware ADST round helpers.
   2286 INV_TXFM_8X16_FN adst, dct
   2287 INV_TXFM_8X16_FN adst, adst
   2288 INV_TXFM_8X16_FN adst, flipadst
   2289 INV_TXFM_8X16_FN adst, identity, v
   2290 
   2291 cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   2292 %if WIN64
   2293    PUSH                 r7
   2294 %elif ARCH_X86_32
   2295    mov [rsp+16*16+gprsize*1], r1
   2296    mov [rsp+16*16+gprsize*2], r6
   2297 %endif
   2298    lea                  t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)]
   2299    jmp m(idct_8x16_internal_16bpc).pass1_full
   2300 
   2301 .pass2:
   2302 %if ARCH_X86_32
   2303    lea                  r5, [o(itx8_start)]
   2304 %endif
   2305    mova                 m4, [cq+ 9*16]
   2306    mova                 m5, [cq+13*16]
   2307    mova [rsp+gprsize+7*16], m0
   2308    mova [rsp+gprsize+8*16], m1
   2309    mova [rsp+gprsize+5*16], m4
   2310    mova [rsp+gprsize+6*16], m5
   2311    mova                 m0, m2
   2312    mova                 m1, m3
   2313    mova                 m2, [cq+ 1*16]
   2314    mova                 m3, [cq+ 5*16]
   2315    mova                 m4, [cq+ 2*16]
   2316    mova                 m5, [cq+ 6*16]
   2317    mova                 m6, [cq+11*16]
   2318    mova                 m7, [cq+15*16]
   2319    mova [rsp+gprsize+ 3*16], m4
   2320    mova [rsp+gprsize+ 4*16], m5
   2321    mova [rsp+gprsize+ 9*16], m6
   2322    mova [rsp+gprsize+10*16], m7
   2323    mova                 m4, [cq+10*16]
   2324    mova                 m5, [cq+14*16]
   2325    mova                 m6, [cq+ 3*16]
   2326    mova                 m7, [cq+ 7*16]
   2327    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
   2328    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
   2329 
; write lower half first (live in m0-7), clear cq, then reload the
; upper half from the stack and write it
   2330 %if ARCH_X86_64
   2331    mova                m11, [o(pw_m2048)]
   2332    mova                 m8, [o(pw_2048)]
   2333    mova                m10, [o(pixel_10bpc_max)]
   2334    pxor                 m9, m9
   2335    mov                  r6, dstq
   2336 %else
   2337    mov [rsp+16*16+gprsize*1], dstq
   2338 %endif
   2339    lea                  r3, [strideq*3]
   2340    lea                dstq, [dstq+strideq*8]
   2341    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
   2342 %if ARCH_X86_64
   2343 %define mzero m9
   2344 %else
   2345 %define mzero m7
   2346    pxor                 m7, m7
   2347 %endif
   2348    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
   2349                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   2350 %undef mzero
   2351    mova                 m0, [rsp+gprsize+ 3*16]
   2352    mova                 m1, [rsp+gprsize+ 4*16]
   2353    mova                 m2, [rsp+gprsize+ 5*16]
   2354    mova                 m3, [rsp+gprsize+ 6*16]
   2355    mova                 m4, [rsp+gprsize+ 7*16]
   2356    mova                 m5, [rsp+gprsize+ 8*16]
   2357    mova                 m6, [rsp+gprsize+ 9*16]
   2358    mova                 m7, [rsp+gprsize+10*16]
   2359 %if ARCH_X86_64
   2360    mov                dstq, r6
   2361 %else
   2362    mov                dstq, [rsp+16*16+gprsize*1]
   2363 %endif
   2364    call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
   2365    RET
   2366 
; 8x16 inverse flip-ADST, 16 bpc. Pass 1 reuses the shared 8x16 driver
; with the flipadst 8x8 pass-1 kernel in t0; pass 2 flips vertically by
; pointing dst at the last row (dst += 15*stride) with negated stride
; and then running the 8x16 ADST pass 2.
   2367 INV_TXFM_8X16_FN flipadst, dct
   2368 INV_TXFM_8X16_FN flipadst, adst
   2369 INV_TXFM_8X16_FN flipadst, flipadst
   2370 INV_TXFM_8X16_FN flipadst, identity, v
   2371 
   2372 cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   2373 %if WIN64
   2374    PUSH                 r7
   2375 %elif ARCH_X86_32
   2376    mov [rsp+16*16+gprsize*1], r1
   2377    mov [rsp+16*16+gprsize*2], r6
   2378 %endif
   2379    lea                  t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)]
   2380    jmp m(idct_8x16_internal_16bpc).pass1_full
   2381 
   2382 .pass2:
; r3 = stride*15 = last-row offset
   2383    lea                  r3, [strideq*3]
   2384    lea                  r3, [r3*5]
   2385    add                dstq, r3
   2386    neg             strideq
   2387    jmp m(iadst_8x16_internal_16bpc).pass2
   2388 
; Register the 8x16 identity transform variants (dispatching into
; iidentity_8x16_internal_16bpc below); 'h'/'v' select the eob
; threshold table for the half-identity combinations.
   2389 INV_TXFM_8X16_FN identity, dct, h
   2390 INV_TXFM_8X16_FN identity, adst, h
   2391 INV_TXFM_8X16_FN identity, flipadst, h
   2392 INV_TXFM_8X16_FN identity, identity
   2393 
   2394 cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   2395 %if WIN64
   2396    PUSH                 r7
   2397 %elif ARCH_X86_32
   2398    mov [rsp+16*16+gprsize*1], r1
   2399    mov [rsp+16*16+gprsize*2], r6
   2400 %endif
   2401    lea                  t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)]
   2402    jmp m(idct_8x16_internal_16bpc).pass1_full
   2403 
   2404 .pass2:
   2405 %if ARCH_X86_64
   2406    mova                 m4, [o(pw_2048)]
   2407    mova                 m5, [o(pixel_10bpc_max)]
   2408    pxor                 m6, m6
   2409    mova                 m7, [o(pw_1697x16)]
   2410 %endif
   2411    mov                 r5d, 4
   2412    lea                  r3, [strideq*3]
   2413 .pass2_loop:
   2414    call .main
   2415 %if ARCH_X86_64
   2416    call m(idct_8x4_internal_16bpc).round1_and_write_8x4
   2417 %else
   2418    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
   2419 %endif
   2420    REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28
   2421    dec                 r5d
   2422    jle .end
   2423    add                  cq, 16
   2424    lea                dstq, [dstq+strideq*4]
   2425    mova                 m0, [cq+ 0*16]
   2426    mova                 m1, [cq+ 4*16]
   2427    mova                 m2, [cq+ 8*16]
   2428    mova                 m3, [cq+12*16]
   2429    jmp .pass2_loop
   2430 .end:
   2431    RET
   2432 .main:
   2433    ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y)
   2434 %if ARCH_X86_32
   2435    mova                 m7, [o(pw_1697x16)]
   2436    pmulhrsw             m4, m7, m0
   2437    pmulhrsw             m5, m7, m1
   2438    pmulhrsw             m6, m7, m2
   2439    pmulhrsw             m7, m3
   2440 %else
   2441    pmulhrsw             m8, m7, m0
   2442    pmulhrsw             m9, m7, m1
   2443    pmulhrsw            m10, m7, m2
   2444    pmulhrsw            m11, m7, m3
   2445 %endif
   2446    REPX      {paddsw x, x}, m0, m1, m2, m3
   2447 %if ARCH_X86_64
   2448    paddsw               m0, m8
   2449    paddsw               m1, m9
   2450    paddsw               m2, m10
   2451    paddsw               m3, m11
   2452 %else
   2453    paddsw               m0, m4
   2454    paddsw               m1, m5
   2455    paddsw               m2, m6
   2456    paddsw               m3, m7
   2457 %endif
   2458    ret
   2459 
        ; Declare a 16x4 inverse-transform entry point for the (type1, type2)
        ; combination. For dct_dct it also emits the shared DC-only fast path
        ; used when eob indicates a single nonzero (DC) coefficient.
   2460 %macro INV_TXFM_16X4_FN 2 ; type1, type2
   2461 %if ARCH_X86_64
   2462    INV_TXFM_FN          %1, %2, 0, 16x4, 16, 0-8*16
   2463 %else
        ; x86_32: only 8 xmm regs, so reserve a larger stack scratch area
   2464    INV_TXFM_FN          %1, %2, 0, 16x4, 8, 0-12*16
   2465 %endif
   2466 %ifidn %1_%2, dct_dct
   2467    imul                r5d, [cq], 181       ; dc * 181 (1/sqrt(2) in .8)
   2468    mov                [cq], eobd ; 0
   2469    mov                 r3d, 4               ; 4 rows to fill
   2470 .dconly:
   2471    add                 r5d, 384             ; (dc*181 + 384) >> 9
   2472    sar                 r5d, 9
   2473 .dconly2:
        ; scale once more and broadcast: word 1 of the product is
        ; (r5*2896 + 34816) >> 16, i.e. the rounded pixel-domain DC value
   2474    imul                r5d, 2896
   2475    add                 r5d, 34816
   2476    movd                 m0, r5d
   2477    pshuflw              m0, m0, q1111        ; broadcast high word
   2478    punpcklqdq           m0, m0
   2479    mova                 m3, [o(pixel_10bpc_max)]
   2480    pxor                 m4, m4
   2481 .loop:
        ; add the DC value to one 16-pixel row, clamp to [0, pixel_max]
   2482    mova                 m1, [dstq+ 0]
   2483    mova                 m2, [dstq+16]
   2484    REPX     {paddw  x, m0}, m1, m2
   2485    REPX     {pminsw x, m3}, m1, m2
   2486    REPX     {pmaxsw x, m4}, m1, m2
   2487    mova          [dstq+ 0], m1
   2488    mova          [dstq+16], m2
   2489    add                dstq, strideq
   2490    dec                 r3d
   2491    jg .loop
   2492    RET
   2493 %endif
   2494 %endmacro
   2495 
   2496 INV_TXFM_16X4_FN dct, dct
   2497 INV_TXFM_16X4_FN dct, identity
   2498 INV_TXFM_16X4_FN dct, adst
   2499 INV_TXFM_16X4_FN dct, flipadst
   2500 
        ; 16x4 inverse DCT, pass 1: a 16-point DCT over the rows, computed in
        ; 32-bit precision. The odd-index input rows go through .main_oddhalf
        ; (producing t8..t15, spilled at [r3]), the even rows through the
        ; shared 8x4 idct helpers (t0..t7 in m0-m7); .round then butterflies
        ; both halves into out0..out15 and the result is packed to words and
        ; transposed before jumping to the pass-2 handler in tx2q.
   2501 cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   2502 %if ARCH_X86_64
        ; loop-invariant constants, kept live across the whole pass
   2503    mova                m11, [o(pd_2048)]
   2504    mova                m12, [o(clip_18b_min)]
   2505    mova                m13, [o(clip_18b_max)]
   2506    mova                m14, [o(pd_2896)]
   2507 %endif
   2508    ; setup stack pointer
   2509    lea                  r3, [rsp+gprsize]
   2510 
        ; odd input rows -> odd half (t8..t15)
   2511    mova                 m0, [cq+ 1*16]
   2512    mova                 m1, [cq+ 3*16]
   2513    mova                 m2, [cq+ 5*16]
   2514    mova                 m3, [cq+ 7*16]
   2515    mova                 m4, [cq+ 9*16]
   2516    mova                 m5, [cq+11*16]
   2517    mova                 m6, [cq+13*16]
   2518    mova                 m7, [cq+15*16]
   2519    call .main_oddhalf
        ; even input rows -> even half (8-point idct)
   2520    mova                 m0, [cq+ 0*16]
   2521    mova                 m1, [cq+ 2*16]
   2522    mova                 m2, [cq+ 4*16]
   2523    mova                 m3, [cq+ 6*16]
   2524    mova                 m4, [cq+ 8*16]
   2525    mova                 m5, [cq+10*16]
   2526    mova                 m6, [cq+12*16]
   2527    mova                 m7, [cq+14*16]
   2528    call m(idct_8x4_internal_16bpc).main_pass1
   2529    call m(idct_8x4_internal_16bpc).round
   2530    ; t0-7 is in m0-7
   2531 
   2532    call .round
   2533 
   2534 %if ARCH_X86_64
   2535 .pack_transpose:
   2536    ; transpose in two parts
   2537    packssdw             m0, m1
   2538    packssdw             m2, m3
   2539    packssdw             m4, m5
   2540    packssdw             m6, m7
   2541    packssdw             m8, m9
   2542    packssdw            m10, m11
   2543    packssdw            m12, m13
   2544    packssdw            m14, m15
   2545 .transpose:
   2546    call m(idct_8x4_internal_16bpc).transpose4x8packed
   2547    call .transpose4x8packed_hi
   2548 %else
        ; x86_32: transpose low half, spill it, then transpose the high half
        ; that .round left packed at [r3+8..11*16]
   2549    call m(idct_8x4_internal_16bpc).transpose4x8packed
   2550    mova          [r3+0*16], m0
   2551    mova          [r3+1*16], m1
   2552    mova          [r3+2*16], m2
   2553    mova          [r3+3*16], m3
   2554    mova                 m0, [r3+ 8*16]
   2555    mova                 m2, [r3+ 9*16]
   2556    mova                 m4, [r3+10*16]
   2557    mova                 m6, [r3+11*16]
   2558    call m(idct_8x4_internal_16bpc).transpose4x8packed
   2559 %endif
   2560    jmp                tx2q
   2561 %if ARCH_X86_64
        ; word-level 4x8 transpose of the high half (packed in m8-m14),
        ; mirroring what transpose4x8packed does for m0-m6
   2562 .transpose4x8packed_hi:
   2563    punpcklwd            m9, m10, m14
   2564    punpckhwd           m10, m14
   2565    punpckhwd           m14, m8, m12
   2566    punpcklwd            m8, m12
   2567 
   2568    punpckhwd           m11, m8, m9
   2569    punpcklwd            m8, m9
   2570    punpckhwd           m12, m14, m10
   2571    punpcklwd           m14, m10
   2572 
   2573    punpcklwd           m10, m11, m12
   2574    punpckhwd           m11, m12
   2575    punpckhwd            m9, m8, m14
   2576    punpcklwd            m8, m14
   2577    ret
   2578 %endif
   2579 .main_oddhalf_fast: ; lower half zero
        ; only the first four odd coefficients are nonzero, so the first
        ; rotation stage degenerates to plain multiplies by the constants
   2580    pmulld               m7, m0, [o(pd_4076)]
   2581    pmulld               m0, [o(pd_401)]
   2582    pmulld               m6, m1, [o(pd_m1189)]
   2583    pmulld               m1, [o(pd_3920)]
   2584 %if ARCH_X86_32
   2585    mova                 m4, [o(pd_2048)]
   2586    REPX      {paddd x, m4}, m1, m6
   2587    REPX      {psrad x, 12}, m1, m6
   2588    mova          [r3+1*16], m1
   2589 %endif
   2590    pmulld               m5, m2, [o(pd_3612)]
   2591    pmulld               m2, [o(pd_1931)]
   2592 %if ARCH_X86_32
   2593    pmulld               m1, m3, [o(pd_m2598)]
   2594 %else
   2595    pmulld               m4, m3, [o(pd_m2598)]
   2596 %endif
   2597    pmulld               m3, [o(pd_3166)]
   2598    jmp .main_oddhalf_fast2
        ; full odd half: inputs 1,3,..,15 in m0-m7 -> t8..t15 at [r3+0..7*16]
   2599 .main_oddhalf:
   2600 %if ARCH_X86_64
   2601    ITX_MULSUB_2D         0, 7, 8, 9, 10, _,  401, 4076 ; t8a,  t15a
   2602    ITX_MULSUB_2D         6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
   2603    ITX_MULSUB_2D         2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
   2604    ITX_MULSUB_2D         4, 3, 8, 9, 10, _, 3166, 2598 ; t9a,  t14a
   2605 .main_oddhalf_fast2:
        ; round stage 1 (+2048 >> 12), then two butterfly/rotate stages with
        ; 18-bit clipping in between
   2606    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   2607    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
   2608    psubd                m8, m0, m4 ; t9
   2609    paddd                m0, m4     ; t8
   2610    psubd                m4, m6, m2 ; t10
   2611    paddd                m2, m6     ; t11
   2612    psubd                m6, m1, m5 ; t13
   2613    paddd                m5, m1     ; t12
   2614    psubd                m1, m7, m3 ; t14
   2615    paddd                m7, m3     ; t15
   2616    REPX    {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
   2617    REPX    {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
   2618    mova                m15, [o(pd_3784)]
   2619    mova                m10, [o(pd_1567)]
   2620    ITX_MULSUB_2D         1, 8, 3, 9, _, 11, 10, 15
   2621    ITX_MULSUB_2D         6, 4, 3, 9, _, 11, 10, 15, 4
   2622    psubd                m3, m1, m4 ; t10
   2623    paddd                m1, m4     ; t9
   2624    psubd                m4, m0, m2 ; t11a
   2625    paddd                m0, m2     ; t8a
   2626    psubd                m2, m8, m6 ; t13
   2627    paddd                m6, m8     ; t14
   2628    psubd                m8, m7, m5 ; t12a
   2629    paddd                m7, m5     ; t15a
   2630    REPX    {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
   2631    REPX    {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
        ; final sqrt(2)/2 rotation: *2896, +2048, >>12
   2632    REPX    {pmulld x, m14}, m2, m8, m3, m4
   2633    paddd                m2, m11
   2634    paddd                m8, m11
   2635    paddd                m5, m2, m3 ; t13a
   2636    psubd                m2, m3     ; t10a
   2637    psubd                m3, m8, m4 ; t11
   2638    paddd                m4, m8     ; t12
   2639    REPX      {psrad x, 12}, m5, m2, m3, m4
        ; spill t8..t15 for .round
   2640    mova          [r3+0*16], m0
   2641    mova          [r3+1*16], m1
   2642    mova          [r3+2*16], m2
   2643    mova          [r3+3*16], m3
   2644    mova          [r3+4*16], m4
   2645    mova          [r3+5*16], m5
   2646    mova          [r3+6*16], m6
   2647    mova          [r3+7*16], m7
   2648 %else
        ; x86_32: same math with only m0-m7 available, constantly swapping
        ; operands through the [r3+*] scratch slots
   2649    mova          [r3+0*16], m2
   2650    mova          [r3+1*16], m3
   2651    mova          [r3+2*16], m4
   2652    mova          [r3+3*16], m5
   2653    mova                 m4, [o(pd_2048)]
   2654 
   2655    ITX_MULSUB_2D         0, 7, 2, 3, 5, _,  401, 4076 ; t8a,  t15a
   2656    ITX_MULSUB_2D         6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a
   2657 
   2658    mova                 m2, [r3+0*16]
   2659    mova                 m3, [r3+1*16]
   2660    mova          [r3+0*16], m0
   2661    mova          [r3+1*16], m1
   2662    mova                 m1, [r3+2*16]
   2663    mova                 m5, [r3+3*16]
   2664    mova          [r3+2*16], m6
   2665    mova          [r3+3*16], m7
   2666 
   2667    ITX_MULSUB_2D         2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a
   2668    ITX_MULSUB_2D         1, 3, 0, 6, 7, _, 3166, 2598 ; t9a,  t14a
   2669 
   2670    mova                 m0, [r3+0*16]
   2671    mova                 m6, [r3+2*16]
   2672    mova                 m7, [r3+3*16]
   2673 .main_oddhalf_fast2:
   2674    REPX      {paddd x, m4}, m0, m7, m2, m5, m1, m3
   2675    REPX      {psrad x, 12}, m0, m7, m2, m5, m1, m3
   2676    psubd                m4, m0, m1 ; t9
   2677    paddd                m0, m1     ; t8
   2678    mova                 m1, [r3+1*16]
   2679    mova          [r3+0*16], m4
   2680    psubd                m4, m6, m2 ; t10
   2681    paddd                m2, m6     ; t11
   2682    psubd                m6, m1, m5 ; t13
   2683    paddd                m5, m1     ; t12
   2684    psubd                m1, m7, m3 ; t14
   2685    paddd                m7, m3     ; t15
        ; 18-bit clip; the spilled t9 is clipped through m3 via memory
   2686    mova                 m3, [o(clip_18b_min)]
   2687    REPX     {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7
   2688    pmaxsd               m3, [r3+0*16]
   2689    mova          [r3+0*16], m3
   2690    mova                 m3, [o(clip_18b_max)]
   2691    REPX     {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7
   2692    pminsd               m3, [r3+0*16]
   2693    mova          [r3+0*16], m0
   2694    mova          [r3+1*16], m2
   2695    mova          [r3+2*16], m5
   2696    mova          [r3+3*16], m7
   2697    mova                m7, [o(pd_2048)]
   2698    ITX_MULSUB_2D         1, 3, 0, 2, 5, 7, 1567, 3784
   2699    ITX_MULSUB_2D         6, 4, 0, 2, _, 7,    5, 3784, 4
   2700    mova                 m0, [r3+0*16]
   2701    mova                 m2, [r3+1*16]
   2702    psubd                m5, m1, m4 ; t10
   2703    mova          [r3+1*16], m5
   2704    paddd                m1, m4     ; t9
   2705    psubd                m4, m0, m2 ; t11a
   2706    paddd                m0, m2     ; t8a
   2707    mova                 m5, [r3+2*16]
   2708    mova                 m7, [r3+3*16]
   2709    psubd                m2, m3, m6 ; t13
   2710    paddd                m6, m3     ; t14
   2711    paddd                m3, m7, m5 ; t15a
   2712    psubd                m7, m5     ; t12a
   2713    mova          [r3+0*16], m3
   2714    mova                 m3, [r3+1*16]
   2715    mova                 m5, [o(clip_18b_min)]
   2716    REPX     {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6
   2717    pmaxsd               m5, [r3+0*16]
   2718    mova          [r3+0*16], m5
   2719    mova                 m5, [o(clip_18b_max)]
   2720    REPX     {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6
   2721    pminsd               m5, [r3+0*16]
   2722    mova          [r3+0*16], m5
        ; final sqrt(2)/2 rotation: *2896, +2048, >>12
   2723    mova                 m5, [o(pd_2896)]
   2724    REPX     {pmulld x, m5}, m2, m7, m3, m4
   2725    mova                 m5, [o(pd_2048)]
   2726    REPX     {paddd  x, m5}, m2, m7
   2727    paddd                m5, m2, m3 ; t13a
   2728    psubd                m2, m3     ; t10a
   2729    psubd                m3, m7, m4 ; t11
   2730    paddd                m4, m7     ; t12
   2731    REPX      {psrad x, 12}, m5, m2, m3, m4
        ; spill t8..t15 (note reversed slot order 11..4) for .round
   2732    mova                 m7, [r3+0*16]
   2733    mova         [r3+11*16], m0
   2734    mova         [r3+10*16], m1
   2735    mova          [r3+9*16], m2
   2736    mova          [r3+8*16], m3
   2737    mova          [r3+7*16], m4
   2738    mova          [r3+6*16], m5
   2739    mova          [r3+5*16], m6
   2740    mova          [r3+4*16], m7
   2741 %endif
   2742    ret
        ; final butterflies: clip t0..t7 (m0-m7), add the +1 rounding bias
        ; (psubd of all-ones = +1), combine with the spilled odd half, >>1
   2743 .round:
   2744 %if ARCH_X86_64
   2745    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
   2746    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
   2747    pcmpeqd              m8, m8
   2748    REPX      {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
   2749    mova                 m8, [r3+1*16]
   2750    mova                 m9, [r3+2*16]
   2751    mova                m10, [r3+3*16]
   2752    mova                m11, [r3+4*16]
   2753    mova                m12, [r3+5*16]
   2754    mova                m13, [r3+6*16]
   2755    mova                m14, [r3+7*16]
   2756    psubd               m15, m0, m14       ; out15
   2757    paddd                m0, m14           ; out0
   2758    psubd               m14, m1, m13       ; out14
   2759    paddd                m1, m13           ; out1
   2760    psubd               m13, m2, m12       ; out13
   2761    paddd                m2, m12           ; out2
   2762    psubd               m12, m3, m11       ; out12
   2763    paddd                m3, m11           ; out3
   2764    psubd               m11, m4, m10       ; out11
   2765    paddd                m4, m10           ; out4
   2766    psubd               m10, m5, m9        ; out10
   2767    paddd                m5, m9            ; out5
   2768    psubd                m9, m6, m8        ; out9
   2769    paddd                m6, m8            ; out6
   2770    psubd                m8, m7, [r3+0*16] ; out8
   2771    paddd                m7, [r3+0*16]     ; out7
   2772    REPX       {psrad x, 1}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
   2773                             m8,  m9,  m10, m11, m12, m13, m14, m15
   2774    ; and out0-15 is now in m0-15
   2775 %else
        ; x86_32: same combine, but outputs are packed to words in pairs and
        ; left at [r3+8..11*16] as out0-1 / out2-3 / out4-5 / out6-7, with
        ; the remaining pairs in registers (see packssdw comments below)
   2776    mova         [r3+ 0*16], m0
   2777    mova                 m0, [o(clip_18b_min)]
   2778    REPX     {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
   2779    pmaxsd               m0, [r3+ 0*16]
   2780    mova         [r3+ 0*16], m7
   2781    mova                 m7, [o(clip_18b_max)]
   2782    REPX     {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
   2783    pminsd               m7, [r3+ 0*16]
   2784    mova         [r3+ 0*16], m0
   2785    pcmpeqd              m0, m0
   2786    REPX      {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
   2787    mova         [r3+ 1*16], m1
   2788    mova         [r3+ 2*16], m2
   2789    mova                 m1, [r3+ 0*16]
   2790    psubd                m1, m0
   2791    mova         [r3+ 0*16], m1
   2792    mova                 m1, [r3+11*16]
   2793    mova                 m2, [r3+10*16]
   2794    psubd                m0, m7, m1
   2795    paddd                m7, m1
   2796    psubd                m1, m6, m2
   2797    paddd                m6, m2
   2798    REPX       {psrad x, 1}, m0, m1, m6, m7
   2799    packssdw             m0, m1     ; out8-9
   2800    packssdw             m6, m7     ; out6-7
   2801    mova         [r3+11*16], m6
   2802    mova                 m1, [r3+9*16]
   2803    mova                 m7, [r3+8*16]
   2804    psubd                m2, m5, m1
   2805    paddd                m5, m1
   2806    psubd                m1, m4, m7
   2807    paddd                m4, m7
   2808    REPX       {psrad x, 1}, m2, m1, m4, m5
   2809    packssdw             m2, m1     ; out10-11
   2810    packssdw             m4, m5     ; out4-5
   2811    mova                 m1, [r3+2*16]
   2812    mova         [r3+10*16], m4
   2813    mova                 m6, [r3+7*16]
   2814    mova                 m7, [r3+6*16]
   2815    psubd                m4, m3, m6
   2816    paddd                m3, m6
   2817    psubd                m6, m1, m7
   2818    paddd                m1, m7
   2819    REPX       {psrad x, 1}, m4, m6, m1, m3
   2820    packssdw             m4, m6     ; out12-13
   2821    packssdw             m1, m3     ; out2-3
   2822    mova                 m3, [r3+1*16]
   2823    mova          [r3+9*16], m1
   2824    mova                 m1, [r3+0*16]
   2825    mova                 m5, [r3+5*16]
   2826    mova                 m7, [r3+4*16]
   2827    psubd                m6, m3, m5
   2828    paddd                m3, m5
   2829    psubd                m5, m1, m7
   2830    paddd                m1, m7
   2831    REPX       {psrad x, 1}, m6, m5, m1, m3
   2832    packssdw             m6, m5     ; out14-15
   2833    packssdw             m1, m3     ; out0-1
   2834    mova          [r3+8*16], m1
   2835 %endif
   2836    ret
   2837 
        ; pass 2: 4-point column transforms over all 16 columns, run as two
        ; 8-column halves through the 8bpc ssse3 kernel on the packed words
   2838 .pass2:
   2839    lea                  r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)]
   2840 .pass2_loop:
   2841    lea                  r3, [strideq*3]
   2842 %if ARCH_X86_32
   2843    lea                  r5, [o(itx8_start)]
   2844 %endif
   2845    call                 r4
   2846    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
        ; clear all consumed coefficients
   2847    REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        ; load the second half's inputs (regs on x86_64, stack on x86_32)
   2848 %if ARCH_X86_64
   2849    mova                 m0, m8
   2850    mova                 m1, m9
   2851    mova                 m2, m10
   2852    mova                 m3, m11
   2853 %else
   2854    mova                 m0, [rsp+gprsize+0*16]
   2855    mova                 m1, [rsp+gprsize+1*16]
   2856    mova                 m2, [rsp+gprsize+2*16]
   2857    mova                 m3, [rsp+gprsize+3*16]
   2858 %endif
   2859    add                dstq, 16              ; second 8-column half
   2860 %if ARCH_X86_32
   2861    lea                  r5, [o(itx8_start)]
   2862 %endif
   2863    call                 r4
   2864    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
   2865    RET
   2866 
   2867 INV_TXFM_16X4_FN adst, dct
   2868 INV_TXFM_16X4_FN adst, adst
   2869 INV_TXFM_16X4_FN adst, flipadst
   2870 INV_TXFM_16X4_FN adst, identity
   2871 
        ; 16x4 inverse ADST, pass 1. .main evaluates the 16-point ADST in
        ; 32-bit precision (two batches of 8 inputs via .main_part1 and
        ; .main_part2), .round applies the final per-output rounding, and the
        ; result is packed+transposed exactly like the 16x4 idct before
        ; jumping to the pass-2 handler in tx2q.
   2872 cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   2873    ; setup stack pointer
   2874    lea                  r3, [rsp+gprsize]
   2875    call .main
   2876 %if ARCH_X86_64
   2877    jmp m(idct_16x4_internal_16bpc).pack_transpose
   2878 %else
        ; x86_32: transpose low half, spill it, then transpose the high half
   2879    call m(idct_8x4_internal_16bpc).transpose4x8packed
   2880    mova [rsp+gprsize+0*16], m0
   2881    mova [rsp+gprsize+1*16], m1
   2882    mova [rsp+gprsize+2*16], m2
   2883    mova [rsp+gprsize+3*16], m3
   2884    mova                 m0, [rsp+gprsize+ 8*16]
   2885    mova                 m2, [rsp+gprsize+ 9*16]
   2886    mova                 m4, [rsp+gprsize+10*16]
   2887    mova                 m6, [rsp+gprsize+11*16]
   2888    call m(idct_8x4_internal_16bpc).transpose4x8packed
   2889    jmp                tx2q
   2890 %endif
   2891 
   2892 .main:
   2893 %if ARCH_X86_64
        ; loop-invariant constants for both main parts
   2894    mova                m11, [o(pd_2048)]
   2895    mova                m12, [o(clip_18b_min)]
   2896    mova                m13, [o(clip_18b_max)]
   2897    mova                m14, [o(pd_2896)]
   2898 %endif
        ; first batch of 8 input rows (ADST input ordering)
   2899    mova                 m0, [cq+ 2*16]
   2900    mova                 m1, [cq+13*16]
   2901    mova                 m2, [cq+ 6*16]
   2902    mova                 m3, [cq+ 9*16]
   2903    mova                 m4, [cq+10*16]
   2904    mova                 m5, [cq+ 5*16]
   2905    mova                 m6, [cq+14*16]
   2906    mova                 m7, [cq+ 1*16]
   2907    call .main_part1
        ; second batch
   2908    mova                 m0, [cq+ 0*16]
   2909    mova                 m1, [cq+15*16]
   2910    mova                 m2, [cq+ 4*16]
   2911    mova                 m3, [cq+11*16]
   2912    mova                 m4, [cq+ 8*16]
   2913    mova                 m5, [cq+ 7*16]
   2914    mova                 m6, [cq+12*16]
   2915    mova                 m7, [cq+ 3*16]
   2916    call .main_part2
        ; final ADST rounding: pass-through outputs get (+1) >> 1, the
        ; 2896-scaled "unshifted" outputs from main_part2 get (+6144) >> 13
        ; (6143 where the sign is folded in); sign flips are done with
        ; pxor(-1) plus psubd(-1), i.e. two's-complement negation
   2917 .round:
   2918 %if ARCH_X86_64
   2919    mova                m15, [o(pd_6144)]
   2920    psrld               m14, 11       ; pd_1
   2921    pcmpeqd              m8, m8       ; -1
   2922    psubd               m13, m15, m14 ; pd_6143
   2923    REPX     {paddd x, m14}, m0, m2
   2924    REPX     {paddd x, m15}, m4, m6
   2925    REPX     {pxor  x, m8 }, m1, m3, m5, m7
   2926    REPX     {psrad x, 1  }, m1, m3
   2927    REPX     {paddd x, m15}, m5, m7
   2928    REPX     {psubd x, m8 }, m1, m3
   2929    paddd                m8, m15, m9
   2930    psubd                m9, m13, m10
   2931    paddd               m10, m15, m11
   2932    psubd               m11, m13, m12
   2933    paddd               m12, m14, [r3+3*16]
   2934    psubd               m13, m14, [r3+2*16]
   2935    psubd               m15, m14, [r3+0*16]
   2936    paddd               m14, [r3+1*16]
   2937    REPX      {psrad x, 1 }, m0,  m2,  m12, m13, m14, m15
   2938    REPX      {psrad x, 13}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
   2939 %else
        ; x86_32: same rounding, with word-packing of output pairs as we go;
        ; packed pairs end up in regs and at [r3+8..11*16]
   2940    mova          [r3+8*16], m1
   2941    mova          [r3+9*16], m3
   2942    mova                 m3, [o(pd_6144)]
   2943    pcmpeqd              m1, m1
   2944    REPX      {pxor  x, m1}, m5, m7
   2945    REPX      {paddd x, m3}, m4, m5, m6, m7
   2946    REPX      {psrad x, 13}, m4, m5, m6, m7
   2947    packssdw             m4, m5
   2948    packssdw             m6, m7
   2949    mova         [r3+10*16], m4
   2950    mova         [r3+11*16], m6
   2951    mova                 m4, [r3+4*16]
   2952    mova                 m5, [r3+5*16]
   2953    mova                 m6, [r3+6*16]
   2954    mova                 m7, [r3+7*16]
   2955    REPX      {pxor  x, m1}, m5, m7
   2956    REPX      {psubd x, m1}, m4, m6
   2957    REPX      {psrad x, 1 }, m4, m5, m6, m7
   2958    REPX      {psubd x, m1}, m5, m7
   2959    packssdw             m4, m5
   2960    packssdw             m6, m7
   2961    mova                 m5, [r3+8*16]
   2962    mova                 m7, [r3+9*16]
   2963    mova          [r3+8*16], m4
   2964    mova          [r3+9*16], m6
   2965    REPX      {pxor  x, m1}, m5, m7
   2966    REPX      {paddd x, m3}, m0, m5, m2, m7
   2967    REPX      {psrad x, 13}, m0, m5, m2, m7
   2968    packssdw             m0, m5
   2969    packssdw             m2, m7
   2970    mova                 m4, [r3+0*16]
   2971    mova                 m5, [r3+1*16]
   2972    mova                 m6, [r3+2*16]
   2973    mova                 m7, [r3+3*16]
   2974    REPX      {psubd x, m1}, m4, m6
   2975    REPX      {pxor  x, m1}, m5, m7
   2976    REPX      {psrad x, 1 }, m4, m5, m6, m7
   2977    REPX      {psubd x, m1}, m5, m7
   2978    packssdw             m4, m5
   2979    packssdw             m6, m7
   2980 %endif
   2981    ret
   2982 
   2983 .main_part2:
   2984 %if ARCH_X86_64
   2985    ITX_MULSUB_2D         1, 0, 8, 9, 10, 11,  201, 4091
   2986    ITX_MULSUB_2D         3, 2, 8, 9, 10, 11, 1751, 3703
   2987    ITX_MULSUB_2D         5, 4, 8, 9, 10, 11, 3035, 2751
   2988    ITX_MULSUB_2D         7, 6, 8, 9, 10, 11, 3857, 1380
   2989    psubd                m8, m0, m4 ; t8a
   2990    paddd                m0, m4     ; t0a
   2991    psubd                m4, m1, m5 ; t9a
   2992    paddd                m1, m5     ; t1a
   2993    psubd                m5, m2, m6 ; t12a
   2994    paddd                m2, m6     ; t4a
   2995    psubd                m6, m3, m7 ; t13a
   2996    paddd                m7, m3     ; t5a
   2997    REPX    {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
   2998    REPX    {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
   2999    mova                m15, [o(pd_4017)]
   3000    mova                m10, [o(pd_799)]
   3001    ITX_MULSUB_2D         8, 4, 3, 9, _, 11, 10, 15
   3002    ITX_MULSUB_2D         6, 5, 3, 9, _, 11, 15, 10
   3003    psubd                m3, m0, m2 ; t4
   3004    paddd                m0, m2     ; t0
   3005    psubd                m2, m1, m7 ; t5
   3006    paddd                m1, m7     ; t1
   3007    psubd                m7, m4, m6 ; t12a
   3008    paddd                m4, m6     ; t8a
   3009    psubd                m6, m8, m5 ; t13a
   3010    paddd                m5, m8     ; t9a
   3011    REPX    {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
   3012    REPX    {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
   3013    mova                m15, [o(pd_3784)]
   3014    mova                m10, [o(pd_1567)]
   3015    ITX_MULSUB_2D         3, 2, 8, 9, _, 11, 10, 15
   3016    ITX_MULSUB_2D         7, 6, 8, 9, _, 11, 10, 15
   3017    mova                m10, [r3+0*16]      ;  t2
   3018    mova                 m8, [r3+1*16]      ;  t3
   3019    psubd                m9, m0, m10        ;  t2a
   3020    paddd                m0, m10            ;  out0
   3021    psubd               m10, m1, m8         ;  t3a
   3022    paddd                m1, m8             ; -out15
   3023    mova          [r3+0*16], m1
   3024    mova                m15, [r3+3*16]      ;  t7a
   3025    mova                 m1, [r3+2*16]      ;  t6a
   3026    psubd                m8, m3, m15        ;  t7
   3027    paddd               m15, m3             ;  out12
   3028    paddd                m3, m2, m1         ; -out3
   3029    psubd                m2, m1             ;  t6
   3030    mova          [r3+3*16], m15
   3031    mova          [r3+1*16], m2
   3032    mova                 m1, [r3+7*16]      ;  t15
   3033    mova                 m2, [r3+6*16]      ;  t14
   3034    paddd               m15, m7, m1         ; -out13
   3035    psubd                m7, m1             ;  t15a
   3036    psubd               m11, m6, m2         ;  t14a
   3037    paddd                m2, m6             ;  out2
   3038    mova          [r3+2*16], m15
   3039    mova                 m1, [r3+4*16]      ;  t10a
   3040    mova                m15, [r3+5*16]      ;  t11a
   3041    psubd                m6, m4, m1         ;  t10
   3042    paddd                m1, m4             ; -out1
   3043    psubd                m4, m5, m15        ;  t11
   3044    paddd                m5, m15            ;  out14
   3045    REPX    {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8
   3046    pmaxsd              m12, [r3+1*16]      ;  t6
   3047    mova          [r3+1*16], m5
   3048    REPX    {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8
   3049    REPX    {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8
   3050    paddd                m5, m11, m7        ; -out5  (unshifted)
   3051    psubd               m11, m7             ;  out10 (unshifted)
   3052    paddd                m7, m9, m10        ; -out7  (unshifted)
   3053    psubd                m9, m10            ;  out8  (unshifted)
   3054    psubd               m10, m6, m4         ; -out9  (unshifted)
   3055    paddd                m6, m4             ;  out6  (unshifted)
   3056    paddd                m4, m12, m8        ;  out4  (unshifted)
   3057    psubd               m12, m8             ; -out11 (unshifted)
   3058 %else
   3059    mova          [r3+8*16], m0
   3060    mova          [r3+9*16], m1
   3061    mova         [r3+10*16], m2
   3062    mova         [r3+11*16], m3
   3063    mova                 m3, [o(pd_2048)]
   3064    ITX_MULSUB_2D         5, 4, 0, 1, 2, 3, 3035, 2751
   3065    ITX_MULSUB_2D         7, 6, 0, 1, 2, 3, 3857, 1380
   3066    mova                 m0, [r3+8*16]
   3067    mova                 m1, [r3+9*16]
   3068    mova          [r3+8*16], m4
   3069    mova                 m4, [r3+10*16]
   3070    mova          [r3+9*16], m5
   3071    mova         [r3+10*16], m6
   3072    mova                 m5, [r3+11*16]
   3073    mova         [r3+11*16], m7
   3074    ITX_MULSUB_2D         1, 0, 2, 6, 7, 3,  201, 4091
   3075    ITX_MULSUB_2D         5, 4, 2, 6, 7, 3, 1751, 3703
   3076    mova                 m2, [r3+8*16]
   3077    mova                 m6, [r3+9*16]
   3078    psubd                m3, m0, m2 ; t8a
   3079    paddd                m0, m2     ; t0a
   3080    mova          [r3+8*16], m3
   3081    psubd                m2, m1, m6 ; t9a
   3082    paddd                m1, m6     ; t1a
   3083    mova                 m3, [r3+10*16]
   3084    psubd                m6, m4, m3 ; t12a
   3085    paddd                m4, m3     ; t4a
   3086    mova                 m3, [r3+11*16]
   3087    psubd                m7, m5, m3 ; t13a
   3088    paddd                m5, m3     ; t5a
   3089    mova                 m3, [o(clip_18b_min)]
   3090    REPX     {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5
   3091    pmaxsd               m3, [r3+8*16]
   3092    mova          [r3+8*16], m3
   3093    mova                 m3, [o(clip_18b_max)]
   3094    REPX     {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5
   3095    pminsd               m3, [r3+8*16]
   3096    mova          [r3+8*16], m3
   3097    psubd                m3, m0, m4 ; t4
   3098    paddd                m0, m4     ; t0
   3099    psubd                m4, m1, m5 ; t5
   3100    paddd                m1, m5     ; t1
   3101    mova                 m5, [o(pd_2048)]
   3102    mova          [r3+9*16], m1
   3103    mova         [r3+10*16], m4
   3104    mova         [r3+11*16], m3
   3105    mova                 m3, [r3+8*16]
   3106    mova          [r3+8*16], m0
   3107    ITX_MULSUB_2D         3, 2, 0, 1, 4, 5,  799, 4017
   3108    ITX_MULSUB_2D         7, 6, 0, 1, 4, 5, 4017,    4
   3109    psubd                m5, m2, m7 ; t12a
   3110    paddd                m2, m7     ; t8a
   3111    psubd                m7, m3, m6 ; t13a
   3112    paddd                m6, m3     ; t9a
   3113    mova                 m0, [r3+8*16]
   3114    mova                 m1, [r3+9*16]
   3115    mova                 m4, [r3+10*16]
   3116    mova                 m3, [o(clip_18b_min)]
   3117    REPX     {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6
   3118    pmaxsd               m3, [r3+11*16]
   3119    mova          [r3+8*16], m3
   3120    mova                 m3, [o(clip_18b_max)]
   3121    REPX     {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6
   3122    pminsd               m3, [r3+8*16]
   3123    mova          [r3+8*16], m0
   3124    mova          [r3+9*16], m1
   3125    mova         [r3+10*16], m2
   3126    mova         [r3+11*16], m6
   3127    mova                 m0, [o(pd_2048)]
   3128    ITX_MULSUB_2D         3, 4, 1, 2, 6, 0, 1567, 3784
   3129    ITX_MULSUB_2D         5, 7, 1, 2, 6, 0,    6, 3784
   3130    mova                 m0, [r3+7*16]      ;  t7a
   3131    mova                 m2, [r3+6*16]      ;  t6a
   3132    psubd                m1, m3, m0         ;  t7
   3133    paddd                m0, m3             ;  out12
   3134    paddd                m3, m4, m2         ; -out3
   3135    psubd                m4, m2             ;  t6
   3136    mova          [r3+7*16], m3
   3137    mova                 m3, [r3+3*16]      ;  t15
   3138    mova                 m2, [r3+2*16]      ;  t14
   3139    paddd                m6, m5, m3         ; -out13
   3140    psubd                m5, m3             ;  t15a
   3141    psubd                m3, m7, m2         ;  t14a
   3142    paddd                m2, m7             ;  out2
   3143    mova          [r3+6*16], m2
   3144    mova                 m7, [r3+0*16]      ;  t10a
   3145    mova                 m2, [r3+1*16]      ;  t11a
   3146    mova          [r3+0*16], m0
   3147    mova          [r3+1*16], m6
   3148    mova                 m6, [r3+11*16]
   3149    psubd                m0, m6, m2         ;  t11
   3150    paddd                m6, m2             ;  out14
   3151    mova          [r3+2*16], m6
   3152    mova                 m2, [r3+10*16]
   3153    psubd                m6, m2, m7         ;  t10
   3154    paddd                m2, m7             ; -out1
   3155    mova                 m7, [r3+5*16]      ;  t3
   3156    mova          [r3+5*16], m2
   3157    mova         [r3+10*16], m1
   3158    mova                 m1, [r3+9*16]
   3159    psubd                m2, m1, m7         ;  t3a
   3160    paddd                m1, m7             ; -out15
   3161    mova          [r3+3*16], m1
   3162    mova                 m1, [r3+4*16]      ;  t2
   3163    mova                 m7, [r3+8*16]
   3164    psubd                m7, m1             ;  t2a
   3165    paddd                m1, [r3+8*16]      ;  out0
   3166    mova          [r3+4*16], m1
   3167    mova                 m1, [o(clip_18b_min)]
   3168    REPX     {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7
   3169    pmaxsd               m1, [r3+10*16]
   3170    mova         [r3+10*16], m1
   3171    mova                 m1, [o(clip_18b_max)]
   3172    REPX     {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7
   3173    pminsd               m1, [r3+10*16]
   3174    mova         [r3+10*16], m1
   3175    mova                 m1, [o(pd_2896)]
   3176    REPX     {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7
   3177    pmulld               m1, [r3+10*16]
   3178    mova         [r3+11*16], m3
   3179    psubd                m3, m4, m1         ; -out11 (unshifted)
   3180    paddd                m4, m1             ;  out4  (unshifted)
   3181    psubd                m1, m6, m0         ; -out9  (unshifted)
   3182    paddd                m6, m0             ;  out6  (unshifted)
   3183    psubd                m0, m7, m2         ;  out8  (unshifted)
   3184    paddd                m7, m2             ; -out7  (unshifted)
   3185    mova                 m2, [r3+11*16]
   3186    mova         [r3+11*16], m5
   3187    paddd                m5, m2             ; -out5  (unshifted)
   3188    psubd                m2, [r3+11*16]     ;  out10 (unshifted)
   3189    ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted)
   3190    ; r[-4,3] contain out0-3 and out12-15
   3191 %endif
   3192    ret
   3193 .main_part1:
        ; ADST16 pass-1, first half: four initial coefficient rotations
        ; (ITX_MULSUB_2D) on the odd/even input pairs, then two rounds of
        ; butterflies + rotations, clipping intermediates to the 18-bit
        ; range (clip_18b_min/max) after each butterfly, per the AV1 spec's
        ; intermediate-precision limits.  Results are stored to stack slots
        ; [r3+0*16]..[r3+7*16] for .main_part2 to consume.
        ; x86-64: caller is expected to have preloaded m11=pd_2048 (round),
        ; m12=clip_18b_min, m13=clip_18b_max (see iadst_16x8 .main).
        ; x86-32: constants are reloaded locally and data is spilled through
        ; the [r3+N*16] scratch slots since only 8 xmm regs are available.
   3194 %if ARCH_X86_64
   3195    ITX_MULSUB_2D         1, 0, 8, 9, 10, 11,  995, 3973
   3196    ITX_MULSUB_2D         3, 2, 8, 9, 10, 11, 2440, 3290
   3197    ITX_MULSUB_2D         5, 4, 8, 9, 10, 11, 3513, 2106
   3198    ITX_MULSUB_2D         7, 6, 8, 9, 10, 11, 4052,  601
   3199    psubd                m8, m0, m4 ; t10a
   3200    paddd                m0, m4     ; t2a
   3201    psubd                m4, m1, m5 ; t11a
   3202    paddd                m1, m5     ; t3a
   3203    psubd                m5, m2, m6 ; t14a
   3204    paddd                m2, m6     ; t6a
   3205    psubd                m6, m3, m7 ; t15a
   3206    paddd                m7, m3     ; t7a
   3207    REPX    {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
   3208    REPX    {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
   3209    mova                m15, [o(pd_2276)]
   3210    mova                m10, [o(pd_3406)]
   3211    ITX_MULSUB_2D         8, 4, 3, 9, _, 11, 10, 15
   3212    ITX_MULSUB_2D         6, 5, 3, 9, _, 11, 15, 10
   3213    psubd                m3, m0, m2 ; t6
   3214    paddd                m0, m2     ; t2
   3215    psubd                m2, m1, m7 ; t7
   3216    paddd                m1, m7     ; t3
   3217    psubd                m7, m4, m6 ; t14a
   3218    paddd                m4, m6     ; t10a
   3219    psubd                m6, m8, m5 ; t15a
   3220    paddd                m5, m8     ; t11a
   3221    REPX    {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
   3222    REPX    {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
   3223    mova                m15, [o(pd_1567)]
   3224    mova                m10, [o(pd_3784)]
   3225    ITX_MULSUB_2D         2, 3, 8, 9, _, 11, 10, 15
   3226    ITX_MULSUB_2D         6, 7, 8, 9, _, 11, 10, 15
   3227    mova          [r3+0*16], m0
   3228    mova          [r3+1*16], m1
   3229    mova          [r3+4*16], m4
   3230    mova          [r3+5*16], m5
   3231    mova          [r3+2*16], m2
   3232    mova          [r3+3*16], m3
   3233    mova          [r3+6*16], m6
   3234    mova          [r3+7*16], m7
   3235 %else
        ; x86-32 path: spill m0-m3 first so their registers can serve as
        ; temporaries for the first pair of rotations, then restore and
        ; rotate the remaining inputs.
   3236    mova          [r3+4*16], m0
   3237    mova          [r3+5*16], m1
   3238    mova          [r3+6*16], m2
   3239    mova          [r3+7*16], m3
   3240    mova                 m3, [o(pd_2048)]
   3241    ITX_MULSUB_2D         5, 4, 0, 1, 2, 3, 3513, 2106
   3242    ITX_MULSUB_2D         7, 6, 0, 1, 2, 3, 4052,  601
   3243    mova          [r3+0*16], m4
   3244    mova          [r3+1*16], m5
   3245    mova          [r3+2*16], m6
   3246    mova          [r3+3*16], m7
   3247    mova                 m0, [r3+4*16]
   3248    mova                 m1, [r3+5*16]
   3249    mova                 m2, [r3+6*16]
   3250    mova                 m7, [r3+7*16]
   3251    ITX_MULSUB_2D         1, 0, 4, 5, 6, 3,  995, 3973
   3252    ITX_MULSUB_2D         7, 2, 4, 5, 6, 3, 2440, 3290
   3253    mova                 m4, [r3+0*16]
   3254    mova                 m5, [r3+1*16]
   3255    psubd                m6, m0, m4 ; t10a
   3256    paddd                m0, m4     ; t2a
   3257    mova          [r3+4*16], m6
   3258    mova                 m6, [r3+2*16]
   3259    mova                 m3, [r3+3*16]
   3260    psubd                m4, m1, m5 ; t11a
   3261    paddd                m1, m5     ; t3a
   3262    psubd                m5, m2, m6 ; t14a
   3263    paddd                m2, m6     ; t6a
   3264    psubd                m6, m7, m3 ; t15a
   3265    paddd                m7, m3     ; t7a
   3266    mova                 m3, [o(clip_18b_min)]
   3267    REPX     {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7
   3268    pmaxsd               m3, [r3+4*16]
        ; the spilled t10a is clipped in-place through m3 (min then max),
        ; since no free register remains to hold it alongside the bound
   3269    mova          [r3+4*16], m3
   3270    mova                 m3, [o(clip_18b_max)]
   3271    REPX     {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7
   3272    pminsd               m3, [r3+4*16]
   3273    mova          [r3+4*16], m3
   3274    psubd                m3, m0, m2 ; t6
   3275    paddd                m0, m2     ; t2
   3276    psubd                m2, m1, m7 ; t7
   3277    paddd                m1, m7     ; t3
   3278    mova          [r3+5*16], m1
   3279    mova          [r3+6*16], m3
   3280    mova          [r3+7*16], m2
   3281    mova                 m1, [r3+4*16]
   3282    mova          [r3+4*16], m0
   3283    mova                 m3, [o(pd_2048)]
   3284    ITX_MULSUB_2D         1, 4, 0, 7, 2, 3, 3406, 2276
        ; NOTE(review): the bare trailing "2" below appears to name register
        ; m2, which the previous ITX_MULSUB_2D call left holding the 3406
        ; broadcast — confirm against the ITX_MULSUB_2D macro definition.
   3285    ITX_MULSUB_2D         6, 5, 0, 7, 2, 3, 2276,    2
   3286    psubd                m7, m4, m6 ; t14a
   3287    paddd                m4, m6     ; t10a
   3288    psubd                m6, m1, m5 ; t15a
   3289    paddd                m5, m1     ; t11a
   3290    mova                 m1, [r3+5*16]
   3291    mova                 m3, [r3+6*16]
   3292    mova                 m2, [r3+7*16]
   3293    mova                 m0, [o(clip_18b_min)]
   3294    REPX     {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5
   3295    pmaxsd               m0, [r3+4*16]
   3296    mova          [r3+4*16], m0
   3297    mova                 m0, [o(clip_18b_max)]
   3298    REPX     {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5
   3299    pminsd               m0, [r3+4*16]
   3300    mova          [r3+4*16], m0
   3301    mova          [r3+5*16], m1
   3302    mova          [r3+0*16], m4
   3303    mova          [r3+1*16], m5
   3304    mova                 m0, [o(pd_2048)]
   3305    ITX_MULSUB_2D         2, 3, 1, 4, 5, 0, 3784, 1567
        ; NOTE(review): trailing "5" likewise appears to reuse m5 = 3784
        ; from the previous call — confirm against the macro.
   3306    ITX_MULSUB_2D         6, 7, 1, 4, 5, 0,    5, 1567
   3307    mova          [r3+6*16], m2
   3308    mova          [r3+7*16], m3
   3309    mova          [r3+2*16], m6
   3310    mova          [r3+3*16], m7
   3311 %endif
   3312    ret
   3313 
   3314 .pass2:
        ; Pass 2 (rows): reuse the 8bpc SSSE3 8x4 ADST kernel via the
        ; shared 16x4 pass-2 loop; r4 carries the kernel entry point.
   3315    lea                  r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
   3316    jmp m(idct_16x4_internal_16bpc).pass2_loop
   3317 
   3318 INV_TXFM_16X4_FN flipadst, dct
   3319 INV_TXFM_16X4_FN flipadst, adst
   3320 INV_TXFM_16X4_FN flipadst, flipadst
   3321 INV_TXFM_16X4_FN flipadst, identity
   3322 
        ; flipadst = adst with the output order reversed.  Pass 1 runs the
        ; plain 16x4 ADST, then reverses the vector ordering before the
        ; shared transpose.  Pass 2 writes the rows bottom-up by negating
        ; the stride.
   3323 cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   3324    lea                  r3, [rsp+gprsize]
   3325    call m(iadst_16x4_internal_16bpc).main
   3326 %if ARCH_X86_64
        ; pack dwords to words with swapped operand order (m1,m0 etc.),
        ; then mirror the register sequence m0<->m15, m2<->m13, ...
   3327    packssdw             m1, m0
   3328    packssdw             m3, m2
   3329    packssdw             m5, m4
   3330    packssdw             m7, m6
   3331    packssdw             m9, m8
   3332    packssdw            m11, m10
   3333    packssdw            m13, m12
   3334    packssdw            m15, m14
   3335    mova                 m0, m15
   3336    mova                 m2, m13
   3337    mova                 m4, m11
   3338    mova                 m6, m9
   3339    mova                 m8, m7
   3340    mova                m10, m5
   3341    mova                m12, m3
   3342    mova                m14, m1
   3343    jmp m(idct_16x4_internal_16bpc).transpose
   3344 %else
        ; x86-32: swap halves in stack slots and flip within each register
        ; via the q1032 dword shuffle, transposing 8 columns at a time.
   3345    mova [rsp+gprsize+4*16], m0
   3346    mova [rsp+gprsize+5*16], m2
   3347    mova [rsp+gprsize+6*16], m4
   3348    mova [rsp+gprsize+7*16], m6
   3349    pshufd               m6, [rsp+gprsize+ 8*16], q1032
   3350    pshufd               m4, [rsp+gprsize+ 9*16], q1032
   3351    pshufd               m2, [rsp+gprsize+10*16], q1032
   3352    pshufd               m0, [rsp+gprsize+11*16], q1032
   3353    call m(idct_8x4_internal_16bpc).transpose4x8packed
   3354    mova [rsp+gprsize+0*16], m0
   3355    mova [rsp+gprsize+1*16], m1
   3356    mova [rsp+gprsize+2*16], m2
   3357    mova [rsp+gprsize+3*16], m3
   3358    pshufd               m6, [rsp+gprsize+ 4*16], q1032
   3359    pshufd               m4, [rsp+gprsize+ 5*16], q1032
   3360    pshufd               m2, [rsp+gprsize+ 6*16], q1032
   3361    pshufd               m0, [rsp+gprsize+ 7*16], q1032
   3362    call m(idct_8x4_internal_16bpc).transpose4x8packed
   3363    jmp                tx2q
   3364 %endif
   3365 
   3366 .pass2:
        ; point dstq at the last row (dst + 3*stride) and walk upwards
        ; with a negated stride, so the regular ADST row kernel produces
        ; vertically flipped output
   3367    lea                  r3, [strideq*3]
   3368    lea                dstq, [dstq+r3]
   3369    neg             strideq
   3370    lea                  r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
   3371    jmp m(idct_16x4_internal_16bpc).pass2_loop
   3372 
   3373 INV_TXFM_16X4_FN identity, dct
   3374 INV_TXFM_16X4_FN identity, adst
   3375 INV_TXFM_16X4_FN identity, flipadst
   3376 INV_TXFM_16X4_FN identity, identity
   3377 
        ; Identity 16x4, pass 1: scale every coefficient by 11586
        ; (= 2*5793 ~= 2*sqrt(2)*4096, the identity16 scaling), round with
        ; pd_6144 and arithmetic-shift right by 13, then pack + transpose.
   3378 cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   3379 %if ARCH_X86_64
   3380    mova                m15, [o(pd_11586)]
   3381    pmulld               m0, m15, [cq+ 0*16]
   3382    pmulld               m1, m15, [cq+ 1*16]
   3383    pmulld               m2, m15, [cq+ 2*16]
   3384    pmulld               m3, m15, [cq+ 3*16]
   3385    pmulld               m4, m15, [cq+ 4*16]
   3386    pmulld               m5, m15, [cq+ 5*16]
   3387    pmulld               m6, m15, [cq+ 6*16]
   3388    pmulld               m7, m15, [cq+ 7*16]
   3389    pmulld               m8, m15, [cq+ 8*16]
   3390    pmulld               m9, m15, [cq+ 9*16]
   3391    pmulld              m10, m15, [cq+10*16]
   3392    pmulld              m11, m15, [cq+11*16]
   3393    pmulld              m12, m15, [cq+12*16]
   3394    pmulld              m13, m15, [cq+13*16]
   3395    pmulld              m14, m15, [cq+14*16]
   3396    pmulld              m15, [cq+15*16]
        ; m15 doubles as constant and data; park its product in cq[0]
        ; (about to be overwritten anyway) while m15 holds the rounder
   3397    mova         [cq+ 0*16], m15
   3398    mova                m15, [o(pd_6144)]
   3399    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
   3400                         m8, m9, m10, m11, m12, m13, m14
   3401    paddd               m15, [cq+ 0*16]
   3402    REPX     {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
   3403                         m8, m9, m10, m11, m12, m13, m14, m15
   3404    jmp m(idct_16x4_internal_16bpc).pack_transpose
   3405 %else
        ; x86-32: two iterations of 8 vectors each; the upper half
        ; (cq+8*16) is processed first, then cq is rewound for the lower
   3406    add                  cq, 8*16
   3407    mov                 r5d, 2
   3408 .loop_pass1:
   3409    mova                 m7, [o(pd_11586)]
   3410    pmulld               m0, m7, [cq+0*16]
   3411    pmulld               m1, m7, [cq+1*16]
   3412    pmulld               m2, m7, [cq+2*16]
   3413    pmulld               m3, m7, [cq+3*16]
   3414    pmulld               m4, m7, [cq+4*16]
   3415    pmulld               m5, m7, [cq+5*16]
   3416    pmulld               m6, m7, [cq+6*16]
   3417    pmulld               m7, [cq+7*16]
   3418    mova          [cq+7*16], m7
   3419    mova                 m7, [o(pd_6144)]
   3420    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
   3421    paddd                m7, [cq+7*16]
   3422    REPX      {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
   3423    packssdw             m0, m1
   3424    packssdw             m2, m3
   3425    packssdw             m4, m5
   3426    packssdw             m6, m7
   3427    call m(idct_8x4_internal_16bpc).transpose4x8packed
   3428    dec                 r5d
   3429    jz .end_pass1
   3430    mova [rsp+gprsize+0*16], m0
   3431    mova [rsp+gprsize+1*16], m1
   3432    mova [rsp+gprsize+2*16], m2
   3433    mova [rsp+gprsize+3*16], m3
   3434    sub                  cq, 8*16
   3435    jmp .loop_pass1
   3436 .end_pass1:
   3437    jmp                tx2q
   3438 %endif
   3439 
   3440 .pass2:
   3441 %if ARCH_X86_64
   3442    mova                m12, [o(pw_1697x8)]
   3443 %endif
   3444    lea                  r4, [o(.main)]
   3445    jmp m(idct_16x4_internal_16bpc).pass2_loop
   3446 .main:
        ; Row identity: out = x + pmulhrsw(x, 1697*8), i.e. scale by
        ; 1 + 1697*8/32768 ~= sqrt(2), with saturating word adds.
   3447 %if ARCH_X86_64
   3448    pmulhrsw             m4, m0, m12
   3449    pmulhrsw             m5, m1, m12
   3450    pmulhrsw             m6, m2, m12
   3451    pmulhrsw             m7, m3, m12
   3452 %else
   3453    mova                 m7, [o(pw_1697x8)]
   3454    pmulhrsw             m4, m0, m7
   3455    pmulhrsw             m5, m1, m7
   3456    pmulhrsw             m6, m2, m7
   3457    pmulhrsw             m7, m3
   3458 %endif
   3459    paddsw               m0, m4
   3460    paddsw               m1, m5
   3461    paddsw               m2, m6
   3462    paddsw               m3, m7
   3463    ret
   3464 
        ; Entry-point generator for 16x8 inverse transforms.
        ; %1/%2 = column/row transform type, %3 = eob threshold offset.
        ; Stack scratch: 8*16 bytes on x86-64, 13*16 on x86-32 (extra
        ; spill slots + saved stride).  dct_dct gets a DC-only fast path:
        ; the DC coefficient is scaled by 181/256 (~= 1/sqrt(2), the
        ; rect2 correction for the non-square block) and by 181 again
        ; before jumping to the shared 16x4 dconly tail.
   3465 %macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
   3466 %if ARCH_X86_64
   3467    INV_TXFM_FN          %1, %2, %3, 16x8, 16, 0-8*16
   3468 %else
   3469    INV_TXFM_FN          %1, %2, %3, 16x8, 8, 0-13*16
   3470 %endif
   3471 %ifidn %1_%2, dct_dct
   3472    imul                r5d, [cq], 181
   3473    mov                [cq], eobd ; 0
   3474    mov                 r3d, 8
   3475    add                 r5d, 128
   3476    sar                 r5d, 8
   3477    imul                r5d, 181
   3478 %if ARCH_X86_32
   3479    add                 rsp, 1*16
   3480 %endif
   3481    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
   3482 %endif
   3483 %endmacro
   3484 
   3485 INV_TXFM_16X8_FN dct, dct
   3486 INV_TXFM_16X8_FN dct, identity, 6
   3487 INV_TXFM_16X8_FN dct, adst
   3488 INV_TXFM_16X8_FN dct, flipadst
   3489 
        ; 16x8 column DCT.  .loop_main is shared by the adst/flipadst/
        ; identity variants, which only swap the kernel pointer in t0.
        ; Pass 1 runs the 16-point column transform in (up to) two 4-row
        ; chunks: r5 = 16 selects the second chunk, taken first when
        ; eobd >= 10, then r5 = 0 for the first chunk.  Transposed results
        ; are stored back into cq; coefficient rows 8-15 are zeroed.
   3490 cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   3491 %if ARCH_X86_64
   3492    DECLARE_REG_TMP 6, 4, 6
   3493 %else
        ; x86-32 reuses r1 (stride) as a temp; save it for pass 2
   3494    mov [rsp+gprsize+12*16], r1
   3495    DECLARE_REG_TMP 1, 4, 3
   3496 %endif
   3497    lea                  t0, [o(.main)]
   3498 .loop_main:
   3499 %undef cmp
   3500 %if ARCH_X86_64
        ; r5d = (eobd >= 10) ? 1 : 0, branchlessly via setge
   3501    xor                 r5d, r5d
   3502    cmp                eobd, 10
   3503    setge               r5b
   3504 %else
        ; same predicate via sbb (no setcc register constraints on x86-32)
   3505    mov                 r5d, 1
   3506    cmp                eobd, 10
   3507    sbb                 r5d, 0
   3508 %endif
   3509    shl                 r5d, 4
   3510 
   3511    lea                  r3, [rsp+gprsize]
   3512 .loop_pass1:
   3513    call                 t0
   3514 %if ARCH_X86_64
   3515    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   3516    mova       [cq+4*32+r5], m8
   3517    mova       [cq+5*32+r5], m9
   3518    mova       [cq+6*32+r5], m10
   3519    mova       [cq+7*32+r5], m11
   3520 %else
   3521    call m(idct_8x4_internal_16bpc).transpose4x8packed
   3522    mova       [cq+4*32+r5], m0
   3523    mova       [cq+5*32+r5], m1
   3524    mova       [cq+6*32+r5], m2
   3525    mova       [cq+7*32+r5], m3
   3526    mova                 m0, [rsp+gprsize+ 8*16]
   3527    mova                 m2, [rsp+gprsize+ 9*16]
   3528    mova                 m4, [rsp+gprsize+10*16]
   3529    mova                 m6, [rsp+gprsize+11*16]
   3530 %endif
   3531    call m(idct_8x4_internal_16bpc).transpose4x8packed
        ; clear consumed coefficient rows 8-15 for this chunk
   3532    pxor                 m7, m7
   3533    REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15
   3534    test                r5d, r5d
   3535    jz .end
        ; second (high) chunk done: store its rows 0-3 and redo with r5=0
   3536    mova       [cq+0*32+r5], m0
   3537    mova       [cq+1*32+r5], m1
   3538    mova       [cq+2*32+r5], m2
   3539    mova       [cq+3*32+r5], m3
   3540    xor                 r5d, r5d
   3541    jmp .loop_pass1
   3542 .end:
   3543 
   3544    jmp                tx2q
   3545 .main:
        ; One 4-row chunk of the 16-point column DCT: odd coefficient
        ; rows feed main_oddhalf, even rows the 8-point main pass, both
        ; after rect2 (sqrt(2)) prescaling for the non-square block.
   3546 %if ARCH_X86_64
   3547    mova                m11, [o(pd_2048)]
   3548    mova                m12, [o(clip_18b_min)]
   3549    mova                m13, [o(clip_18b_max)]
   3550    mova                m14, [o(pd_2896)]
   3551 %endif
   3552    mova                 m0, [cq+ 1*32+r5]
   3553    mova                 m1, [cq+ 3*32+r5]
   3554    mova                 m2, [cq+ 5*32+r5]
   3555    mova                 m3, [cq+ 7*32+r5]
   3556    mova                 m4, [cq+ 9*32+r5]
   3557    mova                 m5, [cq+11*32+r5]
   3558    mova                 m6, [cq+13*32+r5]
   3559    mova                 m7, [cq+15*32+r5]
   3560    call m(idct_8x4_internal_16bpc).rect2_mul
   3561    call m(idct_16x4_internal_16bpc).main_oddhalf
   3562 
   3563    mova                 m0, [cq+ 0*32+r5]
   3564    mova                 m1, [cq+ 2*32+r5]
   3565    mova                 m2, [cq+ 4*32+r5]
   3566    mova                 m3, [cq+ 6*32+r5]
   3567    mova                 m4, [cq+ 8*32+r5]
   3568    mova                 m5, [cq+10*32+r5]
   3569    mova                 m6, [cq+12*32+r5]
   3570    mova                 m7, [cq+14*32+r5]
   3571    call m(idct_8x4_internal_16bpc).rect2_mul
   3572    call m(idct_8x4_internal_16bpc).main_pass1
   3573    call m(idct_8x4_internal_16bpc).round
   3574    call m(idct_16x4_internal_16bpc).round
   3575 %if ARCH_X86_64
   3576    packssdw             m0, m1
   3577    packssdw             m2, m3
   3578    packssdw             m4, m5
   3579    packssdw             m6, m7
   3580    packssdw             m8, m9
   3581    packssdw            m10, m11
   3582    packssdw            m12, m13
   3583    packssdw            m14, m15
   3584 %endif
   3585    ret
   3586 
   3587 .pass2:
        ; Pass 2 (rows): two iterations, one per 8-column half.  Each
        ; iteration runs the 8bpc SSSE3 8x8 DCT on packed words, rounds,
        ; writes 8x8 pixels, zeroes the consumed coefficients, and steps
        ; dstq by 16 bytes (8 pixels at 2 bytes each).
   3588 %if ARCH_X86_32
   3589    mov             strideq, [rsp+gprsize+12*16]
   3590 %endif
   3591    mov                 r4d, 2
   3592 .pass2_main:
   3593 %if ARCH_X86_64
   3594    mova                 m8, [o(pw_2048)]
   3595    pxor                 m9, m9
   3596    mova                m10, [o(pixel_10bpc_max)]
   3597 %endif
   3598    lea                  r3, [strideq*3]
   3599    jmp .loop_pass2_entry
   3600 .loop_pass2:
   3601    mova                 m0, [cq+0*32+ 0]
   3602    mova                 m1, [cq+1*32+ 0]
   3603    mova                 m2, [cq+2*32+ 0]
   3604    mova                 m3, [cq+3*32+ 0]
   3605 .loop_pass2_entry:
        ; first iteration enters here: m0-m3 still hold pass-1 output
   3606    mova                 m4, [cq+0*32+16]
   3607    mova                 m5, [cq+1*32+16]
   3608    mova                 m6, [cq+2*32+16]
   3609    mova                 m7, [cq+3*32+16]
   3610 %if ARCH_X86_32
   3611    lea                  r5, [o(itx8_start)]
   3612 %endif
   3613    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
   3614    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
   3615 %if ARCH_X86_64
   3616 %define mzero m9
   3617 %else
   3618 %define mzero m7
   3619    pxor                 m7, m7
   3620 %endif
   3621    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
   3622    add                dstq, 16
   3623    add                  cq, 4*32
   3624    dec                 r4d
   3625    jg .loop_pass2
   3626    RET
   3627 
   3628 INV_TXFM_16X8_FN adst, dct
   3629 INV_TXFM_16X8_FN adst, adst
   3630 INV_TXFM_16X8_FN adst, flipadst
   3631 INV_TXFM_16X8_FN adst, identity, 6
   3632 
        ; 16x8 column ADST: reuses the idct_16x8 pass-1 driver with .main
        ; swapped in via t0.
   3633 cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   3634 %if ARCH_X86_32
   3635    mov [rsp+gprsize+12*16], r1
   3636 %endif
   3637    lea                  t0, [o(.main)]
   3638    jmp m(idct_16x8_internal_16bpc).loop_main
   3639 
   3640 .main:
        ; One 4-row chunk of the 16-point column ADST: rect2-prescale two
        ; interleaved groups of 8 coefficient rows and feed them to the
        ; shared 16x4 ADST main_part1/main_part2 + rounding.
   3641 %if ARCH_X86_64
   3642    mova                m11, [o(pd_2048)]
   3643    mova                m12, [o(clip_18b_min)]
   3644    mova                m13, [o(clip_18b_max)]
   3645    mova                m14, [o(pd_2896)]
   3646 %endif
   3647    mova                 m0, [cq+ 2*32+r5]
   3648    mova                 m1, [cq+13*32+r5]
   3649    mova                 m2, [cq+ 6*32+r5]
   3650    mova                 m3, [cq+ 9*32+r5]
   3651    mova                 m4, [cq+10*32+r5]
   3652    mova                 m5, [cq+ 5*32+r5]
   3653    mova                 m6, [cq+14*32+r5]
   3654    mova                 m7, [cq+ 1*32+r5]
   3655    call m(idct_8x4_internal_16bpc).rect2_mul
   3656    call m(iadst_16x4_internal_16bpc).main_part1
   3657    mova                 m0, [cq+ 0*32+r5]
   3658    mova                 m1, [cq+15*32+r5]
   3659    mova                 m2, [cq+ 4*32+r5]
   3660    mova                 m3, [cq+11*32+r5]
   3661    mova                 m4, [cq+ 8*32+r5]
   3662    mova                 m5, [cq+ 7*32+r5]
   3663    mova                 m6, [cq+12*32+r5]
   3664    mova                 m7, [cq+ 3*32+r5]
   3665 %if ARCH_X86_32
        ; rect2_mul's x86-32 path uses [r3] scratch; offset past the
        ; main_part1 results so they are not clobbered
   3666    add                  r3, 8*16
   3667 %endif
   3668    call m(idct_8x4_internal_16bpc).rect2_mul
   3669 %if ARCH_X86_32
   3670    sub                  r3, 8*16
   3671 %endif
   3672    call m(iadst_16x4_internal_16bpc).main_part2
   3673    call m(iadst_16x4_internal_16bpc).round
   3674 %if ARCH_X86_64
   3675    packssdw             m0, m1
   3676    packssdw             m2, m3
   3677    packssdw             m4, m5
   3678    packssdw             m6, m7
   3679    packssdw             m8, m9
   3680    packssdw            m10, m11
   3681    packssdw            m12, m13
   3682    packssdw            m14, m15
   3683 %endif
   3684    ret
   3685 
   3686 .pass2:
        ; Pass 2 (rows): two 8-column halves through the 8bpc SSSE3 8x8
        ; ADST kernel; pw_m2048 (m11) is the negated rounder used by the
        ; ADST round/write helper.
   3687 %if ARCH_X86_32
   3688    mov             strideq, [rsp+gprsize+12*16]
   3689 %endif
   3690    mov                 r4d, 2
   3691 %if ARCH_X86_64
   3692    mova                 m8, [o(pw_2048)]
   3693    pxor                 m9, m9
   3694    mova                m10, [o(pixel_10bpc_max)]
   3695    mova                m11, [o(pw_m2048)]
   3696 %endif
   3697    lea                  r3, [strideq*3]
   3698    jmp .loop_pass2_entry
   3699 .loop_pass2:
   3700    mova                 m0, [cq+0*32+ 0]
   3701    mova                 m1, [cq+1*32+ 0]
   3702    mova                 m2, [cq+2*32+ 0]
   3703    mova                 m3, [cq+3*32+ 0]
   3704 .loop_pass2_entry:
   3705    mova                 m4, [cq+0*32+16]
   3706    mova                 m5, [cq+1*32+16]
   3707    mova                 m6, [cq+2*32+16]
   3708    mova                 m7, [cq+3*32+16]
   3709 %if ARCH_X86_32
   3710    lea                  r5, [o(itx8_start)]
   3711 %endif
   3712    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
   3713    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
   3714    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
   3715 %if ARCH_X86_64
   3716 %define mzero m9
   3717 %else
   3718 %define mzero m7
   3719    pxor                 m7, m7
   3720 %endif
   3721    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
   3722    add                dstq, 16
   3723    add                  cq, 4*32
   3724    dec                 r4d
   3725    jg .loop_pass2
   3726    RET
   3727 
   3728 INV_TXFM_16X8_FN flipadst, dct
   3729 INV_TXFM_16X8_FN flipadst, adst
   3730 INV_TXFM_16X8_FN flipadst, flipadst
   3731 INV_TXFM_16X8_FN flipadst, identity, 6
   3732 
        ; 16x8 column flipadst: ADST with reversed output ordering.
   3733 cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   3734 %if ARCH_X86_32
   3735    mov [rsp+gprsize+12*16], r1
   3736 %endif
   3737    lea                  t0, [o(.main)]
   3738    jmp m(idct_16x8_internal_16bpc).loop_main
   3739 .main:
        ; Run the plain ADST chunk, then reverse: swap the register order
        ; (m0<->m14, m2<->m12, ...) and flip within each register via the
        ; q1032 dword shuffle.
   3740    call m(iadst_16x8_internal_16bpc).main
   3741 %if ARCH_X86_64
   3742    pshufd               m1, m0, q1032
   3743    pshufd               m3, m2, q1032
   3744    pshufd               m5, m4, q1032
   3745    pshufd               m7, m6, q1032
   3746    pshufd               m0, m14, q1032
   3747    pshufd               m2, m12, q1032
   3748    pshufd               m4, m10, q1032
   3749    pshufd               m6, m8, q1032
   3750    mova                m14, m1
   3751    mova                m12, m3
   3752    mova                m10, m5
   3753    mova                 m8, m7
   3754 %else
        ; x86-32: the upper half lives in [r3+8..11*16]; exchange it with
        ; the register half while applying the same q1032 flip
   3755    pshufd               m1, m0, q1032
   3756    pshufd               m3, m2, q1032
   3757    pshufd               m5, m4, q1032
   3758    pshufd               m7, m6, q1032
   3759    pshufd               m0, [r3+11*16], q1032
   3760    pshufd               m2, [r3+10*16], q1032
   3761    pshufd               m4, [r3+9*16], q1032
   3762    pshufd               m6, [r3+8*16], q1032
   3763    mova          [r3+8*16], m7
   3764    mova          [r3+9*16], m5
   3765    mova         [r3+10*16], m3
   3766    mova         [r3+11*16], m1
   3767 %endif
   3768    ret
   3769 
   3770 .pass2:
        ; Write rows bottom-up: dst += 8*stride - stride points at the
        ; last row, and the stride is negated before reusing the ADST
        ; pass-2 loop.
   3771 %if ARCH_X86_32
   3772    mov             strideq, [rsp+gprsize+12*16]
   3773 %endif
   3774    lea                dstq, [dstq+strideq*8]
   3775    neg             strideq
   3776    add                dstq, strideq
   3777 %if ARCH_X86_32
        ; persist the negated stride so iadst's pass2 reload sees it
   3778    mov [rsp+gprsize+12*16], strideq
   3779 %endif
   3780    jmp m(iadst_16x8_internal_16bpc).pass2
   3781 
   3782 INV_TXFM_16X8_FN identity, dct, -54
   3783 INV_TXFM_16X8_FN identity, adst, -54
   3784 INV_TXFM_16X8_FN identity, flipadst, -54
   3785 INV_TXFM_16X8_FN identity, identity
   3786 
        ; 16x8 column identity: reuses the idct_16x8 pass-1 driver.
   3787 cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
   3788 %if ARCH_X86_32
   3789    mov [rsp+gprsize+12*16], r1
   3790 %endif
   3791    lea                  t0, [o(.main)]
   3792    jmp m(idct_16x8_internal_16bpc).loop_main
   3793 .main:
        ; Rect2 prescale (x2896, +2048, >>12) followed by the identity16
        ; scale (x11586 ~= 2*sqrt(2)*4096, +6144, >>13), then pack to
        ; words.  x86-64 inlines the rect2 multiply; x86-32 calls the
        ; shared rect2_mul helper and works in two 8-vector groups.
   3794 %if ARCH_X86_64
   3795    mova                m15, [o(pd_2896)]
   3796    pmulld               m0, m15, [cq+ 0*32+r5]
   3797    pmulld               m1, m15, [cq+ 1*32+r5]
   3798    pmulld               m2, m15, [cq+ 2*32+r5]
   3799    pmulld               m3, m15, [cq+ 3*32+r5]
   3800    pmulld               m4, m15, [cq+ 4*32+r5]
   3801    pmulld               m5, m15, [cq+ 5*32+r5]
   3802    pmulld               m6, m15, [cq+ 6*32+r5]
   3803    pmulld               m7, m15, [cq+ 7*32+r5]
   3804    pmulld               m8, m15, [cq+ 8*32+r5]
   3805    pmulld               m9, m15, [cq+ 9*32+r5]
   3806    pmulld              m10, m15, [cq+10*32+r5]
   3807    pmulld              m11, m15, [cq+11*32+r5]
   3808    pmulld              m12, m15, [cq+12*32+r5]
   3809    pmulld              m13, m15, [cq+13*32+r5]
   3810    pmulld              m14, m15, [cq+14*32+r5]
   3811    pmulld              m15, [cq+15*32+r5]
        ; m15 alternates between constant and data; [r3] is the one
        ; spill slot used to juggle the 16th vector through each stage
   3812    mova               [r3], m15
   3813    mova                m15, [o(pd_2048)]
   3814    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
   3815                         m8, m9, m10, m11, m12, m13, m14
   3816    paddd               m15, [r3]
   3817    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
   3818                         m8, m9, m10, m11, m12, m13, m14, m15
   3819    mova               [r3], m15
   3820    mova                m15, [o(pd_11586)]
   3821    REPX    {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
   3822                         m8, m9, m10, m11, m12, m13, m14
   3823    pmulld              m15, [r3]
   3824    mova               [r3], m15
   3825    mova                m15, [o(pd_6144)]
   3826    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
   3827                         m8, m9, m10, m11, m12, m13, m14
   3828    paddd               m15, [r3]
   3829    REPX     {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
   3830                         m8, m9, m10, m11, m12, m13, m14, m15
   3831    packssdw             m0, m1
   3832    packssdw             m2, m3
   3833    packssdw             m4, m5
   3834    packssdw             m6, m7
   3835    packssdw             m8, m9
   3836    packssdw            m10, m11
   3837    packssdw            m12, m13
   3838    packssdw            m14, m15
   3839 %else
        ; first group: coefficient rows 0-7
   3840    mova                 m0, [cq+ 0*32+r5]
   3841    mova                 m1, [cq+ 1*32+r5]
   3842    mova                 m2, [cq+ 2*32+r5]
   3843    mova                 m3, [cq+ 3*32+r5]
   3844    mova                 m4, [cq+ 4*32+r5]
   3845    mova                 m5, [cq+ 5*32+r5]
   3846    mova                 m6, [cq+ 6*32+r5]
   3847    mova                 m7, [cq+ 7*32+r5]
   3848    call m(idct_8x4_internal_16bpc).rect2_mul
   3849    mova               [r3], m7
   3850    mova                 m7, [o(pd_11586)]
   3851    REPX      {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
   3852    pmulld               m7, [r3]
   3853    mova               [r3], m7
   3854    mova                 m7, [o(pd_6144)]
   3855    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
   3856    paddd                m7, [r3]
   3857    REPX      {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
   3858    packssdw             m0, m1
   3859    packssdw             m2, m3
   3860    packssdw             m4, m5
   3861    packssdw             m6, m7
        ; park the first group's packed output in the high scratch slots
   3862    mova         [r3+ 8*16], m0
   3863    mova         [r3+ 9*16], m2
   3864    mova         [r3+10*16], m4
   3865    mova         [r3+11*16], m6
        ; second group: coefficient rows 8-15, same pipeline
   3866    mova                 m0, [cq+ 8*32+r5]
   3867    mova                 m1, [cq+ 9*32+r5]
   3868    mova                 m2, [cq+10*32+r5]
   3869    mova                 m3, [cq+11*32+r5]
   3870    mova                 m4, [cq+12*32+r5]
   3871    mova                 m5, [cq+13*32+r5]
   3872    mova                 m6, [cq+14*32+r5]
   3873    mova                 m7, [cq+15*32+r5]
   3874    call m(idct_8x4_internal_16bpc).rect2_mul
   3875    mova               [r3], m7
   3876    mova                 m7, [o(pd_11586)]
   3877    REPX      {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
   3878    pmulld               m7, [r3]
   3879    mova               [r3], m7
   3880    mova                 m7, [o(pd_6144)]
   3881    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
   3882    paddd                m7, [r3]
   3883    REPX      {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
   3884    packssdw             m0, m1
   3885    packssdw             m2, m3
   3886    packssdw             m4, m5
   3887    packssdw             m6, m7
   3888 %endif
   3889    ret
   3890 .pass2:
   3891 %if ARCH_X86_32
   3892    mov             strideq, [rsp+gprsize+12*16]
   3893 %endif
   3894    mov                 r4d, 2
   3895 %if ARCH_X86_64
   3896    mova                 m8, [o(pw_4096)]
   3897    pxor                 m9, m9
   3898    mova                m10, [o(pixel_10bpc_max)]
   3899 %endif
   3900    lea                  r3, [strideq*3]
   3901    jmp .loop_pass2_entry
   3902 .loop_pass2:
   3903    mova                 m0, [cq+0*32+ 0]
   3904    mova                 m1, [cq+1*32+ 0]
   3905    mova                 m2, [cq+2*32+ 0]
   3906    mova                 m3, [cq+3*32+ 0]
   3907 .loop_pass2_entry:
   3908    mova                 m4, [cq+0*32+16]
   3909    mova                 m5, [cq+1*32+16]
   3910    mova                 m6, [cq+2*32+16]
   3911    mova                 m7, [cq+3*32+16]
   3912 %if ARCH_X86_64
   3913    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
   3914 %else
   3915    mova      [rsp+gprsize], m7
   3916    mova                 m7, [o(pw_4096)]
   3917    call m(idct_8x8_internal_16bpc).round4_and_write_8x8
   3918 %endif
   3919 %if ARCH_X86_64
   3920 %define mzero m9
   3921 %else
   3922 %define mzero m7
   3923    pxor                 m7, m7
   3924 %endif
   3925    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
   3926    add                dstq, 16
   3927    add                  cq, 4*32
   3928    dec                 r4d
   3929    jg .loop_pass2
   3930    RET
   3931 
    3932 %macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
; Declares inv_txfm_add_<type1>_<type2>_16x16_16bpc via INV_TXFM_FN, using
; the tbl_16x16_<suffix> eob table (suffix defaults to "2d") and a
; 16*16-byte stack scratch area (17 slots on x86-32; one extra gpr-save
; slot on WIN64).
    3933 %if ARCH_X86_64
    3934    INV_TXFM_FN          %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16
    3935 %else
    3936    INV_TXFM_FN          %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
    3937 %endif
    3938 %ifidn %1_%2, dct_dct
; dct_dct DC-only shortcut: dc = (dc*181 + 640) >> 10, clear the stored DC
; (eobd is 0 on this path, per the inline annotation), then tail-jump into
; the shared 16-wide dconly2 handler with r3d = 16 rows. The rsp
; adjustment drops this function's scratch so the stack matches what
; dconly2 expects.
    3939    imul                r5d, [cq], 181
    3940    mov                [cq], eobd ; 0
    3941    mov                 r3d, 16
    3942    add                 r5d, 640
    3943    sar                 r5d, 10
    3944    add                 rsp, (5+ARCH_X86_64*3+WIN64)*16
    3945    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
    3946 %endif
    3947 %endmacro
   3948 
    3949 INV_TXFM_16X16_FN dct, dct       ; 16x16 entry points with a dct first pass
    3950 INV_TXFM_16X16_FN dct, identity, v ; "v" selects the tbl_16x16_v eob table
    3951 INV_TXFM_16X16_FN dct, adst
    3952 INV_TXFM_16X16_FN dct, flipadst
   3953 
    3954 cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; 16x16 inverse DCT, 16 bpc. Pass 1 runs the 1-D transform on 4-row strips
; of dword coefficients through .main/.round (dispatched via t0 so the
; iadst/iflipadst/iidentity variants can reuse .pass1_full), storing packed
; 16-bit results back into cq. Pass 2 finishes with the 8bpc ssse3
; idct kernels and writes the reconstructed pixels.
    3955 %if ARCH_X86_64
    3956    DECLARE_REG_TMP       6, 7
    3957 %if WIN64
; WIN64: r7 is callee-saved; preserve it in the slot above the scratch area.
    3958    mov [rsp+16*16+gprsize], r7
    3959 %endif
    3960 %elif ARCH_X86_32
    3961    DECLARE_REG_TMP       1, 6
; x86-32: stash r1 (stride) and r6 (pic pointer) so they can be reloaded
; after pass 1 reuses the registers.
    3962    mov [rsp+16*16+gprsize*1], r1
    3963    mov [rsp+16*16+gprsize*2], r6
    3964 %endif
    3965    lea                  t0, [o(.main)]
; Shared pass-1 driver; other 16x16 transforms jump here with their own
; .main address in t0.
    3966 .pass1_full:
    3967 %undef cmp
    3968    mov                 t1d, 4
; Scan the 4-entry eob threshold table at r5 backwards to find the last
; 4-row strip that contains coefficients; r5d becomes (strip index)*16,
; the byte offset used for all cq accesses below.
    3969 .zero_loop:
    3970    dec                 t1d
    3971    cmp                eobb, byte [r5+t1]
    3972    jb .zero_loop
    3973    mov                 r5d, t1d
    3974    shl                 r5d, 4
    3975 %if ARCH_X86_32
    3976    ; restore pic-ptr
    3977    mov                  r6, [rsp+16*16+2*gprsize]
    3978 %endif
    3979    ; setup stack pointer
    3980    lea                  r3, [rsp+gprsize]
; One iteration per strip, from the last non-empty strip down to offset 0.
    3981 .loop_pass1:
    3982    call                 t0
    3983 %if ARCH_X86_64
    3984    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
    3985    mova       [cq+4*64+r5], m8
    3986    mova       [cq+5*64+r5], m9
    3987    mova       [cq+6*64+r5], m10
    3988    mova       [cq+7*64+r5], m11
    3989 %else
    3990    call m(idct_8x4_internal_16bpc).transpose4x8packed
    3991    mova       [cq+4*64+r5], m0
    3992    mova       [cq+5*64+r5], m1
    3993    mova       [cq+6*64+r5], m2
    3994    mova       [cq+7*64+r5], m3
; x86-32: .round spilled the other output half at r3+8..11*16; reload it
; for the second transpose below.
    3995    mova                 m0, [rsp+gprsize+ 8*16]
    3996    mova                 m2, [rsp+gprsize+ 9*16]
    3997    mova                 m4, [rsp+gprsize+10*16]
    3998    mova                 m6, [rsp+gprsize+11*16]
    3999 %endif
    4000    call m(idct_8x4_internal_16bpc).transpose4x8packed
    4001    mova       [cq+0*64+r5], m0
    4002    mova       [cq+1*64+r5], m1
    4003    mova       [cq+2*64+r5], m2
    4004    mova       [cq+3*64+r5], m3
; Zero rows 8-15 of this strip so pass 2 reads zeros there.
    4005    pxor                 m0, m0
    4006    REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
    4007    sub                 r5d, 16
    4008    jge .loop_pass1
    4009 
    4010 %if ARCH_X86_32
    4011    ; restore pic-ptr
    4012    mov                  r1, [rsp+16*16+1*gprsize]
    4013 %endif
    4014    jmp                tx2q
; 1-D 16-point DCT on one 4-column strip of dword coefficients at cq+r5.
    4015 .main:
    4016 %if ARCH_X86_64
; Preload the constants used by the helper routines called below.
    4017    mova                m11, [o(pd_2048)]
    4018    mova                m12, [o(clip_18b_min)]
    4019    mova                m13, [o(clip_18b_max)]
    4020    mova                m14, [o(pd_2896)]
    4021 %endif
    4022 
; Odd input rows 1,3,...,15 feed the odd half of the 16-point butterfly;
; main_oddhalf leaves its results in the r3 scratch area.
    4023    mova                 m0, [cq+ 1*64+r5]
    4024    mova                 m1, [cq+ 3*64+r5]
    4025    mova                 m2, [cq+ 5*64+r5]
    4026    mova                 m3, [cq+ 7*64+r5]
    4027    mova                 m4, [cq+ 9*64+r5]
    4028    mova                 m5, [cq+11*64+r5]
    4029    mova                 m6, [cq+13*64+r5]
    4030    mova                 m7, [cq+15*64+r5]
    4031    call m(idct_16x4_internal_16bpc).main_oddhalf
    4032 
; Even input rows 0,2,...,14 go through the shared 8-point dct.
    4033    mova                 m0, [cq+ 0*64+r5]
    4034    mova                 m1, [cq+ 2*64+r5]
    4035    mova                 m2, [cq+ 4*64+r5]
    4036    mova                 m3, [cq+ 6*64+r5]
    4037    mova                 m4, [cq+ 8*64+r5]
    4038    mova                 m5, [cq+10*64+r5]
    4039    mova                 m6, [cq+12*64+r5]
    4040    mova                 m7, [cq+14*64+r5]
    4041    call m(idct_8x4_internal_16bpc).main_pass1
    4042    call m(idct_8x4_internal_16bpc).round
    4043    call .round
    4044 %if ARCH_X86_64
; Pack the 16 dword outputs to words, two outputs per register
; (x86-32 packs inside .round instead).
    4045    packssdw             m0, m1
    4046    packssdw             m2, m3
    4047    packssdw             m4, m5
    4048    packssdw             m6, m7
    4049    packssdw             m8, m9
    4050    packssdw            m10, m11
    4051    packssdw            m12, m13
    4052    packssdw            m14, m15
    4053 %endif
    4054    ret
; Final butterfly stage: clamp the even half (m0-7) to 18-bit range, add
; the rounding bias, combine with the odd-half values spilled at r3, and
; shift down to pass-1 intermediate precision (>>2).
    4055 .round:
    4056 %if ARCH_X86_64
    4057    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
    4058    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    4059    psrld                m8, m11, 10        ; 2 (m11 = pd_2048)
    4060    REPX      {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    4061    mova                 m8, [r3+1*16]
    4062    mova                 m9, [r3+2*16]
    4063    mova                m10, [r3+3*16]
    4064    mova                m11, [r3+4*16]
    4065    mova                m12, [r3+5*16]
    4066    mova                m13, [r3+6*16]
    4067    mova                m14, [r3+7*16]
    4068    psubd               m15, m0, m14       ; out15
    4069    paddd                m0, m14           ; out0
    4070    psubd               m14, m1, m13       ; out14
    4071    paddd                m1, m13           ; out1
    4072    psubd               m13, m2, m12       ; out13
    4073    paddd                m2, m12           ; out2
    4074    psubd               m12, m3, m11       ; out12
    4075    paddd                m3, m11           ; out3
    4076    psubd               m11, m4, m10       ; out11
    4077    paddd                m4, m10           ; out4
    4078    psubd               m10, m5, m9        ; out10
    4079    paddd                m5, m9            ; out5
    4080    psubd                m9, m6, m8        ; out9
    4081    paddd                m6, m8            ; out6
    4082    psubd                m8, m7, [r3+0*16] ; out8
    4083    paddd                m7, [r3+0*16]     ; out7
    4084    REPX       {psrad x, 2}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
    4085                             m8,  m9,  m10, m11, m12, m13, m14, m15
    4086    ; and out0-15 is now in m0-15
    4087 %else
; x86-32: same computation with only 8 xmm regs, constantly spilling
; through [r3]. Outputs are additionally packed to word pairs here; the
; high half (out6-15) lands in r3+8..11*16, the rest stays in registers.
    4088    mova         [r3+ 0*16], m0
    4089    mova                 m0, [o(clip_18b_min)]
    4090    REPX     {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
    4091    pmaxsd               m0, [r3+ 0*16]
    4092    mova         [r3+ 0*16], m7
    4093    mova                 m7, [o(clip_18b_max)]
    4094    REPX     {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
    4095    pminsd               m7, [r3+ 0*16]
    4096    mova         [r3+ 0*16], m0
    4097    mova                 m0, [o(pd_2)]
    4098    REPX      {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
    4099    paddd                m0, [r3+ 0*16]
    4100    mova         [r3+ 0*16], m0
    4101    mova         [r3+ 1*16], m1
    4102    mova         [r3+ 2*16], m2
    4103    mova                 m1, [r3+11*16]
    4104    mova                 m2, [r3+10*16]
    4105    psubd                m0, m7, m1
    4106    paddd                m7, m1
    4107    psubd                m1, m6, m2
    4108    paddd                m6, m2
    4109    REPX       {psrad x, 2}, m0, m1, m6, m7
    4110    packssdw             m0, m1     ; out8-9
    4111    packssdw             m6, m7     ; out6-7
    4112    mova         [r3+11*16], m6
    4113    mova                 m1, [r3+9*16]
    4114    mova                 m7, [r3+8*16]
    4115    psubd                m2, m5, m1
    4116    paddd                m5, m1
    4117    psubd                m1, m4, m7
    4118    paddd                m4, m7
    4119    REPX       {psrad x, 2}, m2, m1, m4, m5
    4120    packssdw             m2, m1     ; out10-11
    4121    packssdw             m4, m5     ; out4-5
    4122    mova                 m1, [r3+2*16]
    4123    mova         [r3+10*16], m4
    4124    mova                 m6, [r3+7*16]
    4125    mova                 m7, [r3+6*16]
    4126    psubd                m4, m3, m6
    4127    paddd                m3, m6
    4128    psubd                m6, m1, m7
    4129    paddd                m1, m7
    4130    REPX       {psrad x, 2}, m4, m6, m1, m3
    4131    packssdw             m4, m6     ; out12-13
    4132    packssdw             m1, m3     ; out2-3
    4133    mova                 m3, [r3+1*16]
    4134    mova          [r3+9*16], m1
    4135    mova                 m1, [r3+0*16]
    4136    mova                 m5, [r3+5*16]
    4137    mova                 m7, [r3+4*16]
    4138    psubd                m6, m3, m5
    4139    paddd                m3, m5
    4140    psubd                m5, m1, m7
    4141    paddd                m1, m7
    4142    REPX       {psrad x, 2}, m6, m5, m1, m3
    4143    packssdw             m6, m5     ; out14-15
    4144    packssdw             m1, m3     ; out0-1
    4145    mova          [r3+8*16], m1
    4146 %endif
    4147    ret
    4148 
; Pass 2: two iterations, one per 8-column half. Each iteration runs the
; 8bpc ssse3 idct kernels on the packed word coefficients in cq, writes
; the bottom 8 rows first and then the top 8, and zeroes the consumed
; coefficients.
    4149 .pass2:
    4150 %if ARCH_X86_64
    4151    mova                 m8, [o(pw_2048)]
    4152    pxor                 m9, m9
    4153    mova                m10, [o(pixel_10bpc_max)]
    4154    mov                  r7, dstq
    4155 %else
    4156    mov [rsp+2*gprsize+16*16], dstq
    4157 %endif
    4158    lea                  r3, [strideq*3]
    4159    mov                 r4d, 2
    4160 .loop_pass2:
    4161 %if ARCH_X86_32
    4162    lea                  r5, [o(itx8_start)]
    4163 %endif
    4164    mova                 m0, [cq+0*64+ 0]
    4165    mova                 m1, [cq+2*64+ 0]
    4166    mova                 m2, [cq+0*64+16]
    4167    mova                 m3, [cq+2*64+16]
    4168    mova                 m4, [cq+0*64+32]
    4169    mova                 m5, [cq+2*64+32]
    4170    mova                 m6, [cq+0*64+48]
    4171    mova                 m7, [cq+2*64+48]
    4172    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
    4173    mova [rsp+gprsize+3*16], m0
    4174    mova [rsp+gprsize+4*16], m1
    4175    mova [rsp+gprsize+5*16], m2
    4176    mova [rsp+gprsize+6*16], m3
    4177    mova [rsp+gprsize+7*16], m4
    4178    mova [rsp+gprsize+8*16], m5
    4179    mova [rsp+gprsize+9*16], m6
    4180    ; m7 is already stored in [rsp+gprsize+0*16]
    4181    mova                 m0, [cq+1*64+ 0]
    4182    mova                 m1, [cq+3*64+ 0]
    4183    mova                 m2, [cq+1*64+16]
    4184    mova                 m3, [cq+3*64+16]
    4185    mova                 m4, [cq+1*64+32]
    4186    mova                 m5, [cq+3*64+32]
    4187    mova                 m6, [cq+1*64+48]
    4188    mova                 m7, [cq+3*64+48]
    4189    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
    4190 
    4191    ; out0-7 is in rsp+gprsize+3-10*mmsize
    4192    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
    4193 
    4194 %if ARCH_X86_64
    4195    lea                dstq, [r7+strideq*8]
    4196 %else
    4197    mov                dstq, [rsp+2*gprsize+16*16]
    4198    lea                dstq, [dstq+strideq*8]
    4199 %endif
    4200    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
    4201 %if ARCH_X86_64
    4202    mov                dstq, r7
    4203 %else
    4204    mov                dstq, [rsp+2*gprsize+16*16]
    4205 %endif
    4206    mova                 m0, [rsp+gprsize+ 3*16]
    4207    mova                 m1, [rsp+gprsize+ 4*16]
    4208    mova                 m2, [rsp+gprsize+ 5*16]
    4209    mova                 m3, [rsp+gprsize+ 6*16]
    4210    mova                 m4, [rsp+gprsize+ 7*16]
    4211    mova                 m5, [rsp+gprsize+ 8*16]
    4212    mova                 m6, [rsp+gprsize+ 9*16]
    4213    mova                 m7, [rsp+gprsize+10*16]
    4214    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
; Advance to the right 8-column half and clear the coefficients just used.
    4215 %if ARCH_X86_64
    4216    add                  r7, 16
    4217 %define mzero m9
    4218 %else
    4219    add dword [rsp+2*gprsize+16*16], 16
    4220 %define mzero m7
    4221    pxor                 m7, m7
    4222 %endif
    4223    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
    4224    add                  cq, 64*4
    4225    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
    4226 %undef mzero
    4227    dec                 r4d
    4228    jg .loop_pass2
    4229 %if WIN64
; Restore the callee-saved r7 preserved at function entry.
    4230    mov                  r7, [rsp+16*16+gprsize]
    4231 %endif
    4232    RET
   4233 
    4234 INV_TXFM_16X16_FN adst, dct      ; 16x16 entry points with an adst first pass
    4235 INV_TXFM_16X16_FN adst, adst
    4236 INV_TXFM_16X16_FN adst, flipadst
   4237 
    4238 cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; 16x16 inverse ADST, 16 bpc. Reuses the idct pass-1 driver with .main in
; t0; pass 2 uses the 8bpc ssse3 iadst kernels.
    4239 %if WIN64
    4240    mov [rsp+16*16+gprsize], r7
    4241 %elif ARCH_X86_32
    4242    mov [rsp+16*16+gprsize*1], r1
    4243    mov [rsp+16*16+gprsize*2], r6
    4244 %endif
    4245    lea                  t0, [o(.main)]
    4246    jmp m(idct_16x16_internal_16bpc).pass1_full
    4247 
; 1-D 16-point ADST on one 4-column strip at cq+r5, built from the two
; shared 16x4 main parts followed by the rounding below.
    4248 .main:
    4249 %if ARCH_X86_64
; Preload the constants used by the helper routines called below.
    4250    mova                m11, [o(pd_2048)]
    4251    mova                m12, [o(clip_18b_min)]
    4252    mova                m13, [o(clip_18b_max)]
    4253    mova                m14, [o(pd_2896)]
    4254 %endif
    4255    mova                 m0, [cq+ 2*64+r5]
    4256    mova                 m1, [cq+13*64+r5]
    4257    mova                 m2, [cq+ 6*64+r5]
    4258    mova                 m3, [cq+ 9*64+r5]
    4259    mova                 m4, [cq+10*64+r5]
    4260    mova                 m5, [cq+ 5*64+r5]
    4261    mova                 m6, [cq+14*64+r5]
    4262    mova                 m7, [cq+ 1*64+r5]
    4263    call m(iadst_16x4_internal_16bpc).main_part1
    4264    mova                 m0, [cq+ 0*64+r5]
    4265    mova                 m1, [cq+15*64+r5]
    4266    mova                 m2, [cq+ 4*64+r5]
    4267    mova                 m3, [cq+11*64+r5]
    4268    mova                 m4, [cq+ 8*64+r5]
    4269    mova                 m5, [cq+ 7*64+r5]
    4270    mova                 m6, [cq+12*64+r5]
    4271    mova                 m7, [cq+ 3*64+r5]
    4272    call m(iadst_16x4_internal_16bpc).main_part2
    4273    call .round
    4274 %if ARCH_X86_64
; Pack the dword outputs to words, two outputs per register
; (x86-32 packs inside .round instead).
    4275    packssdw             m0, m1
    4276    packssdw             m2, m3
    4277    packssdw             m4, m5
    4278    packssdw             m6, m7
    4279    packssdw             m8, m9
    4280    packssdw            m10, m11
    4281    packssdw            m12, m13
    4282    packssdw            m14, m15
    4283 %endif
    4284    ret
; ADST rounding/negation: odd-indexed terms are negated via pxor -1 plus a
; +1 folded into the rounding constant. Outputs 0-3/12-15 get +2 and >>2;
; outputs 4-11 get +10240 and >>14 (they still carry a 2896 scale factor
; from the main parts).
    4285 .round:
    4286 %if ARCH_X86_64
    4287    pcmpeqd              m8, m8         ; -1
    4288    mova                m15, [o(pd_10240)]
    4289    psrld               m14, 10         ; +2 (m14 = pd_2896)
    4290    psubd               m13, m14, m8    ; +3
    4291    REPX     {pxor  x, m8 }, m1, m3, m5, m7
    4292    REPX     {paddd x, m14}, m0, m2
    4293    REPX     {paddd x, m13}, m1, m3
    4294    REPX     {paddd x, m15}, m4, m5, m6, m7
    4295    paddd               m13, m15, m8    ; +10239
    4296    paddd                m8, m15, m9
    4297    psubd                m9, m13, m10
    4298    paddd               m10, m15, m11
    4299    psubd               m11, m13, m12
    4300    paddd               m12, m14, [r3+3*16]
    4301    psubd               m13, m14, [r3+2*16]
    4302    psubd               m15, m14, [r3+0*16]
    4303    paddd               m14, [r3+1*16]
    4304    REPX      {psrad x, 2 }, m0,  m1,  m2,  m3,  m12, m13, m14, m15
    4305    REPX      {psrad x, 14}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
    4306 %else
; x86-32: same rounding with 8 xmm regs, spilling through r3; the packed
; word pairs for the high outputs end up in r3+8..11*16.
    4307    mova          [r3+8*16], m1
    4308    mova          [r3+9*16], m3
    4309    mova                 m3, [o(pd_10240)]
    4310    pcmpeqd              m1, m1
    4311    REPX      {pxor  x, m1}, m5, m7
    4312    REPX      {paddd x, m3}, m4, m5, m6, m7
    4313    REPX      {psrad x, 14}, m4, m5, m6, m7
    4314    packssdw             m4, m5
    4315    packssdw             m6, m7
    4316    mova         [r3+10*16], m4
    4317    mova         [r3+11*16], m6
    4318    mova                 m4, [r3+4*16]
    4319    mova                 m5, [r3+5*16]
    4320    mova                 m6, [r3+6*16]
    4321    mova                 m7, [r3+7*16]
    4322    mova                 m3, [o(pd_2)]
    4323    REPX      {pxor  x, m1}, m5, m7
    4324    REPX      {paddd x, m3}, m4, m6
    4325    psubd                m3, m1
    4326    REPX      {paddd x, m3}, m5, m7
    4327    REPX      {psrad x, 2 }, m4, m5, m6, m7
    4328    packssdw             m4, m5
    4329    packssdw             m6, m7
    4330    mova                 m5, [r3+8*16]
    4331    mova                 m7, [r3+9*16]
    4332    mova          [r3+8*16], m4
    4333    mova          [r3+9*16], m6
    4334    mova                 m3, [o(pd_10240)]
    4335    REPX      {pxor  x, m1}, m5, m7
    4336    REPX      {paddd x, m3}, m0, m5, m2, m7
    4337    REPX      {psrad x, 14}, m0, m5, m2, m7
    4338    packssdw             m0, m5
    4339    packssdw             m2, m7
    4340    mova                 m4, [r3+0*16]
    4341    mova                 m5, [r3+1*16]
    4342    mova                 m6, [r3+2*16]
    4343    mova                 m7, [r3+3*16]
    4344    mova                 m3, [o(pd_2)]
    4345    REPX      {pxor  x, m1}, m5, m7
    4346    REPX      {paddd x, m3}, m4, m6
    4347    psubd                m3, m1
    4348    REPX      {paddd x, m3}, m5, m7
    4349    REPX      {psrad x, 2 }, m4, m5, m6, m7
    4350    packssdw             m4, m5
    4351    packssdw             m6, m7
    4352 %endif
    4353    ret
; Pass 2: two iterations, one per 8-column half, using the 8bpc ssse3
; iadst kernels; bottom 8 rows (with pw_m2048 negation in round2) are
; written first, then the top 8, and the consumed coefficients are zeroed.
    4354 .pass2:
    4355 %if ARCH_X86_64
    4356    mova                 m8, [o(pw_2048)]
    4357    mova                m11, [o(pw_m2048)]
    4358    pxor                 m9, m9
    4359    mova                m10, [o(pixel_10bpc_max)]
    4360    mov                  r7, dstq
    4361 %else
    4362    mov [rsp+2*gprsize+16*16], dstq
    4363 %endif
    4364    lea                  r3, [strideq*3]
    4365    mov                 r4d, 2
    4366 .loop_pass2:
    4367 %if ARCH_X86_32
    4368    lea                  r5, [o(itx8_start)]
    4369 %endif
; Load the coefficient rows in the input order expected by the ssse3
; iadst16 kernel; the first 8 are staged on the stack.
    4370    mova                 m0, [cq+0*64+32]
    4371    mova                 m1, [cq+1*64+32]
    4372    mova                 m2, [cq+2*64+16]
    4373    mova                 m3, [cq+3*64+16]
    4374    mova                 m4, [cq+0*64+ 0]
    4375    mova                 m5, [cq+1*64+ 0]
    4376    mova                 m6, [cq+2*64+48]
    4377    mova                 m7, [cq+3*64+48]
    4378    mova [rsp+gprsize+3*16], m0
    4379    mova [rsp+gprsize+4*16], m1
    4380    mova [rsp+gprsize+5*16], m2
    4381    mova [rsp+gprsize+6*16], m3
    4382    mova [rsp+gprsize+7*16], m4
    4383    mova [rsp+gprsize+8*16], m5
    4384    mova [rsp+gprsize+9*16], m6
    4385    mova [rsp+gprsize+10*16], m7
    4386    mova                 m0, [cq+2*64+ 0]
    4387    mova                 m1, [cq+3*64+ 0]
    4388    mova                 m2, [cq+0*64+16]
    4389    mova                 m3, [cq+1*64+16]
    4390    mova                 m4, [cq+2*64+32]
    4391    mova                 m5, [cq+3*64+32]
    4392    mova                 m6, [cq+0*64+48]
    4393    mova                 m7, [cq+1*64+48]
    4394    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
    4395    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
    4396 
    4397    ; out0-7 is in rsp+gprsize+3-10*mmsize
    4398    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
    4399 
    4400 %if ARCH_X86_64
    4401    lea                dstq, [r7+strideq*8]
    4402 %else
    4403    mov                dstq, [rsp+2*gprsize+16*16]
    4404    lea                dstq, [dstq+strideq*8]
    4405 %endif
    4406    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
    4407 %if ARCH_X86_64
    4408    mov                dstq, r7
    4409 %else
    4410    mov                dstq, [rsp+2*gprsize+16*16]
    4411 %endif
    4412    mova                 m0, [rsp+gprsize+ 3*16]
    4413    mova                 m1, [rsp+gprsize+ 4*16]
    4414    mova                 m2, [rsp+gprsize+ 5*16]
    4415    mova                 m3, [rsp+gprsize+ 6*16]
    4416    mova                 m4, [rsp+gprsize+ 7*16]
    4417    mova                 m5, [rsp+gprsize+ 8*16]
    4418    mova                 m6, [rsp+gprsize+ 9*16]
    4419    mova                 m7, [rsp+gprsize+10*16]
    4420    call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
; Advance to the right 8-column half and clear the coefficients just used.
    4421 %if ARCH_X86_64
    4422    add                  r7, 16
    4423 %define mzero m9
    4424 %else
    4425    add dword [rsp+2*gprsize+16*16], 16
    4426 %define mzero m7
    4427    pxor                 m7, m7
    4428 %endif
    4429    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
    4430    add                  cq, 64*4
    4431    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
    4432 %undef mzero
    4433    dec                 r4d
    4434    jg .loop_pass2
    4435 %if WIN64
; Restore the callee-saved r7 preserved at function entry.
    4436    mov                  r7, [rsp+16*16+gprsize]
    4437 %endif
    4438    RET
   4439 
    4440 INV_TXFM_16X16_FN flipadst, dct  ; 16x16 entry points with a flipadst first pass
    4441 INV_TXFM_16X16_FN flipadst, adst
    4442 INV_TXFM_16X16_FN flipadst, flipadst
   4443 
    4444 cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; 16x16 inverse flip-ADST, 16 bpc: ADST with the output order reversed.
; Pass 1 reuses the idct driver with .main below; pass 2 reuses the iadst
; pass 2 with dst/stride adjusted to write bottom-up.
    4445 %if WIN64
    4446    mov [rsp+16*16+gprsize], r7
    4447 %elif ARCH_X86_32
    4448    mov [rsp+16*16+gprsize*1], r1
    4449    mov [rsp+16*16+gprsize*2], r6
    4450 %endif
    4451    lea                  t0, [o(.main)]
    4452    jmp m(idct_16x16_internal_16bpc).pass1_full
    4453 
; Run the iadst main, then reverse the packed outputs: registers are
; swapped end-for-end and pshufd q1032 swaps the two 64-bit halves of
; each register (i.e. the two packed output groups it holds).
    4454 .main:
    4455    call m(iadst_16x16_internal_16bpc).main
    4456 %if ARCH_X86_64
    4457    mova                 m1, m0
    4458    mova                 m3, m2
    4459    mova                 m5, m4
    4460    mova                 m7, m6
    4461    pshufd               m0, m14, q1032
    4462    pshufd               m2, m12, q1032
    4463    pshufd               m4, m10, q1032
    4464    pshufd               m6, m8, q1032
    4465    pshufd               m8, m7, q1032
    4466    pshufd              m10, m5, q1032
    4467    pshufd              m12, m3, q1032
    4468    pshufd              m14, m1, q1032
    4469 %else
; x86-32: the other output half lives at r3+8..11*16; exchange it with
; the register half while applying the same q1032 swap.
    4470    pshufd               m1, m0, q1032
    4471    pshufd               m3, m2, q1032
    4472    pshufd               m5, m4, q1032
    4473    pshufd               m7, m6, q1032
    4474    pshufd               m0, [r3+11*16], q1032
    4475    pshufd               m2, [r3+10*16], q1032
    4476    pshufd               m4, [r3+9*16], q1032
    4477    pshufd               m6, [r3+8*16], q1032
    4478    mova         [r3+11*16], m1
    4479    mova         [r3+10*16], m3
    4480    mova         [r3+ 9*16], m5
    4481    mova         [r3+ 8*16], m7
    4482 %endif
    4483    ret
    4484 
; Point dst at the last row (dst += 15*stride) and negate stride so the
; iadst pass 2 writes the rows in reverse order.
    4485 .pass2:
    4486    lea                  r3, [strideq*3]
    4487    lea                  r3, [r3*5]
    4488    add                dstq, r3
    4489    neg             strideq
    4490    jmp m(iadst_16x16_internal_16bpc).pass2
   4491 
    4492 INV_TXFM_16X16_FN identity, dct, h ; "h" selects the tbl_16x16_h eob table
    4493 INV_TXFM_16X16_FN identity, identity
   4494 
    4495 cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; 16x16 identity transform, 16 bpc. Pass 1 reuses the idct driver with
; .main below; pass 2 runs the shared 8x16 identity row kernel on 8x4
; tiles and writes the pixels.
    4496 %if WIN64
    4497    mov [rsp+16*16+gprsize], r7
    4498 %elif ARCH_X86_32
    4499    mov [rsp+16*16+gprsize*1], r1
    4500    mov [rsp+16*16+gprsize*2], r6
    4501 %endif
    4502    lea                  t0, [o(.main)]
    4503    jmp m(idct_16x16_internal_16bpc).pass1_full
    4504 
; Pass-1 identity16 on one 4-column strip: scale every coefficient by
; 11586 (~= 2*sqrt(2) in .12 fixed point), round with +10240 and >>14,
; then pack to words.
    4505 .main:
    4506 %if ARCH_X86_64
    4507    mova                m15, [o(pd_11586)]
    4508    pmulld               m0, m15, [cq+ 0*64+r5]
    4509    pmulld               m1, m15, [cq+ 1*64+r5]
    4510    pmulld               m2, m15, [cq+ 2*64+r5]
    4511    pmulld               m3, m15, [cq+ 3*64+r5]
    4512    pmulld               m4, m15, [cq+ 4*64+r5]
    4513    pmulld               m5, m15, [cq+ 5*64+r5]
    4514    pmulld               m6, m15, [cq+ 6*64+r5]
    4515    pmulld               m7, m15, [cq+ 7*64+r5]
    4516    pmulld               m8, m15, [cq+ 8*64+r5]
    4517    pmulld               m9, m15, [cq+ 9*64+r5]
    4518    pmulld              m10, m15, [cq+10*64+r5]
    4519    pmulld              m11, m15, [cq+11*64+r5]
    4520    pmulld              m12, m15, [cq+12*64+r5]
    4521    pmulld              m13, m15, [cq+13*64+r5]
    4522    pmulld              m14, m15, [cq+14*64+r5]
    4523    pmulld              m15, [cq+15*64+r5]
; m15 holds both the constant and the last product; spill through [r3].
    4524    mova               [r3], m15
    4525    mova                m15, [o(pd_10240)]
    4526    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
    4527                         m8, m9, m10, m11, m12, m13, m14
    4528    paddd               m15, [r3]
    4529    REPX     {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \
    4530                         m8, m9, m10, m11, m12, m13, m14, m15
    4531    packssdw             m0, m1
    4532    packssdw             m2, m3
    4533    packssdw             m4, m5
    4534    packssdw             m6, m7
    4535    packssdw             m8, m9
    4536    packssdw            m10, m11
    4537    packssdw            m12, m13
    4538    packssdw            m14, m15
    4539 %else
; x86-32: same computation in two batches of 8 rows; the first batch's
; packed results are parked at r3+8..11*16 for the pass-1 transpose.
    4540    mova                 m7, [o(pd_11586)]
    4541    pmulld               m0, m7, [cq+ 0*64+r5]
    4542    pmulld               m1, m7, [cq+ 1*64+r5]
    4543    pmulld               m2, m7, [cq+ 2*64+r5]
    4544    pmulld               m3, m7, [cq+ 3*64+r5]
    4545    pmulld               m4, m7, [cq+ 4*64+r5]
    4546    pmulld               m5, m7, [cq+ 5*64+r5]
    4547    pmulld               m6, m7, [cq+ 6*64+r5]
    4548    pmulld               m7, [cq+ 7*64+r5]
    4549    mova               [r3], m7
    4550    mova                 m7, [o(pd_10240)]
    4551    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
    4552    paddd                m7, [r3]
    4553    REPX      {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
    4554    packssdw             m0, m1
    4555    packssdw             m2, m3
    4556    packssdw             m4, m5
    4557    packssdw             m6, m7
    4558    mova          [r3+8*16], m0
    4559    mova          [r3+9*16], m2
    4560    mova         [r3+10*16], m4
    4561    mova         [r3+11*16], m6
    4562    mova                 m7, [o(pd_11586)]
    4563    pmulld               m0, m7, [cq+ 8*64+r5]
    4564    pmulld               m1, m7, [cq+ 9*64+r5]
    4565    pmulld               m2, m7, [cq+10*64+r5]
    4566    pmulld               m3, m7, [cq+11*64+r5]
    4567    pmulld               m4, m7, [cq+12*64+r5]
    4568    pmulld               m5, m7, [cq+13*64+r5]
    4569    pmulld               m6, m7, [cq+14*64+r5]
    4570    pmulld               m7, [cq+15*64+r5]
    4571    mova               [r3], m7
    4572    mova                 m7, [o(pd_10240)]
    4573    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
    4574    paddd                m7, [r3]
    4575    REPX      {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
    4576    packssdw             m0, m1
    4577    packssdw             m2, m3
    4578    packssdw             m4, m5
    4579    packssdw             m6, m7
    4580 %endif
    4581    ret
    4582 
; Pass 2: 4 row-groups of 4 rows per 8-column half. r5d is the group
; counter; btc on bit 16 flags completion of the first half (CF = the
; bit's previous value, so the second toggle exits via jc).
    4583 .pass2:
    4584 %if ARCH_X86_64
    4585    mova                 m4, [o(pw_2048)]
    4586    mova                 m5, [o(pixel_10bpc_max)]
    4587    pxor                 m6, m6
    4588    mova                 m7, [o(pw_1697x16)]
    4589    mov                  r7, dstq
    4590 %else
    4591    mov [rsp+2*gprsize+16*16], dstq
    4592 %endif
    4593    mov                 r5d, 4
    4594    lea                  r3, [strideq*3]
    4595 .pass2_loop:
    4596    mova                 m0, [cq+0*64+0]
    4597    mova                 m1, [cq+1*64+0]
    4598    mova                 m2, [cq+2*64+0]
    4599    mova                 m3, [cq+3*64+0]
    4600    call m(iidentity_8x16_internal_16bpc).main
    4601 %if ARCH_X86_64
    4602    call m(idct_8x4_internal_16bpc).round1_and_write_8x4
    4603 %else
    4604    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
    4605 %endif
; Clear the four coefficient rows just consumed (m6 is zero here).
    4606    REPX {mova [cq+x*16], m6}, 0, 4, 8, 12
    4607    add                  cq, 16
    4608    lea                dstq, [dstq+strideq*4]
    4609    dec                 r5w
    4610    jg .pass2_loop
    4611    add                  cq, 64*3
    4612    btc                 r5d, 16
    4613    jc .end
; Second half: restart at the saved dst plus 16 bytes (8 pixels right).
    4614 %if ARCH_X86_64
    4615    lea                dstq, [r7+16]
    4616 %else
    4617    mov                dstq, [rsp+2*gprsize+16*16]
    4618    add                dstq, 16
    4619 %endif
    4620    add                 r5d, 4
    4621    jmp .pass2_loop
    4622 .end:
    4623 %if WIN64
; Restore the callee-saved r7 preserved at function entry.
    4624    mov                  r7, [rsp+16*16+gprsize]
    4625 %endif
    4626    RET
   4627 
    4628 cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob
; 8x32 identity-identity, 16 bpc: no butterflies needed. Each iteration
; packs 4 rows of dword coefficients to words, applies the combined
; rounding/downscale (saturating +5, then arithmetic >>3), transposes the
; 4x8 word tile in .main, and adds it to dst with clamping.
    4629 %if ARCH_X86_32
    4630    LEA                  r6, $$
    4631 %endif
    4632    mova                 m5, [o(pw_5)]
    4633    mova                 m7, [o(pixel_10bpc_max)]
    4634    pxor                 m6, m6
    4635    mov                 r5d, eobd
; Round eob up so the loop count covers whole tiles (see inline mapping);
; if the 8-bit add carries, keep the original eob.
    4636    add                eobb, 21
    4637    cmovc              eobd, r5d ; 43, 107, 171 -> 64, 128, 192
    4638    lea                  r4, [strideq*3]
    4639 .loop:
    4640    mova                 m0, [cq+128*0]
    4641    packssdw             m0, [cq+128*1]
    4642    mova                 m1, [cq+128*2]
    4643    packssdw             m1, [cq+128*3]
    4644    mova                 m2, [cq+128*4]
    4645    packssdw             m2, [cq+128*5]
    4646    mova                 m3, [cq+128*6]
    4647    packssdw             m3, [cq+128*7]
    4648    REPX     {paddsw x, m5}, m0, m1, m2, m3
    4649    REPX     {psraw  x, 3 }, m0, m1, m2, m3
    4650    call .main_zero
    4651    add                  cq, 16
    4652    lea                dstq, [dstq+strideq*4]
; Bit 16 of eobd toggles between the two tile positions per 64 eob units:
; only every second iteration decrements the remaining-eob count.
    4653    btc                eobd, 16
    4654    jnc .loop
    4655    sub                eobd, 64
    4656    jge .loop
    4657    RET
ALIGN function_align
; .main_zero: clear the 8 source coefficient rows, then fall through.
; .main: transpose the 4x8 word tile held in m0-3 (4x4 word blocks via
; the unpck/punpckqdq sequence), add to 4 dst rows, clamp to
; [0, pixel_10bpc_max] (m6 = 0, m7 = max), and store.
    4659 .main_zero:
    4660    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
    4661 .main:
    4662    punpckhwd            m4, m0, m1
    4663    punpcklwd            m0, m1
    4664    punpckhwd            m1, m2, m3
    4665    punpcklwd            m2, m3
    4666    punpckhwd            m3, m0, m4
    4667    punpcklwd            m0, m4
    4668    punpckhwd            m4, m2, m1
    4669    punpcklwd            m2, m1
    4670    punpckhqdq           m1, m0, m2
    4671    punpcklqdq           m0, m2
    4672    punpcklqdq           m2, m3, m4
    4673    punpckhqdq           m3, m4
    4674    paddw                m0, [dstq+strideq*0]
    4675    paddw                m1, [dstq+strideq*1]
    4676    paddw                m2, [dstq+strideq*2]
    4677    paddw                m3, [dstq+r4       ]
    4678    REPX     {pmaxsw x, m6}, m0, m1, m2, m3
    4679    REPX     {pminsw x, m7}, m0, m1, m2, m3
    4680    mova   [dstq+strideq*0], m0
    4681    mova   [dstq+strideq*1], m1
    4682    mova   [dstq+strideq*2], m2
    4683    mova   [dstq+r4       ], m3
    4684    ret
   4685 
;------------------------------------------------------------------------------
; Identity/identity 32x8 inverse "transform" + add, 16bpc pixels.
; Same structure as the 8x32 variant but with a 32-byte stride between
; coefficient rows and a pmulhrsw scale by pw_4096. Two 4-row tiles cover the
; full 8-row height, after which dst moves 8 pixels (16 bytes) to the right
; (anchor kept in r5) and the next 8 coefficient columns are processed.
;------------------------------------------------------------------------------
   4686 cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob
   4687 %if ARCH_X86_32
; x86-32: materialize the section base in r6 for o() constant addressing (PIC)
   4688    LEA                  r6, $$
   4689 %endif
   4690    mova                 m5, [o(pw_4096)]
   4691    mova                 m7, [o(pixel_10bpc_max)]
   4692    pxor                 m6, m6
; same eob rounding trick as in the 8x32 function: bump the thresholds to
; multiples of 64, restoring eob if the byte addition wrapped
   4693    mov                 r4d, eobd
   4694    add                eobb, 21
   4695    cmovc              eobd, r4d
   4696    lea                  r4, [strideq*3]
   4697    mov                  r5, dstq
   4698 .loop:
   4699    mova                 m0, [cq+32*0]
   4700    packssdw             m0, [cq+32*1]
   4701    mova                 m1, [cq+32*2]
   4702    packssdw             m1, [cq+32*3]
   4703    mova                 m2, [cq+32*4]
   4704    packssdw             m2, [cq+32*5]
   4705    mova                 m3, [cq+32*6]
   4706    packssdw             m3, [cq+32*7]
; clear the consumed coefficients, scale, then reuse the shared
; transpose/add/clamp/store helper from the 8x32 function
   4707    REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
   4708    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
   4709    call m(inv_txfm_add_identity_identity_8x32_16bpc).main
   4710    lea                dstq, [dstq+strideq*4]
   4711    add                  cq, 16
; toggle bit 16: fall through only every second iteration (full 8-row column
; strip done), then advance to the next strip 8 pixels to the right
   4712    btc                eobd, 16
   4713    jnc .loop
   4714    add                  cq, 32*8-32
   4715    add                  r5, 16
   4716    mov                dstq, r5
   4717    sub                eobd, 64
   4718    jge .loop
   4719    RET
   4720 
;------------------------------------------------------------------------------
; Identity/identity 16x32 inverse "transform" + add, 16bpc pixels.
; Rectangular (2:1) size, so coefficients are pre-scaled by pw_2896x8 before
; the identity-16 gain (x += pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384)) and
; the final pw_8192 rounding. The eob ladder below visits 8x32-coefficient
; column/row strips in increasing-eob order (thresholds 36/143/271/399, see
; the inline comments); r5 anchors dst for each strip.
; x86_64 keeps constants in m8/m9/m10/m11; x86_32 reloads them into m5/m6.
;------------------------------------------------------------------------------
   4721 cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob
   4722 %if ARCH_X86_32
; x86-32: materialize the section base in r6 for o() constant addressing (PIC)
   4723    LEA                  r6, $$
   4724 %else
   4725    mova                 m8, [o(pw_2896x8)]
   4726    mova                 m9, [o(pw_1697x16)]
   4727    mova                m11, [o(pw_8192)]
   4728 %endif
   4729    mova                 m7, [o(pixel_10bpc_max)]
   4730    lea                  r4, [strideq*3]
   4731    pxor                 m6, m6
   4732 %if ARCH_X86_64
   4733    paddw               m10, m11, m11 ; pw_16384
   4734 %endif
   4735    mov                  r5, dstq
   4736    call .main
   4737    sub                eobd, 36
   4738    jl .ret
; step right to the second 8-coefficient column group (cq advanced by one
; strip minus the 32 bytes already consumed), then back left and down 8 rows
   4739    add                  cq, 128*8-32
   4740    lea                dstq, [r5+16]
   4741    call .main
   4742    sub                  cq, 128*8
   4743    lea                dstq, [r5+strideq*8]
   4744    mov                  r5, dstq
   4745    call .main
   4746    sub                eobd, 107 ; eob < 143
   4747    jl .ret
   4748    add                  cq, 128*8-32
   4749    lea                dstq, [r5+16]
   4750    call .main
   4751    sub                  cq, 128*8
   4752    lea                dstq, [r5+strideq*8]
   4753    mov                  r5, dstq
   4754    call .main
   4755    sub                eobd, 128 ; eob < 271
   4756    jl .ret
   4757    add                  cq, 128*8-32
   4758    lea                dstq, [r5+16]
   4759    call .main
   4760    sub                  cq, 128*8
   4761    lea                dstq, [r5+strideq*8]
   4762    mov                  r5, dstq
   4763    call .main
   4764    sub                eobd, 128 ; eob < 399
   4765    jl .ret
   4766    add                  cq, 128*8-32
   4767    lea                dstq, [r5+16]
   4768    call .main
   4769 .ret:
   4770    RET
   4771 ALIGN function_align
; .main: process one 8-wide coefficient strip; loops internally via the
; bit-16 toggle so two 8x4 tiles (8 rows) are handled per call
   4772 .main:
   4773    mova                 m0, [cq+128*0]
   4774    packssdw             m0, [cq+128*1]
   4775    mova                 m1, [cq+128*2]
   4776    packssdw             m1, [cq+128*3]
   4777    mova                 m2, [cq+128*4]
   4778    packssdw             m2, [cq+128*5]
   4779    mova                 m3, [cq+128*6]
   4780    packssdw             m3, [cq+128*7]
   4781 %if ARCH_X86_64
; rect2 pre-scale, then identity-16 gain term for m0/m1 (halved via pw_16384
; to stay within 16-bit range)
   4782    REPX  {pmulhrsw x, m8 }, m0, m1, m2, m3
   4783    pmulhrsw             m4, m9, m0
   4784    pmulhrsw             m5, m9, m1
   4785    REPX  {pmulhrsw x, m10}, m4, m5
   4786 %else
   4787    mova                 m6, [o(pw_2896x8)]
   4788    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
   4789    mova                 m5, [o(pw_1697x16)]
   4790    pmulhrsw             m4, m5, m0
   4791    pmulhrsw             m5, m1
   4792    mova                 m6, [o(pw_16384)]
   4793    REPX  {pmulhrsw x, m6 }, m4, m5
   4794 %endif
   4795    paddsw               m0, m4
   4796    paddsw               m1, m5
   4797 %if ARCH_X86_64
; same identity-16 gain for m2/m3
   4798    pmulhrsw             m4, m9, m2
   4799    pmulhrsw             m5, m9, m3
   4800    REPX  {pmulhrsw x, m10}, m4, m5
   4801 %else
   4802    mova                 m5, [o(pw_1697x16)]
   4803    pmulhrsw             m4, m5, m2
   4804    pmulhrsw             m5, m3
   4805    REPX  {pmulhrsw x, m6 }, m4, m5
   4806 %endif
   4807    paddsw               m2, m4
   4808    paddsw               m3, m5
   4809 %if ARCH_X86_64
   4810    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
   4811 %else
; derive pw_8192 from the pw_16384 still in m6, then restore m6 = 0 for the
; shared helper, which expects it to be zero
   4812    psrlw                m6, 1          ; pw_8192
   4813    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
   4814    pxor                 m6, m6
   4815 %endif
   4816    call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
   4817    lea                dstq, [dstq+strideq*4]
   4818    add                  cq, 16
   4819    btc                eobd, 16
   4820    jnc .main
   4821    ret
   4822 
;------------------------------------------------------------------------------
; Identity/identity 32x16 inverse "transform" + add, 16bpc pixels.
; Mirror of the 16x32 variant: rect2 pre-scale (pw_2896x8), then saturating
; doublings (paddsw x,x) combined with the pw_1697x16 gain term, final
; rounding by pw_2048. Coefficient rows are 64 bytes apart. The eob ladder
; walks 16-pixel-wide column strips (r5 holds the dst row anchor); eob
; thresholds match the 16x32 function (36/143/271/399).
;------------------------------------------------------------------------------
   4823 cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob
   4824 %if ARCH_X86_32
; x86-32: materialize the section base in r6 for o() constant addressing (PIC)
   4825    LEA                  r6, $$
   4826 %else
   4827    mova                 m8, [o(pw_2896x8)]
   4828    mova                 m9, [o(pw_1697x16)]
   4829    mova                m10, [o(pw_2048)]
   4830 %endif
   4831    mova                 m7, [o(pixel_10bpc_max)]
   4832    lea                  r4, [strideq*3]
   4833    pxor                 m6, m6
   4834    mov                  r5, dstq
   4835    call .main
   4836    sub                eobd, 36
   4837    jl .ret
   4838    call .main
   4839    add                  cq, 64*8-64
   4840    lea                dstq, [r5+16*1]
   4841    call .main
   4842    sub                eobd, 107 ; eob < 143
   4843    jl .ret
   4844    call .main
   4845    add                  cq, 64*8-64
   4846    lea                dstq, [r5+16*2]
   4847    call .main
   4848    sub                eobd, 128 ; eob < 271
   4849    jl .ret
   4850    call .main
   4851    add                  cq, 64*8-64
   4852    lea                dstq, [r5+16*3]
   4853    call .main
   4854    sub                eobd, 128 ; eob < 399
   4855    jl .ret
   4856    call .main
   4857 .ret:
   4858    RET
   4859 ALIGN function_align
; .main: one 8-wide coefficient strip; loops internally via the bit-16
; toggle (two 8x4 tiles per call), like the other identity functions
   4860 .main:
   4861    mova                 m0, [cq+64*0]
   4862    packssdw             m0, [cq+64*1]
   4863    mova                 m1, [cq+64*2]
   4864    packssdw             m1, [cq+64*3]
   4865    mova                 m2, [cq+64*4]
   4866    packssdw             m2, [cq+64*5]
   4867    mova                 m3, [cq+64*6]
   4868    packssdw             m3, [cq+64*7]
   4869 %if ARCH_X86_64
   4870    REPX  {pmulhrsw x, m8 }, m0, m1, m2, m3
   4871 %else
   4872    mova                 m6, [o(pw_2896x8)]
   4873    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
   4874 %endif
; saturating x2, gain term via pw_1697x16, another x2, then add the gain
   4875    REPX  {paddsw   x, x  }, m0, m1, m2, m3
   4876 %if ARCH_X86_64
   4877    pmulhrsw             m4, m9, m0
   4878    pmulhrsw             m5, m9, m1
   4879 %else
   4880    mova                 m6, [o(pw_1697x16)]
   4881    pmulhrsw             m4, m6, m0
   4882    pmulhrsw             m5, m6, m1
   4883 %endif
   4884    REPX  {paddsw   x, x  }, m0, m1
   4885    paddsw               m0, m4
   4886    paddsw               m1, m5
   4887 %if ARCH_X86_64
   4888    pmulhrsw             m4, m9, m2
   4889    pmulhrsw             m5, m9, m3
   4890 %else
; x86-32 is register-starved: m6 (holding pw_1697x16) is consumed as the
; second gain term and re-created below
   4891    pmulhrsw             m4, m6, m2
   4892    pmulhrsw             m6, m3
   4893 %endif
   4894    REPX  {paddsw   x, x  }, m2, m3
   4895    paddsw               m2, m4
   4896 %if ARCH_X86_64
   4897    paddsw               m3, m5
   4898    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3
   4899 %else
   4900    paddsw               m3, m6
   4901    mova                 m6, [o(pw_2048)]
   4902    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
; the shared helper expects m6 = 0
   4903    pxor                 m6, m6
   4904 %endif
   4905    REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
   4906    call m(inv_txfm_add_identity_identity_8x32_16bpc).main
   4907    lea                dstq, [dstq+strideq*4]
   4908    add                  cq, 16
   4909    btc                eobd, 16
   4910    jnc .main
   4911    ret
   4912 
;------------------------------------------------------------------------------
; Identity/identity 32x32 inverse "transform" + add, 16bpc pixels.
; Square size, so only a single pw_8192 rounding multiply is needed.
; The eob ladder covers the transform diagonally: at each eob threshold it
; starts a new 8-pixel-wide strip with .main and then re-processes the
; previously started strips via .main2 (which steps cq/dst back one strip).
; The trailing ";  0 1 2 3"-style comments track which strips are covered at
; each stage.
;------------------------------------------------------------------------------
   4913 cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob
   4914 %undef cmp
   4915 %if ARCH_X86_32
; x86-32: materialize the section base in r6 for o() constant addressing (PIC)
   4916    LEA                  r6, $$
   4917 %endif
   4918    mova                 m5, [o(pw_8192)]
   4919    mova                 m7, [o(pixel_10bpc_max)]
   4920    pxor                 m6, m6
   4921    lea                  r4, [strideq*3]
   4922    mov                  r5, dstq
   4923    call .main                              ; 0
   4924    cmp                eobd, 36
   4925    jl .ret
   4926    add                  cq, 128*8-32       ; 0 1
   4927    lea                dstq, [r5+16]        ; 1
   4928    call .main
   4929    call .main2
   4930    cmp                eobd, 136
   4931    jl .ret
   4932    add                  cq, 128*16-64      ; 0 1 2
   4933    lea                dstq, [r5+16*2]      ; 1 2
   4934    call .main                              ; 2
   4935    call .main2
   4936    call .main2
   4937    cmp                eobd, 300
   4938    jl .ret
   4939    add                  cq, 128*24-96      ; 0 1 2 3
   4940    add                  r5, 16*3           ; 1 2 3
   4941    mov                dstq, r5             ; 2 3
   4942    call .main                              ; 3
   4943    call .main2
   4944    call .main2
   4945    call .main2
   4946    cmp                eobd, 535
   4947    jl .ret
   4948    add                  cq, 128*24-96      ; 0 1 2 3
   4949    lea                dstq, [r5+strideq*8] ; 1 2 3 4
   4950    mov                  r5, dstq           ; 2 3 4
   4951    call .main                              ; 3 4
   4952    call .main2
   4953    call .main2
   4954    cmp                eobd, 755
   4955    jl .ret
   4956    add                  cq, 128*16-64      ; 0 1 2 3
   4957    lea                dstq, [r5+strideq*8] ; 1 2 3 4
   4958    mov                  r5, dstq           ; 2 3 4 5
   4959    call .main                              ; 3 4 5
   4960    call .main2
   4961    cmp                eobd, 911
   4962    jl .ret
   4963    add                  cq, 128*8-32       ; 0 1 2 3
   4964    lea                dstq, [r5+strideq*8] ; 1 2 3 4
   4965    call .main                              ; 2 3 4 5
   4966 .ret:                                       ; 3 4 5 6
   4967    RET
   4968 ALIGN function_align
; .main2: rewind to the strip one position to the left (8 coefficient
; columns / 8 dst pixels) and process it; falls through into .main
   4969 .main2:
   4970    sub                  cq, 128*8
   4971    sub                dstq, 16
; .main: one 8-wide strip; internal bit-16 toggle loop covers two 8x4 tiles
   4972 .main:
   4973    mova                 m0, [cq+128*0]
   4974    packssdw             m0, [cq+128*1]
   4975    mova                 m1, [cq+128*2]
   4976    packssdw             m1, [cq+128*3]
   4977    mova                 m2, [cq+128*4]
   4978    packssdw             m2, [cq+128*5]
   4979    mova                 m3, [cq+128*6]
   4980    packssdw             m3, [cq+128*7]
   4981    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
   4982    call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
   4983    lea                dstq, [dstq+strideq*4]
   4984    add                  cq, 16
   4985    btc                eobd, 16
   4986    jnc .main
   4987    ret
   4988 
;------------------------------------------------------------------------------
; DCT/DCT 8x32 inverse transform + add, 16bpc pixels.
; Two-pass implementation with a 36*16-byte stack frame:
;  * Pass 1 runs an 8-point row DCT on 4 columns at a time (full 16-bit+
;    precision), packs the results to words and scatters them to the stack.
;    Row placement uses tbl_Nx32_odd_offset: each 16-bit table entry packs
;    two byte indices (low byte in t1, high byte in t0) that select the
;    stack slots (in 8-byte units) for the two "odd" output vectors.
;  * Pass 2 reuses the 8bpc SSSE3 32-point column kernels on the packed
;    words; the kernel variant is chosen from eob (veryfast < 43,
;    fast < 107, full otherwise).
; .pass2 is also called by the 16x32 function below.
; t0/t1 are r0/r4 on x86-32 and r4/r7 on x86-64 (DECLARE_REG_TMP).
;------------------------------------------------------------------------------
   4989 cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
   4990                                         dst, stride, c, eob
   4991 %if ARCH_X86_32
   4992    LEA                  r6, $$
   4993 %define base $$
   4994    DECLARE_REG_TMP       0, 4
   4995 %else
   4996    lea                  r6, [tbl_Nx32_odd_offset]
   4997 %define base tbl_Nx32_odd_offset
   4998    DECLARE_REG_TMP       4, 7
   4999 %if WIN64
; r7 is callee-saved on Win64; stash it in the local frame
   5000    mov [rsp+gprsize*1+35*16], r7
   5001 %endif
   5002 %endif
   5003 %define o2(x) r6-base+x
   5004    test               eobd, eobd
   5005    jz .dconly
   5006 
   5007 %if ARCH_X86_32
; save dst (r0) across pass 1, which clobbers it as a temp register
   5008    mov [rsp+gprsize*1+35*16], r0
   5009 %endif
   5010 %undef cmp
   5011    ; remove entirely-zero iterations
   5012    mov                 r5d, 7*2
   5013    cmp                eobw, word [o2(tbl_8x32_2d)+r5]
   5014    jge .end_zero_loop
   5015    pxor                 m0, m0
; zero out the stack rows of every 4-column slice that is entirely above eob,
; so pass 2 can run unconditionally over all 32 rows
   5016 .zero_loop:
   5017    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
   5018    movzx               t1d, t0b
   5019    shr                 t0d, 8
   5020    mova   [rsp+ 3*16+r5*8], m0
   5021    mova   [rsp+11*16+r5*8], m0
   5022    mova   [rsp+ 3*16+t0*8], m0
   5023    mova   [rsp+ 3*16+t1*8], m0
   5024    sub                 r5d, 2
   5025    cmp                eobw, word [o2(tbl_8x32_2d)+r5]
   5026    jl .zero_loop
   5027 .end_zero_loop:
   5028    ; actual first pass after skipping all-zero data
   5029    mov [rsp+gprsize*0+35*16], eobd
   5030    mov                  r3, rsp
   5031 .loop_pass1:
   5032 %if ARCH_X86_64
   5033    mova                m11, [o(pd_2048)]
   5034    mova                m12, [o(clip_18b_min)]
   5035    mova                m13, [o(clip_18b_max)]
   5036    mova                m14, [o(pd_2896)]
   5037 %endif
; load 8 dword coefficients per row for the current 4-column slice
   5038    mova                 m0, [cq+0*128+r5*8]
   5039    mova                 m1, [cq+1*128+r5*8]
   5040    mova                 m2, [cq+2*128+r5*8]
   5041    mova                 m3, [cq+3*128+r5*8]
   5042    mova                 m4, [cq+4*128+r5*8]
   5043    mova                 m5, [cq+5*128+r5*8]
   5044    mova                 m6, [cq+6*128+r5*8]
   5045    mova                 m7, [cq+7*128+r5*8]
   5046    call m(idct_8x4_internal_16bpc).main_pass1
; round pass-1 output: +2, >>2
   5047    mova                 m1, [o(pd_2)]
   5048    REPX      {paddd x, m1}, m0, m6, m5, m3
   5049    call m(idct_8x4_internal_16bpc).round
   5050    REPX      {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
   5051    packssdw             m0, m1
   5052    packssdw             m2, m3
   5053    packssdw             m4, m5
   5054    packssdw             m6, m7
   5055    call m(idct_8x4_internal_16bpc).transpose4x8packed
   5056 
; scatter the transposed rows: even vectors go at fixed offsets, the odd
; ones to the slots encoded in the offset table (see header comment)
   5057    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
   5058    movzx               t1d, t0b
   5059    shr                 t0d, 8
   5060    mova    [r3+ 3*16+r5*8], m0
   5061    mova    [r3+11*16+r5*8], m2
   5062    mova    [r3+ 3*16+t1*8], m1
   5063    mova    [r3+ 3*16+t0*8], m3
   5064    pxor                 m7, m7
   5065    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7
   5066    sub                 r5d, 2
   5067    jge .loop_pass1
   5068 
   5069    ; pass 2 code starts here
   5070    ; m0 is already loaded from last iteration of first pass
   5071 %if ARCH_X86_32
   5072    mov                  r0, [rsp+gprsize*1+35*16]
   5073 %endif
   5074    mov                eobd, [rsp+gprsize*0+35*16]
; pick the lightest 32-point column kernel that still covers all nonzero
; coefficients for this eob
   5075    cmp                eobd, 43
   5076    jl .load_veryfast
   5077    cmp                eobd, 107
   5078    jl .load_fast
   5079    ; load normal
   5080    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
   5081    jmp .run
   5082 .load_fast:
   5083    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
   5084    jmp .run
   5085 .load_veryfast:
   5086    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
   5087    ; fall-through
   5088 .run:
   5089    call .pass2
   5090 %if WIN64
   5091    mov                  r7, [rsp+gprsize*1+35*16]
   5092 %endif
   5093    RET
   5094 
; .pass2: 32-point column transform over the stacked pass-1 output, then
; round/clamp/write four 8x8 pixel blocks. Expects m0 preloaded with the
; first stacked vector and r4 = selected odd-half kernel. Also called from
; the 16x32 wrapper below.
   5095 .pass2:
   5096 %if ARCH_X86_32
   5097    lea                  r5, [o(itx8_start)]
   5098 %endif
; even half, rows 0-7: 8-point idct on the packed words
   5099    mova                 m1, [rsp+gprsize+16* 4]
   5100    mova                 m2, [rsp+gprsize+16* 5]
   5101    mova                 m3, [rsp+gprsize+16* 6]
   5102    mova                 m4, [rsp+gprsize+16* 7]
   5103    mova                 m5, [rsp+gprsize+16* 8]
   5104    mova                 m6, [rsp+gprsize+16* 9]
   5105    mova                 m7, [rsp+gprsize+16*10]
   5106    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
   5107    mova [rsp+gprsize+ 3*16], m0
   5108    mova [rsp+gprsize+ 4*16], m1
   5109    mova [rsp+gprsize+ 5*16], m2
   5110    mova [rsp+gprsize+ 6*16], m3
   5111    mova [rsp+gprsize+ 7*16], m4
   5112    mova [rsp+gprsize+ 8*16], m5
   5113    mova [rsp+gprsize+ 9*16], m6
; even half, rows 8-15: 16-point stage
   5114    mova                 m0, [rsp+gprsize+11*16]
   5115    mova                 m1, [rsp+gprsize+12*16]
   5116    mova                 m2, [rsp+gprsize+13*16]
   5117    mova                 m3, [rsp+gprsize+14*16]
   5118    mova                 m4, [rsp+gprsize+15*16]
   5119    mova                 m5, [rsp+gprsize+16*16]
   5120    mova                 m6, [rsp+gprsize+17*16]
   5121    mova                 m7, [rsp+gprsize+18*16]
   5122    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
   5123    mova                 m7, [rsp+gprsize+ 0*16]
   5124    mova [rsp+gprsize+11*16], m0
   5125    mova [rsp+gprsize+12*16], m1
   5126    mova [rsp+gprsize+13*16], m2
   5127    mova [rsp+gprsize+14*16], m3
   5128    mova [rsp+gprsize+15*16], m4
   5129    mova [rsp+gprsize+16*16], m5
   5130    mova [rsp+gprsize+17*16], m6
   5131    mova [rsp+gprsize+18*16], m7
; odd half: eob-selected 32-point kernel (main/main_fast/main_veryfast)
   5132    call                 r4
   5133 %if ARCH_X86_64
   5134    mova                 m8, [o(pw_2048)]
   5135    pxor                 m9, m9
   5136    mova                m10, [o(pixel_10bpc_max)]
   5137 %endif
   5138    lea                  r3, [strideq*3]
; write out the 32 rows as four 8x8 blocks, 8 rows apart
   5139    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
   5140    lea                dstq, [dstq+strideq*8]
   5141    mova                 m0, [rsp+gprsize+11*16]
   5142    mova                 m1, [rsp+gprsize+12*16]
   5143    mova                 m2, [rsp+gprsize+13*16]
   5144    mova                 m3, [rsp+gprsize+14*16]
   5145    mova                 m4, [rsp+gprsize+15*16]
   5146    mova                 m5, [rsp+gprsize+16*16]
   5147    mova                 m6, [rsp+gprsize+17*16]
   5148    mova                 m7, [rsp+gprsize+18*16]
   5149    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
   5150    lea                dstq, [dstq+strideq*8]
   5151    mova                 m0, [rsp+gprsize+19*16]
   5152    mova                 m1, [rsp+gprsize+20*16]
   5153    mova                 m2, [rsp+gprsize+21*16]
   5154    mova                 m3, [rsp+gprsize+22*16]
   5155    mova                 m4, [rsp+gprsize+23*16]
   5156    mova                 m5, [rsp+gprsize+24*16]
   5157    mova                 m6, [rsp+gprsize+25*16]
   5158    mova                 m7, [rsp+gprsize+26*16]
   5159    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
   5160    lea                dstq, [dstq+strideq*8]
   5161    mova                 m0, [rsp+gprsize+27*16]
   5162    mova                 m1, [rsp+gprsize+28*16]
   5163    mova                 m2, [rsp+gprsize+29*16]
   5164    mova                 m3, [rsp+gprsize+30*16]
   5165    mova                 m4, [rsp+gprsize+31*16]
   5166    mova                 m5, [rsp+gprsize+32*16]
   5167    mova                 m6, [rsp+gprsize+33*16]
   5168    mova                 m7, [rsp+gprsize+34*16]
   5169    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
   5170    ret
; DC-only shortcut: scale the DC coefficient (x181, +640, >>10), release most
; of the local frame, and reuse the 8x8 write-out tail with r3d = 8 rows
   5171 .dconly:
   5172    imul                r5d, [cq], 181
   5173    mov                [cq], eobd ; 0
   5174    mov                 r3d, 8
   5175    add                 r5d, 640
   5176    sar                 r5d, 10
   5177    add                 rsp, (31+2*ARCH_X86_64)*16
   5178    jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
   5179 
;------------------------------------------------------------------------------
; DCT/DCT 16x32 inverse transform + add, 16bpc pixels.
; Same two-pass structure as the 8x32 function above, widened to 16 columns:
;  * Pass 1 runs a 16-point row DCT (rect2-scaled, since 16x32 is a 2:1
;    rectangular transform) on 4 columns at a time. The low 8 outputs are
;    stacked at rsp+12*16, the high 8 at rsp+44*16, each scattered through
;    tbl_Nx32_odd_offset exactly as in 8x32.
;  * Pass 2 invokes m(inv_txfm_add_dct_dct_8x32_16bpc).pass2 once per
;    8-pixel-wide half (left then right), with the kernel variant chosen
;    from eob (veryfast < 44, fast < 151, full otherwise).
;------------------------------------------------------------------------------
   5180 cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
   5181                                          dst, stride, c, eob
   5182    LEA                  r6, base
   5183    test               eobd, eobd
   5184    jz .dconly
   5185 
   5186 %if ARCH_X86_32
; save dst (r0) across pass 1, which clobbers it as a temp register
   5187    mov [rsp+gprsize*1+76*16], r0
   5188 %elif WIN64
; r7 is callee-saved on Win64
   5189    mov [rsp+gprsize*1+76*16], r7
   5190 %endif
   5191 %undef cmp
   5192    ; remove entirely-zero iterations
   5193    mov                 r5d, 7*2
   5194    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
   5195    jge .end_zero_loop
   5196    pxor                 m0, m0
; clear the stack rows of 4-column slices entirely above eob, in both the
; low-half (rsp+12*16) and high-half (rsp+44*16) staging areas
   5197 .zero_loop:
   5198    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
   5199    movzx               t1d, t0b
   5200    shr                 t0d, 8
   5201    mova   [rsp+12*16+r5*8], m0
   5202    mova   [rsp+20*16+r5*8], m0
   5203    mova   [rsp+12*16+t0*8], m0
   5204    mova   [rsp+12*16+t1*8], m0
   5205    mova   [rsp+44*16+r5*8], m0
   5206    mova   [rsp+52*16+r5*8], m0
   5207    mova   [rsp+44*16+t0*8], m0
   5208    mova   [rsp+44*16+t1*8], m0
   5209    sub                 r5d, 2
   5210    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
   5211    jl .zero_loop
   5212 .end_zero_loop:
   5213    ; actual first pass after skipping all-zero data
   5214    mov [rsp+gprsize*0+76*16], eobd
   5215    mov                  r3, rsp
   5216 .loop_pass1:
   5217 %if ARCH_X86_64
   5218    mova                m11, [o(pd_2048)]
   5219    mova                m12, [o(clip_18b_min)]
   5220    mova                m13, [o(clip_18b_max)]
   5221    mova                m14, [o(pd_2896)]
   5222 %endif
; odd input rows -> 16-point odd half (with rect2 pre-scale)
   5223    mova                 m0, [cq+ 1*128+r5*8]
   5224    mova                 m1, [cq+ 3*128+r5*8]
   5225    mova                 m2, [cq+ 5*128+r5*8]
   5226    mova                 m3, [cq+ 7*128+r5*8]
   5227    mova                 m4, [cq+ 9*128+r5*8]
   5228    mova                 m5, [cq+11*128+r5*8]
   5229    mova                 m6, [cq+13*128+r5*8]
   5230    mova                 m7, [cq+15*128+r5*8]
   5231    call m(idct_8x4_internal_16bpc).rect2_mul
   5232    call m(idct_16x4_internal_16bpc).main_oddhalf
   5233 
; even input rows -> 8-point even half (with rect2 pre-scale)
   5234    mova                 m0, [cq+ 0*128+r5*8]
   5235    mova                 m1, [cq+ 2*128+r5*8]
   5236    mova                 m2, [cq+ 4*128+r5*8]
   5237    mova                 m3, [cq+ 6*128+r5*8]
   5238    mova                 m4, [cq+ 8*128+r5*8]
   5239    mova                 m5, [cq+10*128+r5*8]
   5240    mova                 m6, [cq+12*128+r5*8]
   5241    mova                 m7, [cq+14*128+r5*8]
   5242    call m(idct_8x4_internal_16bpc).rect2_mul
   5243    call m(idct_8x4_internal_16bpc).main_pass1
   5244    call m(idct_8x4_internal_16bpc).round
   5245    call m(idct_16x4_internal_16bpc).round
   5246 %if ARCH_X86_64
   5247    packssdw             m0, m1
   5248    packssdw             m2, m3
   5249    packssdw             m4, m5
   5250    packssdw             m6, m7
   5251    packssdw             m8, m9
   5252    packssdw            m10, m11
   5253    packssdw            m12, m13
   5254    packssdw            m14, m15
   5255 %endif
   5256    call m(idct_8x4_internal_16bpc).transpose4x8packed
; scatter via the packed-byte offset table, as in the 8x32 function
   5257    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
   5258    movzx               t1d, t0b
   5259    shr                 t0d, 8
   5260 %if ARCH_X86_64
   5261    mova   [rsp+12*16+r5*8], m0
   5262    mova   [rsp+20*16+r5*8], m2
   5263    mova   [rsp+12*16+t1*8], m1
   5264    mova   [rsp+12*16+t0*8], m3
   5265    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   5266    mova   [rsp+44*16+r5*8], m8
   5267    mova   [rsp+52*16+r5*8], m10
   5268    mova   [rsp+44*16+t1*8], m9
   5269    mova   [rsp+44*16+t0*8], m11
   5270 %else
; x86-32 has no m8-m15: the high half was spilled to the stack by the
; round helpers and is transposed from there
   5271    mova   [rsp+44*16+r5*8], m0
   5272    mova   [rsp+52*16+r5*8], m2
   5273    mova   [rsp+44*16+t1*8], m1
   5274    mova   [rsp+44*16+t0*8], m3
   5275    mova                 m0, [r3+ 8*16]
   5276    mova                 m2, [r3+ 9*16]
   5277    mova                 m4, [r3+10*16]
   5278    mova                 m6, [r3+11*16]
   5279    call m(idct_8x4_internal_16bpc).transpose4x8packed
   5280    mova   [rsp+12*16+r5*8], m0
   5281    mova   [rsp+20*16+r5*8], m2
   5282    mova   [rsp+12*16+t1*8], m1
   5283    mova   [rsp+12*16+t0*8], m3
   5284 %endif
   5285    pxor                 m7, m7
   5286    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   5287    sub                 r5d, 2
   5288    jge .loop_pass1
   5289 
   5290    ; pass=2
; drop the 9*16 bytes of pass-1 scratch so the staged rows line up with the
; layout m(...8x32...).pass2 expects at rsp+gprsize+3*16
   5291    add                 rsp, 9*16
   5292 %if ARCH_X86_64
   5293    mov                  r6, dstq
   5294 %else
   5295    mov                dstq, [rsp+gprsize*1+67*16]
   5296 %endif
   5297    mov                eobd, [rsp+gprsize*0+67*16]
; pick the lightest 32-point column kernel covering this eob
   5298    cmp                eobd, 44
   5299    jl .load_veryfast
   5300    cmp                eobd, 151
   5301    jl .load_fast
   5302    ; load normal
   5303    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
   5304    jmp .run
   5305 .load_fast:
   5306    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
   5307    jmp .run
   5308 .load_veryfast:
   5309    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
   5310    ; fall-through
; run pass 2 once per 8-pixel-wide half; x86-64 counts r7 from -4 to 0 in
; steps of 2 (dst = r2 + r7*8), x86-32 keeps a counter and the dst pointer
; in the stack slots at r2
   5311 .run:
   5312 %if ARCH_X86_64
   5313    lea                  r2, [dstq+32]
   5314    mov                  r7, -4
   5315 %else
   5316    lea                  r2, [rsp+67*16]
   5317    mov dword [r2+0*gprsize], 2
   5318 %endif
   5319    jmp .loop_pass2_entry
   5320 .loop_pass2:
   5321    mova                 m0, [rsp+16* 3]
   5322 .loop_pass2_entry:
   5323 %if ARCH_X86_32
   5324    mov                dstq, [r2+1*gprsize]
   5325 %endif
   5326    call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
; each .pass2 consumed one 32*16-byte half of the staged data
   5327    add                 rsp, 32*16
   5328 %if ARCH_X86_64
   5329    add                  r7, 2
   5330    lea                dstq, [r2+r7*8]
   5331    jl .loop_pass2
   5332 %if WIN64
   5333    mov                  r7, [rsp+gprsize*1+3*16]
   5334 %endif
   5335 %else
   5336    add dword [r2+1*gprsize], 16
   5337    dec dword [r2+0*gprsize]
   5338    jg .loop_pass2
   5339 %endif
; rsp was advanced by 9*16 + 2*32*16 = 73*16 during pass 2; patch x86inc's
; stack bookkeeping so RET restores the stack correctly
   5340 %assign stack_size (stack_size-73*16)
   5341 %if STACK_ALIGNMENT >= 16
   5342 %assign stack_size_padded (stack_size_padded-73*16)
   5343 %assign stack_offset (stack_offset-73*16)
   5344 %else
   5345 %xdefine rstkm [rsp + stack_size]
   5346 %endif
   5347    RET
; DC-only shortcut: scale dc twice by 181 (the extra multiply accounts for
; the rect2 ratio), then reuse the 16x4 DC write-out with r3d = 32 rows
   5348 .dconly:
   5349    imul                r5d, [cq], 181
   5350    mov                [cq], eobd ; 0
   5351    mov                 r3d, 32
   5352    add                 r5d, 128
   5353    sar                 r5d, 8
   5354    imul                r5d, 181
   5355    add                 rsp, (65+4*ARCH_X86_64)*16
   5356    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
   5357 
   5358 cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
   5359                                         dst, stride, c, eob
   5360 %if ARCH_X86_32
   5361    LEA                  r6, $$
   5362 %endif
   5363    test               eobd, eobd
   5364    jz .dconly
   5365 
   5366    ; remove entirely-zero iterations
   5367 %undef cmp
   5368 %if ARCH_X86_64
   5369    xor                 r5d, r5d
   5370    cmp                eobd, 10
   5371    setge               r5b
   5372 %else
   5373    mov                 r5d, 1
   5374    cmp                eobd, 10
   5375    sbb                 r5d, 0
   5376 %endif
   5377    add                 r5d, r5d
   5378 
   5379    ; actual first pass after skipping all-zero data
   5380 .loop_pass1:
   5381    mova                 m0, [cq+32* 1+r5*8]
   5382    mova                 m1, [cq+32* 7+r5*8]
   5383    mova                 m2, [cq+32* 9+r5*8]
   5384    mova                 m3, [cq+32*15+r5*8]
   5385    mova                 m4, [cq+32*17+r5*8]
   5386    mova                 m5, [cq+32*23+r5*8]
   5387    mova                 m6, [cq+32*25+r5*8]
   5388    mova                 m7, [cq+32*31+r5*8]
   5389 %if ARCH_X86_64
   5390    mova                m11, [o(pd_2048)]
   5391    mova                m12, [o(clip_18b_min)]
   5392 	mova                m13, [o(clip_18b_max)]
   5393 	mova                m14, [o(pd_2896)]
   5394 %endif
        ; r3 = base of the on-stack scratch area; the odd-half helpers below
        ; store their intermediate rows at [r3+16*n]
   5395 	mov                  r3, rsp
   5396 	call .main_oddhalf_part1
        ; in3,in5,in11,in13,in19,in21,in27,in29 for .main_oddhalf_part2
        ; (32-dword row stride; r5*8 selects the current 4-column group)
   5397 	mova                 m0, [cq+32* 3+r5*8]
   5398 	mova                 m1, [cq+32* 5+r5*8]
   5399 	mova                 m2, [cq+32*11+r5*8]
   5400 	mova                 m3, [cq+32*13+r5*8]
   5401 	mova                 m4, [cq+32*19+r5*8]
   5402 	mova                 m5, [cq+32*21+r5*8]
   5403 	mova                 m6, [cq+32*27+r5*8]
   5404 	mova                 m7, [cq+32*29+r5*8]
   5405 	call .main_oddhalf_part2
        ; even rows 2,6,...,30 -> idct16 odd half
   5406 	mova                 m0, [cq+32* 2+r5*8]
   5407 	mova                 m1, [cq+32* 6+r5*8]
   5408 	mova                 m2, [cq+32*10+r5*8]
   5409 	mova                 m3, [cq+32*14+r5*8]
   5410 	mova                 m4, [cq+32*18+r5*8]
   5411 	mova                 m5, [cq+32*22+r5*8]
   5412 	mova                 m6, [cq+32*26+r5*8]
   5413 	mova                 m7, [cq+32*30+r5*8]
        ; skip past the 16 scratch rows used by the 32-point odd half
        ; (x86-32 needs 4 extra temporary rows)
   5414 	add                  r3, 16*(16+4*ARCH_X86_32)
   5415 	call m(idct_16x4_internal_16bpc).main_oddhalf
        ; rows 0,4,...,28 -> idct8 even half
   5416 	mova                 m0, [cq+32* 0+r5*8]
   5417 	mova                 m1, [cq+32* 4+r5*8]
   5418 	mova                 m2, [cq+32* 8+r5*8]
   5419 	mova                 m3, [cq+32*12+r5*8]
   5420 	mova                 m4, [cq+32*16+r5*8]
   5421 	mova                 m5, [cq+32*20+r5*8]
   5422 	mova                 m6, [cq+32*24+r5*8]
   5423 	mova                 m7, [cq+32*28+r5*8]
   5424 	call m(idct_8x4_internal_16bpc).main_pass1
   5425 	call m(idct_8x4_internal_16bpc).round
        ; restore the scratch base and do the final idct32 sumsub + >>2
   5426 	sub                  r3, 16*(16+4*ARCH_X86_32)
   5427 	call .round_dct32
   5428 %if ARCH_X86_64
        ; transpose the packed words and write rows 4..15 back into cq for
        ; pass 2; rows 0..3 stay in m0-m3 (stored below unless last group)
   5429 	call m(idct_8x4_internal_16bpc).transpose4x8packed
   5430 	call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   5431 	mova    [cq+32* 8+r5*8], m8
   5432 	mova    [cq+32* 9+r5*8], m9
   5433 	mova    [cq+32*10+r5*8], m10
   5434 	mova    [cq+32*11+r5*8], m11
   5435 	mova                 m8, [r3+16* 9] ;  8  9
   5436 	mova                m10, [r3+16*11] ; 10 11
   5437 	mova                m12, [r3+16*13] ; 12 13
   5438 	mova                m14, [r3+16*15] ; 14 15
   5439 	call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   5440 	mova    [cq+32* 4+r5*8], m8
   5441 	mova    [cq+32* 5+r5*8], m9
   5442 	mova    [cq+32* 6+r5*8], m10
   5443 	mova    [cq+32* 7+r5*8], m11
   5444 	mova                 m8, [r3+16* 8] ; 24 25
   5445 	mova                m10, [r3+16*10] ; 26 27
   5446 	mova                m12, [r3+16*12] ; 28 29
   5447 	mova                m14, [r3+16*14] ; 30 31
   5448 	call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   5449 	mova    [cq+32*12+r5*8], m8
   5450 	mova    [cq+32*13+r5*8], m9
   5451 	mova    [cq+32*14+r5*8], m10
   5452 	mova    [cq+32*15+r5*8], m11
   5453 %else
        ; x86-32: .round_dct32 left 32-bit rows in scratch; pack adjacent row
        ; pairs to words here, then transpose and store, 8 rows at a time
   5454 	sub                  r3, 8*16
   5455 	mova                 m0, [r3+ 8*16]
   5456 	mova                 m2, [r3+10*16]
   5457 	mova                 m4, [r3+12*16]
   5458 	mova                 m6, [r3+14*16]
   5459 	packssdw             m0, [r3+ 9*16]
   5460 	packssdw             m2, [r3+11*16]
   5461 	packssdw             m4, [r3+13*16]
   5462 	packssdw             m6, [r3+15*16]
   5463 	call m(idct_8x4_internal_16bpc).transpose4x8packed
   5464 	mova    [cq+32* 4+r5*8], m0
   5465 	mova    [cq+32* 5+r5*8], m1
   5466 	mova    [cq+32* 6+r5*8], m2
   5467 	mova    [cq+32* 7+r5*8], m3
   5468 	mova                 m0, [r3+16*16]
   5469 	mova                 m2, [r3+18*16]
   5470 	mova                 m4, [r3+20*16]
   5471 	mova                 m6, [r3+22*16]
   5472 	packssdw             m0, [r3+17*16]
   5473 	packssdw             m2, [r3+19*16]
   5474 	packssdw             m4, [r3+21*16]
   5475 	packssdw             m6, [r3+23*16]
   5476 	call m(idct_8x4_internal_16bpc).transpose4x8packed
   5477 	mova    [cq+32* 8+r5*8], m0
   5478 	mova    [cq+32* 9+r5*8], m1
   5479 	mova    [cq+32*10+r5*8], m2
   5480 	mova    [cq+32*11+r5*8], m3
        ; rows 24..31 live mirrored in scratch, hence the descending offsets
   5481 	mova                 m0, [r3+31*16]
   5482 	mova                 m2, [r3+29*16]
   5483 	mova                 m4, [r3+27*16]
   5484 	mova                 m6, [r3+25*16]
   5485 	packssdw             m0, [r3+30*16]
   5486 	packssdw             m2, [r3+28*16]
   5487 	packssdw             m4, [r3+26*16]
   5488 	packssdw             m6, [r3+24*16]
   5489 	call m(idct_8x4_internal_16bpc).transpose4x8packed
   5490 	mova    [cq+32*12+r5*8], m0
   5491 	mova    [cq+32*13+r5*8], m1
   5492 	mova    [cq+32*14+r5*8], m2
   5493 	mova    [cq+32*15+r5*8], m3
   5494 	mova                 m0, [r3+ 0*16]
   5495 	mova                 m2, [r3+ 2*16]
   5496 	mova                 m4, [r3+ 4*16]
   5497 	mova                 m6, [r3+ 6*16]
   5498 	packssdw             m0, [r3+ 1*16]
   5499 	packssdw             m2, [r3+ 3*16]
   5500 	packssdw             m4, [r3+ 5*16]
   5501 	packssdw             m6, [r3+ 7*16]
   5502 	call m(idct_8x4_internal_16bpc).transpose4x8packed
   5503 %endif
   5504 	pxor                 m7, m7
   5505 	; clear lower half of [cq]
   5506 	REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \
   5507 	                                24, 25, 26, 27, 28, 29, 30, 31
        ; r5d indexes the current 4-column group, counting down by 2;
        ; for the final group (r5 == 0) rows 0-3 stay in m0-m3 for pass 2
   5508 	test                r5d, r5d
   5509 	jz .end_pass1
   5510 	mova    [cq+32* 0+r5*8], m0
   5511 	mova    [cq+32* 1+r5*8], m1
   5512 	mova    [cq+32* 2+r5*8], m2
   5513 	mova    [cq+32* 3+r5*8], m3
   5514 	sub                 r5d, 2
   5515 	jmp .loop_pass1
   5516 .end_pass1:
   5517 
   5518 	; pass=2, we need to call this otherwise the stack pointer has
   5519 	; the wrong offset in the 8-bit code
   5520 	mov                 r4d, 4
   5521 	call m(idct_16x8_internal_16bpc).pass2_main
   5522 	RET
   5523 
        ; First quarter of the idct32 odd half: rotates the 8 odd inputs into
        ; t16..t31 (first stages) and leaves t16a,t17,t18a,t19,t28,t29a,t30,
        ; t31a in scratch rows [r3+16*0..7] for .main_oddhalf_part2.
        ; On x86-64 the caller provides m11=pd_2048, m12/m13=clip_18b_min/max.
        ; Fast entry: only in1/in7/in9/in15 (m0-m3) are nonzero, so each
        ; ITX_MULSUB_2D rotation degenerates into two plain multiplies; the
        ; pd_m* constants carry the negated factor of the sub side.
   5524 .main_oddhalf_part1_fast: ; lower half zero
   5525 	pmulld               m7, m0, [o(pd_4091)]
   5526 	pmulld               m0, [o(pd_201)]
   5527 	pmulld               m4, m3, [o(pd_m2751)]
   5528 %if ARCH_X86_32
        ; x86-32: only 8 xmm regs, so round m0/m7 now and juggle pd_2048
        ; into m3 to match the register layout expected at _fast2
   5529 	pmulld               m3, [o(pd_3035)]
   5530 	mova                 m5, [o(pd_2048)]
   5531 	REPX      {paddd x, m5}, m0, m7
   5532 	REPX      {psrad x, 12}, m0, m7
   5533 	mova          [r3+3*16], m7
   5534 	mova                 m7, m3
   5535 	mova                 m3, m5
   5536 %else
   5537 	pmulld               m3, [o(pd_3035)]
   5538 %endif
   5539 	pmulld               m6, m1, [o(pd_m1380)]
   5540 	pmulld               m1, [o(pd_3857)]
   5541 	pmulld               m5, m2, [o(pd_3703)]
   5542 	pmulld               m2, [o(pd_1751)]
   5543 	jmp .main_oddhalf_part1_fast2
        ; Full entry: all 8 odd inputs in m0-m7, in the order listed below.
   5544 .main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
   5545 %if ARCH_X86_64
   5546 	ITX_MULSUB_2D         0, 7, 8, 9, 10, _,  201, 4091 ; t16a, t31a
   5547 	ITX_MULSUB_2D         6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
   5548 	ITX_MULSUB_2D         2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
   5549 	ITX_MULSUB_2D         4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
   5550 .main_oddhalf_part1_fast2:
        ; round the .12 fixed-point products, then first butterfly stage
   5551 	REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   5552 	REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
   5553 	psubd                m8, m0, m4 ; t17
   5554 	paddd                m0, m4     ; t16
   5555 	psubd                m4, m6, m2 ; t18
   5556 	paddd                m6, m2     ; t19
   5557 	psubd                m2, m1, m5 ; t29
   5558 	paddd                m1, m5     ; t28
   5559 	psubd                m5, m7, m3 ; t30
   5560 	paddd                m7, m3     ; t31
        ; clamp intermediates to the 18-bit signed range
   5561 	REPX    {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
   5562 	REPX    {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
        ; second-stage rotations by (799, 4017)
   5563 	mova                m15, [o(pd_4017)]
   5564 	mova                m10, [o(pd_799)]
   5565 	ITX_MULSUB_2D         5, 8, 3, 9, _, 11, 10, 15    ; t17a, t30a
   5566 	ITX_MULSUB_2D         2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
   5567 	psubd                m3, m0, m6 ; t19a
   5568 	paddd                m0, m6     ; t16a
   5569 	psubd                m6, m7, m1 ; t28a
   5570 	paddd                m7, m1     ; t31a
   5571 	psubd                m1, m5, m4 ; t18
   5572 	paddd                m5, m4     ; t17
   5573 	psubd                m4, m8, m2 ; t29
   5574 	paddd                m8, m2     ; t30
   5575 	REPX    {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
   5576 	REPX    {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
        ; third-stage rotations by (1567, 3784)
   5577 	mova                m15, [o(pd_3784)]
   5578 	mova                m10, [o(pd_1567)]
   5579 	ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
   5580 	ITX_MULSUB_2D         6, 3, 2, 9, _, 11, 10, 15 ; t19,  t28
        ; scratch rows 0-7 = t16a,t17,t18a,t19,t28,t29a,t30,t31a
   5581 	mova          [r3+16*0], m0
   5582 	mova          [r3+16*1], m5
   5583 	mova          [r3+16*2], m4
   5584 	mova          [r3+16*3], m6
   5585 	mova          [r3+16*4], m3
   5586 	mova          [r3+16*5], m1
   5587 	mova          [r3+16*6], m8
   5588 	mova          [r3+16*7], m7
   5589 %else
        ; x86-32 variant: same math as above, but with only m0-m7 available
        ; every helper value is spilled through scratch rows [r3+0..3*16]
   5590 	mova          [r3+0*16], m2
   5591 	mova          [r3+1*16], m3
   5592 	mova          [r3+2*16], m4
   5593 	mova          [r3+3*16], m5
   5594 	mova                  m3, [o(pd_2048)]
   5595 	ITX_MULSUB_2D         0, 7, 2, 4, 5, 3,  201, 4091 ; t16a, t31a
   5596 	ITX_MULSUB_2D         6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a
   5597 	mova                 m4, [r3+2*16]
   5598 	mova                 m5, [r3+3*16]
   5599 	mova          [r3+2*16], m6
   5600 	mova          [r3+3*16], m7
   5601 	mova                 m2, [r3+0*16]
   5602 	mova                 m7, [r3+1*16]
   5603 	mova          [r3+0*16], m0
   5604 	mova          [r3+1*16], m1
   5605 	ITX_MULSUB_2D         2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a
   5606 	ITX_MULSUB_2D         4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a
   5607 	mova                 m0, [r3+0*16]
   5608 	mova                 m1, [r3+1*16]
   5609 	mova                 m6, [r3+2*16]
   5610 .main_oddhalf_part1_fast2:
        ; round and first butterfly (m0/m7 were pre-rounded on the fast path)
   5611 	REPX      {paddd x, m3}, m1, m2, m4, m5, m6, m7
   5612 	REPX      {psrad x, 12}, m1, m2, m4, m5, m6, m7
   5613 	psubd                m3, m0, m4 ; t17
   5614 	mova          [r3+0*16], m3
   5615 	mova                 m3, [r3+3*16]
   5616 	paddd                m0, m4     ; t16
   5617 	psubd                m4, m6, m2 ; t18
   5618 	paddd                m6, m2     ; t19
   5619 	psubd                m2, m1, m5 ; t29
   5620 	paddd                m1, m5     ; t28
   5621 	psubd                m5, m3, m7 ; t30
   5622 	paddd                m7, m3     ; t31
        ; 18-bit clamp, with the spilled t17 clamped through [r3+0*16]
   5623 	mova                 m3, [o(clip_18b_min)]
   5624 	REPX     {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
   5625 	pmaxsd               m3, [r3+0*16]
   5626 	mova          [r3+0*16], m3
   5627 	mova                 m3, [o(clip_18b_max)]
   5628 	REPX     {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
   5629 	pminsd               m3, [r3+0*16]
   5630 	mova          [r3+0*16], m0
   5631 	mova          [r3+1*16], m1
   5632 	mova          [r3+2*16], m6
   5633 	mova          [r3+3*16], m7
   5634 	mova                 m0, [o(pd_2048)]
   5635 	ITX_MULSUB_2D         5, 3, 1, 6, 7, 0,  799, 4017    ; t17a, t30a
   5636 	ITX_MULSUB_2D         2, 4, 1, 6, _, 0,    7, 4017, 4 ; t29a, t18a
   5637 	psubd                m1, m5, m4 ; t18
   5638 	paddd                m5, m4     ; t17
   5639 	psubd                m4, m3, m2 ; t29
   5640 	paddd                m3, m2     ; t30
   5641 	mova                 m0, [r3+0*16]
   5642 	mova                 m2, [r3+1*16]
   5643 	mova                 m6, [r3+2*16]
   5644 	mova                 m7, [r3+3*16]
   5645 	mova          [r3+0*16], m3
   5646 	psubd                m3, m0, m6 ; t19a
   5647 	paddd                m0, m6     ; t16a
   5648 	psubd                m6, m7, m2 ; t28a
   5649 	paddd                m7, m2     ; t31a
   5650 	mova                 m2, [o(clip_18b_min)]
   5651 	REPX     {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
   5652 	pmaxsd               m2, [r3+0*16]
   5653 	mova          [r3+0*16], m2
   5654 	mova                 m2, [o(clip_18b_max)]
   5655 	REPX     {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
   5656 	pminsd               m2, [r3+0*16]
   5657 	mova          [r3+16*0], m0
   5658 	mova          [r3+16*1], m5
   5659 	mova          [r3+16*6], m2
   5660 	mova          [r3+16*7], m7
   5661 	mova                 m7, [o(pd_2048)]
   5662 	ITX_MULSUB_2D         4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a
   5663 	ITX_MULSUB_2D         6, 3, 0, 5, 2, 7,    2, 3784 ; t19,  t28
   5664 	mova          [r3+16*2], m4
   5665 	mova          [r3+16*3], m6
   5666 	mova          [r3+16*4], m3
   5667 	mova          [r3+16*5], m1
   5668 %endif
   5669 	ret
        ; Second quarter of the idct32 odd half: rotates the other 8 odd
        ; inputs into t20..t27, merges with the t16..t31 values that
        ; .main_oddhalf_part1 left in scratch rows 0-7, and writes the
        ; complete odd half to scratch rows [r3+16*0..15].
        ; Fast entry: only in3/in5/in11/in13 (m0-m3) are nonzero, so each
        ; rotation is just two multiplies (pd_m* constants = negated factor).
   5670 .main_oddhalf_part2_fast: ; lower half zero
   5671 	pmulld               m7, m0, [o(pd_m601)]
   5672 	pmulld               m0, [o(pd_4052)]
   5673 	pmulld               m4, m3, [o(pd_3290)]
   5674 %if ARCH_X86_32
        ; x86-32: pre-round m0/m7 and move pd_2048 into m3 to match the
        ; register layout expected at _fast2
   5675 	pmulld               m3, [o(pd_2440)]
   5676 	mova                 m5, [o(pd_2048)]
   5677 	REPX      {paddd x, m5}, m0, m7
   5678 	REPX      {psrad x, 12}, m0, m7
   5679 	mova         [r3+11*16], m7
   5680 	mova                 m7, m3
   5681 	mova                 m3, m5
   5682 %else
   5683 	pmulld               m3, [o(pd_2440)]
   5684 %endif
   5685 	pmulld               m6, m1, [o(pd_3973)]
   5686 	pmulld               m1, [o(pd_995)]
   5687 	pmulld               m5, m2, [o(pd_m2106)]
   5688 	pmulld               m2, [o(pd_3513)]
   5689 	jmp .main_oddhalf_part2_fast2
        ; Full entry: all 8 odd inputs in m0-m7, in the order listed below.
   5690 .main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
   5691 %if ARCH_X86_64
   5692 	ITX_MULSUB_2D         7, 0, 8, 9, 10, _, 4052,  601 ; t23a, t24a
   5693 	ITX_MULSUB_2D         1, 6, 8, 9, 10, _,  995, 3973 ; t20a, t27a
   5694 	ITX_MULSUB_2D         5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
   5695 	ITX_MULSUB_2D         3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
   5696 .main_oddhalf_part2_fast2:
        ; round the .12 fixed-point products, then first butterfly stage
   5697 	REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
   5698 	REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
   5699 	psubd                m8, m0, m4 ; t25
   5700 	paddd                m0, m4     ; t24
   5701 	psubd                m4, m6, m2 ; t26
   5702 	paddd                m6, m2     ; t27
   5703 	psubd                m2, m1, m5 ; t21
   5704 	paddd                m1, m5     ; t20
   5705 	psubd                m5, m7, m3 ; t22
   5706 	paddd                m7, m3     ; t23
        ; clamp intermediates to the 18-bit signed range
   5707 	REPX    {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
   5708 	REPX    {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
        ; second-stage rotations by (3406, 2276)
   5709 	mova                m15, [o(pd_2276)]
   5710 	mova                m10, [o(pd_3406)]
   5711 	ITX_MULSUB_2D         4, 2, 3, 9, _, 11, 10, 15    ; t21a, t26a
   5712 	ITX_MULSUB_2D         8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
   5713 	psubd                m3, m0, m6 ; t27a
   5714 	paddd                m0, m6     ; t24a
   5715 	psubd                m6, m7, m1 ; t20a
   5716 	paddd                m7, m1     ; t23a
   5717 	psubd                m1, m5, m4 ; t21
   5718 	paddd                m5, m4     ; t22
   5719 	psubd                m4, m8, m2 ; t26
   5720 	paddd                m8, m2     ; t25
   5721 	REPX    {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
   5722 	REPX    {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
        ; third-stage rotations by (1567, 3784)
   5723 	mova                m15, [o(pd_3784)]
   5724 	mova                m10, [o(pd_1567)]
   5725 	ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
   5726 	ITX_MULSUB_2D         3, 6, 2, 9, _, 11, 10, 15, 4 ; t27,  t20
        ; merge with part1's results (scratch rows 0-7), clamping each pair
   5727 	mova                 m9, [r3+16*0] ; t16a
   5728 	mova                m10, [r3+16*1] ; t17
   5729 	psubd                m2, m9, m7    ; t23
   5730 	paddd                m9, m7        ; t16
   5731 	psubd                m7, m10, m5   ; t22a
   5732 	paddd               m10, m5        ; t17a
   5733 	REPX    {pmaxsd x, m12}, m9, m10, m2, m7
   5734 	REPX    {pminsd x, m13}, m9, m10, m2, m7
   5735 	mova          [r3+16*0], m9
   5736 	mova          [r3+16*1], m10
   5737 	mova                 m9, [r3+16*2] ; t18a
   5738 	mova                m10, [r3+16*3] ; t19
   5739 	psubd                m5, m9, m1    ; t21
   5740 	paddd                m9, m1        ; t18
   5741 	psubd                m1, m10, m6   ; t20a
   5742 	paddd               m10, m6        ; t19a
   5743 	REPX    {pmaxsd x, m12}, m9, m10, m5, m1
   5744 	REPX    {pminsd x, m13}, m9, m10, m5, m1
   5745 	mova          [r3+16*2], m9
   5746 	mova          [r3+16*3], m10
   5747 	mova                 m9, [r3+16*4] ; t28
   5748 	mova                m10, [r3+16*5] ; t29a
   5749 	psubd                m6, m9, m3    ; t27a
   5750 	paddd                m9, m3        ; t28a
   5751 	psubd                m3, m10, m4   ; t26
   5752 	paddd               m10, m4        ; t29
   5753 	REPX    {pmaxsd x, m12}, m9, m10, m6, m3
   5754 	REPX    {pminsd x, m13}, m9, m10, m6, m3
        ; t20..t27 midpoints: scale by 2896/4096 (~ 1/sqrt(2)) with rounding
   5755 	REPX    {pmulld x, m14}, m6, m3, m1, m5
   5756 	paddd                m6, m11
   5757 	paddd                m3, m11
   5758 	psubd                m4, m6, m1    ; t20
   5759 	paddd                m6, m1        ; t27
   5760 	psubd                m1, m3, m5    ; t21a
   5761 	paddd                m3, m5        ; t26a
   5762 	REPX    {psrad  x, 12 }, m4, m1, m3, m6
   5763 	mova          [r3+16*4], m4
   5764 	mova          [r3+16*5], m1
   5765 	mova                 m4, [r3+16*6] ; t30
   5766 	mova                 m1, [r3+16*7] ; t31a
   5767 	psubd                m5, m4, m8    ; t25a
   5768 	paddd                m4, m8        ; t30a
   5769 	psubd                m8, m1, m0    ; t24
   5770 	paddd                m1, m0        ; t31
   5771 	REPX    {pmaxsd x, m12}, m8, m5, m4, m1
   5772 	REPX    {pminsd x, m13}, m8, m5, m4, m1
   5773 	REPX    {pmulld x, m14}, m5, m8, m7, m2
   5774 	paddd                m5, m11
   5775 	paddd                m8, m11
   5776 	psubd                m0, m5, m7    ; t22
   5777 	paddd                m5, m7        ; t25
   5778 	psubd                m7, m8, m2    ; t23a
   5779 	paddd                m2, m8        ; t24a
   5780 	REPX    {psrad  x, 12 }, m0, m7, m2, m5
        ; final odd half now occupies scratch rows 0-15
   5781 	mova          [r3+16*6], m0
   5782 	mova          [r3+16*7], m7
   5783 	mova          [r3+16*8], m2
   5784 	mova          [r3+16*9], m5
   5785 	mova         [r3+16*10], m3
   5786 	mova         [r3+16*11], m6
   5787 	mova         [r3+16*12], m9
   5788 	mova         [r3+16*13], m10
   5789 	mova         [r3+16*14], m4
   5790 	mova         [r3+16*15], m1
   5791 %else
        ; x86-32 variant: same computation, spilling through scratch rows
        ; 8-15 because only m0-m7 exist
   5792 	mova         [r3+ 8*16], m2
   5793 	mova         [r3+ 9*16], m3
   5794 	mova         [r3+10*16], m4
   5795 	mova         [r3+11*16], m5
   5796 	mova                 m3, [o(pd_2048)]
   5797 	ITX_MULSUB_2D         7, 0, 2, 4, 5, 3, 4052,  601 ; t23a, t24a
   5798 	ITX_MULSUB_2D         1, 6, 2, 4, 5, _,  995, 3973 ; t20a, t27a
   5799 	mova                 m2, [r3+ 8*16]
   5800 	mova                 m4, [r3+10*16]
   5801 	mova                 m5, [r3+11*16]
   5802 	mova         [r3+ 8*16], m0
   5803 	mova         [r3+10*16], m6
   5804 	mova         [r3+11*16], m7
   5805 	mova                 m7, [r3+ 9*16]
   5806 	mova         [r3+ 9*16], m1
   5807 	ITX_MULSUB_2D         5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a
   5808 	ITX_MULSUB_2D         7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a
   5809 	mova                 m0, [r3+ 8*16]
   5810 	mova                 m1, [r3+ 9*16]
   5811 	mova                 m6, [r3+10*16]
   5812 .main_oddhalf_part2_fast2:
        ; round and first butterfly (m0/m7 were pre-rounded on the fast path)
   5813 	REPX      {paddd x, m3}, m1, m2, m7, m4, m5, m6
   5814 	REPX      {psrad x, 12}, m1, m2, m7, m4, m5, m6
   5815 	psubd                m3, m0, m4 ; t25
   5816 	mova         [r3+ 8*16], m3
   5817 	mova                 m3, [r3+11*16]
   5818 	paddd                m0, m4     ; t24
   5819 	psubd                m4, m6, m2 ; t26
   5820 	paddd                m6, m2     ; t27
   5821 	psubd                m2, m1, m5 ; t21
   5822 	paddd                m1, m5     ; t20
   5823 	psubd                m5, m3, m7 ; t22
   5824 	paddd                m7, m3     ; t23
        ; 18-bit clamp, with the spilled t25 clamped through [r3+8*16]
   5825 	mova                 m3, [o(clip_18b_min)]
   5826 	REPX     {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
   5827 	pmaxsd               m3, [r3+ 8*16]
   5828 	mova         [r3+ 8*16], m3
   5829 	mova                 m3, [o(clip_18b_max)]
   5830 	REPX     {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
   5831 	pminsd               m3, [r3+ 8*16]
   5832 	mova         [r3+ 8*16], m0
   5833 	mova         [r3+ 9*16], m1
   5834 	mova         [r3+10*16], m6
   5835 	mova         [r3+11*16], m7
   5836 	mova                 m7, [o(pd_2048)]
   5837 	ITX_MULSUB_2D         4, 2, 0, 1, 6, 7, 3406, 2276    ; t21a, t26a
   5838 	ITX_MULSUB_2D         3, 5, 0, 1, _, 7,    6, 2276, 4 ; t25a, t22a
   5839 	psubd                m1, m5, m4 ; t21
   5840 	paddd                m5, m4     ; t22
   5841 	psubd                m4, m3, m2 ; t26
   5842 	paddd                m3, m2     ; t25
   5843 	mova                 m0, [r3+ 8*16]
   5844 	mova                 m2, [r3+ 9*16]
   5845 	mova                 m6, [r3+10*16]
   5846 	mova                 m7, [r3+11*16]
   5847 	mova         [r3+ 8*16], m3
   5848 	psubd                m3, m0, m6 ; t27a
   5849 	paddd                m0, m6     ; t24a
   5850 	psubd                m6, m7, m2 ; t20a
   5851 	paddd                m7, m2     ; t23a
   5852 	mova                 m2, [o(clip_18b_min)]
   5853 	REPX     {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
   5854 	pmaxsd               m2, [r3+ 8*16]
   5855 	mova         [r3+ 8*16], m2
   5856 	mova                 m2, [o(clip_18b_max)]
   5857 	REPX     {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
   5858 	pminsd               m2, [r3+ 8*16]
   5859 	mova         [r3+ 8*16], m0
   5860 	mova         [r3+ 9*16], m2
   5861 	mova         [r3+14*16], m5
   5862 	mova         [r3+15*16], m7
   5863 	mova                 m0, [o(pd_2048)]
   5864 	ITX_MULSUB_2D         4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a
   5865 	ITX_MULSUB_2D         3, 6, 2, 5, _, 0,    7, 3784, 4 ; t27,  t20
   5866 	mova         [r3+10*16], m3
        ; merge with part1's results (scratch rows 0-7), pair by pair
   5867 	mova                 m0, [o(clip_18b_min)]
   5868 	mova                 m2, [o(clip_18b_max)]
   5869 	mova                 m5, [r3+16*2] ; t18a
   5870 	mova                 m7, [r3+16*3] ; t19
   5871 	psubd                m3, m5, m1    ; t21
   5872 	paddd                m5, m1        ; t18
   5873 	psubd                m1, m7, m6    ; t20a
   5874 	paddd                m7, m6        ; t19a
   5875 	REPX     {pmaxsd x, m0}, m5, m7, m3, m1
   5876 	REPX     {pminsd x, m2}, m5, m7, m3, m1
   5877 	mova          [r3+16*2], m5
   5878 	mova          [r3+16*3], m7
   5879 	mova         [r3+11*16], m3
   5880 	mova                 m3, [r3+10*16]
   5881 	mova                 m5, [r3+16*4] ; t28
   5882 	mova                 m7, [r3+16*5] ; t29a
   5883 	psubd                m6, m5, m3    ; t27a
   5884 	paddd                m5, m3        ; t28a
   5885 	psubd                m3, m7, m4    ; t26
   5886 	paddd                m7, m4        ; t29
   5887 	REPX     {pmaxsd x, m0}, m5, m7, m6, m3
   5888 	REPX     {pminsd x, m2}, m5, m7, m6, m3
   5889 	mova         [r3+16*12], m5
   5890 	mova         [r3+16*13], m7
        ; t20..t27 midpoints: scale by 2896/4096 (~ 1/sqrt(2)) with rounding
   5891 	mova                 m5, [o(pd_2048)]
   5892 	mova                 m7, [o(pd_2896)]
   5893 	mova                 m4, [r3+11*16]
   5894 	REPX     {pmulld x, m7}, m6, m3, m1, m4
   5895 	paddd                m6, m5
   5896 	paddd                m3, m5
   5897 	psubd                m5, m6, m1    ; t20
   5898 	paddd                m6, m1        ; t27
   5899 	psubd                m1, m3, m4    ; t21a
   5900 	paddd                m3, m4        ; t26a
   5901 	REPX     {psrad  x, 12}, m5, m1, m3, m6
   5902 	mova          [r3+16*4], m5
   5903 	mova          [r3+16*5], m1
   5904 	mova         [r3+16*10], m3
   5905 	mova         [r3+16*11], m6
   5906 
   5907 	mova                 m5, [r3+14*16]
   5908 	mova                 m6, [r3+15*16]
   5909 	mova                 m3, [r3+16*0] ; t16a
   5910 	mova                 m4, [r3+16*1] ; t17
   5911 	psubd                m1, m3, m6    ; t23
   5912 	paddd                m3, m6        ; t16
   5913 	psubd                m6, m4, m5    ; t22a
   5914 	paddd                m4, m5        ; t17a
   5915 	REPX     {pmaxsd x, m0}, m3, m4, m1, m6
   5916 	REPX     {pminsd x, m2}, m3, m4, m1, m6
   5917 	mova          [r3+16*0], m3
   5918 	mova          [r3+16*1], m4
   5919 	mova                 m5, [r3+ 8*16]
   5920 	mova                 m3, [r3+ 9*16]
   5921 	mova         [r3+ 8*16], m1
   5922 	mova         [r3+ 9*16], m6
   5923 	mova                 m4, [r3+16*6] ; t30
   5924 	mova                 m1, [r3+16*7] ; t31a
   5925 	psubd                m6, m1, m5    ; t24
   5926 	paddd                m1, m5        ; t31
   5927 	psubd                m5, m4, m3    ; t25a
   5928 	paddd                m4, m3        ; t30a
   5929 	REPX     {pmaxsd x, m0}, m6, m5, m4, m1
   5930 	REPX     {pminsd x, m2}, m6, m5, m4, m1
   5931 	mova         [r3+16*14], m4
   5932 	mova         [r3+16*15], m1
   5933 	mova                 m4, [o(pd_2048)]
   5934 	mova                 m1, [r3+ 9*16]
   5935 	mova                 m2, [r3+ 8*16]
   5936 	REPX     {pmulld x, m7}, m5, m6, m1, m2
   5937 	paddd                m5, m4
   5938 	paddd                m6, m4
   5939 	psubd                m0, m5, m1    ; t22
   5940 	paddd                m5, m1        ; t25
   5941 	psubd                m1, m6, m2    ; t23a
   5942 	paddd                m2, m6        ; t24a
   5943 	REPX     {psrad  x, 12}, m0, m1, m2, m5
        ; final odd half now occupies scratch rows 0-15
   5944 	mova          [r3+16*6], m0
   5945 	mova          [r3+16*7], m1
   5946 	mova          [r3+16*8], m2
   5947 	mova          [r3+16*9], m5
   5948 %endif
   5949 	ret
   5950 
   5951 	; final sumsub for idct16 as well as idct32, plus final downshift
        ; IDCT32_END in/out1, out2-4, tmp, shift(, idx)
        ; %1: register index of the idct16 even-half value for row %1; also
        ;     selects which scratch rows are addressed. On exit m%1 = out row %1.
        ; %2/%3/%4: on exit rows 15-%1, 16+%1 and 31-%1 (see call-site comments
        ;     in .round_dct32). %5: temp register index. %6: final right shift.
        ; Uses m11 (rounding constant), m12/m13 (clip min/max) implicitly.
   5952 %macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx
        ; [r3+16*(23-%1)] holds the matching idct16 odd-half row
   5953 	mova                m%4, [r3+16*(23-%1)]
   5954 	pmaxsd              m%1, m12
   5955 	pminsd              m%1, m13
   5956 	psubd               m%3, m%1, m%4 ; idct16 out15 - n
   5957 	paddd               m%1, m%4      ; idct16 out0  + n
   5958 	pmaxsd              m%1, m12
   5959 	pmaxsd              m%3, m12
   5960 	pminsd              m%1, m13
   5961 	pminsd              m%3, m13
        ; add the rounding bias before the final shift
   5962 	paddd               m%1, m11
   5963 	paddd               m%3, m11
        ; combine with the idct32 odd half stored in scratch rows 0-15
   5964 	mova                m%5, [r3+16*( 0+%1)]
   5965 	mova                m%2, [r3+16*(15-%1)]
   5966 	psubd               m%4, m%1, m%2 ; out31 - n
   5967 	paddd               m%1, m%2      ; out0  + n
   5968 	paddd               m%2, m%3, m%5 ; out15 - n
   5969 	psubd               m%3, m%5      ; out16 + n
   5970 	REPX      {psrad x, %6}, m%1, m%3, m%2, m%4
   5971 %endmacro
   5972 
        ; Final idct32 stage: butterfly the idct16 results against the odd
        ; half in scratch rows 0-15, round (+2) and shift right by 2.
        ; x86-64 additionally packs the 32-bit rows to words here; x86-32
        ; leaves 32-bit rows in scratch for the caller to pack.
   5973 .round_dct32:
   5974 %if ARCH_X86_64
        ; m11 held pd_2048 (set up by the caller); 2048 >> 10 == 2
   5975 	psrld               m11, 10 ; pd_2
   5976 	IDCT32_END            0, 15, 8, 9, 10, 2    ; 0 15 16 31
   5977 	mova         [r3+ 0*16], m6
   5978 	mova         [r3+23*16], m7
   5979 	IDCT32_END            1, 14, 6, 7, 10, 2    ; 1 14 17 30
   5980 	packssdw             m0, m1       ;  0  1
   5981 	packssdw            m14, m15      ; 14 15
   5982 	packssdw             m8, m6       ; 16 17
   5983 	packssdw             m7, m9       ; 30 31
   5984 	mova         [r3+16*15], m14
   5985 	mova         [r3+16*14], m7
   5986 	IDCT32_END            2, 15, 10, 7, 6, 2    ; 2 13 18 29
   5987 	IDCT32_END            3, 14,  1, 9, 6, 2    ; 3 12 19 28
   5988 	packssdw             m2, m3       ;  2  3
   5989 	packssdw            m14, m15      ; 12 13
   5990 	packssdw            m10, m1       ; 18 19
   5991 	packssdw             m9, m7       ; 28 29
   5992 	mova         [r3+16*13], m14
   5993 	mova         [r3+16*12], m9
   5994 	IDCT32_END            4, 15, 1, 7, 6, 2     ; 4 11 20 27
   5995 	IDCT32_END            5, 14, 3, 9, 6, 2     ; 5 10 21 26
   5996 	packssdw             m4, m5       ;  4  5
   5997 	packssdw            m14, m15      ; 10 11
   5998 	packssdw             m1, m3       ; 20 21
   5999 	packssdw             m9, m7       ; 26 27
   6000 	mova         [r3+16*11], m14
   6001 	mova         [r3+16*10], m9
        ; rows 6/7 were parked in scratch at the top; finish them last
   6002 	mova                 m6, [r3+ 0*16]
   6003 	mova                 m7, [r3+23*16]
   6004 	IDCT32_END            6, 15, 14, 5,  3, 2   ; 6 9 22 25
   6005 	IDCT32_END            7, 11,  3, 9, 13, 2   ; 7 8 23 24
   6006 	packssdw             m6, m7       ;  6  7
   6007 	packssdw            m11, m15      ;  8  9
   6008 	packssdw            m14, m3       ; 22 23
   6009 	packssdw             m9, m5       ; 24 25
   6010 	mova          [r3+16*9], m11
   6011 	mova          [r3+16*8], m9
   6012 	mova                m12, m1
   6013 	ret
   6014 %else
        ; x86-32: spill the idct16 even rows to scratch rows 16-23, then run
        ; the same sumsub as IDCT32_END in a loop over mirrored row pairs
   6015 	mova         [r3+16*16], m0
   6016 	mova         [r3+17*16], m1
   6017 	mova         [r3+18*16], m2
   6018 	mova         [r3+19*16], m3
   6019 	mova         [r3+20*16], m4
   6020 	mova         [r3+21*16], m5
   6021 	mova         [r3+22*16], m6
   6022 	mova         [r3+23*16], m7
   6023 	mova                 m1, [o(pd_2)]
   6024 	mova                 m2, [o(clip_18b_min)]
   6025 	mova                 m3, [o(clip_18b_max)]
   6026 
        ; 8 iterations: r3 advances one row per pass while r4 (15*16 down to
        ; 16) keeps [r3+r4] pointing at the mirrored odd-half row
   6027 	mov                  r4, 15*16
   6028 .loop_dct32_end:
   6029 	mova                 m0, [r3+16*16]
   6030 	mova                 m6, [r3+16*24]
   6031 	pmaxsd               m0, m2
   6032 	pminsd               m0, m3
   6033 	psubd                m5, m0, m6 ; idct16 out15 - n
   6034 	paddd                m0, m6     ; idct16 out0  + n
   6035 	pmaxsd               m0, m2
   6036 	pmaxsd               m5, m2
   6037 	pminsd               m0, m3
   6038 	pminsd               m5, m3
        ; +2 rounding bias before the final >>2
   6039 	paddd                m0, m1
   6040 	paddd                m5, m1
   6041 	mova                 m7, [r3]
   6042 	mova                 m4, [r3+r4]
   6043 	psubd                m6, m0, m4 ; out31 - n
   6044 	paddd                m0, m4     ; out0  + n
   6045 	paddd                m4, m5, m7 ; out15 - n
   6046 	psubd                m5, m7     ; out16 + n
   6047 	REPX       {psrad x, 2}, m0, m5, m4, m6
   6048 	mova               [r3], m0
   6049 	mova            [r3+r4], m4
   6050 	mova         [r3+16*16], m5
   6051 	mova         [r3+24*16], m6
   6052 	add                  r3, 16
   6053 	sub                  r4, 32
   6054 	jg .loop_dct32_end
   6055 	ret
   6056 %endif
   6057 
        ; DC-only shortcut: only the dc coefficient is nonzero, so the whole
        ; block collapses to adding one constant to every pixel.
   6058 .dconly:
        ; 181 = round(128*sqrt(2)); scale dc and clear the coefficient
   6059 	imul                r5d, [cq], 181
   6060 	mov                [cq], eobd ; 0
        ; r3d = number of rows to process in .dconly_loop
   6061 	mov                 r3d, 8
   6062 .dconly1:
        ; rounding bias for the >>10
   6063 	add                 r5d, 640
   6064 	sar                 r5d, 10
   6065 .dconly2:
        ; pixel offset = high word of r5d*2896 + 34816, i.e. dc scaled by
        ; 2896/65536 with the rounding folded into the bias
   6066 	imul                r5d, 2896
   6067 	add                 r5d, 34816
   6068 	movd                 m0, r5d
        ; q1111 selects word 1 (the high word of r5d); broadcast to all lanes
   6069 	pshuflw              m0, m0, q1111
   6070 	punpcklqdq           m0, m0
        ; m6 = max pixel value (10 bpc), m5 = 0: clamp bounds after the add
   6071 	mova                 m6, [o(pixel_10bpc_max)]
   6072 	pxor                 m5, m5
   6073 .dconly_loop:
        ; one row = 4x16 bytes = 32 words (pixels); add dc offset and clamp
   6074 	mova                 m1, [dstq+16*0]
   6075 	mova                 m2, [dstq+16*1]
   6076 	mova                 m3, [dstq+16*2]
   6077 	mova                 m4, [dstq+16*3]
   6078 	REPX     {paddw  x, m0}, m1, m2, m3, m4
   6079 	REPX     {pminsw x, m6}, m1, m2, m3, m4
   6080 	REPX     {pmaxsw x, m5}, m1, m2, m3, m4
   6081 	mova        [dstq+16*0], m1
   6082 	mova        [dstq+16*1], m2
   6083 	mova        [dstq+16*2], m3
   6084 	mova        [dstq+16*3], m4
   6085 	add                dstq, strideq
   6086 	dec                 r3d
   6087 	jg .dconly_loop
   6088 	RET
   6089 
   6090 cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
   6091                                         dst, stride, c, eob
        ; 2-D 32x16 inverse DCT + add-to-dst, 16 bpc.
        ; Pass 1: 32-point row transform on 4-column slices, with rect2
        ; (*2896/4096) pre-scaling since the block is non-square; transposed
        ; results are written back into cq. Pass 2: tail-jumps into the shared
        ; 16-point column loop of idct_16x16_internal_16bpc.
        ; r3 = scratch pointer into the stack staging area, r5 = slice index.
   6092    LEA                  r6, base
   6093    test               eobd, eobd
   6094    jz .dconly                      ; DC-only coefficient block
   6095 
   6096    ; remove entirely-zero iterations
   6097 %undef cmp
   6098    mov                 r5d, 8
   6099 .zero_loop:
   6100    sub                 r5d, 2
   6101    cmp                eobw, word [o2(tbl_32x16_2d)+r5]  ; slice all-zero per eob table?
   6102    jl .zero_loop
   6103 
   6104    ; actual first pass after skipping all-zero data
   6105 .loop_pass1:
   6106 %if ARCH_X86_64
        ; x86-64 keeps shared constants live in m11-m14 across helper calls
   6107    mova                m11, [o(pd_2048)]
   6108    mova                m12, [o(clip_18b_min)]
   6109    mova                m13, [o(clip_18b_max)]
   6110    mova                m14, [o(pd_2896)]
   6111 %endif
        ; odd rows 1,7,9,15,17,23,25,31 -> first half of the 32-pt odd part
   6112    mova                 m0, [cq+64* 1+r5*8]
   6113    mova                 m1, [cq+64* 7+r5*8]
   6114    mova                 m2, [cq+64* 9+r5*8]
   6115    mova                 m3, [cq+64*15+r5*8]
   6116    mova                 m4, [cq+64*17+r5*8]
   6117    mova                 m5, [cq+64*23+r5*8]
   6118    mova                 m6, [cq+64*25+r5*8]
   6119    mova                 m7, [cq+64*31+r5*8]
   6120    mov                  r3, rsp
   6121    call m(idct_8x4_internal_16bpc).rect2_mul   ; rect2 scale (pd_2896)
   6122    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
   6123 
        ; odd rows 3,5,11,13,19,21,27,29 -> second half of the 32-pt odd part
   6124    mova                 m0, [cq+64* 3+r5*8]
   6125    mova                 m1, [cq+64* 5+r5*8]
   6126    mova                 m2, [cq+64*11+r5*8]
   6127    mova                 m3, [cq+64*13+r5*8]
   6128    mova                 m4, [cq+64*19+r5*8]
   6129    mova                 m5, [cq+64*21+r5*8]
   6130    mova                 m6, [cq+64*27+r5*8]
   6131    mova                 m7, [cq+64*29+r5*8]
   6132 %if ARCH_X86_32
        ; on x86-32 rect2_mul needs its scratch 8 vectors higher on the stack
   6133    add                  r3, 16*8
   6134 %endif
   6135    call m(idct_8x4_internal_16bpc).rect2_mul
   6136 %if ARCH_X86_32
   6137    sub                  r3, 16*8
   6138 %endif
   6139    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
   6140    add                  r3, 16*(16+4*ARCH_X86_32)  ; advance past odd-half output
   6141 
        ; even rows 2,6,...,30 -> odd half of the embedded 16-pt transform
   6142    mova                 m0, [cq+64* 2+r5*8]
   6143    mova                 m1, [cq+64* 6+r5*8]
   6144    mova                 m2, [cq+64*10+r5*8]
   6145    mova                 m3, [cq+64*14+r5*8]
   6146    mova                 m4, [cq+64*18+r5*8]
   6147    mova                 m5, [cq+64*22+r5*8]
   6148    mova                 m6, [cq+64*26+r5*8]
   6149    mova                 m7, [cq+64*30+r5*8]
   6150    call m(idct_8x4_internal_16bpc).rect2_mul
   6151    call m(idct_16x4_internal_16bpc).main_oddhalf
   6152 
        ; rows 0,4,...,28 -> embedded 8-pt even transform, then final rounding
   6153    mova                 m0, [cq+64* 0+r5*8]
   6154    mova                 m1, [cq+64* 4+r5*8]
   6155    mova                 m2, [cq+64* 8+r5*8]
   6156    mova                 m3, [cq+64*12+r5*8]
   6157    mova                 m4, [cq+64*16+r5*8]
   6158    mova                 m5, [cq+64*20+r5*8]
   6159    mova                 m6, [cq+64*24+r5*8]
   6160    mova                 m7, [cq+64*28+r5*8]
   6161    call m(idct_8x4_internal_16bpc).rect2_mul
   6162    call m(idct_8x4_internal_16bpc).main_pass1
   6163    call m(idct_8x4_internal_16bpc).round
   6164    sub                  r3, 16*(16+4*ARCH_X86_32)  ; back to odd-half output
   6165    call .round_dct32
   6166 
        ; pack to 16-bit, transpose 4x8 groups, and store back into cq in the
        ; layout expected by the pass-2 column transform
   6167 %if ARCH_X86_64
   6168    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6169    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   6170    mova    [cq+64* 8+r5*8], m8
   6171    mova    [cq+64* 9+r5*8], m9
   6172    mova    [cq+64*10+r5*8], m10
   6173    mova    [cq+64*11+r5*8], m11
   6174    mova                 m8, [r3+16* 9] ;  8  9
   6175    mova                m10, [r3+16*11] ; 10 11
   6176    mova                m12, [r3+16*13] ; 12 13
   6177    mova                m14, [r3+16*15] ; 14 15
   6178    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   6179    mova    [cq+64* 4+r5*8], m8
   6180    mova    [cq+64* 5+r5*8], m9
   6181    mova    [cq+64* 6+r5*8], m10
   6182    mova    [cq+64* 7+r5*8], m11
   6183    mova                 m8, [r3+16* 8] ; 24 25
   6184    mova                m10, [r3+16*10] ; 26 27
   6185    mova                m12, [r3+16*12] ; 28 29
   6186    mova                m14, [r3+16*14] ; 30 31
   6187    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   6188    mova    [cq+64*12+r5*8], m8
   6189    mova    [cq+64*13+r5*8], m9
   6190    mova    [cq+64*14+r5*8], m10
   6191    mova    [cq+64*15+r5*8], m11
   6192 %else
        ; x86-32: only 8 xmm regs, so reload each quarter from the stack
        ; staging area, pack 32->16 bit, transpose, store to cq
   6193    sub                  r3, 8*16
   6194    mova                 m0, [r3+ 8*16]
   6195    mova                 m2, [r3+10*16]
   6196    mova                 m4, [r3+12*16]
   6197    mova                 m6, [r3+14*16]
   6198    packssdw             m0, [r3+ 9*16]
   6199    packssdw             m2, [r3+11*16]
   6200    packssdw             m4, [r3+13*16]
   6201    packssdw             m6, [r3+15*16]
   6202    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6203    mova    [cq+64* 4+r5*8], m0
   6204    mova    [cq+64* 5+r5*8], m1
   6205    mova    [cq+64* 6+r5*8], m2
   6206    mova    [cq+64* 7+r5*8], m3
   6207    mova                 m0, [r3+16*16]
   6208    mova                 m2, [r3+18*16]
   6209    mova                 m4, [r3+20*16]
   6210    mova                 m6, [r3+22*16]
   6211    packssdw             m0, [r3+17*16]
   6212    packssdw             m2, [r3+19*16]
   6213    packssdw             m4, [r3+21*16]
   6214    packssdw             m6, [r3+23*16]
   6215    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6216    mova    [cq+64* 8+r5*8], m0
   6217    mova    [cq+64* 9+r5*8], m1
   6218    mova    [cq+64*10+r5*8], m2
   6219    mova    [cq+64*11+r5*8], m3
   6220    mova                 m0, [r3+31*16]
   6221    mova                 m2, [r3+29*16]
   6222    mova                 m4, [r3+27*16]
   6223    mova                 m6, [r3+25*16]
   6224    packssdw             m0, [r3+30*16]
   6225    packssdw             m2, [r3+28*16]
   6226    packssdw             m4, [r3+26*16]
   6227    packssdw             m6, [r3+24*16]
   6228    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6229    mova    [cq+64*12+r5*8], m0
   6230    mova    [cq+64*13+r5*8], m1
   6231    mova    [cq+64*14+r5*8], m2
   6232    mova    [cq+64*15+r5*8], m3
   6233    mova                 m0, [r3+ 0*16]
   6234    mova                 m2, [r3+ 2*16]
   6235    mova                 m4, [r3+ 4*16]
   6236    mova                 m6, [r3+ 6*16]
   6237    packssdw             m0, [r3+ 1*16]
   6238    packssdw             m2, [r3+ 3*16]
   6239    packssdw             m4, [r3+ 5*16]
   6240    packssdw             m6, [r3+ 7*16]
   6241    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6242 %endif
   6243    mova    [cq+64* 0+r5*8], m0
   6244    mova    [cq+64* 1+r5*8], m1
   6245    mova    [cq+64* 2+r5*8], m2
   6246    mova    [cq+64* 3+r5*8], m3
        ; clear the consumed upper-half coefficients for this slice
   6247    pxor                 m0, m0
   6248    REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
   6249                                    24, 25, 26, 27, 28, 29, 30, 31
   6250    sub                 r5d, 2
   6251    jge .loop_pass1
   6252 
   6253    ; pass=2, we need to call this otherwise the stack pointer has
   6254    ; the wrong offset in the 8-bit code
   6255    call .pass2
   6256    RET
   6257 
   6258 .pass2:
        ; set up rounding/clip constants and dst bookkeeping, then reuse the
        ; 16x16 column-pass loop (4 iterations of 8 columns each)
   6259 %if ARCH_X86_64
   6260    mova                 m8, [o(pw_2048)]
   6261    pxor                 m9, m9
   6262    mova                m10, [o(pixel_10bpc_max)]
   6263 %if WIN64
   6264    mov [rsp+16*16+gprsize], r7     ; r7 is callee-saved on Win64
   6265 %endif
   6266    mov                  r7, dstq
   6267 %else
   6268    mov [rsp+2*gprsize+16*16], dstq
   6269 %endif
   6270    lea                  r3, [strideq*3]
   6271    mov                 r4d, 4
   6272    jmp m(idct_16x16_internal_16bpc).loop_pass2
   6273 
   6274 .round_dct32:
        ; combine the 16-pt even output with the 32-pt odd half and apply the
        ; pass-1 rounding (+1 >> 1); results land in the stack staging area
        ; (and, on x86-64, partly stay in registers already 16-bit packed)
   6275 %if ARCH_X86_64
   6276    psrld               m11, 11 ; pd_1
   6277    IDCT32_END            0, 15, 8, 9, 10, 1    ; 0 15 16 31
   6278    mova         [r3+ 0*16], m6
   6279    mova         [r3+23*16], m7
   6280    IDCT32_END            1, 14, 6, 7, 10, 1    ; 1 14 17 30
   6281    packssdw             m0, m1       ;  0  1
   6282    packssdw            m14, m15      ; 14 15
   6283    packssdw             m8, m6       ; 16 17
   6284    packssdw             m7, m9       ; 30 31
   6285    mova         [r3+16*15], m14
   6286    mova         [r3+16*14], m7
   6287    IDCT32_END            2, 15, 10, 7, 6, 1    ; 2 13 18 29
   6288    IDCT32_END            3, 14,  1, 9, 6, 1    ; 3 12 19 28
   6289    packssdw             m2, m3       ;  2  3
   6290    packssdw            m14, m15      ; 12 13
   6291    packssdw            m10, m1       ; 18 19
   6292    packssdw             m9, m7       ; 28 29
   6293    mova         [r3+16*13], m14
   6294    mova         [r3+16*12], m9
   6295    IDCT32_END            4, 15, 1, 7, 6, 1     ; 4 11 20 27
   6296    IDCT32_END            5, 14, 3, 9, 6, 1     ; 5 10 21 26
   6297    packssdw             m4, m5       ;  4  5
   6298    packssdw            m14, m15      ; 10 11
   6299    packssdw             m1, m3       ; 20 21
   6300    packssdw             m9, m7       ; 26 27
   6301    mova         [r3+16*11], m14
   6302    mova         [r3+16*10], m9
   6303    mova                 m6, [r3+ 0*16]
   6304    mova                 m7, [r3+23*16]
   6305    IDCT32_END            6, 15, 14, 5,  3, 1   ; 6 9 22 25
   6306    IDCT32_END            7, 11,  3, 9, 13, 1   ; 7 8 23 24
   6307    packssdw             m6, m7       ;  6  7
   6308    packssdw            m11, m15      ;  8  9
   6309    packssdw            m14, m3       ; 22 23
   6310    packssdw             m9, m5       ; 24 25
   6311    mova          [r3+16*9], m11
   6312    mova          [r3+16*8], m9
   6313    mova                m12, m1
   6314    ret
   6315 %else
        ; x86-32 variant: spill the even outputs, then fold in the odd half
        ; one butterfly per loop iteration, clipping to 18 bits as we go
   6316    mova         [r3+16*16], m0
   6317    mova         [r3+17*16], m1
   6318    mova         [r3+18*16], m2
   6319    mova         [r3+19*16], m3
   6320    mova         [r3+20*16], m4
   6321    mova         [r3+21*16], m5
   6322    mova         [r3+22*16], m6
   6323    mova         [r3+23*16], m7
   6324    pcmpeqd              m1, m1     ; -1
   6325    mova                 m2, [o(clip_18b_min)]
   6326    mova                 m3, [o(clip_18b_max)]
   6327 
   6328    mov                  r4, 15*16
   6329 .loop_dct32_end:
   6330    mova                 m0, [r3+16*16]
   6331    mova                 m6, [r3+16*24]
   6332    psubd                m5, m0, m6 ; idct16 out15 - n
   6333    paddd                m0, m6     ; idct16 out0  + n
   6334    pmaxsd               m0, m2
   6335    pmaxsd               m5, m2
   6336    pminsd               m0, m3
   6337    pminsd               m5, m3
   6338    psubd                m0, m1     ; +1 (rounding before the >>1 below)
   6339    psubd                m5, m1
   6340    mova                 m7, [r3]
   6341    mova                 m4, [r3+r4]
   6342    psubd                m6, m0, m4 ; out31 - n
   6343    paddd                m0, m4     ; out0  + n
   6344    paddd                m4, m5, m7 ; out15 - n
   6345    psubd                m5, m7     ; out16 + n
   6346    REPX       {psrad x, 1}, m0, m5, m4, m6
   6347    mova               [r3], m0
   6348    mova            [r3+r4], m4
   6349    mova         [r3+16*16], m5
   6350    mova         [r3+24*16], m6
   6351    add                  r3, 16
   6352    sub                  r4, 32
   6353    jg .loop_dct32_end
   6354    ret
   6355 %endif
   6356 
   6357 .dconly:
        ; DC-only path: scale the single DC coefficient (two *181 rounds
        ; cover the rect2 and pass scaling), then defer to the shared 32-wide
        ; broadcast-add in the 32x8 function
   6358    imul                r5d, [cq], 181
   6359    mov                [cq], eobd ; 0
   6360    mov                 r3d, 16
   6361    add                 r5d, 128
   6362    sar                 r5d, 8
   6363    imul                r5d, 181
   6364    add                 r5d, 384
   6365    sar                 r5d, 9
   6366    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
   6367 
   6368 cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
   6369                                          dst, stride, c, eob
        ; 2-D 32x32 inverse DCT + add-to-dst, 16 bpc.
        ; Pass 1: 32-point row transform on 4-column slices; transposed output
        ; is scattered into a 4*32*16-byte stack staging buffer using
        ; tbl_Nx32_odd_offset (t0/t1 hold the scattered positions for the odd
        ; output rows). Pass 2: tail-jumps into the 16x32 column-pass loop,
        ; selecting a normal/fast/veryfast 8bpc 8x32 kernel by eob.
   6370    LEA                  r6, base
   6371    test               eobd, eobd
   6372    jz .dconly                      ; DC-only coefficient block
   6373 
   6374    ; remove entirely-zero iterations
   6375 %if ARCH_X86_32
   6376    mov [rsp+5*32*16+1*gprsize], dstq
   6377 %elif WIN64
   6378    mov [rsp+5*32*16+1*gprsize], r7 ; r7 is callee-saved on Win64
   6379 %endif
   6380 %undef cmp
   6381    mov                 r5d, 14
   6382    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
   6383    jge .end_zero_loop
   6384    pxor                 m0, m0
   6385 .zero_loop:
        ; for slices the eob proves are all-zero, pre-clear their staging
        ; slots (all four 32x16 quarters) instead of transforming them
   6386    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
   6387    movzx               t1d, t0b
   6388    shr                 t0d, 8
   6389    mova   [rsp+32*16+r5*8+0*32*16], m0
   6390    mova   [rsp+40*16+r5*8+0*32*16], m0
   6391    mova   [rsp+32*16+t0*8+0*32*16], m0
   6392    mova   [rsp+32*16+t1*8+0*32*16], m0
   6393    mova   [rsp+32*16+r5*8+1*32*16], m0
   6394    mova   [rsp+40*16+r5*8+1*32*16], m0
   6395    mova   [rsp+32*16+t0*8+1*32*16], m0
   6396    mova   [rsp+32*16+t1*8+1*32*16], m0
   6397    mova   [rsp+32*16+r5*8+2*32*16], m0
   6398    mova   [rsp+40*16+r5*8+2*32*16], m0
   6399    mova   [rsp+32*16+t0*8+2*32*16], m0
   6400    mova   [rsp+32*16+t1*8+2*32*16], m0
   6401    mova   [rsp+32*16+r5*8+3*32*16], m0
   6402    mova   [rsp+40*16+r5*8+3*32*16], m0
   6403    mova   [rsp+32*16+t0*8+3*32*16], m0
   6404    mova   [rsp+32*16+t1*8+3*32*16], m0
   6405    sub                 r5d, 2
   6406    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
   6407    jl .zero_loop
   6408 .end_zero_loop:
   6409 
   6410    ; actual first pass after skipping all-zero data
   6411    mov [rsp+gprsize*0+5*32*16], eobd  ; save eob for pass-2 kernel choice
   6412 .loop_pass1:
        ; odd rows 1,7,9,15,17,23,25,31 -> first half of the 32-pt odd part
   6413    mova                 m0, [cq+128* 1+r5*8]
   6414    mova                 m1, [cq+128* 7+r5*8]
   6415    mova                 m2, [cq+128* 9+r5*8]
   6416    mova                 m3, [cq+128*15+r5*8]
   6417    mova                 m4, [cq+128*17+r5*8]
   6418    mova                 m5, [cq+128*23+r5*8]
   6419    mova                 m6, [cq+128*25+r5*8]
   6420    mova                 m7, [cq+128*31+r5*8]
   6421 %if ARCH_X86_64
   6422    mova                m11, [o(pd_2048)]
   6423    mova                m12, [o(clip_18b_min)]
   6424    mova                m13, [o(clip_18b_max)]
   6425    mova                m14, [o(pd_2896)]
   6426 %endif
   6427    mov                  r3, rsp
   6428    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
        ; odd rows 3,5,11,13,19,21,27,29 -> second half of the 32-pt odd part
   6429    mova                 m0, [cq+128* 3+r5*8]
   6430    mova                 m1, [cq+128* 5+r5*8]
   6431    mova                 m2, [cq+128*11+r5*8]
   6432    mova                 m3, [cq+128*13+r5*8]
   6433    mova                 m4, [cq+128*19+r5*8]
   6434    mova                 m5, [cq+128*21+r5*8]
   6435    mova                 m6, [cq+128*27+r5*8]
   6436    mova                 m7, [cq+128*29+r5*8]
   6437    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
        ; even rows 2,6,...,30 -> odd half of the embedded 16-pt transform
   6438    mova                 m0, [cq+128* 2+r5*8]
   6439    mova                 m1, [cq+128* 6+r5*8]
   6440    mova                 m2, [cq+128*10+r5*8]
   6441    mova                 m3, [cq+128*14+r5*8]
   6442    mova                 m4, [cq+128*18+r5*8]
   6443    mova                 m5, [cq+128*22+r5*8]
   6444    mova                 m6, [cq+128*26+r5*8]
   6445    mova                 m7, [cq+128*30+r5*8]
   6446    add                  r3, 16*(16+4*ARCH_X86_32)
   6447    call m(idct_16x4_internal_16bpc).main_oddhalf
        ; rows 0,4,...,28 -> embedded 8-pt even transform, then round
   6448    mova                 m0, [cq+128* 0+r5*8]
   6449    mova                 m1, [cq+128* 4+r5*8]
   6450    mova                 m2, [cq+128* 8+r5*8]
   6451    mova                 m3, [cq+128*12+r5*8]
   6452    mova                 m4, [cq+128*16+r5*8]
   6453    mova                 m5, [cq+128*20+r5*8]
   6454    mova                 m6, [cq+128*24+r5*8]
   6455    mova                 m7, [cq+128*28+r5*8]
   6456    call m(idct_8x4_internal_16bpc).main_pass1
   6457    call m(idct_8x4_internal_16bpc).round
   6458    sub                  r3, 16*(16+4*ARCH_X86_32)
   6459    call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32
        ; t1 = scattered slot for odd row, t0 = its pair (packed in one word)
   6460    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
   6461    movzx               t1d, t0b
   6462    shr                 t0d, 8
   6463 %if ARCH_X86_64
   6464    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6465    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   6466    mova   [rsp+32*16+r5*8+2*32*16], m8
   6467    mova   [rsp+40*16+r5*8+2*32*16], m10
   6468    mova   [rsp+32*16+t1*8+2*32*16], m9
   6469    mova   [rsp+32*16+t0*8+2*32*16], m11
   6470    mova                 m8, [r3+16* 9] ;  8  9
   6471    mova                m10, [r3+16*11] ; 10 11
   6472    mova                m12, [r3+16*13] ; 12 13
   6473    mova                m14, [r3+16*15] ; 14 15
   6474    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   6475    mova   [rsp+32*16+r5*8+1*32*16], m8
   6476    mova   [rsp+40*16+r5*8+1*32*16], m10
   6477    mova   [rsp+32*16+t1*8+1*32*16], m9
   6478    mova   [rsp+32*16+t0*8+1*32*16], m11
   6479    mova                 m8, [r3+16* 8] ; 24 25
   6480    mova                m10, [r3+16*10] ; 26 27
   6481    mova                m12, [r3+16*12] ; 28 29
   6482    mova                m14, [r3+16*14] ; 30 31
   6483    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   6484    mova   [rsp+32*16+r5*8+3*32*16], m8
   6485    mova   [rsp+40*16+r5*8+3*32*16], m10
   6486    mova   [rsp+32*16+t1*8+3*32*16], m9
   6487    mova   [rsp+32*16+t0*8+3*32*16], m11
   6488 %else
        ; x86-32: reload each quarter from the staging area, pack 32->16 bit,
        ; transpose, and scatter into the pass-2 buffer
   6489    sub                  r3, 8*16
   6490    mova                 m0, [r3+ 8*16]
   6491    mova                 m2, [r3+10*16]
   6492    mova                 m4, [r3+12*16]
   6493    mova                 m6, [r3+14*16]
   6494    packssdw             m0, [r3+ 9*16]
   6495    packssdw             m2, [r3+11*16]
   6496    packssdw             m4, [r3+13*16]
   6497    packssdw             m6, [r3+15*16]
   6498    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6499    mova   [rsp+32*16+r5*8+1*32*16], m0
   6500    mova   [rsp+40*16+r5*8+1*32*16], m2
   6501    mova   [rsp+32*16+t1*8+1*32*16], m1
   6502    mova   [rsp+32*16+t0*8+1*32*16], m3
   6503    mova                 m0, [r3+16*16]
   6504    mova                 m2, [r3+18*16]
   6505    mova                 m4, [r3+20*16]
   6506    mova                 m6, [r3+22*16]
   6507    packssdw             m0, [r3+17*16]
   6508    packssdw             m2, [r3+19*16]
   6509    packssdw             m4, [r3+21*16]
   6510    packssdw             m6, [r3+23*16]
   6511    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6512    mova   [rsp+32*16+r5*8+2*32*16], m0
   6513    mova   [rsp+40*16+r5*8+2*32*16], m2
   6514    mova   [rsp+32*16+t1*8+2*32*16], m1
   6515    mova   [rsp+32*16+t0*8+2*32*16], m3
   6516    mova                 m0, [r3+31*16]
   6517    mova                 m2, [r3+29*16]
   6518    mova                 m4, [r3+27*16]
   6519    mova                 m6, [r3+25*16]
   6520    packssdw             m0, [r3+30*16]
   6521    packssdw             m2, [r3+28*16]
   6522    packssdw             m4, [r3+26*16]
   6523    packssdw             m6, [r3+24*16]
   6524    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6525    mova   [rsp+32*16+r5*8+3*32*16], m0
   6526    mova   [rsp+40*16+r5*8+3*32*16], m2
   6527    mova   [rsp+32*16+t1*8+3*32*16], m1
   6528    mova   [rsp+32*16+t0*8+3*32*16], m3
   6529    mova                 m0, [r3+ 0*16]
   6530    mova                 m2, [r3+ 2*16]
   6531    mova                 m4, [r3+ 4*16]
   6532    mova                 m6, [r3+ 6*16]
   6533    packssdw             m0, [r3+ 1*16]
   6534    packssdw             m2, [r3+ 3*16]
   6535    packssdw             m4, [r3+ 5*16]
   6536    packssdw             m6, [r3+ 7*16]
   6537    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6538 %endif
   6539    pxor                 m7, m7
   6540    ; clear lower half of [cq]
   6541    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
   6542                                     8, 9, 10, 11, 12, 13, 14, 15, \
   6543                                     16, 17, 18, 19, 20, 21, 22, 23, \
   6544                                     24, 25, 26, 27, 28, 29, 30, 31
   6545    mova   [rsp+32*16+r5*8+0*32*16], m0
   6546    mova   [rsp+40*16+r5*8+0*32*16], m2
   6547    mova   [rsp+32*16+t1*8+0*32*16], m1
   6548    mova   [rsp+32*16+t0*8+0*32*16], m3
   6549    sub                 r5d, 2
   6550    jge .loop_pass1
   6551 
   6552    ; pass=2 code starts here
        ; pick the cheapest 8bpc 8x32 column kernel the eob allows
   6553    mov                eobd, [rsp+gprsize*0+5*32*16]
   6554    add                 rsp, 29*16
   6555    cmp                eobd, 36
   6556    jl .load_veryfast
   6557    cmp                eobd, 136
   6558    jl .load_fast
   6559    ; load normal
   6560    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
   6561    jmp .run
   6562 .load_fast:
   6563    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
   6564    jmp .run
   6565 .load_veryfast:
   6566    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
   6567    ; fall-through
   6568 .run:
   6569 %if ARCH_X86_64
   6570    lea                  r2, [dstq+64]
   6571    mov                  r7, -8
   6572 %else
   6573    lea                  r2, [rsp+(4*32+3)*16]
   6574    mov dword [r2+0*gprsize], 4     ; x86-32: loop counter lives on stack
   6575 %endif
   6576    jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
   6577 
   6578 .dconly:
        ; DC-only path: scale DC once here, then reuse the 32-wide
        ; broadcast-add; rsp is adjusted to match the 32x8 stack layout
   6579    imul                r5d, [cq], 181
   6580    mov                [cq], eobd ; 0
   6581    mov                 r3d, 32
   6582    add                 rsp, (5*32+1-(24+8*ARCH_X86_32))*16
   6583    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1
   6584 
   6585 cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
   6586                                          0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \
   6587                                          dst, stride, c, eob
        ; 2-D 16x64 inverse DCT + add-to-dst, 16 bpc.
        ; Pass 1: 16-point row transform on 4-column slices, scattered into a
        ; 2*64*16-byte stack staging buffer via tbl_Nx64_offset (t0..t3 hold
        ; the four scattered row positions per slice). Pass 2: the 64-point
        ; column transform is built from 8bpc idct_8x8 + idct_16x8 mains plus
        ; eob-selected idct_8x32/idct_16x64 kernels, then written out as
        ; eight rounded 8x8 tiles per column group.
   6588    LEA                  r6, base
   6589    test               eobd, eobd
   6590    jz .dconly                      ; DC-only coefficient block
   6591 
   6592 %if ARCH_X86_32
        ; remap temp regs; x86-32 must spill dst/stride/cq since t-regs alias them
   6593    DECLARE_REG_TMP 4, 1, 2, 0
   6594    mov [rsp+gprsize*1+(64*2+12)*16], r0
   6595    mov [rsp+gprsize*2+(64*2+12)*16], r1
   6596    mov [rsp+gprsize*3+(64*2+12)*16], r2
   6597 %else
   6598    DECLARE_REG_TMP 8, 9, 4, 7
   6599    mov [rsp+gprsize*1+(64*2+12)*16], r9
   6600 %if WIN64
   6601    mov [rsp+gprsize*2+(64*2+12)*16], r7 ; r7/r8 are callee-saved on Win64
   6602    mov [rsp+gprsize*3+(64*2+12)*16], r8
   6603 %endif
   6604 %endif
   6605 %undef cmp
   6606    ; remove entirely-zero iterations
   6607    mov                 r5d, 7*2
   6608    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
   6609    jge .end_zero_loop
   6610    pxor                 m0, m0
   6611 .zero_loop:
        ; pre-clear the staging slots of slices the eob proves are all-zero
   6612    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
   6613    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
   6614    movzx               t0d, t1b
   6615    movzx               t2d, t3b
   6616    shr                 t1d, 8
   6617    shr                 t3d, 8
   6618    mova   [rsp+12*16+t0*8], m0
   6619    mova   [rsp+12*16+t1*8], m0
   6620    mova   [rsp+12*16+t2*8], m0
   6621    mova   [rsp+12*16+t3*8], m0
   6622    mova   [rsp+76*16+t0*8], m0
   6623    mova   [rsp+76*16+t1*8], m0
   6624    mova   [rsp+76*16+t2*8], m0
   6625    mova   [rsp+76*16+t3*8], m0
   6626    sub                 r5d, 2
   6627    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
   6628    jl .zero_loop
   6629 .end_zero_loop:
   6630    ; actual first pass after skipping all-zero data
   6631    mov [rsp+gprsize*0+(64*2+12)*16], eobd  ; save eob for pass-2 choice
   6632    mov                  r3, rsp
   6633 %if ARCH_X86_32
   6634    DECLARE_REG_TMP 4, 1, 6, 0
   6635    mov                  r2, [rsp+gprsize*3+(64*2+12)*16]
   6636    mov [rsp+gprsize*3+(64*2+12)*16], r6  ; r6 (base) swapped out for cq
   6637 %endif
   6638 .loop_pass1:
   6639 %if ARCH_X86_64
   6640    mova                m11, [o(pd_2048)]
   6641    mova                m12, [o(clip_18b_min)]
   6642    mova                m13, [o(clip_18b_max)]
   6643    mova                m14, [o(pd_2896)]
   6644 %endif
        ; odd rows -> 16-pt odd half
   6645    mova                 m0, [cq+ 1*128+r5*8]
   6646    mova                 m1, [cq+ 3*128+r5*8]
   6647    mova                 m2, [cq+ 5*128+r5*8]
   6648    mova                 m3, [cq+ 7*128+r5*8]
   6649    mova                 m4, [cq+ 9*128+r5*8]
   6650    mova                 m5, [cq+11*128+r5*8]
   6651    mova                 m6, [cq+13*128+r5*8]
   6652    mova                 m7, [cq+15*128+r5*8]
   6653    call m(idct_16x4_internal_16bpc).main_oddhalf
   6654 
        ; even rows -> 8-pt even half, then full 16-pt round
   6655    mova                 m0, [cq+ 0*128+r5*8]
   6656    mova                 m1, [cq+ 2*128+r5*8]
   6657    mova                 m2, [cq+ 4*128+r5*8]
   6658    mova                 m3, [cq+ 6*128+r5*8]
   6659    mova                 m4, [cq+ 8*128+r5*8]
   6660    mova                 m5, [cq+10*128+r5*8]
   6661    mova                 m6, [cq+12*128+r5*8]
   6662    mova                 m7, [cq+14*128+r5*8]
   6663    call m(idct_8x4_internal_16bpc).main_pass1
   6664    call m(idct_8x4_internal_16bpc).round
   6665    call m(idct_16x16_internal_16bpc).round
   6666 %if ARCH_X86_64
   6667    packssdw             m0, m1
   6668    packssdw             m2, m3
   6669    packssdw             m4, m5
   6670    packssdw             m6, m7
   6671    packssdw             m8, m9
   6672    packssdw            m10, m11
   6673    packssdw            m12, m13
   6674    packssdw            m14, m15
   6675 %endif
   6676    call m(idct_8x4_internal_16bpc).transpose4x8packed
        ; t0..t3 = scattered destination rows for this slice (two per word)
   6677    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
   6678    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
   6679    movzx               t0d, t1b
   6680    movzx               t2d, t3b
   6681    shr                 t1d, 8
   6682    shr                 t3d, 8
   6683 %if ARCH_X86_64
   6684    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   6685    mova   [rsp+76*16+t0*8], m8
   6686    mova   [rsp+76*16+t1*8], m9
   6687    mova   [rsp+76*16+t2*8], m10
   6688    mova   [rsp+76*16+t3*8], m11
   6689 %else
   6690    mova   [rsp+76*16+t0*8], m0
   6691    mova   [rsp+76*16+t1*8], m1
   6692    mova   [rsp+76*16+t2*8], m2
   6693    mova   [rsp+76*16+t3*8], m3
   6694    mova                 m0, [rsp+ 8*16]
   6695    mova                 m2, [rsp+ 9*16]
   6696    mova                 m4, [rsp+10*16]
   6697    mova                 m6, [rsp+11*16]
   6698    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6699 %endif
   6700    mova   [rsp+12*16+t0*8], m0
   6701    mova   [rsp+12*16+t1*8], m1
   6702    mova   [rsp+12*16+t2*8], m2
   6703    mova   [rsp+12*16+t3*8], m3
   6704 %if ARCH_X86_32
   6705    mov                  r6, [rsp+gprsize*3+(64*2+12)*16]  ; restore base
   6706 %endif
   6707    pxor                 m7, m7
   6708    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   6709    sub                 r5d, 2
   6710    jge .loop_pass1
   6711 
   6712    ; pass=2
        ; pick the 8bpc 8x32/16x64 kernel pair by eob (>=151 -> full precision)
   6713    mov                eobd, [rsp+gprsize*0+(64*2+12)*16]
   6714    cmp                eobd, 151
   6715    jl .fast
   6716    ; fall-through
   6717 %if ARCH_X86_64
   6718    DECLARE_REG_TMP 8, 9
   6719 %else
   6720    DECLARE_REG_TMP 1, 5
   6721 %endif
   6722    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
   6723    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
   6724    jmp .run
   6725 .fast:
   6726    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
   6727    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
   6728 .run:
   6729    add                 rsp, 9*16
   6730 
   6731 %if ARCH_X86_64
   6732    lea                  r2, [dstq+32]
   6733    mov                  r7, -4          ; two column groups: r7 = -4, -2
   6734 %else
   6735    lea                  r2, [rsp+(64*2+3)*16]
        ; x86-32: kernel pointers and loop state live in the stack block at r2
   6736    mov      [r2+4*gprsize], t0
   6737    mov      [r2+5*gprsize], t1
   6738    mov                  r1, [r2+2*gprsize]
   6739    mov dword [r2+0*gprsize], 2
   6740 %endif
   6741 .loop_pass2:
   6742 %if ARCH_X86_32
   6743    mov                dstq, [r2+1*gprsize]
   6744 %endif
   6745    call .pass2
   6746    add                 rsp, 64*16      ; consume this column group's buffer
   6747 %if ARCH_X86_64
   6748    add                  r7, 2
   6749    lea                dstq, [r2+r7*8]
   6750    jl .loop_pass2
   6751 %else
   6752    add dword [r2+1*gprsize], 16
   6753    dec dword [r2+0*gprsize]
   6754    jg .loop_pass2
   6755 %endif
        ; re-sync x86inc's stack bookkeeping after the manual rsp adjustments
   6756 %assign stack_size (stack_size-(64*2+9)*16)
   6757 %if STACK_ALIGNMENT >= 16
   6758 %assign stack_size_padded (stack_size_padded-(64*2+9)*16)
   6759 %assign stack_offset (stack_offset-(64*2+9)*16)
   6760 %else
   6761 %xdefine rstkm [rsp + stack_size]
   6762 %endif
   6763 %if ARCH_X86_64
   6764    mov                  r9, [rsp+gprsize*1+3*16]  ; restore saved GPRs
   6765 %if WIN64
   6766    mov                  r7, [rsp+gprsize*2+3*16]
   6767    mov                  r8, [rsp+gprsize*3+3*16]
   6768 %endif
   6769 %endif
   6770    RET
   6771 
   6772 .pass2:
        ; one 8-column group of the 64-point column transform:
        ; 8bpc idct_8x8 main on rows 0..7, idct_16x8 main on rows 8..15,
        ; then the eob-selected 8x32 kernel (t0/r8) and 16x64 kernel (t1/r9),
        ; finally round and add eight 8x8 tiles to dst
   6773 %if ARCH_X86_32
   6774    lea                  r5, [o(itx8_start)]
   6775 %endif
   6776    mova                 m0, [rsp+gprsize+16* 3]
   6777    mova                 m1, [rsp+gprsize+16* 4]
   6778    mova                 m2, [rsp+gprsize+16* 5]
   6779    mova                 m3, [rsp+gprsize+16* 6]
   6780    pxor                 m4, m4
   6781    REPX       {mova x, m4}, m5, m6, m7   ; upper inputs are zero here
   6782    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
   6783    mova [rsp+gprsize+ 3*16], m0
   6784    mova [rsp+gprsize+ 4*16], m1
   6785    mova [rsp+gprsize+ 5*16], m2
   6786    mova [rsp+gprsize+ 6*16], m3
   6787    mova [rsp+gprsize+ 7*16], m4
   6788    mova [rsp+gprsize+ 8*16], m5
   6789    mova [rsp+gprsize+ 9*16], m6
   6790    mova [rsp+gprsize+10*16], m7
   6791    mova                 m0, [rsp+gprsize+16*11]
   6792    mova                 m1, [rsp+gprsize+16*12]
   6793    mova                 m2, [rsp+gprsize+16*13]
   6794    mova                 m3, [rsp+gprsize+16*14]
   6795    pxor                 m4, m4
   6796    REPX       {mova x, m4}, m5, m6, m7
   6797    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
   6798    mova                 m7, [rsp+gprsize+ 0*16]
   6799    mova [rsp+gprsize+11*16], m0
   6800    mova [rsp+gprsize+12*16], m1
   6801    mova [rsp+gprsize+13*16], m2
   6802    mova [rsp+gprsize+14*16], m3
   6803    mova [rsp+gprsize+15*16], m4
   6804    mova [rsp+gprsize+16*16], m5
   6805    mova [rsp+gprsize+17*16], m6
   6806    mova [rsp+gprsize+18*16], m7
   6807 %if ARCH_X86_64
   6808    call                  r8              ; idct_8x32 main (eob-selected)
   6809 %else
   6810    call      [r2+4*gprsize]
   6811 %endif
   6812    mova [rsp+gprsize+ 3*16], m0
   6813    mova [rsp+gprsize+ 5*16], m2
   6814    mova [rsp+gprsize+ 8*16], m5
   6815    mova [rsp+gprsize+10*16], m7
   6816 %if ARCH_X86_64
   6817    call                 r9              ; idct_16x64 main (eob-selected)
   6818    mova                 m8, [o(pw_2048)]
   6819    pxor                 m9, m9
   6820    mova                m10, [o(pixel_10bpc_max)]
   6821 %else
   6822    call     [r2+5*gprsize]
   6823 %endif
   6824    lea                  r3, [strideq*3]
   6825    lea                  r4, [rsp+gprsize+ 3*16]
   6826 %if ARCH_X86_64
   6827    mov                 r6d, 8           ; 8 tiles of 8 rows = 64 rows
   6828 %else
   6829    mov dword [r2+2*gprsize], 8
   6830 %endif
   6831 .loop_write:
   6832    mova                 m0, [r4+0*16]
   6833    mova                 m1, [r4+1*16]
   6834    mova                 m2, [r4+2*16]
   6835    mova                 m3, [r4+3*16]
   6836    mova                 m4, [r4+4*16]
   6837    mova                 m5, [r4+5*16]
   6838    mova                 m6, [r4+6*16]
   6839    mova                 m7, [r4+7*16]
   6840    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
   6841    lea                dstq, [dstq+strideq*8]
   6842    add                  r4, 8*16
   6843 %if ARCH_X86_64
   6844    dec                 r6d
   6845 %else
   6846    dec dword [r2+2*gprsize]
   6847 %endif
   6848    jg .loop_write
   6849    ret
   6850 
   6851 .dconly:
        ; DC-only path: scale DC, adjust rsp to the 16x4 layout, and reuse the
        ; 16-wide broadcast-add for 64 rows
   6852    imul                r5d, [cq], 181
   6853    mov                [cq], eobd ; 0
   6854    mov                 r3d, 64
   6855    add                 r5d, 640
   6856    sar                 r5d, 10
   6857    add                 rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
   6858    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
   6859 
        ;-----------------------------------------------------------------------
        ; void inv_txfm_add_dct_dct_32x64_16bpc(pixel *dst, ptrdiff_t stride,
        ;                                       coef *c, int eob)
        ; 32x64 inverse DCT-DCT add, 16 bpc. Pass 1 runs a 32-point idct over
        ; groups of 4 coefficient columns (r5 counts iterations, stepping by 2;
        ; each step covers 8 bytes = 4 dwords per row of cq). Pass 2 is shared:
        ; this function tail-jumps into m(inv_txfm_add_dct_dct_16x64_16bpc)
        ; .loop_pass2 with 8bpc ssse3 row kernels selected by eob.
        ; Stack frame: (32+4*64)*16 bytes of scratch, then 4 (+4 on x86-32)
        ; gpr save slots at [rsp+(64*4+32)*16].
        ;-----------------------------------------------------------------------
   6860 cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
   6861                                          0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \
   6862                                          dst, stride, c, eob
   6863    LEA                  r6, base
   6864    test               eobd, eobd
   6865    jz .dconly                  ; eob==0: only the DC coefficient is set
   6866 
   6867 %if ARCH_X86_32
   6868    DECLARE_REG_TMP 4, 1, 2, 0
        ; Spill the gprs that the t0-t3 aliases will clobber into the save
        ; slots above the scratch area; x86-32 has too few registers to keep
        ; dst/stride/c live across pass 1.
   6869    mov [rsp+gprsize*1+(64*4+32)*16], r0
   6870    mov [rsp+gprsize*2+(64*4+32)*16], r1
   6871    mov [rsp+gprsize*3+(64*4+32)*16], r2
   6872 %else
   6873    DECLARE_REG_TMP 8, 9, 4, 7
   6874    mov [rsp+gprsize*1+(64*4+32)*16], r9
   6875 %if WIN64
        ; r7/r8 are callee-saved in the Microsoft x64 ABI; preserve them
        ; before they are used as temporaries.
   6876    mov [rsp+gprsize*2+(64*4+32)*16], r7
   6877    mov [rsp+gprsize*3+(64*4+32)*16], r8
   6878 %endif
   6879 %endif
   6880 %undef cmp
   6881    ; remove entirely-zero iterations
   6882    mov                 r5d, 7*2   ; start at the last (8th) iteration
   6883    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
   6884    jge .end_zero_loop
   6885    pxor                 m0, m0
        ; For every column group whose coefficients are entirely beyond eob,
        ; just zero-fill its slots in the pass-1 output buffers instead of
        ; computing a full idct on all-zero input.
   6886 .zero_loop:
        ; tbl_Nx64_offset packs two byte-sized scatter offsets per word:
        ; low byte -> t0/t2, high byte -> t1/t3 (see movzx/shr below).
   6887    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
   6888    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
   6889    movzx               t0d, t1b
   6890    movzx               t2d, t3b
   6891    shr                 t1d, 8
   6892    shr                 t3d, 8
        ; The four buffers at 32/96/160/224*16 hold output rows 0-7, 8-15,
        ; 16-23 and 24-31 respectively (matches the pass-1 scatter below).
   6893    mova  [rsp+ 32*16+t0*8], m0
   6894    mova  [rsp+ 32*16+t1*8], m0
   6895    mova  [rsp+ 32*16+t2*8], m0
   6896    mova  [rsp+ 32*16+t3*8], m0
   6897    mova  [rsp+ 96*16+t0*8], m0
   6898    mova  [rsp+ 96*16+t1*8], m0
   6899    mova  [rsp+ 96*16+t2*8], m0
   6900    mova  [rsp+ 96*16+t3*8], m0
   6901    mova  [rsp+160*16+t0*8], m0
   6902    mova  [rsp+160*16+t1*8], m0
   6903    mova  [rsp+160*16+t2*8], m0
   6904    mova  [rsp+160*16+t3*8], m0
   6905    mova  [rsp+224*16+t0*8], m0
   6906    mova  [rsp+224*16+t1*8], m0
   6907    mova  [rsp+224*16+t2*8], m0
   6908    mova  [rsp+224*16+t3*8], m0
   6909    sub                 r5d, 2
   6910    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
   6911    jl .zero_loop
   6912 .end_zero_loop:
   6913    ; actual first pass after skipping all-zero data
   6914    mov [rsp+gprsize*0+(64*4+32)*16], eobd ; stash eob for pass-2 dispatch
   6915    mov                  r3, rsp
   6916 %if ARCH_X86_32
   6917    DECLARE_REG_TMP 4, 1, 6, 0
   6918    mov                  r2, [rsp+gprsize*3+(64*4+32)*16] ; restore cq
   6919    mov [rsp+gprsize*3+(64*4+32)*16], r6 ; keep base ptr; r6 is a temp now
   6920 %endif
   6921 .loop_pass1:
   6922 %if ARCH_X86_64
        ; Shared constants for the idct helpers: rounding bias, 18-bit
        ; intermediate clamp range, and the rect2 scale factor.
   6923    mova                m11, [o(pd_2048)]
   6924    mova                m12, [o(clip_18b_min)]
   6925    mova                m13, [o(clip_18b_max)]
   6926    mova                m14, [o(pd_2896)]
   6927 %endif
        ; Odd rows 1,7,9,15,17,23,25,31 -> first half of the 32-pt odd part.
        ; cq is laid out with a 128-byte (32-dword) row pitch.
   6928    mova                 m0, [cq+128* 1+r5*8]
   6929    mova                 m1, [cq+128* 7+r5*8]
   6930    mova                 m2, [cq+128* 9+r5*8]
   6931    mova                 m3, [cq+128*15+r5*8]
   6932    mova                 m4, [cq+128*17+r5*8]
   6933    mova                 m5, [cq+128*23+r5*8]
   6934    mova                 m6, [cq+128*25+r5*8]
   6935    mova                 m7, [cq+128*31+r5*8]
   6936    mov                  r3, rsp
        ; rect2_mul: pre-scale by 2896 (presumably ~sqrt(2)/2 in Q12 for the
        ; rectangular 32x64 transform — see pd_2896 above; confirm in helper)
   6937    call m(idct_8x4_internal_16bpc).rect2_mul
   6938    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
   6939 
        ; Odd rows 3,5,11,13,19,21,27,29 -> second half of the 32-pt odd part
   6940    mova                 m0, [cq+128* 3+r5*8]
   6941    mova                 m1, [cq+128* 5+r5*8]
   6942    mova                 m2, [cq+128*11+r5*8]
   6943    mova                 m3, [cq+128*13+r5*8]
   6944    mova                 m4, [cq+128*19+r5*8]
   6945    mova                 m5, [cq+128*21+r5*8]
   6946    mova                 m6, [cq+128*27+r5*8]
   6947    mova                 m7, [cq+128*29+r5*8]
   6948 %if ARCH_X86_32
   6949    add                  r3, 16*8   ; x86-32 rect2_mul uses [r3] scratch; avoid part1's area
   6950 %endif
   6951    call m(idct_8x4_internal_16bpc).rect2_mul
   6952 %if ARCH_X86_32
   6953    sub                  r3, 16*8
   6954 %endif
   6955    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
   6956    add                  r3, 16*(16+4*ARCH_X86_32) ; advance past odd-half output
   6957 
        ; Even rows 2,6,...,30 -> odd half of the embedded 16-pt idct
   6958    mova                 m0, [cq+128* 2+r5*8]
   6959    mova                 m1, [cq+128* 6+r5*8]
   6960    mova                 m2, [cq+128*10+r5*8]
   6961    mova                 m3, [cq+128*14+r5*8]
   6962    mova                 m4, [cq+128*18+r5*8]
   6963    mova                 m5, [cq+128*22+r5*8]
   6964    mova                 m6, [cq+128*26+r5*8]
   6965    mova                 m7, [cq+128*30+r5*8]
   6966    call m(idct_8x4_internal_16bpc).rect2_mul
   6967    call m(idct_16x4_internal_16bpc).main_oddhalf
   6968 
        ; Rows 0,4,...,28 -> the embedded 8-pt idct core
   6969    mova                 m0, [cq+128* 0+r5*8]
   6970    mova                 m1, [cq+128* 4+r5*8]
   6971    mova                 m2, [cq+128* 8+r5*8]
   6972    mova                 m3, [cq+128*12+r5*8]
   6973    mova                 m4, [cq+128*16+r5*8]
   6974    mova                 m5, [cq+128*20+r5*8]
   6975    mova                 m6, [cq+128*24+r5*8]
   6976    mova                 m7, [cq+128*28+r5*8]
   6977    call m(idct_8x4_internal_16bpc).rect2_mul
   6978    call m(idct_8x4_internal_16bpc).main_pass1
   6979    call m(idct_8x4_internal_16bpc).round
   6980    sub                  r3, 16*(16+4*ARCH_X86_32)
        ; Butterfly-combine the 8/16/32-pt partial results and round
   6981    call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32
   6982 
        ; Recompute the two packed scatter offsets for this column group
        ; (same low-byte/high-byte encoding as in .zero_loop)
   6983    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
   6984    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
   6985    movzx               t0d, t1b
   6986    movzx               t2d, t3b
   6987    shr                 t1d, 8
   6988    shr                 t3d, 8
   6989 %if ARCH_X86_64
        ; 64-bit: results are still in registers; transpose 4x8 word-packed
        ; tiles and scatter rows 16-23, 8-15, 24-31, then 0-7 below.
   6990    call m(idct_8x4_internal_16bpc).transpose4x8packed
   6991    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   6992    mova  [rsp+160*16+t0*8], m8
   6993    mova  [rsp+160*16+t1*8], m9
   6994    mova  [rsp+160*16+t2*8], m10
   6995    mova  [rsp+160*16+t3*8], m11
   6996    mova                 m8, [r3+16* 9] ;  8  9
   6997    mova                m10, [r3+16*11] ; 10 11
   6998    mova                m12, [r3+16*13] ; 12 13
   6999    mova                m14, [r3+16*15] ; 14 15
   7000    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   7001    mova  [rsp+ 96*16+t0*8], m8
   7002    mova  [rsp+ 96*16+t1*8], m9
   7003    mova  [rsp+ 96*16+t2*8], m10
   7004    mova  [rsp+ 96*16+t3*8], m11
   7005    mova                 m8, [r3+16* 8] ; 24 25
   7006    mova                m10, [r3+16*10] ; 26 27
   7007    mova                m12, [r3+16*12] ; 28 29
   7008    mova                m14, [r3+16*14] ; 30 31
   7009    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
   7010    mova  [rsp+224*16+t0*8], m8
   7011    mova  [rsp+224*16+t1*8], m9
   7012    mova  [rsp+224*16+t2*8], m10
   7013    mova  [rsp+224*16+t3*8], m11
   7014 %else
        ; 32-bit: reload dword results from [r3] scratch, narrow to words
        ; with packssdw, then transpose and scatter each group of 8 rows.
   7015    sub                  r3, 8*16
   7016    mova                 m0, [r3+ 8*16]
   7017    mova                 m2, [r3+10*16]
   7018    mova                 m4, [r3+12*16]
   7019    mova                 m6, [r3+14*16]
   7020    packssdw             m0, [r3+ 9*16]
   7021    packssdw             m2, [r3+11*16]
   7022    packssdw             m4, [r3+13*16]
   7023    packssdw             m6, [r3+15*16]
   7024    call m(idct_8x4_internal_16bpc).transpose4x8packed
   7025    mova  [rsp+ 96*16+t0*8], m0
   7026    mova  [rsp+ 96*16+t1*8], m1
   7027    mova  [rsp+ 96*16+t2*8], m2
   7028    mova  [rsp+ 96*16+t3*8], m3
   7029    mova                 m0, [r3+16*16]
   7030    mova                 m2, [r3+18*16]
   7031    mova                 m4, [r3+20*16]
   7032    mova                 m6, [r3+22*16]
   7033    packssdw             m0, [r3+17*16]
   7034    packssdw             m2, [r3+19*16]
   7035    packssdw             m4, [r3+21*16]
   7036    packssdw             m6, [r3+23*16]
   7037    call m(idct_8x4_internal_16bpc).transpose4x8packed
   7038    mova  [rsp+160*16+t0*8], m0
   7039    mova  [rsp+160*16+t1*8], m1
   7040    mova  [rsp+160*16+t2*8], m2
   7041    mova  [rsp+160*16+t3*8], m3
        ; Rows 24-31 are stored mirrored in scratch, hence the descending
        ; 31..24 load order here.
   7042    mova                 m0, [r3+31*16]
   7043    mova                 m2, [r3+29*16]
   7044    mova                 m4, [r3+27*16]
   7045    mova                 m6, [r3+25*16]
   7046    packssdw             m0, [r3+30*16]
   7047    packssdw             m2, [r3+28*16]
   7048    packssdw             m4, [r3+26*16]
   7049    packssdw             m6, [r3+24*16]
   7050    call m(idct_8x4_internal_16bpc).transpose4x8packed
   7051    mova  [rsp+224*16+t0*8], m0
   7052    mova  [rsp+224*16+t1*8], m1
   7053    mova  [rsp+224*16+t2*8], m2
   7054    mova  [rsp+224*16+t3*8], m3
   7055    mova                 m0, [r3+ 0*16]
   7056    mova                 m2, [r3+ 2*16]
   7057    mova                 m4, [r3+ 4*16]
   7058    mova                 m6, [r3+ 6*16]
   7059    packssdw             m0, [r3+ 1*16]
   7060    packssdw             m2, [r3+ 3*16]
   7061    packssdw             m4, [r3+ 5*16]
   7062    packssdw             m6, [r3+ 7*16]
   7063    call m(idct_8x4_internal_16bpc).transpose4x8packed
   7064 %endif
        ; Rows 0-7 (both arches end with them in m0-m3)
   7065    mova  [rsp+ 32*16+t0*8], m0
   7066    mova  [rsp+ 32*16+t1*8], m1
   7067    mova  [rsp+ 32*16+t2*8], m2
   7068    mova  [rsp+ 32*16+t3*8], m3
        ; Clear the consumed coefficient columns (the API expects cq zeroed)
   7069    pxor                 m0, m0
   7070    REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \
   7071                                     8, 9, 10, 11, 12, 13, 14, 15, \
   7072                                     16, 17, 18, 19, 20, 21, 22, 23, \
   7073                                     24, 25, 26, 27, 28, 29, 30, 31
   7074 %if ARCH_X86_32
   7075    mov                  r6, [rsp+gprsize*3+(64*4+32)*16] ; re-load base ptr
   7076 %endif
   7077    sub                 r5d, 2
   7078    jge .loop_pass1
   7079 
   7080    ; pass=2
        ; Select 8bpc ssse3 row kernels: below the eob=136 threshold the
        ; cheaper (very)fast variants suffice.
   7081    mov                eobd, [rsp+gprsize*0+(64*4+32)*16]
   7082    cmp                eobd, 136
   7083    jl .fast
   7084    ; fall-through
   7085 %if ARCH_X86_64
   7086    DECLARE_REG_TMP 8, 9
   7087 %else
   7088    DECLARE_REG_TMP 1, 5
   7089 %endif
   7090    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
   7091    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
   7092    jmp .run
   7093 .fast:
   7094    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
   7095    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
   7096 .run:
        ; Release part of the pass-1 scratch so rsp matches what the shared
        ; 16x64 pass-2 loop expects — TODO confirm against that loop's frame
   7097    add                 rsp, 29*16
   7098 
   7099 %if ARCH_X86_64
   7100    lea                  r2, [dstq+64]  ; second 32-pixel half (16bpc: 32*2 bytes)
   7101    mov                  r7, -8
   7102 %else
        ; x86-32: pass kernel pointers and loop count through a stack block
   7103    lea                  r2, [rsp+(64*4+3)*16]
   7104    mov      [r2+4*gprsize], t0
   7105    mov      [r2+5*gprsize], t1
   7106    mov                  r1, [r2+2*gprsize]
   7107    mov dword [r2+0*gprsize], 4
   7108 %endif
   7109    jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
   7110 
   7111 .dconly:
        ; DC-only shortcut: two rounded x181 multiplies (181 ~= 128*sqrt(2))
        ; fold the rect2 and per-pass scalings into a single DC value, then
        ; the shared 32-wide dconly2 tail adds it to all 64 rows (r3d = 64).
   7112    imul                r5d, [cq], 181
   7113    mov                [cq], eobd ; 0
   7114    mov                 r3d, 64
   7115    add                 r5d, 128
   7116    sar                 r5d, 8
   7117    imul                r5d, 181
   7118    add                 r5d, 384
   7119    sar                 r5d, 9
        ; Rebase rsp so this frame matches the 32x8 dconly2 tail's layout
   7120    add                 rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
   7121    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
   7122 
   7123 cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
   7124                                         dst, stride, c, eob
   7125    LEA                  r6, base
   7126    test               eobd, eobd
   7127    jz .dconly
   7128 
   7129    ; remove entirely-zero iterations
   7130 %undef cmp
   7131    mov                 r5d, 8
   7132 .zero_loop:
   7133    sub                 r5d, 2
   7134    cmp                eobw, word [o2(tbl_32x16_2d)+r5]
   7135    jl .zero_loop
   7136 
   7137    ; actual first pass after skipping all-zero data
   7138 .loop_pass1:
   7139 %if ARCH_X86_64
   7140    mova                m11, [o(pd_2048)]
   7141    mova                m12, [o(clip_18b_min)]
   7142    mova                m13, [o(clip_18b_max)]
   7143    mova                m14, [o(pd_2896)]
   7144 %endif
   7145 
   7146    mov                  r3, rsp
   7147    lea                  r4, [o(idct64_mul_16bpc)]
   7148    mova                 m0, [cq+64* 1+r5*8]
   7149    mova                 m1, [cq+64*31+r5*8]
   7150    mova                 m2, [cq+64*17+r5*8]
   7151    mova                 m3, [cq+64*15+r5*8]
   7152    call .main_part1
   7153    mova                 m0, [cq+64* 7+r5*8]
   7154    mova                 m1, [cq+64*25+r5*8]
   7155    mova                 m2, [cq+64*23+r5*8]
   7156    mova                 m3, [cq+64* 9+r5*8]
   7157    call .main_part1
   7158    mova                 m0, [cq+64* 5+r5*8]
   7159    mova                 m1, [cq+64*27+r5*8]
   7160    mova                 m2, [cq+64*21+r5*8]
   7161    mova                 m3, [cq+64*11+r5*8]
   7162    call .main_part1
   7163    mova                 m0, [cq+64* 3+r5*8]
   7164    mova                 m1, [cq+64*29+r5*8]
   7165    mova                 m2, [cq+64*19+r5*8]
   7166    mova                 m3, [cq+64*13+r5*8]
   7167    call .main_part1
   7168    call .main_part2
   7169 
   7170    mova                 m0, [cq+64* 2+r5*8]
   7171    mova                 m1, [cq+64*14+r5*8]
   7172    mova                 m2, [cq+64*18+r5*8]
   7173    mova                 m3, [cq+64*30+r5*8]
   7174    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
   7175 
   7176    mova                 m0, [cq+64* 6+r5*8]
   7177    mova                 m1, [cq+64*10+r5*8]
   7178    mova                 m2, [cq+64*22+r5*8]
   7179    mova                 m3, [cq+64*26+r5*8]
   7180    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
   7181    add                  r3, 16*(24+4*ARCH_X86_32)
   7182 
   7183    mova                 m0, [cq+64* 4+r5*8]
   7184    mova                 m1, [cq+64*12+r5*8]
   7185    mova                 m2, [cq+64*20+r5*8]
   7186    mova                 m3, [cq+64*28+r5*8]
   7187    call m(idct_16x4_internal_16bpc).main_oddhalf_fast
   7188 
   7189    mova                 m0, [cq+64* 0+r5*8]
   7190    mova                 m1, [cq+64* 8+r5*8]
   7191    mova                 m2, [cq+64*16+r5*8]
   7192    mova                 m3, [cq+64*24+r5*8]
   7193    call m(idct_8x4_internal_16bpc).main_pass1_fast
   7194    call m(idct_8x4_internal_16bpc).round
   7195    mova [r3-(7+4*ARCH_X86_32)*16], m1
   7196    mova [r3-(6+4*ARCH_X86_32)*16], m2
   7197    mova [r3-(5+4*ARCH_X86_32)*16], m3
   7198    mova [r3-(4+4*ARCH_X86_32)*16], m4
   7199    mova [r3-(3+4*ARCH_X86_32)*16], m5
   7200    mova [r3-(2+4*ARCH_X86_32)*16], m6
   7201    mova [r3-(1+4*ARCH_X86_32)*16], m7
   7202    sub                  r3, 16*(40+4*ARCH_X86_32-4)
   7203 
   7204 %if ARCH_X86_64
   7205    psrld               m15, m11, 10 ; pd_2
   7206 %else
   7207    mova                 m7, [o(pd_2)]
   7208 %endif
   7209    call .main_end_loop_start
   7210 
   7211    lea                  r3, [rsp+56*16]
   7212    lea                  r4, [cq+r5*8+64*28]
   7213    call .shift_transpose
   7214    sub                 r5d, 2
   7215    jge .loop_pass1
   7216 
   7217    ; pass=2, we need to call this otherwise the stack pointer has
   7218    ; the wrong offset in the 8-bit code
   7219    call .pass2
   7220    RET
   7221 
   7222 .pass2:
   7223 %if ARCH_X86_64
   7224    mova                 m8, [o(pw_2048)]
   7225    pxor                 m9, m9
   7226    mova                m10, [o(pixel_10bpc_max)]
   7227 %if WIN64
   7228    mov [rsp+16*16+gprsize], r7
   7229 %endif
   7230    mov                  r7, dstq
   7231 %else
   7232    mov [rsp+2*gprsize+16*16], dstq
   7233 %endif
   7234    lea                  r3, [strideq*3]
   7235    mov                 r4d, 8
   7236    jmp m(idct_16x16_internal_16bpc).loop_pass2
   7237 
   7238 .main_part1: ; idct64 steps 1-5
   7239    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
   7240    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
   7241    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
   7242    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
   7243 %if ARCH_X86_64
   7244    movd                 m7, [r4+4*0]
   7245    movd                 m8, [r4+4*1]
   7246    movd                 m6, [r4+4*2]
   7247    movd                 m9, [r4+4*3]
   7248    movd                 m5, [r4+4*4]
   7249    movd                m10, [r4+4*5]
   7250    movd                 m4, [r4+4*6]
   7251    movd                m15, [r4+4*7]
   7252    REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15
   7253    pmulld               m7, m0     ; t63a
   7254    pmulld               m0, m8     ; t32a
   7255    pmulld               m6, m1     ; t62a
   7256    pmulld               m1, m9     ; t33a
   7257    pmulld               m5, m2     ; t61a
   7258    pmulld               m2, m10    ; t34a
   7259    pmulld               m4, m3     ; t60a
   7260    pmulld               m3, m15    ; t35a
   7261    movd                m10, [r4+4*8]
   7262    movd                m15, [r4+4*9]
   7263    REPX {pshufd x, x, q0000}, m10, m15
   7264    REPX     {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
   7265    REPX     {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
   7266    psubd                m8, m0, m1 ; t33
   7267    paddd                m0, m1     ; t32
   7268    psubd                m1, m7, m6 ; t62
   7269    paddd                m7, m6     ; t63
   7270    psubd                m6, m3, m2 ; t34
   7271    paddd                m3, m2     ; t35
   7272    psubd                m2, m4, m5 ; t61
   7273    paddd                m4, m5     ; t60
   7274    REPX    {pmaxsd x, m12}, m8, m1, m6, m2
   7275    REPX    {pminsd x, m13}, m8, m1, m6, m2
   7276    ITX_MULSUB_2D         1, 8, 5, 9, _, 11, 10, 15    ; t33a, t62a
   7277    ITX_MULSUB_2D         2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
   7278    REPX    {pmaxsd x, m12}, m0, m3, m7, m4
   7279    REPX    {pminsd x, m13}, m0, m3, m7, m4
   7280    movd                m10, [r4+4*10]
   7281    movd                m15, [r4+4*11]
   7282    REPX {pshufd x, x, q0000}, m10, m15
   7283    psubd                m5, m0, m3 ; t35a
   7284    paddd                m0, m3     ; t32a
   7285    psubd                m3, m7, m4 ; t60a
   7286    paddd                m7, m4     ; t63a
   7287    psubd                m4, m1, m6 ; t34
   7288    paddd                m1, m6     ; t33
   7289    psubd                m6, m8, m2 ; t61
   7290    paddd                m8, m2     ; t62
   7291    REPX    {pmaxsd x, m12}, m5, m3, m4, m6
   7292    REPX    {pminsd x, m13}, m5, m3, m4, m6
   7293    ITX_MULSUB_2D         3, 5, 2, 9, _, 11, 10, 15 ; t35,  t60
   7294    ITX_MULSUB_2D         6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
   7295    REPX    {pmaxsd x, m12}, m0, m7, m1, m8
   7296    REPX    {pminsd x, m13}, m0, m7, m1, m8
   7297    add                  r4, 4*12
   7298    mova          [r3+16*0], m0
   7299    mova          [r3+16*7], m7
   7300    mova          [r3+16*1], m1
   7301    mova          [r3+16*6], m8
   7302    mova          [r3+16*2], m6
   7303    mova          [r3+16*5], m4
   7304    mova          [r3+16*3], m3
   7305    mova          [r3+16*4], m5
   7306 %else
   7307    movd                 m7, [r4+4*0]
   7308    movd                 m6, [r4+4*2]
   7309    movd                 m5, [r4+4*4]
   7310    movd                 m4, [r4+4*6]
   7311    REPX {pshufd x, x, q0000}, m7, m6, m5, m4
   7312    pmulld               m7, m0     ; t63a
   7313    pmulld               m6, m1     ; t62a
   7314    pmulld               m5, m2     ; t61a
   7315    pmulld               m4, m3     ; t60a
   7316    mova          [r3+0*16], m6
   7317    mova          [r3+1*16], m7
   7318    movd                 m6, [r4+4*1]
   7319    movd                 m7, [r4+4*3]
   7320    REPX {pshufd x, x, q0000}, m7, m6
   7321    pmulld               m0, m6     ; t32a
   7322    pmulld               m1, m7     ; t33a
   7323    movd                 m6, [r4+4*5]
   7324    movd                 m7, [r4+4*7]
   7325    REPX {pshufd x, x, q0000}, m7, m6
   7326    pmulld               m2, m6     ; t34a
   7327    pmulld               m3, m7     ; t35a
   7328    mova                 m6, [r3+0*16]
   7329    mova                 m7, [o(pd_2048)]
   7330    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
   7331    paddd                m7, [r3+1*16]
   7332    REPX      {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4
   7333    mova           [r3+0*16], m5
   7334    psubd                m5, m0, m1 ; t33
   7335    paddd                m0, m1     ; t32
   7336    mova           [r3+1*16], m0
   7337    mova                 m0, [r3+0*16]
   7338    psubd                m1, m7, m6 ; t62
   7339    paddd                m7, m6     ; t63
   7340    psubd                m6, m3, m2 ; t34
   7341    paddd                m3, m2     ; t35
   7342    psubd                m2, m4, m0 ; t61
   7343    paddd                m4, m0     ; t60
   7344    mova                 m0, [o(clip_18b_min)]
   7345    REPX     {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4
   7346    pmaxsd               m0, [r3+1*16]
   7347    mova          [r3+0*16], m0
   7348    mova                 m0, [o(clip_18b_max)]
   7349    REPX     {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4
   7350    pminsd               m0, [r3+0*16]
   7351    mova          [r3+0*16], m0
   7352    mova          [r3+1*16], m3
   7353    mova          [r3+2*16], m4
   7354    mova          [r3+3*16], m7
   7355    mova                 m0, [o(pd_2048)]
   7356    movd                 m3, [r4+4*8]
   7357    movd                 m4, [r4+4*9]
   7358    REPX {pshufd x, x, q0000}, m3, m4
   7359    mova          [r3+4*16], m2
   7360    ITX_MULSUB_2D         1, 5, 2, 7, _, 0, 3, 4    ; t33a, t62a
   7361    mova                 m2, [r3+4*16]
   7362    mova          [r3+4*16], m5
   7363    ITX_MULSUB_2D         2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a
   7364    mova                 m0, [r3+0*16]
   7365    mova                 m3, [r3+1*16]
   7366    mova                 m4, [r3+2*16]
   7367    mova                 m7, [r3+3*16]
   7368    psubd                m5, m0, m3 ; t35a
   7369    paddd                m0, m3     ; t32a
   7370    mova          [r3+0*16], m5
   7371    mova                 m5, [r3+4*16]
   7372    psubd                m3, m7, m4 ; t60a
   7373    paddd                m7, m4     ; t63a
   7374    psubd                m4, m1, m6 ; t34
   7375    paddd                m1, m6     ; t33
   7376    psubd                m6, m5, m2 ; t61
   7377    paddd                m2, m5     ; t62
   7378    mova                 m5, [o(clip_18b_min)]
   7379    REPX     {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2
   7380    pmaxsd               m5, [r3+0*16]
   7381    mova          [r3+0*16], m5
   7382    mova                 m5, [o(clip_18b_max)]
   7383    REPX     {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2
   7384    pminsd               m5, [r3+0*16]
   7385    mova          [r3+16*0], m0
   7386    mova          [r3+16*7], m7
   7387    mova          [r3+16*1], m1
   7388    mova          [r3+16*6], m2
   7389    mova          [r3+16*2], m4
   7390    mova                 m7, [o(pd_2048)]
   7391    movd                 m0, [r4+4*10]
   7392    movd                 m1, [r4+4*11]
   7393    REPX {pshufd x, x, q0000}, m0, m1
   7394    ITX_MULSUB_2D         3, 5, 2, 4, _, 7, 0, 1 ; t35,  t60
   7395    mova          [r3+16*3], m3
   7396    mova          [r3+16*4], m5
   7397    mova                 m4, [r3+2*16]
   7398    ITX_MULSUB_2D         6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a
   7399    add                  r4, 4*12
   7400    mova          [r3+16*2], m6
   7401    mova          [r3+16*5], m4
   7402 %endif
   7403    add                  r3, 16*8
   7404    ret
   7405 
   7406 .main_part2: ; idct64 steps 6-9
   7407    lea                  r4, [r3+16*7]
   7408 %if ARCH_X86_64
   7409    mova                m10, [o(pd_1567)]
   7410    mova                m15, [o(pd_3784)]
   7411 .main_part2_loop:
   7412    mova                 m0, [r3-16*32] ; t32a
   7413    mova                 m1, [r4-16*24] ; t39a
   7414    mova                 m2, [r4-16*32] ; t63a
   7415    mova                 m3, [r3-16*24] ; t56a
   7416    mova                 m4, [r3-16*16] ; t40a
   7417    mova                 m5, [r4-16* 8] ; t47a
   7418    mova                 m6, [r4-16*16] ; t55a
   7419    mova                 m7, [r3-16* 8] ; t48a
   7420    psubd                m8, m0, m1 ; t39
   7421    paddd                m0, m1     ; t32
   7422    psubd                m1, m2, m3 ; t56
   7423    paddd                m2, m3     ; t63
   7424    psubd                m3, m5, m4 ; t40
   7425    paddd                m5, m4     ; t47
   7426    psubd                m4, m7, m6 ; t55
   7427    paddd                m7, m6     ; t48
   7428    REPX    {pmaxsd x, m12}, m8, m1, m3, m4
   7429    REPX    {pminsd x, m13}, m8, m1, m3, m4
   7430    ITX_MULSUB_2D         1, 8, 6, 9, _, 11, 10, 15    ; t39a, t56a
   7431    ITX_MULSUB_2D         4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
   7432    REPX    {pmaxsd x, m12}, m0, m2, m5, m7
   7433    REPX    {pminsd x, m13}, m0, m5, m2, m7
   7434    psubd                m6, m2, m7 ; t48a
   7435    paddd                m2, m7     ; t63a
   7436    psubd                m7, m0, m5 ; t47a
   7437    paddd                m0, m5     ; t32a
   7438    psubd                m5, m8, m4 ; t55
   7439    paddd                m8, m4     ; t56
   7440    psubd                m4, m1, m3 ; t40
   7441    paddd                m1, m3     ; t39
   7442    REPX    {pmaxsd x, m12}, m6, m7, m5, m4
   7443    REPX    {pminsd x, m13}, m6, m7, m5, m4
   7444    REPX    {pmulld x, m14}, m6, m7, m5, m4
   7445    REPX    {pmaxsd x, m12}, m2, m0, m8, m1
   7446    REPX    {pminsd x, m13}, m2, m0, m8, m1
   7447    paddd                m6, m11
   7448    paddd                m5, m11
   7449    psubd                m3, m6, m7 ; t47
   7450    paddd                m6, m7     ; t48
   7451    psubd                m7, m5, m4 ; t40a
   7452    paddd                m5, m4     ; t55a
   7453    REPX      {psrad x, 12}, m3, m6, m7, m5
   7454    mova         [r4-16* 8], m2
   7455    mova         [r3-16*32], m0
   7456    mova         [r3-16* 8], m8
   7457    mova         [r4-16*32], m1
   7458    mova         [r4-16*24], m3
   7459    mova         [r3-16*16], m6
   7460    mova         [r3-16*24], m7
   7461    mova         [r4-16*16], m5
   7462 %else
   7463 .main_part2_loop:
   7464    mova                 m0, [r3-16*32] ; t32a
   7465    mova                 m1, [r4-16*24] ; t39a
   7466    mova                 m2, [r4-16*32] ; t63a
   7467    mova                 m3, [r3-16*24] ; t56a
   7468    mova                 m4, [r3-16*16] ; t40a
   7469    mova                 m5, [r4-16* 8] ; t47a
   7470    mova                 m6, [r4-16*16] ; t55a
   7471    psubd                m7, m0, m1 ; t39
   7472    paddd                m0, m1     ; t32
   7473    mova          [r3+0*16], m7
   7474    mova                 m7, [r3-16* 8] ; t48a
   7475    psubd                m1, m2, m3 ; t56
   7476    paddd                m2, m3     ; t63
   7477    psubd                m3, m5, m4 ; t40
   7478    paddd                m5, m4     ; t47
   7479    psubd                m4, m7, m6 ; t55
   7480    paddd                m7, m6     ; t48
   7481    mova                 m6, [o(clip_18b_min)]
   7482    REPX     {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7
   7483    pmaxsd               m6, [r3+0*16]
   7484    mova          [r3+0*16], m6
   7485    mova                 m6, [o(clip_18b_max)]
   7486    REPX     {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7
   7487    pminsd               m6, [r3+0*16]
   7488    mova          [r3+0*16], m0
   7489    mova          [r3+1*16], m2
   7490    mova          [r3+2*16], m5
   7491    mova          [r3+3*16], m7
   7492    mova                 m0, [o(pd_2048)]
   7493    ITX_MULSUB_2D         1, 6, 2, 5, 7, 0, 1567, 3784    ; t39a, t56a
   7494    ITX_MULSUB_2D         4, 3, 2, 5, _, 0,    7, 3784, 4 ; t55a, t40a
   7495    mova                 m2, [r3+1*16]
   7496    mova                 m7, [r3+3*16]
   7497    psubd                m5, m2, m7 ; t48a
   7498    paddd                m2, m7     ; t63a
   7499    mova          [r3+1*16], m5
   7500    mova                 m0, [r3+0*16]
   7501    mova                 m5, [r3+2*16]
   7502    psubd                m7, m0, m5 ; t47a
   7503    paddd                m0, m5     ; t32a
   7504    psubd                m5, m6, m4 ; t55
   7505    paddd                m6, m4     ; t56
   7506    psubd                m4, m1, m3 ; t40
   7507    paddd                m1, m3     ; t39
   7508    mova                 m3, [o(clip_18b_min)]
   7509    REPX     {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1
   7510    pmaxsd               m3, [r3+1*16]
   7511    mova          [r3+0*16], m3
   7512    mova                 m3, [o(clip_18b_max)]
   7513    REPX     {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1
   7514    pminsd               m3, [r3+0*16]
   7515    mova         [r4-16* 8], m2
   7516    mova         [r3-16*32], m0
   7517    mova         [r3-16* 8], m6
   7518    mova         [r4-16*32], m1
   7519    mova                 m0, [o(pd_2896)]
   7520    mova                 m1, [o(pd_2048)]
   7521    REPX     {pmulld x, m0}, m3, m7, m5, m4
   7522    REPX     {paddd  x, m1}, m3, m5
   7523    psubd                m6, m3, m7 ; t47
   7524    paddd                m3, m7     ; t48
   7525    psubd                m7, m5, m4 ; t40a
   7526    paddd                m5, m4     ; t55a
   7527    REPX      {psrad x, 12}, m6, m3, m7, m5
   7528    mova         [r4-16*24], m6
   7529    mova         [r3-16*16], m3
   7530    mova         [r3-16*24], m7
   7531    mova         [r4-16*16], m5
   7532 %endif
   7533    add                  r3, 16
   7534    sub                  r4, 16
   7535    cmp                  r3, r4
   7536    jl .main_part2_loop
   7537    sub                  r3, 4*16
   7538    ret
   7539 
;---------------------------------------------------------------------
; Final recombination stage of the 64-point inverse DCT.  Folds the
; partial idct8/idct16/idct32 results kept on the stack into the 64
; idct64 outputs, which are stored back "unshifted" (the caller does
; the final right-shift); the rounding bias for that shift is added
; here (m15 on x86-64, m7 on x86-32 — callers preload pd_1/pd_2 etc.).
; In:  r3 = ascending stack cursor (rows  0+n of each sub-transform)
;      r4 = descending stack cursor (rows N-n of each sub-transform)
;      x86-64: m12/m13 = clip_18b_min/max (intermediate clamping)
;      x86-32: clip constants are reloaded from memory each iteration,
;              since only 8 xmm registers are available.
; Callers with m0 already loaded enter at .main_end_loop_start.
; Loops while r3 < r4, advancing both cursors 16 bytes per iteration.
;---------------------------------------------------------------------
   7540 .main_end_loop:
   7541    mova                 m0, [r3+16*28] ; idct8  0  + n
   7542 .main_end_loop_start:
   7543    mova                 m2, [r3+16*12] ; idct32 16 + n
   7544    mova                 m3, [r4+16*12] ; idct32 31 - n
   7545 %if ARCH_X86_64
   7546    mova                 m1, [r4+16*28] ; idct16 15 - n
   7547    mova                 m4, [r4-16* 4] ; idct64 63 - n
   7548    mova                 m5, [r3-16* 4] ; idct64 48 + n
   7549    mova                 m6, [r4-16*20] ; idct64 47 - n
   7550    mova                 m7, [r3-16*20] ; idct64 32 + n
   7551    pmaxsd               m0, m12
   7552    pminsd               m0, m13
   7553    paddd                m8, m0, m1     ; idct16 out0  + n
   7554    psubd                m0, m1         ; idct16 out15 - n
   7555    REPX    {pmaxsd x, m12}, m8, m0
   7556    REPX    {pminsd x, m13}, m8, m0
   7557    paddd                m1, m8, m3     ; idct32 out0  + n
   7558    psubd                m8, m3         ; idct32 out31 - n
   7559    paddd                m3, m0, m2     ; idct32 out15 - n
   7560    psubd                m0, m2         ; idct32 out16 + n
   7561    REPX    {pmaxsd x, m12}, m1, m8, m3, m0
   7562    REPX    {pminsd x, m13}, m1, m3, m8, m0
   7563    REPX    {paddd  x, m15}, m1, m3, m0, m8 ; add caller's rounding bias
   7564    paddd                m2, m1, m4     ; idct64 out0  + n (unshifted)
   7565    psubd                m1, m4         ; idct64 out63 - n (unshifted)
   7566    paddd                m4, m3, m5     ; idct64 out15 - n (unshifted)
   7567    psubd                m3, m5         ; idct64 out48 + n (unshifted)
   7568    paddd                m5, m0, m6     ; idct64 out16 + n (unshifted)
   7569    psubd                m0, m6         ; idct64 out47 - n (unshifted)
   7570    paddd                m6, m8, m7     ; idct64 out31 - n (unshifted)
   7571    psubd                m8, m7         ; idct64 out32 + n (unshifted)
   7572    mova         [r3-16*20], m2
   7573    mova         [r4+16*28], m1
   7574    mova         [r4-16*20], m4
   7575    mova         [r3+16*28], m3
   7576    mova         [r3-16* 4], m5
   7577    mova         [r4+16*12], m0
   7578    mova         [r4-16* 4], m6
   7579    mova         [r3+16*12], m8
   7580 %else
; x86-32: same math as above, but with only m0-m7 available the clip
; constants live in m5/m6 transiently and loads/stores are interleaved
; with the butterflies to free registers.
   7581    mova                 m5, [o(clip_18b_min)]
   7582    mova                 m6, [o(clip_18b_max)]
   7583    mova                 m1, [r3+16*44] ; idct16 15 - n
   7584    pmaxsd               m0, m5
   7585    pminsd               m0, m6
   7586    paddd                m4, m0, m1     ; idct16 out0  + n
   7587    psubd                m0, m1         ; idct16 out15 - n
   7588    REPX     {pmaxsd x, m5}, m4, m0
   7589    REPX     {pminsd x, m6}, m4, m0
   7590    paddd                m1, m4, m3     ; idct32 out0  + n
   7591    psubd                m4, m3         ; idct32 out31 - n
   7592    paddd                m3, m0, m2     ; idct32 out15 - n
   7593    psubd                m0, m2         ; idct32 out16 + n
   7594    REPX     {pmaxsd x, m5}, m1, m4, m3, m0
   7595    REPX     {pminsd x, m6}, m1, m3, m4, m0
   7596    REPX     {paddd  x, m7}, m1, m3, m0, m4 ; add caller's rounding bias
   7597    mova                 m5, [r4-16* 4] ; idct64 63 - n
   7598    mova                 m6, [r3-16* 4] ; idct64 48 + n
   7599    paddd                m2, m1, m5     ; idct64 out0  + n (unshifted)
   7600    psubd                m1, m5         ; idct64 out63 - n (unshifted)
   7601    paddd                m5, m3, m6     ; idct64 out15 - n (unshifted)
   7602    psubd                m3, m6         ; idct64 out48 + n (unshifted)
   7603    mova         [r4+16*28], m1
   7604    mova         [r3+16*28], m3
   7605    mova                 m6, [r4-16*20] ; idct64 47 - n
   7606    mova                 m1, [r3-16*20] ; idct64 32 + n
   7607    mova         [r3-16*20], m2
   7608    mova         [r4-16*20], m5
   7609    paddd                m5, m0, m6     ; idct64 out16 + n (unshifted)
   7610    psubd                m0, m6         ; idct64 out47 - n (unshifted)
   7611    paddd                m6, m4, m1     ; idct64 out31 - n (unshifted)
   7612    psubd                m4, m1         ; idct64 out32 + n (unshifted)
   7613    mova         [r3-16* 4], m5
   7614    mova         [r4+16*12], m0
   7615    mova         [r4-16* 4], m6
   7616    mova         [r3+16*12], m4
   7617 %endif
   7618    sub                  r4, 16
   7619    add                  r3, 16
   7620    cmp                  r3, r4
   7621    jl .main_end_loop
   7622    ret
   7623 
;---------------------------------------------------------------------
; Pass-1 store helper: walks the 32-bit intermediates downwards from
; r3 to the bottom of the stack (rsp), right-shifting each lane by 2
; (the pass-1 down-shift for this transform size), saturating to
; 16 bits via packssdw, transposing 4x8, and writing four packed
; vectors per iteration into the output buffer at r4 (which steps
; backwards by 4*64 bytes, i.e. four 64-byte output rows).
;---------------------------------------------------------------------
   7624 .shift_transpose:
   7625    mova                 m0, [r3+0*16]
   7626    mova                 m1, [r3+1*16]
   7627    mova                 m2, [r3+2*16]
   7628    mova                 m3, [r3+3*16]
   7629    mova                 m4, [r3+4*16]
   7630    mova                 m5, [r3+5*16]
   7631    mova                 m6, [r3+6*16]
   7632    mova                 m7, [r3+7*16]
   7633    REPX       {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
   7634    packssdw             m0, m1
   7635    packssdw             m2, m3
   7636    packssdw             m4, m5
   7637    packssdw             m6, m7
   7638    call m(idct_8x4_internal_16bpc).transpose4x8packed
   7639    mova          [r4+0*64], m0
   7640    mova          [r4+1*64], m1
   7641    mova          [r4+2*64], m2
   7642    mova          [r4+3*64], m3
   7643    sub                  r4, 4*64
   7644    sub                  r3, 8*16
   7645    cmp                  r3, rsp
   7646    jg .shift_transpose
   7647    ret
   7648 
;---------------------------------------------------------------------
; DC-only shortcut: when only coef[0] is nonzero the whole transform
; reduces to adding one constant to every pixel.  Three entry points:
;   .dconly  - scales dc by 181 (~sqrt(2)*128) and sets 16 rows
;   .dconly1 - caller has set r3d (row count); rounds (+640, >>10)
;   .dconly2 - caller has the rounded dc in r5d; scales by 2896 and
;              broadcasts the high word of the product (the +34816
;              bias provides the rounding so word 1 is the final,
;              rounded pixel delta) across all 8 word lanes.
; The loop adds the delta to a 64-pixel (128-byte) row in two 64-byte
; halves, clamping to [0, pixel_10bpc_max].
;---------------------------------------------------------------------
   7649 .dconly:
   7650    imul                r5d, [cq], 181
   7651    mov                [cq], eobd ; 0
   7652    mov                 r3d, 16
   7653 .dconly1:
   7654    add                 r5d, 640
   7655    sar                 r5d, 10
   7656 .dconly2:
   7657    imul                r5d, 2896
   7658    add                 r5d, 34816
   7659    movd                 m0, r5d
   7660    pshuflw              m0, m0, q1111 ; replicate word 1 (high half of the dword)
   7661    punpcklqdq           m0, m0
   7662    mova                 m6, [o(pixel_10bpc_max)]
   7663    pxor                 m5, m5
   7664 .dconly_loop:
   7665    paddw                m1, m0, [dstq+16*0]
   7666    paddw                m2, m0, [dstq+16*1]
   7667    paddw                m3, m0, [dstq+16*2]
   7668    paddw                m4, m0, [dstq+16*3]
   7669    REPX     {pmaxsw x, m5}, m1, m2, m3, m4
   7670    REPX     {pminsw x, m6}, m1, m2, m3, m4
   7671    mova        [dstq+16*0], m1
   7672    mova        [dstq+16*1], m2
   7673    mova        [dstq+16*2], m3
   7674    mova        [dstq+16*3], m4
   7675    add                dstq, 64
; btc toggles bit 16 of r3d (the low bits hold the row counter) and
; sets CF to the bit's previous value: the first time through CF=0 so
; jnc loops once more for the second 64-byte half of the row, the
; second time CF=1 and we fall through to advance to the next row.
   7676    btc                 r3d, 16
   7677    jnc .dconly_loop
   7678    lea                dstq, [dstq+strideq-128]
   7679    dec                 r3d
   7680    jg .dconly_loop
   7681    RET
   7682 
;---------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x32_16bpc: 64x32 2-D inverse DCT + add for
; high-bitdepth (16bpc pixel buffer) content.
; Pass 1 runs a 64-point IDCT per group of 4 coefficient columns
; (r5 counts the groups down from 14 in steps of 2), with "rect2"
; scaling (x = (x*2896 + 2048) >> 12) applied first because the block
; is rectangular.  Results are shifted, packed to 16 bits and stored
; into an on-stack buffer; pass 2 then reuses the 8bpc SSSE3 32-point
; IDCT (normal/fast/veryfast variant selected from eob) through the
; shared 16x32 pass-2 loop.  eob == 0 takes the DC-only shortcut.
; Stack layout offsets are arch-conditional (extra 8*16 spill area on
; x86-32, one extra slot on WIN64) — keep all the *16 expressions in
; sync if anything is resized.
;---------------------------------------------------------------------
   7683 cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
   7684                                         0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \
   7685                                         dst, stride, c, eob
   7686    LEA                  r6, base
   7687    test               eobd, eobd
   7688    jz .dconly
   7689 
   7690 %if ARCH_X86_32
   7691    DECLARE_REG_TMP 0, 4, 1
   7692    mov [rsp+(8*32+64+8)*16+1*gprsize], dstq
   7693    mov [rsp+(8*32+64+8)*16+2*gprsize], strideq
   7694 %else
   7695    DECLARE_REG_TMP 4, 7, 8
   7696 %if WIN64
   7697    mov [rsp+(8*32+64+1)*16+1*gprsize], r7
   7698    mov [rsp+64*16+0*gprsize], r8
   7699 %endif
   7700 %endif
   7701 %undef cmp
   7702    ; remove entirely-zero iterations
   7703    mov                 r5d, 14
   7704    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
   7705    jge .end_zero_loop
   7706    pxor                 m0, m0
; For each column-group with no nonzero coefficients (per the eob
; table) pre-clear its four 8-byte slots in every 32*16-byte row
; chunk of the pass-1 output buffer.  tbl_Nx32_odd_offset packs two
; byte offsets per word (low byte -> t1, high byte -> t0).
   7707 .zero_loop:
   7708    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
   7709    movzx               t1d, t0b
   7710    shr                 t0d, 8
   7711    lea                  t2, [rsp+7*32*16]
   7712 .zero_loop_inner:
   7713    mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
   7714    mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
   7715    mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0
   7716    mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0
   7717    sub                  t2, 32*16
   7718    cmp                  t2, rsp
   7719    jge .zero_loop_inner
   7720    sub                 r5d, 2
   7721    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
   7722    jl .zero_loop
   7723 .end_zero_loop:
   7724    mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd ; stash eob for pass 2
   7725    ; actual first pass after skipping all-zero data
   7726 .loop_pass1:
   7727 %if ARCH_X86_64
   7728    mova                m11, [o(pd_2048)]
   7729    mova                m12, [o(clip_18b_min)]
   7730    mova                m13, [o(clip_18b_max)]
   7731    mova                m14, [o(pd_2896)]
   7732 %endif
   7733 
; idct64 odd quarter: coefficient rows 1/31/17/15, 7/25/23/9,
; 5/27/21/11, 3/29/19/13, each rect2-scaled first.
   7734    mov                  r3, rsp
   7735    lea                  r4, [o(idct64_mul_16bpc)]
   7736    mova                 m0, [cq+128* 1+r5*8]
   7737    mova                 m1, [cq+128*31+r5*8]
   7738    mova                 m2, [cq+128*17+r5*8]
   7739    mova                 m3, [cq+128*15+r5*8]
   7740    call .rect2_mul_fast
   7741    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
   7742    mova                 m0, [cq+128* 7+r5*8]
   7743    mova                 m1, [cq+128*25+r5*8]
   7744    mova                 m2, [cq+128*23+r5*8]
   7745    mova                 m3, [cq+128* 9+r5*8]
   7746    call .rect2_mul_fast
   7747    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
   7748    mova                 m0, [cq+128* 5+r5*8]
   7749    mova                 m1, [cq+128*27+r5*8]
   7750    mova                 m2, [cq+128*21+r5*8]
   7751    mova                 m3, [cq+128*11+r5*8]
   7752    call .rect2_mul_fast
   7753    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
   7754    mova                 m0, [cq+128* 3+r5*8]
   7755    mova                 m1, [cq+128*29+r5*8]
   7756    mova                 m2, [cq+128*19+r5*8]
   7757    mova                 m3, [cq+128*13+r5*8]
   7758    call .rect2_mul_fast
   7759    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
   7760    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
   7761 
; idct32 odd half: rows 2/14/18/30 and 6/10/22/26.
   7762    mova                 m0, [cq+128* 2+r5*8]
   7763    mova                 m1, [cq+128*14+r5*8]
   7764    mova                 m2, [cq+128*18+r5*8]
   7765    mova                 m3, [cq+128*30+r5*8]
   7766    call .rect2_mul_fast
   7767    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
   7768 
   7769    mova                 m0, [cq+128* 6+r5*8]
   7770    mova                 m1, [cq+128*10+r5*8]
   7771    mova                 m2, [cq+128*22+r5*8]
   7772    mova                 m3, [cq+128*26+r5*8]
   7773    call .rect2_mul_fast
   7774    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
   7775    add                  r3, 16*(24+4*ARCH_X86_32)
   7776 
; idct16 odd half (rows 4/12/20/28) and idct8 (rows 0/8/16/24).
   7777    mova                 m0, [cq+128* 4+r5*8]
   7778    mova                 m1, [cq+128*12+r5*8]
   7779    mova                 m2, [cq+128*20+r5*8]
   7780    mova                 m3, [cq+128*28+r5*8]
   7781    call .rect2_mul_fast
   7782    call m(idct_16x4_internal_16bpc).main_oddhalf_fast
   7783 
   7784    mova                 m0, [cq+128* 0+r5*8]
   7785    mova                 m1, [cq+128* 8+r5*8]
   7786    mova                 m2, [cq+128*16+r5*8]
   7787    mova                 m3, [cq+128*24+r5*8]
   7788    call .rect2_mul_fast
   7789    call m(idct_8x4_internal_16bpc).main_pass1_fast
   7790    call m(idct_8x4_internal_16bpc).round
   7791    mova [r3-(7+4*ARCH_X86_32)*16], m1
   7792    mova [r3-(6+4*ARCH_X86_32)*16], m2
   7793    mova [r3-(5+4*ARCH_X86_32)*16], m3
   7794    mova [r3-(4+4*ARCH_X86_32)*16], m4
   7795    mova [r3-(3+4*ARCH_X86_32)*16], m5
   7796    mova [r3-(2+4*ARCH_X86_32)*16], m6
   7797    mova [r3-(1+4*ARCH_X86_32)*16], m7
   7798    sub                  r3, 16*(40+4*ARCH_X86_32-4)
   7799 
; Rounding bias for the pass-1 >>1 shift; then recombine everything
; into the idct64 outputs.
   7800 %if ARCH_X86_64
   7801    psrld               m15, m11, 11 ; pd_1
   7802 %else
   7803    mova                 m7, [o(pd_1)]
   7804 %endif
   7805    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
   7806 
   7807    lea                  r3, [rsp+56*16]
   7808    lea                  t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16]
   7809    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
   7810    movzx               t1d, t0b
   7811    shr                 t0d, 8
   7812    call .shift_transpose
   7813    ; zero cq
   7814    pxor                 m7, m7
   7815    lea                  r4, [cq+30*128+r5*8]
   7816 .zero_cq_loop:
   7817    REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
   7818    sub                  r4, 4*128
   7819    cmp                  r4, cq
   7820    jg .zero_cq_loop
   7821    sub                 r5d, 2
   7822    jge .loop_pass1
   7823 
   7824    ; pass=2 code starts here
   7825    mov                eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16]
   7826 %if ARCH_X86_32
   7827    mov             strideq, [rsp+gprsize*2+(8*32+64+8)*16]
   7828 %elif WIN64
   7829    mov                  r8, [rsp+gprsize*0+64*16]
   7830 %endif
   7831    add                 rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16
; Pick the 8bpc 32-point row-IDCT variant from eob: <36 veryfast,
; <136 fast, otherwise the full version.
   7832    cmp                eobd, 36
   7833    jl .load_veryfast
   7834    cmp                eobd, 136
   7835    jl .load_fast
   7836    ; load normal
   7837    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
   7838    jmp .run
   7839 .load_fast:
   7840    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
   7841    jmp .run
   7842 .load_veryfast:
   7843    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
   7844    ; fall-through
   7845 .run:
   7846 %if ARCH_X86_64
   7847    lea                  r2, [dstq+128]
   7848    mov                  r7, -16
   7849 %else
   7850    lea                  r2, [rsp+(8*32+3)*16]
   7851    mov dword [r2+0*gprsize], 8
   7852 %endif
   7853    jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
   7854 
; rect2 scale for rectangular transforms: x = (x*2896 + 2048) >> 12
; applied to m0-m3 (clobbers m4/m5 on x86-32).
   7855 .rect2_mul_fast:
   7856 %if ARCH_X86_64
   7857    REPX    {pmulld x, m14}, m0, m1, m2, m3
   7858    REPX    {paddd  x, m11}, m0, m1, m2, m3
   7859 %else
   7860    mova                 m4, [o(pd_2896)]
   7861    mova                 m5, [o(pd_2048)]
   7862    REPX    {pmulld x, m4 }, m0, m1, m2, m3
   7863    REPX    {paddd  x, m5 }, m0, m1, m2, m3
   7864 %endif
   7865    REPX    {psrad  x, 12 }, m0, m1, m2, m3
   7866    ret
   7867 
; Pass-1 store: >>1 shift (64x32 variant), pack to 16 bits, transpose,
; then scatter into the pass-2 buffer at t2 using the r5/t0/t1 slot
; offsets computed from tbl_Nx32_odd_offset.
   7868 .shift_transpose:
   7869    mova                 m0, [r3+0*16]
   7870    mova                 m1, [r3+1*16]
   7871    mova                 m2, [r3+2*16]
   7872    mova                 m3, [r3+3*16]
   7873    mova                 m4, [r3+4*16]
   7874    mova                 m5, [r3+5*16]
   7875    mova                 m6, [r3+6*16]
   7876    mova                 m7, [r3+7*16]
   7877    REPX       {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
   7878    packssdw             m0, m1
   7879    packssdw             m2, m3
   7880    packssdw             m4, m5
   7881    packssdw             m6, m7
   7882    call m(idct_8x4_internal_16bpc).transpose4x8packed
   7883    mova     [t2+0*16+r5*8], m0
   7884    mova     [t2+8*16+r5*8], m2
   7885    mova     [t2+0*16+t0*8], m3
   7886    mova     [t2+0*16+t1*8], m1
   7887    sub                  t2, 16*32
   7888    sub                  r3, 8*16
   7889    cmp                  r3, rsp
   7890    jg .shift_transpose
   7891    ret
   7892 
; DC-only: dc*181>>8 twice (rect2 scale plus the 32-row pass scale),
; then releases the stack frame and joins the shared 64-wide tail.
   7893 .dconly:
   7894    imul                r5d, [cq], 181
   7895    mov                [cq], eobd ; 0
   7896    mov                 r3d, 32
   7897    add                 r5d, 128
   7898    sar                 r5d, 8
   7899    imul                r5d, 181
   7900    add                 r5d, 384
   7901    sar                 r5d, 9
   7902    add                 rsp, (1+8*32+1*WIN64)*16
   7903    jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
   7904 
;---------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x64_16bpc: 64x64 2-D inverse DCT + add for
; high-bitdepth (16bpc pixel buffer) content.
; Pass 1 runs a 64-point IDCT per group of 4 coefficient columns
; (r5 counts down from 14 in steps of 2); no rect2 scaling here since
; the block is square.  Results (>>2, pd_2 bias) are packed to 16 bits
; and scattered into an on-stack buffer via tbl_Nx64_offset; pass 2
; reuses the 8bpc SSSE3 16x64 row-IDCT code (fast variants chosen
; when eob < 151).  eob == 0 takes the DC-only shortcut.
; x86-32 spills dst/stride/c/base to the top of the frame and reloads
; them around the helper calls; all *16/gprsize offset expressions
; must stay consistent with the cglobal stack-size expression.
;---------------------------------------------------------------------
   7905 cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
   7906                                         0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \
   7907                                         dst, stride, c, eob
   7908    LEA                  r6, base
   7909    test               eobd, eobd
   7910    jz .dconly
   7911 
   7912 %if ARCH_X86_32
   7913    DECLARE_REG_TMP 4, 1, 2, 0, 6
   7914    mov [rsp+gprsize*1+(64*9+8)*16], r0
   7915    mov [rsp+gprsize*2+(64*9+8)*16], r1
   7916    mov [rsp+gprsize*3+(64*9+8)*16], r2
   7917    mov [rsp+gprsize*4+(64*9+8)*16], r6
   7918 %else
   7919    DECLARE_REG_TMP 8, 9, 4, 7, 0
   7920    mov [rsp+gprsize*1+(64*9+1)*16], r9
   7921    mov [rsp+gprsize*0+64*16], r0
   7922 %if WIN64
   7923    mov [rsp+gprsize*2+(64*9+1)*16], r7
   7924    mov [rsp+gprsize*3+(64*9+1)*16], r8
   7925 %endif
   7926 %endif
   7927 %undef cmp
   7928 
   7929    ; remove entirely-zero iterations
   7930    mov                 r5d, 14
   7931    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
   7932    jge .end_zero_loop
   7933    pxor                 m0, m0
; For a column-group with no nonzero coefficients, pre-clear its four
; 8-byte slots in every 64*16-byte row chunk of the pass-1 output
; buffer.  tbl_Nx64_offset yields two words, each packing two byte
; offsets (low byte -> t0/t2, high byte -> t1/t3).
   7934 .zero_loop:
   7935    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
   7936    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
   7937    movzx               t0d, t1b
   7938    movzx               t2d, t3b
   7939    shr                 t1d, 8
   7940    shr                 t3d, 8
   7941    lea                  t4, [rsp+7*64*16]
   7942 .zero_loop_inner:
   7943    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0
   7944    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0
   7945    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0
   7946    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0
   7947    sub                  t4, 64*16
   7948    cmp                  t4, rsp
   7949    jge .zero_loop_inner
   7950 %if ARCH_X86_32
   7951    mov                  r6, [rsp+gprsize*4+(64*9+8)*16]
   7952 %endif
   7953    sub                 r5d, 2
   7954    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
   7955    jl .zero_loop
   7956 .end_zero_loop:
   7957    mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd ; stash eob for pass 2
   7958 %if ARCH_X86_32
   7959    mov                  cq, [rsp+gprsize*3+(64*9+8)*16]
   7960 %endif
   7961    ; actual first pass after skipping all-zero data
   7962 .loop_pass1:
   7963 %if ARCH_X86_64
   7964    mova                m11, [o(pd_2048)]
   7965    mova                m12, [o(clip_18b_min)]
   7966    mova                m13, [o(clip_18b_max)]
   7967    mova                m14, [o(pd_2896)]
   7968 %endif
   7969 
; idct64 odd quarter: coefficient rows 1/31/17/15, 7/25/23/9,
; 5/27/21/11, 3/29/19/13 (no rect2 scaling for the square block).
   7970    mov                  r3, rsp
   7971    lea                  r4, [o(idct64_mul_16bpc)]
   7972    mova                 m0, [cq+128* 1+r5*8]
   7973    mova                 m1, [cq+128*31+r5*8]
   7974    mova                 m2, [cq+128*17+r5*8]
   7975    mova                 m3, [cq+128*15+r5*8]
   7976    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
   7977    mova                 m0, [cq+128* 7+r5*8]
   7978    mova                 m1, [cq+128*25+r5*8]
   7979    mova                 m2, [cq+128*23+r5*8]
   7980    mova                 m3, [cq+128* 9+r5*8]
   7981    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
   7982    mova                 m0, [cq+128* 5+r5*8]
   7983    mova                 m1, [cq+128*27+r5*8]
   7984    mova                 m2, [cq+128*21+r5*8]
   7985    mova                 m3, [cq+128*11+r5*8]
   7986    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
   7987    mova                 m0, [cq+128* 3+r5*8]
   7988    mova                 m1, [cq+128*29+r5*8]
   7989    mova                 m2, [cq+128*19+r5*8]
   7990    mova                 m3, [cq+128*13+r5*8]
   7991    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
   7992    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
   7993 
; idct32 odd half: rows 2/14/18/30 and 6/10/22/26.
   7994    mova                 m0, [cq+128* 2+r5*8]
   7995    mova                 m1, [cq+128*14+r5*8]
   7996    mova                 m2, [cq+128*18+r5*8]
   7997    mova                 m3, [cq+128*30+r5*8]
   7998    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
   7999 
   8000    mova                 m0, [cq+128* 6+r5*8]
   8001    mova                 m1, [cq+128*10+r5*8]
   8002    mova                 m2, [cq+128*22+r5*8]
   8003    mova                 m3, [cq+128*26+r5*8]
   8004    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
   8005    add                  r3, 16*(24+4*ARCH_X86_32)
   8006 
; idct16 odd half (rows 4/12/20/28) and idct8 (rows 0/8/16/24).
   8007    mova                 m0, [cq+128* 4+r5*8]
   8008    mova                 m1, [cq+128*12+r5*8]
   8009    mova                 m2, [cq+128*20+r5*8]
   8010    mova                 m3, [cq+128*28+r5*8]
   8011    call m(idct_16x4_internal_16bpc).main_oddhalf_fast
   8012 
   8013    mova                 m0, [cq+128* 0+r5*8]
   8014    mova                 m1, [cq+128* 8+r5*8]
   8015    mova                 m2, [cq+128*16+r5*8]
   8016    mova                 m3, [cq+128*24+r5*8]
   8017    call m(idct_8x4_internal_16bpc).main_pass1_fast
   8018    call m(idct_8x4_internal_16bpc).round
   8019    mova [r3-(7+4*ARCH_X86_32)*16], m1
   8020    mova [r3-(6+4*ARCH_X86_32)*16], m2
   8021    mova [r3-(5+4*ARCH_X86_32)*16], m3
   8022    mova [r3-(4+4*ARCH_X86_32)*16], m4
   8023    mova [r3-(3+4*ARCH_X86_32)*16], m5
   8024    mova [r3-(2+4*ARCH_X86_32)*16], m6
   8025    mova [r3-(1+4*ARCH_X86_32)*16], m7
   8026    sub                  r3, 16*(40+4*ARCH_X86_32-4)
   8027 
; Rounding bias for the pass-1 >>2 shift; then recombine everything
; into the idct64 outputs.
   8028 %if ARCH_X86_64
   8029    psrld               m15, m11, 10 ; pd_2
   8030 %else
   8031    mova                 m7, [o(pd_2)]
   8032 %endif
   8033    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
   8034 
   8035    lea                  r3, [rsp+56*16]
   8036    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
   8037    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
   8038    movzx               t0d, t1b
   8039    movzx               t2d, t3b
   8040    shr                 t1d, 8
   8041    shr                 t3d, 8
   8042    lea                  t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
   8043    call .shift_transpose
   8044    ; zero cq
   8045    pxor                 m7, m7
   8046 %if ARCH_X86_32
   8047    mov                  cq, [rsp+gprsize*3+(64*9+8)*16]
   8048 %endif
   8049    lea                  r4, [cq+30*128+r5*8]
   8050 .zero_cq_loop:
   8051    REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
   8052    sub                  r4, 4*128
   8053    cmp                  r4, cq
   8054    jg .zero_cq_loop
   8055 %if ARCH_X86_32
   8056    mov                  r6, [rsp+gprsize*4+(64*9+8)*16]
   8057 %endif
   8058    sub                 r5d, 2
   8059    jge .loop_pass1
   8060 
   8061    ; pass=2 code starts here
   8062    mov                eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
   8063 %if ARCH_X86_32
   8064    mov             strideq, [rsp+gprsize*2+(9*64+8)*16]
   8065 %else
   8066    mov                  r0, [rsp+gprsize*0+64*16]
   8067 %endif
   8068    add                 rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16
; Pick the 8bpc helper pair from eob: < 151 uses the veryfast 8x32 /
; fast 16x64 variants, otherwise fast 8x32 / full 16x64.
   8069    cmp                eobd, 151
   8070    jl .fast
   8071    ; fall-through
   8072 %if ARCH_X86_64
   8073    DECLARE_REG_TMP 8, 9
   8074 %else
   8075    DECLARE_REG_TMP 1, 5
   8076 %endif
   8077    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
   8078    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
   8079    jmp .run
   8080 .fast:
   8081    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
   8082    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
   8083 .run:
   8084 
   8085 %if ARCH_X86_64
   8086    lea                  r2, [dstq+128]
   8087    mov                  r7, -16
   8088 %else
   8089    lea                  r2, [rsp+(64*8+3)*16]
   8090    mov      [r2+4*gprsize], t0
   8091    mov      [r2+5*gprsize], t1
   8092    mov                  r1, [r2+2*gprsize]
   8093    mov dword [r2+0*gprsize], 8
   8094 %endif
   8095    jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
   8096 
   8097    ; copy of pass=1 tmp-regs
   8098 %if ARCH_X86_32
   8099    DECLARE_REG_TMP 4, 1, 2, 0, 6
   8100 %else
   8101    DECLARE_REG_TMP 8, 9, 4, 7, 0
   8102 %endif
   8103 
; Pass-1 store: >>2 shift, pack to 16 bits, transpose, then scatter
; the four packed vectors to the slot offsets t0..t3 (precomputed
; from tbl_Nx64_offset), stepping t4 back one 64*16-byte row chunk
; per iteration until r3 reaches the bottom of the stack.
   8104 .shift_transpose:
   8105    mova                 m0, [r3+0*16]
   8106    mova                 m1, [r3+1*16]
   8107    mova                 m2, [r3+2*16]
   8108    mova                 m3, [r3+3*16]
   8109    mova                 m4, [r3+4*16]
   8110    mova                 m5, [r3+5*16]
   8111    mova                 m6, [r3+6*16]
   8112    mova                 m7, [r3+7*16]
   8113    REPX       {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
   8114    packssdw             m0, m1
   8115    packssdw             m2, m3
   8116    packssdw             m4, m5
   8117    packssdw             m6, m7
   8118    call m(idct_8x4_internal_16bpc).transpose4x8packed
   8119    mova          [t4+t0*8], m0
   8120    mova          [t4+t1*8], m1
   8121    mova          [t4+t2*8], m2
   8122    mova          [t4+t3*8], m3
   8123    sub                  t4, 16*64
   8124    sub                  r3, 8*16
   8125    cmp                  r3, rsp
   8126    jg .shift_transpose
   8127    ret
   8128 
; DC-only: dc*181 (~sqrt(2)*128), release this function's frame down
; to the 64x16 frame size, and join the shared 64-wide tail with a
; 64-row count.
   8129 .dconly:
   8130    imul                r5d, [cq], 181
   8131    mov                [cq], eobd ; 0
   8132    mov                 r3d, 64
   8133    add                 rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \
   8134                             (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16
   8135    jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1