tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

vp9itxfm.asm (112846B)


;******************************************************************************
;* VP9 IDCT SIMD optimizations
;*
;* Copyright (C) 2013 Clément Bœsch <u pkh me>
;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "vp9itxfm_template.asm"

SECTION_RODATA 32

%macro VP9_IDCT_COEFFS 2-3 0
const pw_m%1_%2
times 8 dw -%1,  %2
const pw_%2_%1
times 8 dw  %2,  %1

%if %3 == 1
const pw_m%2_m%1
times 8 dw -%2, -%1
%if %1 != %2
const pw_m%2_%1
times 8 dw -%2,  %1
const pw_%1_%2
times 8 dw  %1,  %2
%endif
%endif

%if %1 < 11585
pw_m%1x2:   times 16 dw -%1*2
%elif %1 > 11585
pw_%1x2:    times 16 dw  %1*2
%else
const pw_%1x2
times 16 dw %1*2
%endif

%if %2 != %1
pw_%2x2:    times 16 dw  %2*2
%endif
%endmacro
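
; Each VP9_IDCT_COEFFS invocation below emits the sign permutations of one
; (cos, sin) coefficient pair, plus doubled "x2" copies for the pmulhrsw
; shortcuts. The values are the VP9 transform constants in Q14 fixed point,
; round(2^14*cos(k*pi/64)) paired with round(2^14*sin(k*pi/64)): e.g.
; 11585 ~ 2^14*cos(pi/4), 15137/6270 is the pi/8 pair and 16069/3196 the
; pi/16 pair. The 5283/9929/13377/15212 values further down are the
; corresponding iadst4 sine constants.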

VP9_IDCT_COEFFS 16364,   804
VP9_IDCT_COEFFS 16305,  1606
VP9_IDCT_COEFFS 16069,  3196, 1
VP9_IDCT_COEFFS 15893,  3981
VP9_IDCT_COEFFS 15137,  6270, 1
VP9_IDCT_COEFFS 14811,  7005
VP9_IDCT_COEFFS 14449,  7723
VP9_IDCT_COEFFS 13160,  9760
VP9_IDCT_COEFFS 11585, 11585, 1
VP9_IDCT_COEFFS 11003, 12140
VP9_IDCT_COEFFS 10394, 12665
VP9_IDCT_COEFFS  9102, 13623, 1
VP9_IDCT_COEFFS  8423, 14053
VP9_IDCT_COEFFS  5520, 15426
VP9_IDCT_COEFFS  4756, 15679
VP9_IDCT_COEFFS  2404, 16207

const pw_5283_13377
times 4 dw 5283, 13377
const pw_9929_13377
times 4 dw 9929, 13377
const pw_15212_m13377
times 4 dw 15212, -13377
const pw_15212_9929
times 4 dw 15212, 9929
const pw_m5283_m15212
times 4 dw -5283, -15212
const pw_13377x2
times 8 dw 13377*2
const pw_m13377_13377
times 4 dw -13377, 13377
const pw_13377_0
times 4 dw 13377, 0

cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_512
cextern pw_1024
cextern pw_2048
cextern pw_m1
cextern pd_8192

SECTION .text

%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
    punpckhwd          m%4, m%2, m%1
    punpcklwd          m%2, m%1
    pmaddwd            m%3, m%4, [pw_m%5_%6]
    pmaddwd            m%4, [pw_%6_%5]
    pmaddwd            m%1, m%2, [pw_m%5_%6]
    pmaddwd            m%2, [pw_%6_%5]
%endmacro
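
; The interleave + pmaddwd sequence above is a fixed-point rotation: afterwards
; the dword pairs {m%1,m%3} hold src1*mul2 - src2*mul1 and {m%2,m%4} hold
; src1*mul1 + src2*mul2, still unrounded. VP9_RND_SH_SUMSUB_BA below performs
; the final +8192 >> 14 narrowing back to words.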

%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
    SUMSUB_BA            d, %1, %2, %5
    SUMSUB_BA            d, %3, %4, %5
    paddd              m%1, %6
    paddd              m%2, %6
    paddd              m%3, %6
    paddd              m%4, %6
    psrad              m%1, 14
    psrad              m%2, 14
    psrad              m%3, 14
    psrad              m%4, 14
    packssdw           m%1, m%3
    packssdw           m%2, m%4
%endmacro
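
; Forms the sum and difference of the two 32-bit halves, adds the Q14 rounding
; bias (%6 is expected to hold pd_8192, i.e. 1 << 13), shifts right by 14 and
; packs back to int16 with signed saturation.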

%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
%if mmsize == 32
    pmovzxbw           m%3, [%6]
    pmovzxbw           m%4, [%6+strideq]
%else
    movh               m%3, [%6]
    movh               m%4, [%6+strideq]
    punpcklbw          m%3, m%5
    punpcklbw          m%4, m%5
%endif
    paddw              m%3, m%1
    paddw              m%4, m%2
%if mmsize == 32
    packuswb           m%3, m%4
    ; packuswb packs within each 128-bit lane, so restore the qword order
    vpermq             m%3, m%3, q3120
    mova              [%6], xm%3
    vextracti128 [%6+strideq], m%3, 1
%elif mmsize == 16
    packuswb           m%3, m%4
    movh              [%6], m%3
    movhps    [%6+strideq], m%3
%else
    packuswb           m%3, m%5
    packuswb           m%4, m%5
    movh              [%6], m%3
    movh      [%6+strideq], m%4
%endif
%endmacro
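
; Adds two rows of 16-bit residuals (%1/%2) to two rows of destination pixels,
; clamping to [0,255] on the repack. %5 must hold zero for the byte->word
; unpack on the pre-AVX2 paths.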

%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
%assign %%y 0
%rep %3
%assign %%x 0
%rep %3*2/mmsize
    mova      [%1+%%y+%%x], %4
%assign %%x (%%x+mmsize)
%endrep
%assign %%y (%%y+%2)
%endrep
%endmacro
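
; Clears an nnzcpl x nnzcpl block of int16 coefficients (%3 rows of %3*2 bytes
; each, %2 bytes apart); the decoder relies on the coefficient block being
; zeroed again after each transform.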

;-------------------------------------------------------------------------------------------
; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------

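; Inverse Walsh-Hadamard transform, used for VP9 lossless blocks: the inputs
; are scaled down by 4 (psraw 2) up front, then two 1D WHT passes are applied
; (columns, transpose, rows) before the residual is added to dst and the
; coefficient block is cleared.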
INIT_MMX mmx
cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
    mova                m0, [blockq+0*8]
    mova                m1, [blockq+1*8]
    mova                m2, [blockq+2*8]
    mova                m3, [blockq+3*8]
    psraw               m0, 2
    psraw               m1, 2
    psraw               m2, 2
    psraw               m3, 2

    VP9_IWHT4_1D
    TRANSPOSE4x4W        0, 1, 2, 3, 4
    VP9_IWHT4_1D

    pxor                m4, m4
    VP9_STORE_2X         0, 1, 5, 6, 4
    lea               dstq, [dstq+strideq*2]
    VP9_STORE_2X         2, 3, 5, 6, 4
    ZERO_BLOCK      blockq, 8, 4, m4
    RET

;-------------------------------------------------------------------------------------------
; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------

; 2x2 top left corner
%macro VP9_IDCT4_2x2_1D 0
    pmulhrsw            m0, m5                              ; m0=t1
    mova                m2, m0                              ; m2=t0
    mova                m3, m1
    pmulhrsw            m1, m6                              ; m1=t2
    pmulhrsw            m3, m7                              ; m3=t3
    VP9_IDCT4_1D_FINALIZE
%endmacro
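
; With only the top-left 2x2 corner nonzero, the other input rows are zero and
; each butterfly collapses to a single multiply. The doubled constants
; (pw_*x2) make pmulhrsw's (x*c*2 + 2^14) >> 15 equal the Q14 rounding
; (x*c + 2^13) >> 14 used by the full transform.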

%macro VP9_IDCT4_WRITEOUT 0
%if cpuflag(ssse3)
    mova                m5, [pw_2048]
    pmulhrsw            m0, m5              ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
    pmulhrsw            m1, m5
%else
    mova                m5, [pw_8]
    paddw               m0, m5
    paddw               m1, m5
    psraw               m0, 4
    psraw               m1, 4
%endif
    VP9_STORE_2X         0,  1,  6,  7,  4
    lea               dstq, [dstq+2*strideq]
%if cpuflag(ssse3)
    pmulhrsw            m2, m5
    pmulhrsw            m3, m5
%else
    paddw               m2, m5
    paddw               m3, m5
    psraw               m2, 4
    psraw               m3, 4
%endif
    VP9_STORE_2X         2,  3,  6,  7,  4
%endmacro

%macro IDCT_4x4_FN 1
INIT_MMX %1
cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob

%if cpuflag(ssse3)
    cmp eobd, 4 ; 2x2 or smaller
    jg .idctfull

    cmp eobd, 1 ; faster path for when only DC is set
    jne .idct2x2
%else
    cmp eobd, 1
    jg .idctfull
%endif

%if cpuflag(ssse3)
    movd                m0, [blockq]
    mova                m5, [pw_11585x2]
    pmulhrsw            m0, m5
    pmulhrsw            m0, m5
%else
    DEFINE_ARGS dst, stride, block, coef
    movsx            coefd, word [blockq]
    imul             coefd, 11585
    add              coefd, 8192
    sar              coefd, 14
    imul             coefd, 11585
    add              coefd, (8 << 14) + 8192
    sar              coefd, 14 + 4
    movd                m0, coefd
%endif
    pshufw              m0, m0, 0
    pxor                m4, m4
    movh          [blockq], m4
%if cpuflag(ssse3)
    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
%endif
    VP9_STORE_2X         0,  0,  6,  7,  4
    lea               dstq, [dstq+2*strideq]
    VP9_STORE_2X         0,  0,  6,  7,  4
    RET

%if cpuflag(ssse3)
; faster path for when only the top-left 2x2 block is set
.idct2x2:
    movd                m0, [blockq+0]
    movd                m1, [blockq+8]
    mova                m5, [pw_11585x2]
    mova                m6, [pw_6270x2]
    mova                m7, [pw_15137x2]
    VP9_IDCT4_2x2_1D
    ; partial 2x4 transpose
    punpcklwd           m0, m1
    punpcklwd           m2, m3
    SBUTTERFLY          dq, 0, 2, 1
    SWAP                1, 2
    VP9_IDCT4_2x2_1D
    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
    movh       [blockq+ 0], m4
    movh       [blockq+ 8], m4
    VP9_IDCT4_WRITEOUT
    RET
%endif

.idctfull: ; generic full 4x4 idct/idct
    mova                m0, [blockq+ 0]
    mova                m1, [blockq+ 8]
    mova                m2, [blockq+16]
    mova                m3, [blockq+24]
%if cpuflag(ssse3)
    mova                m6, [pw_11585x2]
%endif
    mova                m7, [pd_8192]       ; rounding
    VP9_IDCT4_1D
    TRANSPOSE4x4W  0, 1, 2, 3, 4
    VP9_IDCT4_1D
    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
    mova       [blockq+ 0], m4
    mova       [blockq+ 8], m4
    mova       [blockq+16], m4
    mova       [blockq+24], m4
    VP9_IDCT4_WRITEOUT
    RET
%endmacro

IDCT_4x4_FN mmxext
IDCT_4x4_FN ssse3

;-------------------------------------------------------------------------------------------
; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------

%macro IADST4_FN 5
INIT_MMX %5
cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
%if WIN64 && notcpuflag(ssse3)
INIT_XMM cpuname
    WIN64_SPILL_XMM 8
INIT_MMX cpuname
%endif
    movdqa            xmm5, [pd_8192]
    mova                m0, [blockq+ 0]
    mova                m1, [blockq+ 8]
    mova                m2, [blockq+16]
    mova                m3, [blockq+24]
%if cpuflag(ssse3)
    mova                m6, [pw_11585x2]
%endif
%ifnidn %1%3, iadstiadst
    movdq2q             m7, xmm5
%endif
    VP9_%2_1D
    TRANSPOSE4x4W  0, 1, 2, 3, 4
    VP9_%4_1D
    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
    mova       [blockq+ 0], m4
    mova       [blockq+ 8], m4
    mova       [blockq+16], m4
    mova       [blockq+24], m4
    VP9_IDCT4_WRITEOUT
    RET
%endmacro

IADST4_FN idct,  IDCT4,  iadst, IADST4, sse2
IADST4_FN iadst, IADST4, idct,  IDCT4,  sse2
IADST4_FN iadst, IADST4, iadst, IADST4, sse2

IADST4_FN idct,  IDCT4,  iadst, IADST4, ssse3
IADST4_FN iadst, IADST4, idct,  IDCT4,  ssse3
IADST4_FN iadst, IADST4, iadst, IADST4, ssse3

%macro SCRATCH 3
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova              [%3], m%1
%endif
%endmacro

%macro UNSCRATCH 3
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%endmacro
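
; SCRATCH/UNSCRATCH paper over the register-count difference between targets:
; on x86-64 the value is parked in one of the extra registers m8-m15 (a free
; SWAP), while on x86-32, which only has m0-m7, it spills to the given memory
; slot instead.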

;-------------------------------------------------------------------------------------------
; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------

%macro VP9_IDCT8_1D_FINALIZE 0
    SUMSUB_BA            w,  3,  6, 5                       ; m3=t0+t7, m6=t0-t7
    SUMSUB_BA            w,  1,  2, 5                       ; m1=t1+t6, m2=t1-t6
    SUMSUB_BA            w,  7,  0, 5                       ; m7=t2+t5, m0=t2-t5

    UNSCRATCH            5, 8, blockq+ 0
    SCRATCH              2, 8, blockq+ 0

    SUMSUB_BA            w,  5,  4, 2                       ; m5=t3+t4, m4=t3-t4
    SWAP                 7,  6,  2
    SWAP                 3,  5,  0

%if ARCH_X86_64
    SWAP                 6, 8
%endif
%endmacro

; x86-32:
; - in: m0/m4 are in mem
; - out: m6 is in mem
; x86-64:
; - everything is in registers (m0-7)
%macro VP9_IDCT8_1D 0
%if ARCH_X86_64
    SWAP                 0, 8
    SWAP                 4, 9
%endif

    VP9_UNPACK_MULSUB_2W_4X 5,  3,  9102, 13623, D_8192_REG, 0, 4  ; m5=t5a, m3=t6a
    VP9_UNPACK_MULSUB_2W_4X 1,  7, 16069,  3196, D_8192_REG, 0, 4  ; m1=t4a, m7=t7a
    SUMSUB_BA            w,  5,  1, 0                       ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
    SUMSUB_BA            w,  3,  7, 0                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
%if cpuflag(ssse3)
    SUMSUB_BA            w,  1,  7, 0                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
%else
    VP9_UNPACK_MULSUB_2W_4X 7,  1, 11585, 11585, D_8192_REG, 0, 4
%endif
    VP9_UNPACK_MULSUB_2W_4X 2,  6, 15137,  6270, D_8192_REG, 0, 4  ; m2=t2a, m6=t3a

    UNSCRATCH            0, 8, blockq+ 0    ; IN(0)
    UNSCRATCH            4, 9, blockq+64    ; IN(4)
    SCRATCH              5, 8, blockq+ 0

%if cpuflag(ssse3)
    SUMSUB_BA            w, 4, 0, 5                         ; m4=IN(0)+IN(4) m0=IN(0)-IN(4)
    pmulhrsw            m4, W_11585x2_REG                   ; m4=t0a
    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a
%else
    SCRATCH              7, 9, blockq+64
    VP9_UNPACK_MULSUB_2W_4X 0,  4, 11585, 11585, D_8192_REG, 5, 7
    UNSCRATCH            7, 9, blockq+64
%endif
    SUMSUB_BA            w,  6,  4, 5                       ; m6=t0a+t3a (t0), m4=t0a-t3a (t3)
    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)

    VP9_IDCT8_1D_FINALIZE
%endmacro

%macro VP9_IDCT8_4x4_1D 0
    pmulhrsw            m0, W_11585x2_REG                   ; m0=t1a/t0a
    pmulhrsw            m6, m2, [pw_15137x2]                ; m6=t3a
    pmulhrsw            m2, [pw_6270x2]                     ; m2=t2a
    pmulhrsw            m7, m1, [pw_16069x2]                ; m7=t7a
    pmulhrsw            m1, [pw_3196x2]                     ; m1=t4a
    pmulhrsw            m5, m3, [pw_m9102x2]                ; m5=t5a
    pmulhrsw            m3, [pw_13623x2]                    ; m3=t6a
    SUMSUB_BA            w,  5,  1, 4                       ; m1=t4a+t5a (t4), m5=t4a-t5a (t5a)
    SUMSUB_BA            w,  3,  7, 4                       ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
    SUMSUB_BA            w,  1,  7, 4                       ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
    pmulhrsw            m1, W_11585x2_REG                   ; m1=t6
    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
    psubw               m4, m0, m6                          ; m4=t0a-t3a (t3)
    paddw               m6, m0                              ; m6=t0a+t3a (t0)
    SCRATCH              5,  8, blockq+ 0
    SUMSUB_BA            w,  2,  0, 5                       ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
    VP9_IDCT8_1D_FINALIZE
%endmacro

%macro VP9_IDCT8_2x2_1D 1
    pmulhrsw            m0, W_11585x2_REG                   ; m0=t0
    pmulhrsw            m3, m1, W_16069x2_REG               ; m3=t7
    pmulhrsw            m1, W_3196x2_REG                    ; m1=t4
    psubw               m7, m3, m1                          ; t5 = t7a - t4a
    paddw               m5, m3, m1                          ; t6 = t7a + t4a
    pmulhrsw            m7, W_11585x2_REG                   ; m7=t5
    pmulhrsw            m5, W_11585x2_REG                   ; m5=t6
    SWAP                 5,  1
    ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
    psubw               m6, m0, m3                          ; m6=t0-t7
    paddw               m3, m0                              ; m3=t0+t7
    psubw               m2, m0, m1                          ; m2=t1-t6
    paddw               m1, m0                              ; m1=t1+t6
%if %1 == 1
    punpcklwd           m3, m1
%define SCRATCH_REG 1
%elif ARCH_X86_32
    mova       [blockq+ 0], m2
%define SCRATCH_REG 2
%else
%define SCRATCH_REG 8
%endif
    psubw               m4, m0, m5                          ; m4=t3-t4
    paddw               m5, m0                              ; m5=t3+t4
    SUMSUB_BA            w,  7,  0, SCRATCH_REG             ; m7=t2+t5, m0=t2-t5
    SWAP                 7,  6,  2
    SWAP                 3,  5,  0
%undef SCRATCH_REG
%endmacro

%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift
%if cpuflag(ssse3)
    pmulhrsw           m%1, %6              ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
    pmulhrsw           m%2, %6
%else
    paddw              m%1, %6
    paddw              m%2, %6
    psraw              m%1, %7
    psraw              m%2, %7
%endif
%if %0 <= 7
    VP9_STORE_2X        %1, %2, %3, %4, %5
%else
    VP9_STORE_2X        %1, %2, %3, %4, %5, %8
%endif
%endmacro

; x86-32:
; - m6 is in mem
; x86-64:
; - m8 holds m6 (SWAP)
; m6 holds zero
%macro VP9_IDCT8_WRITEOUT 0
%if ARCH_X86_64
%if cpuflag(ssse3)
    mova                m9, [pw_1024]
%else
    mova                m9, [pw_16]
%endif
%define ROUND_REG m9
%else
%if cpuflag(ssse3)
%define ROUND_REG [pw_1024]
%else
%define ROUND_REG [pw_16]
%endif
%endif
    SCRATCH              5, 10, blockq+16
    SCRATCH              7, 11, blockq+32
    VP9_IDCT8_WRITEx2    0,  1, 5, 7, 6, ROUND_REG
    lea               dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2    2,  3, 5, 7, 6, ROUND_REG
    lea               dstq, [dstq+2*strideq]
    UNSCRATCH            5, 10, blockq+16
    UNSCRATCH            7, 11, blockq+32
    VP9_IDCT8_WRITEx2    4,  5, 0, 1, 6, ROUND_REG
    lea               dstq, [dstq+2*strideq]
    UNSCRATCH            5, 8, blockq+ 0
    VP9_IDCT8_WRITEx2    5,  7, 0, 1, 6, ROUND_REG

%undef ROUND_REG
%endmacro

%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2
INIT_XMM %1
cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob

%if cpuflag(ssse3)
%if ARCH_X86_64
    mova               m12, [pw_11585x2]    ; often used
%define W_11585x2_REG m12
%else
%define W_11585x2_REG [pw_11585x2]
%endif

    cmp eobd, 12 ; top left half or less
    jg .idctfull

    cmp eobd, 3  ; top left corner or less
    jg .idcthalf

    cmp eobd, 1 ; faster path for when only DC is set
    jne .idcttopleftcorner
%else
    cmp eobd, 1
    jg .idctfull
%endif

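; The eob cutoffs above follow the scan order: eob 1 means DC only, eob <= 3
; keeps all nonzero coefficients in the top-left 2x2 corner, and eob <= 12
; keeps them within the top-left 4x4 quadrant, so progressively smaller
; partial transforms suffice.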
%if cpuflag(ssse3)
    movd                m0, [blockq]
    pmulhrsw            m0, W_11585x2_REG
    pmulhrsw            m0, W_11585x2_REG
%else
    DEFINE_ARGS dst, stride, block, coef
    movsx            coefd, word [blockq]
    imul             coefd, 11585
    add              coefd, 8192
    sar              coefd, 14
    imul             coefd, 11585
    add              coefd, (16 << 14) + 8192
    sar              coefd, 14 + 5
    movd                m0, coefd
%endif
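    ; in the scalar path above, the two imul/sar pairs are the two 1D DC
    ; multiplies by 11585/2^14; the second one folds the final (x+16)>>5
    ; output rounding into the same shift, hence the (16 << 14) + 8192 bias
    ; and the combined shift by 14 + 5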
    SPLATW              m0, m0, 0
    pxor                m4, m4
    movd          [blockq], m4
%if cpuflag(ssse3)
    pmulhrsw            m0, [pw_1024]       ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
%endif
%rep 3
    VP9_STORE_2X         0,  0,  6,  7,  4
    lea               dstq, [dstq+2*strideq]
%endrep
    VP9_STORE_2X         0,  0,  6,  7,  4
    RET

%if cpuflag(ssse3)
; faster path for when only the top-left corner is set (3 inputs: DC, the
; coefficient to the right of DC, and the one below it). Note: this also
; works for a 2x2 block.
.idcttopleftcorner:
    movd                m0, [blockq+0]
    movd                m1, [blockq+16]
%if ARCH_X86_64
    mova               m10, [pw_3196x2]
    mova               m11, [pw_16069x2]
%define W_3196x2_REG m10
%define W_16069x2_REG m11
%else
%define W_3196x2_REG [pw_3196x2]
%define W_16069x2_REG [pw_16069x2]
%endif
    VP9_IDCT8_2x2_1D 1
    ; partial 2x8 transpose
    ; punpcklwd m0, m1 already done inside idct
    punpcklwd           m2, m3
    punpcklwd           m4, m5
    punpcklwd           m6, m7
    punpckldq           m0, m2
    punpckldq           m4, m6
    SBUTTERFLY         qdq, 0, 4, 1
    SWAP                 1, 4
    VP9_IDCT8_2x2_1D 2
%if ARCH_X86_64
    SWAP                 6, 8
%endif
    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
    VP9_IDCT8_WRITEOUT
%if ARCH_X86_64
    movd       [blockq+ 0], m6
    movd       [blockq+16], m6
%else
    mova       [blockq+ 0], m6
    mova       [blockq+16], m6
    mova       [blockq+32], m6
%endif
    RET

.idcthalf:
    movh                m0, [blockq + 0]
    movh                m1, [blockq +16]
    movh                m2, [blockq +32]
    movh                m3, [blockq +48]
    VP9_IDCT8_4x4_1D
    ; partial 4x8 transpose
%if ARCH_X86_32
    mova                m6, [blockq+ 0]
%endif
    punpcklwd           m0, m1
    punpcklwd           m2, m3
    punpcklwd           m4, m5
    punpcklwd           m6, m7
    SBUTTERFLY          dq, 0, 2, 1
    SBUTTERFLY          dq, 4, 6, 5
    SBUTTERFLY         qdq, 0, 4, 1
    SBUTTERFLY         qdq, 2, 6, 5
    SWAP                 1, 4
    SWAP                 3, 6
    VP9_IDCT8_4x4_1D
%if ARCH_X86_64
    SWAP                 6, 8
%endif
    pxor                m6, m6
    VP9_IDCT8_WRITEOUT
%if ARCH_X86_64
    movh       [blockq+ 0], m6
    movh       [blockq+16], m6
    movh       [blockq+32], m6
%else
    mova       [blockq+ 0], m6
    mova       [blockq+16], m6
    mova       [blockq+32], m6
%endif
    movh       [blockq+48], m6
    RET
%endif

.idctfull: ; generic full 8x8 idct/idct
%if ARCH_X86_64
    mova                m0, [blockq+  0]    ; IN(0)
%endif
    mova                m1, [blockq+ 16]    ; IN(1)
    mova                m2, [blockq+ 32]    ; IN(2)
    mova                m3, [blockq+ 48]    ; IN(3)
%if ARCH_X86_64
    mova                m4, [blockq+ 64]    ; IN(4)
%endif
    mova                m5, [blockq+ 80]    ; IN(5)
    mova                m6, [blockq+ 96]    ; IN(6)
    mova                m7, [blockq+112]    ; IN(7)
%if ARCH_X86_64
    mova               m11, [pd_8192]       ; rounding
%define D_8192_REG m11
%else
%define D_8192_REG [pd_8192]
%endif
    VP9_IDCT8_1D
%if ARCH_X86_64
    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
    mova        [blockq+0], m0
%endif
    VP9_IDCT8_1D

%if ARCH_X86_64
    SWAP                 6, 8
%endif
    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
    VP9_IDCT8_WRITEOUT
    ZERO_BLOCK      blockq, 16, 8, m6
    RET
%undef W_11585x2_REG
%endmacro

VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
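
; The second argument to VP9_IDCT_IDCT_8x8_ADD_XMM is the XMM register count
; declared to cglobal (used for WIN64 xmm spilling); the ssse3/avx versions
; need one more register to cache pw_11585x2 in m12.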

;---------------------------------------------------------------------------------------------
; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------

; x86-32:
; - in: m0/3/4/7 are in mem [blockq+N*16]
; - out: m6 is in mem [blockq+0]
; x86-64:
; - everything is in registers
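; notation: [d] marks the 32-bit (dword) intermediates produced by
; VP9_UNPACK_MULSUB_2D_4X, [w] the 16-bit words after the rounding shift in
; VP9_RND_SH_SUMSUB_BA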
%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7
%if ARCH_X86_64
    SWAP                     0, 8
    SWAP                     3, 9
    SWAP                     4, 10
    SWAP                     7, 11
%endif

    VP9_UNPACK_MULSUB_2D_4X  5,  2,  0,  3, 14449,  7723    ; m5/2=t3[d], m2/4=t2[d]
    VP9_UNPACK_MULSUB_2D_4X  1,  6,  4,  7,  4756, 15679    ; m1/4=t7[d], m6/7=t6[d]
    SCRATCH                  4, 12, blockq+1*16
    VP9_RND_SH_SUMSUB_BA     6,  2,  7,  3, 4, D_8192_REG   ; m6=t2[w], m2=t6[w]
    UNSCRATCH                4, 12, blockq+1*16
    VP9_RND_SH_SUMSUB_BA     1,  5,  4,  0, 3, D_8192_REG   ; m1=t3[w], m5=t7[w]

    UNSCRATCH                0,  8, blockq+16*0
    UNSCRATCH                3,  9, blockq+16*3
    UNSCRATCH                4, 10, blockq+16*4
    UNSCRATCH                7, 11, blockq+16*7
    SCRATCH                  1,  8, blockq+16*1
    SCRATCH                  2,  9, blockq+16*2
    SCRATCH                  5, 10, blockq+16*5
    SCRATCH                  6, 11, blockq+16*6

    VP9_UNPACK_MULSUB_2D_4X  7,  0,  1,  2, 16305,  1606    ; m7/1=t1[d], m0/2=t0[d]
    VP9_UNPACK_MULSUB_2D_4X  3,  4,  5,  6, 10394, 12665    ; m3/5=t5[d], m4/6=t4[d]
    SCRATCH                  1, 12, blockq+ 0*16
    VP9_RND_SH_SUMSUB_BA     4,  0,  6,  2, 1, D_8192_REG   ; m4=t0[w], m0=t4[w]
    UNSCRATCH                1, 12, blockq+ 0*16
    VP9_RND_SH_SUMSUB_BA     3,  7,  5,  1, 2, D_8192_REG   ; m3=t1[w], m7=t5[w]

    UNSCRATCH                2,  9, blockq+16*2
    UNSCRATCH                5, 10, blockq+16*5
    SCRATCH                  3,  9, blockq+16*3
    SCRATCH                  4, 10, blockq+16*4

    ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7

    VP9_UNPACK_MULSUB_2D_4X  0,  7,  1,  3, 15137,  6270    ; m0/1=t5[d], m7/3=t4[d]
    VP9_UNPACK_MULSUB_2D_4X  5,  2,  4,  6,  6270, 15137    ; m5/4=t6[d], m2/6=t7[d]
    SCRATCH                  1, 12, blockq+ 0*16
    VP9_RND_SH_SUMSUB_BA     5,  7,  4,  3, 1, D_8192_REG
    UNSCRATCH                1, 12, blockq+ 0*16
    PSIGNW                  m5, W_M1_REG                    ; m5=out1[w], m7=t6[w]
    VP9_RND_SH_SUMSUB_BA     2,  0,  6,  1, 3, D_8192_REG   ; m2=out6[w], m0=t7[w]

    UNSCRATCH                1,  8, blockq+16*1
    UNSCRATCH                3,  9, blockq+16*3
    UNSCRATCH                4, 10, blockq+16*4
    UNSCRATCH                6, 11, blockq+16*6
    SCRATCH                  2,  8, blockq+16*0

    SUMSUB_BA                w,  6,  4, 2                   ; m6=out0[w], m4=t2[w]
    SUMSUB_BA                w,  1,  3, 2
    PSIGNW                  m1, W_M1_REG                    ; m1=out7[w], m3=t3[w]

    ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7

    ; unfortunately, the code below overflows in some cases
%if 0; cpuflag(ssse3)
    SUMSUB_BA                w,  3,  4,  2
    SUMSUB_BA                w,  0,  7,  2
    pmulhrsw                m3, W_11585x2_REG
    pmulhrsw                m7, W_11585x2_REG
    pmulhrsw                m4, W_11585x2_REG               ; out4
    pmulhrsw                m0, W_11585x2_REG               ; out2
%else
    SCRATCH                  5,  9, blockq+16*1
    VP9_UNPACK_MULSUB_2W_4X  4, 3, 11585, 11585, D_8192_REG, 2, 5
    VP9_UNPACK_MULSUB_2W_4X  7, 0, 11585, 11585, D_8192_REG, 2, 5
    UNSCRATCH                5,  9, blockq+16*1
%endif
    PSIGNW                  m3, W_M1_REG                    ; out3
    PSIGNW                  m7, W_M1_REG                    ; out5

    ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7

%if ARCH_X86_64
    SWAP                     2, 8
%endif
    SWAP                     0, 6, 2
    SWAP                     7, 1, 5
%endmacro

%macro IADST8_FN 6
INIT_XMM %5
cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob

%ifidn %1, idct
%define first_is_idct 1
%else
%define first_is_idct 0
%endif

%ifidn %3, idct
%define second_is_idct 1
%else
%define second_is_idct 0
%endif

%if ARCH_X86_64
    mova                m0, [blockq+  0]    ; IN(0)
%endif
    mova                m1, [blockq+ 16]    ; IN(1)
    mova                m2, [blockq+ 32]    ; IN(2)
%if ARCH_X86_64 || first_is_idct
    mova                m3, [blockq+ 48]    ; IN(3)
%endif
%if ARCH_X86_64
    mova                m4, [blockq+ 64]    ; IN(4)
%endif
    mova                m5, [blockq+ 80]    ; IN(5)
    mova                m6, [blockq+ 96]    ; IN(6)
%if ARCH_X86_64 || first_is_idct
    mova                m7, [blockq+112]    ; IN(7)
%endif
%if ARCH_X86_64
%if cpuflag(ssse3)
    mova               m15, [pw_11585x2]    ; often used
%endif
    mova               m13, [pd_8192]       ; rounding
    mova               m14, [pw_m1]
%define W_11585x2_REG m15
%define D_8192_REG m13
%define W_M1_REG m14
%else
%define W_11585x2_REG [pw_11585x2]
%define D_8192_REG [pd_8192]
%define W_M1_REG [pw_m1]
%endif

    ; note different calling conventions for idct8 vs. iadst8 on x86-32
    VP9_%2_1D
%if ARCH_X86_64
    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
    mova      [blockq+  0], m0
%if second_is_idct == 0
    mova      [blockq+ 48], m3
    mova      [blockq+112], m7
%endif
%endif
    VP9_%4_1D

%if ARCH_X86_64
    SWAP                 6, 8
%endif
    pxor                m6, m6  ; used for the block reset, and VP9_STORE_2X
    VP9_IDCT8_WRITEOUT
    ZERO_BLOCK      blockq, 16, 8, m6
    RET

%undef W_11585x2_REG
%undef first_is_idct
%undef second_is_idct

%endmacro

IADST8_FN idct,  IDCT8,  iadst, IADST8, sse2, 15
IADST8_FN iadst, IADST8, idct,  IDCT8,  sse2, 15
IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
IADST8_FN idct,  IDCT8,  iadst, IADST8, ssse3, 16
IADST8_FN idct,  IDCT8,  iadst, IADST8, avx, 16
IADST8_FN iadst, IADST8, idct,  IDCT8,  ssse3, 16
IADST8_FN iadst, IADST8, idct,  IDCT8,  avx, 16
IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
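
; Each IADST8_FN instantiation emits one hybrid transform: the %1/%2 arguments
; name the first-pass 1D transform and %3/%4 the second pass, with a transpose
; in between, so e.g. vp9_idct_iadst_8x8_add runs IDCT8 followed by IADST8.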

;---------------------------------------------------------------------------------------------
; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------

; x86-64:
; at the end of this macro, m7 is stored in [%4+15*%5]
; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
; the following sumsubs have not been done yet:
;    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
;    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1,
; and the following sumsubs have not been done yet:
;    SUMSUB_BA            w, x13, x14, 7       ; t6, t9
;    SUMSUB_BA            w, x15, x12, 7       ; t7, t8

%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
%if %2 <= 4
    mova                m3, [%1+ 1*%3]      ; IN(1)
    mova                m0, [%1+ 3*%3]      ; IN(3)

    pmulhrsw            m4, m3,  [pw_16305x2]       ; t14-15
    pmulhrsw            m3, [pw_1606x2]             ; t8-9
    pmulhrsw            m7, m0,  [pw_m4756x2]       ; t10-11
    pmulhrsw            m0, [pw_15679x2]            ; t12-13

    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15

    VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137,  6270, [pd_8192], 1, 6 ; t9,  t14
    SCRATCH              4, 10, %4+ 1*%5
    SCRATCH              5, 11, %4+ 7*%5
    VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
    UNSCRATCH            5, 11, %4+ 7*%5

    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
%else
    mova                m5, [%1+ 1*%3]      ; IN(1)
    mova                m4, [%1+ 7*%3]      ; IN(7)
%if %2 <= 8
    pmulhrsw            m2, m5,  [pw_16305x2]       ; t15
    pmulhrsw            m5, [pw_1606x2]             ; t8
    pmulhrsw            m3, m4,  [pw_m10394x2]      ; t9
    pmulhrsw            m4, [pw_12665x2]            ; t14
%else
    mova                m3, [%1+ 9*%3]      ; IN(9)
    mova                m2, [%1+15*%3]      ; IN(15)

    ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
    ; m11=in8, m3=in9, m12=in10, m0=in11, m8=in12, m1=in13, m13=in14, m2=in15

    VP9_UNPACK_MULSUB_2W_4X   5,   2, 16305,  1606, [pd_8192], 0, 1 ; t8,  t15
    VP9_UNPACK_MULSUB_2W_4X   3,   4, 10394, 12665, [pd_8192], 0, 1 ; t9,  t14
%endif

    SUMSUB_BA            w,  3,  5, 0       ; t8,  t9
    SUMSUB_BA            w,  4,  2, 0       ; t15, t14

    VP9_UNPACK_MULSUB_2W_4X   2,   5, 15137,  6270, [pd_8192], 0, 1 ; t9,  t14

    SCRATCH              4, 10, %4+ 1*%5
    SCRATCH              5, 11, %4+ 7*%5

    mova                m6, [%1+ 3*%3]      ; IN(3)
    mova                m7, [%1+ 5*%3]      ; IN(5)
%if %2 <= 8
    pmulhrsw            m0, m7,  [pw_14449x2]       ; t13
    pmulhrsw            m7, [pw_7723x2]             ; t10
    pmulhrsw            m1, m6,  [pw_m4756x2]       ; t11
    pmulhrsw            m6, [pw_15679x2]            ; t12
%else
    mova                m0, [%1+11*%3]      ; IN(11)
    mova                m1, [%1+13*%3]      ; IN(13)

    VP9_UNPACK_MULSUB_2W_4X   7,   0, 14449,  7723, [pd_8192], 4, 5 ; t10, t13
    VP9_UNPACK_MULSUB_2W_4X   1,   6,  4756, 15679, [pd_8192], 4, 5 ; t11, t12
%endif

    ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
    ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15

    SUMSUB_BA            w,  7,  1, 4       ; t11, t10
    SUMSUB_BA            w,  0,  6, 4       ; t12, t13

    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15

    VP9_UNPACK_MULSUB_2W_4X   6,   1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13

    UNSCRATCH            5, 11, %4+ 7*%5
%endif

    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
    ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15

    SUMSUB_BA            w,  7,  3, 4       ; t8,  t11

    ; backup first register
    mova        [%4+15*%5], m7

    SUMSUB_BA            w,  6,  2, 7       ; t9,  t10
    UNSCRATCH            4, 10, %4+ 1*%5
    SUMSUB_BA            w,  0,  4, 7       ; t15, t12
    SUMSUB_BA            w,  1,  5, 7       ; t14, t13

    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15

%if cpuflag(ssse3) && %6 == 0
    SUMSUB_BA            w,  2,  5, 7
    SUMSUB_BA            w,  3,  4, 7
    pmulhrsw            m5, [pw_11585x2]    ; t10
    pmulhrsw            m4, [pw_11585x2]    ; t11
    pmulhrsw            m3, [pw_11585x2]    ; t12
    pmulhrsw            m2, [pw_11585x2]    ; t13
%else
    SCRATCH              6, 10, %4+ 1*%5
    VP9_UNPACK_MULSUB_2W_4X   5,   2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13
    VP9_UNPACK_MULSUB_2W_4X   4,   3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12
    UNSCRATCH            6, 10, %4+ 1*%5
%endif

    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
    ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15

    SCRATCH              0,  8, %4+ 1*%5
    SCRATCH              1,  9, %4+ 3*%5
    SCRATCH              2, 10, %4+ 5*%5
    SCRATCH              3, 11, %4+ 7*%5
    SCRATCH              4, 12, %4+ 9*%5
    SCRATCH              5, 13, %4+11*%5
    SCRATCH              6, 14, %4+13*%5

    ; even (tx8x8)
%if %2 <= 4
    mova                m3, [%1+ 0*%3]      ; IN(0)
    mova                m4, [%1+ 2*%3]      ; IN(2)

    pmulhrsw            m3, [pw_11585x2]    ; t0-t3
    pmulhrsw            m7, m4, [pw_16069x2]        ; t6-7
    pmulhrsw            m4, [pw_3196x2]             ; t4-5

%if 0 ; overflows :(
    paddw               m6, m7, m4
    psubw               m5, m7, m4
    pmulhrsw            m5, [pw_11585x2]            ; t5
    pmulhrsw            m6, [pw_11585x2]            ; t6
%else
    VP9_UNPACK_MULSUB_2W_4X  5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5,  t6
%endif

    psubw               m0, m3, m7
    paddw               m7, m3
    psubw               m1, m3, m6
    paddw               m6, m3
    psubw               m2, m3, m5
    paddw               m5, m3

%if ARCH_X86_32
    SWAP                 0, 7
%endif
    SCRATCH              7, 15, %4+12*%5
%else
    mova                m6, [%1+ 2*%3]      ; IN(2)
    mova                m1, [%1+ 4*%3]      ; IN(4)
    mova                m7, [%1+ 6*%3]      ; IN(6)
%if %2 <= 8
    pmulhrsw            m0, m1,  [pw_15137x2]       ; t3
    pmulhrsw            m1, [pw_6270x2]             ; t2
    pmulhrsw            m5, m6, [pw_16069x2]        ; t7
    pmulhrsw            m6, [pw_3196x2]             ; t4
    pmulhrsw            m4, m7, [pw_m9102x2]        ; t5
    pmulhrsw            m7, [pw_13623x2]            ; t6
%else
    mova                m4, [%1+10*%3]      ; IN(10)
    mova                m0, [%1+12*%3]      ; IN(12)
    mova                m5, [%1+14*%3]      ; IN(14)

    VP9_UNPACK_MULSUB_2W_4X   1,   0, 15137,  6270, [pd_8192], 2, 3 ; t2,  t3
    VP9_UNPACK_MULSUB_2W_4X   6,   5, 16069,  3196, [pd_8192], 2, 3 ; t4,  t7
    VP9_UNPACK_MULSUB_2W_4X   4,   7,  9102, 13623, [pd_8192], 2, 3 ; t5,  t6
%endif

    SUMSUB_BA            w,  4,  6, 2       ; t4,  t5
    SUMSUB_BA            w,  7,  5, 2       ; t7,  t6

%if cpuflag(ssse3) && %6 == 0
    SUMSUB_BA            w,  6,  5, 2
    pmulhrsw            m5, [pw_11585x2]                              ; t5
    pmulhrsw            m6, [pw_11585x2]                              ; t6
%else
    VP9_UNPACK_MULSUB_2W_4X  5,  6, 11585, 11585, [pd_8192], 2, 3 ; t5,  t6
%endif

    SCRATCH              5, 15, %4+10*%5
    mova                m2, [%1+ 0*%3]      ; IN(0)
%if %2 <= 8
    pmulhrsw            m2, [pw_11585x2]    ; t0 and t1
    psubw               m3, m2, m0
    paddw               m0, m2

    SUMSUB_BA            w,  7,  0, 5       ; t0,  t7
%else
    mova                m3, [%1+ 8*%3]      ; IN(8)

    ; from 3 stages back
%if cpuflag(ssse3) && %6 == 0
    SUMSUB_BA            w,  3,  2, 5
    pmulhrsw            m3, [pw_11585x2]    ; t0
    pmulhrsw            m2, [pw_11585x2]    ; t1
%else
    mova        [%1+ 0*%3], m0
    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585,  11585, [pd_8192], 5, 0 ; t0, t1
    mova                m0, [%1+ 0*%3]
%endif

    ; from 2 stages back
    SUMSUB_BA            w,  0,  3, 5      ; t0,  t3

    SUMSUB_BA            w,  7,  0, 5      ; t0,  t7
%endif
    UNSCRATCH            5, 15, %4+10*%5
%if ARCH_X86_32
    SWAP                 0, 7
%endif
    SCRATCH              7, 15, %4+12*%5
    SUMSUB_BA            w,  1,  2, 7       ; t1,  t2

    ; from 1 stage back
    SUMSUB_BA            w,  6,  1, 7       ; t1,  t6
    SUMSUB_BA            w,  5,  2, 7       ; t2,  t5
%endif
    SUMSUB_BA            w,  4,  3, 7       ; t3,  t4

%if ARCH_X86_64
    SWAP                 0, 8
    SWAP                 1, 9
    SWAP                 2, 10
    SWAP                 3, 11
    SWAP                 4, 12
    SWAP                 5, 13
    SWAP                 6, 14

    SUMSUB_BA            w,  0, 15, 7       ; t0, t15
    SUMSUB_BA            w,  1, 14, 7       ; t1, t14
    SUMSUB_BA            w,  2, 13, 7       ; t2, t13
    SUMSUB_BA            w,  3, 12, 7       ; t3, t12
    SUMSUB_BA            w,  4, 11, 7       ; t4, t11
    SUMSUB_BA            w,  5, 10, 7       ; t5, t10
%else
    SWAP                 1, 6
    SWAP                 2, 5
    SWAP                 3, 4
    mova        [%4+14*%5], m6

%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride
    mova                m6, [%4+%2*%5]
    SUMSUB_BA            w,  6, %1, 7
    SWAP                %1, 6
    mova        [%4+%3*%5], m6
%endmacro

    %%SUMSUB_BA_STORE    0,  1,  1, %4, %5  ; t0, t15
    %%SUMSUB_BA_STORE    1,  3,  3, %4, %5  ; t1, t14
    %%SUMSUB_BA_STORE    2,  5,  5, %4, %5  ; t2, t13
    %%SUMSUB_BA_STORE    3,  7,  7, %4, %5  ; t3, t12
    %%SUMSUB_BA_STORE    4,  9,  9, %4, %5  ; t4, t11
    %%SUMSUB_BA_STORE    5, 11, 11, %4, %5  ; t5, t10
%endif
%endmacro
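
; VP9_IDCT16_1D_START computes the odd half (t8-t15) first and then the even
; 8-point half, with %2 (nnzc) selecting cheaper single-multiply forms when
; the high-frequency inputs are known to be zero (%2 <= 4 or <= 8).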

%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
%if %2 == 1
    VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4

%if ARCH_X86_64
    ; backup a different register
    mova                m7, [tmpq+15*16]
    mova      [tmpq+ 1*16], m15

    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
    SUMSUB_BA            w,  7,  8, 15      ; t7, t8

    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 15
    mova        [tmpq+  0], m0
    mova        [tmpq+ 32], m1
    mova        [tmpq+ 64], m2
    mova        [tmpq+ 96], m3
    mova        [tmpq+128], m4
    mova        [tmpq+160], m5
    mova        [tmpq+192], m6
    mova        [tmpq+224], m7

    mova               m15, [tmpq+ 1*16]
    TRANSPOSE8x8W        8, 9, 10, 11, 12, 13, 14, 15, 0
    mova        [tmpq+ 16], m8
    mova        [tmpq+ 48], m9
    mova        [tmpq+ 80], m10
    mova        [tmpq+112], m11
    mova        [tmpq+144], m12
    mova        [tmpq+176], m13
    mova        [tmpq+208], m14
    mova        [tmpq+240], m15
%else
    mova                m6, [tmpq+13*16]
    mova                m7, [tmpq+14*16]
    SUMSUB_BA            w, 6, 7                ; t6, t9
    mova      [tmpq+14*16], m6
    mova      [tmpq+13*16], m7
    mova                m7, [tmpq+15*16]
    mova                m6, [tmpq+12*16]
    SUMSUB_BA            w, 7, 6                ; t7, t8
    mova      [tmpq+15*16], m6

    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1
    mova     [tmpq+ 0*16], m0
    mova     [tmpq+ 2*16], m1
    mova     [tmpq+ 4*16], m2
    mova     [tmpq+ 6*16], m3
    mova     [tmpq+10*16], m5
    mova     [tmpq+12*16], m6
    mova     [tmpq+14*16], m7

    mova                m0, [tmpq+15*16]
    mova                m1, [tmpq+13*16]
    mova                m2, [tmpq+11*16]
    mova                m3, [tmpq+ 9*16]
    mova                m4, [tmpq+ 7*16]
    mova                m5, [tmpq+ 5*16]
    mova                m7, [tmpq+ 1*16]
    TRANSPOSE8x8W       0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1
    mova     [tmpq+ 1*16], m0
    mova     [tmpq+ 3*16], m1
    mova     [tmpq+ 5*16], m2
    mova     [tmpq+ 7*16], m3
    mova     [tmpq+11*16], m5
    mova     [tmpq+13*16], m6
    mova     [tmpq+15*16], m7
%endif
%else ; %2 == 2
    VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4

%if cpuflag(ssse3)
%define ROUND_REG [pw_512]
%else
%define ROUND_REG [pw_32]
%endif

    pxor                m7, m7
%if ARCH_X86_64
    ; backup more registers
    mova        [%1+ 2*32], m8
    mova        [%1+ 3*32], m9

    VP9_IDCT8_WRITEx2    0,  1, 8, 9, 7, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    2,  3, 8, 9, 7, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    4,  5, 8, 9, 7, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]

    ; restore from cache
    SWAP                 0, 7               ; move zero from m7 to m0
    mova                m7, [%1+15*32]
    mova                m8, [%1+ 2*32]
    mova                m9, [%1+ 3*32]

    SUMSUB_BA            w,  6,  9, 3       ; t6, t9
    SUMSUB_BA            w,  7,  8, 3       ; t7, t8

    VP9_IDCT8_WRITEx2    6,  7, 3, 4, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    8,  9, 3, 4, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2   10, 11, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2   12, 13, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2   14, 15, 1, 2, 0, ROUND_REG, 6
%else
    mova      [tmpq+ 0*32], m5

    VP9_IDCT8_WRITEx2    0,  1, 5, 6, 7, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    2,  3, 5, 6, 7, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]

    SWAP                 0, 7               ; move zero from m7 to m0
    mova                m5, [tmpq+ 0*32]

    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]

    mova                m4, [tmpq+13*32]
    mova                m7, [tmpq+14*32]
    mova                m5, [tmpq+15*32]
    mova                m6, [tmpq+12*32]
    SUMSUB_BADC w, 4, 7, 5, 6, 1

    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]

    mova                m4, [tmpq+11*32]
    mova                m5, [tmpq+ 9*32]
    mova                m6, [tmpq+ 7*32]
    mova                m7, [tmpq+ 5*32]

    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2    6,  7, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]

    mova                m4, [tmpq+ 3*32]
    mova                m5, [tmpq+ 1*32]

    VP9_IDCT8_WRITEx2    4,  5, 1, 2, 0, ROUND_REG, 6
    lea               dstq, [dstq+strideq*2]
%endif

%undef ROUND_REG
%endif ; %2 == 1/2
%endmacro
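
; Pass 1 (%2 == 1) operates on the input block and writes the transposed
; result to the tmp buffer in two 8x8 halves; pass 2 (%2 == 2) reads that
; buffer, applies the second 1D transform and writes straight to dst with the
; final (x+32)>>6 rounding (pw_512 via the pmulhrsw identity).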

%macro VP9_STORE_2XFULL 6-7 strideq ; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
    mova               m%3, [dstq]
    mova               m%5, [dstq+%7]
    punpcklbw          m%2, m%3, m%6
    punpckhbw          m%3, m%6
    punpcklbw          m%4, m%5, m%6
    punpckhbw          m%5, m%6
    paddw              m%2, m%1
    paddw              m%3, m%1
    paddw              m%4, m%1
    paddw              m%5, m%1
    packuswb           m%2, m%3
    packuswb           m%4, m%5
    mova            [dstq], m%2
    mova         [dstq+%7], m%4
%endmacro
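
; Full-width variant of VP9_STORE_2X for the DC-only path: adds the splatted
; DC value (%1) to two complete 16-pixel destination rows, with unsigned
; saturation on the repack.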
   1346 
   1347 %macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
   1348 INIT_XMM %1
   1349 cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
   1350 %if cpuflag(ssse3)
   1351    ; 2x2=eob=3, 4x4=eob=10
   1352    cmp eobd, 38
   1353    jg .idctfull
   1354    cmp eobd, 1 ; faster path for when only DC is set
   1355    jne .idct8x8
   1356 %else
   1357    cmp eobd, 1 ; faster path for when only DC is set
   1358    jg .idctfull
   1359 %endif
   1360 
   1361    ; dc-only
   1362 %if cpuflag(ssse3)
   1363    movd                m0, [blockq]
   1364    mova                m1, [pw_11585x2]
   1365    pmulhrsw            m0, m1
   1366    pmulhrsw            m0, m1
   1367 %else
   1368    DEFINE_ARGS dst, stride, block, coef
   1369    movsx            coefd, word [blockq]
   1370    imul             coefd, 11585
   1371    add              coefd, 8192
   1372    sar              coefd, 14
   1373    imul             coefd, 11585
   1374    add              coefd, (32 << 14) + 8192
   1375    sar              coefd, 14 + 6
   1376    movd                m0, coefd
   1377 %endif
   1378    SPLATW              m0, m0, q0000
   1379 %if cpuflag(ssse3)
   1380    pmulhrsw            m0, [pw_512]
   1381 %endif
   1382    pxor                m5, m5
   1383    movd          [blockq], m5
   1384 %rep 7
   1385    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
   1386    lea               dstq, [dstq+2*strideq]
   1387 %endrep
   1388    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
   1389    RET
   1390 
   1391    DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
   1392 %if cpuflag(ssse3)
   1393 .idct8x8:
   1394    mov               tmpq, rsp
   1395    VP9_IDCT16_1D   blockq, 1, 8, 0
   1396 
   1397    mov               cntd, 2
   1398    mov           dst_bakq, dstq
   1399 .loop2_8x8:
   1400    VP9_IDCT16_1D     tmpq, 2, 8, 0
   1401    lea               dstq, [dst_bakq+8]
   1402    add               tmpq, 16
   1403    dec               cntd
   1404    jg .loop2_8x8
   1405 
   1406    ; at the end of the loop, m0 should still be zero
   1407    ; use that to zero out block coefficients
   1408    ZERO_BLOCK      blockq, 32, 8, m0
   1409    RET
   1410 %endif
   1411 
   1412 .idctfull:
   1413    mov               cntd, 2
   1414    mov               tmpq, rsp
   1415 .loop1_full:
   1416    VP9_IDCT16_1D   blockq, 1, 16, 0
   1417    add             blockq, 16
   1418    add               tmpq, 256
   1419    dec               cntd
   1420    jg .loop1_full
   1421    sub             blockq, 32
   1422 
   1423    mov               cntd, 2
   1424    mov               tmpq, rsp
   1425    mov           dst_bakq, dstq
   1426 .loop2_full:
   1427    VP9_IDCT16_1D     tmpq, 2, 16, 0
   1428    lea               dstq, [dst_bakq+8]
   1429    add               tmpq, 16
   1430    dec               cntd
   1431    jg .loop2_full
   1432 
   1433    ; at the end of the loop, m0 should still be zero
   1434    ; use that to zero out block coefficients
   1435    ZERO_BLOCK      blockq, 32, 16, m0
   1436    RET
   1437 %endmacro
   1438 
   1439 VP9_IDCT_IDCT_16x16_ADD_XMM sse2
   1440 VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
   1441 VP9_IDCT_IDCT_16x16_ADD_XMM avx
   1442 
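        ; 1D 16-point idct with all rows resident in ymm registers; rows 0
        ; and 4 start out parked in [blockq+0]/[blockq+128], and two results
        ; are spilled to [blockq+128]/[blockq+192] as scratch. Each
        ; VP9_UNPACK_MULSUB_2W_4X is a butterfly rotation by a Q14 (cos,sin)
        ; constant pair, computed at 32-bit precision with (x+8192)>>14
        ; rounding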
   1443 %macro VP9_IDCT16_YMM_1D 0
   1444    VP9_UNPACK_MULSUB_2W_4X  1,  15, 16305,  1606, [pd_8192], 0, 4 ; t8,  t15
   1445    VP9_UNPACK_MULSUB_2W_4X  9,   7, 10394, 12665, [pd_8192], 0, 4 ; t9,  t14
   1446 
   1447    SUMSUB_BA            w,  9,   1, 0      ; t8,  t9
   1448    SUMSUB_BA            w,  7,  15, 0      ; t15, t14
   1449 
   1450    VP9_UNPACK_MULSUB_2W_4X 15,   1, 15137,  6270, [pd_8192], 0, 4 ; t9,  t14
   1451 
   1452    VP9_UNPACK_MULSUB_2W_4X  5,  11, 14449,  7723, [pd_8192], 0, 4 ; t10, t13
   1453    VP9_UNPACK_MULSUB_2W_4X 13,   3,  4756, 15679, [pd_8192], 0, 4 ; t11, t12
   1454 
   1455    SUMSUB_BA            w,  5,  13, 0      ; t11, t10
   1456    SUMSUB_BA            w, 11,   3, 0      ; t12, t13
   1457 
   1458    VP9_UNPACK_MULSUB_2W_4X  3,  13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13
   1459 
   1460    SUMSUB_BA            w,  5,   9, 0      ; t8,  t11
   1461    SUMSUB_BA            w,  3,  15, 0      ; t9,  t10
   1462    SUMSUB_BA            w, 11,   7, 0      ; t15, t12
   1463    SUMSUB_BA            w, 13,   1, 0      ; t14, t13
   1464 
   1465    SUMSUB_BA            w, 15,   1, 0
   1466    SUMSUB_BA            w,  9,   7, 0
   1467    pmulhrsw            m1, [pw_11585x2]    ; t10
   1468    pmulhrsw            m7, [pw_11585x2]    ; t11
   1469    pmulhrsw            m9, [pw_11585x2]    ; t12
   1470    pmulhrsw           m15, [pw_11585x2]    ; t13
   1471 
   1472    ; even (tx8x8)
   1473    mova                m4, [blockq+128]
   1474    mova      [blockq+128], m5
   1475    VP9_UNPACK_MULSUB_2W_4X   4,  12, 15137,  6270, [pd_8192], 0, 5 ; t2,  t3
   1476    VP9_UNPACK_MULSUB_2W_4X   2,  14, 16069,  3196, [pd_8192], 0, 5 ; t4,  t7
   1477    VP9_UNPACK_MULSUB_2W_4X  10,   6,  9102, 13623, [pd_8192], 0, 5 ; t5,  t6
   1478    mova                m0, [blockq+  0]
   1479    SUMSUB_BA            w,   8,   0, 5
   1480    pmulhrsw            m8, [pw_11585x2]    ; t0
   1481    pmulhrsw            m0, [pw_11585x2]    ; t1
   1482 
   1483    SUMSUB_BA            w,  10,   2, 5     ; t4,  t5
   1484    SUMSUB_BA            w,   6,  14, 5     ; t7,  t6
   1485    SUMSUB_BA            w,  12,   8, 5     ; t0,  t3
   1486    SUMSUB_BA            w,   4,   0, 5     ; t1,  t2
   1487 
   1488    SUMSUB_BA            w,   2,  14, 5
   1489    pmulhrsw           m14, [pw_11585x2]    ; t5
   1490    pmulhrsw            m2, [pw_11585x2]    ; t6
   1491 
   1492    SUMSUB_BA            w,   6,  12, 5     ; t0,  t7
   1493    SUMSUB_BA            w,   2,   4, 5     ; t1,  t6
   1494    SUMSUB_BA            w,  14,   0, 5     ; t2,  t5
   1495    SUMSUB_BA            w,  10,   8, 5     ; t3,  t4
   1496 
   1497    ; final stage
   1498    SUMSUB_BA            w, 11,  6,  5      ; out0, out15
   1499    SUMSUB_BA            w, 13,  2,  5      ; out1, out14
   1500    SUMSUB_BA            w, 15, 14,  5      ; out2, out13
   1501    SUMSUB_BA            w,  9, 10,  5      ; out3, out12
   1502    SUMSUB_BA            w,  7,  8,  5      ; out4, out11
   1503    SUMSUB_BA            w,  1,  0,  5      ; out5, out10
   1504    SUMSUB_BA            w,  3,  4,  5      ; out6, out9
   1505    mova                m5, [blockq+128]
   1506    mova      [blockq+192], m3
   1507    SUMSUB_BA            w,  5, 12,  3      ; out7, out8
   1508 
   1509    SWAP  0, 11,  8, 12, 10
   1510    SWAP  1, 13, 14,  2, 15,  6,  3,  9,  4,  7,  5
   1511 %endmacro
   1512 
   1513 ; this is almost identical to VP9_STORE_2X, but it handles two pairs
   1514 ; of rows (four in total) per call for slightly improved interleaving,
   1515 ; and it omits vpermq since the input is DC, so all values are identical
   1516 %macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
   1517    mova              xm%2, [dstq]
   1518    mova              xm%4, [dstq+strideq*2]
   1519    vinserti128        m%2, m%2, [dstq+strideq], 1
   1520    vinserti128        m%4, m%4, [dstq+stride3q], 1
   1521    punpckhbw          m%3, m%2, m%6
   1522    punpcklbw          m%2, m%6
   1523    punpckhbw          m%5, m%4, m%6
   1524    punpcklbw          m%4, m%6
   1525    paddw              m%3, m%1
   1526    paddw              m%2, m%1
   1527    paddw              m%5, m%1
   1528    paddw              m%4, m%1
   1529    packuswb           m%2, m%3
   1530    packuswb           m%4, m%5
   1531    mova            [dstq], xm%2
   1532    mova        [dstq+strideq*2], xm%4
   1533    vextracti128  [dstq+strideq], m%2, 1
   1534    vextracti128 [dstq+stride3q], m%4, 1
   1535 %endmacro
   1536 
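        ; with avx2 the whole 16x16 coefficient block fits in m0-m15, so
        ; both 1D passes run back to back around a single in-register
        ; TRANSPOSE16x16W; x86-64 only, since m8-m15 must exist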
   1537 %if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
   1538 INIT_YMM avx2
   1539 cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob
   1540    cmp eobd, 1 ; faster path for when only DC is set
   1541    jg .idctfull
   1542 
   1543    ; dc-only
   1544    mova                m1, [pw_11585x2]
   1545    vpbroadcastw        m0, [blockq]
   1546    pmulhrsw            m0, m1
   1547    pmulhrsw            m0, m1
   1548    pxor                m5, m5
   1549    pmulhrsw            m0, [pw_512]
   1550    movd          [blockq], xm5
   1551 
   1552    DEFINE_ARGS dst, stride, stride3, cnt
   1553    mov               cntd, 4
   1554    lea           stride3q, [strideq*3]
   1555 .loop_dc:
   1556    VP9_STORE_YMM_DC_4X  0, 1, 2, 3, 4, 5
   1557    lea               dstq, [dstq+4*strideq]
   1558    dec               cntd
   1559    jg .loop_dc
   1560    RET
   1561 
   1562    DEFINE_ARGS dst, stride, block, eob
   1563 .idctfull:
   1564    mova                m1, [blockq+ 32]
   1565    mova                m2, [blockq+ 64]
   1566    mova                m3, [blockq+ 96]
   1567    mova                m5, [blockq+160]
   1568    mova                m6, [blockq+192]
   1569    mova                m7, [blockq+224]
   1570    mova                m8, [blockq+256]
   1571    mova                m9, [blockq+288]
   1572    mova               m10, [blockq+320]
   1573    mova               m11, [blockq+352]
   1574    mova               m12, [blockq+384]
   1575    mova               m13, [blockq+416]
   1576    mova               m14, [blockq+448]
   1577    mova               m15, [blockq+480]
   1578 
   1579    VP9_IDCT16_YMM_1D
   1580    TRANSPOSE16x16W      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
   1581                         [blockq+192], [blockq+128], 1
   1582    mova      [blockq+  0], m0
   1583    VP9_IDCT16_YMM_1D
   1584 
   1585    mova      [blockq+224], m7
   1586 
   1587    ; store
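           ; each VP9_IDCT8_WRITEx2 rounds two result rows (via pmulhrsw
           ; with pw_512, i.e. (x+32)>>6) and adds them to two rows of dst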
   1588    VP9_IDCT8_WRITEx2    0,  1, 6, 7, unused, [pw_512], 6
   1589    lea               dstq, [dstq+2*strideq]
   1590    VP9_IDCT8_WRITEx2    2,  3, 6, 7, unused, [pw_512], 6
   1591    lea               dstq, [dstq+2*strideq]
   1592    VP9_IDCT8_WRITEx2    4,  5, 6, 7, unused, [pw_512], 6
   1593    lea               dstq, [dstq+2*strideq]
   1594    mova                m6, [blockq+192]
   1595    mova                m7, [blockq+224]
   1596    VP9_IDCT8_WRITEx2    6,  7, 1, 2, unused, [pw_512], 6
   1597    lea               dstq, [dstq+2*strideq]
   1598    VP9_IDCT8_WRITEx2    8,  9, 1, 2, unused, [pw_512], 6
   1599    lea               dstq, [dstq+2*strideq]
   1600    VP9_IDCT8_WRITEx2   10, 11, 1, 2, unused, [pw_512], 6
   1601    lea               dstq, [dstq+2*strideq]
   1602    VP9_IDCT8_WRITEx2   12, 13, 1, 2, unused, [pw_512], 6
   1603    lea               dstq, [dstq+2*strideq]
   1604    VP9_IDCT8_WRITEx2   14, 15, 1, 2, unused, [pw_512], 6
   1605    lea               dstq, [dstq+2*strideq]
   1606 
   1607    ; m0 is not guaranteed to be zero at this point, so clear a
   1608    ; register explicitly to zero out the block coefficients
   1609    pxor                m0, m0
   1610    ZERO_BLOCK      blockq, 32, 16, m0
   1611    RET
   1612 %endif
   1613 
   1614 ;---------------------------------------------------------------------------------------------
   1615 ; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
   1616 ;---------------------------------------------------------------------------------------------
   1617 
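        ; 16-point iadst built from four rounds of Q14 rotations and
        ; sumsubs; the SCRATCH/UNSCRATCH pairs hide the register count, so
        ; the same code uses m8-m15 on x86-64 and the named tmpq stack
        ; slots on x86-32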
   1618 %macro VP9_IADST16_1D 2 ; src, pass
   1619 %assign %%str 16*%2
   1620    mova                m0, [%1+ 0*32]  ; in0
   1621    mova                m1, [%1+15*32]  ; in15
   1622    mova                m2, [%1+ 7*32]  ; in7
   1623    mova                m3, [%1+ 8*32]  ; in8
   1624 
   1625    VP9_UNPACK_MULSUB_2D_4X  1,  0,  4,  5, 16364,   804    ; m1/4=t1[d], m0/5=t0[d]
   1626    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  6, 11003, 12140    ; m2/7=t9[d], m3/6=t8[d]
   1627    SCRATCH              4, 8, tmpq+ 0*%%str
   1628    VP9_RND_SH_SUMSUB_BA     3,  0,  6,  5,  4, [pd_8192]   ; m3=t0[w], m0=t8[w]
   1629    UNSCRATCH            4, 8, tmpq+ 0*%%str
   1630    VP9_RND_SH_SUMSUB_BA     2,  1,  7,  4,  5, [pd_8192]   ; m2=t1[w], m1=t9[w]
   1631 
   1632    SCRATCH              0, 10, tmpq+ 0*%%str
   1633    SCRATCH              1, 11, tmpq+15*%%str
   1634    mova   [tmpq+ 7*%%str], m2
   1635    mova   [tmpq+ 8*%%str], m3
   1636 
   1637    mova                m1, [%1+ 2*32]  ; in2
   1638    mova                m0, [%1+13*32]  ; in13
   1639    mova                m3, [%1+ 5*32]  ; in5
   1640    mova                m2, [%1+10*32]  ; in10
   1641 
   1642    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 15893,  3981    ; m0/6=t3[d], m1/7=t2[d]
   1643    VP9_UNPACK_MULSUB_2D_4X  3,  2,  4,  5,  8423, 14053    ; m3/4=t11[d], m2/5=t10[d]
   1644    SCRATCH              4, 12, tmpq+ 2*%%str
   1645    VP9_RND_SH_SUMSUB_BA     2,  1,  5,  7,  4, [pd_8192]   ; m2=t2[w], m1=t10[w]
   1646    UNSCRATCH            4, 12, tmpq+ 2*%%str
   1647    VP9_RND_SH_SUMSUB_BA     3,  0,  4,  6,  5, [pd_8192]   ; m3=t3[w], m0=t11[w]
   1648 
   1649    SCRATCH              0, 12, tmpq+ 2*%%str
   1650    SCRATCH              1, 13, tmpq+13*%%str
   1651    mova   [tmpq+ 5*%%str], m2
   1652    mova   [tmpq+10*%%str], m3
   1653 
   1654    mova                m2, [%1+ 4*32]  ; in4
   1655    mova                m3, [%1+11*32]  ; in11
   1656    mova                m0, [%1+ 3*32]  ; in3
   1657    mova                m1, [%1+12*32]  ; in12
   1658 
   1659    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 14811,  7005    ; m3/7=t5[d], m2/6=t4[d]
   1660    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  5520, 15426    ; m0/4=t13[d], m1/5=t12[d]
   1661    SCRATCH              4, 9, tmpq+ 4*%%str
   1662    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t4[w], m2=t12[w]
   1663    UNSCRATCH            4, 9, tmpq+ 4*%%str
   1664    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t5[w], m3=t13[w]
   1665 
   1666    SCRATCH              0,  8, tmpq+ 4*%%str
   1667    mova   [tmpq+11*%%str], m1          ; t4:m1->r11
   1668    UNSCRATCH            0, 10, tmpq+ 0*%%str
   1669    UNSCRATCH            1, 11, tmpq+15*%%str
   1670 
   1671    ; round 2 interleaved part 1
   1672    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 16069,  3196    ; m1/7=t8[d], m0/6=t9[d]
   1673    VP9_UNPACK_MULSUB_2D_4X  3,  2,  5,  4,  3196, 16069    ; m3/5=t12[d], m2/4=t13[d]
   1674    SCRATCH              4, 9, tmpq+ 3*%%str
   1675    VP9_RND_SH_SUMSUB_BA     3,  1,  5,  7,  4, [pd_8192]   ; m3=t8[w], m1=t12[w]
   1676    UNSCRATCH            4, 9, tmpq+ 3*%%str
   1677    VP9_RND_SH_SUMSUB_BA     2,  0,  4,  6,  5, [pd_8192]   ; m2=t9[w], m0=t13[w]
   1678 
   1679    SCRATCH              0, 10, tmpq+ 0*%%str
   1680    SCRATCH              1, 11, tmpq+15*%%str
   1681    SCRATCH              2, 14, tmpq+ 3*%%str
   1682    SCRATCH              3, 15, tmpq+12*%%str
   1683 
   1684    mova                m2, [%1+ 6*32]  ; in6
   1685    mova                m3, [%1+ 9*32]  ; in9
   1686    mova                m0, [%1+ 1*32]  ; in1
   1687    mova                m1, [%1+14*32]  ; in14
   1688 
   1689    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 13160,  9760    ; m3/7=t7[d], m2/6=t6[d]
   1690    VP9_UNPACK_MULSUB_2D_4X  0,  1,  4,  5,  2404, 16207    ; m0/4=t15[d], m1/5=t14[d]
   1691    SCRATCH              4, 9, tmpq+ 6*%%str
   1692    VP9_RND_SH_SUMSUB_BA     1,  2,  5,  6,  4, [pd_8192]   ; m1=t6[w], m2=t14[w]
   1693    UNSCRATCH            4, 9, tmpq+ 6*%%str
   1694    VP9_RND_SH_SUMSUB_BA     0,  3,  4,  7,  6, [pd_8192]   ; m0=t7[w], m3=t15[w]
   1695 
   1696    ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7
   1697    ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15
   1698 
   1699    UNSCRATCH            4, 12, tmpq+ 2*%%str
   1700    UNSCRATCH            5, 13, tmpq+13*%%str
   1701    SCRATCH              0, 12, tmpq+ 1*%%str
   1702    SCRATCH              1, 13, tmpq+14*%%str
   1703 
   1704    ; remainder of round 2 (rest of t8-15)
   1705    VP9_UNPACK_MULSUB_2D_4X  5,  4,  6,  7,  9102, 13623    ; m5/6=t11[d], m4/7=t10[d]
   1706    VP9_UNPACK_MULSUB_2D_4X  3,  2,  1,  0, 13623,  9102    ; m3/1=t14[d], m2/0=t15[d]
   1707    SCRATCH              0, 9, tmpq+ 6*%%str
   1708    VP9_RND_SH_SUMSUB_BA     3,  4,  1,  7,  0, [pd_8192]   ; m3=t10[w], m4=t14[w]
   1709    UNSCRATCH            0, 9, tmpq+ 6*%%str
   1710    VP9_RND_SH_SUMSUB_BA     2,  5,  0,  6,  1, [pd_8192]   ; m2=t11[w], m5=t15[w]
   1711 
   1712    ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15
   1713 
   1714    UNSCRATCH            6, 14, tmpq+ 3*%%str
   1715    UNSCRATCH            7, 15, tmpq+12*%%str
   1716 
   1717    SUMSUB_BA                w,  3,  7,  1
   1718    PSIGNW                  m3, [pw_m1]                     ; m3=out1[w], m7=t10[w]
   1719    SUMSUB_BA                w,  2,  6,  1                  ; m2=out14[w], m6=t11[w]
   1720 
   1721    ; unfortunately, the code below overflows in some cases, e.g.
   1722    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm
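           ; (the sumsub+pmulhrsw shortcut needs both the sum and the
           ; difference to stay within int16, which the inputs here cannot
           ; guarantee; the VP9_UNPACK_MULSUB_2W_4X form instead multiplies
           ; by 11585/16384 at 32-bit precision, hence the "%if 0")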
   1723 %if 0; cpuflag(ssse3)
   1724    SUMSUB_BA                w,  7,  6,  1
   1725    pmulhrsw                m7, [pw_11585x2]                ; m7=out6[w]
   1726    pmulhrsw                m6, [pw_11585x2]                ; m6=out9[w]
   1727 %else
   1728    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, 11585, [pd_8192], 1, 0
   1729 %endif
   1730 
   1731    mova       [tmpq+ 3*%%str], m6
   1732    mova       [tmpq+ 6*%%str], m7
   1733    UNSCRATCH                6, 10, tmpq+ 0*%%str
   1734    UNSCRATCH                7, 11, tmpq+15*%%str
   1735    mova       [tmpq+13*%%str], m2
   1736    SCRATCH                  3, 11, tmpq+ 9*%%str
   1737 
   1738    VP9_UNPACK_MULSUB_2D_4X  7,  6,  2,  3, 15137,  6270    ; m6/3=t13[d], m7/2=t12[d]
   1739    VP9_UNPACK_MULSUB_2D_4X  5,  4,  1,  0,  6270, 15137    ; m5/1=t14[d], m4/0=t15[d]
   1740    SCRATCH              0, 9, tmpq+ 2*%%str
   1741    VP9_RND_SH_SUMSUB_BA     5,  6,  1,  3,  0, [pd_8192]   ; m5=out2[w], m6=t14[w]
   1742    UNSCRATCH            0, 9, tmpq+ 2*%%str
   1743    VP9_RND_SH_SUMSUB_BA     4,  7,  0,  2,  1, [pd_8192]
   1744    PSIGNW                  m4, [pw_m1]                     ; m4=out13[w], m7=t15[w]
   1745 
   1746    ; unfortunately, the code below overflows in some cases
   1747 %if 0; cpuflag(ssse3)
   1748    SUMSUB_BA                w,  7,  6,  1
   1749    pmulhrsw                m7, [pw_m11585x2]               ; m7=out5[w]
   1750    pmulhrsw                m6, [pw_11585x2]                ; m6=out10[w]
   1751 %else
   1752    PSIGNW                  m7, [pw_m1]
   1753    VP9_UNPACK_MULSUB_2W_4X  7,  6, 11585, 11585, [pd_8192], 1, 0
   1754 %endif
   1755 
   1756    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14
   1757 
   1758    mova                    m2, [tmpq+ 8*%%str]
   1759    mova                    m3, [tmpq+ 7*%%str]
   1760    mova                    m1, [tmpq+11*%%str]
   1761    mova       [tmpq+ 7*%%str], m6
   1762    mova       [tmpq+11*%%str], m4
   1763    mova                    m4, [tmpq+ 5*%%str]
   1764    SCRATCH                  5, 14, tmpq+ 5*%%str
   1765    SCRATCH                  7, 15, tmpq+ 8*%%str
   1766    UNSCRATCH                6,  8, tmpq+ 4*%%str
   1767    UNSCRATCH                5, 12, tmpq+ 1*%%str
   1768    UNSCRATCH                7, 13, tmpq+14*%%str
   1769 
   1770    ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7
   1771    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
   1772 
   1773    SUMSUB_BA                w,  1,  2, 0                   ; m1=t0[w], m2=t4[w]
   1774    mova                    m0, [tmpq+10*%%str]
   1775    SCRATCH                  1, 12, tmpq+ 1*%%str
   1776    SUMSUB_BA                w,  6,  3, 1                   ; m8=t1[w], m3=t5[w]
   1777    SCRATCH                  6, 13, tmpq+ 4*%%str
   1778    SUMSUB_BA                w,  7,  4, 1                   ; m13=t2[w], m9=t6[w]
   1779    SCRATCH                  7,  8, tmpq+10*%%str
   1780    SUMSUB_BA                w,  5,  0, 1                   ; m12=t3[w], m0=t7[w]
   1781    SCRATCH                  5,  9, tmpq+14*%%str
   1782 
   1783    VP9_UNPACK_MULSUB_2D_4X  2,  3,  7,  5, 15137,  6270    ; m2/6=t5[d], m3/10=t4[d]
   1784    VP9_UNPACK_MULSUB_2D_4X  0,  4,  1,  6,  6270, 15137    ; m0/14=t6[d], m9/15=t7[d]
   1785    SCRATCH                  6, 10, tmpq+ 0*%%str
   1786    VP9_RND_SH_SUMSUB_BA     0,  3,  1,  5,  6, [pd_8192]
   1787    UNSCRATCH                6, 10, tmpq+ 0*%%str
   1788    PSIGNW                  m0, [pw_m1]                     ; m0=out3[w], m3=t6[w]
   1789    VP9_RND_SH_SUMSUB_BA     4,  2,  6,  7,  5, [pd_8192]   ; m9=out12[w], m2=t7[w]
   1790 
   1791    UNSCRATCH                1,  8, tmpq+10*%%str
   1792    UNSCRATCH                5,  9, tmpq+14*%%str
   1793    UNSCRATCH                6, 12, tmpq+ 1*%%str
   1794    UNSCRATCH                7, 13, tmpq+ 4*%%str
   1795    SCRATCH                  4,  9, tmpq+14*%%str
   1796 
   1797    SUMSUB_BA                w,  1,  6,  4                  ; m13=out0[w], m1=t2[w]
   1798    SUMSUB_BA                w,  5,  7,  4
   1799    PSIGNW                  m5, [pw_m1]                     ; m12=out15[w], m8=t3[w]
   1800 
   1801    ; unfortunately, the code below overflows in some cases, e.g.
   1802    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
   1803 %if 0 ; cpuflag(ssse3)
   1804    SUMSUB_BA               w,   7,  6,  4
   1805    pmulhrsw                m7, [pw_m11585x2]               ; m8=out7[w]
   1806    pmulhrsw                m6, [pw_11585x2]                ; m1=out8[w]
   1807    SWAP                     6,  7
   1808    SUMSUB_BA                w,  3,  2,  4
   1809    pmulhrsw                m3, [pw_11585x2]                ; m3=out4[w]
   1810    pmulhrsw                m2, [pw_11585x2]                ; m2=out11[w]
   1811 %else
   1812    SCRATCH                  5,  8, tmpq+10*%%str
   1813    VP9_UNPACK_MULSUB_2W_4X  6,  7, 11585, m11585, [pd_8192],  5,  4
   1814    VP9_UNPACK_MULSUB_2W_4X  2,  3, 11585, 11585, [pd_8192],  5,  4
   1815    UNSCRATCH                5,  8, tmpq+10*%%str
   1816 %endif
   1817 
   1818    ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15
   1819    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14
   1820 
   1821 %if %2 == 1
   1822 %if ARCH_X86_64
   1823    mova                   m13, [tmpq+ 6*%%str]
   1824    TRANSPOSE8x8W            1, 11, 14, 0, 3, 15, 13, 6, 10
   1825    mova          [tmpq+ 0*16], m1
   1826    mova          [tmpq+ 2*16], m11
   1827    mova          [tmpq+ 4*16], m14
   1828    mova          [tmpq+ 6*16], m0
   1829    mova                    m1, [tmpq+ 3*%%str]
   1830    mova                   m11, [tmpq+ 7*%%str]
   1831    mova                   m14, [tmpq+11*%%str]
   1832    mova                    m0, [tmpq+13*%%str]
   1833    mova          [tmpq+ 8*16], m3
   1834    mova          [tmpq+10*16], m15
   1835    mova          [tmpq+12*16], m13
   1836    mova          [tmpq+14*16], m6
   1837 
   1838    TRANSPOSE8x8W            7, 1, 11, 2, 9, 14, 0, 5, 10
   1839    mova          [tmpq+ 1*16], m7
   1840    mova          [tmpq+ 3*16], m1
   1841    mova          [tmpq+ 5*16], m11
   1842    mova          [tmpq+ 7*16], m2
   1843    mova          [tmpq+ 9*16], m9
   1844    mova          [tmpq+11*16], m14
   1845    mova          [tmpq+13*16], m0
   1846    mova          [tmpq+15*16], m5
   1847 %else
   1848    mova       [tmpq+12*%%str], m2
   1849    mova       [tmpq+ 1*%%str], m5
   1850    mova       [tmpq+15*%%str], m7
   1851    mova                    m2, [tmpq+ 9*%%str]
   1852    mova                    m5, [tmpq+ 5*%%str]
   1853    mova                    m7, [tmpq+ 8*%%str]
   1854    TRANSPOSE8x8W            1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
   1855    mova          [tmpq+ 0*16], m1
   1856    mova          [tmpq+ 2*16], m2
   1857    mova          [tmpq+ 4*16], m5
   1858    mova          [tmpq+ 6*16], m0
   1859    mova          [tmpq+10*16], m7
   1860    mova                    m3, [tmpq+12*%%str]
   1861    mova          [tmpq+12*16], m4
   1862    mova                    m4, [tmpq+14*%%str]
   1863    mova          [tmpq+14*16], m6
   1864 
   1865    mova                    m0, [tmpq+15*%%str]
   1866    mova                    m1, [tmpq+ 3*%%str]
   1867    mova                    m2, [tmpq+ 7*%%str]
   1868    mova                    m5, [tmpq+11*%%str]
   1869    mova                    m7, [tmpq+ 1*%%str]
   1870    TRANSPOSE8x8W            0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1
   1871    mova          [tmpq+ 1*16], m0
   1872    mova          [tmpq+ 3*16], m1
   1873    mova          [tmpq+ 5*16], m2
   1874    mova          [tmpq+ 7*16], m3
   1875    mova          [tmpq+11*16], m5
   1876    mova          [tmpq+13*16], m6
   1877    mova          [tmpq+15*16], m7
   1878 %endif
   1879 %else
   1880    pxor                    m4, m4
   1881 
   1882 %if cpuflag(ssse3)
   1883 %define ROUND_REG [pw_512]
   1884 %else
   1885 %define ROUND_REG [pw_32]
   1886 %endif
   1887 
   1888 %if ARCH_X86_64
   1889    mova                   m12, [tmpq+ 6*%%str]
   1890    VP9_IDCT8_WRITEx2        1, 11, 10,  8,  4, ROUND_REG, 6
   1891    lea                   dstq, [dstq+strideq*2]
   1892    VP9_IDCT8_WRITEx2       14,  0, 10,  8,  4, ROUND_REG, 6
   1893    lea                   dstq, [dstq+strideq*2]
   1894    VP9_IDCT8_WRITEx2        3, 15, 10,  8,  4, ROUND_REG, 6
   1895    lea                   dstq, [dstq+strideq*2]
   1896    VP9_IDCT8_WRITEx2       12,  6, 10,  8,  4, ROUND_REG, 6
   1897    lea                   dstq, [dstq+strideq*2]
   1898 
   1899    mova                    m1, [tmpq+ 3*%%str]
   1900    mova                   m11, [tmpq+ 7*%%str]
   1901    mova                   m14, [tmpq+11*%%str]
   1902    mova                    m0, [tmpq+13*%%str]
   1903 
   1904    VP9_IDCT8_WRITEx2        7,  1, 10,  8,  4, ROUND_REG, 6
   1905    lea                   dstq, [dstq+strideq*2]
   1906    VP9_IDCT8_WRITEx2       11,  2, 10,  8,  4, ROUND_REG, 6
   1907    lea                   dstq, [dstq+strideq*2]
   1908    VP9_IDCT8_WRITEx2        9, 14, 10,  8,  4, ROUND_REG, 6
   1909    lea                   dstq, [dstq+strideq*2]
   1910    VP9_IDCT8_WRITEx2        0,  5, 10,  8,  4, ROUND_REG, 6
   1911 %else
   1912    mova       [tmpq+ 0*%%str], m2
   1913    mova       [tmpq+ 1*%%str], m5
   1914    mova       [tmpq+ 2*%%str], m7
   1915    mova                    m2, [tmpq+ 9*%%str]
   1916    VP9_IDCT8_WRITEx2        1,  2,  5,  7,  4, ROUND_REG, 6
   1917    lea                   dstq, [dstq+strideq*2]
   1918    mova                    m5, [tmpq+ 5*%%str]
   1919    VP9_IDCT8_WRITEx2        5,  0,  1,  2,  4, ROUND_REG, 6
   1920    lea                   dstq, [dstq+strideq*2]
   1921    mova                    m5, [tmpq+ 8*%%str]
   1922    VP9_IDCT8_WRITEx2        3,  5,  1,  2,  4, ROUND_REG, 6
   1923    lea                   dstq, [dstq+strideq*2]
   1924    mova                    m5, [tmpq+ 6*%%str]
   1925    VP9_IDCT8_WRITEx2        5,  6,  1,  2,  4, ROUND_REG, 6
   1926    lea                   dstq, [dstq+strideq*2]
   1927 
   1928    mova                    m0, [tmpq+ 2*%%str]
   1929    mova                    m3, [tmpq+ 3*%%str]
   1930    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
   1931    lea                   dstq, [dstq+strideq*2]
   1932    mova                    m0, [tmpq+ 7*%%str]
   1933    mova                    m3, [tmpq+ 0*%%str]
   1934    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
   1935    lea                   dstq, [dstq+strideq*2]
   1936    mova                    m0, [tmpq+14*%%str]
   1937    mova                    m3, [tmpq+11*%%str]
   1938    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
   1939    lea                   dstq, [dstq+strideq*2]
   1940    mova                    m0, [tmpq+13*%%str]
   1941    mova                    m3, [tmpq+ 1*%%str]
   1942    VP9_IDCT8_WRITEx2        0,  3,  1,  2,  4, ROUND_REG, 6
   1943 %endif
   1944 
   1945    SWAP                     0,  4 ; zero
   1946 %undef ROUND_REG
   1947 %endif
   1948 %endmacro
   1949 
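        ; instantiate the idct/iadst 16x16 combinations: %1/%3 name the
        ; pass-1 and pass-2 transforms, %2/%4 select the matching 1D
        ; macros, and %5 is the instruction-set suffix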
   1950 %macro IADST16_FN 5
   1951 INIT_XMM %5
   1952 cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
   1953    mov               cntd, 2
   1954    mov               tmpq, rsp
   1955 .loop1_full:
   1956    VP9_%2_1D       blockq, 1
   1957    add             blockq, 16
   1958    add               tmpq, 256
   1959    dec               cntd
   1960    jg .loop1_full
   1961    sub             blockq, 32
   1962 
   1963    mov               cntd, 2
   1964    mov               tmpq, rsp
   1965    mov           dst_bakq, dstq
   1966 .loop2_full:
   1967    VP9_%4_1D         tmpq, 2
   1968    lea               dstq, [dst_bakq+8]
   1969    add               tmpq, 16
   1970    dec               cntd
   1971    jg .loop2_full
   1972 
   1973    ; at the end of the loop, m0 should still be zero
   1974    ; use that to zero out block coefficients
   1975    ZERO_BLOCK      blockq, 32, 16, m0
   1976    RET
   1977 %endmacro
   1978 
   1979 IADST16_FN idct,  IDCT16,  iadst, IADST16, sse2
   1980 IADST16_FN iadst, IADST16, idct,  IDCT16,  sse2
   1981 IADST16_FN iadst, IADST16, iadst, IADST16, sse2
   1982 IADST16_FN idct,  IDCT16,  iadst, IADST16, ssse3
   1983 IADST16_FN iadst, IADST16, idct,  IDCT16,  ssse3
   1984 IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
   1985 IADST16_FN idct,  IDCT16,  iadst, IADST16, avx
   1986 IADST16_FN iadst, IADST16, idct,  IDCT16,  avx
   1987 IADST16_FN iadst, IADST16, iadst, IADST16, avx
   1988 
   1989 ; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
   1990 ; out: m[0-15] except m6, which is in [blockq+192]
   1991 ; uses blockq as scratch space
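        ; same algorithm as VP9_IADST16_1D above, restructured so the whole
        ; working set stays in ymm registers, with a few blockq slots
        ; standing in for the stack scratch area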
   1992 %macro VP9_IADST16_YMM_1D 0
   1993    mova          [blockq+ 32], m3
   1994    mova          [blockq+ 64], m7
   1995    mova          [blockq+ 96], m8
   1996 
   1997    ; first half of round 1
   1998    VP9_UNPACK_MULSUB_2D_4X  9,  6,  0,  3, 13160,  9760    ; m9/x=t7[d], m6/x=t6[d]
   1999    VP9_UNPACK_MULSUB_2D_4X  1, 14,  4,  7,  2404, 16207    ; m1/x=t15[d], m14/x=t14[d]
   2000    VP9_RND_SH_SUMSUB_BA    14,  6,  7,  3,  8, [pd_8192]   ; m14=t6[w], m6=t14[w]
   2001    VP9_RND_SH_SUMSUB_BA     1,  9,  4,  0,  8, [pd_8192]   ; m1=t7[w], m9=t15[w]
   2002 
   2003    VP9_UNPACK_MULSUB_2D_4X 13,  2,  4,  7, 15893,  3981    ; m13/x=t3[d], m2/x=t2[d]
   2004    VP9_UNPACK_MULSUB_2D_4X  5, 10,  0,  3,  8423, 14053    ; m5/x=t11[d], m10/x=t10[d]
   2005    VP9_RND_SH_SUMSUB_BA    10,  2,  3,  7,  8, [pd_8192]   ; m10=t2[w], m2=t10[w]
   2006    VP9_RND_SH_SUMSUB_BA     5, 13,  0,  4,  8, [pd_8192]   ; m5=t3[w], m13=t11[w]
   2007 
   2008    ; first half of round 2 t8-15
   2009    VP9_UNPACK_MULSUB_2D_4X  2, 13,  4,  7,  9102, 13623    ; m2/x=t11[d], m13/x=t10[d]
   2010    VP9_UNPACK_MULSUB_2D_4X  9,  6,  3,  0, 13623,  9102    ; m9/x=t14[d], m6/x=t15[d]
   2011    VP9_RND_SH_SUMSUB_BA     9, 13,  3,  7,  8, [pd_8192]   ; m9=t10[w], m13=t14[w]
   2012    VP9_RND_SH_SUMSUB_BA     6,  2,  0,  4,  8, [pd_8192]   ; m6=t11[w], m2=t15[w]
   2013 
   2014    SUMSUB_BA            w, 14, 10,  8                      ; m14=t2, m10=t6
   2015    SUMSUB_BA            w,  1,  5,  8                      ; m1=t3, m5=t7
   2016 
   2017    mova                    m0, [blockq+  0]
   2018    mova                    m4, [blockq+128]
   2019    mova                    m3, [blockq+ 32]
   2020    mova                    m7, [blockq+ 64]
   2021    mova                    m8, [blockq+ 96]
   2022    mova          [blockq+  0], m1
   2023    mova          [blockq+128], m14
   2024    mova          [blockq+ 32], m6
   2025    mova          [blockq+ 64], m9
   2026    mova          [blockq+ 96], m10
   2027 
   2028    ; second half of round 1
   2029    VP9_UNPACK_MULSUB_2D_4X 15,  0,  1,  9, 16364,   804    ; m15/x=t1[d], m0/x=t0[d]
   2030    VP9_UNPACK_MULSUB_2D_4X  7,  8, 10,  6, 11003, 12140    ; m7/x=t9[d], m8/x=t8[d]
   2031    VP9_RND_SH_SUMSUB_BA     8,  0,  6,  9, 14, [pd_8192]   ; m8=t0[w], m0=t8[w]
   2032    VP9_RND_SH_SUMSUB_BA     7, 15, 10,  1, 14, [pd_8192]   ; m7=t1[w], m15=t9[w]
   2033 
   2034    VP9_UNPACK_MULSUB_2D_4X 11,  4, 10,  6, 14811,  7005    ; m11/x=t5[d], m4/x=t4[d]
   2035    VP9_UNPACK_MULSUB_2D_4X  3, 12,  1,  9,  5520, 15426    ; m3/x=t13[d], m12/x=t12[d]
   2036    VP9_RND_SH_SUMSUB_BA    12,  4,  9,  6, 14, [pd_8192]   ; m12=t4[w], m4=t12[w]
   2037    VP9_RND_SH_SUMSUB_BA     3, 11,  1, 10, 14, [pd_8192]   ; m3=t5[w], m11=t13[w]
   2038 
   2039    ; second half of round 2 t8-15
   2040    VP9_UNPACK_MULSUB_2D_4X  0, 15,  6, 10, 16069,  3196    ; m15/x=t8[d], m0/x=t9[d]
   2041    VP9_UNPACK_MULSUB_2D_4X 11,  4,  9,  1,  3196, 16069    ; m11/x=t12[d], m4/x=t13[d]
   2042    VP9_RND_SH_SUMSUB_BA    11, 15,  9, 10, 14, [pd_8192]   ; m11=t8[w], m15=t12[w]
   2043    VP9_RND_SH_SUMSUB_BA     4,  0,  1,  6, 14, [pd_8192]   ; m4=t9[w], m0=t13[w]
   2044 
   2045    SUMSUB_BA            w, 12,  8, 14                      ; m12=t0, m8=t4
   2046    SUMSUB_BA            w,  3,  7, 14                      ; m3=t1, m7=t5
   2047 
   2048    mova                   m10, [blockq+ 96]
   2049    mova          [blockq+ 96], m12
   2050 
   2051    ; round 3
   2052    VP9_UNPACK_MULSUB_2D_4X 15,  0,  9, 12, 15137,  6270    ; m15/x=t13[d], m0/x=t12[d]
   2053    VP9_UNPACK_MULSUB_2D_4X  2, 13,  1,  6,  6270, 15137    ; m2/x=t14[d], m13/x=t15[d]
   2054    VP9_RND_SH_SUMSUB_BA     2,  0,  1, 12, 14, [pd_8192]   ; m2=out2[w], m0=t14a[w]
   2055    VP9_RND_SH_SUMSUB_BA    13, 15,  6,  9, 14, [pd_8192]
   2056    PSIGNW                 m13, [pw_m1]                     ; m13=out13[w], m15=t15a[w]
   2057 
   2058    VP9_UNPACK_MULSUB_2D_4X  8,  7, 12,  9, 15137,  6270    ; m8/x=t5[d], m7/x=t4[d]
   2059    VP9_UNPACK_MULSUB_2D_4X  5, 10,  1,  6,  6270, 15137    ; m5/x=t6[d], m10/x=t7[d]
   2060    VP9_RND_SH_SUMSUB_BA     5,  7,  1,  9, 14, [pd_8192]
   2061    PSIGNW                  m5, [pw_m1]                     ; m5=out3[w], m7=t6[w]
   2062    VP9_RND_SH_SUMSUB_BA    10,  8,  6, 12, 14, [pd_8192]   ; m10=out12[w], m8=t7[w]
   2063 
   2064    mova                    m1, [blockq+  0]
   2065    mova                   m14, [blockq+128]
   2066    mova                    m6, [blockq+ 32]
   2067    mova                    m9, [blockq+ 64]
   2068    mova                   m12, [blockq+ 96]
   2069    mova          [blockq+  0], m10
   2070    mova          [blockq+128], m5
   2071 
   2072    SUMSUB_BA            w, 14, 12,  5                      ; m14=out0, m12=t2a
   2073    SUMSUB_BA            w,  1,  3,  5
   2074    PSIGNW                  m1, [pw_m1]                     ; m1=out15, m3=t3a
   2075 
   2076    SUMSUB_BA            w,  9, 11,  5
   2077    PSIGNW                  m9, [pw_m1]                     ; m9=out1, m11=t10
   2078    SUMSUB_BA            w,  6,  4,  5                      ; m6=out14, m4=t11
   2079 
   2080    VP9_UNPACK_MULSUB_2W_4X  4, 11, 11585, 11585, [pd_8192],  5, 10 ; m4=out9, m11=out6
   2081    mova                    m5, [blockq+128]
   2082    mova          [blockq+192], m11
   2083    PSIGNW                 m15, [pw_m1]
   2084    VP9_UNPACK_MULSUB_2W_4X 15,  0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10
   2085 
   2086    PSIGNW                  m3, [pw_m1]
   2087    VP9_UNPACK_MULSUB_2W_4X  3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7, m12=out8
   2088    VP9_UNPACK_MULSUB_2W_4X  8,  7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11, m7=out4
   2089 
   2090    mova                   m10, [blockq+  0]
   2091 
   2092    SWAP                     0, 14,  6, 11,  8, 12, 10
   2093    SWAP                     1,  9, 15,  4,  7,  3,  5
   2094    SWAP                     5,  9, 15
   2095 %endmacro
   2096 
   2097 %if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
   2098 %macro IADST16_YMM_FN 4
   2099 INIT_YMM avx2
   2100 cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob
   2101    mova                m1, [blockq+ 32]
   2102    mova                m2, [blockq+ 64]
   2103    mova                m3, [blockq+ 96]
   2104    mova                m5, [blockq+160]
   2105    mova                m6, [blockq+192]
   2106    mova                m7, [blockq+224]
   2107    mova                m8, [blockq+256]
   2108    mova                m9, [blockq+288]
   2109    mova               m10, [blockq+320]
   2110    mova               m11, [blockq+352]
   2111    mova               m12, [blockq+384]
   2112    mova               m13, [blockq+416]
   2113    mova               m14, [blockq+448]
   2114    mova               m15, [blockq+480]
   2115 
   2116    VP9_%2_YMM_1D
   2117    TRANSPOSE16x16W      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
   2118                         [blockq+192], [blockq+128], 1
   2119    mova      [blockq+  0], m0
   2120    VP9_%4_YMM_1D
   2121 
   2122    mova      [blockq+224], m7
   2123 
   2124    ; store
   2125    VP9_IDCT8_WRITEx2    0,  1, 6, 7, unused, [pw_512], 6
   2126    lea               dstq, [dstq+2*strideq]
   2127    VP9_IDCT8_WRITEx2    2,  3, 6, 7, unused, [pw_512], 6
   2128    lea               dstq, [dstq+2*strideq]
   2129    VP9_IDCT8_WRITEx2    4,  5, 6, 7, unused, [pw_512], 6
   2130    lea               dstq, [dstq+2*strideq]
   2131    mova                m6, [blockq+192]
   2132    mova                m7, [blockq+224]
   2133    VP9_IDCT8_WRITEx2    6,  7, 1, 2, unused, [pw_512], 6
   2134    lea               dstq, [dstq+2*strideq]
   2135    VP9_IDCT8_WRITEx2    8,  9, 1, 2, unused, [pw_512], 6
   2136    lea               dstq, [dstq+2*strideq]
   2137    VP9_IDCT8_WRITEx2   10, 11, 1, 2, unused, [pw_512], 6
   2138    lea               dstq, [dstq+2*strideq]
   2139    VP9_IDCT8_WRITEx2   12, 13, 1, 2, unused, [pw_512], 6
   2140    lea               dstq, [dstq+2*strideq]
   2141    VP9_IDCT8_WRITEx2   14, 15, 1, 2, unused, [pw_512], 6
   2142    lea               dstq, [dstq+2*strideq]
   2143 
   2144    ; m0 is not guaranteed to be zero at this point, so clear a
   2145    ; register explicitly to zero out the block coefficients
   2146    pxor                m0, m0
   2147    ZERO_BLOCK      blockq, 32, 16, m0
   2148    RET
   2149 %endmacro
   2150 
   2151 IADST16_YMM_FN idct,  IDCT16,  iadst, IADST16
   2152 IADST16_YMM_FN iadst, IADST16, idct,  IDCT16
   2153 IADST16_YMM_FN iadst, IADST16, iadst, IADST16
   2154 %endif
   2155 
   2156 ;---------------------------------------------------------------------------------------------
   2157 ; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
   2158 ;---------------------------------------------------------------------------------------------
   2159 
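        ; 32-point 1D idct: the even half is a full 16-point idct over the
        ; even coefficients (VP9_IDCT16_1D_START); t16-31 are derived from
        ; the odd ones. %3 (nnzc) prunes the work: when only the top-left
        ; 8x8 or 16x16 subblock can be nonzero, each initial rotation
        ; collapses to two pmulhrsw scalings because one of its two inputs
        ; is known to be zero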
   2160 %macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
   2161 %if %2 == 1
   2162 %assign %%str mmsize
   2163 %else
   2164 %assign %%str 64
   2165 %endif
   2166 
   2167    ; first do t0-15; this can be done identically to the idct16x16 case
   2168    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1
   2169 
   2170    ; store everything on stack to make space available for t16-31
   2171    ; we store interleaved with the output of the second half (t16-31)
   2172    ; so we don't need to allocate extra stack space
   2173    mova    [tmpq+ 0*%%str], m0     ; t0
   2174    mova    [tmpq+ 4*%%str], m1     ; t1
   2175    mova    [tmpq+ 8*%%str], m2     ; t2
   2176    mova    [tmpq+12*%%str], m3     ; t3
   2177    mova    [tmpq+16*%%str], m4     ; t4
   2178    mova    [tmpq+20*%%str], m5     ; t5
   2179 %if ARCH_X86_64
   2180    mova    [tmpq+22*%%str], m10    ; t10
   2181    mova    [tmpq+18*%%str], m11    ; t11
   2182    mova    [tmpq+14*%%str], m12    ; t12
   2183    mova    [tmpq+10*%%str], m13    ; t13
   2184    mova    [tmpq+ 6*%%str], m14    ; t14
   2185    mova    [tmpq+ 2*%%str], m15    ; t15
   2186 %endif
   2187 
   2188    mova                m0, [tmpq+ 30*%%str]
   2189    UNSCRATCH            1,  6, tmpq+26*%%str
   2190    UNSCRATCH            2,  8, tmpq+24*%%str
   2191    UNSCRATCH            3,  9, tmpq+28*%%str
   2192    SUMSUB_BA            w,  1,  3, 4       ; t6, t9
   2193    SUMSUB_BA            w,  0,  2, 4       ; t7, t8
   2194 
   2195    mova    [tmpq+24*%%str], m1     ; t6
   2196    mova    [tmpq+28*%%str], m0     ; t7
   2197    mova    [tmpq+30*%%str], m2     ; t8
   2198    mova    [tmpq+26*%%str], m3     ; t9
   2199 
   2200    ; then do t16-31
   2201 %if %3 <= 8
   2202    mova                 m4, [%1+ 1*64]
   2203    mova                 m7, [%1+ 7*64]
   2204 
   2205    pmulhrsw             m1,  m4, [pw_16364x2] ;t31
   2206    pmulhrsw             m4, [pw_804x2] ;t16
   2207 
   2208    VP9_UNPACK_MULSUB_2W_4X   5,  0,  1,  4, 16069,  3196, [pd_8192], 6,  2 ; t17, t30
   2209 
   2210    pmulhrsw             m3,  m7, [pw_m5520x2] ;t19
   2211    pmulhrsw             m7, [pw_15426x2] ;t28
   2212 
   2213    SCRATCH               4, 13, tmpq+ 1*%%str
   2214    SCRATCH               5, 12, tmpq+15*%%str
   2215 
   2216    VP9_UNPACK_MULSUB_2W_4X   2,  6,  7,  3, 3196, m16069, [pd_8192], 4,  5 ; t18, t29
   2217 %else
   2218    mova                 m0, [%1+ 1*64]
   2219    mova                 m1, [%1+15*64]
   2220 %if %3 <= 16
   2221    pmulhrsw             m5, m0, [pw_16364x2]
   2222    pmulhrsw             m0, [pw_804x2]
   2223    pmulhrsw             m4, m1, [pw_m11003x2]
   2224    pmulhrsw             m1, [pw_12140x2]
   2225 %else
   2226    mova                 m4, [%1+17*64]
   2227    mova                 m5, [%1+31*64]
   2228 
   2229    VP9_UNPACK_MULSUB_2W_4X   0,  5, 16364,   804, [pd_8192], 2, 3 ; t16, t31
   2230    VP9_UNPACK_MULSUB_2W_4X   4,  1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
   2231 %endif
   2232    SUMSUB_BA             w,  4,  0,  2
   2233    SUMSUB_BA             w,  1,  5,  2
   2234 
   2235    VP9_UNPACK_MULSUB_2W_4X   5,  0, 16069,  3196, [pd_8192], 2, 3 ; t17, t30
   2236 
   2237    SCRATCH               4, 13, tmpq+ 1*%%str
   2238    SCRATCH               5, 12, tmpq+15*%%str
   2239 
   2240    mova                 m2, [%1+ 7*64]
   2241    mova                 m3, [%1+ 9*64]
   2242 %if %3 <= 16
   2243    pmulhrsw             m7,  m3, [pw_14811x2]
   2244    pmulhrsw             m3, [pw_7005x2]
   2245    pmulhrsw             m6,  m2, [pw_m5520x2]
   2246    pmulhrsw             m2, [pw_15426x2]
   2247 %else
   2248    mova                 m7, [%1+23*64]
   2249    mova                 m6, [%1+25*64]
   2250 
   2251    VP9_UNPACK_MULSUB_2W_4X   3,  7, 14811,  7005, [pd_8192], 4, 5 ; t18, t29
   2252    VP9_UNPACK_MULSUB_2W_4X   6,  2,  5520, 15426, [pd_8192], 4, 5 ; t19, t28
   2253 %endif
   2254    SUMSUB_BA             w,  3,  6,  4
   2255    SUMSUB_BA             w,  7,  2,  4
   2256 
   2257    VP9_UNPACK_MULSUB_2W_4X   2,  6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
   2258 %endif
   2259 
   2260    UNSCRATCH             5, 12, tmpq+15*%%str
   2261    SUMSUB_BA             w,  6,  0,  4
   2262    mova    [tmpq+25*%%str], m6             ; t19
   2263    UNSCRATCH             4, 13, tmpq+ 1*%%str
   2264    SUMSUB_BA             w,  7,  1,  6
   2265    SUMSUB_BA             w,  3,  4,  6
   2266    mova    [tmpq+23*%%str], m3             ; t16
   2267    SUMSUB_BA             w,  2,  5,  6
   2268 
   2269    VP9_UNPACK_MULSUB_2W_4X   0,  5, 15137,  6270, [pd_8192], 6, 3 ; t18, t29
   2270    VP9_UNPACK_MULSUB_2W_4X   1,  4, 15137,  6270, [pd_8192], 6, 3 ; t19, t28
   2271 
   2272    SCRATCH               0, 10, tmpq+ 1*%%str
   2273    SCRATCH               1, 11, tmpq+ 7*%%str
   2274    SCRATCH               2,  9, tmpq+ 9*%%str
   2275    SCRATCH               4, 14, tmpq+15*%%str
   2276    SCRATCH               5, 15, tmpq+17*%%str
   2277    SCRATCH               7, 13, tmpq+31*%%str
   2278 
   2279 %if %3 <= 8
   2280    mova                 m0, [%1+ 5*64]
   2281    mova                 m3, [%1+ 3*64]
   2282 
   2283    pmulhrsw             m5,  m0, [pw_15893x2] ;t27
   2284    pmulhrsw             m0, [pw_3981x2] ;t20
   2285 
   2286    VP9_UNPACK_MULSUB_2W_4X   1,  4,  5,  0,  9102, 13623, [pd_8192], 7,  2 ; t21, t26
   2287 
   2288    pmulhrsw             m6,  m3, [pw_m2404x2] ;t23
   2289    pmulhrsw             m3, [pw_16207x2] ;t24
   2290 
   2291    SCRATCH               5,  8, tmpq+ 5*%%str
   2292    SCRATCH               4, 12, tmpq+11*%%str
   2293 
   2294    VP9_UNPACK_MULSUB_2W_4X   7,  2,  3,  6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
   2295 %else
   2296    mova                 m4, [%1+ 5*64]
   2297    mova                 m5, [%1+11*64]
   2298 %if %3 <= 16
   2299    pmulhrsw             m1, m4, [pw_15893x2]
   2300    pmulhrsw             m4, [pw_3981x2]
   2301    pmulhrsw             m0, m5, [pw_m8423x2]
   2302    pmulhrsw             m5, [pw_14053x2]
   2303 %else
   2304    mova                 m0, [%1+21*64]
   2305    mova                 m1, [%1+27*64]
   2306 
   2307    VP9_UNPACK_MULSUB_2W_4X   4,  1, 15893,  3981, [pd_8192], 2, 3 ; t20, t27
   2308    VP9_UNPACK_MULSUB_2W_4X   0,  5,  8423, 14053, [pd_8192], 2, 3 ; t21, t26
   2309 %endif
   2310    SUMSUB_BA             w,  0,  4,  2
   2311    SUMSUB_BA             w,  5,  1,  2
   2312 
   2313    VP9_UNPACK_MULSUB_2W_4X   1,  4,  9102, 13623, [pd_8192], 2, 3 ; t21, t26
   2314 
   2315    SCRATCH               5,  8, tmpq+ 5*%%str
   2316    SCRATCH               4, 12, tmpq+11*%%str
   2317 
   2318    mova                 m7, [%1+ 3*64]
   2319    mova                 m6, [%1+13*64]
   2320 %if %3 <= 16
   2321    pmulhrsw             m3, m6, [pw_13160x2]
   2322    pmulhrsw             m6, [pw_9760x2]
   2323    pmulhrsw             m2, m7, [pw_m2404x2]
   2324    pmulhrsw             m7, [pw_16207x2]
   2325 %else
   2326    mova                 m2, [%1+29*64]
   2327    mova                 m3, [%1+19*64]
   2328    VP9_UNPACK_MULSUB_2W_4X   6,  3, 13160,  9760, [pd_8192], 4, 5 ; t22, t25
   2329    VP9_UNPACK_MULSUB_2W_4X   2,  7,  2404, 16207, [pd_8192], 4, 5 ; t23, t24
   2330 %endif
   2331    SUMSUB_BA             w,  6,  2,  4
   2332    SUMSUB_BA             w,  3,  7,  4
   2333 
   2334    VP9_UNPACK_MULSUB_2W_4X   7,  2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
   2335 %endif
   2336 
   2337    ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
   2338    ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
   2339 
   2340    UNSCRATCH             4, 12, tmpq+11*%%str
   2341    SUMSUB_BA             w,  0,  6, 5
   2342    SUMSUB_BA             w,  4,  2, 5
   2343    UNSCRATCH             5,  8, tmpq+ 5*%%str
   2344    SCRATCH               4,  8, tmpq+11*%%str
   2345    SUMSUB_BA             w,  1,  7, 4
   2346    SUMSUB_BA             w,  5,  3, 4
   2347    SCRATCH               5, 12, tmpq+ 5*%%str
   2348 
   2349    VP9_UNPACK_MULSUB_2W_4X   3,  6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27
   2350    VP9_UNPACK_MULSUB_2W_4X   2,  7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26
   2351 
   2352    ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
   2353    ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
   2354 
   2355    UNSCRATCH             5,  9, tmpq+ 9*%%str
   2356    mova                 m4, [tmpq+23*%%str] ; t16
   2357 %if ARCH_X86_64
   2358    SUMSUB_BA             w,  1,  5,  9
   2359    SUMSUB_BA             w,  0,  4,  9
   2360 %else
   2361    SUMSUB_BADC           w,  1,  5,  0,  4
   2362 %endif
   2363    mova    [tmpq+29*%%str], m1     ; t17
   2364    mova    [tmpq+21*%%str], m0     ; t16
   2365    UNSCRATCH             0, 10, tmpq+ 1*%%str
   2366    UNSCRATCH             1, 11, tmpq+ 7*%%str
   2367 %if ARCH_X86_64
   2368    SUMSUB_BA             w,  2,  0,  9
   2369    SUMSUB_BA             w,  3,  1,  9
   2370 %else
   2371    SUMSUB_BADC           w,  2,  0,  3,  1
   2372 %endif
   2373    mova    [tmpq+ 9*%%str], m2     ; t18
   2374    mova    [tmpq+13*%%str], m3     ; t19
   2375    SCRATCH               0, 10, tmpq+23*%%str
   2376    SCRATCH               1, 11, tmpq+27*%%str
   2377 
   2378    UNSCRATCH             2, 14, tmpq+15*%%str
   2379    UNSCRATCH             3, 15, tmpq+17*%%str
   2380    SUMSUB_BA             w,  6,  2, 0
   2381    SUMSUB_BA             w,  7,  3, 0
   2382    SCRATCH               6, 14, tmpq+ 3*%%str
   2383    SCRATCH               7, 15, tmpq+ 7*%%str
   2384 
   2385    UNSCRATCH             0,  8, tmpq+11*%%str
   2386    mova                 m1, [tmpq+25*%%str] ; t19
   2387    UNSCRATCH             6, 12, tmpq+ 5*%%str
   2388    UNSCRATCH             7, 13, tmpq+31*%%str
   2389 %if ARCH_X86_64
   2390    SUMSUB_BA             w,  0,  1,  9
   2391    SUMSUB_BA             w,  6,  7,  9
   2392 %else
   2393    SUMSUB_BADC           w,  0,  1,  6,  7
   2394 %endif
   2395 
   2396    ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
   2397    ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
   2398 
   2399 %if 0; cpuflag(ssse3)
   2400 %if ARCH_X86_64
   2401    SUMSUB_BA             w,  4,  7,  8
   2402    SUMSUB_BA             w,  5,  1,  8
   2403 %else
   2404    SUMSUB_BADC           w,  4,  7,  5,  1
   2405 %endif
   2406 
   2407    pmulhrsw             m7, [pw_11585x2]
   2408    pmulhrsw             m4, [pw_11585x2]
   2409    pmulhrsw             m1, [pw_11585x2]
   2410    pmulhrsw             m5, [pw_11585x2]
   2411 
   2412    mova    [tmpq+ 5*%%str], m7     ; t23
   2413    SCRATCH               1, 13, tmpq+25*%%str
   2414    UNSCRATCH             7, 10, tmpq+23*%%str
   2415    UNSCRATCH             1, 11, tmpq+27*%%str
   2416 
   2417 %if ARCH_X86_64
   2418    SUMSUB_BA             w,  7,  3, 10
   2419    SUMSUB_BA             w,  1,  2, 10
   2420 %else
   2421    SUMSUB_BADC           w,  7,  3,  1,  2
   2422 %endif
   2423 
   2424    pmulhrsw             m3, [pw_11585x2]
   2425    pmulhrsw             m7, [pw_11585x2]
   2426    pmulhrsw             m2, [pw_11585x2]
   2427    pmulhrsw             m1, [pw_11585x2]
   2428 %else
   2429    SCRATCH               0,  8, tmpq+15*%%str
   2430    SCRATCH               6,  9, tmpq+17*%%str
   2431    VP9_UNPACK_MULSUB_2W_4X  7,  4, 11585, 11585, [pd_8192], 0, 6
   2432    mova    [tmpq+ 5*%%str], m7     ; t23
   2433    UNSCRATCH             7, 10, tmpq+23*%%str
   2434    VP9_UNPACK_MULSUB_2W_4X  1,  5, 11585, 11585, [pd_8192], 0, 6
   2435    SCRATCH               1, 13, tmpq+25*%%str
   2436    UNSCRATCH             1, 11, tmpq+27*%%str
   2437    VP9_UNPACK_MULSUB_2W_4X  3,  7, 11585, 11585, [pd_8192], 0, 6
   2438    VP9_UNPACK_MULSUB_2W_4X  2,  1, 11585, 11585, [pd_8192], 0, 6
   2439    UNSCRATCH             0,  8, tmpq+15*%%str
   2440    UNSCRATCH             6,  9, tmpq+17*%%str
   2441 %endif
   2442 
   2443    ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
   2444    ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
   2445 
   2446    ; then do final pass to sumsub+store the two halves
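           ; pass 1 (%2 == 1) sumsubs the two halves and stores the
           ; transposed results back to tmpq for the second pass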
   2447 %if %2 == 1
   2448    mova    [tmpq+17*%%str], m2     ; t20
   2449    mova    [tmpq+ 1*%%str], m3     ; t21
   2450 %if ARCH_X86_64
   2451    mova    [tmpq+25*%%str], m13    ; t22
   2452 
   2453    mova                 m8, [tmpq+ 0*%%str] ; t0
   2454    mova                 m9, [tmpq+ 4*%%str] ; t1
   2455    mova                m12, [tmpq+ 8*%%str] ; t2
   2456    mova                m11, [tmpq+12*%%str] ; t3
   2457    mova                 m2, [tmpq+16*%%str] ; t4
   2458    mova                 m3, [tmpq+20*%%str] ; t5
   2459    mova                m13, [tmpq+24*%%str] ; t6
   2460 
   2461    SUMSUB_BA             w,  6,  8, 10
   2462    mova    [tmpq+ 3*%%str], m8              ; t15
   2463    SUMSUB_BA             w,  0,  9,  8
   2464    SUMSUB_BA             w, 15, 12,  8
   2465    SUMSUB_BA             w, 14, 11,  8
   2466    SUMSUB_BA             w,  1,  2,  8
   2467    SUMSUB_BA             w,  7,  3,  8
   2468    SUMSUB_BA             w,  5, 13,  8
   2469    mova                m10, [tmpq+28*%%str] ; t7
   2470    SUMSUB_BA             w,  4, 10,  8
   2471 %if cpuflag(avx2)
   2472    ; the shitty thing about this idct is that the final pass does the outermost
   2473    ; interleave sumsubs (t0/31, t1/30, etc) but the tN for the 16x16 need
   2474    ; to be sequential, which means I need to load/store half of the sumsub
   2475    ; intermediates back to/from memory to get a 16x16 transpose going...
   2476    ; This would be easier if we had more (e.g. 32) YMM regs here.
   2477    mova    [tmpq+ 7*%%str], m9
   2478    mova    [tmpq+11*%%str], m12
   2479    mova    [tmpq+15*%%str], m11
   2480    mova    [tmpq+19*%%str], m2
   2481    mova    [tmpq+23*%%str], m3
   2482    mova    [tmpq+27*%%str], m13
   2483    mova    [tmpq+31*%%str], m10
   2484    mova    [tmpq+12*%%str], m5
   2485 
   2486    mova                m13, [tmpq+30*%%str] ; t8
   2487    mova                m12, [tmpq+26*%%str] ; t9
   2488    mova                m11, [tmpq+22*%%str] ; t10
   2489    mova                m10, [tmpq+18*%%str] ; t11
   2490    mova                 m9, [tmpq+17*%%str] ; t20
   2491    mova                 m8, [tmpq+ 1*%%str] ; t21
   2492    mova                 m3, [tmpq+25*%%str] ; t22
   2493    mova                 m2, [tmpq+ 5*%%str] ; t23
   2494 
   2495    SUMSUB_BA             w,  9, 10, 5
   2496    SUMSUB_BA             w,  8, 11, 5
   2497    SUMSUB_BA             w,  3, 12, 5
   2498    SUMSUB_BA             w,  2, 13, 5
   2499    mova    [tmpq+ 1*%%str], m10
   2500    mova    [tmpq+ 5*%%str], m11
   2501    mova    [tmpq+17*%%str], m12
   2502    mova    [tmpq+25*%%str], m13
   2503 
   2504    mova                m13, [tmpq+14*%%str] ; t12
   2505    mova                m12, [tmpq+10*%%str] ; t13
   2506    mova                m11, [tmpq+ 9*%%str] ; t18
   2507    mova                m10, [tmpq+13*%%str] ; t19
   2508 
   2509    SUMSUB_BA             w, 11, 12, 5
   2510    SUMSUB_BA             w, 10, 13, 5
   2511    mova    [tmpq+ 9*%%str], m13
   2512    mova    [tmpq+13*%%str], m12
   2513    mova    [tmpq+10*%%str], m10
   2514    mova    [tmpq+14*%%str], m11
   2515 
   2516    mova                m13, [tmpq+ 6*%%str] ; t14
   2517    mova                m12, [tmpq+ 2*%%str] ; t15
   2518    mova                m11, [tmpq+21*%%str] ; t16
   2519    mova                m10, [tmpq+29*%%str] ; t17
   2520    SUMSUB_BA             w, 11, 12, 5
   2521    SUMSUB_BA             w, 10, 13, 5
   2522    mova    [tmpq+21*%%str], m12
   2523    mova    [tmpq+29*%%str], m13
   2524    mova                m12, [tmpq+10*%%str]
   2525    mova                m13, [tmpq+14*%%str]
   2526 
   2527    TRANSPOSE16x16W       6,  0, 15, 14,  1,  7,  5,  4, \
   2528                          2,  3,  8,  9, 12, 13, 10, 11, \
   2529            [tmpq+12*%%str], [tmpq+ 8*%%str], 1
   2530    mova    [tmpq+ 0*%%str], m6
   2531    mova    [tmpq+ 2*%%str], m0
   2532    mova    [tmpq+ 4*%%str], m15
   2533    mova    [tmpq+ 6*%%str], m14
   2534    mova    [tmpq+10*%%str], m7
   2535    mova    [tmpq+12*%%str], m5
   2536    mova    [tmpq+14*%%str], m4
   2537    mova    [tmpq+16*%%str], m2
   2538    mova    [tmpq+18*%%str], m3
   2539    mova    [tmpq+20*%%str], m8
   2540    mova    [tmpq+22*%%str], m9
   2541    mova    [tmpq+24*%%str], m12
   2542    mova    [tmpq+26*%%str], m13
   2543    mova    [tmpq+28*%%str], m10
   2544    mova    [tmpq+30*%%str], m11
   2545 
   2546    mova                 m0, [tmpq+21*%%str]
   2547    mova                 m1, [tmpq+29*%%str]
   2548    mova                 m2, [tmpq+13*%%str]
   2549    mova                 m3, [tmpq+ 9*%%str]
   2550    mova                 m4, [tmpq+ 1*%%str]
   2551    mova                 m5, [tmpq+ 5*%%str]
   2552    mova                 m7, [tmpq+25*%%str]
   2553    mova                 m8, [tmpq+31*%%str]
   2554    mova                 m9, [tmpq+27*%%str]
   2555    mova                m10, [tmpq+23*%%str]
   2556    mova                m11, [tmpq+19*%%str]
   2557    mova                m12, [tmpq+15*%%str]
   2558    mova                m13, [tmpq+11*%%str]
   2559    mova                m14, [tmpq+ 7*%%str]
   2560    mova                m15, [tmpq+ 3*%%str]
   2561    TRANSPOSE16x16W       0,  1,  2,  3,  4,  5,  6,  7, \
   2562                          8,  9, 10, 11, 12, 13, 14, 15, \
   2563            [tmpq+17*%%str], [tmpq+ 9*%%str], 1
   2564    mova    [tmpq+ 1*%%str], m0
   2565    mova    [tmpq+ 3*%%str], m1
   2566    mova    [tmpq+ 5*%%str], m2
   2567    mova    [tmpq+ 7*%%str], m3
   2568    mova    [tmpq+11*%%str], m5
   2569    mova    [tmpq+13*%%str], m6
   2570    mova    [tmpq+15*%%str], m7
   2571    mova    [tmpq+17*%%str], m8
   2572    mova    [tmpq+19*%%str], m9
   2573    mova    [tmpq+21*%%str], m10
   2574    mova    [tmpq+23*%%str], m11
   2575    mova    [tmpq+25*%%str], m12
   2576    mova    [tmpq+27*%%str], m13
   2577    mova    [tmpq+29*%%str], m14
   2578    mova    [tmpq+31*%%str], m15
%else ; !avx2
    TRANSPOSE8x8W         6, 0, 15, 14, 1, 7, 5, 4, 8
    mova    [tmpq+ 0*%%str], m6
    mova    [tmpq+ 4*%%str], m0
    mova    [tmpq+ 8*%%str], m15
    mova    [tmpq+12*%%str], m14
    mova    [tmpq+16*%%str], m1
    mova    [tmpq+20*%%str], m7
    mova    [tmpq+24*%%str], m5
    mova    [tmpq+28*%%str], m4

    mova                 m8, [tmpq+ 3*%%str] ; t15
    TRANSPOSE8x8W         10, 13, 3, 2, 11, 12, 9, 8, 0
    mova    [tmpq+ 3*%%str], m10
    mova    [tmpq+ 7*%%str], m13
    mova    [tmpq+11*%%str], m3
    mova    [tmpq+15*%%str], m2
    mova    [tmpq+19*%%str], m11
    mova    [tmpq+23*%%str], m12
    mova    [tmpq+27*%%str], m9
    mova    [tmpq+31*%%str], m8

    mova                m15, [tmpq+30*%%str] ; t8
    mova                m14, [tmpq+26*%%str] ; t9
    mova                m13, [tmpq+22*%%str] ; t10
    mova                m12, [tmpq+18*%%str] ; t11
    mova                m11, [tmpq+14*%%str] ; t12
    mova                m10, [tmpq+10*%%str] ; t13
    mova                 m9, [tmpq+ 6*%%str] ; t14
    mova                 m8, [tmpq+ 2*%%str] ; t15
    mova                 m7, [tmpq+21*%%str] ; t16
    mova                 m6, [tmpq+29*%%str] ; t17
    mova                 m5, [tmpq+ 9*%%str] ; t18
    mova                 m4, [tmpq+13*%%str] ; t19
    mova                 m3, [tmpq+17*%%str] ; t20
    mova                 m2, [tmpq+ 1*%%str] ; t21
    mova                 m1, [tmpq+25*%%str] ; t22

    SUMSUB_BA             w,  7,  8, 0
    mova    [tmpq+ 2*%%str], m8
    mova                 m0, [tmpq+ 5*%%str] ; t23
    SUMSUB_BA             w,  6,  9, 8
    SUMSUB_BA             w,  5, 10, 8
    SUMSUB_BA             w,  4, 11, 8
    SUMSUB_BA             w,  3, 12, 8
    SUMSUB_BA             w,  2, 13, 8
    SUMSUB_BA             w,  1, 14, 8
    SUMSUB_BA             w,  0, 15, 8

    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, 8
    mova    [tmpq+ 1*%%str], m0
    mova    [tmpq+ 5*%%str], m1
    mova    [tmpq+ 9*%%str], m2
    mova    [tmpq+13*%%str], m3
    mova    [tmpq+17*%%str], m4
    mova    [tmpq+21*%%str], m5
    mova    [tmpq+25*%%str], m6
    mova    [tmpq+29*%%str], m7

    mova                 m8, [tmpq+ 2*%%str]
    TRANSPOSE8x8W         8, 9, 10, 11, 12, 13, 14, 15, 0
    mova    [tmpq+ 2*%%str], m8
    mova    [tmpq+ 6*%%str], m9
    mova    [tmpq+10*%%str], m10
    mova    [tmpq+14*%%str], m11
    mova    [tmpq+18*%%str], m12
    mova    [tmpq+22*%%str], m13
    mova    [tmpq+26*%%str], m14
    mova    [tmpq+30*%%str], m15
%endif ; avx2
%else
    mova                 m2, [tmpq+24*%%str] ; t6
    mova                 m3, [tmpq+28*%%str] ; t7
    SUMSUB_BADC           w,  5,  2,  4,  3
    mova    [tmpq+24*%%str], m5
    mova    [tmpq+23*%%str], m2
    mova    [tmpq+28*%%str], m4
    mova    [tmpq+19*%%str], m3

    mova                 m2, [tmpq+16*%%str] ; t4
    mova                 m3, [tmpq+20*%%str] ; t5
    SUMSUB_BA             w,  1,  2,  5
    SUMSUB_BA             w,  7,  3,  5
    mova    [tmpq+15*%%str], m2
    mova    [tmpq+11*%%str], m3

    mova                 m2, [tmpq+ 0*%%str] ; t0
    mova                 m3, [tmpq+ 4*%%str] ; t1
    SUMSUB_BA             w,  6,  2,  5
    SUMSUB_BA             w,  0,  3,  5
    mova    [tmpq+31*%%str], m2
    mova    [tmpq+27*%%str], m3

    mova                 m2, [tmpq+ 8*%%str] ; t2
    mova                 m3, [tmpq+12*%%str] ; t3
    mova                 m5, [tmpq+ 7*%%str]
    mova                 m4, [tmpq+ 3*%%str]
    SUMSUB_BADC           w,  5,  2,  4,  3
    mova    [tmpq+ 7*%%str], m2
    mova    [tmpq+ 3*%%str], m3

    mova                 m3, [tmpq+28*%%str]
    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1
    mova    [tmpq+ 0*%%str], m6
    mova    [tmpq+ 4*%%str], m0
    mova    [tmpq+ 8*%%str], m5
    mova    [tmpq+12*%%str], m4
    mova    [tmpq+20*%%str], m7
    mova    [tmpq+24*%%str], m2
    mova    [tmpq+28*%%str], m3

    mova                 m6, [tmpq+19*%%str]
    mova                 m0, [tmpq+23*%%str]
    mova                 m5, [tmpq+11*%%str]
    mova                 m4, [tmpq+15*%%str]
    mova                 m1, [tmpq+ 3*%%str]
    mova                 m7, [tmpq+ 7*%%str]
    mova                 m3, [tmpq+31*%%str]
    TRANSPOSE8x8W         6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1
    mova    [tmpq+ 3*%%str], m6
    mova    [tmpq+ 7*%%str], m0
    mova    [tmpq+11*%%str], m5
    mova    [tmpq+15*%%str], m4
    mova    [tmpq+23*%%str], m7
    mova    [tmpq+27*%%str], m2
    mova    [tmpq+31*%%str], m3

    mova                 m1, [tmpq+ 6*%%str] ; t14
    mova                 m0, [tmpq+ 2*%%str] ; t15
    mova                 m7, [tmpq+21*%%str] ; t16
    mova                 m6, [tmpq+29*%%str] ; t17
    SUMSUB_BA             w,  7,  0,  2
    SUMSUB_BA             w,  6,  1,  2
    mova    [tmpq+29*%%str], m7
    mova    [tmpq+ 2*%%str], m0
    mova    [tmpq+21*%%str], m6
    mova    [tmpq+ 6*%%str], m1

    mova                 m1, [tmpq+14*%%str] ; t12
    mova                 m0, [tmpq+10*%%str] ; t13
    mova                 m5, [tmpq+ 9*%%str] ; t18
    mova                 m4, [tmpq+13*%%str] ; t19
    SUMSUB_BA             w,  5,  0,  2
    SUMSUB_BA             w,  4,  1,  2
    mova    [tmpq+10*%%str], m0
    mova    [tmpq+14*%%str], m1

    mova                 m1, [tmpq+22*%%str] ; t10
    mova                 m0, [tmpq+18*%%str] ; t11
    mova                 m3, [tmpq+17*%%str] ; t20
    mova                 m2, [tmpq+ 1*%%str] ; t21
    SUMSUB_BA             w,  3,  0,  6
    SUMSUB_BA             w,  2,  1,  6
    mova    [tmpq+18*%%str], m0
    mova    [tmpq+22*%%str], m1

    mova                 m7, [tmpq+30*%%str] ; t8
    mova                 m6, [tmpq+26*%%str] ; t9
    mova                 m1, [tmpq+25*%%str] ; t22
    mova                 m0, [tmpq+ 5*%%str] ; t23
    SUMSUB_BADC           w,  1,  6,  0,  7
    mova    [tmpq+26*%%str], m6
    mova    [tmpq+30*%%str], m7

    mova                 m7, [tmpq+29*%%str]
    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1
    mova    [tmpq+ 1*%%str], m0
    mova    [tmpq+ 5*%%str], m1
    mova    [tmpq+ 9*%%str], m2
    mova    [tmpq+13*%%str], m3
    mova    [tmpq+21*%%str], m5
    mova    [tmpq+25*%%str], m6
    mova    [tmpq+29*%%str], m7

    mova                 m0, [tmpq+ 2*%%str]
    mova                 m1, [tmpq+ 6*%%str]
    mova                 m2, [tmpq+10*%%str]
    mova                 m3, [tmpq+14*%%str]
    mova                 m4, [tmpq+18*%%str]
    mova                 m5, [tmpq+22*%%str]
    mova                 m7, [tmpq+30*%%str]
    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1
    mova    [tmpq+ 2*%%str], m0
    mova    [tmpq+ 6*%%str], m1
    mova    [tmpq+10*%%str], m2
    mova    [tmpq+14*%%str], m3
    mova    [tmpq+22*%%str], m5
    mova    [tmpq+26*%%str], m6
    mova    [tmpq+30*%%str], m7
%endif
%else
    ; t0-7 are in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
    ; t8-15 are in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
    ; t16-19 and t23 are in [tmpq+{1,5,9,13,29}*%%str]
    ; t20-22 are in m4-6
    ; t24-31 are in m8-15

%if cpuflag(ssse3)
%define ROUND_REG [pw_512]
%else
%define ROUND_REG [pw_32]
%endif

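    ; %%STORE_2X2 performs the output butterfly of the idct32, out[n] =
    ; t[n] + t[31-n] and out[31-n] = t[n] - t[31-n]: the sums are rounded
    ; (ROUND_REG) and shifted down by 6, then added to two rows walking
    ; down from dstq; the differences go to two mirrored rows walking up
    ; from dst_endq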
%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
    SUMSUB_BA            w, %4, %1, %5
    SUMSUB_BA            w, %3, %2, %5
    VP9_IDCT8_WRITEx2   %4, %3, %5, %6, %7, ROUND_REG, 6
%if %8 == 1
    add               dstq, stride2q
%endif
    VP9_IDCT8_WRITEx2   %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq
%if %8 == 1
    sub           dst_endq, stride2q
%endif
%endmacro

%if ARCH_X86_64
    pxor               m10, m10

    ; store t0-1 and t30-31
    mova                m8, [tmpq+ 0*%%str]
    mova                m9, [tmpq+ 4*%%str]
    %%STORE_2X2          8,  9,  0,  6, 12, 11, 10

    ; store t2-3 and t28-29
    mova                m8, [tmpq+ 8*%%str]
    mova                m9, [tmpq+12*%%str]
    %%STORE_2X2          8,  9, 14, 15, 12, 11, 10

    ; store t4-5 and t26-27
    mova                m8, [tmpq+16*%%str]
    mova                m9, [tmpq+20*%%str]
    %%STORE_2X2          8,  9,  7,  1, 12, 11, 10

    ; store t6-7 and t24-25
    mova                m8, [tmpq+24*%%str]
    mova                m9, [tmpq+28*%%str]
    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10

    ; store t8-9 and t22-23
    mova                m8, [tmpq+30*%%str]
    mova                m9, [tmpq+26*%%str]
    mova                m0, [tmpq+ 5*%%str]
    %%STORE_2X2          8,  9, 13,  0, 12, 11, 10

    ; store t10-11 and t20-21
    mova                m8, [tmpq+22*%%str]
    mova                m9, [tmpq+18*%%str]
    %%STORE_2X2          8,  9,  2,  3, 12, 11, 10

    ; store t12-13 and t18-19
    mova                m8, [tmpq+14*%%str]
    mova                m9, [tmpq+10*%%str]
    mova                m5, [tmpq+13*%%str]
    mova                m4, [tmpq+ 9*%%str]
    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10

    ; store t14-17
    mova                m8, [tmpq+ 6*%%str]
    mova                m9, [tmpq+ 2*%%str]
    mova                m5, [tmpq+29*%%str]
    mova                m4, [tmpq+21*%%str]
    %%STORE_2X2          8,  9,  4,  5, 12, 11, 10, 0

    SWAP                 1, 10 ; zero
%else
    mova   [tmpq+ 1*%%str], m1
    mova   [tmpq+11*%%str], m2
    mova   [tmpq+15*%%str], m3
    mova   [tmpq+17*%%str], m4
    mova   [tmpq+19*%%str], m5
    pxor                m1, m1

    ; store t0-1 and t30-31
    mova                m2, [tmpq+ 0*%%str]
    mova                m3, [tmpq+ 4*%%str]
    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1

    ; store t2-3 and t28-29
    mova                m2, [tmpq+ 8*%%str]
    mova                m3, [tmpq+12*%%str]
    mova                m0, [tmpq+ 3*%%str]
    mova                m6, [tmpq+ 7*%%str]
    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1

    ; store t4-5 and t26-27
    mova                m2, [tmpq+16*%%str]
    mova                m3, [tmpq+20*%%str]
    mova                m0, [tmpq+ 1*%%str]
    %%STORE_2X2          2,  3,  7,  0, 4, 5, 1

    ; store t6-7 and t24-25
    mova                m2, [tmpq+24*%%str]
    mova                m3, [tmpq+28*%%str]
    mova                m0, [tmpq+17*%%str]
    mova                m6, [tmpq+19*%%str]
    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1

    ; store t8-9 and t22-23
    mova                m2, [tmpq+30*%%str]
    mova                m3, [tmpq+26*%%str]
    mova                m0, [tmpq+25*%%str]
    mova                m6, [tmpq+ 5*%%str]
    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1

    ; store t10-11 and t20-21
    mova                m2, [tmpq+22*%%str]
    mova                m3, [tmpq+18*%%str]
    mova                m0, [tmpq+11*%%str]
    mova                m6, [tmpq+15*%%str]
    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1

    ; store t12-13 and t18-19
    mova                m2, [tmpq+14*%%str]
    mova                m3, [tmpq+10*%%str]
    mova                m6, [tmpq+13*%%str]
    mova                m0, [tmpq+ 9*%%str]
    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1

    ; store t14-17
    mova                m2, [tmpq+ 6*%%str]
    mova                m3, [tmpq+ 2*%%str]
    mova                m6, [tmpq+29*%%str]
    mova                m0, [tmpq+21*%%str]
    %%STORE_2X2          2,  3,  0,  6, 4, 5, 1, 0
%endif
%undef ROUND_REG
%endif
%endmacro

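; all entry points below run VP9_IDCT32_1D twice: pass 1 (second argument 1)
; reads coefficients from the block and writes a transposed intermediate to
; the on-stack scratch buffer, pass 2 (second argument 2) reads the scratch
; buffer back and adds the final pixels to the destination; the optional
; third argument caps how many rows/columns are assumed nonzero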
%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
INIT_XMM %1
cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
    movifnidn         eobd, dword eobm
%if cpuflag(ssse3)
    cmp eobd, 135
    jg .idctfull
    cmp eobd, 34
    jg .idct16x16
    cmp eobd, 1
    jg .idct8x8
%else
    cmp eobd, 1
    jg .idctfull
%endif
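
    ; the cut-offs appear chosen so that, in VP9's coefficient scan order,
    ; eob <= 34 resp. eob <= 135 keep all nonzero coefficients inside the
    ; top-left 8x8 resp. 16x16 corner, letting the sub-transforms below
    ; skip the guaranteed-zero rows and columns; eob <= 1 means dc only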

    ; dc-only case
    movifnidn       blockq, blockmp
    movifnidn         dstq, dstmp
    movifnidn      strideq, stridemp
%if cpuflag(ssse3)
    movd                m0, [blockq]
    mova                m1, [pw_11585x2]
    pmulhrsw            m0, m1
    pmulhrsw            m0, m1
%else
    DEFINE_ARGS dst, stride, block, coef
    movsx            coefd, word [blockq]
    imul             coefd, 11585
    add              coefd, 8192
    sar              coefd, 14
    imul             coefd, 11585
    add              coefd, (32 << 14) + 8192
    sar              coefd, 14 + 6
    movd                m0, coefd
%endif
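    ; both branches compute the same value: each 1-D pass scales the dc
    ; coefficient by 11585/16384 (~1/sqrt(2)), i.e. dc = (dc*11585+8192)>>14
    ; applied twice, and the output stage rounds with out = (dc+32)>>6; the
    ; scalar branch folds the +32 and >>6 into its second pass, while the
    ; ssse3 branch relies on pmulhrsw(x, 11585*2) == (x*11585+8192)>>14 and
    ; pmulhrsw(x, 512) == (x+32)>>6 (that last multiply follows below)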
    SPLATW              m0, m0, q0000
%if cpuflag(ssse3)
    pmulhrsw            m0, [pw_512]
%endif
    pxor                m5, m5
    movd          [blockq], m5
%rep 31
    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
    add               dstq, strideq
%endrep
    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
    RET

%if ARCH_X86_64
    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
%else
%define dst_bakq r0mp
%endif
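
    ; the sub-transforms below share this pass structure: each pass-1 call
    ; transforms eight coefficient columns, and pass 2 then runs four times,
    ; once per 8-pixel-wide slice of the 32-pixel-wide output (dst_bakq
    ; advances by 8 pixels, tmpq by 16 bytes, per slice)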
%if cpuflag(ssse3)
.idct8x8:
%if ARCH_X86_32
    DEFINE_ARGS block, u1, u2, u3, u4, tmp
    mov             blockq, r2mp
%endif
    mov               tmpq, rsp
    VP9_IDCT32_1D   blockq, 1, 8

%if ARCH_X86_32
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov            strideq, r1mp
%define cntd dword r3m
%endif
    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 4
    sub          stride30q, stride2q        ; stride*30
.loop2_8x8:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2, 8
    add           dst_bakq, 8
    add               tmpq, 16
    dec               cntd
    jg .loop2_8x8

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov             blockq, r2mp
%endif
    ZERO_BLOCK      blockq, 64,  8, m1
    RET

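    ; 16x16 subset: two pass-1 calls cover the sixteen potentially nonzero
    ; coefficient columns (eight per call), each filling 512 bytes of the
    ; scratch buffer, followed by the same four-slice second pass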
.idct16x16:
%if ARCH_X86_32
    DEFINE_ARGS block, tmp, cnt
    mov             blockq, r2mp
%endif
    mov               cntd, 2
    mov               tmpq, rsp
.loop1_16x16:
    VP9_IDCT32_1D   blockq, 1, 16
    add             blockq, 16
    add               tmpq, 512
    dec               cntd
    jg .loop1_16x16

%if ARCH_X86_64
    sub             blockq, 32
%else
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov            strideq, r1mp
%define cntd dword r3m
%endif

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 4
    mov               tmpq, rsp
    sub          stride30q, stride2q        ; stride*30
.loop2_16x16:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2, 16
    add           dst_bakq, 8
    add               tmpq, 16
    dec               cntd
    jg .loop2_16x16

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov             blockq, r2mp
%endif
    ZERO_BLOCK      blockq, 64, 16, m1
    RET
%endif

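    ; full 32x32: four pass-1 calls of eight columns each; this is also the
    ; only non-dc path for plain sse2, which has no eob short-cuts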
.idctfull:
%if ARCH_X86_32
    DEFINE_ARGS block, tmp, cnt
    mov             blockq, r2mp
%endif
    mov               cntd, 4
    mov               tmpq, rsp
.loop1_full:
    VP9_IDCT32_1D   blockq, 1
    add             blockq, 16
    add               tmpq, 512
    dec               cntd
    jg .loop1_full

%if ARCH_X86_64
    sub             blockq, 64
%else
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov            strideq, r1mp
%define cntd dword r3m
%endif

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 4
    mov               tmpq, rsp
    sub          stride30q, stride2q        ; stride*30
.loop2_full:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2
    add           dst_bakq, 8
    add               tmpq, 16
    dec               cntd
    jg .loop2_full

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov             blockq, r2mp
%endif
    ZERO_BLOCK      blockq, 64, 32, m1
    RET
%endmacro

VP9_IDCT_IDCT_32x32_ADD_XMM sse2
VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
VP9_IDCT_IDCT_32x32_ADD_XMM avx
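
; the three instantiations share one macro body: INIT_XMM selects the
; instruction encodings (VEX for avx), and since x86inc cpuflags are
; cumulative the cpuflag(ssse3) paths are also taken by the avx build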

; this is almost identical to VP9_STORE_2X, but it does two rows
; for slightly improved interleaving, and it omits vpermq since the
; input is DC so all values are identical
%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
    mova               m%2, [dstq]
    mova               m%4, [dstq+strideq]
    ; widen both destination rows from bytes to words
    punpckhbw          m%3, m%2, m%6
    punpcklbw          m%2, m%6
    punpckhbw          m%5, m%4, m%6
    punpcklbw          m%4, m%6
    ; add the splatted dc value
    paddw              m%3, m%1
    paddw              m%2, m%1
    paddw              m%5, m%1
    paddw              m%4, m%1
    ; pack back to bytes with unsigned saturation and store
    packuswb           m%2, m%3
    packuswb           m%4, m%5
    mova  [dstq+strideq*0], m%2
    mova  [dstq+strideq*1], m%4
%endmacro

%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
    cmp eobd, 135
    jg .idctfull
    cmp eobd, 1
    jg .idct16x16

    ; dc-only case
    mova                m1, [pw_11585x2]
    vpbroadcastw        m0, [blockq]
    pmulhrsw            m0, m1
    pmulhrsw            m0, m1
    pxor                m5, m5
    pmulhrsw            m0, [pw_512]
    movd          [blockq], xm5

    DEFINE_ARGS dst, stride, cnt
    mov               cntd, 16
.loop_dc:
    VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5
    lea               dstq, [dstq+2*strideq]
    dec               cntd
    jg .loop_dc
    RET

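    ; pass 2 with 16-pixel-wide ymm slices only needs two iterations, so
    ; cnt is 2 here and dst_bakq/tmpq advance by 16 pixels/32 bytes per
    ; slice; the 8x8 eob short-cut is dropped since a single ymm pass-1
    ; call already covers sixteen columns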
    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
.idct16x16:
    mov               tmpq, rsp
    VP9_IDCT32_1D   blockq, 1, 16

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 2
    sub          stride30q, stride2q        ; stride*30
.loop2_16x16:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2, 16
    add           dst_bakq, 16
    add               tmpq, 32
    dec               cntd
    jg .loop2_16x16

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK      blockq, 64, 16, m1
    RET

.idctfull:
    mov               cntd, 2
    mov               tmpq, rsp
.loop1_full:
    VP9_IDCT32_1D   blockq, 1
    add             blockq, 32
    add               tmpq, 1024
    dec               cntd
    jg .loop1_full

    sub             blockq, 64

    mov          stride30q, strideq         ; stride
    lea           stride2q, [strideq*2]     ; stride*2
    shl          stride30q, 5               ; stride*32
    mov               cntd, 2
    mov               tmpq, rsp
    sub          stride30q, stride2q        ; stride*30
.loop2_full:
    mov               dstq, dst_bakq
    lea           dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D     tmpq, 2
    add           dst_bakq, 16
    add               tmpq, 32
    dec               cntd
    jg .loop2_full

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK      blockq, 64, 32, m1
    RET
%endif