tor-browser: The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

vp9itxfm_16bpp.asm (73262B)


      1 ;******************************************************************************
      2 ;* VP9 inverse transform x86 SIMD optimizations
      3 ;*
      4 ;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
      5 ;*
      6 ;* This file is part of FFmpeg.
      7 ;*
      8 ;* FFmpeg is free software; you can redistribute it and/or
      9 ;* modify it under the terms of the GNU Lesser General Public
     10 ;* License as published by the Free Software Foundation; either
     11 ;* version 2.1 of the License, or (at your option) any later version.
     12 ;*
     13 ;* FFmpeg is distributed in the hope that it will be useful,
     14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16 ;* Lesser General Public License for more details.
     17 ;*
     18 ;* You should have received a copy of the GNU Lesser General Public
     19 ;* License along with FFmpeg; if not, write to the Free Software
     20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     21 ;******************************************************************************
     22 
     23 %include "libavutil/x86/x86util.asm"
     24 %include "vp9itxfm_template.asm"
     25 
     26 SECTION_RODATA
     27 
     28 cextern pw_8
     29 cextern pw_1023
     30 cextern pw_2048
     31 cextern pw_4095
     32 cextern pw_m1
     33 cextern pd_1
     34 cextern pd_16
     35 cextern pd_32
     36 cextern pd_8192
     37 
     38 pd_8: times 4 dd 8
     39 pd_3fff: times 4 dd 0x3fff
     40 
     41 cextern pw_11585x2
     42 
     43 cextern pw_5283_13377
     44 cextern pw_9929_13377
     45 cextern pw_15212_m13377
     46 cextern pw_15212_9929
     47 cextern pw_m5283_m15212
     48 cextern pw_13377x2
     49 cextern pw_m13377_13377
     50 cextern pw_13377_0
     51 
     52 pw_9929_m5283: times 4 dw 9929, -5283
     53 
     54 %macro COEF_PAIR 2-3
     55 cextern pw_m%1_%2
     56 cextern pw_%2_%1
     57 %if %0 == 3
     58 cextern pw_m%1_m%2
     59 %if %1 != %2
     60 cextern pw_m%2_%1
     61 cextern pw_%1_%2
     62 %endif
     63 %endif
     64 %endmacro
     65 
     66 COEF_PAIR  2404, 16207
     67 COEF_PAIR  3196, 16069, 1
     68 COEF_PAIR  4756, 15679
     69 COEF_PAIR  5520, 15426
     70 COEF_PAIR  6270, 15137, 1
     71 COEF_PAIR  8423, 14053
     72 COEF_PAIR 10394, 12665
     73 COEF_PAIR 11003, 12140
     74 COEF_PAIR 11585, 11585, 1
     75 COEF_PAIR 13160,  9760
     76 COEF_PAIR 13623,  9102, 1
     77 COEF_PAIR 14449,  7723
     78 COEF_PAIR 14811,  7005
     79 COEF_PAIR 15893,  3981
     80 COEF_PAIR 16305,  1606
     81 COEF_PAIR 16364,   804
     82 
     83 default_8x8:
     84 times 12 db 1
     85 times 52 db 2
     86 row_8x8:
     87 times 18 db 1
     88 times 46 db 2
     89 col_8x8:
     90 times 6 db 1
     91 times 58 db 2
     92 default_16x16:
     93 times 10 db 1
     94 times 28 db 2
     95 times 51 db 3
     96 times 167 db 4
     97 row_16x16:
     98 times 21 db 1
     99 times 45 db 2
    100 times 60 db 3
    101 times 130 db 4
    102 col_16x16:
    103 times 5 db 1
    104 times 12 db 2
    105 times 25 db 3
    106 times 214 db 4
    107 default_32x32:
    108 times 9 db 1
    109 times 25 db 2
    110 times 36 db 3
    111 times 65 db 4
    112 times 105 db 5
    113 times 96 db 6
    114 times 112 db 7
    115 times 576 db 8
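        ; [editor's note, not in the upstream source] the byte tables above are
        ; eob-to-slice maps: indexed as [table+eob-1], they return how many
        ; 4-column slices of the coefficient block can hold nonzero values for
        ; that transform size and scan order, so the first pass only transforms
        ; those slices and zero-pads the rest.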
    116 
    117 SECTION .text
    118 
    119 %macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
    120    mova               m%3, [%7]
    121    mova               m%4, [%7+strideq]
    122    paddw              m%3, m%1
    123    paddw              m%4, m%2
    124    pmaxsw             m%3, m%5
    125    pmaxsw             m%4, m%5
    126    pminsw             m%3, m%6
    127    pminsw             m%4, m%6
    128    mova              [%7], m%3
    129    mova      [%7+strideq], m%4
    130 %endmacro
    131 
    132 %macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
    133 %assign %%y 0
    134 %rep %3
    135 %assign %%x 0
    136 %rep %3*4/mmsize
    137    mova      [%1+%%y+%%x], %4
    138 %assign %%x (%%x+mmsize)
    139 %endrep
    140 %assign %%y (%%y+%2)
    141 %endrep
    142 %endmacro
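        ; [editor's note] ZERO_BLOCK wipes an nnzcpl x nnzcpl square of dword
        ; coefficients (%3 rows of %3*4 bytes each); the add functions use it
        ; so the coefficient block is left cleared for the next call.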
    143 
    144 ; the input coefficients are scaled up by 2 bits (which we downscale immediately
    145 ; in the iwht), and are otherwise orthonormally increased by 1 bit per iwht_1d.
    146 ; therefore, a diff of 10-12+sign bits will fit in 12-14+sign bits after scaling,
    147 ; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits
    148 ; add 2 bits, we need to scale before converting to words in 12bpp, since the
    149 ; input will be 16+sign bits, which doesn't fit in 15+sign words, but in 10bpp
    150 ; we can scale after converting to words (which is half the instructions),
    151 ; since the input is only 14+sign bits, which fits in 15+sign words directly.
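        ; [editor's note] concretely: a 12bpp residual is 12+sign bits, plus the
        ; 2 bits of pre-scaling and 2 quantizer fraction bits = 16+sign bits,
        ; too wide for a 15+sign word, hence the dword psrad by 2 before
        ; packssdw in the %1 >= 12 path below; at 10bpp the same total is only
        ; 14+sign bits, so packing first and shifting with psraw is safe.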
    152 
    153 %macro IWHT4_FN 2 ; bpp, max
    154 cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
    155    mova                m7, [pw_%2]
    156    mova                m0, [blockq+0*16+0]
    157    mova                m1, [blockq+1*16+0]
    158 %if %1 >= 12
    159    mova                m4, [blockq+0*16+8]
    160    mova                m5, [blockq+1*16+8]
    161    psrad               m0, 2
    162    psrad               m1, 2
    163    psrad               m4, 2
    164    psrad               m5, 2
    165    packssdw            m0, m4
    166    packssdw            m1, m5
    167 %else
    168    packssdw            m0, [blockq+0*16+8]
    169    packssdw            m1, [blockq+1*16+8]
    170    psraw               m0, 2
    171    psraw               m1, 2
    172 %endif
    173    mova                m2, [blockq+2*16+0]
    174    mova                m3, [blockq+3*16+0]
    175 %if %1 >= 12
    176    mova                m4, [blockq+2*16+8]
    177    mova                m5, [blockq+3*16+8]
    178    psrad               m2, 2
    179    psrad               m3, 2
    180    psrad               m4, 2
    181    psrad               m5, 2
    182    packssdw            m2, m4
    183    packssdw            m3, m5
    184 %else
    185    packssdw            m2, [blockq+2*16+8]
    186    packssdw            m3, [blockq+3*16+8]
    187    psraw               m2, 2
    188    psraw               m3, 2
    189 %endif
    190 
    191    VP9_IWHT4_1D
    192    TRANSPOSE4x4W        0, 1, 2, 3, 4
    193    VP9_IWHT4_1D
    194 
    195    pxor                m6, m6
    196    VP9_STORE_2X         0, 1, 4, 5, 6, 7
    197    lea               dstq, [dstq+strideq*2]
    198    VP9_STORE_2X         2, 3, 4, 5, 6, 7
    199    ZERO_BLOCK      blockq, 16, 4, m6
    200    RET
    201 %endmacro
    202 
    203 INIT_MMX mmxext
    204 IWHT4_FN 10, 1023
    205 INIT_MMX mmxext
    206 IWHT4_FN 12, 4095
    207 
    208 %macro VP9_IDCT4_WRITEOUT 0
    209 %if cpuflag(ssse3)
    210    mova                m5, [pw_2048]
    211    pmulhrsw            m0, m5
    212    pmulhrsw            m1, m5
    213    pmulhrsw            m2, m5
    214    pmulhrsw            m3, m5
    215 %else
    216    mova                m5, [pw_8]
    217    paddw               m0, m5
    218    paddw               m1, m5
    219    paddw               m2, m5
    220    paddw               m3, m5
    221    psraw               m0, 4
    222    psraw               m1, 4
    223    psraw               m2, 4
    224    psraw               m3, 4
    225 %endif
    226    mova                m5, [pw_1023]
    227    VP9_STORE_2X         0,  1,  6,  7,  4,  5
    228    lea               dstq, [dstq+2*strideq]
    229    VP9_STORE_2X         2,  3,  6,  7,  4,  5
    230 %endmacro
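        ; [editor's note] both branches above are the same rounding shift:
        ; pmulhrsw against pw_2048 computes (x*2048 + (1<<14)) >> 15, which is
        ; exactly the (x + 8) >> 4 done by the paddw/psraw pair in the
        ; non-SSSE3 path.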
    231 
    232 %macro DC_ONLY 2 ; shift, zero
    233    mov              coefd, dword [blockq]
    234    movd          [blockq], %2
    235    imul             coefd, 11585
    236    add              coefd, 8192
    237    sar              coefd, 14
    238    imul             coefd, 11585
    239    add              coefd, ((1 << (%1 - 1)) << 14) + 8192
    240    sar              coefd, 14 + %1
    241 %endmacro
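        ; [editor's note] dc-only shortcut: out = ((dc*11585 + 8192) >> 14) *
        ; 11585, with the final rounding and >> (14+%1) folded into one step.
        ; 11585 is round(2^14/sqrt(2)) (cospi_16_64), the dc scale factor of
        ; one 1-D idct pass, applied once per dimension.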
    242 
    243 ; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
    244 ; in 15+1 words without additional effort, since the coefficients are 15bpp.
    245 
    246 %macro IDCT4_10_FN 0
    247 cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
    248    cmp               eobd, 1
    249    jg .idctfull
    250 
    251    ; dc-only
    252    pxor                m4, m4
    253 %if cpuflag(ssse3)
    254    movd                m0, [blockq]
    255    movd          [blockq], m4
    256    mova                m5, [pw_11585x2]
    257    pmulhrsw            m0, m5
    258    pmulhrsw            m0, m5
    259 %else
    260    DEFINE_ARGS dst, stride, block, coef
    261    DC_ONLY              4, m4
    262    movd                m0, coefd
    263 %endif
    264    pshufw              m0, m0, 0
    265    mova                m5, [pw_1023]
    266 %if cpuflag(ssse3)
    267    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
    268 %endif
    269    VP9_STORE_2X         0,  0,  6,  7,  4,  5
    270    lea               dstq, [dstq+2*strideq]
    271    VP9_STORE_2X         0,  0,  6,  7,  4,  5
    272    RET
    273 
    274 .idctfull:
    275    mova                m0, [blockq+0*16+0]
    276    mova                m1, [blockq+1*16+0]
    277    packssdw            m0, [blockq+0*16+8]
    278    packssdw            m1, [blockq+1*16+8]
    279    mova                m2, [blockq+2*16+0]
    280    mova                m3, [blockq+3*16+0]
    281    packssdw            m2, [blockq+2*16+8]
    282    packssdw            m3, [blockq+3*16+8]
    283 
    284 %if cpuflag(ssse3)
    285    mova                m6, [pw_11585x2]
    286 %endif
    287    mova                m7, [pd_8192]       ; rounding
    288    VP9_IDCT4_1D
    289    TRANSPOSE4x4W  0, 1, 2, 3, 4
    290    VP9_IDCT4_1D
    291 
    292    pxor                m4, m4
    293    ZERO_BLOCK      blockq, 16, 4, m4
    294    VP9_IDCT4_WRITEOUT
    295    RET
    296 %endmacro
    297 
    298 INIT_MMX mmxext
    299 IDCT4_10_FN
    300 INIT_MMX ssse3
    301 IDCT4_10_FN
    302 
    303 %macro IADST4_FN 4
    304 cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
    305 %if WIN64 && notcpuflag(ssse3)
    306 INIT_XMM cpuname
    307    WIN64_SPILL_XMM 8
    308 INIT_MMX cpuname
    309 %endif
    310    movdqa            xmm5, [pd_8192]
    311    mova                m0, [blockq+0*16+0]
    312    mova                m1, [blockq+1*16+0]
    313    packssdw            m0, [blockq+0*16+8]
    314    packssdw            m1, [blockq+1*16+8]
    315    mova                m2, [blockq+2*16+0]
    316    mova                m3, [blockq+3*16+0]
    317    packssdw            m2, [blockq+2*16+8]
    318    packssdw            m3, [blockq+3*16+8]
    319 
    320 %if cpuflag(ssse3)
    321    mova                m6, [pw_11585x2]
    322 %endif
    323 %ifnidn %1%3, iadstiadst
    324    movdq2q             m7, xmm5
    325 %endif
    326    VP9_%2_1D
    327    TRANSPOSE4x4W  0, 1, 2, 3, 4
    328    VP9_%4_1D
    329 
    330    pxor                m4, m4
    331    ZERO_BLOCK      blockq, 16, 4, m4
    332    VP9_IDCT4_WRITEOUT
    333    RET
    334 %endmacro
    335 
    336 INIT_MMX sse2
    337 IADST4_FN idct,  IDCT4,  iadst, IADST4
    338 IADST4_FN iadst, IADST4, idct,  IDCT4
    339 IADST4_FN iadst, IADST4, iadst, IADST4
    340 
    341 INIT_MMX ssse3
    342 IADST4_FN idct,  IDCT4,  iadst, IADST4
    343 IADST4_FN iadst, IADST4, idct,  IDCT4
    344 IADST4_FN iadst, IADST4, iadst, IADST4
    345 
    346 ; inputs and outputs are dwords, coefficients are words
    347 ;
    348 ; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14
    349 ; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14
    350 %macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
    351    pand               m%3, m%1, %8
    352    pand               m%4, m%2, %8
    353    psrad              m%1, 14
    354    psrad              m%2, 14
    355    packssdw           m%4, m%2
    356    packssdw           m%3, m%1
    357    punpckhwd          m%2, m%4, m%3
    358    punpcklwd          m%4, m%3
    359    pmaddwd            m%3, m%4, [pw_%6_%5]
    360    pmaddwd            m%1, m%2, [pw_%6_%5]
    361    pmaddwd            m%4, [pw_m%5_%6]
    362    pmaddwd            m%2, [pw_m%5_%6]
    363    paddd              m%3, %7
    364    paddd              m%4, %7
    365    psrad              m%3, 14
    366    psrad              m%4, 14
    367    paddd              m%1, m%3
    368    paddd              m%2, m%4
    369 %endmacro
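        ; [editor's note] the pand/psrad prologue splits each dword input into
        ; its low 14 bits (pd_3fff) and upper bits so pmaddwd can operate on
        ; word halves: since src = hi*2^14 + lo, (src*coef + rnd) >> 14 equals
        ; hi*coef + ((lo*coef + rnd) >> 14), which is what the tail computes.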
    370 
    371 %macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
    372    SUMSUB_MUL          %3, %5, %7, %8, 11585, 11585, %1, %2
    373    SUMSUB_MUL          %4, %6, %7, %8, 15137,  6270, %1, %2
    374    SUMSUB_BA        d, %4, %3, %7
    375    SUMSUB_BA        d, %6, %5, %7
    376    SWAP                %4, %6, %3
    377 %endmacro
    378 
    379 %macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
    380    movh               m%1, [dstq+strideq*0]
    381    movh               m%2, [dstq+strideq*2]
    382    movhps             m%1, [dstq+strideq*1]
    383    movhps             m%2, [dstq+stride3q ]
    384    paddw              m%1, m%3
    385    paddw              m%2, m%4
    386    pmaxsw             m%1, %5
    387    pmaxsw             m%2, %5
    388    pminsw             m%1, %6
    389    pminsw             m%2, %6
    390    movh   [dstq+strideq*0], m%1
    391    movhps [dstq+strideq*1], m%1
    392    movh   [dstq+strideq*2], m%2
    393    movhps [dstq+stride3q ], m%2
    394 %endmacro
    395 
    396 %macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift
    397    paddd              m%1, %7
    398    paddd              m%2, %7
    399    paddd              m%3, %7
    400    paddd              m%4, %7
    401    psrad              m%1, %8
    402    psrad              m%2, %8
    403    psrad              m%3, %8
    404    psrad              m%4, %8
    405    packssdw           m%1, m%2
    406    packssdw           m%3, m%4
    407    STORE_4x4           %2, %4, %1, %3, %5, %6
    408 %endmacro
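        ; [editor's note] rounds the dword results down to words ((x + rnd) >>
        ; shift, then packssdw) and defers to STORE_4x4, which adds the
        ; predictor pixels and clips the sums to [min, max].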
    409 
    410 INIT_XMM sse2
    411 cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob
    412    cmp               eobd, 1
    413    jg .idctfull
    414 
    415    ; dc-only - this is special, since for 4x4 12bpp, the max coef size is
    416    ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the
    417    ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a
    418    ; dword. After the final shift (4), the result is 13+sign bits, so we
    419    ; don't need any additional processing to fit it in a word
    420    DEFINE_ARGS dst, stride, block, coef
    421    pxor                m4, m4
    422    DC_ONLY              4, m4
    423    movd                m0, coefd
    424    pshuflw             m0, m0, q0000
    425    punpcklqdq          m0, m0
    426    mova                m5, [pw_4095]
    427    DEFINE_ARGS dst, stride, stride3
    428    lea           stride3q, [strideq*3]
    429    STORE_4x4            1, 3, 0, 0, m4, m5
    430    RET
    431 
    432 .idctfull:
    433    DEFINE_ARGS dst, stride, block, eob
    434    mova                m0, [blockq+0*16]
    435    mova                m1, [blockq+1*16]
    436    mova                m2, [blockq+2*16]
    437    mova                m3, [blockq+3*16]
    438    mova                m6, [pd_8192]
    439    mova                m7, [pd_3fff]
    440 
    441    IDCT4_12BPP_1D      m6, m7
    442    TRANSPOSE4x4D        0, 1, 2, 3, 4
    443    IDCT4_12BPP_1D      m6, m7
    444 
    445    pxor                m4, m4
    446    ZERO_BLOCK      blockq, 16, 4, m4
    447 
    448    ; writeout
    449    DEFINE_ARGS dst, stride, stride3
    450    lea           stride3q, [strideq*3]
    451    mova                m5, [pw_4095]
    452    mova                m6, [pd_8]
    453    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, m6, 4
    454    RET
    455 
    456 %macro SCRATCH 3-4
    457 %if ARCH_X86_64
    458    SWAP                %1, %2
    459 %if %0 == 4
    460 %define reg_%4 m%2
    461 %endif
    462 %else
    463    mova              [%3], m%1
    464 %if %0 == 4
    465 %define reg_%4 [%3]
    466 %endif
    467 %endif
    468 %endmacro
    469 
    470 %macro UNSCRATCH 3-4
    471 %if ARCH_X86_64
    472    SWAP                %1, %2
    473 %else
    474    mova               m%1, [%3]
    475 %endif
    476 %if %0 == 4
    477 %undef reg_%4
    478 %endif
    479 %endmacro
    480 
    481 %macro PRELOAD 2-3
    482 %if ARCH_X86_64
    483    mova               m%1, [%2]
    484 %if %0 == 3
    485 %define reg_%3 m%1
    486 %endif
    487 %elif %0 == 3
    488 %define reg_%3 [%2]
    489 %endif
    490 %endmacro
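        ; [editor's note] SCRATCH/UNSCRATCH hide the register-count difference:
        ; on x86-64 they SWAP values into the extra registers m8-m15, on x86-32
        ; they spill to / reload from the given stack slot. PRELOAD likewise
        ; caches a constant in a register on x86-64, while the reg_%3 alias
        ; stays a memory operand on x86-32.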
    491 
    492 ; out0 =  5283 * in0 + 13377 * in1 + 15212 * in2 +  9929 * in3 + rnd >> 14
    493 ; out1 =  9929 * in0 + 13377 * in1 -  5283 * in2 - 15212 * in3 + rnd >> 14
    494 ; out2 = 13377 * in0               - 13377 * in2 + 13377 * in3 + rnd >> 14
    495 ; out3 = 15212 * in0 - 13377 * in1 +  9929 * in2 -  5283 * in3 + rnd >> 14
    496 %macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
    497    pand                m4, m0, %2
    498    pand                m5, m1, %2
    499    psrad               m0, 14
    500    psrad               m1, 14
    501    packssdw            m5, m1
    502    packssdw            m4, m0
    503    punpckhwd           m1, m4, m5
    504    punpcklwd           m4, m5
    505    pand                m5, m2, %2
    506    pand                m6, m3, %2
    507    psrad               m2, 14
    508    psrad               m3, 14
    509    packssdw            m6, m3
    510    packssdw            m5, m2
    511    punpckhwd           m3, m5, m6
    512    punpcklwd           m5, m6
    513    SCRATCH              1,  8, rsp+0*mmsize, a
    514    SCRATCH              5,  9, rsp+1*mmsize, b
    515 
    516    ; m1/3 have the high bits of 0,1,2,3
    517    ; m4/5 have the low bits of 0,1,2,3
    518    ; m0/2/6/7 are free
    519 
    520    mova                m2, [pw_15212_9929]
    521    mova                m0, [pw_5283_13377]
    522    pmaddwd             m7, m2, reg_b
    523    pmaddwd             m6, m4, m0
    524    pmaddwd             m2, m3
    525    pmaddwd             m0, reg_a
    526    paddd               m6, m7
    527    paddd               m0, m2
    528    mova                m1, [pw_m13377_13377]
    529    mova                m5, [pw_13377_0]
    530    pmaddwd             m7, m1, reg_b
    531    pmaddwd             m2, m4, m5
    532    pmaddwd             m1, m3
    533    pmaddwd             m5, reg_a
    534    paddd               m2, m7
    535    paddd               m1, m5
    536    paddd               m6, %1
    537    paddd               m2, %1
    538    psrad               m6, 14
    539    psrad               m2, 14
    540    paddd               m0, m6                      ; t0
    541    paddd               m2, m1                      ; t2
    542 
    543    mova                m7, [pw_m5283_m15212]
    544    mova                m5, [pw_9929_13377]
    545    pmaddwd             m1, m7, reg_b
    546    pmaddwd             m6, m4, m5
    547    pmaddwd             m7, m3
    548    pmaddwd             m5, reg_a
    549    paddd               m6, m1
    550    paddd               m7, m5
    551    UNSCRATCH            5,  9, rsp+1*mmsize, b
    552    pmaddwd             m5, [pw_9929_m5283]
    553    pmaddwd             m4, [pw_15212_m13377]
    554    pmaddwd             m3, [pw_9929_m5283]
    555    UNSCRATCH            1,  8, rsp+0*mmsize, a
    556    pmaddwd             m1, [pw_15212_m13377]
    557    paddd               m4, m5
    558    paddd               m3, m1
    559    paddd               m6, %1
    560    paddd               m4, %1
    561    psrad               m6, 14
    562    psrad               m4, 14
    563    paddd               m7, m6                      ; t1
    564    paddd               m3, m4                      ; t3
    565 
    566    SWAP                 1, 7
    567 %endmacro
    568 
    569 %macro IADST4_12BPP_FN 4
    570 cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
    571    mova                m0, [blockq+0*16]
    572    mova                m1, [blockq+1*16]
    573    mova                m2, [blockq+2*16]
    574    mova                m3, [blockq+3*16]
    575 
    576    PRELOAD             10, pd_8192, rnd
    577    PRELOAD             11, pd_3fff, mask
    578    %2_12BPP_1D    reg_rnd, reg_mask
    579    TRANSPOSE4x4D        0, 1, 2, 3, 4
    580    %4_12BPP_1D    reg_rnd, reg_mask
    581 
    582    pxor                m4, m4
    583    ZERO_BLOCK      blockq, 16, 4, m4
    584 
    585    ; writeout
    586    DEFINE_ARGS dst, stride, stride3
    587    lea           stride3q, [strideq*3]
    588    mova                m5, [pw_4095]
    589    mova                m6, [pd_8]
    590    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, m6, 4
    591    RET
    592 %endmacro
    593 
    594 INIT_XMM sse2
    595 IADST4_12BPP_FN idct,  IDCT4,  iadst, IADST4
    596 IADST4_12BPP_FN iadst, IADST4, idct,  IDCT4
    597 IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
    598 
    599 ; the following line has not been executed at the end of this macro:
    600 ; UNSCRATCH            6, 8, rsp+(%5+0)*mmsize
    601 %macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
    602    mova                m0, [%1+0*%4]
    603    mova                m2, [%1+2*%4]
    604    mova                m4, [%1+4*%4]
    605    mova                m6, [%1+6*%4]
    606    IDCT4_12BPP_1D      %2, %3, 0, 2, 4, 6, 1, 3            ; m0/2/4/6 have t0/1/2/3
    607    SCRATCH              4, 8, rsp+(%5+0)*mmsize
    608    SCRATCH              6, 9, rsp+(%5+1)*mmsize
    609    mova                m1, [%1+1*%4]
    610    mova                m3, [%1+3*%4]
    611    mova                m5, [%1+5*%4]
    612    mova                m7, [%1+7*%4]
    613    SUMSUB_MUL           1, 7, 4, 6, 16069,  3196, %2, %3   ; m1=t7a, m7=t4a
    614    SUMSUB_MUL           5, 3, 4, 6,  9102, 13623, %2, %3   ; m5=t6a, m3=t5a
    615    SUMSUB_BA         d, 3, 7, 4                            ; m3=t4, m7=t5a
    616    SUMSUB_BA         d, 5, 1, 4                            ; m5=t7, m1=t6a
    617    SUMSUB_MUL           1, 7, 4, 6, 11585, 11585, %2, %3   ; m1=t6, m7=t5
    618    SUMSUB_BA         d, 5, 0, 4                            ; m5=out0, m0=out7
    619    SUMSUB_BA         d, 1, 2, 4                            ; m1=out1, m2=out6
    620    UNSCRATCH            4, 8, rsp+(%5+0)*mmsize
    621    UNSCRATCH            6, 9, rsp+(%5+1)*mmsize
    622    SCRATCH              2, 8, rsp+(%5+0)*mmsize
    623    SUMSUB_BA         d, 7, 4, 2                            ; m7=out2, m4=out5
    624    SUMSUB_BA         d, 3, 6, 2                            ; m3=out3, m6=out4
    625    SWAP                 0, 5, 4, 6, 2, 7
    626 %endmacro
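        ; [editor's note] even/odd split of the 8-point idct: inputs 0/2/4/6
        ; pass through IDCT4_12BPP_1D for t0-3, inputs 1/3/5/7 through two
        ; rotations plus butterflies for t4-7, and the closing SUMSUB_BA stage
        ; forms out0-7 (out6 stays spilled, as noted above the macro).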
    627 
    628 %macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max
    629    mova               m%1, [%6+%7*0]
    630    mova               m%2, [%6+%7*1]
    631    paddw              m%1, m%3
    632    paddw              m%2, m%3
    633    pmaxsw             m%1, %4
    634    pmaxsw             m%2, %4
    635    pminsw             m%1, %5
    636    pminsw             m%2, %5
    637    mova         [%6+%7*0], m%1
    638    mova         [%6+%7*1], m%2
    639 %endmacro
    640 
    641 ; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp
    642 ; storage also instead of allocating two more stack spaces. This doesn't
    643 ; matter much but it's something...
    644 INIT_XMM sse2
    645 cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
    646                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
    647                                  dst, stride, block, eob
    648    mova                m0, [pw_1023]
    649    cmp               eobd, 1
    650    jg .idctfull
    651 
    652    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
    653    ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily
    654    ; fits in 32bit
    655    DEFINE_ARGS dst, stride, block, coef
    656    pxor                m2, m2
    657    DC_ONLY              5, m2
    658    movd                m1, coefd
    659    pshuflw             m1, m1, q0000
    660    punpcklqdq          m1, m1
    661    DEFINE_ARGS dst, stride, cnt
    662    mov               cntd, 4
    663 .loop_dc:
    664    STORE_2x8            3, 4, 1, m2, m0
    665    lea               dstq, [dstq+strideq*2]
    666    dec               cntd
    667    jg .loop_dc
    668    RET
    669 
    670 .idctfull:
    671    SCRATCH              0, 12, rsp+16*mmsize, max
    672    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
    673 %if ARCH_X86_64
    674    mov            dstbakq, dstq
    675    movsxd            cntq, cntd
    676 %endif
    677 %if PIC
    678    lea               ptrq, [default_8x8]
    679    movzx             cntd, byte [ptrq+cntq-1]
    680 %else
    681    movzx             cntd, byte [default_8x8+cntq-1]
    682 %endif
    683    mov              skipd, 2
    684    sub              skipd, cntd
    685    mov               ptrq, rsp
    686    PRELOAD             10, pd_8192, rnd
    687    PRELOAD             11, pd_3fff, mask
    688    PRELOAD             13, pd_16, srnd
    689 .loop_1:
    690    IDCT8_1D        blockq, reg_rnd, reg_mask
    691 
    692    TRANSPOSE4x4D        0, 1, 2, 3, 6
    693    mova  [ptrq+ 0*mmsize], m0
    694    mova  [ptrq+ 2*mmsize], m1
    695    mova  [ptrq+ 4*mmsize], m2
    696    mova  [ptrq+ 6*mmsize], m3
    697    UNSCRATCH            6, 8, rsp+17*mmsize
    698    TRANSPOSE4x4D        4, 5, 6, 7, 0
    699    mova  [ptrq+ 1*mmsize], m4
    700    mova  [ptrq+ 3*mmsize], m5
    701    mova  [ptrq+ 5*mmsize], m6
    702    mova  [ptrq+ 7*mmsize], m7
    703    add               ptrq, 8 * mmsize
    704    add             blockq, mmsize
    705    dec               cntd
    706    jg .loop_1
    707 
    708    ; zero-pad the remainder (skipped cols)
    709    test             skipd, skipd
    710    jz .end
    711    add              skipd, skipd
    712    lea             blockq, [blockq+skipq*(mmsize/2)]
    713    pxor                m0, m0
    714 .loop_z:
    715    mova   [ptrq+mmsize*0], m0
    716    mova   [ptrq+mmsize*1], m0
    717    mova   [ptrq+mmsize*2], m0
    718    mova   [ptrq+mmsize*3], m0
    719    add               ptrq, 4 * mmsize
    720    dec              skipd
    721    jg .loop_z
    722 .end:
    723 
    724    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    725    lea           stride3q, [strideq*3]
    726    mov               cntd, 2
    727    mov               ptrq, rsp
    728 .loop_2:
    729    IDCT8_1D          ptrq, reg_rnd, reg_mask
    730 
    731    pxor                m6, m6
    732    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, reg_srnd, 5
    733    lea               dstq, [dstq+strideq*4]
    734    UNSCRATCH            0, 8, rsp+17*mmsize
    735    UNSCRATCH            1, 12, rsp+16*mmsize, max
    736    UNSCRATCH            2, 13, pd_16, srnd
    737    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, m1, m2, 5
    738    add               ptrq, 16
    739 %if ARCH_X86_64
    740    lea               dstq, [dstbakq+8]
    741 %else
    742    mov               dstq, dstm
    743    add               dstq, 8
    744 %endif
    745    dec               cntd
    746    jg .loop_2
    747 
    748    ; m6 is still zero
    749    ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
    750    RET
    751 
    752 %macro DC_ONLY_64BIT 2 ; shift, zero
    753 %if ARCH_X86_64
    754    movsxd           coefq, dword [blockq]
    755    movd          [blockq], %2
    756    imul             coefq, 11585
    757    add              coefq, 8192
    758    sar              coefq, 14
    759    imul             coefq, 11585
    760    add              coefq, ((1 << (%1 - 1)) << 14) + 8192
    761    sar              coefq, 14 + %1
    762 %else
    763    mov              coefd, dword [blockq]
    764    movd          [blockq], %2
    765    DEFINE_ARGS dst, stride, cnt, coef, coefl
    766    mov               cntd, 2
    767 .loop_dc_calc:
    768    mov             coefld, coefd
    769    sar              coefd, 14
    770    and             coefld, 0x3fff
    771    imul             coefd, 11585
    772    imul            coefld, 11585
    773    add             coefld, 8192
    774    sar             coefld, 14
    775    add              coefd, coefld
    776    dec               cntd
    777    jg .loop_dc_calc
    778    add              coefd, 1 << (%1 - 1)
    779    sar              coefd, %1
    780 %endif
    781 %endmacro
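        ; [editor's note] same dc shortcut as DC_ONLY, but here the coefficient
        ; is up to 18+sign bits, so coef*11585 would overflow a 32-bit imul. On
        ; x86-64 a 64-bit imul suffices; on x86-32 each of the two passes
        ; computes (coef>>14)*11585 + (((coef & 0x3fff)*11585 + 8192) >> 14).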
    782 
    783 INIT_XMM sse2
    784 cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
    785                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
    786                                  dst, stride, block, eob
    787    mova                m0, [pw_4095]
    788    cmp               eobd, 1
    789    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull
    790 
    791    ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign
    792    ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies
    793    DEFINE_ARGS dst, stride, block, coef, coefl
    794    pxor                m2, m2
    795    DC_ONLY_64BIT        5, m2
    796    movd                m1, coefd
    797    pshuflw             m1, m1, q0000
    798    punpcklqdq          m1, m1
    799    DEFINE_ARGS dst, stride, cnt
    800    mov               cntd, 4
    801 .loop_dc:
    802    STORE_2x8            3, 4, 1, m2, m0
    803    lea               dstq, [dstq+strideq*2]
    804    dec               cntd
    805    jg .loop_dc
    806    RET
    807 
    808 ; inputs and outputs are dwords, coefficients are words
    809 ;
    810 ; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
    811 ; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
    812 %macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
    813    pand               m%3, m%1, %7
    814    pand               m%4, m%2, %7
    815    psrad              m%1, 14
    816    psrad              m%2, 14
    817    packssdw           m%4, m%2
    818    packssdw           m%3, m%1
    819    punpckhwd          m%2, m%4, m%3
    820    punpcklwd          m%4, m%3
    821    pmaddwd            m%3, m%4, [pw_%6_%5]
    822    pmaddwd            m%1, m%2, [pw_%6_%5]
    823    pmaddwd            m%4, [pw_m%5_%6]
    824    pmaddwd            m%2, [pw_m%5_%6]
    825 %endmacro
    826 
    827 ; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
    828 ; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
    829 %macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
    830    SUMSUB_BA        d, %1, %2, %5
    831    SUMSUB_BA        d, %3, %4, %5
    832    paddd              m%3, %6
    833    paddd              m%4, %6
    834    psrad              m%3, 14
    835    psrad              m%4, 14
    836    paddd              m%1, m%3
    837    paddd              m%2, m%4
    838 %endmacro
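        ; [editor's note] SUMSUB_MUL_D is SUMSUB_MUL with the rounding
        ; deferred: high-part products land in dst1-2, low-part products in
        ; dst3-4. SUMSUB_PACK_D then sums/differences two such results and
        ; applies the shared +rnd, >> 14 once, on the combined low parts.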
    839 
    840 %macro NEGD 1
    841 %if cpuflag(ssse3)
    842    psignd              %1, [pw_m1]
    843 %else
    844    pxor                %1, [pw_m1]
    845    paddd               %1, [pd_1]
    846 %endif
    847 %endmacro
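        ; [editor's note] dword negation: psignd against an all-ones pattern on
        ; SSSE3, otherwise manual two's complement (pxor with the all-ones
        ; pw_m1, then paddd pd_1).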
    848 
    849 ; the following line has not been executed at the end of this macro:
    850 ; UNSCRATCH            6, 8, rsp+17*mmsize
    851 %macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
    852    mova                m0, [%1+ 0*mmsize]
    853    mova                m3, [%1+ 6*mmsize]
    854    mova                m4, [%1+ 8*mmsize]
    855    mova                m7, [%1+14*mmsize]
    856    SUMSUB_MUL_D         7, 0, 1, 2, 16305,  1606, %3   ; m7/1=t0a, m0/2=t1a
    857    SUMSUB_MUL_D         3, 4, 5, 6, 10394, 12665, %3   ; m3/5=t4a, m4/6=t5a
    858    SCRATCH              0, 8, rsp+17*mmsize
    859    SUMSUB_PACK_D        3, 7, 5, 1, 0, %2              ; m3=t0, m7=t4
    860    UNSCRATCH            0, 8, rsp+17*mmsize
    861    SUMSUB_PACK_D        4, 0, 6, 2, 1, %2              ; m4=t1, m0=t5
    862 
    863    SCRATCH              3, 8, rsp+17*mmsize
    864    SCRATCH              4, 9, rsp+18*mmsize
    865    SCRATCH              7, 10, rsp+19*mmsize
    866    SCRATCH              0, 11, rsp+20*mmsize
    867 
    868    mova                m1, [%1+ 2*mmsize]
    869    mova                m2, [%1+ 4*mmsize]
    870    mova                m5, [%1+10*mmsize]
    871    mova                m6, [%1+12*mmsize]
    872    SUMSUB_MUL_D         5, 2, 3, 4, 14449,  7723, %3   ; m5/8=t2a, m2/9=t3a
    873    SUMSUB_MUL_D         1, 6, 7, 0,  4756, 15679, %3   ; m1/10=t6a, m6/11=t7a
    874    SCRATCH              2, 12, rsp+21*mmsize
    875    SUMSUB_PACK_D        1, 5, 7, 3, 2, %2              ; m1=t2, m5=t6
    876    UNSCRATCH            2, 12, rsp+21*mmsize
    877    SUMSUB_PACK_D        6, 2, 0, 4, 3, %2              ; m6=t3, m2=t7
    878 
    879    UNSCRATCH            7, 10, rsp+19*mmsize
    880    UNSCRATCH            0, 11, rsp+20*mmsize
    881    SCRATCH              1, 10, rsp+19*mmsize
    882    SCRATCH              6, 11, rsp+20*mmsize
    883 
    884    SUMSUB_MUL_D         7, 0, 3, 4, 15137,  6270, %3   ; m7/8=t4a, m0/9=t5a
    885    SUMSUB_MUL_D         2, 5, 1, 6,  6270, 15137, %3   ; m2/10=t7a, m5/11=t6a
    886    SCRATCH              2, 12, rsp+21*mmsize
    887    SUMSUB_PACK_D        5, 7, 6, 3, 2, %2              ; m5=-out1, m7=t6
    888    UNSCRATCH            2, 12, rsp+21*mmsize
    889    NEGD                m5                              ; m5=out1
    890    SUMSUB_PACK_D        2, 0, 1, 4, 3, %2              ; m2=out6, m0=t7
    891    SUMSUB_MUL           7, 0, 3, 4, 11585, 11585, %2, %3   ; m7=out2, m0=-out5
    892    NEGD                m0                              ; m0=out5
    893 
    894    UNSCRATCH            3, 8, rsp+17*mmsize
    895    UNSCRATCH            4, 9, rsp+18*mmsize
    896    UNSCRATCH            1, 10, rsp+19*mmsize
    897    UNSCRATCH            6, 11, rsp+20*mmsize
    898    SCRATCH              2, 8, rsp+17*mmsize
    899    SCRATCH              0, 9, rsp+18*mmsize
    900 
    901    SUMSUB_BA         d, 1, 3,  2                       ; m1=out0, m3=t2
    902    SUMSUB_BA         d, 6, 4,  2                       ; m6=-out7, m4=t3
    903    NEGD                m6                              ; m6=out7
    904    SUMSUB_MUL           3, 4,  2,  0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
    905    NEGD                m3                              ; m3=out3
    906 
    907    UNSCRATCH            0, 9, rsp+18*mmsize
    908 
    909    SWAP                 0, 1, 5
    910    SWAP                 2, 7, 6
    911 %endmacro
    912 
    913 %macro IADST8_FN 5
    914 cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
    915                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
    916                              dst, stride, block, eob
    917    mova                m0, [pw_1023]
    918 
    919 .body:
    920    SCRATCH              0, 13, rsp+16*mmsize, max
    921    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
    922 %if ARCH_X86_64
    923    mov            dstbakq, dstq
    924    movsxd            cntq, cntd
    925 %endif
    926 %if PIC
    927    lea               ptrq, [%5_8x8]
    928    movzx             cntd, byte [ptrq+cntq-1]
    929 %else
    930    movzx             cntd, byte [%5_8x8+cntq-1]
    931 %endif
    932    mov              skipd, 2
    933    sub              skipd, cntd
    934    mov               ptrq, rsp
    935    PRELOAD             14, pd_8192, rnd
    936    PRELOAD             15, pd_3fff, mask
    937 .loop_1:
    938    %2_1D           blockq, reg_rnd, reg_mask
    939 
    940    TRANSPOSE4x4D        0, 1, 2, 3, 6
    941    mova  [ptrq+ 0*mmsize], m0
    942    mova  [ptrq+ 2*mmsize], m1
    943    mova  [ptrq+ 4*mmsize], m2
    944    mova  [ptrq+ 6*mmsize], m3
    945    UNSCRATCH            6, 8, rsp+17*mmsize
    946    TRANSPOSE4x4D        4, 5, 6, 7, 0
    947    mova  [ptrq+ 1*mmsize], m4
    948    mova  [ptrq+ 3*mmsize], m5
    949    mova  [ptrq+ 5*mmsize], m6
    950    mova  [ptrq+ 7*mmsize], m7
    951    add               ptrq, 8 * mmsize
    952    add             blockq, mmsize
    953    dec               cntd
    954    jg .loop_1
    955 
    956    ; zero-pad the remainder (skipped cols)
    957    test             skipd, skipd
    958    jz .end
    959    add              skipd, skipd
    960    lea             blockq, [blockq+skipq*(mmsize/2)]
    961    pxor                m0, m0
    962 .loop_z:
    963    mova   [ptrq+mmsize*0], m0
    964    mova   [ptrq+mmsize*1], m0
    965    mova   [ptrq+mmsize*2], m0
    966    mova   [ptrq+mmsize*3], m0
    967    add               ptrq, 4 * mmsize
    968    dec              skipd
    969    jg .loop_z
    970 .end:
    971 
    972    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    973    lea           stride3q, [strideq*3]
    974    mov               cntd, 2
    975    mov               ptrq, rsp
    976 .loop_2:
    977    %4_1D             ptrq, reg_rnd, reg_mask
    978 
    979    pxor                m6, m6
    980    PRELOAD              9, pd_16, srnd
    981    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, reg_srnd, 5
    982    lea               dstq, [dstq+strideq*4]
    983    UNSCRATCH            0, 8, rsp+17*mmsize
    984    UNSCRATCH            1, 13, rsp+16*mmsize, max
    985    UNSCRATCH            2, 9, pd_16, srnd
    986    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, m1, m2, 5
    987    add               ptrq, 16
    988 %if ARCH_X86_64
    989    lea               dstq, [dstbakq+8]
    990 %else
    991    mov               dstq, dstm
    992    add               dstq, 8
    993 %endif
    994    dec               cntd
    995    jg .loop_2
    996 
    997    ; m6 is still zero
    998    ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
    999    RET
   1000 
   1001 cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
   1002                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
   1003                              dst, stride, block, eob
   1004    mova                m0, [pw_4095]
   1005    jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body
   1006 %endmacro
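        ; [editor's note] the _12 entry above only swaps the clip constant
        ; (pw_4095 instead of pw_1023) before jumping into the _10 function's
        ; .body label via mangle(), so both bit depths share one code path.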
   1007 
   1008 INIT_XMM sse2
   1009 IADST8_FN idct,  IDCT8,  iadst, IADST8, row
   1010 IADST8_FN iadst, IADST8, idct,  IDCT8,  col
   1011 IADST8_FN iadst, IADST8, iadst, IADST8, default
   1012 
   1013 %macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
   1014    IDCT8_1D            %1, [pd_8192], [pd_3fff], %2 * 2, %4    ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
   1015    ; SCRATCH            6, 8, rsp+(%4+0)*mmsize    ; t6
   1016    SCRATCH              0, 15, rsp+(%4+7)*mmsize   ; t0a
   1017    SCRATCH              1, 14, rsp+(%4+6)*mmsize   ; t1a
   1018    SCRATCH              2, 13, rsp+(%4+5)*mmsize   ; t2a
   1019    SCRATCH              3, 12, rsp+(%4+4)*mmsize   ; t3a
   1020    SCRATCH              4, 11, rsp+(%4+3)*mmsize   ; t4
   1021    mova [rsp+(%3+0)*mmsize], m5                    ; t5
   1022    mova [rsp+(%3+1)*mmsize], m7                    ; t7
   1023 
   1024    mova                m0, [%1+ 1*%2]              ; in1
   1025    mova                m3, [%1+ 7*%2]              ; in7
   1026    mova                m4, [%1+ 9*%2]              ; in9
   1027    mova                m7, [%1+15*%2]              ; in15
   1028 
   1029    SUMSUB_MUL           0, 7, 1, 2, 16305,  1606   ; m0=t15a, m7=t8a
   1030    SUMSUB_MUL           4, 3, 1, 2, 10394, 12665   ; m4=t14a, m3=t9a
   1031    SUMSUB_BA         d, 3, 7, 1                    ; m3=t8, m7=t9
   1032    SUMSUB_BA         d, 4, 0, 1                    ; m4=t15,m0=t14
   1033    SUMSUB_MUL           0, 7, 1, 2, 15137,  6270   ; m0=t14a, m7=t9a
   1034 
   1035    mova                m1, [%1+ 3*%2]              ; in3
   1036    mova                m2, [%1+ 5*%2]              ; in5
   1037    mova                m5, [%1+11*%2]              ; in11
   1038    mova                m6, [%1+13*%2]              ; in13
   1039 
   1040    SCRATCH              0,  9, rsp+(%4+1)*mmsize
   1041    SCRATCH              7, 10, rsp+(%4+2)*mmsize
   1042 
   1043    SUMSUB_MUL           2, 5, 0, 7, 14449,  7723   ; m2=t13a, m5=t10a
   1044    SUMSUB_MUL           6, 1, 0, 7,  4756, 15679   ; m6=t12a, m1=t11a
   1045    SUMSUB_BA         d, 5, 1, 0                    ; m5=t11,m1=t10
   1046    SUMSUB_BA         d, 2, 6, 0                    ; m2=t12,m6=t13
   1047    NEGD                m1                          ; m1=-t10
   1048    SUMSUB_MUL           1, 6, 0, 7, 15137,  6270   ; m1=t13a, m6=t10a
   1049 
   1050    UNSCRATCH            7, 10, rsp+(%4+2)*mmsize
   1051    SUMSUB_BA         d, 5, 3, 0                    ; m5=t8a, m3=t11a
   1052    SUMSUB_BA         d, 6, 7, 0                    ; m6=t9,  m7=t10
   1053    SUMSUB_BA         d, 2, 4, 0                    ; m2=t15a,m4=t12a
   1054    SCRATCH              5, 10, rsp+(%4+2)*mmsize
   1055    SUMSUB_MUL           4, 3, 0, 5, 11585, 11585   ; m4=t12, m3=t11
   1056    UNSCRATCH            0, 9, rsp+(%4+1)*mmsize
   1057    SUMSUB_BA         d, 1, 0, 5                    ; m1=t14, m0=t13
   1058    SCRATCH              6, 9, rsp+(%4+1)*mmsize
   1059    SUMSUB_MUL           0, 7, 6, 5, 11585, 11585   ; m0=t13a,m7=t10a
   1060 
   1061    ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2
   1062    ; free: 6,5
   1063 
   1064    UNSCRATCH            5, 15, rsp+(%4+7)*mmsize
   1065    SUMSUB_BA         d, 2, 5, 6                    ; m2=out0, m5=out15
   1066    SCRATCH              5, 15, rsp+(%4+7)*mmsize
   1067    UNSCRATCH            5, 14, rsp+(%4+6)*mmsize
   1068    SUMSUB_BA         d, 1, 5, 6                    ; m1=out1, m5=out14
   1069    SCRATCH              5, 14, rsp+(%4+6)*mmsize
   1070    UNSCRATCH            5, 13, rsp+(%4+5)*mmsize
   1071    SUMSUB_BA         d, 0, 5, 6                    ; m0=out2, m5=out13
   1072    SCRATCH              5, 13, rsp+(%4+5)*mmsize
   1073    UNSCRATCH            5, 12, rsp+(%4+4)*mmsize
   1074    SUMSUB_BA         d, 4, 5, 6                    ; m4=out3, m5=out12
   1075    SCRATCH              5, 12, rsp+(%4+4)*mmsize
   1076    UNSCRATCH            5, 11, rsp+(%4+3)*mmsize
   1077    SUMSUB_BA         d, 3, 5, 6                    ; m3=out4, m5=out11
   1078    SCRATCH              4, 11, rsp+(%4+3)*mmsize
   1079    mova                m4, [rsp+(%3+0)*mmsize]
   1080    SUMSUB_BA         d, 7, 4, 6                    ; m7=out5, m4=out10
   1081    mova [rsp+(%3+0)*mmsize], m5
   1082    UNSCRATCH            5, 8, rsp+(%4+0)*mmsize
   1083    UNSCRATCH            6, 9, rsp+(%4+1)*mmsize
   1084    SCRATCH              2, 8, rsp+(%4+0)*mmsize
   1085    SCRATCH              1, 9, rsp+(%4+1)*mmsize
   1086    UNSCRATCH            1, 10, rsp+(%4+2)*mmsize
   1087    SCRATCH              0, 10, rsp+(%4+2)*mmsize
   1088    mova                m0, [rsp+(%3+1)*mmsize]
   1089    SUMSUB_BA         d, 6, 5, 2                    ; m6=out6, m5=out9
   1090    SUMSUB_BA         d, 1, 0, 2                    ; m1=out7, m0=out8
   1091 
   1092    SWAP                 0, 3, 1, 7, 2, 6, 4
   1093 
   1094    ; output order: 8-11|r67-70=out0-3
   1095    ;               0-6,r65=out4-11
   1096    ;               12-15|r71-74=out12-15
   1097 %endmacro
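        ; [editor's note] as in IDCT8_1D, this is an even/odd split: the eight
        ; even inputs reuse IDCT8_1D for t0-7, the eight odd inputs build
        ; t8-15, and the closing butterflies produce out0-15 spread across
        ; registers and the stack slots listed above.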
   1098 
   1099 INIT_XMM sse2
   1100 cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
   1101                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
   1102                                    dst, stride, block, eob
   1103    mova                m0, [pw_1023]
   1104    cmp               eobd, 1
   1105    jg .idctfull
   1106 
   1107    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
   1108    ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
   1109    ; fits in 32bit
   1110    DEFINE_ARGS dst, stride, block, coef
   1111    pxor                m2, m2
   1112    DC_ONLY              6, m2
   1113    movd                m1, coefd
   1114    pshuflw             m1, m1, q0000
   1115    punpcklqdq          m1, m1
   1116    DEFINE_ARGS dst, stride, cnt
   1117    mov               cntd, 8
   1118 .loop_dc:
   1119    STORE_2x8            3, 4, 1, m2, m0, dstq,         mmsize
   1120    STORE_2x8            3, 4, 1, m2, m0, dstq+strideq, mmsize
   1121    lea               dstq, [dstq+strideq*2]
   1122    dec               cntd
   1123    jg .loop_dc
   1124    RET
   1125 
   1126 .idctfull:
   1127    mova   [rsp+64*mmsize], m0
   1128    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
   1129 %if ARCH_X86_64
   1130    mov            dstbakq, dstq
   1131    movsxd            cntq, cntd
   1132 %endif
   1133 %if PIC
   1134    lea               ptrq, [default_16x16]
   1135    movzx             cntd, byte [ptrq+cntq-1]
   1136 %else
   1137    movzx             cntd, byte [default_16x16+cntq-1]
   1138 %endif
   1139    mov              skipd, 4
   1140    sub              skipd, cntd
   1141    mov               ptrq, rsp
   1142 .loop_1:
   1143    IDCT16_1D       blockq
   1144 
   1145    TRANSPOSE4x4D        0, 1, 2, 3, 7
   1146    mova  [ptrq+ 1*mmsize], m0
   1147    mova  [ptrq+ 5*mmsize], m1
   1148    mova  [ptrq+ 9*mmsize], m2
   1149    mova  [ptrq+13*mmsize], m3
   1150    mova                m7, [rsp+65*mmsize]
   1151    TRANSPOSE4x4D        4, 5, 6, 7, 0
   1152    mova  [ptrq+ 2*mmsize], m4
   1153    mova  [ptrq+ 6*mmsize], m5
   1154    mova  [ptrq+10*mmsize], m6
   1155    mova  [ptrq+14*mmsize], m7
   1156    UNSCRATCH               0, 8, rsp+67*mmsize
   1157    UNSCRATCH               1, 9, rsp+68*mmsize
   1158    UNSCRATCH               2, 10, rsp+69*mmsize
   1159    UNSCRATCH               3, 11, rsp+70*mmsize
   1160    TRANSPOSE4x4D        0, 1, 2, 3, 7
   1161    mova  [ptrq+ 0*mmsize], m0
   1162    mova  [ptrq+ 4*mmsize], m1
   1163    mova  [ptrq+ 8*mmsize], m2
   1164    mova  [ptrq+12*mmsize], m3
   1165    UNSCRATCH               4, 12, rsp+71*mmsize
   1166    UNSCRATCH               5, 13, rsp+72*mmsize
   1167    UNSCRATCH               6, 14, rsp+73*mmsize
   1168    UNSCRATCH               7, 15, rsp+74*mmsize
   1169    TRANSPOSE4x4D        4, 5, 6, 7, 0
   1170    mova  [ptrq+ 3*mmsize], m4
   1171    mova  [ptrq+ 7*mmsize], m5
   1172    mova  [ptrq+11*mmsize], m6
   1173    mova  [ptrq+15*mmsize], m7
   1174    add               ptrq, 16 * mmsize
   1175    add             blockq, mmsize
   1176    dec               cntd
   1177    jg .loop_1
   1178 
   1179    ; zero-pad the remainder (skipped cols)
   1180    test             skipd, skipd
   1181    jz .end
   1182    add              skipd, skipd
   1183    lea             blockq, [blockq+skipq*(mmsize/2)]
   1184    pxor                m0, m0
   1185 .loop_z:
   1186    mova   [ptrq+mmsize*0], m0
   1187    mova   [ptrq+mmsize*1], m0
   1188    mova   [ptrq+mmsize*2], m0
   1189    mova   [ptrq+mmsize*3], m0
   1190    mova   [ptrq+mmsize*4], m0
   1191    mova   [ptrq+mmsize*5], m0
   1192    mova   [ptrq+mmsize*6], m0
   1193    mova   [ptrq+mmsize*7], m0
   1194    add               ptrq, 8 * mmsize
   1195    dec              skipd
   1196    jg .loop_z
   1197 .end:
   1198 
   1199    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
   1200    lea           stride3q, [strideq*3]
   1201    mov               cntd, 4
   1202    mov               ptrq, rsp
   1203 .loop_2:
   1204    IDCT16_1D         ptrq
   1205 
   1206    pxor               m7, m7
   1207    lea               dstq, [dstq+strideq*4]
   1208    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
   1209    lea               dstq, [dstq+strideq*4]
   1210    mova                m0, [rsp+65*mmsize]
   1211    mova                m1, [rsp+64*mmsize]
   1212    mova                m2, [pd_32]
   1213    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
   1214 
   1215 %if ARCH_X86_64
   1216    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
   1217 %else
   1218    mov               dstq, dstm
   1219 %endif
   1220    UNSCRATCH               0, 8, rsp+67*mmsize
   1221    UNSCRATCH               4, 9, rsp+68*mmsize
   1222    UNSCRATCH               5, 10, rsp+69*mmsize
   1223    UNSCRATCH               3, 11, rsp+70*mmsize
   1224    ROUND_AND_STORE_4x4  0, 4, 5, 3, m7, m1, m2, 6
   1225 %if ARCH_X86_64
   1226    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
   1227    lea               dstq, [dstbakq+stride3q*4]
   1228 %else
   1229    lea               dstq, [dstq+stride3q*4]
   1230 %endif
   1231    UNSCRATCH               4, 12, rsp+71*mmsize
   1232    UNSCRATCH               5, 13, rsp+72*mmsize
   1233    UNSCRATCH               6, 14, rsp+73*mmsize
   1234    UNSCRATCH               0, 15, rsp+74*mmsize
   1235    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
   1236 
   1237    add               ptrq, mmsize
   1238 %if ARCH_X86_64
   1239    add            dstbakq, 8
   1240    mov               dstq, dstbakq
   1241 %else
   1242    add         dword dstm, 8
   1243    mov               dstq, dstm
   1244 %endif
   1245    dec               cntd
   1246    jg .loop_2
   1247 
   1248    ; m7 is still zero
   1249    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
   1250    RET
   1251 
   1252 INIT_XMM sse2
   1253 cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
   1254                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
   1255                                    dst, stride, block, eob
   1256    mova                m0, [pw_4095]
   1257    cmp               eobd, 1
   1258    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull
   1259 
   1260    ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
   1261    ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
   1262    DEFINE_ARGS dst, stride, block, coef, coefl
   1263    pxor                m2, m2
   1264    DC_ONLY_64BIT        6, m2
   1265    movd                m1, coefd
   1266    pshuflw             m1, m1, q0000
   1267    punpcklqdq          m1, m1
   1268    DEFINE_ARGS dst, stride, cnt
   1269    mov               cntd, 8
   1270 .loop_dc:
   1271    STORE_2x8            3, 4, 1, m2, m0, dstq,         mmsize
   1272    STORE_2x8            3, 4, 1, m2, m0, dstq+strideq, mmsize
   1273    lea               dstq, [dstq+strideq*2]
   1274    dec               cntd
   1275    jg .loop_dc
   1276    RET
   1277 
   1278 ; r65-69 are available for spills
   1279 ; r70-77 are available on x86-32 only (x86-64 should use m8-15)
   1280 ; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77
   1281 %macro IADST16_1D 1 ; src
   1282    mova                m0, [%1+ 0*4*mmsize]        ; in0
   1283    mova                m1, [%1+ 7*4*mmsize]        ; in7
   1284    mova                m2, [%1+ 8*4*mmsize]        ; in8
   1285    mova                m3, [%1+15*4*mmsize]        ; in15
   1286    SUMSUB_MUL_D         3, 0, 4, 5, 16364,  804    ; m3/4=t0, m0/5=t1
   1287    SUMSUB_MUL_D         1, 2, 6, 7, 11003, 12140   ; m1/6=t8, m2/7=t9
   1288    SCRATCH              0, 8, rsp+70*mmsize
   1289    SUMSUB_PACK_D        1, 3, 6, 4, 0              ; m1=t0a, m3=t8a
   1290    UNSCRATCH            0, 8, rsp+70*mmsize
   1291    SUMSUB_PACK_D        2, 0, 7, 5, 4              ; m2=t1a, m0=t9a
   1292    mova   [rsp+67*mmsize], m1
   1293    SCRATCH              2, 9, rsp+71*mmsize
   1294    SCRATCH              3, 12, rsp+74*mmsize
   1295    SCRATCH              0, 13, rsp+75*mmsize
   1296 
   1297    mova                m0, [%1+ 3*4*mmsize]        ; in3
   1298    mova                m1, [%1+ 4*4*mmsize]        ; in4
   1299    mova                m2, [%1+11*4*mmsize]        ; in11
   1300    mova                m3, [%1+12*4*mmsize]        ; in12
   1301    SUMSUB_MUL_D         2, 1, 4, 5, 14811,  7005   ; m2/4=t4, m1/5=t5
   1302    SUMSUB_MUL_D         0, 3, 6, 7,  5520, 15426   ; m0/6=t12, m3/7=t13
   1303    SCRATCH              1, 10, rsp+72*mmsize
   1304    SUMSUB_PACK_D        0, 2, 6, 4, 1              ; m0=t4a, m2=t12a
   1305    UNSCRATCH            1, 10, rsp+72*mmsize
   1306    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=t5a, m1=t13a
   1307    SCRATCH              0, 15, rsp+77*mmsize
   1308    SCRATCH              3, 11, rsp+73*mmsize
   1309 
   1310    UNSCRATCH            0, 12, rsp+74*mmsize       ; t8a
   1311    UNSCRATCH            3, 13, rsp+75*mmsize       ; t9a
   1312    SUMSUB_MUL_D         0, 3, 4, 5, 16069,  3196   ; m0/4=t8, m3/5=t9
   1313    SUMSUB_MUL_D         1, 2, 6, 7,  3196, 16069   ; m1/6=t13, m2/7=t12
   1314    SCRATCH              1, 12, rsp+74*mmsize
   1315    SUMSUB_PACK_D        2, 0, 7, 4, 1              ; m2=t8a, m0=t12a
   1316    UNSCRATCH            1, 12, rsp+74*mmsize
   1317    SUMSUB_PACK_D        1, 3, 6, 5, 4              ; m1=t9a, m3=t13a
   1318    mova   [rsp+65*mmsize], m2
   1319    mova   [rsp+66*mmsize], m1
   1320    SCRATCH              0, 8, rsp+70*mmsize
   1321    SCRATCH              3, 12, rsp+74*mmsize
   1322 
   1323    mova                m0, [%1+ 2*4*mmsize]        ; in2
   1324    mova                m1, [%1+ 5*4*mmsize]        ; in5
   1325    mova                m2, [%1+10*4*mmsize]        ; in10
   1326    mova                m3, [%1+13*4*mmsize]        ; in13
   1327    SUMSUB_MUL_D         3, 0, 4, 5, 15893,  3981   ; m3/4=t2, m0/5=t3
   1328    SUMSUB_MUL_D         1, 2, 6, 7,  8423, 14053   ; m1/6=t10, m2/7=t11
   1329    SCRATCH              0, 10, rsp+72*mmsize
   1330    SUMSUB_PACK_D        1, 3, 6, 4, 0              ; m1=t2a, m3=t10a
   1331    UNSCRATCH            0, 10, rsp+72*mmsize
   1332    SUMSUB_PACK_D        2, 0, 7, 5, 4              ; m2=t3a, m0=t11a
   1333    mova   [rsp+68*mmsize], m1
   1334    mova   [rsp+69*mmsize], m2
   1335    SCRATCH              3, 13, rsp+75*mmsize
   1336    SCRATCH              0, 14, rsp+76*mmsize
   1337 
   1338    mova                m0, [%1+ 1*4*mmsize]        ; in1
   1339    mova                m1, [%1+ 6*4*mmsize]        ; in6
   1340    mova                m2, [%1+ 9*4*mmsize]        ; in9
   1341    mova                m3, [%1+14*4*mmsize]        ; in14
   1342    SUMSUB_MUL_D         2, 1, 4, 5, 13160,  9760   ; m2/4=t6, m1/5=t7
   1343    SUMSUB_MUL_D         0, 3, 6, 7,  2404, 16207   ; m0/6=t14, m3/7=t15
   1344    SCRATCH              1, 10, rsp+72*mmsize
   1345    SUMSUB_PACK_D        0, 2, 6, 4, 1              ; m0=t6a, m2=t14a
   1346    UNSCRATCH            1, 10, rsp+72*mmsize
   1347    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=t7a, m1=t15a
   1348 
   1349    UNSCRATCH            4, 13, rsp+75*mmsize       ; t10a
   1350    UNSCRATCH            5, 14, rsp+76*mmsize       ; t11a
   1351    SCRATCH              0, 13, rsp+75*mmsize
   1352    SCRATCH              3, 14, rsp+76*mmsize
   1353    SUMSUB_MUL_D         4, 5, 6, 7,  9102, 13623   ; m4/6=t10, m5/7=t11
   1354    SUMSUB_MUL_D         1, 2, 0, 3, 13623,  9102   ; m1/0=t15, m2/3=t14
   1355    SCRATCH              0, 10, rsp+72*mmsize
   1356    SUMSUB_PACK_D        2, 4, 3, 6, 0              ; m2=t10a, m4=t14a
   1357    UNSCRATCH            0, 10, rsp+72*mmsize
   1358    SUMSUB_PACK_D        1, 5, 0, 7, 6              ; m1=t11a, m5=t15a
   1359 
   1360    UNSCRATCH            0, 8, rsp+70*mmsize        ; t12a
   1361    UNSCRATCH            3, 12, rsp+74*mmsize       ; t13a
   1362    SCRATCH              2, 8, rsp+70*mmsize
   1363    SCRATCH              1, 12, rsp+74*mmsize
   1364    SUMSUB_MUL_D         0, 3, 1, 2, 15137,  6270   ; m0/1=t12, m3/2=t13
   1365    SUMSUB_MUL_D         5, 4, 7, 6,  6270, 15137   ; m5/7=t15, m4/6=t14
   1366    SCRATCH              2, 10, rsp+72*mmsize
   1367    SUMSUB_PACK_D        4, 0, 6, 1, 2              ; m4=out2, m0=t14a
   1368    UNSCRATCH            2, 10, rsp+72*mmsize
   1369    SUMSUB_PACK_D        5, 3, 7, 2, 1              ; m5=-out13, m3=t15a
   1370    NEGD                m5                          ; m5=out13
   1371 
   1372    UNSCRATCH            1, 9, rsp+71*mmsize        ; t1a
   1373    mova                m2, [rsp+68*mmsize]         ; t2a
   1374    UNSCRATCH            6, 13, rsp+75*mmsize       ; t6a
   1375    UNSCRATCH            7, 14, rsp+76*mmsize       ; t7a
   1376    SCRATCH              4, 10, rsp+72*mmsize
   1377    SCRATCH              5, 13, rsp+75*mmsize
   1378    UNSCRATCH            4, 15, rsp+77*mmsize       ; t4a
   1379    UNSCRATCH            5, 11, rsp+73*mmsize       ; t5a
   1380    SCRATCH              0, 14, rsp+76*mmsize
   1381    SCRATCH              3, 15, rsp+77*mmsize
   1382    mova                m0, [rsp+67*mmsize]         ; t0a
   1383    SUMSUB_BA         d, 4, 0, 3                    ; m4=t0, m0=t4
   1384    SUMSUB_BA         d, 5, 1, 3                    ; m5=t1, m1=t5
   1385    SUMSUB_BA         d, 6, 2, 3                    ; m6=t2, m2=t6
   1386    SCRATCH              4, 9, rsp+71*mmsize
   1387    mova                m3, [rsp+69*mmsize]         ; t3a
   1388    SUMSUB_BA         d, 7, 3, 4                    ; m7=t3, m3=t7
   1389 
   1390    mova   [rsp+67*mmsize], m5
   1391    mova   [rsp+68*mmsize], m6
   1392    mova   [rsp+69*mmsize], m7
   1393    SUMSUB_MUL_D         0, 1, 4, 5, 15137,  6270   ; m0/4=t4a, m1/5=t5a
   1394    SUMSUB_MUL_D         3, 2, 7, 6,  6270, 15137   ; m3/7=t7a, m2/6=t6a
   1395    SCRATCH              1, 11, rsp+73*mmsize
   1396    SUMSUB_PACK_D        2, 0, 6, 4, 1              ; m2=-out3, m0=t6
   1397    NEGD                m2                          ; m2=out3
   1398    UNSCRATCH            1, 11, rsp+73*mmsize
   1399    SUMSUB_PACK_D        3, 1, 7, 5, 4              ; m3=out12, m1=t7
   1400    SCRATCH              2, 11, rsp+73*mmsize
   1401    UNSCRATCH            2, 12, rsp+74*mmsize       ; t11a
   1402    SCRATCH              3, 12, rsp+74*mmsize
   1403 
   1404    UNSCRATCH            3, 8, rsp+70*mmsize        ; t10a
   1405    mova                m4, [rsp+65*mmsize]         ; t8a
   1406    mova                m5, [rsp+66*mmsize]         ; t9a
   1407    SUMSUB_BA         d, 3, 4, 6                    ; m3=-out1, m4=t10
   1408    NEGD                m3                          ; m3=out1
   1409    SUMSUB_BA         d, 2, 5, 6                    ; m2=out14, m5=t11
   1410    UNSCRATCH            6, 9, rsp+71*mmsize        ; t0
   1411    UNSCRATCH            7, 14, rsp+76*mmsize       ; t14a
   1412    SCRATCH              3, 9, rsp+71*mmsize
   1413    SCRATCH              2, 14, rsp+76*mmsize
   1414 
   1415    SUMSUB_MUL           1, 0, 2, 3, 11585, 11585   ; m1=out4, m0=out11
   1416    mova   [rsp+65*mmsize], m0
   1417    SUMSUB_MUL           5, 4, 2, 3, 11585, 11585   ; m5=out6, m4=out9
   1418    UNSCRATCH            0, 15, rsp+77*mmsize       ; t15a
   1419    SUMSUB_MUL           7, 0, 2, 3, 11585, m11585  ; m7=out10, m0=out5
   1420 
   1421    mova                m2, [rsp+68*mmsize]         ; t2
   1422    SUMSUB_BA         d, 2, 6, 3                    ; m2=out0, m6=t2a
   1423    SCRATCH              2, 8, rsp+70*mmsize
   1424    mova                m2, [rsp+67*mmsize]         ; t1
   1425    mova                m3, [rsp+69*mmsize]         ; t3
   1426    mova   [rsp+67*mmsize], m7
   1427    SUMSUB_BA         d, 3, 2, 7                    ; m3=-out15, m2=t3a
   1428    NEGD                m3                          ; m3=out15
   1429    SCRATCH              3, 15, rsp+77*mmsize
   1430    SUMSUB_MUL           6, 2, 7, 3, 11585, m11585  ; m6=out8, m2=out7
   1431    mova                m7, [rsp+67*mmsize]
   1432 
   1433    SWAP                 0, 1
   1434    SWAP                 2, 5, 4, 6, 7, 3
   1435 %endmacro
   1436 
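        ; IADST16_FN emits the 10 and 12 bpp 16x16 add functions for one
        ; idct/iadst combination; going by how the arguments are used below:
        ;   %1/%4 - pass 1/pass 2 transform names (for the function name)
        ;   %2/%5 - the matching 1D macros, %3/%6 - the rsp slot (in mmsize
        ;           units) where each 1D macro scratches m8-15
        ;   %7    - prefix of the eob table (%7_16x16) consulted in pass 1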
   1437 %macro IADST16_FN 7
   1438 cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
   1439                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
   1440                                dst, stride, block, eob
   1441    mova                m0, [pw_1023]
   1442 
   1443 .body:
   1444    mova   [rsp+64*mmsize], m0
   1445    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
   1446 %if ARCH_X86_64
   1447    mov            dstbakq, dstq
   1448    movsxd            cntq, cntd
   1449 %endif
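           ; look up, by eob, how many 4-column slices need a full first pass;
           ; the remaining "skip" slices are merely zero-padded below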
   1450 %if PIC
   1451    lea               ptrq, [%7_16x16]
   1452    movzx             cntd, byte [ptrq+cntq-1]
   1453 %else
   1454    movzx             cntd, byte [%7_16x16+cntq-1]
   1455 %endif
   1456    mov              skipd, 4
   1457    sub              skipd, cntd
   1458    mov               ptrq, rsp
   1459 .loop_1:
   1460    %2_1D           blockq
   1461 
   1462    TRANSPOSE4x4D        0, 1, 2, 3, 7
   1463    mova  [ptrq+ 1*mmsize], m0
   1464    mova  [ptrq+ 5*mmsize], m1
   1465    mova  [ptrq+ 9*mmsize], m2
   1466    mova  [ptrq+13*mmsize], m3
   1467    mova                m7, [rsp+65*mmsize]
   1468    TRANSPOSE4x4D        4, 5, 6, 7, 0
   1469    mova  [ptrq+ 2*mmsize], m4
   1470    mova  [ptrq+ 6*mmsize], m5
   1471    mova  [ptrq+10*mmsize], m6
   1472    mova  [ptrq+14*mmsize], m7
   1473    UNSCRATCH               0, 8, rsp+(%3+0)*mmsize
   1474    UNSCRATCH               1, 9, rsp+(%3+1)*mmsize
   1475    UNSCRATCH               2, 10, rsp+(%3+2)*mmsize
   1476    UNSCRATCH               3, 11, rsp+(%3+3)*mmsize
   1477    TRANSPOSE4x4D        0, 1, 2, 3, 7
   1478    mova  [ptrq+ 0*mmsize], m0
   1479    mova  [ptrq+ 4*mmsize], m1
   1480    mova  [ptrq+ 8*mmsize], m2
   1481    mova  [ptrq+12*mmsize], m3
   1482    UNSCRATCH               4, 12, rsp+(%3+4)*mmsize
   1483    UNSCRATCH               5, 13, rsp+(%3+5)*mmsize
   1484    UNSCRATCH               6, 14, rsp+(%3+6)*mmsize
   1485    UNSCRATCH               7, 15, rsp+(%3+7)*mmsize
   1486    TRANSPOSE4x4D        4, 5, 6, 7, 0
   1487    mova  [ptrq+ 3*mmsize], m4
   1488    mova  [ptrq+ 7*mmsize], m5
   1489    mova  [ptrq+11*mmsize], m6
   1490    mova  [ptrq+15*mmsize], m7
   1491    add               ptrq, 16 * mmsize
   1492    add             blockq, mmsize
   1493    dec               cntd
   1494    jg .loop_1
   1495 
   1496    ; zero-pad the remainder (skipped cols)
   1497    test             skipd, skipd
   1498    jz .end
   1499    add              skipd, skipd
   1500    lea             blockq, [blockq+skipq*(mmsize/2)]
   1501    pxor                m0, m0
   1502 .loop_z:
   1503    mova   [ptrq+mmsize*0], m0
   1504    mova   [ptrq+mmsize*1], m0
   1505    mova   [ptrq+mmsize*2], m0
   1506    mova   [ptrq+mmsize*3], m0
   1507    mova   [ptrq+mmsize*4], m0
   1508    mova   [ptrq+mmsize*5], m0
   1509    mova   [ptrq+mmsize*6], m0
   1510    mova   [ptrq+mmsize*7], m0
   1511    add               ptrq, 8 * mmsize
   1512    dec              skipd
   1513    jg .loop_z
   1514 .end:
   1515 
   1516    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
   1517    lea           stride3q, [strideq*3]
   1518    mov               cntd, 4
   1519    mov               ptrq, rsp
   1520 .loop_2:
   1521    %5_1D             ptrq
   1522 
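           ; roughly, per pixel: dst = clip(dst + ((coef + 32) >> 6), 0, max),
           ; with max (pw_1023 or pw_4095) kept at rsp+64*mmsize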
   1523    pxor                m7, m7
   1524    lea               dstq, [dstq+strideq*4]
   1525    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
   1526    lea               dstq, [dstq+strideq*4]
   1527    mova                m0, [rsp+65*mmsize]
   1528    mova                m1, [rsp+64*mmsize]
   1529    mova                m2, [pd_32]
   1530    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
   1531 
   1532 %if ARCH_X86_64
   1533    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
   1534 %else
   1535    mov               dstq, dstm
   1536 %endif
   1537    UNSCRATCH               0, 8, rsp+(%6+0)*mmsize
   1538    UNSCRATCH               4, 9, rsp+(%6+1)*mmsize
   1539    UNSCRATCH               5, 10, rsp+(%6+2)*mmsize
   1540    UNSCRATCH               3, 11, rsp+(%6+3)*mmsize
   1541    ROUND_AND_STORE_4x4  0, 4, 5, 3, m7, m1, m2, 6
   1542 %if ARCH_X86_64
   1543    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
   1544    lea               dstq, [dstbakq+stride3q*4]
   1545 %else
   1546    lea               dstq, [dstq+stride3q*4]
   1547 %endif
   1548    UNSCRATCH               4, 12, rsp+(%6+4)*mmsize
   1549    UNSCRATCH               5, 13, rsp+(%6+5)*mmsize
   1550    UNSCRATCH               6, 14, rsp+(%6+6)*mmsize
   1551    UNSCRATCH               0, 15, rsp+(%6+7)*mmsize
   1552    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
   1553 
   1554    add               ptrq, mmsize
   1555 %if ARCH_X86_64
   1556    add            dstbakq, 8
   1557    mov               dstq, dstbakq
   1558 %else
   1559    add         dword dstm, 8
   1560    mov               dstq, dstm
   1561 %endif
   1562    dec               cntd
   1563    jg .loop_2
   1564 
   1565    ; m7 is still zero
   1566    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
   1567    RET
   1568 
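        ; the 12 bpp variant only differs in the pixel clip value, so it loads
        ; pw_4095 instead of pw_1023 and tail-jumps into the shared .body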
   1569 cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
   1570                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
   1571                                dst, stride, block, eob
   1572    mova                m0, [pw_4095]
   1573    jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body
   1574 %endmacro
   1575 
   1576 INIT_XMM sse2
   1577 IADST16_FN idct,  IDCT16,  67, iadst, IADST16, 70, row
   1578 IADST16_FN iadst, IADST16, 70, idct,  IDCT16,  67, col
   1579 IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default
   1580 
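        ; IDCT32_1D runs one 1D pass of the 32-point idct: the even-indexed
        ; inputs are handled by reusing IDCT16_1D at twice the source stride,
        ; and only the odd half (in1..in31) is computed here. With %1 == 1 the
        ; outputs are transposed and stored to ptrq, with %1 == 2 they are
        ; rounded and added to dstq; %2 is the source and %3 its stride.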
   1581 %macro IDCT32_1D 2-3 8 * mmsize ; pass[1/2], src, src_stride
   1582    IDCT16_1D %2, 2 * %3, 272, 257
   1583 %if ARCH_X86_64
   1584    mova  [rsp+257*mmsize], m8
   1585    mova  [rsp+258*mmsize], m9
   1586    mova  [rsp+259*mmsize], m10
   1587    mova  [rsp+260*mmsize], m11
   1588    mova  [rsp+261*mmsize], m12
   1589    mova  [rsp+262*mmsize], m13
   1590    mova  [rsp+263*mmsize], m14
   1591    mova  [rsp+264*mmsize], m15
   1592 %endif
   1593    mova  [rsp+265*mmsize], m0
   1594    mova  [rsp+266*mmsize], m1
   1595    mova  [rsp+267*mmsize], m2
   1596    mova  [rsp+268*mmsize], m3
   1597    mova  [rsp+269*mmsize], m4
   1598    mova  [rsp+270*mmsize], m5
   1599    mova  [rsp+271*mmsize], m6
   1600 
   1601    ; r257-260: t0-3
   1602    ; r265-272: t4/5a/6a/7/8/9a/10/11a
   1603    ; r261-264: t12a/13/14a/15
   1604    ; r273-274 are free as scratch space, and r275-282 mirror m8-15 on 32bit
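           ; (SCRATCH/UNSCRATCH keep the value in the named xmm register on
           ; 64bit and fall back to the given rsp slot on 32bit, hence the
           ; slot is always passed along)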
   1605 
   1606    mova                m0, [%2+ 1*%3]              ; in1
   1607    mova                m1, [%2+15*%3]              ; in15
   1608    mova                m2, [%2+17*%3]              ; in17
   1609    mova                m3, [%2+31*%3]              ; in31
   1610    SUMSUB_MUL           0, 3, 4, 5, 16364,  804    ; m0=t31a, m3=t16a
   1611    SUMSUB_MUL           2, 1, 4, 5, 11003, 12140   ; m2=t30a, m1=t17a
   1612    SUMSUB_BA         d, 1, 3, 4                    ; m1=t16, m3=t17
   1613    SUMSUB_BA         d, 2, 0, 4                    ; m2=t31, m0=t30
   1614    SUMSUB_MUL           0, 3, 4, 5, 16069,  3196   ; m0=t30a, m3=t17a
   1615    SCRATCH              0, 8, rsp+275*mmsize
   1616    SCRATCH              2, 9, rsp+276*mmsize
   1617 
   1618    ; end of stage 1-3 first quarter
   1619 
   1620    mova                m0, [%2+ 7*%3]              ; in7
   1621    mova                m2, [%2+ 9*%3]              ; in9
   1622    mova                m4, [%2+23*%3]              ; in23
   1623    mova                m5, [%2+25*%3]              ; in25
   1624    SUMSUB_MUL           2, 4, 6, 7, 14811,  7005   ; m2=t29a, m4=t18a
   1625    SUMSUB_MUL           5, 0, 6, 7,  5520, 15426   ; m5=t28a, m0=t19a
   1626    SUMSUB_BA         d, 4, 0, 6                    ; m4=t19, m0=t18
   1627    SUMSUB_BA         d, 2, 5, 6                    ; m2=t28, m5=t29
   1628    SUMSUB_MUL           5, 0, 6, 7,  3196, m16069  ; m5=t29a, m0=t18a
   1629 
   1630    ; end of stage 1-3 second quarter
   1631 
   1632    SUMSUB_BA         d, 4, 1, 6                    ; m4=t16a, m1=t19a
   1633    SUMSUB_BA         d, 0, 3, 6                    ; m0=t17, m3=t18
   1634    UNSCRATCH            6, 8, rsp+275*mmsize       ; t30a
   1635    UNSCRATCH            7, 9, rsp+276*mmsize       ; t31
   1636    mova  [rsp+273*mmsize], m4
   1637    mova  [rsp+274*mmsize], m0
   1638    SUMSUB_BA         d, 2, 7, 0                    ; m2=t31a, m7=t28a
   1639    SUMSUB_BA         d, 5, 6, 0                    ; m5=t30, m6=t29
   1640    SUMSUB_MUL           6, 3, 0, 4, 15137,  6270   ; m6=t29a, m3=t18a
   1641    SUMSUB_MUL           7, 1, 0, 4, 15137,  6270   ; m7=t28, m1=t19
   1642    SCRATCH              3, 10, rsp+277*mmsize
   1643    SCRATCH              1, 11, rsp+278*mmsize
   1644    SCRATCH              7, 12, rsp+279*mmsize
   1645    SCRATCH              6, 13, rsp+280*mmsize
   1646    SCRATCH              5, 14, rsp+281*mmsize
   1647    SCRATCH              2, 15, rsp+282*mmsize
   1648 
   1649    ; end of stage 4-5 first half
   1650 
   1651    mova                m0, [%2+ 5*%3]              ; in5
   1652    mova                m1, [%2+11*%3]              ; in11
   1653    mova                m2, [%2+21*%3]              ; in21
   1654    mova                m3, [%2+27*%3]              ; in27
   1655    SUMSUB_MUL           0, 3, 4, 5, 15893,  3981   ; m0=t27a, m3=t20a
   1656    SUMSUB_MUL           2, 1, 4, 5,  8423, 14053   ; m2=t26a, m1=t21a
   1657    SUMSUB_BA         d, 1, 3, 4                    ; m1=t20, m3=t21
   1658    SUMSUB_BA         d, 2, 0, 4                    ; m2=t27, m0=t26
   1659    SUMSUB_MUL           0, 3, 4, 5,  9102, 13623   ; m0=t26a, m3=t21a
   1660    SCRATCH              0, 8, rsp+275*mmsize
   1661    SCRATCH              2, 9, rsp+276*mmsize
   1662 
   1663    ; end of stage 1-3 third quarter
   1664 
   1665    mova                m0, [%2+ 3*%3]              ; in3
   1666    mova                m2, [%2+13*%3]              ; in13
   1667    mova                m4, [%2+19*%3]              ; in19
   1668    mova                m5, [%2+29*%3]              ; in29
   1669    SUMSUB_MUL           2, 4, 6, 7, 13160,  9760   ; m2=t25a, m4=t22a
   1670    SUMSUB_MUL           5, 0, 6, 7,  2404, 16207   ; m5=t24a, m0=t23a
   1671    SUMSUB_BA         d, 4, 0, 6                    ; m4=t23, m0=t22
   1672    SUMSUB_BA         d, 2, 5, 6                    ; m2=t24, m5=t25
   1673    SUMSUB_MUL           5, 0, 6, 7, 13623, m9102   ; m5=t25a, m0=t22a
   1674 
   1675    ; end of stage 1-3 fourth quarter
   1676 
   1677    SUMSUB_BA         d, 1, 4, 6                    ; m1=t23a, m4=t20a
   1678    SUMSUB_BA         d, 3, 0, 6                    ; m3=t22, m0=t21
   1679    UNSCRATCH            6, 8, rsp+275*mmsize       ; t26a
   1680    UNSCRATCH            7, 9, rsp+276*mmsize       ; t27
   1681    SCRATCH              3, 8, rsp+275*mmsize
   1682    SCRATCH              1, 9, rsp+276*mmsize
   1683    SUMSUB_BA         d, 7, 2, 1                    ; m7=t24a, m2=t27a
   1684    SUMSUB_BA         d, 6, 5, 1                    ; m6=t25, m5=t26
   1685    SUMSUB_MUL           2, 4, 1, 3,  6270, m15137  ; m2=t27, m4=t20
   1686    SUMSUB_MUL           5, 0, 1, 3,  6270, m15137  ; m5=t26a, m0=t21a
   1687 
   1688    ; end of stage 4-5 second half
   1689 
   1690    UNSCRATCH            1, 12, rsp+279*mmsize      ; t28
   1691    UNSCRATCH            3, 13, rsp+280*mmsize      ; t29a
   1692    SCRATCH              4, 12, rsp+279*mmsize
   1693    SCRATCH              0, 13, rsp+280*mmsize
   1694    SUMSUB_BA         d, 5, 3, 0                    ; m5=t29, m3=t26
   1695    SUMSUB_BA         d, 2, 1, 0                    ; m2=t28a, m1=t27a
   1696    UNSCRATCH            0, 14, rsp+281*mmsize      ; t30
   1697    UNSCRATCH            4, 15, rsp+282*mmsize      ; t31a
   1698    SCRATCH              2, 14, rsp+281*mmsize
   1699    SCRATCH              5, 15, rsp+282*mmsize
   1700    SUMSUB_BA         d, 6, 0, 2                    ; m6=t30a, m0=t25a
   1701    SUMSUB_BA         d, 7, 4, 2                    ; m7=t31, m4=t24
   1702 
   1703    mova                m2, [rsp+273*mmsize]        ; t16a
   1704    mova                m5, [rsp+274*mmsize]        ; t17
   1705    mova  [rsp+273*mmsize], m6
   1706    mova  [rsp+274*mmsize], m7
   1707    UNSCRATCH            6, 10, rsp+277*mmsize      ; t18a
   1708    UNSCRATCH            7, 11, rsp+278*mmsize      ; t19
   1709    SCRATCH              4, 10, rsp+277*mmsize
   1710    SCRATCH              0, 11, rsp+278*mmsize
   1711    UNSCRATCH            4, 12, rsp+279*mmsize      ; t20
   1712    UNSCRATCH            0, 13, rsp+280*mmsize      ; t21a
   1713    SCRATCH              3, 12, rsp+279*mmsize
   1714    SCRATCH              1, 13, rsp+280*mmsize
   1715    SUMSUB_BA         d, 0, 6, 1                    ; m0=t18, m6=t21
   1716    SUMSUB_BA         d, 4, 7, 1                    ; m4=t19a, m7=t20a
   1717    UNSCRATCH            3, 8, rsp+275*mmsize       ; t22
   1718    UNSCRATCH            1, 9, rsp+276*mmsize       ; t23a
   1719    SCRATCH              0, 8, rsp+275*mmsize
   1720    SCRATCH              4, 9, rsp+276*mmsize
   1721    SUMSUB_BA         d, 3, 5, 0                    ; m3=t17a, m5=t22a
   1722    SUMSUB_BA         d, 1, 2, 0                    ; m1=t16, m2=t23
   1723 
   1724    ; end of stage 6
   1725 
   1726    UNSCRATCH            0, 10, rsp+277*mmsize      ; t24
   1727    UNSCRATCH            4, 11, rsp+278*mmsize      ; t25a
   1728    SCRATCH              1, 10, rsp+277*mmsize
   1729    SCRATCH              3, 11, rsp+278*mmsize
   1730    SUMSUB_MUL           0, 2, 1, 3, 11585, 11585   ; m0=t24a, m2=t23a
   1731    SUMSUB_MUL           4, 5, 1, 3, 11585, 11585   ; m4=t25, m5=t22
   1732    UNSCRATCH            1, 12, rsp+279*mmsize      ; t26
   1733    UNSCRATCH            3, 13, rsp+280*mmsize      ; t27a
   1734    SCRATCH              0, 12, rsp+279*mmsize
   1735    SCRATCH              4, 13, rsp+280*mmsize
   1736    SUMSUB_MUL           3, 7, 0, 4, 11585, 11585   ; m3=t27, m7=t20
   1737    SUMSUB_MUL           1, 6, 0, 4, 11585, 11585   ; m1=t26a, m6=t21a
   1738 
   1739    ; end of stage 7
   1740 
   1741    mova                m0, [rsp+269*mmsize]        ; t8
   1742    mova                m4, [rsp+270*mmsize]        ; t9a
   1743    mova  [rsp+269*mmsize], m1                      ; t26a
   1744    mova  [rsp+270*mmsize], m3                      ; t27
   1745    mova                m3, [rsp+271*mmsize]        ; t10
   1746    SUMSUB_BA         d, 2, 0, 1                    ; m2=out8, m0=out23
   1747    SUMSUB_BA         d, 5, 4, 1                    ; m5=out9, m4=out22
   1748    SUMSUB_BA         d, 6, 3, 1                    ; m6=out10, m3=out21
   1749    mova                m1, [rsp+272*mmsize]        ; t11a
   1750    mova  [rsp+271*mmsize], m0
   1751    SUMSUB_BA         d, 7, 1, 0                    ; m7=out11, m1=out20
   1752 
   1753 %if %1 == 1
   1754    TRANSPOSE4x4D        2, 5, 6, 7, 0
   1755    mova  [ptrq+ 2*mmsize], m2
   1756    mova  [ptrq+10*mmsize], m5
   1757    mova  [ptrq+18*mmsize], m6
   1758    mova  [ptrq+26*mmsize], m7
   1759 %else ; %1 == 2
   1760    pxor                m0, m0
   1761    lea               dstq, [dstq+strideq*8]
   1762    ROUND_AND_STORE_4x4  2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
   1763 %endif
   1764    mova                m2, [rsp+271*mmsize]
   1765 %if %1 == 1
   1766    TRANSPOSE4x4D        1, 3, 4, 2, 0
   1767    mova  [ptrq+ 5*mmsize], m1
   1768    mova  [ptrq+13*mmsize], m3
   1769    mova  [ptrq+21*mmsize], m4
   1770    mova  [ptrq+29*mmsize], m2
   1771 %else ; %1 == 2
   1772    lea               dstq, [dstq+stride3q*4]
   1773    ROUND_AND_STORE_4x4  1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6
   1774 %endif
   1775 
   1776    ; end of last stage + store for out8-11 and out20-23
   1777 
   1778    UNSCRATCH            0, 9, rsp+276*mmsize       ; t19a
   1779    UNSCRATCH            1, 8, rsp+275*mmsize       ; t18
   1780    UNSCRATCH            2, 11, rsp+278*mmsize      ; t17a
   1781    UNSCRATCH            3, 10, rsp+277*mmsize      ; t16
   1782    mova                m7, [rsp+261*mmsize]        ; t12a
   1783    mova                m6, [rsp+262*mmsize]        ; t13
   1784    mova                m5, [rsp+263*mmsize]        ; t14a
   1785    SUMSUB_BA         d, 0, 7, 4                    ; m0=out12, m7=out19
   1786    SUMSUB_BA         d, 1, 6, 4                    ; m1=out13, m6=out18
   1787    SUMSUB_BA         d, 2, 5, 4                    ; m2=out14, m5=out17
   1788    mova                m4, [rsp+264*mmsize]        ; t15
   1789    SCRATCH              7, 8, rsp+275*mmsize
   1790    SUMSUB_BA         d, 3, 4, 7                    ; m3=out15, m4=out16
   1791 
   1792 %if %1 == 1
   1793    TRANSPOSE4x4D        0, 1, 2, 3, 7
   1794    mova  [ptrq+ 3*mmsize], m0
   1795    mova  [ptrq+11*mmsize], m1
   1796    mova  [ptrq+19*mmsize], m2
   1797    mova  [ptrq+27*mmsize], m3
   1798 %else ; %1 == 2
   1799 %if ARCH_X86_64
   1800    SWAP                 7, 9
   1801    lea               dstq, [dstbakq+stride3q*4]
   1802 %else ; x86-32
   1803    pxor                m7, m7
   1804    mov               dstq, dstm
   1805    lea               dstq, [dstq+stride3q*4]
   1806 %endif
   1807    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
   1808 %endif
   1809    UNSCRATCH            0, 8, rsp+275*mmsize       ; out19
   1810 %if %1 == 1
   1811    TRANSPOSE4x4D        4, 5, 6, 0, 7
   1812    mova  [ptrq+ 4*mmsize], m4
   1813    mova  [ptrq+12*mmsize], m5
   1814    mova  [ptrq+20*mmsize], m6
   1815    mova  [ptrq+28*mmsize], m0
   1816 %else ; %1 == 2
   1817    lea               dstq, [dstq+strideq*4]
   1818    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
   1819 %endif
   1820 
   1821    ; end of last stage + store for out12-19
   1822 
   1823 %if ARCH_X86_64
   1824    SWAP                 7, 8
   1825 %endif
   1826    mova                m7, [rsp+257*mmsize]        ; t0
   1827    mova                m6, [rsp+258*mmsize]        ; t1
   1828    mova                m5, [rsp+259*mmsize]        ; t2
   1829    mova                m4, [rsp+260*mmsize]        ; t3
   1830    mova                m0, [rsp+274*mmsize]        ; t31
   1831    mova                m1, [rsp+273*mmsize]        ; t30a
   1832    UNSCRATCH            2, 15, rsp+282*mmsize      ; t29
   1833    SUMSUB_BA         d, 0, 7, 3                    ; m0=out0, m7=out31
   1834    SUMSUB_BA         d, 1, 6, 3                    ; m1=out1, m6=out30
   1835    SUMSUB_BA         d, 2, 5, 3                    ; m2=out2, m5=out29
   1836    SCRATCH              0, 9, rsp+276*mmsize
   1837    UNSCRATCH            3, 14, rsp+281*mmsize      ; t28a
   1838    SUMSUB_BA         d, 3, 4, 0                    ; m3=out3, m4=out28
   1839 
   1840 %if %1 == 1
   1841    TRANSPOSE4x4D        4, 5, 6, 7, 0
   1842    mova  [ptrq+ 7*mmsize], m4
   1843    mova  [ptrq+15*mmsize], m5
   1844    mova  [ptrq+23*mmsize], m6
   1845    mova  [ptrq+31*mmsize], m7
   1846 %else ; %1 == 2
   1847 %if ARCH_X86_64
   1848    SWAP                 0, 8
   1849 %else ; x86-32
   1850    pxor                m0, m0
   1851 %endif
   1852    lea               dstq, [dstq+stride3q*4]
   1853    ROUND_AND_STORE_4x4  4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
   1854 %endif
   1855    UNSCRATCH            7, 9, rsp+276*mmsize       ; out0
   1856 %if %1 == 1
   1857    TRANSPOSE4x4D        7, 1, 2, 3, 0
   1858    mova  [ptrq+ 0*mmsize], m7
   1859    mova  [ptrq+ 8*mmsize], m1
   1860    mova  [ptrq+16*mmsize], m2
   1861    mova  [ptrq+24*mmsize], m3
   1862 %else ; %1 == 2
   1863 %if ARCH_X86_64
   1864    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
   1865 %else ; x86-32
   1866    mov               dstq, dstm
   1867 %endif
   1868    ROUND_AND_STORE_4x4  7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6
   1869 %if ARCH_X86_64
   1870    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
   1871 %endif
   1872 %endif
   1873 
   1874    ; end of last stage + store for out0-3 and out28-31
   1875 
   1876 %if ARCH_X86_64
   1877    SWAP                 0, 8
   1878 %endif
   1879    mova                m7, [rsp+265*mmsize]        ; t4
   1880    mova                m6, [rsp+266*mmsize]        ; t5a
   1881    mova                m5, [rsp+267*mmsize]        ; t6a
   1882    mova                m4, [rsp+268*mmsize]        ; t7
   1883    mova                m0, [rsp+270*mmsize]        ; t27
   1884    mova                m1, [rsp+269*mmsize]        ; t26a
   1885    UNSCRATCH            2, 13, rsp+280*mmsize      ; t25
   1886    SUMSUB_BA         d, 0, 7, 3                    ; m0=out4, m7=out27
   1887    SUMSUB_BA         d, 1, 6, 3                    ; m1=out5, m6=out26
   1888    SUMSUB_BA         d, 2, 5, 3                    ; m2=out6, m5=out25
   1889    UNSCRATCH            3, 12, rsp+279*mmsize      ; t24a
   1890    SCRATCH              7, 9, rsp+276*mmsize
   1891    SUMSUB_BA         d, 3, 4, 7                    ; m3=out7, m4=out24
   1892 
   1893 %if %1 == 1
   1894    TRANSPOSE4x4D        0, 1, 2, 3, 7
   1895    mova  [ptrq+ 1*mmsize], m0
   1896    mova  [ptrq+ 9*mmsize], m1
   1897    mova  [ptrq+17*mmsize], m2
   1898    mova  [ptrq+25*mmsize], m3
   1899 %else ; %1 == 2
   1900 %if ARCH_X86_64
   1901    SWAP                 7, 8
   1902    lea               dstq, [dstbakq+strideq*4]
   1903 %else ; x86-32
   1904    pxor                m7, m7
   1905    lea               dstq, [dstq+strideq*4]
   1906 %endif
   1907    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
   1908 %endif
   1909    UNSCRATCH            0, 9, rsp+276*mmsize       ; out27
   1910 %if %1 == 1
   1911    TRANSPOSE4x4D        4, 5, 6, 0, 7
   1912    mova  [ptrq+ 6*mmsize], m4
   1913    mova  [ptrq+14*mmsize], m5
   1914    mova  [ptrq+22*mmsize], m6
   1915    mova  [ptrq+30*mmsize], m0
   1916 %else ; %1 == 2
   1917 %if ARCH_X86_64
   1918    lea               dstq, [dstbakq+stride3q*8]
   1919 %else
   1920    mov               dstq, dstm
   1921    lea               dstq, [dstq+stride3q*8]
   1922 %endif
   1923    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
   1924 %endif
   1925 
   1926    ; end of last stage + store for out4-7 and out24-27
   1927 %endmacro
   1928 
   1929 INIT_XMM sse2
   1930 cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
   1931                                    275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
   1932                                    dst, stride, block, eob
   1933    mova                m0, [pw_1023]
   1934    cmp               eobd, 1
   1935    jg .idctfull
   1936 
   1937    ; dc-only - the 10bit version can be done entirely in 32bit, since the
   1938    ; max coef value is 17 bits plus sign and the multiplier (11585) is
   1939    ; 14 bits, so the 31+sign bit product easily fits in 32bit
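           ; as a rough scalar sketch, the DC_ONLY step below amounts to
           ;   dc = (in[0] * 11585 + 8192) >> 14;            // row pass
           ;   dc = (dc * 11585 + (32 << 14) + 8192) >> 20;  // col pass + (x+32)>>6
           ; 11585 < 2^14, so both products stay at 31 bits plus sign; the
           ; broadcast dc is then added to every pixel and clipped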
   1940    DEFINE_ARGS dst, stride, block, coef
   1941    pxor                m2, m2
   1942    DC_ONLY              6, m2
   1943    movd                m1, coefd
   1944    pshuflw             m1, m1, q0000
   1945    punpcklqdq          m1, m1
   1946    DEFINE_ARGS dst, stride, cnt
   1947    mov               cntd, 32
   1948 .loop_dc:
   1949    STORE_2x8            3, 4, 1, m2, m0, dstq,          mmsize
   1950    STORE_2x8            3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
   1951    add               dstq, strideq
   1952    dec               cntd
   1953    jg .loop_dc
   1954    RET
   1955 
   1956 .idctfull:
   1957    mova  [rsp+256*mmsize], m0
   1958    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
   1959 %if ARCH_X86_64
   1960    mov            dstbakq, dstq
   1961    movsxd            cntq, cntd
   1962 %endif
   1963 %if PIC
   1964    lea               ptrq, [default_32x32]
   1965    movzx             cntd, byte [ptrq+cntq-1]
   1966 %else
   1967    movzx             cntd, byte [default_32x32+cntq-1]
   1968 %endif
   1969    mov              skipd, 8
   1970    sub              skipd, cntd
   1971    mov               ptrq, rsp
   1972 .loop_1:
   1973    IDCT32_1D            1, blockq
   1974 
   1975    add               ptrq, 32 * mmsize
   1976    add             blockq, mmsize
   1977    dec               cntd
   1978    jg .loop_1
   1979 
   1980    ; zero-pad the remainder (skipped cols)
   1981    test             skipd, skipd
   1982    jz .end
   1983    shl              skipd, 2
   1984    lea             blockq, [blockq+skipq*(mmsize/4)]
   1985    pxor                m0, m0
   1986 .loop_z:
   1987    mova   [ptrq+mmsize*0], m0
   1988    mova   [ptrq+mmsize*1], m0
   1989    mova   [ptrq+mmsize*2], m0
   1990    mova   [ptrq+mmsize*3], m0
   1991    mova   [ptrq+mmsize*4], m0
   1992    mova   [ptrq+mmsize*5], m0
   1993    mova   [ptrq+mmsize*6], m0
   1994    mova   [ptrq+mmsize*7], m0
   1995    add               ptrq, 8 * mmsize
   1996    dec              skipd
   1997    jg .loop_z
   1998 .end:
   1999 
   2000    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
   2001    lea           stride3q, [strideq*3]
   2002    mov               cntd, 8
   2003    mov               ptrq, rsp
   2004 .loop_2:
   2005    IDCT32_1D            2, ptrq
   2006 
   2007    add               ptrq, mmsize
   2008 %if ARCH_X86_64
   2009    add            dstbakq, 8
   2010    mov               dstq, dstbakq
   2011 %else
   2012    add         dword dstm, 8
   2013    mov               dstq, dstm
   2014 %endif
   2015    dec               cntd
   2016    jg .loop_2
   2017 
   2018    ; m7 is still zero
   2019    ZERO_BLOCK blockq-8*mmsize, 128, 32, m7
   2020    RET
   2021 
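        ; the 12 bpp variant shares .idctfull above and only needs its own
        ; dc-only path, because the wider coefs can overflow 32bit multiplies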
   2022 INIT_XMM sse2
   2023 cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \
   2024                                    275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
   2025                                    dst, stride, block, eob
   2026    mova                m0, [pw_4095]
   2027    cmp               eobd, 1
   2028    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull
   2029 
   2030    ; dc-only - unfortunately, this one can overflow, since the coefs are
   2031    ; 19 bits plus sign at 12 bpp, and 19+14+sign does not fit in 32bit,
           ; so we do 2-stage multiplies
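           ; a sketch of that trick: split in = hi * (1 << 14) + lo, with
           ; hi = in >> 14 (arithmetic) and lo = in & 0x3fff, so that
           ;   (in * 11585 + 8192) >> 14 == hi * 11585 + ((lo * 11585 + 8192) >> 14)
           ; and every partial product comfortably fits in 32bit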
   2032    DEFINE_ARGS dst, stride, block, coef, coefl
   2033    pxor                m2, m2
   2034    DC_ONLY_64BIT        6, m2
   2035    movd                m1, coefd
   2036    pshuflw             m1, m1, q0000
   2037    punpcklqdq          m1, m1
   2038    DEFINE_ARGS dst, stride, cnt
   2039    mov               cntd, 32
   2040 .loop_dc:
   2041    STORE_2x8            3, 4, 1, m2, m0, dstq,          mmsize
   2042    STORE_2x8            3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
   2043    add               dstq, strideq
   2044    dec               cntd
   2045    jg .loop_dc
   2046    RET