vp9itxfm.asm (112846B)
;******************************************************************************
;* VP9 IDCT SIMD optimizations
;*
;* Copyright (C) 2013 Clément Bœsch <u pkh me>
;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "vp9itxfm_template.asm"

SECTION_RODATA 32

%macro VP9_IDCT_COEFFS 2-3 0
const pw_m%1_%2
times 8 dw -%1, %2
const pw_%2_%1
times 8 dw %2, %1

%if %3 == 1
const pw_m%2_m%1
times 8 dw -%2, -%1
%if %1 != %2
const pw_m%2_%1
times 8 dw -%2, %1
const pw_%1_%2
times 8 dw %1, %2
%endif
%endif

%if %1 < 11585
pw_m%1x2: times 16 dw -%1*2
%elif %1 > 11585
pw_%1x2: times 16 dw %1*2
%else
const pw_%1x2
times 16 dw %1*2
%endif

%if %2 != %1
pw_%2x2: times 16 dw %2*2
%endif
%endmacro

VP9_IDCT_COEFFS 16364, 804
VP9_IDCT_COEFFS 16305, 1606
VP9_IDCT_COEFFS 16069, 3196, 1
VP9_IDCT_COEFFS 15893, 3981
VP9_IDCT_COEFFS 15137, 6270, 1
VP9_IDCT_COEFFS 14811, 7005
VP9_IDCT_COEFFS 14449, 7723
VP9_IDCT_COEFFS 13160, 9760
VP9_IDCT_COEFFS 11585, 11585, 1
VP9_IDCT_COEFFS 11003, 12140
VP9_IDCT_COEFFS 10394, 12665
VP9_IDCT_COEFFS 9102, 13623, 1
VP9_IDCT_COEFFS 8423, 14053
VP9_IDCT_COEFFS 5520, 15426
VP9_IDCT_COEFFS 4756, 15679
VP9_IDCT_COEFFS 2404, 16207

const pw_5283_13377
times 4 dw 5283, 13377
const pw_9929_13377
times 4 dw 9929, 13377
const pw_15212_m13377
times 4 dw 15212, -13377
const pw_15212_9929
times 4 dw 15212, 9929
const pw_m5283_m15212
times 4 dw -5283, -15212
const pw_13377x2
times 8 dw 13377*2
const pw_m13377_13377
times 4 dw -13377, 13377
const pw_13377_0
times 4 dw 13377, 0

cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_512
cextern pw_1024
cextern pw_2048
cextern pw_m1
cextern pd_8192

SECTION .text

%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
    punpckhwd m%4, m%2, m%1
    punpcklwd m%2, m%1
    pmaddwd m%3, m%4, [pw_m%5_%6]
    pmaddwd m%4, [pw_%6_%5]
    pmaddwd m%1, m%2, [pw_m%5_%6]
    pmaddwd m%2, [pw_%6_%5]
%endmacro

%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
    SUMSUB_BA d, %1, %2, %5
    SUMSUB_BA d, %3, %4, %5
    paddd m%1, %6
    paddd m%2, %6
    paddd m%3, %6
    paddd m%4, %6
    psrad m%1, 14
    psrad m%2, 14
    psrad m%3, 14
    psrad m%4, 14
    packssdw m%1, m%3
    packssdw m%2, m%4
%endmacro
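
; The two helpers above are used as a pair throughout this file. Roughly:
; VP9_UNPACK_MULSUB_2D_4X interleaves two word vectors and uses pmaddwd with
; the pw_<a>_<b> / pw_m<a>_<b> constant pairs to produce the 32-bit products
; src1*mul2 - src2*mul1 and src1*mul1 + src2*mul2 (split into low/high
; halves), and VP9_RND_SH_SUMSUB_BA then sums/differences two such dword
; results, adds the pd_8192 rounding term (half of 1 << 14), shifts right by
; 14 and packs back to signed words.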

%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
%if mmsize == 32
    pmovzxbw m%3, [%6]
    pmovzxbw m%4, [%6+strideq]
%else
    movh m%3, [%6]
    movh m%4, [%6+strideq]
    punpcklbw m%3, m%5
    punpcklbw m%4, m%5
%endif
    paddw m%3, m%1
    paddw m%4, m%2
%if mmsize == 32
    packuswb m%3, m%4
    ; Intel...
    vpermq m%3, m%3, q3120
    mova [%6], xm%3
    vextracti128 [%6+strideq], m%3, 1
%elif mmsize == 16
    packuswb m%3, m%4
    movh [%6], m%3
    movhps [%6+strideq], m%3
%else
    packuswb m%3, m%5
    packuswb m%4, m%5
    movh [%6], m%3
    movh [%6+strideq], m%4
%endif
%endmacro

%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
%assign %%y 0
%rep %3
%assign %%x 0
%rep %3*2/mmsize
    mova [%1+%%y+%%x], %4
%assign %%x (%%x+mmsize)
%endrep
%assign %%y (%%y+%2)
%endrep
%endmacro

;-------------------------------------------------------------------------------------------
; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------

INIT_MMX mmx
cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
    mova m0, [blockq+0*8]
    mova m1, [blockq+1*8]
    mova m2, [blockq+2*8]
    mova m3, [blockq+3*8]
    psraw m0, 2
    psraw m1, 2
    psraw m2, 2
    psraw m3, 2

    VP9_IWHT4_1D
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    VP9_IWHT4_1D

    pxor m4, m4
    VP9_STORE_2X 0, 1, 5, 6, 4
    lea dstq, [dstq+strideq*2]
    VP9_STORE_2X 2, 3, 5, 6, 4
    ZERO_BLOCK blockq, 8, 4, m4
    RET

;-------------------------------------------------------------------------------------------
; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------

; 2x2 top left corner
%macro VP9_IDCT4_2x2_1D 0
    pmulhrsw m0, m5 ; m0=t1
    mova m2, m0 ; m2=t0
    mova m3, m1
    pmulhrsw m1, m6 ; m1=t2
    pmulhrsw m3, m7 ; m3=t3
    VP9_IDCT4_1D_FINALIZE
%endmacro

%macro VP9_IDCT4_WRITEOUT 0
%if cpuflag(ssse3)
    mova m5, [pw_2048]
    pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
    pmulhrsw m1, m5
%else
    mova m5, [pw_8]
    paddw m0, m5
    paddw m1, m5
    psraw m0, 4
    psraw m1, 4
%endif
    VP9_STORE_2X 0, 1, 6, 7, 4
    lea dstq, [dstq+2*strideq]
%if cpuflag(ssse3)
    pmulhrsw m2, m5
    pmulhrsw m3, m5
%else
    paddw m2, m5
    paddw m3, m5
    psraw m2, 4
    psraw m3, 4
%endif
    VP9_STORE_2X 2, 3, 6, 7, 4
%endmacro

%macro IDCT_4x4_FN 1
INIT_MMX %1
cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob

%if cpuflag(ssse3)
    cmp eobd, 4 ; 2x2 or smaller
    jg .idctfull

    cmp eobd, 1 ; faster path for when only DC is set
    jne .idct2x2
%else
    cmp eobd, 1
    jg .idctfull
%endif

%if cpuflag(ssse3)
    movd m0, [blockq]
    mova m5, [pw_11585x2]
    pmulhrsw m0, m5
    pmulhrsw m0, m5
%else
    DEFINE_ARGS dst, stride, block, coef
    movsx coefd, word [blockq]
    imul coefd, 11585
    add coefd, 8192
    sar coefd, 14
    imul coefd, 11585
    add coefd, (8 << 14) + 8192
    sar coefd, 14 + 4
    movd m0, coefd
%endif
    pshufw m0, m0, 0
    pxor m4, m4
    movh [blockq], m4
%if cpuflag(ssse3)
    pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
%endif
    VP9_STORE_2X 0, 0, 6, 7, 4
    lea dstq, [dstq+2*strideq]
    VP9_STORE_2X 0, 0, 6, 7, 4
    RET
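    ; Note on the DC-only path above: the non-ssse3 branch mirrors what the
    ; ssse3 branch does with two pmulhrsw by pw_11585x2 (each of which is
    ; roughly (x * 11585 + 8192) >> 14): the DC coefficient is scaled by
    ; 11585/16384 twice (once per pass), and the final
    ; "add (8 << 14) + 8192" / "sar 14 + 4" folds the (x + 8) >> 4 output
    ; rounding into the last shift.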

%if cpuflag(ssse3)
; faster path for when only top left 2x2 block is set
.idct2x2:
    movd m0, [blockq+0]
    movd m1, [blockq+8]
    mova m5, [pw_11585x2]
    mova m6, [pw_6270x2]
    mova m7, [pw_15137x2]
    VP9_IDCT4_2x2_1D
    ; partial 2x4 transpose
    punpcklwd m0, m1
    punpcklwd m2, m3
    SBUTTERFLY dq, 0, 2, 1
    SWAP 1, 2
    VP9_IDCT4_2x2_1D
    pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
    movh [blockq+ 0], m4
    movh [blockq+ 8], m4
    VP9_IDCT4_WRITEOUT
    RET
%endif

.idctfull: ; generic full 4x4 idct/idct
    mova m0, [blockq+ 0]
    mova m1, [blockq+ 8]
    mova m2, [blockq+16]
    mova m3, [blockq+24]
%if cpuflag(ssse3)
    mova m6, [pw_11585x2]
%endif
    mova m7, [pd_8192] ; rounding
    VP9_IDCT4_1D
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    VP9_IDCT4_1D
    pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
    mova [blockq+ 0], m4
    mova [blockq+ 8], m4
    mova [blockq+16], m4
    mova [blockq+24], m4
    VP9_IDCT4_WRITEOUT
    RET
%endmacro

IDCT_4x4_FN mmxext
IDCT_4x4_FN ssse3

;-------------------------------------------------------------------------------------------
; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------

%macro IADST4_FN 5
INIT_MMX %5
cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
%if WIN64 && notcpuflag(ssse3)
INIT_XMM cpuname
    WIN64_SPILL_XMM 8
INIT_MMX cpuname
%endif
    movdqa xmm5, [pd_8192]
    mova m0, [blockq+ 0]
    mova m1, [blockq+ 8]
    mova m2, [blockq+16]
    mova m3, [blockq+24]
%if cpuflag(ssse3)
    mova m6, [pw_11585x2]
%endif
%ifnidn %1%3, iadstiadst
    movdq2q m7, xmm5
%endif
    VP9_%2_1D
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    VP9_%4_1D
    pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
    mova [blockq+ 0], m4
    mova [blockq+ 8], m4
    mova [blockq+16], m4
    mova [blockq+24], m4
    VP9_IDCT4_WRITEOUT
    RET
%endmacro

IADST4_FN idct,  IDCT4,  iadst, IADST4, sse2
IADST4_FN iadst, IADST4, idct,  IDCT4,  sse2
IADST4_FN iadst, IADST4, iadst, IADST4, sse2

IADST4_FN idct,  IDCT4,  iadst, IADST4, ssse3
IADST4_FN iadst, IADST4, idct,  IDCT4,  ssse3
IADST4_FN iadst, IADST4, iadst, IADST4, ssse3

%macro SCRATCH 3
%if ARCH_X86_64
    SWAP %1, %2
%else
    mova [%3], m%1
%endif
%endmacro

%macro UNSCRATCH 3
%if ARCH_X86_64
    SWAP %1, %2
%else
    mova m%1, [%3]
%endif
%endmacro

;-------------------------------------------------------------------------------------------
; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------

%macro VP9_IDCT8_1D_FINALIZE 0
    SUMSUB_BA w, 3, 6, 5 ; m3=t0+t7, m6=t0-t7
    SUMSUB_BA w, 1, 2, 5 ; m1=t1+t6, m2=t1-t6
    SUMSUB_BA w, 7, 0, 5 ; m7=t2+t5, m0=t2-t5

    UNSCRATCH 5, 8, blockq+ 0
    SCRATCH 2, 8, blockq+ 0

    SUMSUB_BA w, 5, 4, 2 ; m5=t3+t4, m4=t3-t4
    SWAP 7, 6, 2
    SWAP 3, 5, 0

%if ARCH_X86_64
    SWAP 6, 8
%endif
%endmacro

; x86-32
; - in: m0/m4 is in mem
; - out: m6 is in mem
; x86-64:
; - everything is in registers (m0-7)
%macro VP9_IDCT8_1D 0
%if ARCH_X86_64
    SWAP 0, 8
    SWAP 4, 9
%endif

    VP9_UNPACK_MULSUB_2W_4X 5, 3, 9102, 13623, D_8192_REG, 0, 4 ; m5=t5a, m3=t6a
    VP9_UNPACK_MULSUB_2W_4X 1, 7, 16069, 3196, D_8192_REG, 0, 4 ; m1=t4a, m7=t7a
    SUMSUB_BA w, 5, 1, 0 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a)
    SUMSUB_BA w, 3, 7, 0 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
%if cpuflag(ssse3)
    SUMSUB_BA w, 1, 7, 0 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
    pmulhrsw m1, W_11585x2_REG ; m1=t6
    pmulhrsw m7, W_11585x2_REG ; m7=t5
%else
    VP9_UNPACK_MULSUB_2W_4X 7, 1, 11585, 11585, D_8192_REG, 0, 4
%endif
    VP9_UNPACK_MULSUB_2W_4X 2, 6, 15137, 6270, D_8192_REG, 0, 4 ; m2=t2a, m6=t3a

    UNSCRATCH 0, 8, blockq+ 0 ; IN(0)
    UNSCRATCH 4, 9, blockq+64 ; IN(4)
    SCRATCH 5, 8, blockq+ 0

%if cpuflag(ssse3)
    SUMSUB_BA w, 4, 0, 5 ; m4=IN(0)+IN(4) m0=IN(0)-IN(4)
    pmulhrsw m4, W_11585x2_REG ; m4=t0a
    pmulhrsw m0, W_11585x2_REG ; m0=t1a
%else
    SCRATCH 7, 9, blockq+64
    VP9_UNPACK_MULSUB_2W_4X 0, 4, 11585, 11585, D_8192_REG, 5, 7
    UNSCRATCH 7, 9, blockq+64
%endif
    SUMSUB_BA w, 6, 4, 5 ; m6=t0a+t3a (t0), m4=t0a-t3a (t3)
    SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)

    VP9_IDCT8_1D_FINALIZE
%endmacro

%macro VP9_IDCT8_4x4_1D 0
    pmulhrsw m0, W_11585x2_REG ; m0=t1a/t0a
    pmulhrsw m6, m2, [pw_15137x2] ; m6=t3a
    pmulhrsw m2, [pw_6270x2] ; m2=t2a
    pmulhrsw m7, m1, [pw_16069x2] ; m7=t7a
    pmulhrsw m1, [pw_3196x2] ; m1=t4a
    pmulhrsw m5, m3, [pw_m9102x2] ; m5=t5a
    pmulhrsw m3, [pw_13623x2] ; m3=t6a
    SUMSUB_BA w, 5, 1, 4 ; m1=t4a+t5a (t4), m5=t4a-t5a (t5a)
    SUMSUB_BA w, 3, 7, 4 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a)
    SUMSUB_BA w, 1, 7, 4 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5)
    pmulhrsw m1, W_11585x2_REG ; m1=t6
    pmulhrsw m7, W_11585x2_REG ; m7=t5
    psubw m4, m0, m6 ; m4=t0a-t3a (t3)
    paddw m6, m0 ; m6=t0a+t3a (t0)
    SCRATCH 5, 8, blockq+ 0
    SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2)
    VP9_IDCT8_1D_FINALIZE
%endmacro

%macro VP9_IDCT8_2x2_1D 1
    pmulhrsw m0, W_11585x2_REG ; m0=t0
    pmulhrsw m3, m1, W_16069x2_REG ; m3=t7
    pmulhrsw m1, W_3196x2_REG ; m1=t4
    psubw m7, m3, m1 ; t5 = t7a - t4a
    paddw m5, m3, m1 ; t6 = t7a + t4a
    pmulhrsw m7, W_11585x2_REG ; m7=t5
    pmulhrsw m5, W_11585x2_REG ; m5=t6
    SWAP 5, 1
    ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier
    psubw m6, m0, m3 ; m6=t0-t7
    paddw m3, m0 ; m3=t0+t7
    psubw m2, m0, m1 ; m2=t1-t6
    paddw m1, m0 ; m1=t1+t6
%if %1 == 1
    punpcklwd m3, m1
%define SCRATCH_REG 1
%elif ARCH_X86_32
    mova [blockq+ 0], m2
%define SCRATCH_REG 2
%else
%define SCRATCH_REG 8
%endif
    psubw m4, m0, m5 ; m4=t3-t4
    paddw m5, m0 ; m5=t3+t4
    SUMSUB_BA w, 7, 0, SCRATCH_REG ; m7=t2+t5, m0=t2-t5
    SWAP 7, 6, 2
    SWAP 3, 5, 0
%undef SCRATCH_REG
%endmacro

%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift
%if cpuflag(ssse3)
    pmulhrsw m%1, %6 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
    pmulhrsw m%2, %6
%else
    paddw m%1, %6
    paddw m%2, %6
    psraw m%1, %7
    psraw m%2, %7
%endif
%if %0 <= 7
    VP9_STORE_2X %1, %2, %3, %4, %5
%else
    VP9_STORE_2X %1, %2, %3, %4, %5, %8
%endif
%endmacro

; x86-32:
; - m6 is in mem
; x86-64:
; - m8 holds m6 (SWAP)
; m6 holds zero
%macro VP9_IDCT8_WRITEOUT 0
%if ARCH_X86_64
%if cpuflag(ssse3)
    mova m9, [pw_1024]
%else
    mova m9, [pw_16]
%endif
%define ROUND_REG m9
%else
%if cpuflag(ssse3)
%define ROUND_REG [pw_1024]
%else
%define ROUND_REG [pw_16]
%endif
%endif
    SCRATCH 5, 10, blockq+16
    SCRATCH 7, 11, blockq+32
    VP9_IDCT8_WRITEx2 0, 1, 5, 7, 6, ROUND_REG
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 2, 3, 5, 7, 6, ROUND_REG
    lea dstq, [dstq+2*strideq]
    UNSCRATCH 5, 10, blockq+16
    UNSCRATCH 7, 11, blockq+32
    VP9_IDCT8_WRITEx2 4, 5, 0, 1, 6, ROUND_REG
    lea dstq, [dstq+2*strideq]
    UNSCRATCH 5, 8, blockq+ 0
    VP9_IDCT8_WRITEx2 5, 7, 0, 1, 6, ROUND_REG

%undef ROUND_REG
%endmacro

%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2
INIT_XMM %1
cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob

%if cpuflag(ssse3)
%if ARCH_X86_64
    mova m12, [pw_11585x2] ; often used
%define W_11585x2_REG m12
%else
%define W_11585x2_REG [pw_11585x2]
%endif

    cmp eobd, 12 ; top left half or less
    jg .idctfull

    cmp eobd, 3 ; top left corner or less
    jg .idcthalf

    cmp eobd, 1 ; faster path for when only DC is set
    jne .idcttopleftcorner
%else
    cmp eobd, 1
    jg .idctfull
%endif

%if cpuflag(ssse3)
    movd m0, [blockq]
    pmulhrsw m0, W_11585x2_REG
    pmulhrsw m0, W_11585x2_REG
%else
    DEFINE_ARGS dst, stride, block, coef
    movsx coefd, word [blockq]
    imul coefd, 11585
    add coefd, 8192
    sar coefd, 14
    imul coefd, 11585
    add coefd, (16 << 14) + 8192
    sar coefd, 14 + 5
    movd m0, coefd
%endif
    SPLATW m0, m0, 0
    pxor m4, m4
    movd [blockq], m4
%if cpuflag(ssse3)
    pmulhrsw m0, [pw_1024] ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
%endif
%rep 3
    VP9_STORE_2X 0, 0, 6, 7, 4
    lea dstq, [dstq+2*strideq]
%endrep
    VP9_STORE_2X 0, 0, 6, 7, 4
    RET

%if cpuflag(ssse3)
; faster path for when only left corner is set (3 inputs: DC, right to DC, below
; to DC). Note: also works with a 2x2 block
.idcttopleftcorner:
    movd m0, [blockq+0]
    movd m1, [blockq+16]
%if ARCH_X86_64
    mova m10, [pw_3196x2]
    mova m11, [pw_16069x2]
%define W_3196x2_REG m10
%define W_16069x2_REG m11
%else
%define W_3196x2_REG [pw_3196x2]
%define W_16069x2_REG [pw_16069x2]
%endif
    VP9_IDCT8_2x2_1D 1
    ; partial 2x8 transpose
    ; punpcklwd m0, m1 already done inside idct
    punpcklwd m2, m3
    punpcklwd m4, m5
    punpcklwd m6, m7
    punpckldq m0, m2
    punpckldq m4, m6
    SBUTTERFLY qdq, 0, 4, 1
    SWAP 1, 4
    VP9_IDCT8_2x2_1D 2
%if ARCH_X86_64
    SWAP 6, 8
%endif
    pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
    VP9_IDCT8_WRITEOUT
%if ARCH_X86_64
    movd [blockq+ 0], m6
    movd [blockq+16], m6
%else
    mova [blockq+ 0], m6
    mova [blockq+16], m6
    mova [blockq+32], m6
%endif
    RET

.idcthalf:
    movh m0, [blockq + 0]
    movh m1, [blockq +16]
    movh m2, [blockq +32]
    movh m3, [blockq +48]
    VP9_IDCT8_4x4_1D
    ; partial 4x8 transpose
%if ARCH_X86_32
    mova m6, [blockq+ 0]
%endif
    punpcklwd m0, m1
    punpcklwd m2, m3
    punpcklwd m4, m5
    punpcklwd m6, m7
    SBUTTERFLY dq, 0, 2, 1
    SBUTTERFLY dq, 4, 6, 5
    SBUTTERFLY qdq, 0, 4, 1
    SBUTTERFLY qdq, 2, 6, 5
    SWAP 1, 4
    SWAP 3, 6
    VP9_IDCT8_4x4_1D
%if ARCH_X86_64
    SWAP 6, 8
%endif
    pxor m6, m6
    VP9_IDCT8_WRITEOUT
%if ARCH_X86_64
    movh [blockq+ 0], m6
    movh [blockq+16], m6
    movh [blockq+32], m6
%else
    mova [blockq+ 0], m6
    mova [blockq+16], m6
    mova [blockq+32], m6
%endif
    movh [blockq+48], m6
    RET
%endif

.idctfull: ; generic full 8x8 idct/idct
%if ARCH_X86_64
    mova m0, [blockq+  0] ; IN(0)
%endif
    mova m1, [blockq+ 16] ; IN(1)
    mova m2, [blockq+ 32] ; IN(2)
    mova m3, [blockq+ 48] ; IN(3)
%if ARCH_X86_64
    mova m4, [blockq+ 64] ; IN(4)
%endif
    mova m5, [blockq+ 80] ; IN(5)
    mova m6, [blockq+ 96] ; IN(6)
    mova m7, [blockq+112] ; IN(7)
%if ARCH_X86_64
    mova m11, [pd_8192] ; rounding
%define D_8192_REG m11
%else
%define D_8192_REG [pd_8192]
%endif
    VP9_IDCT8_1D
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
    mova [blockq+0], m0
%endif
    VP9_IDCT8_1D

%if ARCH_X86_64
    SWAP 6, 8
%endif
    pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
    VP9_IDCT8_WRITEOUT
    ZERO_BLOCK blockq, 16, 8, m6
    RET
%undef W_11585x2_REG
%endmacro

VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12
VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13
VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13

;---------------------------------------------------------------------------------------------
; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------

; x86-32:
; - in: m0/3/4/7 are in mem [blockq+N*16]
; - out: m6 is in mem [blockq+0]
; x86-64:
; - everything is in registers
%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7
%if ARCH_X86_64
    SWAP 0, 8
    SWAP 3, 9
    SWAP 4, 10
    SWAP 7, 11
%endif

    VP9_UNPACK_MULSUB_2D_4X 5, 2, 0, 3, 14449, 7723 ; m5/2=t3[d], m2/4=t2[d]
    VP9_UNPACK_MULSUB_2D_4X 1, 6, 4, 7, 4756, 15679 ; m1/4=t7[d], m6/7=t6[d]
    SCRATCH 4, 12, blockq+1*16
    VP9_RND_SH_SUMSUB_BA 6, 2, 7, 3, 4, D_8192_REG ; m6=t2[w], m2=t6[w]
    UNSCRATCH 4, 12, blockq+1*16
    VP9_RND_SH_SUMSUB_BA 1, 5, 4, 0, 3, D_8192_REG ; m1=t3[w], m5=t7[w]

    UNSCRATCH 0, 8, blockq+16*0
    UNSCRATCH 3, 9, blockq+16*3
    UNSCRATCH 4, 10, blockq+16*4
    UNSCRATCH 7, 11, blockq+16*7
    SCRATCH 1, 8, blockq+16*1
    SCRATCH 2, 9, blockq+16*2
    SCRATCH 5, 10, blockq+16*5
    SCRATCH 6, 11, blockq+16*6

    VP9_UNPACK_MULSUB_2D_4X 7, 0, 1, 2, 16305, 1606 ; m7/1=t1[d], m0/2=t0[d]
    VP9_UNPACK_MULSUB_2D_4X 3, 4, 5, 6, 10394, 12665 ; m3/5=t5[d], m4/6=t4[d]
    SCRATCH 1, 12, blockq+ 0*16
    VP9_RND_SH_SUMSUB_BA 4, 0, 6, 2, 1, D_8192_REG ; m4=t0[w], m0=t4[w]
    UNSCRATCH 1, 12, blockq+ 0*16
    VP9_RND_SH_SUMSUB_BA 3, 7, 5, 1, 2, D_8192_REG ; m3=t1[w], m7=t5[w]

    UNSCRATCH 2, 9, blockq+16*2
    UNSCRATCH 5, 10, blockq+16*5
    SCRATCH 3, 9, blockq+16*3
    SCRATCH 4, 10, blockq+16*4

    ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7

    VP9_UNPACK_MULSUB_2D_4X 0, 7, 1, 3, 15137, 6270 ; m0/1=t5[d], m7/3=t4[d]
    VP9_UNPACK_MULSUB_2D_4X 5, 2, 4, 6, 6270, 15137 ; m5/4=t6[d], m2/6=t7[d]
    SCRATCH 1, 12, blockq+ 0*16
    VP9_RND_SH_SUMSUB_BA 5, 7, 4, 3, 1, D_8192_REG
    UNSCRATCH 1, 12, blockq+ 0*16
    PSIGNW m5, W_M1_REG ; m5=out1[w], m7=t6[w]
    VP9_RND_SH_SUMSUB_BA 2, 0, 6, 1, 3, D_8192_REG ; m2=out6[w], m0=t7[w]

    UNSCRATCH 1, 8, blockq+16*1
    UNSCRATCH 3, 9, blockq+16*3
    UNSCRATCH 4, 10, blockq+16*4
    UNSCRATCH 6, 11, blockq+16*6
    SCRATCH 2, 8, blockq+16*0

    SUMSUB_BA w, 6, 4, 2 ; m6=out0[w], m4=t2[w]
    SUMSUB_BA w, 1, 3, 2
    PSIGNW m1, W_M1_REG ; m1=out7[w], m3=t3[w]

    ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7

    ; unfortunately, the code below overflows in some cases
%if 0; cpuflag(ssse3)
    SUMSUB_BA w, 3, 4, 2
    SUMSUB_BA w, 0, 7, 2
    pmulhrsw m3, W_11585x2_REG
    pmulhrsw m7, W_11585x2_REG
    pmulhrsw m4, W_11585x2_REG ; out4
    pmulhrsw m0, W_11585x2_REG ; out2
%else
    SCRATCH 5, 9, blockq+16*1
    VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, D_8192_REG, 2, 5
    VP9_UNPACK_MULSUB_2W_4X 7, 0, 11585, 11585, D_8192_REG, 2, 5
    UNSCRATCH 5, 9, blockq+16*1
%endif
    PSIGNW m3, W_M1_REG ; out3
    PSIGNW m7, W_M1_REG ; out5

    ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7

%if ARCH_X86_64
    SWAP 2, 8
%endif
    SWAP 0, 6, 2
    SWAP 7, 1, 5
%endmacro

%macro IADST8_FN 6
INIT_XMM %5
cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob

%ifidn %1, idct
%define first_is_idct 1
%else
%define first_is_idct 0
%endif

%ifidn %3, idct
%define second_is_idct 1
%else
%define second_is_idct 0
%endif

%if ARCH_X86_64
    mova m0, [blockq+  0] ; IN(0)
%endif
    mova m1, [blockq+ 16] ; IN(1)
    mova m2, [blockq+ 32] ; IN(2)
%if ARCH_X86_64 || first_is_idct
    mova m3, [blockq+ 48] ; IN(3)
%endif
%if ARCH_X86_64
    mova m4, [blockq+ 64] ; IN(4)
%endif
    mova m5, [blockq+ 80] ; IN(5)
    mova m6, [blockq+ 96] ; IN(6)
%if ARCH_X86_64 || first_is_idct
    mova m7, [blockq+112] ; IN(7)
%endif
%if ARCH_X86_64
%if cpuflag(ssse3)
    mova m15, [pw_11585x2] ; often used
%endif
    mova m13, [pd_8192] ; rounding
    mova m14, [pw_m1]
%define W_11585x2_REG m15
%define D_8192_REG m13
%define W_M1_REG m14
%else
%define W_11585x2_REG [pw_11585x2]
%define D_8192_REG [pd_8192]
%define W_M1_REG [pw_m1]
%endif

    ; note different calling conventions for idct8 vs. iadst8 on x86-32
    VP9_%2_1D
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1
    mova [blockq+  0], m0
%if second_is_idct == 0
    mova [blockq+ 48], m3
    mova [blockq+112], m7
%endif
%endif
    VP9_%4_1D

%if ARCH_X86_64
    SWAP 6, 8
%endif
    pxor m6, m6 ; used for the block reset, and VP9_STORE_2X
    VP9_IDCT8_WRITEOUT
    ZERO_BLOCK blockq, 16, 8, m6
    RET

%undef W_11585x2_REG
%undef first_is_idct
%undef second_is_idct

%endmacro

IADST8_FN idct,  IDCT8,  iadst, IADST8, sse2, 15
IADST8_FN iadst, IADST8, idct,  IDCT8,  sse2, 15
IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15
IADST8_FN idct,  IDCT8,  iadst, IADST8, ssse3, 16
IADST8_FN idct,  IDCT8,  iadst, IADST8, avx, 16
IADST8_FN iadst, IADST8, idct,  IDCT8,  ssse3, 16
IADST8_FN iadst, IADST8, idct,  IDCT8,  avx, 16
IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16
IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16

;---------------------------------------------------------------------------------------------
; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------

; x86-64:
; at the end of this macro, m7 is stored in [%4+15*%5]
; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
; the following sumsubs have not been done yet:
;    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
;    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1,
; and the following sumsubs have not been done yet:
;    SUMSUB_BA            w, x13, x14, 7       ; t6, t9
;    SUMSUB_BA            w, x15, x12, 7       ; t7, t8

%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
%if %2 <= 4
    mova m3, [%1+ 1*%3] ; IN(1)
    mova m0, [%1+ 3*%3] ; IN(3)

    pmulhrsw m4, m3, [pw_16305x2] ; t14-15
    pmulhrsw m3, [pw_1606x2] ; t8-9
    pmulhrsw m7, m0, [pw_m4756x2] ; t10-11
    pmulhrsw m0, [pw_15679x2] ; t12-13

    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15

    VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 1, 6 ; t9, t14
    SCRATCH 4, 10, %4+ 1*%5
    SCRATCH 5, 11, %4+ 7*%5
    VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13
    UNSCRATCH 5, 11, %4+ 7*%5

    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
%else
    mova m5, [%1+ 1*%3] ; IN(1)
    mova m4, [%1+ 7*%3] ; IN(7)
%if %2 <= 8
    pmulhrsw m2, m5, [pw_16305x2] ; t15
    pmulhrsw m5, [pw_1606x2] ; t8
    pmulhrsw m3, m4, [pw_m10394x2] ; t9
    pmulhrsw m4, [pw_12665x2] ; t14
%else
    mova m3, [%1+ 9*%3] ; IN(9)
    mova m2, [%1+15*%3] ; IN(15)

    ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
    ; m11=in8, m3=in9, m12=in10, m0=in11, m8=in12, m1=in13, m13=in14, m2=in15

    VP9_UNPACK_MULSUB_2W_4X 5, 2, 16305, 1606, [pd_8192], 0, 1 ; t8, t15
    VP9_UNPACK_MULSUB_2W_4X 3, 4, 10394, 12665, [pd_8192], 0, 1 ; t9, t14
%endif

    SUMSUB_BA w, 3, 5, 0 ; t8, t9
    SUMSUB_BA w, 4, 2, 0 ; t15, t14

    VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 0, 1 ; t9, t14

    SCRATCH 4, 10, %4+ 1*%5
    SCRATCH 5, 11, %4+ 7*%5

    mova m6, [%1+ 3*%3] ; IN(3)
    mova m7, [%1+ 5*%3] ; IN(5)
%if %2 <= 8
    pmulhrsw m0, m7, [pw_14449x2] ; t13
    pmulhrsw m7, [pw_7723x2] ; t10
    pmulhrsw m1, m6, [pw_m4756x2] ; t11
    pmulhrsw m6, [pw_15679x2] ; t12
%else
    mova m0, [%1+11*%3] ; IN(11)
    mova m1, [%1+13*%3] ; IN(13)

    VP9_UNPACK_MULSUB_2W_4X 7, 0, 14449, 7723, [pd_8192], 4, 5 ; t10, t13
    VP9_UNPACK_MULSUB_2W_4X 1, 6, 4756, 15679, [pd_8192], 4, 5 ; t11, t12
%endif

    ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
    ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15

    SUMSUB_BA w, 7, 1, 4 ; t11, t10
    SUMSUB_BA w, 0, 6, 4 ; t12, t13

    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15

    VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13

    UNSCRATCH 5, 11, %4+ 7*%5
%endif

    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
    ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15

    SUMSUB_BA w, 7, 3, 4 ; t8, t11

    ; backup first register
    mova [%4+15*%5], m7

    SUMSUB_BA w, 6, 2, 7 ; t9, t10
    UNSCRATCH 4, 10, %4+ 1*%5
    SUMSUB_BA w, 0, 4, 7 ; t15, t12
    SUMSUB_BA w, 1, 5, 7 ; t14, t13

    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15

%if cpuflag(ssse3) && %6 == 0
    SUMSUB_BA w, 2, 5, 7
    SUMSUB_BA w, 3, 4, 7
    pmulhrsw m5, [pw_11585x2] ; t10
    pmulhrsw m4, [pw_11585x2] ; t11
    pmulhrsw m3, [pw_11585x2] ; t12
    pmulhrsw m2, [pw_11585x2] ; t13
%else
    SCRATCH 6, 10, %4+ 1*%5
    VP9_UNPACK_MULSUB_2W_4X 5, 2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13
    VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12
    UNSCRATCH 6, 10, %4+ 1*%5
%endif

    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
    ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15

    SCRATCH 0, 8, %4+ 1*%5
    SCRATCH 1, 9, %4+ 3*%5
    SCRATCH 2, 10, %4+ 5*%5
    SCRATCH 3, 11, %4+ 7*%5
    SCRATCH 4, 12, %4+ 9*%5
    SCRATCH 5, 13, %4+11*%5
    SCRATCH 6, 14, %4+13*%5

    ; even (tx8x8)
%if %2 <= 4
    mova m3, [%1+ 0*%3] ; IN(0)
    mova m4, [%1+ 2*%3] ; IN(2)

    pmulhrsw m3, [pw_11585x2] ; t0-t3
    pmulhrsw m7, m4, [pw_16069x2] ; t6-7
    pmulhrsw m4, [pw_3196x2] ; t4-5

%if 0 ; overflows :(
    paddw m6, m7, m4
    psubw m5, m7, m4
    pmulhrsw m5, [pw_11585x2] ; t5
    pmulhrsw m6, [pw_11585x2] ; t6
%else
    VP9_UNPACK_MULSUB_2W_4X 5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5, t6
%endif

    psubw m0, m3, m7
    paddw m7, m3
    psubw m1, m3, m6
    paddw m6, m3
    psubw m2, m3, m5
    paddw m5, m3

%if ARCH_X86_32
    SWAP 0, 7
%endif
    SCRATCH 7, 15, %4+12*%5
%else
    mova m6, [%1+ 2*%3] ; IN(2)
    mova m1, [%1+ 4*%3] ; IN(4)
    mova m7, [%1+ 6*%3] ; IN(6)
%if %2 <= 8
    pmulhrsw m0, m1, [pw_15137x2] ; t3
    pmulhrsw m1, [pw_6270x2] ; t2
    pmulhrsw m5, m6, [pw_16069x2] ; t7
    pmulhrsw m6, [pw_3196x2] ; t4
    pmulhrsw m4, m7, [pw_m9102x2] ; t5
    pmulhrsw m7, [pw_13623x2] ; t6
%else
    mova m4, [%1+10*%3] ; IN(10)
    mova m0, [%1+12*%3] ; IN(12)
    mova m5, [%1+14*%3] ; IN(14)

    VP9_UNPACK_MULSUB_2W_4X 1, 0, 15137, 6270, [pd_8192], 2, 3 ; t2, t3
    VP9_UNPACK_MULSUB_2W_4X 6, 5, 16069, 3196, [pd_8192], 2, 3 ; t4, t7
    VP9_UNPACK_MULSUB_2W_4X 4, 7, 9102, 13623, [pd_8192], 2, 3 ; t5, t6
%endif

    SUMSUB_BA w, 4, 6, 2 ; t4, t5
    SUMSUB_BA w, 7, 5, 2 ; t7, t6

%if cpuflag(ssse3) && %6 == 0
    SUMSUB_BA w, 6, 5, 2
    pmulhrsw m5, [pw_11585x2] ; t5
    pmulhrsw m6, [pw_11585x2] ; t6
%else
    VP9_UNPACK_MULSUB_2W_4X 5, 6, 11585, 11585, [pd_8192], 2, 3 ; t5, t6
%endif

    SCRATCH 5, 15, %4+10*%5
    mova m2, [%1+ 0*%3] ; IN(0)
%if %2 <= 8
    pmulhrsw m2, [pw_11585x2] ; t0 and t1
    psubw m3, m2, m0
    paddw m0, m2

    SUMSUB_BA w, 7, 0, 5 ; t0, t7
%else
    mova m3, [%1+ 8*%3] ; IN(8)

    ; from 3 stages back
%if cpuflag(ssse3) && %6 == 0
    SUMSUB_BA w, 3, 2, 5
    pmulhrsw m3, [pw_11585x2] ; t0
    pmulhrsw m2, [pw_11585x2] ; t1
%else
    mova [%1+ 0*%3], m0
    VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 0 ; t0, t1
    mova m0, [%1+ 0*%3]
%endif

    ; from 2 stages back
    SUMSUB_BA w, 0, 3, 5 ; t0, t3

    SUMSUB_BA w, 7, 0, 5 ; t0, t7
%endif
    UNSCRATCH 5, 15, %4+10*%5
%if ARCH_X86_32
    SWAP 0, 7
%endif
    SCRATCH 7, 15, %4+12*%5
    SUMSUB_BA w, 1, 2, 7 ; t1, t2

    ; from 1 stage back
    SUMSUB_BA w, 6, 1, 7 ; t1, t6
    SUMSUB_BA w, 5, 2, 7 ; t2, t5
%endif
    SUMSUB_BA w, 4, 3, 7 ; t3, t4

%if ARCH_X86_64
    SWAP 0, 8
    SWAP 1, 9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 4, 12
    SWAP 5, 13
    SWAP 6, 14

    SUMSUB_BA w, 0, 15, 7 ; t0, t15
    SUMSUB_BA w, 1, 14, 7 ; t1, t14
    SUMSUB_BA w, 2, 13, 7 ; t2, t13
    SUMSUB_BA w, 3, 12, 7 ; t3, t12
    SUMSUB_BA w, 4, 11, 7 ; t4, t11
    SUMSUB_BA w, 5, 10, 7 ; t5, t10
%else
    SWAP 1, 6
    SWAP 2, 5
    SWAP 3, 4
    mova [%4+14*%5], m6

%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride
    mova m6, [%4+%2*%5]
    SUMSUB_BA w, 6, %1, 7
    SWAP %1, 6
    mova [%4+%3*%5], m6
%endmacro

    %%SUMSUB_BA_STORE 0,  1,  1, %4, %5 ; t0, t15
    %%SUMSUB_BA_STORE 1,  3,  3, %4, %5 ; t1, t14
    %%SUMSUB_BA_STORE 2,  5,  5, %4, %5 ; t2, t13
    %%SUMSUB_BA_STORE 3,  7,  7, %4, %5 ; t3, t12
    %%SUMSUB_BA_STORE 4,  9,  9, %4, %5 ; t4, t11
    %%SUMSUB_BA_STORE 5, 11, 11, %4, %5 ; t5, t10
%endif
%endmacro

%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
%if %2 == 1
    VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4

%if ARCH_X86_64
    ; backup a different register
    mova m7, [tmpq+15*16]
    mova [tmpq+ 1*16], m15

    SUMSUB_BA w, 6, 9, 15 ; t6, t9
    SUMSUB_BA w, 7, 8, 15 ; t7, t8

    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15
    mova [tmpq+  0], m0
    mova [tmpq+ 32], m1
    mova [tmpq+ 64], m2
    mova [tmpq+ 96], m3
    mova [tmpq+128], m4
    mova [tmpq+160], m5
    mova [tmpq+192], m6
    mova [tmpq+224], m7

    mova m15, [tmpq+ 1*16]
    TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
    mova [tmpq+ 16], m8
    mova [tmpq+ 48], m9
    mova [tmpq+ 80], m10
    mova [tmpq+112], m11
    mova [tmpq+144], m12
    mova [tmpq+176], m13
    mova [tmpq+208], m14
    mova [tmpq+240], m15
%else
    mova m6, [tmpq+13*16]
    mova m7, [tmpq+14*16]
    SUMSUB_BA w, 6, 7 ; t6, t9
    mova [tmpq+14*16], m6
    mova [tmpq+13*16], m7
    mova m7, [tmpq+15*16]
    mova m6, [tmpq+12*16]
    SUMSUB_BA w, 7, 6 ; t7, t8
    mova [tmpq+15*16], m6

    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1
    mova [tmpq+ 0*16], m0
    mova [tmpq+ 2*16], m1
    mova [tmpq+ 4*16], m2
    mova [tmpq+ 6*16], m3
    mova [tmpq+10*16], m5
    mova [tmpq+12*16], m6
    mova [tmpq+14*16], m7

    mova m0, [tmpq+15*16]
    mova m1, [tmpq+13*16]
    mova m2, [tmpq+11*16]
    mova m3, [tmpq+ 9*16]
    mova m4, [tmpq+ 7*16]
    mova m5, [tmpq+ 5*16]
    mova m7, [tmpq+ 1*16]
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1
    mova [tmpq+ 1*16], m0
    mova [tmpq+ 3*16], m1
    mova [tmpq+ 5*16], m2
    mova [tmpq+ 7*16], m3
    mova [tmpq+11*16], m5
    mova [tmpq+13*16], m6
    mova [tmpq+15*16], m7
%endif
%else ; %2 == 2
    VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4

%if cpuflag(ssse3)
%define ROUND_REG [pw_512]
%else
%define ROUND_REG [pw_32]
%endif

    pxor m7, m7
%if ARCH_X86_64
    ; backup more registers
    mova [%1+ 2*32], m8
    mova [%1+ 3*32], m9

    VP9_IDCT8_WRITEx2 0, 1, 8, 9, 7, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 2, 3, 8, 9, 7, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 4, 5, 8, 9, 7, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]

    ; restore from cache
    SWAP 0, 7 ; move zero from m7 to m0
    mova m7, [%1+15*32]
    mova m8, [%1+ 2*32]
    mova m9, [%1+ 3*32]

    SUMSUB_BA w, 6, 9, 3 ; t6, t9
    SUMSUB_BA w, 7, 8, 3 ; t7, t8

    VP9_IDCT8_WRITEx2 6, 7, 3, 4, 0, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 8, 9, 3, 4, 0, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 10, 11, 1, 2, 0, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 12, 13, 1, 2, 0, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 14, 15, 1, 2, 0, ROUND_REG, 6
%else
    mova [tmpq+ 0*32], m5

    VP9_IDCT8_WRITEx2 0, 1, 5, 6, 7, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 2, 3, 5, 6, 7, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]

    SWAP 0, 7 ; move zero from m7 to m0
    mova m5, [tmpq+ 0*32]

    VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]

    mova m4, [tmpq+13*32]
    mova m7, [tmpq+14*32]
    mova m5, [tmpq+15*32]
    mova m6, [tmpq+12*32]
    SUMSUB_BADC w, 4, 7, 5, 6, 1

    VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]

    mova m4, [tmpq+11*32]
    mova m5, [tmpq+ 9*32]
    mova m6, [tmpq+ 7*32]
    mova m7, [tmpq+ 5*32]

    VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]

    mova m4, [tmpq+ 3*32]
    mova m5, [tmpq+ 1*32]

    VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
%endif

%undef ROUND_REG
%endif ; %2 == 1/2
%endmacro

%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
    mova m%3, [dstq]
    mova m%5, [dstq+%7]
    punpcklbw m%2, m%3, m%6
    punpckhbw m%3, m%6
    punpcklbw m%4, m%5, m%6
    punpckhbw m%5, m%6
    paddw m%2, m%1
    paddw m%3, m%1
    paddw m%4, m%1
    paddw m%5, m%1
    packuswb m%2, m%3
    packuswb m%4, m%5
    mova [dstq], m%2
    mova [dstq+%7], m%4
%endmacro
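
; The 16x16 functions below keep the result of the first (column) pass in a
; 512-byte scratch buffer on the stack (tmpq = rsp) and run each pass twice,
; once per 8-column half, before the second (row) pass adds the output to
; dst; on ssse3 the eob value is additionally used to pick reduced-size fast
; paths (DC-only, or a top-left sub-transform with fewer nonzero rows).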

%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
INIT_XMM %1
cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
%if cpuflag(ssse3)
    ; 2x2=eob=3, 4x4=eob=10
    cmp eobd, 38
    jg .idctfull
    cmp eobd, 1 ; faster path for when only DC is set
    jne .idct8x8
%else
    cmp eobd, 1 ; faster path for when only DC is set
    jg .idctfull
%endif

    ; dc-only
%if cpuflag(ssse3)
    movd m0, [blockq]
    mova m1, [pw_11585x2]
    pmulhrsw m0, m1
    pmulhrsw m0, m1
%else
    DEFINE_ARGS dst, stride, block, coef
    movsx coefd, word [blockq]
    imul coefd, 11585
    add coefd, 8192
    sar coefd, 14
    imul coefd, 11585
    add coefd, (32 << 14) + 8192
    sar coefd, 14 + 6
    movd m0, coefd
%endif
    SPLATW m0, m0, q0000
%if cpuflag(ssse3)
    pmulhrsw m0, [pw_512]
%endif
    pxor m5, m5
    movd [blockq], m5
%rep 7
    VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
    lea dstq, [dstq+2*strideq]
%endrep
    VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
    RET

    DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
%if cpuflag(ssse3)
.idct8x8:
    mov tmpq, rsp
    VP9_IDCT16_1D blockq, 1, 8, 0

    mov cntd, 2
    mov dst_bakq, dstq
.loop2_8x8:
    VP9_IDCT16_1D tmpq, 2, 8, 0
    lea dstq, [dst_bakq+8]
    add tmpq, 16
    dec cntd
    jg .loop2_8x8

    ; at the end of the loop, m0 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK blockq, 32, 8, m0
    RET
%endif

.idctfull:
    mov cntd, 2
    mov tmpq, rsp
.loop1_full:
    VP9_IDCT16_1D blockq, 1, 16, 0
    add blockq, 16
    add tmpq, 256
    dec cntd
    jg .loop1_full
    sub blockq, 32

    mov cntd, 2
    mov tmpq, rsp
    mov dst_bakq, dstq
.loop2_full:
    VP9_IDCT16_1D tmpq, 2, 16, 0
    lea dstq, [dst_bakq+8]
    add tmpq, 16
    dec cntd
    jg .loop2_full

    ; at the end of the loop, m0 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK blockq, 32, 16, m0
    RET
%endmacro

VP9_IDCT_IDCT_16x16_ADD_XMM sse2
VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
VP9_IDCT_IDCT_16x16_ADD_XMM avx

%macro VP9_IDCT16_YMM_1D 0
    VP9_UNPACK_MULSUB_2W_4X 1, 15, 16305, 1606, [pd_8192], 0, 4 ; t8, t15
    VP9_UNPACK_MULSUB_2W_4X 9, 7, 10394, 12665, [pd_8192], 0, 4 ; t9, t14

    SUMSUB_BA w, 9, 1, 0 ; t8, t9
    SUMSUB_BA w, 7, 15, 0 ; t15, t14

    VP9_UNPACK_MULSUB_2W_4X 15, 1, 15137, 6270, [pd_8192], 0, 4 ; t9, t14

    VP9_UNPACK_MULSUB_2W_4X 5, 11, 14449, 7723, [pd_8192], 0, 4 ; t10, t13
    VP9_UNPACK_MULSUB_2W_4X 13, 3, 4756, 15679, [pd_8192], 0, 4 ; t11, t12

    SUMSUB_BA w, 5, 13, 0 ; t11, t10
    SUMSUB_BA w, 11, 3, 0 ; t12, t13

    VP9_UNPACK_MULSUB_2W_4X 3, 13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13

    SUMSUB_BA w, 5, 9, 0 ; t8, t11
    SUMSUB_BA w, 3, 15, 0 ; t9, t10
    SUMSUB_BA w, 11, 7, 0 ; t15, t12
    SUMSUB_BA w, 13, 1, 0 ; t14, t13

    SUMSUB_BA w, 15, 1, 0
    SUMSUB_BA w, 9, 7, 0
    pmulhrsw m1, [pw_11585x2] ; t10
    pmulhrsw m7, [pw_11585x2] ; t11
    pmulhrsw m9, [pw_11585x2] ; t12
    pmulhrsw m15, [pw_11585x2] ; t13

    ; even (tx8x8)
    mova m4, [blockq+128]
    mova [blockq+128], m5
    VP9_UNPACK_MULSUB_2W_4X 4, 12, 15137, 6270, [pd_8192], 0, 5 ; t2, t3
    VP9_UNPACK_MULSUB_2W_4X 2, 14, 16069, 3196, [pd_8192], 0, 5 ; t4, t7
    VP9_UNPACK_MULSUB_2W_4X 10, 6, 9102, 13623, [pd_8192], 0, 5 ; t5, t6
    mova m0, [blockq+ 0]
    SUMSUB_BA w, 8, 0, 5
    pmulhrsw m8, [pw_11585x2] ; t0
    pmulhrsw m0, [pw_11585x2] ; t1

    SUMSUB_BA w, 10, 2, 5 ; t4, t5
    SUMSUB_BA w, 6, 14, 5 ; t7, t6
    SUMSUB_BA w, 12, 8, 5 ; t0, t3
    SUMSUB_BA w, 4, 0, 5 ; t1, t2

    SUMSUB_BA w, 2, 14, 5
    pmulhrsw m14, [pw_11585x2] ; t5
    pmulhrsw m2, [pw_11585x2] ; t6

    SUMSUB_BA w, 6, 12, 5 ; t0, t7
    SUMSUB_BA w, 2, 4, 5 ; t1, t6
    SUMSUB_BA w, 14, 0, 5 ; t2, t5
    SUMSUB_BA w, 10, 8, 5 ; t3, t4

    ; final stage
    SUMSUB_BA w, 11, 6, 5 ; out0, out15
    SUMSUB_BA w, 13, 2, 5 ; out1, out14
    SUMSUB_BA w, 15, 14, 5 ; out2, out13
    SUMSUB_BA w, 9, 10, 5 ; out3, out12
    SUMSUB_BA w, 7, 8, 5 ; out4, out11
    SUMSUB_BA w, 1, 0, 5 ; out5, out10
    SUMSUB_BA w, 3, 4, 5 ; out6, out9
    mova m5, [blockq+128]
    mova [blockq+192], m3
    SUMSUB_BA w, 5, 12, 3 ; out7, out8

    SWAP 0, 11, 8, 12, 10
    SWAP 1, 13, 14, 2, 15, 6, 3, 9, 4, 7, 5
%endmacro

; this is almost identical to VP9_STORE_2X, but it does two rows
; for slightly improved interleaving, and it omits vpermq since the
; input is DC so all values are identical
%macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
    mova xm%2, [dstq]
    mova xm%4, [dstq+strideq*2]
    vinserti128 m%2, m%2, [dstq+strideq], 1
    vinserti128 m%4, m%4, [dstq+stride3q], 1
    punpckhbw m%3, m%2, m%6
    punpcklbw m%2, m%6
    punpckhbw m%5, m%4, m%6
    punpcklbw m%4, m%6
    paddw m%3, m%1
    paddw m%2, m%1
    paddw m%5, m%1
    paddw m%4, m%1
    packuswb m%2, m%3
    packuswb m%4, m%5
    mova [dstq], xm%2
    mova [dstq+strideq*2], xm%4
    vextracti128 [dstq+strideq], m%2, 1
    vextracti128 [dstq+stride3q], m%4, 1
%endmacro

%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob
    cmp eobd, 1 ; faster path for when only DC is set
    jg .idctfull

    ; dc-only
    mova m1, [pw_11585x2]
    vpbroadcastw m0, [blockq]
    pmulhrsw m0, m1
    pmulhrsw m0, m1
    pxor m5, m5
    pmulhrsw m0, [pw_512]
    movd [blockq], xm5

    DEFINE_ARGS dst, stride, stride3, cnt
    mov cntd, 4
    lea stride3q, [strideq*3]
.loop_dc:
    VP9_STORE_YMM_DC_4X 0, 1, 2, 3, 4, 5
    lea dstq, [dstq+4*strideq]
    dec cntd
    jg .loop_dc
    RET

    DEFINE_ARGS dst, stride, block, eob
.idctfull:
    mova m1, [blockq+ 32]
    mova m2, [blockq+ 64]
    mova m3, [blockq+ 96]
    mova m5, [blockq+160]
    mova m6, [blockq+192]
    mova m7, [blockq+224]
    mova m8, [blockq+256]
    mova m9, [blockq+288]
    mova m10, [blockq+320]
    mova m11, [blockq+352]
    mova m12, [blockq+384]
    mova m13, [blockq+416]
    mova m14, [blockq+448]
    mova m15, [blockq+480]

    VP9_IDCT16_YMM_1D
    TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
                    [blockq+192], [blockq+128], 1
    mova [blockq+  0], m0
    VP9_IDCT16_YMM_1D

    mova [blockq+224], m7

    ; store
    VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    mova m6, [blockq+192]
    mova m7, [blockq+224]
    VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]

    ; at the end of the loop, m0 should still be zero
    ; use that to zero out block coefficients
    pxor m0, m0
    ZERO_BLOCK blockq, 32, 16, m0
    RET
%endif

;---------------------------------------------------------------------------------------------
; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------

%macro VP9_IADST16_1D 2 ; src, pass
%assign %%str 16*%2
    mova m0, [%1+ 0*32] ; in0
    mova m1, [%1+15*32] ; in15
    mova m2, [%1+ 7*32] ; in7
    mova m3, [%1+ 8*32] ; in8

    VP9_UNPACK_MULSUB_2D_4X 1, 0, 4, 5, 16364, 804 ; m1/4=t1[d], m0/5=t0[d]
    VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 6, 11003, 12140 ; m2/7=t9[d], m3/6=t8[d]
    SCRATCH 4, 8, tmpq+ 0*%%str
    VP9_RND_SH_SUMSUB_BA 3, 0, 6, 5, 4, [pd_8192] ; m3=t0[w], m0=t8[w]
    UNSCRATCH 4, 8, tmpq+ 0*%%str
    VP9_RND_SH_SUMSUB_BA 2, 1, 7, 4, 5, [pd_8192] ; m2=t1[w], m1=t9[w]

    SCRATCH 0, 10, tmpq+ 0*%%str
    SCRATCH 1, 11, tmpq+15*%%str
    mova [tmpq+ 7*%%str], m2
    mova [tmpq+ 8*%%str], m3

    mova m1, [%1+ 2*32] ; in2
    mova m0, [%1+13*32] ; in13
    mova m3, [%1+ 5*32] ; in5
    mova m2, [%1+10*32] ; in10

    VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 15893, 3981 ; m0/6=t3[d], m1/7=t2[d]
    VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d]
    SCRATCH 4, 12, tmpq+ 2*%%str
    VP9_RND_SH_SUMSUB_BA 2, 1, 5, 7, 4, [pd_8192] ; m2=t2[w], m1=t10[w]
    UNSCRATCH 4, 12, tmpq+ 2*%%str
    VP9_RND_SH_SUMSUB_BA 3, 0, 4, 6, 5, [pd_8192] ; m3=t3[w], m0=t11[w]

    SCRATCH 0, 12, tmpq+ 2*%%str
    SCRATCH 1, 13, tmpq+13*%%str
    mova [tmpq+ 5*%%str], m2
    mova [tmpq+10*%%str], m3

    mova m2, [%1+ 4*32] ; in4
    mova m3, [%1+11*32] ; in11
    mova m0, [%1+ 3*32] ; in3
    mova m1, [%1+12*32] ; in12

    VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 14811, 7005 ; m3/7=t5[d], m2/6=t4[d]
    VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 5520, 15426 ; m0/4=t13[d], m1/5=t12[d]
    SCRATCH 4, 9, tmpq+ 4*%%str
    VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t4[w], m2=t12[w]
    UNSCRATCH 4, 9, tmpq+ 4*%%str
    VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t5[w], m3=t13[w]

    SCRATCH 0, 8, tmpq+ 4*%%str
    mova [tmpq+11*%%str], m1 ; t4:m1->r11
    UNSCRATCH 0, 10, tmpq+ 0*%%str
    UNSCRATCH 1, 11, tmpq+15*%%str

    ; round 2 interleaved part 1
    VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d]
    VP9_UNPACK_MULSUB_2D_4X 3, 2, 5, 4, 3196, 16069 ; m3/5=t12[d], m2/4=t13[d]
    SCRATCH 4, 9, tmpq+ 3*%%str
    VP9_RND_SH_SUMSUB_BA 3, 1, 5, 7, 4, [pd_8192] ; m3=t8[w], m1=t12[w]
    UNSCRATCH 4, 9, tmpq+ 3*%%str
    VP9_RND_SH_SUMSUB_BA 2, 0, 4, 6, 5, [pd_8192] ; m2=t9[w], m0=t13[w]

    SCRATCH 0, 10, tmpq+ 0*%%str
    SCRATCH 1, 11, tmpq+15*%%str
    SCRATCH 2, 14, tmpq+ 3*%%str
    SCRATCH 3, 15, tmpq+12*%%str

    mova m2, [%1+ 6*32] ; in6
    mova m3, [%1+ 9*32] ; in9
    mova m0, [%1+ 1*32] ; in1
    mova m1, [%1+14*32] ; in14

    VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d]
    VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 2404, 16207 ; m0/4=t15[d], m1/5=t14[d]
    SCRATCH 4, 9, tmpq+ 6*%%str
    VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t6[w], m2=t14[w]
    UNSCRATCH 4, 9, tmpq+ 6*%%str
    VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t7[w], m3=t15[w]

    ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7
    ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15

    UNSCRATCH 4, 12, tmpq+ 2*%%str
    UNSCRATCH 5, 13, tmpq+13*%%str
    SCRATCH 0, 12, tmpq+ 1*%%str
    SCRATCH 1, 13, tmpq+14*%%str

    ; remainder of round 2 (rest of t8-15)
    VP9_UNPACK_MULSUB_2D_4X 5, 4, 6, 7, 9102, 13623 ; m5/6=t11[d], m4/7=t10[d]
    VP9_UNPACK_MULSUB_2D_4X 3, 2, 1, 0, 13623, 9102 ; m3/1=t14[d], m2/0=t15[d]
    SCRATCH 0, 9, tmpq+ 6*%%str
    VP9_RND_SH_SUMSUB_BA 3, 4, 1, 7, 0, [pd_8192] ; m3=t10[w], m4=t14[w]
    UNSCRATCH 0, 9, tmpq+ 6*%%str
    VP9_RND_SH_SUMSUB_BA 2, 5, 0, 6, 1, [pd_8192] ; m2=t11[w], m5=t15[w]

    ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15

    UNSCRATCH 6, 14, tmpq+ 3*%%str
    UNSCRATCH 7, 15, tmpq+12*%%str

    SUMSUB_BA w, 3, 7, 1
    PSIGNW m3, [pw_m1] ; m3=out1[w], m7=t10[w]
    SUMSUB_BA w, 2, 6, 1 ; m2=out14[w], m6=t11[w]

    ; unfortunately, the code below overflows in some cases, e.g.
    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm
%if 0; cpuflag(ssse3)
    SUMSUB_BA w, 7, 6, 1
    pmulhrsw m7, [pw_11585x2] ; m7=out6[w]
    pmulhrsw m6, [pw_11585x2] ; m6=out9[w]
%else
    VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, 11585, [pd_8192], 1, 0
%endif

    mova [tmpq+ 3*%%str], m6
    mova [tmpq+ 6*%%str], m7
    UNSCRATCH 6, 10, tmpq+ 0*%%str
    UNSCRATCH 7, 11, tmpq+15*%%str
    mova [tmpq+13*%%str], m2
    SCRATCH 3, 11, tmpq+ 9*%%str

    VP9_UNPACK_MULSUB_2D_4X 7, 6, 2, 3, 15137, 6270 ; m6/3=t13[d], m7/2=t12[d]
    VP9_UNPACK_MULSUB_2D_4X 5, 4, 1, 0, 6270, 15137 ; m5/1=t14[d], m4/0=t15[d]
    SCRATCH 0, 9, tmpq+ 2*%%str
    VP9_RND_SH_SUMSUB_BA 5, 6, 1, 3, 0, [pd_8192] ; m5=out2[w], m6=t14[w]
    UNSCRATCH 0, 9, tmpq+ 2*%%str
    VP9_RND_SH_SUMSUB_BA 4, 7, 0, 2, 1, [pd_8192]
    PSIGNW m4, [pw_m1] ; m4=out13[w], m7=t15[w]

    ; unfortunately, the code below overflows in some cases
%if 0; cpuflag(ssse3)
    SUMSUB_BA w, 7, 6, 1
    pmulhrsw m7, [pw_m11585x2] ; m7=out5[w]
    pmulhrsw m6, [pw_11585x2] ; m6=out10[w]
%else
    PSIGNW m7, [pw_m1]
    VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, [pd_8192], 1, 0
%endif

    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14

    mova m2, [tmpq+ 8*%%str]
    mova m3, [tmpq+ 7*%%str]
    mova m1, [tmpq+11*%%str]
    mova [tmpq+ 7*%%str], m6
    mova [tmpq+11*%%str], m4
    mova m4, [tmpq+ 5*%%str]
    SCRATCH 5, 14, tmpq+ 5*%%str
    SCRATCH 7, 15, tmpq+ 8*%%str
    UNSCRATCH 6, 8, tmpq+ 4*%%str
    UNSCRATCH 5, 12, tmpq+ 1*%%str
    UNSCRATCH 7, 13, tmpq+14*%%str

    ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7
    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14

    SUMSUB_BA w, 1, 2, 0 ; m1=t0[w], m2=t4[w]
    mova m0, [tmpq+10*%%str]
    SCRATCH 1, 12, tmpq+ 1*%%str
    SUMSUB_BA w, 6, 3, 1 ; m8=t1[w], m3=t5[w]
    SCRATCH 6, 13, tmpq+ 4*%%str
    SUMSUB_BA w, 7, 4, 1 ; m13=t2[w], m9=t6[w]
    SCRATCH 7, 8, tmpq+10*%%str
    SUMSUB_BA w, 5, 0, 1 ; m12=t3[w], m0=t7[w]
    SCRATCH 5, 9, tmpq+14*%%str

    VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 5, 15137, 6270 ; m2/6=t5[d], m3/10=t4[d]
    VP9_UNPACK_MULSUB_2D_4X 0, 4, 1, 6, 6270, 15137 ; m0/14=t6[d], m9/15=t7[d]
    SCRATCH 6, 10, tmpq+ 0*%%str
    VP9_RND_SH_SUMSUB_BA 0, 3, 1, 5, 6, [pd_8192]
    UNSCRATCH 6, 10, tmpq+ 0*%%str
    PSIGNW m0, [pw_m1] ; m0=out3[w], m3=t6[w]
    VP9_RND_SH_SUMSUB_BA 4, 2, 6, 7, 5, [pd_8192] ; m9=out12[w], m2=t7[w]

    UNSCRATCH 1, 8, tmpq+10*%%str
    UNSCRATCH 5, 9, tmpq+14*%%str
    UNSCRATCH 6, 12, tmpq+ 1*%%str
    UNSCRATCH 7, 13, tmpq+ 4*%%str
    SCRATCH 4, 9, tmpq+14*%%str

    SUMSUB_BA w, 1, 6, 4 ; m13=out0[w], m1=t2[w]
    SUMSUB_BA w, 5, 7, 4
    PSIGNW m5, [pw_m1] ; m12=out15[w], m8=t3[w]

    ; unfortunately, the code below overflows in some cases, e.g.
    ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm
%if 0 ; cpuflag(ssse3)
    SUMSUB_BA w, 7, 6, 4
    pmulhrsw m7, [pw_m11585x2] ; m8=out7[w]
    pmulhrsw m6, [pw_11585x2] ; m1=out8[w]
    SWAP 6, 7
    SUMSUB_BA w, 3, 2, 4
    pmulhrsw m3, [pw_11585x2] ; m3=out4[w]
    pmulhrsw m2, [pw_11585x2] ; m2=out11[w]
%else
    SCRATCH 5, 8, tmpq+10*%%str
    VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, m11585, [pd_8192], 5, 4
    VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4
    UNSCRATCH 5, 8, tmpq+10*%%str
%endif

    ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15
    ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14

%if %2 == 1
%if ARCH_X86_64
    mova m13, [tmpq+ 6*%%str]
    TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 6, 10
    mova [tmpq+ 0*16], m1
    mova [tmpq+ 2*16], m11
    mova [tmpq+ 4*16], m14
    mova [tmpq+ 6*16], m0
    mova m1, [tmpq+ 3*%%str]
    mova m11, [tmpq+ 7*%%str]
    mova m14, [tmpq+11*%%str]
    mova m0, [tmpq+13*%%str]
    mova [tmpq+ 8*16], m3
    mova [tmpq+10*16], m15
    mova [tmpq+12*16], m13
    mova [tmpq+14*16], m6

    TRANSPOSE8x8W 7, 1, 11, 2, 9, 14, 0, 5, 10
    mova [tmpq+ 1*16], m7
    mova [tmpq+ 3*16], m1
    mova [tmpq+ 5*16], m11
    mova [tmpq+ 7*16], m2
    mova [tmpq+ 9*16], m9
    mova [tmpq+11*16], m14
    mova [tmpq+13*16], m0
    mova [tmpq+15*16], m5
%else
    mova [tmpq+12*%%str], m2
    mova [tmpq+ 1*%%str], m5
    mova [tmpq+15*%%str], m7
    mova m2, [tmpq+ 9*%%str]
    mova m5, [tmpq+ 5*%%str]
    mova m7, [tmpq+ 8*%%str]
    TRANSPOSE8x8W 1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
    mova [tmpq+ 0*16], m1
    mova [tmpq+ 2*16], m2
    mova [tmpq+ 4*16], m5
    mova [tmpq+ 6*16], m0
    mova [tmpq+10*16], m7
    mova m3, [tmpq+12*%%str]
    mova [tmpq+12*16], m4
    mova m4, [tmpq+14*%%str]
    mova [tmpq+14*16], m6

    mova m0, [tmpq+15*%%str]
    mova m1, [tmpq+ 3*%%str]
    mova m2, [tmpq+ 7*%%str]
    mova m5, [tmpq+11*%%str]
    mova m7, [tmpq+ 1*%%str]
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1
    mova [tmpq+ 1*16], m0
    mova [tmpq+ 3*16], m1
    mova [tmpq+ 5*16], m2
    mova [tmpq+ 7*16], m3
    mova [tmpq+11*16], m5
    mova [tmpq+13*16], m6
    mova [tmpq+15*16], m7
%endif
%else
    pxor m4, m4

%if cpuflag(ssse3)
%define ROUND_REG [pw_512]
%else
%define ROUND_REG [pw_32]
%endif

%if ARCH_X86_64
    mova m12, [tmpq+ 6*%%str]
    VP9_IDCT8_WRITEx2 1, 11, 10, 8, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 14, 0, 10, 8, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 12, 6, 10, 8, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]

    mova m1, [tmpq+ 3*%%str]
    mova m11, [tmpq+ 7*%%str]
    mova m14, [tmpq+11*%%str]
    mova m0, [tmpq+13*%%str]

    VP9_IDCT8_WRITEx2 7, 1, 10, 8, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 9, 14, 10, 8, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    VP9_IDCT8_WRITEx2 0, 5, 10, 8, 4, ROUND_REG, 6
%else
    mova [tmpq+ 0*%%str], m2
    mova [tmpq+ 1*%%str], m5
    mova [tmpq+ 2*%%str], m7
    mova m2, [tmpq+ 9*%%str]
    VP9_IDCT8_WRITEx2 1, 2, 5, 7, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    mova m5, [tmpq+ 5*%%str]
    VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    mova m5, [tmpq+ 8*%%str]
    VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    mova m5, [tmpq+ 6*%%str]
    VP9_IDCT8_WRITEx2 5, 6, 1, 2, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]

    mova m0, [tmpq+ 2*%%str]
    mova m3, [tmpq+ 3*%%str]
    VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    mova m0, [tmpq+ 7*%%str]
    mova m3, [tmpq+ 0*%%str]
    VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    mova m0, [tmpq+14*%%str]
    mova m3, [tmpq+11*%%str]
    VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
    mova m0, [tmpq+13*%%str]
    mova m3, [tmpq+ 1*%%str]
    VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6
%endif

    SWAP 0, 4 ; zero
%undef ROUND_REG
%endif
%endmacro

%macro IADST16_FN 5
INIT_XMM %5
cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
    mov cntd, 2
    mov tmpq, rsp
.loop1_full:
    VP9_%2_1D blockq, 1
    add blockq, 16
    add tmpq, 256
    dec cntd
    jg .loop1_full
    sub blockq, 32

    mov cntd, 2
    mov tmpq, rsp
    mov dst_bakq, dstq
.loop2_full:
    VP9_%4_1D tmpq, 2
    lea dstq, [dst_bakq+8]
    add tmpq, 16
    dec cntd
    jg .loop2_full

    ; at the end of the loop, m0 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK blockq, 32, 16, m0
    RET
%endmacro

IADST16_FN idct,  IDCT16,  iadst, IADST16, sse2
IADST16_FN iadst, IADST16, idct,  IDCT16,  sse2
IADST16_FN iadst, IADST16, iadst, IADST16, sse2
IADST16_FN idct,  IDCT16,  iadst, IADST16, ssse3
IADST16_FN iadst, IADST16, idct,  IDCT16,  ssse3
IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
IADST16_FN idct,  IDCT16,  iadst, IADST16, avx
IADST16_FN iadst, IADST16, idct,  IDCT16,  avx
IADST16_FN iadst, IADST16, iadst, IADST16, avx

; in: data in m[0-15] except m0/m4, which are in [blockq+0] and [blockq+128]
; out: m[0-15] except m6, which is in [blockq+192]
; uses blockq as scratch space
%macro VP9_IADST16_YMM_1D 0
    mova [blockq+ 32], m3
    mova [blockq+ 64], m7
    mova [blockq+ 96], m8

    ; first half of round 1
    VP9_UNPACK_MULSUB_2D_4X 9, 6, 0, 3, 13160, 9760 ; m9/x=t7[d], m6/x=t6[d]
    ; first half of round 1
    VP9_UNPACK_MULSUB_2D_4X 9, 6, 0, 3, 13160, 9760 ; m9/x=t7[d], m6/x=t6[d]
    VP9_UNPACK_MULSUB_2D_4X 1, 14, 4, 7, 2404, 16207 ; m1/x=t15[d], m14/x=t14[d]
    VP9_RND_SH_SUMSUB_BA 14, 6, 7, 3, 8, [pd_8192] ; m14=t6[w], m6=t14[w]
    VP9_RND_SH_SUMSUB_BA 1, 9, 4, 0, 8, [pd_8192] ; m1=t7[w], m9=t15[w]

    VP9_UNPACK_MULSUB_2D_4X 13, 2, 4, 7, 15893, 3981 ; m13/x=t3[d], m2/x=t2[d]
    VP9_UNPACK_MULSUB_2D_4X 5, 10, 0, 3, 8423, 14053 ; m5/x=t11[d], m10/x=t10[d]
    VP9_RND_SH_SUMSUB_BA 10, 2, 3, 7, 8, [pd_8192] ; m10=t2[w], m2=t10[w]
    VP9_RND_SH_SUMSUB_BA 5, 13, 0, 4, 8, [pd_8192] ; m5=t3[w], m13=t11[w]

    ; half of round 2 t8-15
    VP9_UNPACK_MULSUB_2D_4X 2, 13, 4, 7, 9102, 13623 ; m2/x=t11[d], m13/x=t10[d]
    VP9_UNPACK_MULSUB_2D_4X 9, 6, 3, 0, 13623, 9102 ; m9/x=t14[d], m6/x=t15[d]
    VP9_RND_SH_SUMSUB_BA 9, 13, 3, 7, 8, [pd_8192] ; m9=t10[w], m13=t14[w]
    VP9_RND_SH_SUMSUB_BA 6, 2, 0, 4, 8, [pd_8192] ; m6=t11[w], m2=t15[w]

    SUMSUB_BA w, 14, 10, 8 ; m14=t2, m10=t6
    SUMSUB_BA w, 1, 5, 8 ; m1=t3, m5=t7

    mova m0, [blockq+ 0]
    mova m4, [blockq+128]
    mova m3, [blockq+ 32]
    mova m7, [blockq+ 64]
    mova m8, [blockq+ 96]
    mova [blockq+ 0], m1
    mova [blockq+128], m14
    mova [blockq+ 32], m6
    mova [blockq+ 64], m9
    mova [blockq+ 96], m10

    ; second half of round 1
    VP9_UNPACK_MULSUB_2D_4X 15, 0, 1, 9, 16364, 804 ; m15/x=t1[d], m0/x=t0[d]
    VP9_UNPACK_MULSUB_2D_4X 7, 8, 10, 6, 11003, 12140 ; m7/x=t9[d], m8/x=t8[d]
    VP9_RND_SH_SUMSUB_BA 8, 0, 6, 9, 14, [pd_8192] ; m8=t0[w], m0=t8[w]
    VP9_RND_SH_SUMSUB_BA 7, 15, 10, 1, 14, [pd_8192] ; m7=t1[w], m15=t9[w]

    VP9_UNPACK_MULSUB_2D_4X 11, 4, 10, 6, 14811, 7005 ; m11/x=t5[d], m4/x=t4[d]
    VP9_UNPACK_MULSUB_2D_4X 3, 12, 1, 9, 5520, 15426 ; m3/x=t13[d], m12/x=t12[d]
    VP9_RND_SH_SUMSUB_BA 12, 4, 9, 6, 14, [pd_8192] ; m12=t4[w], m4=t12[w]
    VP9_RND_SH_SUMSUB_BA 3, 11, 1, 10, 14, [pd_8192] ; m3=t5[w], m11=t13[w]

    ; second half of round 2 t8-15
    VP9_UNPACK_MULSUB_2D_4X 0, 15, 6, 10, 16069, 3196 ; m15/x=t8[d], m0/x=t9[d]
    VP9_UNPACK_MULSUB_2D_4X 11, 4, 9, 1, 3196, 16069 ; m11/x=t12[d], m4/x=t13[d]
    VP9_RND_SH_SUMSUB_BA 11, 15, 9, 10, 14, [pd_8192] ; m11=t8[w], m15=t12[w]
    VP9_RND_SH_SUMSUB_BA 4, 0, 1, 6, 14, [pd_8192] ; m4=t9[w], m0=t13[w]

    SUMSUB_BA w, 12, 8, 14 ; m12=t0, m8=t4
    SUMSUB_BA w, 3, 7, 14 ; m3=t1, m7=t5

    mova m10, [blockq+ 96]
    mova [blockq+ 96], m12

    ; round 3
    VP9_UNPACK_MULSUB_2D_4X 15, 0, 9, 12, 15137, 6270 ; m15/x=t13[d], m0/x=t12[d]
    VP9_UNPACK_MULSUB_2D_4X 2, 13, 1, 6, 6270, 15137 ; m2/x=t14[d], m13/x=t15[d]
    VP9_RND_SH_SUMSUB_BA 2, 0, 1, 12, 14, [pd_8192] ; m2=out2[w], m0=t14a[w]
    VP9_RND_SH_SUMSUB_BA 13, 15, 6, 9, 14, [pd_8192]
    PSIGNW m13, [pw_m1] ; m13=out13[w], m15=t15a[w]

    VP9_UNPACK_MULSUB_2D_4X 8, 7, 12, 9, 15137, 6270 ; m8/x=t5[d], m7/x=t4[d]
    VP9_UNPACK_MULSUB_2D_4X 5, 10, 1, 6, 6270, 15137 ; m5/x=t6[d], m10/x=t7[d]
    VP9_RND_SH_SUMSUB_BA 5, 7, 1, 9, 14, [pd_8192]
    PSIGNW m5, [pw_m1] ; m5=out3[w], m7=t6[w]
    VP9_RND_SH_SUMSUB_BA 10, 8, 6, 12, 14, [pd_8192] ; m10=out12[w], m8=t7[w]

    mova m1, [blockq+ 0]
    mova m14, [blockq+128]
    mova m6, [blockq+ 32]
    mova m9, [blockq+ 64]
    mova m12, [blockq+ 96]
    mova [blockq+ 0], m10
    mova [blockq+128], m5

    SUMSUB_BA w, 14, 12, 5 ; m14=out0, m12=t2a
    SUMSUB_BA w, 1, 3, 5
    PSIGNW m1, [pw_m1] ; m1=out15, m3=t3a

    SUMSUB_BA w, 9, 11, 5
    PSIGNW m9, [pw_m1] ; m9=out1, m11=t10
    SUMSUB_BA w, 6, 4, 5 ; m6=out14, m4=t11

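    ; note: 11585/2^14 ~= sqrt(1/2), so the 11585,11585 rotations below reduce
    ; to a scaled sum/difference of each remaining pair, yielding out4-11;
    ; PSIGNW against pw_m1 just negates every lane, which provides the sign
    ; flips of the ADST output stage without an extra psubw.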
    VP9_UNPACK_MULSUB_2W_4X 4, 11, 11585, 11585, [pd_8192], 5, 10 ; m4=out9, m11=out6
    mova m5, [blockq+128]
    mova [blockq+192], m11
    PSIGNW m15, [pw_m1]
    VP9_UNPACK_MULSUB_2W_4X 15, 0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10

    PSIGNW m3, [pw_m1]
    VP9_UNPACK_MULSUB_2W_4X 3, 12, 11585, 11585, [pd_8192], 10, 11 ; m3=out7, m12=out8
    VP9_UNPACK_MULSUB_2W_4X 8, 7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11, m7=out4

    mova m10, [blockq+ 0]

    SWAP 0, 14, 6, 11, 8, 12, 10
    SWAP 1, 9, 15, 4, 7, 3, 5
    SWAP 5, 9, 15
%endmacro

%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
%macro IADST16_YMM_FN 4
INIT_YMM avx2
cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob
    mova m1, [blockq+ 32]
    mova m2, [blockq+ 64]
    mova m3, [blockq+ 96]
    mova m5, [blockq+160]
    mova m6, [blockq+192]
    mova m7, [blockq+224]
    mova m8, [blockq+256]
    mova m9, [blockq+288]
    mova m10, [blockq+320]
    mova m11, [blockq+352]
    mova m12, [blockq+384]
    mova m13, [blockq+416]
    mova m14, [blockq+448]
    mova m15, [blockq+480]

    VP9_%2_YMM_1D
    TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
                    [blockq+192], [blockq+128], 1
    mova [blockq+ 0], m0
    VP9_%4_YMM_1D

    mova [blockq+224], m7

    ; store
    VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    mova m6, [blockq+192]
    mova m7, [blockq+224]
    VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]
    VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6
    lea dstq, [dstq+2*strideq]

    ; both passes are done, so zero out the block coefficients
    pxor m0, m0
    ZERO_BLOCK blockq, 32, 16, m0
    RET
%endmacro

IADST16_YMM_FN idct, IDCT16, iadst, IADST16
IADST16_YMM_FN iadst, IADST16, idct, IDCT16
IADST16_YMM_FN iadst, IADST16, iadst, IADST16
%endif

;---------------------------------------------------------------------------------------------
; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------

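; note on the macro below: 'pass' is 1 for the first pass (coefficients ->
; tmp buffer, with transpose) and 2 for the second pass (tmp -> dst); 'nnzc'
; is the number of non-zero coefficient rows/columns the caller can guarantee
; from eob. For nnzc <= 8 or <= 16 one input of each initial rotation is known
; to be zero, so it collapses to two pmulhrsw multiplies by a pw_*x2 constant
; instead of a full VP9_UNPACK_MULSUB_2W_4X.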
%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
%if %2 == 1
%assign %%str mmsize
%else
%assign %%str 64
%endif

    ; first do t0-15, this can be done identical to idct16x16
    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1

    ; store everything on stack to make space available for t16-31
    ; we store interleaved with the output of the second half (t16-31)
    ; so we don't need to allocate extra stack space
    mova [tmpq+ 0*%%str], m0 ; t0
    mova [tmpq+ 4*%%str], m1 ; t1
    mova [tmpq+ 8*%%str], m2 ; t2
    mova [tmpq+12*%%str], m3 ; t3
    mova [tmpq+16*%%str], m4 ; t4
    mova [tmpq+20*%%str], m5 ; t5
%if ARCH_X86_64
    mova [tmpq+22*%%str], m10 ; t10
    mova [tmpq+18*%%str], m11 ; t11
    mova [tmpq+14*%%str], m12 ; t12
    mova [tmpq+10*%%str], m13 ; t13
    mova [tmpq+ 6*%%str], m14 ; t14
    mova [tmpq+ 2*%%str], m15 ; t15
%endif

    mova m0, [tmpq+ 30*%%str]
    UNSCRATCH 1, 6, tmpq+26*%%str
    UNSCRATCH 2, 8, tmpq+24*%%str
    UNSCRATCH 3, 9, tmpq+28*%%str
    SUMSUB_BA w, 1, 3, 4 ; t6, t9
    SUMSUB_BA w, 0, 2, 4 ; t7, t8

    mova [tmpq+24*%%str], m1 ; t6
    mova [tmpq+28*%%str], m0 ; t7
    mova [tmpq+30*%%str], m2 ; t8
    mova [tmpq+26*%%str], m3 ; t9

    ; then, secondly, do t16-31
%if %3 <= 8
    mova m4, [%1+ 1*64]
    mova m7, [%1+ 7*64]

    pmulhrsw m1, m4, [pw_16364x2] ;t31
    pmulhrsw m4, [pw_804x2] ;t16

    VP9_UNPACK_MULSUB_2W_4X 5, 0, 1, 4, 16069, 3196, [pd_8192], 6, 2 ; t17, t30

    pmulhrsw m3, m7, [pw_m5520x2] ;t19
    pmulhrsw m7, [pw_15426x2] ;t28

    SCRATCH 4, 13, tmpq+ 1*%%str
    SCRATCH 5, 12, tmpq+15*%%str

    VP9_UNPACK_MULSUB_2W_4X 2, 6, 7, 3, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
%else
    mova m0, [%1+ 1*64]
    mova m1, [%1+15*64]
%if %3 <= 16
    pmulhrsw m5, m0, [pw_16364x2]
    pmulhrsw m0, [pw_804x2]
    pmulhrsw m4, m1, [pw_m11003x2]
    pmulhrsw m1, [pw_12140x2]
%else
    mova m4, [%1+17*64]
    mova m5, [%1+31*64]

    VP9_UNPACK_MULSUB_2W_4X 0, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31
    VP9_UNPACK_MULSUB_2W_4X 4, 1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
%endif
    SUMSUB_BA w, 4, 0, 2
    SUMSUB_BA w, 1, 5, 2

    VP9_UNPACK_MULSUB_2W_4X 5, 0, 16069, 3196, [pd_8192], 2, 3 ; t17, t30

    SCRATCH 4, 13, tmpq+ 1*%%str
    SCRATCH 5, 12, tmpq+15*%%str

    mova m2, [%1+ 7*64]
    mova m3, [%1+ 9*64]
%if %3 <= 16
    pmulhrsw m7, m3, [pw_14811x2]
    pmulhrsw m3, [pw_7005x2]
    pmulhrsw m6, m2, [pw_m5520x2]
    pmulhrsw m2, [pw_15426x2]
%else
    mova m7, [%1+23*64]
    mova m6, [%1+25*64]

    VP9_UNPACK_MULSUB_2W_4X 3, 7, 14811, 7005, [pd_8192], 4, 5 ; t18, t29
    VP9_UNPACK_MULSUB_2W_4X 6, 2, 5520, 15426, [pd_8192], 4, 5 ; t19, t28
%endif
    SUMSUB_BA w, 3, 6, 4
    SUMSUB_BA w, 7, 2, 4

    VP9_UNPACK_MULSUB_2W_4X 2, 6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29
%endif

    UNSCRATCH 5, 12, tmpq+15*%%str
    SUMSUB_BA w, 6, 0, 4
    mova [tmpq+25*%%str], m6 ; t19
    UNSCRATCH 4, 13, tmpq+ 1*%%str
    SUMSUB_BA w, 7, 1, 6
    SUMSUB_BA w, 3, 4, 6
    mova [tmpq+23*%%str], m3 ; t16
    SUMSUB_BA w, 2, 5, 6

    VP9_UNPACK_MULSUB_2W_4X 0, 5, 15137, 6270, [pd_8192], 6, 3 ; t18, t29
    VP9_UNPACK_MULSUB_2W_4X 1, 4, 15137, 6270, [pd_8192], 6, 3 ; t19, t28

    SCRATCH 0, 10, tmpq+ 1*%%str
    SCRATCH 1, 11, tmpq+ 7*%%str
    SCRATCH 2, 9, tmpq+ 9*%%str
    SCRATCH 4, 14, tmpq+15*%%str
    SCRATCH 5, 15, tmpq+17*%%str
    SCRATCH 7, 13, tmpq+31*%%str

%if %3 <= 8
    mova m0, [%1+ 5*64]
    mova m3, [%1+ 3*64]

    pmulhrsw m5, m0, [pw_15893x2] ;t27
    pmulhrsw m0, [pw_3981x2] ;t20

    VP9_UNPACK_MULSUB_2W_4X 1, 4, 5, 0, 9102, 13623, [pd_8192], 7, 2 ; t21, t26

    pmulhrsw m6, m3, [pw_m2404x2] ;t23
    pmulhrsw m3, [pw_16207x2] ;t24

    SCRATCH 5, 8, tmpq+ 5*%%str
    SCRATCH 4, 12, tmpq+11*%%str

    VP9_UNPACK_MULSUB_2W_4X 7, 2, 3, 6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
%else
    mova m4, [%1+ 5*64]
    mova m5, [%1+11*64]
%if %3 <= 16
    pmulhrsw m1, m4, [pw_15893x2]
    pmulhrsw m4, [pw_3981x2]
    pmulhrsw m0, m5, [pw_m8423x2]
    pmulhrsw m5, [pw_14053x2]
%else
    mova m0, [%1+21*64]
    mova m1, [%1+27*64]

    VP9_UNPACK_MULSUB_2W_4X 4, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27
    VP9_UNPACK_MULSUB_2W_4X 0, 5, 8423, 14053, [pd_8192], 2, 3 ; t21, t26
%endif
    SUMSUB_BA w, 0, 4, 2
    SUMSUB_BA w, 5, 1, 2

    VP9_UNPACK_MULSUB_2W_4X 1, 4, 9102, 13623, [pd_8192], 2, 3 ; t21, t26

    SCRATCH 5, 8, tmpq+ 5*%%str
    SCRATCH 4, 12, tmpq+11*%%str

    mova m7, [%1+ 3*64]
    mova m6, [%1+13*64]
%if %3 <= 16
    pmulhrsw m3, m6, [pw_13160x2]
    pmulhrsw m6, [pw_9760x2]
    pmulhrsw m2, m7, [pw_m2404x2]
    pmulhrsw m7, [pw_16207x2]
%else
    mova m2, [%1+29*64]
    mova m3, [%1+19*64]
    VP9_UNPACK_MULSUB_2W_4X 6, 3, 13160, 9760, [pd_8192], 4, 5 ; t22, t25
    VP9_UNPACK_MULSUB_2W_4X 2, 7, 2404, 16207, [pd_8192], 4, 5 ; t23, t24
%endif
    SUMSUB_BA w, 6, 2, 4
    SUMSUB_BA w, 3, 7, 4

    VP9_UNPACK_MULSUB_2W_4X 7, 2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25
%endif

    ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
    ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31

    UNSCRATCH 4, 12, tmpq+11*%%str
    SUMSUB_BA w, 0, 6, 5
    SUMSUB_BA w, 4, 2, 5
    UNSCRATCH 5, 8, tmpq+ 5*%%str
    SCRATCH 4, 8, tmpq+11*%%str
    SUMSUB_BA w, 1, 7, 4
    SUMSUB_BA w, 5, 3, 4
    SCRATCH 5, 12, tmpq+ 5*%%str

    VP9_UNPACK_MULSUB_2W_4X 3, 6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27
    VP9_UNPACK_MULSUB_2W_4X 2, 7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26

    ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
    ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31

    UNSCRATCH 5, 9, tmpq+ 9*%%str
    mova m4, [tmpq+23*%%str] ; t16
%if ARCH_X86_64
    SUMSUB_BA w, 1, 5, 9
    SUMSUB_BA w, 0, 4, 9
%else
    SUMSUB_BADC w, 1, 5, 0, 4
%endif
    mova [tmpq+29*%%str], m1 ; t17
    mova [tmpq+21*%%str], m0 ; t16
    UNSCRATCH 0, 10, tmpq+ 1*%%str
    UNSCRATCH 1, 11, tmpq+ 7*%%str
%if ARCH_X86_64
    SUMSUB_BA w, 2, 0, 9
    SUMSUB_BA w, 3, 1, 9
%else
    SUMSUB_BADC w, 2, 0, 3, 1
%endif
    mova [tmpq+ 9*%%str], m2 ; t18
    mova [tmpq+13*%%str], m3 ; t19
    SCRATCH 0, 10, tmpq+23*%%str
    SCRATCH 1, 11, tmpq+27*%%str

    UNSCRATCH 2, 14, tmpq+15*%%str
    UNSCRATCH 3, 15, tmpq+17*%%str
    SUMSUB_BA w, 6, 2, 0
    SUMSUB_BA w, 7, 3, 0
    SCRATCH 6, 14, tmpq+ 3*%%str
    SCRATCH 7, 15, tmpq+ 7*%%str

    UNSCRATCH 0, 8, tmpq+11*%%str
    mova m1, [tmpq+25*%%str] ; t19
    UNSCRATCH 6, 12, tmpq+ 5*%%str
    UNSCRATCH 7, 13, tmpq+31*%%str
%if ARCH_X86_64
    SUMSUB_BA w, 0, 1, 9
    SUMSUB_BA w, 6, 7, 9
%else
    SUMSUB_BADC w, 0, 1, 6, 7
%endif

    ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
    ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31

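    ; the pmulhrsw shortcut below is disabled (%if 0), presumably because the
    ; preceding sumsubs can overflow int16 for some inputs, the same issue
    ; noted for the iadst16 above; the VP9_UNPACK_MULSUB_2W_4X variant keeps
    ; its intermediates in 32 bits.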
%if 0; cpuflag(ssse3)
%if ARCH_X86_64
    SUMSUB_BA w, 4, 7, 8
    SUMSUB_BA w, 5, 1, 8
%else
    SUMSUB_BADC w, 4, 7, 5, 1
%endif

    pmulhrsw m7, [pw_11585x2]
    pmulhrsw m4, [pw_11585x2]
    pmulhrsw m1, [pw_11585x2]
    pmulhrsw m5, [pw_11585x2]

    mova [tmpq+ 5*%%str], m7 ; t23
    SCRATCH 1, 13, tmpq+25*%%str
    UNSCRATCH 7, 10, tmpq+23*%%str
    UNSCRATCH 1, 11, tmpq+27*%%str

%if ARCH_X86_64
    SUMSUB_BA w, 7, 3, 10
    SUMSUB_BA w, 1, 2, 10
%else
    SUMSUB_BADC w, 7, 3, 1, 2
%endif

    pmulhrsw m3, [pw_11585x2]
    pmulhrsw m7, [pw_11585x2]
    pmulhrsw m2, [pw_11585x2]
    pmulhrsw m1, [pw_11585x2]
%else
    SCRATCH 0, 8, tmpq+15*%%str
    SCRATCH 6, 9, tmpq+17*%%str
    VP9_UNPACK_MULSUB_2W_4X 7, 4, 11585, 11585, [pd_8192], 0, 6
    mova [tmpq+ 5*%%str], m7 ; t23
    UNSCRATCH 7, 10, tmpq+23*%%str
    VP9_UNPACK_MULSUB_2W_4X 1, 5, 11585, 11585, [pd_8192], 0, 6
    SCRATCH 1, 13, tmpq+25*%%str
    UNSCRATCH 1, 11, tmpq+27*%%str
    VP9_UNPACK_MULSUB_2W_4X 3, 7, 11585, 11585, [pd_8192], 0, 6
    VP9_UNPACK_MULSUB_2W_4X 2, 1, 11585, 11585, [pd_8192], 0, 6
    UNSCRATCH 0, 8, tmpq+15*%%str
    UNSCRATCH 6, 9, tmpq+17*%%str
%endif

    ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
    ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31

    ; then do final pass to sumsub+store the two halves
%if %2 == 1
    mova [tmpq+17*%%str], m2 ; t20
    mova [tmpq+ 1*%%str], m3 ; t21
%if ARCH_X86_64
    mova [tmpq+25*%%str], m13 ; t22

    mova m8, [tmpq+ 0*%%str] ; t0
    mova m9, [tmpq+ 4*%%str] ; t1
    mova m12, [tmpq+ 8*%%str] ; t2
    mova m11, [tmpq+12*%%str] ; t3
    mova m2, [tmpq+16*%%str] ; t4
    mova m3, [tmpq+20*%%str] ; t5
    mova m13, [tmpq+24*%%str] ; t6

    SUMSUB_BA w, 6, 8, 10
    mova [tmpq+ 3*%%str], m8 ; t15
    SUMSUB_BA w, 0, 9, 8
    SUMSUB_BA w, 15, 12, 8
    SUMSUB_BA w, 14, 11, 8
    SUMSUB_BA w, 1, 2, 8
    SUMSUB_BA w, 7, 3, 8
    SUMSUB_BA w, 5, 13, 8
    mova m10, [tmpq+28*%%str] ; t7
    SUMSUB_BA w, 4, 10, 8
%if cpuflag(avx2)
    ; the awkward thing about this idct is that the final pass does the outermost
    ; interleave sumsubs (t0/31, t1/30, etc) but the tN for the 16x16 need
    ; to be sequential, which means I need to load/store half of the sumsub
    ; intermediates back to/from memory to get a 16x16 transpose going...
    ; This would be easier if we had more (e.g. 32) YMM regs here.
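    ; in other words: out[n] = t[n] + t[31-n] and out[31-n] = t[n] - t[31-n],
    ; while the 16x16 transpose wants out0..15 and out16..31 as two
    ; sequential groups.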
    mova [tmpq+ 7*%%str], m9
    mova [tmpq+11*%%str], m12
    mova [tmpq+15*%%str], m11
    mova [tmpq+19*%%str], m2
    mova [tmpq+23*%%str], m3
    mova [tmpq+27*%%str], m13
    mova [tmpq+31*%%str], m10
    mova [tmpq+12*%%str], m5

    mova m13, [tmpq+30*%%str] ; t8
    mova m12, [tmpq+26*%%str] ; t9
    mova m11, [tmpq+22*%%str] ; t10
    mova m10, [tmpq+18*%%str] ; t11
    mova m9, [tmpq+17*%%str] ; t20
    mova m8, [tmpq+ 1*%%str] ; t21
    mova m3, [tmpq+25*%%str] ; t22
    mova m2, [tmpq+ 5*%%str] ; t23

    SUMSUB_BA w, 9, 10, 5
    SUMSUB_BA w, 8, 11, 5
    SUMSUB_BA w, 3, 12, 5
    SUMSUB_BA w, 2, 13, 5
    mova [tmpq+ 1*%%str], m10
    mova [tmpq+ 5*%%str], m11
    mova [tmpq+17*%%str], m12
    mova [tmpq+25*%%str], m13

    mova m13, [tmpq+14*%%str] ; t12
    mova m12, [tmpq+10*%%str] ; t13
    mova m11, [tmpq+ 9*%%str] ; t18
    mova m10, [tmpq+13*%%str] ; t19

    SUMSUB_BA w, 11, 12, 5
    SUMSUB_BA w, 10, 13, 5
    mova [tmpq+ 9*%%str], m13
    mova [tmpq+13*%%str], m12
    mova [tmpq+10*%%str], m10
    mova [tmpq+14*%%str], m11

    mova m13, [tmpq+ 6*%%str] ; t14
    mova m12, [tmpq+ 2*%%str] ; t15
    mova m11, [tmpq+21*%%str] ; t16
    mova m10, [tmpq+29*%%str] ; t17
    SUMSUB_BA w, 11, 12, 5
    SUMSUB_BA w, 10, 13, 5
    mova [tmpq+21*%%str], m12
    mova [tmpq+29*%%str], m13
    mova m12, [tmpq+10*%%str]
    mova m13, [tmpq+14*%%str]

    TRANSPOSE16x16W 6, 0, 15, 14, 1, 7, 5, 4, \
                    2, 3, 8, 9, 12, 13, 10, 11, \
                    [tmpq+12*%%str], [tmpq+ 8*%%str], 1
    mova [tmpq+ 0*%%str], m6
    mova [tmpq+ 2*%%str], m0
    mova [tmpq+ 4*%%str], m15
    mova [tmpq+ 6*%%str], m14
    mova [tmpq+10*%%str], m7
    mova [tmpq+12*%%str], m5
    mova [tmpq+14*%%str], m4
    mova [tmpq+16*%%str], m2
    mova [tmpq+18*%%str], m3
    mova [tmpq+20*%%str], m8
    mova [tmpq+22*%%str], m9
    mova [tmpq+24*%%str], m12
    mova [tmpq+26*%%str], m13
    mova [tmpq+28*%%str], m10
    mova [tmpq+30*%%str], m11

    mova m0, [tmpq+21*%%str]
    mova m1, [tmpq+29*%%str]
    mova m2, [tmpq+13*%%str]
    mova m3, [tmpq+ 9*%%str]
    mova m4, [tmpq+ 1*%%str]
    mova m5, [tmpq+ 5*%%str]
    mova m7, [tmpq+25*%%str]
    mova m8, [tmpq+31*%%str]
    mova m9, [tmpq+27*%%str]
    mova m10, [tmpq+23*%%str]
    mova m11, [tmpq+19*%%str]
    mova m12, [tmpq+15*%%str]
    mova m13, [tmpq+11*%%str]
    mova m14, [tmpq+ 7*%%str]
    mova m15, [tmpq+ 3*%%str]
    TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, \
                    8, 9, 10, 11, 12, 13, 14, 15, \
                    [tmpq+17*%%str], [tmpq+ 9*%%str], 1
    mova [tmpq+ 1*%%str], m0
    mova [tmpq+ 3*%%str], m1
    mova [tmpq+ 5*%%str], m2
    mova [tmpq+ 7*%%str], m3
    mova [tmpq+11*%%str], m5
    mova [tmpq+13*%%str], m6
    mova [tmpq+15*%%str], m7
    mova [tmpq+17*%%str], m8
    mova [tmpq+19*%%str], m9
    mova [tmpq+21*%%str], m10
    mova [tmpq+23*%%str], m11
    mova [tmpq+25*%%str], m12
    mova [tmpq+27*%%str], m13
    mova [tmpq+29*%%str], m14
    mova [tmpq+31*%%str], m15
%else ; !avx2
    TRANSPOSE8x8W 6, 0, 15, 14, 1, 7, 5, 4, 8
    mova [tmpq+ 0*%%str], m6
    mova [tmpq+ 4*%%str], m0
    mova [tmpq+ 8*%%str], m15
    mova [tmpq+12*%%str], m14
    mova [tmpq+16*%%str], m1
    mova [tmpq+20*%%str], m7
    mova [tmpq+24*%%str], m5
    mova [tmpq+28*%%str], m4

    mova m8, [tmpq+ 3*%%str] ; t15
    TRANSPOSE8x8W 10, 13, 3, 2, 11, 12, 9, 8, 0
    mova [tmpq+ 3*%%str], m10
    mova [tmpq+ 7*%%str], m13
    mova [tmpq+11*%%str], m3
    mova [tmpq+15*%%str], m2
    mova [tmpq+19*%%str], m11
    mova [tmpq+23*%%str], m12
    mova [tmpq+27*%%str], m9
    mova [tmpq+31*%%str], m8

    mova m15, [tmpq+30*%%str] ; t8
    mova m14, [tmpq+26*%%str] ; t9
    mova m13, [tmpq+22*%%str] ; t10
    mova m12, [tmpq+18*%%str] ; t11
    mova m11, [tmpq+14*%%str] ; t12
    mova m10, [tmpq+10*%%str] ; t13
    mova m9, [tmpq+ 6*%%str] ; t14
    mova m8, [tmpq+ 2*%%str] ; t15
    mova m7, [tmpq+21*%%str] ; t16
    mova m6, [tmpq+29*%%str] ; t17
    mova m5, [tmpq+ 9*%%str] ; t18
    mova m4, [tmpq+13*%%str] ; t19
    mova m3, [tmpq+17*%%str] ; t20
    mova m2, [tmpq+ 1*%%str] ; t21
    mova m1, [tmpq+25*%%str] ; t22

    SUMSUB_BA w, 7, 8, 0
    mova [tmpq+ 2*%%str], m8
    mova m0, [tmpq+ 5*%%str] ; t23
    SUMSUB_BA w, 6, 9, 8
    SUMSUB_BA w, 5, 10, 8
    SUMSUB_BA w, 4, 11, 8
    SUMSUB_BA w, 3, 12, 8
    SUMSUB_BA w, 2, 13, 8
    SUMSUB_BA w, 1, 14, 8
    SUMSUB_BA w, 0, 15, 8

    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
    mova [tmpq+ 1*%%str], m0
    mova [tmpq+ 5*%%str], m1
    mova [tmpq+ 9*%%str], m2
    mova [tmpq+13*%%str], m3
    mova [tmpq+17*%%str], m4
    mova [tmpq+21*%%str], m5
    mova [tmpq+25*%%str], m6
    mova [tmpq+29*%%str], m7

    mova m8, [tmpq+ 2*%%str]
    TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
    mova [tmpq+ 2*%%str], m8
    mova [tmpq+ 6*%%str], m9
    mova [tmpq+10*%%str], m10
    mova [tmpq+14*%%str], m11
    mova [tmpq+18*%%str], m12
    mova [tmpq+22*%%str], m13
    mova [tmpq+26*%%str], m14
    mova [tmpq+30*%%str], m15
%endif ; avx2
%else
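    ; x86-32 pass-1 path: only 8 XMM registers are available, so the outputs
    ; are sumsub'd and transposed in 8x8 blocks, spilling through the tmp
    ; buffer instead of extra registers.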
    mova m2, [tmpq+24*%%str] ; t6
    mova m3, [tmpq+28*%%str] ; t7
    SUMSUB_BADC w, 5, 2, 4, 3
    mova [tmpq+24*%%str], m5
    mova [tmpq+23*%%str], m2
    mova [tmpq+28*%%str], m4
    mova [tmpq+19*%%str], m3

    mova m2, [tmpq+16*%%str] ; t4
    mova m3, [tmpq+20*%%str] ; t5
    SUMSUB_BA w, 1, 2, 5
    SUMSUB_BA w, 7, 3, 5
    mova [tmpq+15*%%str], m2
    mova [tmpq+11*%%str], m3

    mova m2, [tmpq+ 0*%%str] ; t0
    mova m3, [tmpq+ 4*%%str] ; t1
    SUMSUB_BA w, 6, 2, 5
    SUMSUB_BA w, 0, 3, 5
    mova [tmpq+31*%%str], m2
    mova [tmpq+27*%%str], m3

    mova m2, [tmpq+ 8*%%str] ; t2
    mova m3, [tmpq+12*%%str] ; t3
    mova m5, [tmpq+ 7*%%str]
    mova m4, [tmpq+ 3*%%str]
    SUMSUB_BADC w, 5, 2, 4, 3
    mova [tmpq+ 7*%%str], m2
    mova [tmpq+ 3*%%str], m3

    mova m3, [tmpq+28*%%str]
    TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1
    mova [tmpq+ 0*%%str], m6
    mova [tmpq+ 4*%%str], m0
    mova [tmpq+ 8*%%str], m5
    mova [tmpq+12*%%str], m4
    mova [tmpq+20*%%str], m7
    mova [tmpq+24*%%str], m2
    mova [tmpq+28*%%str], m3

    mova m6, [tmpq+19*%%str]
    mova m0, [tmpq+23*%%str]
    mova m5, [tmpq+11*%%str]
    mova m4, [tmpq+15*%%str]
    mova m1, [tmpq+ 3*%%str]
    mova m7, [tmpq+ 7*%%str]
    mova m3, [tmpq+31*%%str]
    TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1
    mova [tmpq+ 3*%%str], m6
    mova [tmpq+ 7*%%str], m0
    mova [tmpq+11*%%str], m5
    mova [tmpq+15*%%str], m4
    mova [tmpq+23*%%str], m7
    mova [tmpq+27*%%str], m2
    mova [tmpq+31*%%str], m3

    mova m1, [tmpq+ 6*%%str] ; t14
    mova m0, [tmpq+ 2*%%str] ; t15
    mova m7, [tmpq+21*%%str] ; t16
    mova m6, [tmpq+29*%%str] ; t17
    SUMSUB_BA w, 7, 0, 2
    SUMSUB_BA w, 6, 1, 2
    mova [tmpq+29*%%str], m7
    mova [tmpq+ 2*%%str], m0
    mova [tmpq+21*%%str], m6
    mova [tmpq+ 6*%%str], m1

    mova m1, [tmpq+14*%%str] ; t12
    mova m0, [tmpq+10*%%str] ; t13
    mova m5, [tmpq+ 9*%%str] ; t18
    mova m4, [tmpq+13*%%str] ; t19
    SUMSUB_BA w, 5, 0, 2
    SUMSUB_BA w, 4, 1, 2
    mova [tmpq+10*%%str], m0
    mova [tmpq+14*%%str], m1

    mova m1, [tmpq+22*%%str] ; t10
    mova m0, [tmpq+18*%%str] ; t11
    mova m3, [tmpq+17*%%str] ; t20
    mova m2, [tmpq+ 1*%%str] ; t21
    SUMSUB_BA w, 3, 0, 6
    SUMSUB_BA w, 2, 1, 6
    mova [tmpq+18*%%str], m0
    mova [tmpq+22*%%str], m1

    mova m7, [tmpq+30*%%str] ; t8
    mova m6, [tmpq+26*%%str] ; t9
    mova m1, [tmpq+25*%%str] ; t22
    mova m0, [tmpq+ 5*%%str] ; t23
    SUMSUB_BADC w, 1, 6, 0, 7
    mova [tmpq+26*%%str], m6
    mova [tmpq+30*%%str], m7

    mova m7, [tmpq+29*%%str]
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1
    mova [tmpq+ 1*%%str], m0
    mova [tmpq+ 5*%%str], m1
    mova [tmpq+ 9*%%str], m2
    mova [tmpq+13*%%str], m3
    mova [tmpq+21*%%str], m5
    mova [tmpq+25*%%str], m6
    mova [tmpq+29*%%str], m7

    mova m0, [tmpq+ 2*%%str]
    mova m1, [tmpq+ 6*%%str]
    mova m2, [tmpq+10*%%str]
    mova m3, [tmpq+14*%%str]
    mova m4, [tmpq+18*%%str]
    mova m5, [tmpq+22*%%str]
    mova m7, [tmpq+30*%%str]
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1
    mova [tmpq+ 2*%%str], m0
    mova [tmpq+ 6*%%str], m1
    mova [tmpq+10*%%str], m2
    mova [tmpq+14*%%str], m3
    mova [tmpq+22*%%str], m5
    mova [tmpq+26*%%str], m6
    mova [tmpq+30*%%str], m7
%endif
%else
    ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
    ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
    ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
    ; t20-22 is in m4-6
    ; t24-31 is in m8-15

%if cpuflag(ssse3)
%define ROUND_REG [pw_512]
%else
%define ROUND_REG [pw_32]
%endif

%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
    SUMSUB_BA w, %4, %1, %5
    SUMSUB_BA w, %3, %2, %5
    VP9_IDCT8_WRITEx2 %4, %3, %5, %6, %7, ROUND_REG, 6
%if %8 == 1
    add dstq, stride2q
%endif
    VP9_IDCT8_WRITEx2 %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq
%if %8 == 1
    sub dst_endq, stride2q
%endif
%endmacro

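    ; %%STORE_2X2 performs the final out = t +/- t' butterflies for two row
    ; pairs and writes one pair forward from dstq and the mirrored pair
    ; backward from dst_endq (which the callers point at dstq + 30 lines),
    ; so the 32 output rows are filled from both ends towards the middle.
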
%if ARCH_X86_64
    pxor m10, m10

    ; store t0-1 and t30-31
    mova m8, [tmpq+ 0*%%str]
    mova m9, [tmpq+ 4*%%str]
    %%STORE_2X2 8, 9, 0, 6, 12, 11, 10

    ; store t2-3 and t28-29
    mova m8, [tmpq+ 8*%%str]
    mova m9, [tmpq+12*%%str]
    %%STORE_2X2 8, 9, 14, 15, 12, 11, 10

    ; store t4-5 and t26-27
    mova m8, [tmpq+16*%%str]
    mova m9, [tmpq+20*%%str]
    %%STORE_2X2 8, 9, 7, 1, 12, 11, 10

    ; store t6-7 and t24-25
    mova m8, [tmpq+24*%%str]
    mova m9, [tmpq+28*%%str]
    %%STORE_2X2 8, 9, 4, 5, 12, 11, 10

    ; store t8-9 and t22-23
    mova m8, [tmpq+30*%%str]
    mova m9, [tmpq+26*%%str]
    mova m0, [tmpq+ 5*%%str]
    %%STORE_2X2 8, 9, 13, 0, 12, 11, 10

    ; store t10-11 and t20-21
    mova m8, [tmpq+22*%%str]
    mova m9, [tmpq+18*%%str]
    %%STORE_2X2 8, 9, 2, 3, 12, 11, 10

    ; store t12-13 and t18-19
    mova m8, [tmpq+14*%%str]
    mova m9, [tmpq+10*%%str]
    mova m5, [tmpq+13*%%str]
    mova m4, [tmpq+ 9*%%str]
    %%STORE_2X2 8, 9, 4, 5, 12, 11, 10

    ; store t14-17
    mova m8, [tmpq+ 6*%%str]
    mova m9, [tmpq+ 2*%%str]
    mova m5, [tmpq+29*%%str]
    mova m4, [tmpq+21*%%str]
    %%STORE_2X2 8, 9, 4, 5, 12, 11, 10, 0

    SWAP 1, 10 ; zero
%else
    mova [tmpq+ 1*%%str], m1
    mova [tmpq+11*%%str], m2
    mova [tmpq+15*%%str], m3
    mova [tmpq+17*%%str], m4
    mova [tmpq+19*%%str], m5
    pxor m1, m1

    ; store t0-1 and t30-31
    mova m2, [tmpq+ 0*%%str]
    mova m3, [tmpq+ 4*%%str]
    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1

    ; store t2-3 and t28-29
    mova m2, [tmpq+ 8*%%str]
    mova m3, [tmpq+12*%%str]
    mova m0, [tmpq+ 3*%%str]
    mova m6, [tmpq+ 7*%%str]
    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1

    ; store t4-5 and t26-27
    mova m2, [tmpq+16*%%str]
    mova m3, [tmpq+20*%%str]
    mova m0, [tmpq+ 1*%%str]
    %%STORE_2X2 2, 3, 7, 0, 4, 5, 1

    ; store t6-7 and t24-25
    mova m2, [tmpq+24*%%str]
    mova m3, [tmpq+28*%%str]
    mova m0, [tmpq+17*%%str]
    mova m6, [tmpq+19*%%str]
    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1

    ; store t8-9 and t22-23
    mova m2, [tmpq+30*%%str]
    mova m3, [tmpq+26*%%str]
    mova m0, [tmpq+25*%%str]
    mova m6, [tmpq+ 5*%%str]
    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1

    ; store t10-11 and t20-21
    mova m2, [tmpq+22*%%str]
    mova m3, [tmpq+18*%%str]
    mova m0, [tmpq+11*%%str]
    mova m6, [tmpq+15*%%str]
    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1

    ; store t12-13 and t18-19
    mova m2, [tmpq+14*%%str]
    mova m3, [tmpq+10*%%str]
    mova m6, [tmpq+13*%%str]
    mova m0, [tmpq+ 9*%%str]
    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1

    ; store t14-17
    mova m2, [tmpq+ 6*%%str]
    mova m3, [tmpq+ 2*%%str]
    mova m6, [tmpq+29*%%str]
    mova m0, [tmpq+21*%%str]
    %%STORE_2X2 2, 3, 0, 6, 4, 5, 1, 0
%endif
%undef ROUND_REG
%endif
%endmacro

%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
INIT_XMM %1
cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
    movifnidn eobd, dword eobm
%if cpuflag(ssse3)
    cmp eobd, 135
    jg .idctfull
    cmp eobd, 34
    jg .idct16x16
    cmp eobd, 1
    jg .idct8x8
%else
    cmp eobd, 1
    jg .idctfull
%endif

    ; dc-only case
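    ; (the scalar path below computes, in C terms:
    ;      dc = (dc * 11585 + 8192) >> 14;   // 1st 1-D pass
    ;      dc = (dc * 11585 + 8192) >> 14;   // 2nd 1-D pass
    ;      dc = (dc + 32) >> 6;              // output rounding
    ;  the SSSE3 path gets the same result from three pmulhrsw, since
    ;  pmulhrsw by pw_11585x2 is (x*11585 + 8192) >> 14 and pmulhrsw by
    ;  pw_512 is (x + 32) >> 6)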
    movifnidn blockq, blockmp
    movifnidn dstq, dstmp
    movifnidn strideq, stridemp
%if cpuflag(ssse3)
    movd m0, [blockq]
    mova m1, [pw_11585x2]
    pmulhrsw m0, m1
    pmulhrsw m0, m1
%else
    DEFINE_ARGS dst, stride, block, coef
    movsx coefd, word [blockq]
    imul coefd, 11585
    add coefd, 8192
    sar coefd, 14
    imul coefd, 11585
    add coefd, (32 << 14) + 8192
    sar coefd, 14 + 6
    movd m0, coefd
%endif
    SPLATW m0, m0, q0000
%if cpuflag(ssse3)
    pmulhrsw m0, [pw_512]
%endif
    pxor m5, m5
    movd [blockq], m5
%rep 31
    VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
    add dstq, strideq
%endrep
    VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
    RET

%if ARCH_X86_64
    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
%else
%define dst_bakq r0mp
%endif
%if cpuflag(ssse3)
.idct8x8:
%if ARCH_X86_32
    DEFINE_ARGS block, u1, u2, u3, u4, tmp
    mov blockq, r2mp
%endif
    mov tmpq, rsp
    VP9_IDCT32_1D blockq, 1, 8

%if ARCH_X86_32
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov strideq, r1mp
%define cntd dword r3m
%endif
    mov stride30q, strideq ; stride
    lea stride2q, [strideq*2] ; stride*2
    shl stride30q, 5 ; stride*32
    mov cntd, 4
    sub stride30q, stride2q ; stride*30
.loop2_8x8:
    mov dstq, dst_bakq
    lea dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D tmpq, 2, 8
    add dst_bakq, 8
    add tmpq, 16
    dec cntd
    jg .loop2_8x8

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov blockq, r2mp
%endif
    ZERO_BLOCK blockq, 64, 8, m1
    RET

.idct16x16:
%if ARCH_X86_32
    DEFINE_ARGS block, tmp, cnt
    mov blockq, r2mp
%endif
    mov cntd, 2
    mov tmpq, rsp
.loop1_16x16:
    VP9_IDCT32_1D blockq, 1, 16
    add blockq, 16
    add tmpq, 512
    dec cntd
    jg .loop1_16x16

%if ARCH_X86_64
    sub blockq, 32
%else
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov strideq, r1mp
%define cntd dword r3m
%endif

    mov stride30q, strideq ; stride
    lea stride2q, [strideq*2] ; stride*2
    shl stride30q, 5 ; stride*32
    mov cntd, 4
    mov tmpq, rsp
    sub stride30q, stride2q ; stride*30
.loop2_16x16:
    mov dstq, dst_bakq
    lea dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D tmpq, 2, 16
    add dst_bakq, 8
    add tmpq, 16
    dec cntd
    jg .loop2_16x16

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov blockq, r2mp
%endif
    ZERO_BLOCK blockq, 64, 16, m1
    RET
%endif

.idctfull:
%if ARCH_X86_32
    DEFINE_ARGS block, tmp, cnt
    mov blockq, r2mp
%endif
    mov cntd, 4
    mov tmpq, rsp
.loop1_full:
    VP9_IDCT32_1D blockq, 1
    add blockq, 16
    add tmpq, 512
    dec cntd
    jg .loop1_full

%if ARCH_X86_64
    sub blockq, 64
%else
    DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
    mov strideq, r1mp
%define cntd dword r3m
%endif

    mov stride30q, strideq ; stride
    lea stride2q, [strideq*2] ; stride*2
    shl stride30q, 5 ; stride*32
    mov cntd, 4
    mov tmpq, rsp
    sub stride30q, stride2q ; stride*30
.loop2_full:
    mov dstq, dst_bakq
    lea dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D tmpq, 2
    add dst_bakq, 8
    add tmpq, 16
    dec cntd
    jg .loop2_full

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
%if ARCH_X86_32
    DEFINE_ARGS block
    mov blockq, r2mp
%endif
    ZERO_BLOCK blockq, 64, 32, m1
    RET
%endmacro

VP9_IDCT_IDCT_32x32_ADD_XMM sse2
VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
VP9_IDCT_IDCT_32x32_ADD_XMM avx

; this is almost identical to VP9_STORE_2X, but it does two rows
; for slightly improved interleaving, and it omits vpermq since the
; input is DC so all values are identical
%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero
    mova m%2, [dstq]
    mova m%4, [dstq+strideq]
    punpckhbw m%3, m%2, m%6
    punpcklbw m%2, m%6
    punpckhbw m%5, m%4, m%6
    punpcklbw m%4, m%6
    paddw m%3, m%1
    paddw m%2, m%1
    paddw m%5, m%1
    paddw m%4, m%1
    packuswb m%2, m%3
    packuswb m%4, m%5
    mova [dstq+strideq*0], m%2
    mova [dstq+strideq*1], m%4
%endmacro

%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
    cmp eobd, 135
    jg .idctfull
    cmp eobd, 1
    jg .idct16x16
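    ; eob thresholds as in the xmm versions above: eob 1 means only the DC
    ; coefficient is present, and eob <= 135 should mean all non-zero
    ; coefficients lie in the top-left 16x16, so the first pass only needs
    ; to transform 16 rows/columns (nnzc=16)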

    ; dc-only case
    mova m1, [pw_11585x2]
    vpbroadcastw m0, [blockq]
    pmulhrsw m0, m1
    pmulhrsw m0, m1
    pxor m5, m5
    pmulhrsw m0, [pw_512]
    movd [blockq], xm5

    DEFINE_ARGS dst, stride, cnt
    mov cntd, 16
.loop_dc:
    VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5
    lea dstq, [dstq+2*strideq]
    dec cntd
    jg .loop_dc
    RET

    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
.idct16x16:
    mov tmpq, rsp
    VP9_IDCT32_1D blockq, 1, 16

    mov stride30q, strideq ; stride
    lea stride2q, [strideq*2] ; stride*2
    shl stride30q, 5 ; stride*32
    mov cntd, 2
    sub stride30q, stride2q ; stride*30
.loop2_16x16:
    mov dstq, dst_bakq
    lea dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D tmpq, 2, 16
    add dst_bakq, 16
    add tmpq, 32
    dec cntd
    jg .loop2_16x16

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK blockq, 64, 16, m1
    RET

.idctfull:
    mov cntd, 2
    mov tmpq, rsp
.loop1_full:
    VP9_IDCT32_1D blockq, 1
    add blockq, 32
    add tmpq, 1024
    dec cntd
    jg .loop1_full

    sub blockq, 64

    mov stride30q, strideq ; stride
    lea stride2q, [strideq*2] ; stride*2
    shl stride30q, 5 ; stride*32
    mov cntd, 2
    mov tmpq, rsp
    sub stride30q, stride2q ; stride*30
.loop2_full:
    mov dstq, dst_bakq
    lea dst_endq, [dstq+stride30q]
    VP9_IDCT32_1D tmpq, 2
    add dst_bakq, 16
    add tmpq, 32
    dec cntd
    jg .loop2_full

    ; at the end of the loop, m1 should still be zero
    ; use that to zero out block coefficients
    ZERO_BLOCK blockq, 64, 32, m1
    RET
%endif