vp9itxfm_16bpp.asm
;******************************************************************************
;* VP9 inverse transform x86 SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "vp9itxfm_template.asm"

SECTION_RODATA

cextern pw_8
cextern pw_1023
cextern pw_2048
cextern pw_4095
cextern pw_m1
cextern pd_1
cextern pd_16
cextern pd_32
cextern pd_8192

pd_8: times 4 dd 8
pd_3fff: times 4 dd 0x3fff

cextern pw_11585x2

cextern pw_5283_13377
cextern pw_9929_13377
cextern pw_15212_m13377
cextern pw_15212_9929
cextern pw_m5283_m15212
cextern pw_13377x2
cextern pw_m13377_13377
cextern pw_13377_0

pw_9929_m5283: times 4 dw 9929, -5283

%macro COEF_PAIR 2-3
cextern pw_m%1_%2
cextern pw_%2_%1
%if %0 == 3
cextern pw_m%1_m%2
%if %1 != %2
cextern pw_m%2_%1
cextern pw_%1_%2
%endif
%endif
%endmacro

COEF_PAIR 2404, 16207
COEF_PAIR 3196, 16069, 1
COEF_PAIR 4756, 15679
COEF_PAIR 5520, 15426
COEF_PAIR 6270, 15137, 1
COEF_PAIR 8423, 14053
COEF_PAIR 10394, 12665
COEF_PAIR 11003, 12140
COEF_PAIR 11585, 11585, 1
COEF_PAIR 13160, 9760
COEF_PAIR 13623, 9102, 1
COEF_PAIR 14449, 7723
COEF_PAIR 14811, 7005
COEF_PAIR 15893, 3981
COEF_PAIR 16305, 1606
COEF_PAIR 16364, 804

default_8x8:
times 12 db 1
times 52 db 2
row_8x8:
times 18 db 1
times 46 db 2
col_8x8:
times 6 db 1
times 58 db 2
default_16x16:
times 10 db 1
times 28 db 2
times 51 db 3
times 167 db 4
row_16x16:
times 21 db 1
times 45 db 2
times 60 db 3
times 130 db 4
col_16x16:
times 5 db 1
times 12 db 2
times 25 db 3
times 214 db 4
default_32x32:
times 9 db 1
times 25 db 2
times 36 db 3
times 65 db 4
times 105 db 5
times 96 db 6
times 112 db 7
times 576 db 8

SECTION .text

%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
    mova      m%3, [%7]
    mova      m%4, [%7+strideq]
    paddw     m%3, m%1
    paddw     m%4, m%2
    pmaxsw    m%3, m%5
    pmaxsw    m%4, m%5
    pminsw    m%3, m%6
    pminsw    m%4, m%6
    mova      [%7], m%3
    mova      [%7+strideq], m%4
%endmacro

%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
%assign %%y 0
%rep %3
%assign %%x 0
%rep %3*4/mmsize
    mova      [%1+%%y+%%x], %4
%assign %%x (%%x+mmsize)
%endrep
%assign %%y (%%y+%2)
%endrep
%endmacro

; the input coefficients are scaled up by 2 bits (which we downscale immediately
; in the iwht), and are otherwise orthonormally increased by 1 bit per iwht_1d.
; therefore, a diff of 10-12+sign bit will fit in 12-14+sign bit after scaling,
; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits
; add 2 bits, we need to scale before converting to word in 12bpp, since the
; input will be 16+sign bit which doesn't fit in 15+sign words, but in 10bpp
; we can scale after converting to words (which is half the instructions),
; since the input is only 14+sign bit, which fits in 15+sign words directly.

%macro IWHT4_FN 2 ; bpp, max
cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
    mova      m7, [pw_%2]
    mova      m0, [blockq+0*16+0]
    mova      m1, [blockq+1*16+0]
%if %1 >= 12
    mova      m4, [blockq+0*16+8]
    mova      m5, [blockq+1*16+8]
    psrad     m0, 2
    psrad     m1, 2
    psrad     m4, 2
    psrad     m5, 2
    packssdw  m0, m4
    packssdw  m1, m5
%else
    packssdw  m0, [blockq+0*16+8]
    packssdw  m1, [blockq+1*16+8]
    psraw     m0, 2
    psraw     m1, 2
%endif
    mova      m2, [blockq+2*16+0]
    mova      m3, [blockq+3*16+0]
%if %1 >= 12
    mova      m4, [blockq+2*16+8]
    mova      m5, [blockq+3*16+8]
    psrad     m2, 2
    psrad     m3, 2
    psrad     m4, 2
    psrad     m5, 2
    packssdw  m2, m4
    packssdw  m3, m5
%else
    packssdw  m2, [blockq+2*16+8]
    packssdw  m3, [blockq+3*16+8]
    psraw     m2, 2
    psraw     m3, 2
%endif

    VP9_IWHT4_1D
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    VP9_IWHT4_1D

    pxor      m6, m6
    VP9_STORE_2X 0, 1, 4, 5, 6, 7
    lea       dstq, [dstq+strideq*2]
    VP9_STORE_2X 2, 3, 4, 5, 6, 7
    ZERO_BLOCK blockq, 16, 4, m6
    RET
%endmacro

INIT_MMX mmxext
IWHT4_FN 10, 1023
INIT_MMX mmxext
IWHT4_FN 12, 4095

%macro VP9_IDCT4_WRITEOUT 0
%if cpuflag(ssse3)
    mova      m5, [pw_2048]
    pmulhrsw  m0, m5
    pmulhrsw  m1, m5
    pmulhrsw  m2, m5
    pmulhrsw  m3, m5
%else
    mova      m5, [pw_8]
    paddw     m0, m5
    paddw     m1, m5
    paddw     m2, m5
    paddw     m3, m5
    psraw     m0, 4
    psraw     m1, 4
    psraw     m2, 4
    psraw     m3, 4
%endif
    mova      m5, [pw_1023]
    VP9_STORE_2X 0, 1, 6, 7, 4, 5
    lea       dstq, [dstq+2*strideq]
    VP9_STORE_2X 2, 3, 6, 7, 4, 5
%endmacro

%macro DC_ONLY 2 ; shift, zero
    mov       coefd, dword [blockq]
    movd      [blockq], %2
    imul      coefd, 11585
    add       coefd, 8192
    sar       coefd, 14
    imul      coefd, 11585
    add       coefd, ((1 << (%1 - 1)) << 14) + 8192
    sar       coefd, 14 + %1
%endmacro

; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
; in 15+1 words without additional effort, since the coefficients are 15bpp.
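;
; For reference, a scalar sketch of what the DC_ONLY macro above computes
; (illustrative only; shift is 4/5/6 for the 4x4/8x8/16x16+ functions below):
;   dc = block[0];  block[0] = 0;
;   dc = (dc * 11585 + 8192) >> 14;                                   ; pass 1
;   dc = (dc * 11585 + ((1 << (shift - 1)) << 14) + 8192) >> (14 + shift);
; i.e. both 1-D passes plus the final (1 << (shift - 1)) rounding, with the
; rounding constant pre-shifted so that a single shift finishes the job.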
245 246 %macro IDCT4_10_FN 0 247 cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob 248 cmp eobd, 1 249 jg .idctfull 250 251 ; dc-only 252 pxor m4, m4 253 %if cpuflag(ssse3) 254 movd m0, [blockq] 255 movd [blockq], m4 256 mova m5, [pw_11585x2] 257 pmulhrsw m0, m5 258 pmulhrsw m0, m5 259 %else 260 DEFINE_ARGS dst, stride, block, coef 261 DC_ONLY 4, m4 262 movd m0, coefd 263 %endif 264 pshufw m0, m0, 0 265 mova m5, [pw_1023] 266 %if cpuflag(ssse3) 267 pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 268 %endif 269 VP9_STORE_2X 0, 0, 6, 7, 4, 5 270 lea dstq, [dstq+2*strideq] 271 VP9_STORE_2X 0, 0, 6, 7, 4, 5 272 RET 273 274 .idctfull: 275 mova m0, [blockq+0*16+0] 276 mova m1, [blockq+1*16+0] 277 packssdw m0, [blockq+0*16+8] 278 packssdw m1, [blockq+1*16+8] 279 mova m2, [blockq+2*16+0] 280 mova m3, [blockq+3*16+0] 281 packssdw m2, [blockq+2*16+8] 282 packssdw m3, [blockq+3*16+8] 283 284 %if cpuflag(ssse3) 285 mova m6, [pw_11585x2] 286 %endif 287 mova m7, [pd_8192] ; rounding 288 VP9_IDCT4_1D 289 TRANSPOSE4x4W 0, 1, 2, 3, 4 290 VP9_IDCT4_1D 291 292 pxor m4, m4 293 ZERO_BLOCK blockq, 16, 4, m4 294 VP9_IDCT4_WRITEOUT 295 RET 296 %endmacro 297 298 INIT_MMX mmxext 299 IDCT4_10_FN 300 INIT_MMX ssse3 301 IDCT4_10_FN 302 303 %macro IADST4_FN 4 304 cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob 305 %if WIN64 && notcpuflag(ssse3) 306 INIT_XMM cpuname 307 WIN64_SPILL_XMM 8 308 INIT_MMX cpuname 309 %endif 310 movdqa xmm5, [pd_8192] 311 mova m0, [blockq+0*16+0] 312 mova m1, [blockq+1*16+0] 313 packssdw m0, [blockq+0*16+8] 314 packssdw m1, [blockq+1*16+8] 315 mova m2, [blockq+2*16+0] 316 mova m3, [blockq+3*16+0] 317 packssdw m2, [blockq+2*16+8] 318 packssdw m3, [blockq+3*16+8] 319 320 %if cpuflag(ssse3) 321 mova m6, [pw_11585x2] 322 %endif 323 %ifnidn %1%3, iadstiadst 324 movdq2q m7, xmm5 325 %endif 326 VP9_%2_1D 327 TRANSPOSE4x4W 0, 1, 2, 3, 4 328 VP9_%4_1D 329 330 pxor m4, m4 331 ZERO_BLOCK blockq, 16, 4, m4 332 VP9_IDCT4_WRITEOUT 333 RET 334 %endmacro 335 336 INIT_MMX sse2 337 IADST4_FN idct, IDCT4, iadst, IADST4 338 IADST4_FN iadst, IADST4, idct, IDCT4 339 IADST4_FN iadst, IADST4, iadst, IADST4 340 341 INIT_MMX ssse3 342 IADST4_FN idct, IDCT4, iadst, IADST4 343 IADST4_FN iadst, IADST4, idct, IDCT4 344 IADST4_FN iadst, IADST4, iadst, IADST4 345 346 ; inputs and outputs are dwords, coefficients are words 347 ; 348 ; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14 349 ; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14 350 %macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask 351 pand m%3, m%1, %8 352 pand m%4, m%2, %8 353 psrad m%1, 14 354 psrad m%2, 14 355 packssdw m%4, m%2 356 packssdw m%3, m%1 357 punpckhwd m%2, m%4, m%3 358 punpcklwd m%4, m%3 359 pmaddwd m%3, m%4, [pw_%6_%5] 360 pmaddwd m%1, m%2, [pw_%6_%5] 361 pmaddwd m%4, [pw_m%5_%6] 362 pmaddwd m%2, [pw_m%5_%6] 363 paddd m%3, %7 364 paddd m%4, %7 365 psrad m%3, 14 366 psrad m%4, 14 367 paddd m%1, m%3 368 paddd m%2, m%4 369 %endmacro 370 371 %macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1 372 SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2 373 SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2 374 SUMSUB_BA d, %4, %3, %7 375 SUMSUB_BA d, %6, %5, %7 376 SWAP %4, %6, %3 377 %endmacro 378 379 %macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max 380 movh m%1, [dstq+strideq*0] 381 movh m%2, [dstq+strideq*2] 382 movhps m%1, [dstq+strideq*1] 383 movhps m%2, [dstq+stride3q ] 384 paddw m%1, m%3 385 paddw m%2, m%4 386 pmaxsw m%1, %5 387 pmaxsw m%2, %5 
    pminsw    m%1, %6
    pminsw    m%2, %6
    movh      [dstq+strideq*0], m%1
    movhps    [dstq+strideq*1], m%1
    movh      [dstq+strideq*2], m%2
    movhps    [dstq+stride3q ], m%2
%endmacro

%macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift
    paddd     m%1, %7
    paddd     m%2, %7
    paddd     m%3, %7
    paddd     m%4, %7
    psrad     m%1, %8
    psrad     m%2, %8
    psrad     m%3, %8
    psrad     m%4, %8
    packssdw  m%1, m%2
    packssdw  m%3, m%4
    STORE_4x4 %2, %4, %1, %3, %5, %6
%endmacro

INIT_XMM sse2
cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob
    cmp       eobd, 1
    jg .idctfull

    ; dc-only - this is special, since for 4x4 12bpp, the max coef size is
    ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the
    ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a
    ; dword. After the final shift (4), the result is 13+sign bits, so we
    ; don't need any additional processing to fit it in a word
    DEFINE_ARGS dst, stride, block, coef
    pxor      m4, m4
    DC_ONLY   4, m4
    movd      m0, coefd
    pshuflw   m0, m0, q0000
    punpcklqdq m0, m0
    mova      m5, [pw_4095]
    DEFINE_ARGS dst, stride, stride3
    lea       stride3q, [strideq*3]
    STORE_4x4 1, 3, 0, 0, m4, m5
    RET

.idctfull:
    DEFINE_ARGS dst, stride, block, eob
    mova      m0, [blockq+0*16]
    mova      m1, [blockq+1*16]
    mova      m2, [blockq+2*16]
    mova      m3, [blockq+3*16]
    mova      m6, [pd_8192]
    mova      m7, [pd_3fff]

    IDCT4_12BPP_1D m6, m7
    TRANSPOSE4x4D 0, 1, 2, 3, 4
    IDCT4_12BPP_1D m6, m7

    pxor      m4, m4
    ZERO_BLOCK blockq, 16, 4, m4

    ; writeout
    DEFINE_ARGS dst, stride, stride3
    lea       stride3q, [strideq*3]
    mova      m5, [pw_4095]
    mova      m6, [pd_8]
    ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
    RET

%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova      [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP %1, %2
%else
    mova      m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

%macro PRELOAD 2-3
%if ARCH_X86_64
    mova      m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro

; out0 =  5283 * in0 + 13377 * in1 + 15212 * in2 +  9929 * in3 + rnd >> 14
; out1 =  9929 * in0 + 13377 * in1 -  5283 * in2 - 15212 * in3 + rnd >> 14
; out2 = 13377 * in0               - 13377 * in2 + 13377 * in3 + rnd >> 14
; out3 = 15212 * in0 - 13377 * in1 +  9929 * in2 -  5283 * in3 + rnd >> 14
%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
    pand      m4, m0, %2
    pand      m5, m1, %2
    psrad     m0, 14
    psrad     m1, 14
    packssdw  m5, m1
    packssdw  m4, m0
    punpckhwd m1, m4, m5
    punpcklwd m4, m5
    pand      m5, m2, %2
    pand      m6, m3, %2
    psrad     m2, 14
    psrad     m3, 14
    packssdw  m6, m3
    packssdw  m5, m2
    punpckhwd m3, m5, m6
    punpcklwd m5, m6
    SCRATCH   1, 8, rsp+0*mmsize, a
    SCRATCH   5, 9, rsp+1*mmsize, b

    ; m1/3 have the high bits of 0,1,2,3
    ; m4/5 have the low bits of 0,1,2,3
    ; m0/2/6/7 are free

    mova      m2, [pw_15212_9929]
    mova      m0, [pw_5283_13377]
    pmaddwd   m7, m2, reg_b
    pmaddwd   m6, m4, m0
    pmaddwd   m2, m3
    pmaddwd   m0, reg_a
    paddd     m6, m7
    paddd     m0, m2
    mova      m1, [pw_m13377_13377]
    mova      m5, [pw_13377_0]
    pmaddwd   m7, m1, reg_b
    pmaddwd   m2, m4, m5
    pmaddwd   m1, m3
    pmaddwd   m5, reg_a
paddd m2, m7 535 paddd m1, m5 536 paddd m6, %1 537 paddd m2, %1 538 psrad m6, 14 539 psrad m2, 14 540 paddd m0, m6 ; t0 541 paddd m2, m1 ; t2 542 543 mova m7, [pw_m5283_m15212] 544 mova m5, [pw_9929_13377] 545 pmaddwd m1, m7, reg_b 546 pmaddwd m6, m4, m5 547 pmaddwd m7, m3 548 pmaddwd m5, reg_a 549 paddd m6, m1 550 paddd m7, m5 551 UNSCRATCH 5, 9, rsp+1*mmsize, b 552 pmaddwd m5, [pw_9929_m5283] 553 pmaddwd m4, [pw_15212_m13377] 554 pmaddwd m3, [pw_9929_m5283] 555 UNSCRATCH 1, 8, rsp+0*mmsize, a 556 pmaddwd m1, [pw_15212_m13377] 557 paddd m4, m5 558 paddd m3, m1 559 paddd m6, %1 560 paddd m4, %1 561 psrad m6, 14 562 psrad m4, 14 563 paddd m7, m6 ; t1 564 paddd m3, m4 ; t3 565 566 SWAP 1, 7 567 %endmacro 568 569 %macro IADST4_12BPP_FN 4 570 cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob 571 mova m0, [blockq+0*16] 572 mova m1, [blockq+1*16] 573 mova m2, [blockq+2*16] 574 mova m3, [blockq+3*16] 575 576 PRELOAD 10, pd_8192, rnd 577 PRELOAD 11, pd_3fff, mask 578 %2_12BPP_1D reg_rnd, reg_mask 579 TRANSPOSE4x4D 0, 1, 2, 3, 4 580 %4_12BPP_1D reg_rnd, reg_mask 581 582 pxor m4, m4 583 ZERO_BLOCK blockq, 16, 4, m4 584 585 ; writeout 586 DEFINE_ARGS dst, stride, stride3 587 lea stride3q, [strideq*3] 588 mova m5, [pw_4095] 589 mova m6, [pd_8] 590 ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4 591 RET 592 %endmacro 593 594 INIT_XMM sse2 595 IADST4_12BPP_FN idct, IDCT4, iadst, IADST4 596 IADST4_12BPP_FN iadst, IADST4, idct, IDCT4 597 IADST4_12BPP_FN iadst, IADST4, iadst, IADST4 598 599 ; the following line has not been executed at the end of this macro: 600 ; UNSCRATCH 6, 8, rsp+%3*mmsize 601 %macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset 602 mova m0, [%1+0*%4] 603 mova m2, [%1+2*%4] 604 mova m4, [%1+4*%4] 605 mova m6, [%1+6*%4] 606 IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3 607 SCRATCH 4, 8, rsp+(%5+0)*mmsize 608 SCRATCH 6, 9, rsp+(%5+1)*mmsize 609 mova m1, [%1+1*%4] 610 mova m3, [%1+3*%4] 611 mova m5, [%1+5*%4] 612 mova m7, [%1+7*%4] 613 SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a 614 SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a 615 SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a 616 SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a 617 SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5 618 SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7 619 SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6 620 UNSCRATCH 4, 8, rsp+(%5+0)*mmsize 621 UNSCRATCH 6, 9, rsp+(%5+1)*mmsize 622 SCRATCH 2, 8, rsp+(%5+0)*mmsize 623 SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5 624 SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4 625 SWAP 0, 5, 4, 6, 2, 7 626 %endmacro 627 628 %macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max 629 mova m%1, [%6+%7*0] 630 mova m%2, [%6+%7*1] 631 paddw m%1, m%3 632 paddw m%2, m%3 633 pmaxsw m%1, %4 634 pmaxsw m%2, %4 635 pminsw m%1, %5 636 pminsw m%2, %5 637 mova [%6+%7*0], m%1 638 mova [%6+%7*1], m%2 639 %endmacro 640 641 ; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp 642 ; storage also instead of allocating two more stack spaces. This doesn't 643 ; matter much but it's something... 
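;
; A structural note on the 8x8/16x16/32x32 functions below (descriptive
; comment only): pass 1 runs the 1-D transform over 4-column strips of the
; dword coefficients in blockq, transposes each strip into the stack buffer,
; and zero-pads the remaining part of that buffer for the strips that the
; eob-derived count (looked up in the default/row/col tables above) allows
; us to skip; pass 2 then runs the 1-D transform over 4-row strips of the
; buffer and adds the rounded result to dst, clipped against pw_1023 for
; 10 bpp or pw_4095 for 12 bpp.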
644 INIT_XMM sse2 645 cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \ 646 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \ 647 dst, stride, block, eob 648 mova m0, [pw_1023] 649 cmp eobd, 1 650 jg .idctfull 651 652 ; dc-only - the 10bit version can be done entirely in 32bit, since the max 653 ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily 654 ; fits in 32bit 655 DEFINE_ARGS dst, stride, block, coef 656 pxor m2, m2 657 DC_ONLY 5, m2 658 movd m1, coefd 659 pshuflw m1, m1, q0000 660 punpcklqdq m1, m1 661 DEFINE_ARGS dst, stride, cnt 662 mov cntd, 4 663 .loop_dc: 664 STORE_2x8 3, 4, 1, m2, m0 665 lea dstq, [dstq+strideq*2] 666 dec cntd 667 jg .loop_dc 668 RET 669 670 .idctfull: 671 SCRATCH 0, 12, rsp+16*mmsize, max 672 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 673 %if ARCH_X86_64 674 mov dstbakq, dstq 675 movsxd cntq, cntd 676 %endif 677 %if PIC 678 lea ptrq, [default_8x8] 679 movzx cntd, byte [ptrq+cntq-1] 680 %else 681 movzx cntd, byte [default_8x8+cntq-1] 682 %endif 683 mov skipd, 2 684 sub skipd, cntd 685 mov ptrq, rsp 686 PRELOAD 10, pd_8192, rnd 687 PRELOAD 11, pd_3fff, mask 688 PRELOAD 13, pd_16, srnd 689 .loop_1: 690 IDCT8_1D blockq, reg_rnd, reg_mask 691 692 TRANSPOSE4x4D 0, 1, 2, 3, 6 693 mova [ptrq+ 0*mmsize], m0 694 mova [ptrq+ 2*mmsize], m1 695 mova [ptrq+ 4*mmsize], m2 696 mova [ptrq+ 6*mmsize], m3 697 UNSCRATCH 6, 8, rsp+17*mmsize 698 TRANSPOSE4x4D 4, 5, 6, 7, 0 699 mova [ptrq+ 1*mmsize], m4 700 mova [ptrq+ 3*mmsize], m5 701 mova [ptrq+ 5*mmsize], m6 702 mova [ptrq+ 7*mmsize], m7 703 add ptrq, 8 * mmsize 704 add blockq, mmsize 705 dec cntd 706 jg .loop_1 707 708 ; zero-pad the remainder (skipped cols) 709 test skipd, skipd 710 jz .end 711 add skipd, skipd 712 lea blockq, [blockq+skipq*(mmsize/2)] 713 pxor m0, m0 714 .loop_z: 715 mova [ptrq+mmsize*0], m0 716 mova [ptrq+mmsize*1], m0 717 mova [ptrq+mmsize*2], m0 718 mova [ptrq+mmsize*3], m0 719 add ptrq, 4 * mmsize 720 dec skipd 721 jg .loop_z 722 .end: 723 724 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 725 lea stride3q, [strideq*3] 726 mov cntd, 2 727 mov ptrq, rsp 728 .loop_2: 729 IDCT8_1D ptrq, reg_rnd, reg_mask 730 731 pxor m6, m6 732 ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5 733 lea dstq, [dstq+strideq*4] 734 UNSCRATCH 0, 8, rsp+17*mmsize 735 UNSCRATCH 1, 12, rsp+16*mmsize, max 736 UNSCRATCH 2, 13, pd_16, srnd 737 ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5 738 add ptrq, 16 739 %if ARCH_X86_64 740 lea dstq, [dstbakq+8] 741 %else 742 mov dstq, dstm 743 add dstq, 8 744 %endif 745 dec cntd 746 jg .loop_2 747 748 ; m6 is still zero 749 ZERO_BLOCK blockq-2*mmsize, 32, 8, m6 750 RET 751 752 %macro DC_ONLY_64BIT 2 ; shift, zero 753 %if ARCH_X86_64 754 movsxd coefq, dword [blockq] 755 movd [blockq], %2 756 imul coefq, 11585 757 add coefq, 8192 758 sar coefq, 14 759 imul coefq, 11585 760 add coefq, ((1 << (%1 - 1)) << 14) + 8192 761 sar coefq, 14 + %1 762 %else 763 mov coefd, dword [blockq] 764 movd [blockq], %2 765 DEFINE_ARGS dst, stride, cnt, coef, coefl 766 mov cntd, 2 767 .loop_dc_calc: 768 mov coefld, coefd 769 sar coefd, 14 770 and coefld, 0x3fff 771 imul coefd, 11585 772 imul coefld, 11585 773 add coefld, 8192 774 sar coefld, 14 775 add coefd, coefld 776 dec cntd 777 jg .loop_dc_calc 778 add coefd, 1 << (%1 - 1) 779 sar coefd, %1 780 %endif 781 %endmacro 782 783 INIT_XMM sse2 784 cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \ 785 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \ 786 dst, stride, block, eob 787 mova m0, [pw_4095] 788 cmp eobd, 1 789 
jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull 790 791 ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign 792 ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies 793 DEFINE_ARGS dst, stride, block, coef, coefl 794 pxor m2, m2 795 DC_ONLY_64BIT 5, m2 796 movd m1, coefd 797 pshuflw m1, m1, q0000 798 punpcklqdq m1, m1 799 DEFINE_ARGS dst, stride, cnt 800 mov cntd, 4 801 .loop_dc: 802 STORE_2x8 3, 4, 1, m2, m0 803 lea dstq, [dstq+strideq*2] 804 dec cntd 805 jg .loop_dc 806 RET 807 808 ; inputs and outputs are dwords, coefficients are words 809 ; 810 ; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2 811 ; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1 812 %macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask 813 pand m%3, m%1, %7 814 pand m%4, m%2, %7 815 psrad m%1, 14 816 psrad m%2, 14 817 packssdw m%4, m%2 818 packssdw m%3, m%1 819 punpckhwd m%2, m%4, m%3 820 punpcklwd m%4, m%3 821 pmaddwd m%3, m%4, [pw_%6_%5] 822 pmaddwd m%1, m%2, [pw_%6_%5] 823 pmaddwd m%4, [pw_m%5_%6] 824 pmaddwd m%2, [pw_m%5_%6] 825 %endmacro 826 827 ; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14 828 ; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14 829 %macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd 830 SUMSUB_BA d, %1, %2, %5 831 SUMSUB_BA d, %3, %4, %5 832 paddd m%3, %6 833 paddd m%4, %6 834 psrad m%3, 14 835 psrad m%4, 14 836 paddd m%1, m%3 837 paddd m%2, m%4 838 %endmacro 839 840 %macro NEGD 1 841 %if cpuflag(ssse3) 842 psignd %1, [pw_m1] 843 %else 844 pxor %1, [pw_m1] 845 paddd %1, [pd_1] 846 %endif 847 %endmacro 848 849 ; the following line has not been executed at the end of this macro: 850 ; UNSCRATCH 6, 8, rsp+17*mmsize 851 %macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask 852 mova m0, [%1+ 0*mmsize] 853 mova m3, [%1+ 6*mmsize] 854 mova m4, [%1+ 8*mmsize] 855 mova m7, [%1+14*mmsize] 856 SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a 857 SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a 858 SCRATCH 0, 8, rsp+17*mmsize 859 SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4 860 UNSCRATCH 0, 8, rsp+17*mmsize 861 SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5 862 863 SCRATCH 3, 8, rsp+17*mmsize 864 SCRATCH 4, 9, rsp+18*mmsize 865 SCRATCH 7, 10, rsp+19*mmsize 866 SCRATCH 0, 11, rsp+20*mmsize 867 868 mova m1, [%1+ 2*mmsize] 869 mova m2, [%1+ 4*mmsize] 870 mova m5, [%1+10*mmsize] 871 mova m6, [%1+12*mmsize] 872 SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a 873 SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a 874 SCRATCH 2, 12, rsp+21*mmsize 875 SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6 876 UNSCRATCH 2, 12, rsp+21*mmsize 877 SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7 878 879 UNSCRATCH 7, 10, rsp+19*mmsize 880 UNSCRATCH 0, 11, rsp+20*mmsize 881 SCRATCH 1, 10, rsp+19*mmsize 882 SCRATCH 6, 11, rsp+20*mmsize 883 884 SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270, %3 ; m7/8=t4a, m0/9=t5a 885 SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a 886 SCRATCH 2, 12, rsp+21*mmsize 887 SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6 888 UNSCRATCH 2, 12, rsp+21*mmsize 889 NEGD m5 ; m5=out1 890 SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7 891 SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5 892 NEGD m0 ; m0=out5 893 894 UNSCRATCH 3, 8, rsp+17*mmsize 895 UNSCRATCH 4, 9, rsp+18*mmsize 896 UNSCRATCH 1, 10, rsp+19*mmsize 897 UNSCRATCH 6, 11, rsp+20*mmsize 898 SCRATCH 2, 8, rsp+17*mmsize 899 SCRATCH 0, 9, 
rsp+18*mmsize 900 901 SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2 902 SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3 903 NEGD m6 ; m6=out7 904 SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4 905 NEGD m3 ; m3=out3 906 907 UNSCRATCH 0, 9, rsp+18*mmsize 908 909 SWAP 0, 1, 5 910 SWAP 2, 7, 6 911 %endmacro 912 913 %macro IADST8_FN 5 914 cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \ 915 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \ 916 dst, stride, block, eob 917 mova m0, [pw_1023] 918 919 .body: 920 SCRATCH 0, 13, rsp+16*mmsize, max 921 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 922 %if ARCH_X86_64 923 mov dstbakq, dstq 924 movsxd cntq, cntd 925 %endif 926 %if PIC 927 lea ptrq, [%5_8x8] 928 movzx cntd, byte [ptrq+cntq-1] 929 %else 930 movzx cntd, byte [%5_8x8+cntq-1] 931 %endif 932 mov skipd, 2 933 sub skipd, cntd 934 mov ptrq, rsp 935 PRELOAD 14, pd_8192, rnd 936 PRELOAD 15, pd_3fff, mask 937 .loop_1: 938 %2_1D blockq, reg_rnd, reg_mask 939 940 TRANSPOSE4x4D 0, 1, 2, 3, 6 941 mova [ptrq+ 0*mmsize], m0 942 mova [ptrq+ 2*mmsize], m1 943 mova [ptrq+ 4*mmsize], m2 944 mova [ptrq+ 6*mmsize], m3 945 UNSCRATCH 6, 8, rsp+17*mmsize 946 TRANSPOSE4x4D 4, 5, 6, 7, 0 947 mova [ptrq+ 1*mmsize], m4 948 mova [ptrq+ 3*mmsize], m5 949 mova [ptrq+ 5*mmsize], m6 950 mova [ptrq+ 7*mmsize], m7 951 add ptrq, 8 * mmsize 952 add blockq, mmsize 953 dec cntd 954 jg .loop_1 955 956 ; zero-pad the remainder (skipped cols) 957 test skipd, skipd 958 jz .end 959 add skipd, skipd 960 lea blockq, [blockq+skipq*(mmsize/2)] 961 pxor m0, m0 962 .loop_z: 963 mova [ptrq+mmsize*0], m0 964 mova [ptrq+mmsize*1], m0 965 mova [ptrq+mmsize*2], m0 966 mova [ptrq+mmsize*3], m0 967 add ptrq, 4 * mmsize 968 dec skipd 969 jg .loop_z 970 .end: 971 972 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 973 lea stride3q, [strideq*3] 974 mov cntd, 2 975 mov ptrq, rsp 976 .loop_2: 977 %4_1D ptrq, reg_rnd, reg_mask 978 979 pxor m6, m6 980 PRELOAD 9, pd_16, srnd 981 ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5 982 lea dstq, [dstq+strideq*4] 983 UNSCRATCH 0, 8, rsp+17*mmsize 984 UNSCRATCH 1, 13, rsp+16*mmsize, max 985 UNSCRATCH 2, 9, pd_16, srnd 986 ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5 987 add ptrq, 16 988 %if ARCH_X86_64 989 lea dstq, [dstbakq+8] 990 %else 991 mov dstq, dstm 992 add dstq, 8 993 %endif 994 dec cntd 995 jg .loop_2 996 997 ; m6 is still zero 998 ZERO_BLOCK blockq-2*mmsize, 32, 8, m6 999 RET 1000 1001 cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \ 1002 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \ 1003 dst, stride, block, eob 1004 mova m0, [pw_4095] 1005 jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body 1006 %endmacro 1007 1008 INIT_XMM sse2 1009 IADST8_FN idct, IDCT8, iadst, IADST8, row 1010 IADST8_FN iadst, IADST8, idct, IDCT8, col 1011 IADST8_FN iadst, IADST8, iadst, IADST8, default 1012 1013 %macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset 1014 IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7 1015 ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6 1016 SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a 1017 SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a 1018 SCRATCH 2, 13, rsp+(%4+5)*mmsize ; t2a 1019 SCRATCH 3, 12, rsp+(%4+4)*mmsize ; t3a 1020 SCRATCH 4, 11, rsp+(%4+3)*mmsize ; t4 1021 mova [rsp+(%3+0)*mmsize], m5 ; t5 1022 mova [rsp+(%3+1)*mmsize], m7 ; t7 1023 1024 mova m0, [%1+ 1*%2] ; in1 1025 mova m3, [%1+ 7*%2] ; in7 1026 mova m4, [%1+ 9*%2] ; in9 1027 mova m7, [%1+15*%2] ; in15 1028 1029 SUMSUB_MUL 0, 7, 1, 
2, 16305, 1606 ; m0=t15a, m7=t8a 1030 SUMSUB_MUL 4, 3, 1, 2, 10394, 12665 ; m4=t14a, m3=t9a 1031 SUMSUB_BA d, 3, 7, 1 ; m3=t8, m7=t9 1032 SUMSUB_BA d, 4, 0, 1 ; m4=t15,m0=t14 1033 SUMSUB_MUL 0, 7, 1, 2, 15137, 6270 ; m0=t14a, m7=t9a 1034 1035 mova m1, [%1+ 3*%2] ; in3 1036 mova m2, [%1+ 5*%2] ; in5 1037 mova m5, [%1+11*%2] ; in11 1038 mova m6, [%1+13*%2] ; in13 1039 1040 SCRATCH 0, 9, rsp+(%4+1)*mmsize 1041 SCRATCH 7, 10, rsp+(%4+2)*mmsize 1042 1043 SUMSUB_MUL 2, 5, 0, 7, 14449, 7723 ; m2=t13a, m5=t10a 1044 SUMSUB_MUL 6, 1, 0, 7, 4756, 15679 ; m6=t12a, m1=t11a 1045 SUMSUB_BA d, 5, 1, 0 ; m5=t11,m1=t10 1046 SUMSUB_BA d, 2, 6, 0 ; m2=t12,m6=t13 1047 NEGD m1 ; m1=-t10 1048 SUMSUB_MUL 1, 6, 0, 7, 15137, 6270 ; m1=t13a, m6=t10a 1049 1050 UNSCRATCH 7, 10, rsp+(%4+2)*mmsize 1051 SUMSUB_BA d, 5, 3, 0 ; m5=t8a, m3=t11a 1052 SUMSUB_BA d, 6, 7, 0 ; m6=t9, m7=t10 1053 SUMSUB_BA d, 2, 4, 0 ; m2=t15a,m4=t12a 1054 SCRATCH 5, 10, rsp+(%4+2)*mmsize 1055 SUMSUB_MUL 4, 3, 0, 5, 11585, 11585 ; m4=t12, m3=t11 1056 UNSCRATCH 0, 9, rsp+(%4+1)*mmsize 1057 SUMSUB_BA d, 1, 0, 5 ; m1=t14, m0=t13 1058 SCRATCH 6, 9, rsp+(%4+1)*mmsize 1059 SUMSUB_MUL 0, 7, 6, 5, 11585, 11585 ; m0=t13a,m7=t10a 1060 1061 ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2 1062 ; free: 6,5 1063 1064 UNSCRATCH 5, 15, rsp+(%4+7)*mmsize 1065 SUMSUB_BA d, 2, 5, 6 ; m2=out0, m5=out15 1066 SCRATCH 5, 15, rsp+(%4+7)*mmsize 1067 UNSCRATCH 5, 14, rsp+(%4+6)*mmsize 1068 SUMSUB_BA d, 1, 5, 6 ; m1=out1, m5=out14 1069 SCRATCH 5, 14, rsp+(%4+6)*mmsize 1070 UNSCRATCH 5, 13, rsp+(%4+5)*mmsize 1071 SUMSUB_BA d, 0, 5, 6 ; m0=out2, m5=out13 1072 SCRATCH 5, 13, rsp+(%4+5)*mmsize 1073 UNSCRATCH 5, 12, rsp+(%4+4)*mmsize 1074 SUMSUB_BA d, 4, 5, 6 ; m4=out3, m5=out12 1075 SCRATCH 5, 12, rsp+(%4+4)*mmsize 1076 UNSCRATCH 5, 11, rsp+(%4+3)*mmsize 1077 SUMSUB_BA d, 3, 5, 6 ; m3=out4, m5=out11 1078 SCRATCH 4, 11, rsp+(%4+3)*mmsize 1079 mova m4, [rsp+(%3+0)*mmsize] 1080 SUMSUB_BA d, 7, 4, 6 ; m7=out5, m4=out10 1081 mova [rsp+(%3+0)*mmsize], m5 1082 UNSCRATCH 5, 8, rsp+(%4+0)*mmsize 1083 UNSCRATCH 6, 9, rsp+(%4+1)*mmsize 1084 SCRATCH 2, 8, rsp+(%4+0)*mmsize 1085 SCRATCH 1, 9, rsp+(%4+1)*mmsize 1086 UNSCRATCH 1, 10, rsp+(%4+2)*mmsize 1087 SCRATCH 0, 10, rsp+(%4+2)*mmsize 1088 mova m0, [rsp+(%3+1)*mmsize] 1089 SUMSUB_BA d, 6, 5, 2 ; m6=out6, m5=out9 1090 SUMSUB_BA d, 1, 0, 2 ; m1=out7, m0=out8 1091 1092 SWAP 0, 3, 1, 7, 2, 6, 4 1093 1094 ; output order: 8-11|r67-70=out0-3 1095 ; 0-6,r65=out4-11 1096 ; 12-15|r71-74=out12-15 1097 %endmacro 1098 1099 INIT_XMM sse2 1100 cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \ 1101 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 1102 dst, stride, block, eob 1103 mova m0, [pw_1023] 1104 cmp eobd, 1 1105 jg .idctfull 1106 1107 ; dc-only - the 10bit version can be done entirely in 32bit, since the max 1108 ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily 1109 ; fits in 32bit 1110 DEFINE_ARGS dst, stride, block, coef 1111 pxor m2, m2 1112 DC_ONLY 6, m2 1113 movd m1, coefd 1114 pshuflw m1, m1, q0000 1115 punpcklqdq m1, m1 1116 DEFINE_ARGS dst, stride, cnt 1117 mov cntd, 8 1118 .loop_dc: 1119 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize 1120 STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize 1121 lea dstq, [dstq+strideq*2] 1122 dec cntd 1123 jg .loop_dc 1124 RET 1125 1126 .idctfull: 1127 mova [rsp+64*mmsize], m0 1128 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 1129 %if ARCH_X86_64 1130 mov dstbakq, dstq 1131 movsxd cntq, cntd 1132 %endif 1133 %if PIC 1134 lea ptrq, 
[default_16x16] 1135 movzx cntd, byte [ptrq+cntq-1] 1136 %else 1137 movzx cntd, byte [default_16x16+cntq-1] 1138 %endif 1139 mov skipd, 4 1140 sub skipd, cntd 1141 mov ptrq, rsp 1142 .loop_1: 1143 IDCT16_1D blockq 1144 1145 TRANSPOSE4x4D 0, 1, 2, 3, 7 1146 mova [ptrq+ 1*mmsize], m0 1147 mova [ptrq+ 5*mmsize], m1 1148 mova [ptrq+ 9*mmsize], m2 1149 mova [ptrq+13*mmsize], m3 1150 mova m7, [rsp+65*mmsize] 1151 TRANSPOSE4x4D 4, 5, 6, 7, 0 1152 mova [ptrq+ 2*mmsize], m4 1153 mova [ptrq+ 6*mmsize], m5 1154 mova [ptrq+10*mmsize], m6 1155 mova [ptrq+14*mmsize], m7 1156 UNSCRATCH 0, 8, rsp+67*mmsize 1157 UNSCRATCH 1, 9, rsp+68*mmsize 1158 UNSCRATCH 2, 10, rsp+69*mmsize 1159 UNSCRATCH 3, 11, rsp+70*mmsize 1160 TRANSPOSE4x4D 0, 1, 2, 3, 7 1161 mova [ptrq+ 0*mmsize], m0 1162 mova [ptrq+ 4*mmsize], m1 1163 mova [ptrq+ 8*mmsize], m2 1164 mova [ptrq+12*mmsize], m3 1165 UNSCRATCH 4, 12, rsp+71*mmsize 1166 UNSCRATCH 5, 13, rsp+72*mmsize 1167 UNSCRATCH 6, 14, rsp+73*mmsize 1168 UNSCRATCH 7, 15, rsp+74*mmsize 1169 TRANSPOSE4x4D 4, 5, 6, 7, 0 1170 mova [ptrq+ 3*mmsize], m4 1171 mova [ptrq+ 7*mmsize], m5 1172 mova [ptrq+11*mmsize], m6 1173 mova [ptrq+15*mmsize], m7 1174 add ptrq, 16 * mmsize 1175 add blockq, mmsize 1176 dec cntd 1177 jg .loop_1 1178 1179 ; zero-pad the remainder (skipped cols) 1180 test skipd, skipd 1181 jz .end 1182 add skipd, skipd 1183 lea blockq, [blockq+skipq*(mmsize/2)] 1184 pxor m0, m0 1185 .loop_z: 1186 mova [ptrq+mmsize*0], m0 1187 mova [ptrq+mmsize*1], m0 1188 mova [ptrq+mmsize*2], m0 1189 mova [ptrq+mmsize*3], m0 1190 mova [ptrq+mmsize*4], m0 1191 mova [ptrq+mmsize*5], m0 1192 mova [ptrq+mmsize*6], m0 1193 mova [ptrq+mmsize*7], m0 1194 add ptrq, 8 * mmsize 1195 dec skipd 1196 jg .loop_z 1197 .end: 1198 1199 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1200 lea stride3q, [strideq*3] 1201 mov cntd, 4 1202 mov ptrq, rsp 1203 .loop_2: 1204 IDCT16_1D ptrq 1205 1206 pxor m7, m7 1207 lea dstq, [dstq+strideq*4] 1208 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6 1209 lea dstq, [dstq+strideq*4] 1210 mova m0, [rsp+65*mmsize] 1211 mova m1, [rsp+64*mmsize] 1212 mova m2, [pd_32] 1213 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 1214 1215 %if ARCH_X86_64 1216 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst 1217 %else 1218 mov dstq, dstm 1219 %endif 1220 UNSCRATCH 0, 8, rsp+67*mmsize 1221 UNSCRATCH 4, 9, rsp+68*mmsize 1222 UNSCRATCH 5, 10, rsp+69*mmsize 1223 UNSCRATCH 3, 11, rsp+70*mmsize 1224 ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6 1225 %if ARCH_X86_64 1226 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1227 lea dstq, [dstbakq+stride3q*4] 1228 %else 1229 lea dstq, [dstq+stride3q*4] 1230 %endif 1231 UNSCRATCH 4, 12, rsp+71*mmsize 1232 UNSCRATCH 5, 13, rsp+72*mmsize 1233 UNSCRATCH 6, 14, rsp+73*mmsize 1234 UNSCRATCH 0, 15, rsp+74*mmsize 1235 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 1236 1237 add ptrq, mmsize 1238 %if ARCH_X86_64 1239 add dstbakq, 8 1240 mov dstq, dstbakq 1241 %else 1242 add dword dstm, 8 1243 mov dstq, dstm 1244 %endif 1245 dec cntd 1246 jg .loop_2 1247 1248 ; m7 is still zero 1249 ZERO_BLOCK blockq-4*mmsize, 64, 16, m7 1250 RET 1251 1252 INIT_XMM sse2 1253 cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \ 1254 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 1255 dst, stride, block, eob 1256 mova m0, [pw_4095] 1257 cmp eobd, 1 1258 jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull 1259 1260 ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign 1261 ; bpp, and 
19+14+sign does not fit in 32bit, so we do 2-stage multiplies 1262 DEFINE_ARGS dst, stride, block, coef, coefl 1263 pxor m2, m2 1264 DC_ONLY_64BIT 6, m2 1265 movd m1, coefd 1266 pshuflw m1, m1, q0000 1267 punpcklqdq m1, m1 1268 DEFINE_ARGS dst, stride, cnt 1269 mov cntd, 8 1270 .loop_dc: 1271 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize 1272 STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize 1273 lea dstq, [dstq+strideq*2] 1274 dec cntd 1275 jg .loop_dc 1276 RET 1277 1278 ; r65-69 are available for spills 1279 ; r70-77 are available on x86-32 only (x86-64 should use m8-15) 1280 ; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77 1281 %macro IADST16_1D 1 ; src 1282 mova m0, [%1+ 0*4*mmsize] ; in0 1283 mova m1, [%1+ 7*4*mmsize] ; in7 1284 mova m2, [%1+ 8*4*mmsize] ; in8 1285 mova m3, [%1+15*4*mmsize] ; in15 1286 SUMSUB_MUL_D 3, 0, 4, 5, 16364, 804 ; m3/4=t0, m0/5=t1 1287 SUMSUB_MUL_D 1, 2, 6, 7, 11003, 12140 ; m1/6=t8, m2/7=t9 1288 SCRATCH 0, 8, rsp+70*mmsize 1289 SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t0a, m3=t8a 1290 UNSCRATCH 0, 8, rsp+70*mmsize 1291 SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t1a, m0=t9a 1292 mova [rsp+67*mmsize], m1 1293 SCRATCH 2, 9, rsp+71*mmsize 1294 SCRATCH 3, 12, rsp+74*mmsize 1295 SCRATCH 0, 13, rsp+75*mmsize 1296 1297 mova m0, [%1+ 3*4*mmsize] ; in3 1298 mova m1, [%1+ 4*4*mmsize] ; in4 1299 mova m2, [%1+11*4*mmsize] ; in11 1300 mova m3, [%1+12*4*mmsize] ; in12 1301 SUMSUB_MUL_D 2, 1, 4, 5, 14811, 7005 ; m2/4=t4, m1/5=t5 1302 SUMSUB_MUL_D 0, 3, 6, 7, 5520, 15426 ; m0/6=t12, m3/7=t13 1303 SCRATCH 1, 10, rsp+72*mmsize 1304 SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t4a, m2=t12a 1305 UNSCRATCH 1, 10, rsp+72*mmsize 1306 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t5a, m1=t13a 1307 SCRATCH 0, 15, rsp+77*mmsize 1308 SCRATCH 3, 11, rsp+73*mmsize 1309 1310 UNSCRATCH 0, 12, rsp+74*mmsize ; t8a 1311 UNSCRATCH 3, 13, rsp+75*mmsize ; t9a 1312 SUMSUB_MUL_D 0, 3, 4, 5, 16069, 3196 ; m0/4=t8, m3/5=t9 1313 SUMSUB_MUL_D 1, 2, 6, 7, 3196, 16069 ; m1/6=t13, m2/7=t12 1314 SCRATCH 1, 12, rsp+74*mmsize 1315 SUMSUB_PACK_D 2, 0, 7, 4, 1 ; m2=t8a, m0=t12a 1316 UNSCRATCH 1, 12, rsp+74*mmsize 1317 SUMSUB_PACK_D 1, 3, 6, 5, 4 ; m1=t9a, m3=t13a 1318 mova [rsp+65*mmsize], m2 1319 mova [rsp+66*mmsize], m1 1320 SCRATCH 0, 8, rsp+70*mmsize 1321 SCRATCH 3, 12, rsp+74*mmsize 1322 1323 mova m0, [%1+ 2*4*mmsize] ; in2 1324 mova m1, [%1+ 5*4*mmsize] ; in5 1325 mova m2, [%1+10*4*mmsize] ; in10 1326 mova m3, [%1+13*4*mmsize] ; in13 1327 SUMSUB_MUL_D 3, 0, 4, 5, 15893, 3981 ; m3/4=t2, m0/5=t3 1328 SUMSUB_MUL_D 1, 2, 6, 7, 8423, 14053 ; m1/6=t10, m2/7=t11 1329 SCRATCH 0, 10, rsp+72*mmsize 1330 SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t2a, m3=t10a 1331 UNSCRATCH 0, 10, rsp+72*mmsize 1332 SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t3a, m0=t11a 1333 mova [rsp+68*mmsize], m1 1334 mova [rsp+69*mmsize], m2 1335 SCRATCH 3, 13, rsp+75*mmsize 1336 SCRATCH 0, 14, rsp+76*mmsize 1337 1338 mova m0, [%1+ 1*4*mmsize] ; in1 1339 mova m1, [%1+ 6*4*mmsize] ; in6 1340 mova m2, [%1+ 9*4*mmsize] ; in9 1341 mova m3, [%1+14*4*mmsize] ; in14 1342 SUMSUB_MUL_D 2, 1, 4, 5, 13160, 9760 ; m2/4=t6, m1/5=t7 1343 SUMSUB_MUL_D 0, 3, 6, 7, 2404, 16207 ; m0/6=t14, m3/7=t15 1344 SCRATCH 1, 10, rsp+72*mmsize 1345 SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t6a, m2=t14a 1346 UNSCRATCH 1, 10, rsp+72*mmsize 1347 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t7a, m1=t15a 1348 1349 UNSCRATCH 4, 13, rsp+75*mmsize ; t10a 1350 UNSCRATCH 5, 14, rsp+76*mmsize ; t11a 1351 SCRATCH 0, 13, rsp+75*mmsize 1352 SCRATCH 3, 14, rsp+76*mmsize 1353 SUMSUB_MUL_D 4, 5, 6, 7, 9102, 13623 ; m4/6=t10, m5/7=t11 1354 SUMSUB_MUL_D 1, 2, 0, 3, 
13623, 9102 ; m1/0=t15, m2/3=t14 1355 SCRATCH 0, 10, rsp+72*mmsize 1356 SUMSUB_PACK_D 2, 4, 3, 6, 0 ; m2=t10a, m4=t14a 1357 UNSCRATCH 0, 10, rsp+72*mmsize 1358 SUMSUB_PACK_D 1, 5, 0, 7, 6 ; m1=t11a, m5=t15a 1359 1360 UNSCRATCH 0, 8, rsp+70*mmsize ; t12a 1361 UNSCRATCH 3, 12, rsp+74*mmsize ; t13a 1362 SCRATCH 2, 8, rsp+70*mmsize 1363 SCRATCH 1, 12, rsp+74*mmsize 1364 SUMSUB_MUL_D 0, 3, 1, 2, 15137, 6270 ; m0/1=t12, m3/2=t13 1365 SUMSUB_MUL_D 5, 4, 7, 6, 6270, 15137 ; m5/7=t15, m4/6=t14 1366 SCRATCH 2, 10, rsp+72*mmsize 1367 SUMSUB_PACK_D 4, 0, 6, 1, 2 ; m4=out2, m0=t14a 1368 UNSCRATCH 2, 10, rsp+72*mmsize 1369 SUMSUB_PACK_D 5, 3, 7, 2, 1 ; m5=-out13, m3=t15a 1370 NEGD m5 ; m5=out13 1371 1372 UNSCRATCH 1, 9, rsp+71*mmsize ; t1a 1373 mova m2, [rsp+68*mmsize] ; t2a 1374 UNSCRATCH 6, 13, rsp+75*mmsize ; t6a 1375 UNSCRATCH 7, 14, rsp+76*mmsize ; t7a 1376 SCRATCH 4, 10, rsp+72*mmsize 1377 SCRATCH 5, 13, rsp+75*mmsize 1378 UNSCRATCH 4, 15, rsp+77*mmsize ; t4a 1379 UNSCRATCH 5, 11, rsp+73*mmsize ; t5a 1380 SCRATCH 0, 14, rsp+76*mmsize 1381 SCRATCH 3, 15, rsp+77*mmsize 1382 mova m0, [rsp+67*mmsize] ; t0a 1383 SUMSUB_BA d, 4, 0, 3 ; m4=t0, m0=t4 1384 SUMSUB_BA d, 5, 1, 3 ; m5=t1, m1=t5 1385 SUMSUB_BA d, 6, 2, 3 ; m6=t2, m2=t6 1386 SCRATCH 4, 9, rsp+71*mmsize 1387 mova m3, [rsp+69*mmsize] ; t3a 1388 SUMSUB_BA d, 7, 3, 4 ; m7=t3, m3=t7 1389 1390 mova [rsp+67*mmsize], m5 1391 mova [rsp+68*mmsize], m6 1392 mova [rsp+69*mmsize], m7 1393 SUMSUB_MUL_D 0, 1, 4, 5, 15137, 6270 ; m0/4=t4a, m1/5=t5a 1394 SUMSUB_MUL_D 3, 2, 7, 6, 6270, 15137 ; m3/7=t7a, m2/6=t6a 1395 SCRATCH 1, 11, rsp+73*mmsize 1396 SUMSUB_PACK_D 2, 0, 6, 4, 1 ; m2=-out3, m0=t6 1397 NEGD m2 ; m2=out3 1398 UNSCRATCH 1, 11, rsp+73*mmsize 1399 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=out12, m1=t7 1400 SCRATCH 2, 11, rsp+73*mmsize 1401 UNSCRATCH 2, 12, rsp+74*mmsize ; t11a 1402 SCRATCH 3, 12, rsp+74*mmsize 1403 1404 UNSCRATCH 3, 8, rsp+70*mmsize ; t10a 1405 mova m4, [rsp+65*mmsize] ; t8a 1406 mova m5, [rsp+66*mmsize] ; t9a 1407 SUMSUB_BA d, 3, 4, 6 ; m3=-out1, m4=t10 1408 NEGD m3 ; m3=out1 1409 SUMSUB_BA d, 2, 5, 6 ; m2=out14, m5=t11 1410 UNSCRATCH 6, 9, rsp+71*mmsize ; t0 1411 UNSCRATCH 7, 14, rsp+76*mmsize ; t14a 1412 SCRATCH 3, 9, rsp+71*mmsize 1413 SCRATCH 2, 14, rsp+76*mmsize 1414 1415 SUMSUB_MUL 1, 0, 2, 3, 11585, 11585 ; m1=out4, m0=out11 1416 mova [rsp+65*mmsize], m0 1417 SUMSUB_MUL 5, 4, 2, 3, 11585, 11585 ; m5=out6, m4=out9 1418 UNSCRATCH 0, 15, rsp+77*mmsize ; t15a 1419 SUMSUB_MUL 7, 0, 2, 3, 11585, m11585 ; m7=out10, m0=out5 1420 1421 mova m2, [rsp+68*mmsize] ; t2 1422 SUMSUB_BA d, 2, 6, 3 ; m2=out0, m6=t2a 1423 SCRATCH 2, 8, rsp+70*mmsize 1424 mova m2, [rsp+67*mmsize] ; t1 1425 mova m3, [rsp+69*mmsize] ; t3 1426 mova [rsp+67*mmsize], m7 1427 SUMSUB_BA d, 3, 2, 7 ; m3=-out15, m2=t3a 1428 NEGD m3 ; m3=out15 1429 SCRATCH 3, 15, rsp+77*mmsize 1430 SUMSUB_MUL 6, 2, 7, 3, 11585, m11585 ; m6=out8, m2=out7 1431 mova m7, [rsp+67*mmsize] 1432 1433 SWAP 0, 1 1434 SWAP 2, 5, 4, 6, 7, 3 1435 %endmacro 1436 1437 %macro IADST16_FN 7 1438 cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \ 1439 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 1440 dst, stride, block, eob 1441 mova m0, [pw_1023] 1442 1443 .body: 1444 mova [rsp+64*mmsize], m0 1445 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 1446 %if ARCH_X86_64 1447 mov dstbakq, dstq 1448 movsxd cntq, cntd 1449 %endif 1450 %if PIC 1451 lea ptrq, [%7_16x16] 1452 movzx cntd, byte [ptrq+cntq-1] 1453 %else 1454 movzx cntd, byte [%7_16x16+cntq-1] 1455 %endif 1456 mov skipd, 4 1457 sub skipd, cntd 1458 mov 
ptrq, rsp 1459 .loop_1: 1460 %2_1D blockq 1461 1462 TRANSPOSE4x4D 0, 1, 2, 3, 7 1463 mova [ptrq+ 1*mmsize], m0 1464 mova [ptrq+ 5*mmsize], m1 1465 mova [ptrq+ 9*mmsize], m2 1466 mova [ptrq+13*mmsize], m3 1467 mova m7, [rsp+65*mmsize] 1468 TRANSPOSE4x4D 4, 5, 6, 7, 0 1469 mova [ptrq+ 2*mmsize], m4 1470 mova [ptrq+ 6*mmsize], m5 1471 mova [ptrq+10*mmsize], m6 1472 mova [ptrq+14*mmsize], m7 1473 UNSCRATCH 0, 8, rsp+(%3+0)*mmsize 1474 UNSCRATCH 1, 9, rsp+(%3+1)*mmsize 1475 UNSCRATCH 2, 10, rsp+(%3+2)*mmsize 1476 UNSCRATCH 3, 11, rsp+(%3+3)*mmsize 1477 TRANSPOSE4x4D 0, 1, 2, 3, 7 1478 mova [ptrq+ 0*mmsize], m0 1479 mova [ptrq+ 4*mmsize], m1 1480 mova [ptrq+ 8*mmsize], m2 1481 mova [ptrq+12*mmsize], m3 1482 UNSCRATCH 4, 12, rsp+(%3+4)*mmsize 1483 UNSCRATCH 5, 13, rsp+(%3+5)*mmsize 1484 UNSCRATCH 6, 14, rsp+(%3+6)*mmsize 1485 UNSCRATCH 7, 15, rsp+(%3+7)*mmsize 1486 TRANSPOSE4x4D 4, 5, 6, 7, 0 1487 mova [ptrq+ 3*mmsize], m4 1488 mova [ptrq+ 7*mmsize], m5 1489 mova [ptrq+11*mmsize], m6 1490 mova [ptrq+15*mmsize], m7 1491 add ptrq, 16 * mmsize 1492 add blockq, mmsize 1493 dec cntd 1494 jg .loop_1 1495 1496 ; zero-pad the remainder (skipped cols) 1497 test skipd, skipd 1498 jz .end 1499 add skipd, skipd 1500 lea blockq, [blockq+skipq*(mmsize/2)] 1501 pxor m0, m0 1502 .loop_z: 1503 mova [ptrq+mmsize*0], m0 1504 mova [ptrq+mmsize*1], m0 1505 mova [ptrq+mmsize*2], m0 1506 mova [ptrq+mmsize*3], m0 1507 mova [ptrq+mmsize*4], m0 1508 mova [ptrq+mmsize*5], m0 1509 mova [ptrq+mmsize*6], m0 1510 mova [ptrq+mmsize*7], m0 1511 add ptrq, 8 * mmsize 1512 dec skipd 1513 jg .loop_z 1514 .end: 1515 1516 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1517 lea stride3q, [strideq*3] 1518 mov cntd, 4 1519 mov ptrq, rsp 1520 .loop_2: 1521 %5_1D ptrq 1522 1523 pxor m7, m7 1524 lea dstq, [dstq+strideq*4] 1525 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6 1526 lea dstq, [dstq+strideq*4] 1527 mova m0, [rsp+65*mmsize] 1528 mova m1, [rsp+64*mmsize] 1529 mova m2, [pd_32] 1530 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 1531 1532 %if ARCH_X86_64 1533 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst 1534 %else 1535 mov dstq, dstm 1536 %endif 1537 UNSCRATCH 0, 8, rsp+(%6+0)*mmsize 1538 UNSCRATCH 4, 9, rsp+(%6+1)*mmsize 1539 UNSCRATCH 5, 10, rsp+(%6+2)*mmsize 1540 UNSCRATCH 3, 11, rsp+(%6+3)*mmsize 1541 ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6 1542 %if ARCH_X86_64 1543 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1544 lea dstq, [dstbakq+stride3q*4] 1545 %else 1546 lea dstq, [dstq+stride3q*4] 1547 %endif 1548 UNSCRATCH 4, 12, rsp+(%6+4)*mmsize 1549 UNSCRATCH 5, 13, rsp+(%6+5)*mmsize 1550 UNSCRATCH 6, 14, rsp+(%6+6)*mmsize 1551 UNSCRATCH 0, 15, rsp+(%6+7)*mmsize 1552 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 1553 1554 add ptrq, mmsize 1555 %if ARCH_X86_64 1556 add dstbakq, 8 1557 mov dstq, dstbakq 1558 %else 1559 add dword dstm, 8 1560 mov dstq, dstm 1561 %endif 1562 dec cntd 1563 jg .loop_2 1564 1565 ; m7 is still zero 1566 ZERO_BLOCK blockq-4*mmsize, 64, 16, m7 1567 RET 1568 1569 cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \ 1570 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 1571 dst, stride, block, eob 1572 mova m0, [pw_4095] 1573 jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body 1574 %endmacro 1575 1576 INIT_XMM sse2 1577 IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70, row 1578 IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67, col 1579 IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default 1580 1581 %macro IDCT32_1D 2-3 8 * mmsize; 
pass[1/2], src, src_stride 1582 IDCT16_1D %2, 2 * %3, 272, 257 1583 %if ARCH_X86_64 1584 mova [rsp+257*mmsize], m8 1585 mova [rsp+258*mmsize], m9 1586 mova [rsp+259*mmsize], m10 1587 mova [rsp+260*mmsize], m11 1588 mova [rsp+261*mmsize], m12 1589 mova [rsp+262*mmsize], m13 1590 mova [rsp+263*mmsize], m14 1591 mova [rsp+264*mmsize], m15 1592 %endif 1593 mova [rsp+265*mmsize], m0 1594 mova [rsp+266*mmsize], m1 1595 mova [rsp+267*mmsize], m2 1596 mova [rsp+268*mmsize], m3 1597 mova [rsp+269*mmsize], m4 1598 mova [rsp+270*mmsize], m5 1599 mova [rsp+271*mmsize], m6 1600 1601 ; r257-260: t0-3 1602 ; r265-272: t4/5a/6a/7/8/9a/10/11a 1603 ; r261-264: t12a/13/14a/15 1604 ; r273-274 is free as scratch space, and 275-282 mirrors m8-15 on 32bit 1605 1606 mova m0, [%2+ 1*%3] ; in1 1607 mova m1, [%2+15*%3] ; in15 1608 mova m2, [%2+17*%3] ; in17 1609 mova m3, [%2+31*%3] ; in31 1610 SUMSUB_MUL 0, 3, 4, 5, 16364, 804 ; m0=t31a, m3=t16a 1611 SUMSUB_MUL 2, 1, 4, 5, 11003, 12140 ; m2=t30a, m1=t17a 1612 SUMSUB_BA d, 1, 3, 4 ; m1=t16, m3=t17 1613 SUMSUB_BA d, 2, 0, 4 ; m2=t31, m0=t30 1614 SUMSUB_MUL 0, 3, 4, 5, 16069, 3196 ; m0=t30a, m3=t17a 1615 SCRATCH 0, 8, rsp+275*mmsize 1616 SCRATCH 2, 9, rsp+276*mmsize 1617 1618 ; end of stage 1-3 first quart 1619 1620 mova m0, [%2+ 7*%3] ; in7 1621 mova m2, [%2+ 9*%3] ; in9 1622 mova m4, [%2+23*%3] ; in23 1623 mova m5, [%2+25*%3] ; in25 1624 SUMSUB_MUL 2, 4, 6, 7, 14811, 7005 ; m2=t29a, m4=t18a 1625 SUMSUB_MUL 5, 0, 6, 7, 5520, 15426 ; m5=t28a, m0=t19a 1626 SUMSUB_BA d, 4, 0, 6 ; m4=t19, m0=t18 1627 SUMSUB_BA d, 2, 5, 6 ; m2=t28, m5=t29 1628 SUMSUB_MUL 5, 0, 6, 7, 3196, m16069 ; m5=t29a, m0=t18a 1629 1630 ; end of stage 1-3 second quart 1631 1632 SUMSUB_BA d, 4, 1, 6 ; m4=t16a, m1=t19a 1633 SUMSUB_BA d, 0, 3, 6 ; m0=t17, m3=t18 1634 UNSCRATCH 6, 8, rsp+275*mmsize ; t30a 1635 UNSCRATCH 7, 9, rsp+276*mmsize ; t31 1636 mova [rsp+273*mmsize], m4 1637 mova [rsp+274*mmsize], m0 1638 SUMSUB_BA d, 2, 7, 0 ; m2=t31a, m7=t28a 1639 SUMSUB_BA d, 5, 6, 0 ; m5=t30, m6=t29 1640 SUMSUB_MUL 6, 3, 0, 4, 15137, 6270 ; m6=t29a, m3=t18a 1641 SUMSUB_MUL 7, 1, 0, 4, 15137, 6270 ; m7=t28, m1=t19 1642 SCRATCH 3, 10, rsp+277*mmsize 1643 SCRATCH 1, 11, rsp+278*mmsize 1644 SCRATCH 7, 12, rsp+279*mmsize 1645 SCRATCH 6, 13, rsp+280*mmsize 1646 SCRATCH 5, 14, rsp+281*mmsize 1647 SCRATCH 2, 15, rsp+282*mmsize 1648 1649 ; end of stage 4-5 first half 1650 1651 mova m0, [%2+ 5*%3] ; in5 1652 mova m1, [%2+11*%3] ; in11 1653 mova m2, [%2+21*%3] ; in21 1654 mova m3, [%2+27*%3] ; in27 1655 SUMSUB_MUL 0, 3, 4, 5, 15893, 3981 ; m0=t27a, m3=t20a 1656 SUMSUB_MUL 2, 1, 4, 5, 8423, 14053 ; m2=t26a, m1=t21a 1657 SUMSUB_BA d, 1, 3, 4 ; m1=t20, m3=t21 1658 SUMSUB_BA d, 2, 0, 4 ; m2=t27, m0=t26 1659 SUMSUB_MUL 0, 3, 4, 5, 9102, 13623 ; m0=t26a, m3=t21a 1660 SCRATCH 0, 8, rsp+275*mmsize 1661 SCRATCH 2, 9, rsp+276*mmsize 1662 1663 ; end of stage 1-3 third quart 1664 1665 mova m0, [%2+ 3*%3] ; in3 1666 mova m2, [%2+13*%3] ; in13 1667 mova m4, [%2+19*%3] ; in19 1668 mova m5, [%2+29*%3] ; in29 1669 SUMSUB_MUL 2, 4, 6, 7, 13160, 9760 ; m2=t25a, m4=t22a 1670 SUMSUB_MUL 5, 0, 6, 7, 2404, 16207 ; m5=t24a, m0=t23a 1671 SUMSUB_BA d, 4, 0, 6 ; m4=t23, m0=t22 1672 SUMSUB_BA d, 2, 5, 6 ; m2=t24, m5=t25 1673 SUMSUB_MUL 5, 0, 6, 7, 13623, m9102 ; m5=t25a, m0=t22a 1674 1675 ; end of stage 1-3 fourth quart 1676 1677 SUMSUB_BA d, 1, 4, 6 ; m1=t23a, m4=t20a 1678 SUMSUB_BA d, 3, 0, 6 ; m3=t22, m0=t21 1679 UNSCRATCH 6, 8, rsp+275*mmsize ; t26a 1680 UNSCRATCH 7, 9, rsp+276*mmsize ; t27 1681 SCRATCH 3, 8, rsp+275*mmsize 1682 SCRATCH 1, 9, 
rsp+276*mmsize 1683 SUMSUB_BA d, 7, 2, 1 ; m7=t24a, m2=t27a 1684 SUMSUB_BA d, 6, 5, 1 ; m6=t25, m5=t26 1685 SUMSUB_MUL 2, 4, 1, 3, 6270, m15137 ; m2=t27, m4=t20 1686 SUMSUB_MUL 5, 0, 1, 3, 6270, m15137 ; m5=t26a, m0=t21a 1687 1688 ; end of stage 4-5 second half 1689 1690 UNSCRATCH 1, 12, rsp+279*mmsize ; t28 1691 UNSCRATCH 3, 13, rsp+280*mmsize ; t29a 1692 SCRATCH 4, 12, rsp+279*mmsize 1693 SCRATCH 0, 13, rsp+280*mmsize 1694 SUMSUB_BA d, 5, 3, 0 ; m5=t29, m3=t26 1695 SUMSUB_BA d, 2, 1, 0 ; m2=t28a, m1=t27a 1696 UNSCRATCH 0, 14, rsp+281*mmsize ; t30 1697 UNSCRATCH 4, 15, rsp+282*mmsize ; t31a 1698 SCRATCH 2, 14, rsp+281*mmsize 1699 SCRATCH 5, 15, rsp+282*mmsize 1700 SUMSUB_BA d, 6, 0, 2 ; m6=t30a, m0=t25a 1701 SUMSUB_BA d, 7, 4, 2 ; m7=t31, m4=t24 1702 1703 mova m2, [rsp+273*mmsize] ; t16a 1704 mova m5, [rsp+274*mmsize] ; t17 1705 mova [rsp+273*mmsize], m6 1706 mova [rsp+274*mmsize], m7 1707 UNSCRATCH 6, 10, rsp+277*mmsize ; t18a 1708 UNSCRATCH 7, 11, rsp+278*mmsize ; t19 1709 SCRATCH 4, 10, rsp+277*mmsize 1710 SCRATCH 0, 11, rsp+278*mmsize 1711 UNSCRATCH 4, 12, rsp+279*mmsize ; t20 1712 UNSCRATCH 0, 13, rsp+280*mmsize ; t21a 1713 SCRATCH 3, 12, rsp+279*mmsize 1714 SCRATCH 1, 13, rsp+280*mmsize 1715 SUMSUB_BA d, 0, 6, 1 ; m0=t18, m6=t21 1716 SUMSUB_BA d, 4, 7, 1 ; m4=t19a, m7=t20a 1717 UNSCRATCH 3, 8, rsp+275*mmsize ; t22 1718 UNSCRATCH 1, 9, rsp+276*mmsize ; t23a 1719 SCRATCH 0, 8, rsp+275*mmsize 1720 SCRATCH 4, 9, rsp+276*mmsize 1721 SUMSUB_BA d, 3, 5, 0 ; m3=t17a, m5=t22a 1722 SUMSUB_BA d, 1, 2, 0 ; m1=t16, m2=t23 1723 1724 ; end of stage 6 1725 1726 UNSCRATCH 0, 10, rsp+277*mmsize ; t24 1727 UNSCRATCH 4, 11, rsp+278*mmsize ; t25a 1728 SCRATCH 1, 10, rsp+277*mmsize 1729 SCRATCH 3, 11, rsp+278*mmsize 1730 SUMSUB_MUL 0, 2, 1, 3, 11585, 11585 ; m0=t24a, m2=t23a 1731 SUMSUB_MUL 4, 5, 1, 3, 11585, 11585 ; m4=t25, m5=t22 1732 UNSCRATCH 1, 12, rsp+279*mmsize ; t26 1733 UNSCRATCH 3, 13, rsp+280*mmsize ; t27a 1734 SCRATCH 0, 12, rsp+279*mmsize 1735 SCRATCH 4, 13, rsp+280*mmsize 1736 SUMSUB_MUL 3, 7, 0, 4, 11585, 11585 ; m3=t27, m7=t20 1737 SUMSUB_MUL 1, 6, 0, 4, 11585, 11585 ; m1=t26a, m6=t21a 1738 1739 ; end of stage 7 1740 1741 mova m0, [rsp+269*mmsize] ; t8 1742 mova m4, [rsp+270*mmsize] ; t9a 1743 mova [rsp+269*mmsize], m1 ; t26a 1744 mova [rsp+270*mmsize], m3 ; t27 1745 mova m3, [rsp+271*mmsize] ; t10 1746 SUMSUB_BA d, 2, 0, 1 ; m2=out8, m0=out23 1747 SUMSUB_BA d, 5, 4, 1 ; m5=out9, m4=out22 1748 SUMSUB_BA d, 6, 3, 1 ; m6=out10, m3=out21 1749 mova m1, [rsp+272*mmsize] ; t11a 1750 mova [rsp+271*mmsize], m0 1751 SUMSUB_BA d, 7, 1, 0 ; m7=out11, m1=out20 1752 1753 %if %1 == 1 1754 TRANSPOSE4x4D 2, 5, 6, 7, 0 1755 mova [ptrq+ 2*mmsize], m2 1756 mova [ptrq+10*mmsize], m5 1757 mova [ptrq+18*mmsize], m6 1758 mova [ptrq+26*mmsize], m7 1759 %else ; %1 == 2 1760 pxor m0, m0 1761 lea dstq, [dstq+strideq*8] 1762 ROUND_AND_STORE_4x4 2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6 1763 %endif 1764 mova m2, [rsp+271*mmsize] 1765 %if %1 == 1 1766 TRANSPOSE4x4D 1, 3, 4, 2, 0 1767 mova [ptrq+ 5*mmsize], m1 1768 mova [ptrq+13*mmsize], m3 1769 mova [ptrq+21*mmsize], m4 1770 mova [ptrq+29*mmsize], m2 1771 %else ; %1 == 2 1772 lea dstq, [dstq+stride3q*4] 1773 ROUND_AND_STORE_4x4 1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6 1774 %endif 1775 1776 ; end of last stage + store for out8-11 and out20-23 1777 1778 UNSCRATCH 0, 9, rsp+276*mmsize ; t19a 1779 UNSCRATCH 1, 8, rsp+275*mmsize ; t18 1780 UNSCRATCH 2, 11, rsp+278*mmsize ; t17a 1781 UNSCRATCH 3, 10, rsp+277*mmsize ; t16 1782 mova m7, [rsp+261*mmsize] ; t12a 1783 
mova m6, [rsp+262*mmsize] ; t13 1784 mova m5, [rsp+263*mmsize] ; t14a 1785 SUMSUB_BA d, 0, 7, 4 ; m0=out12, m7=out19 1786 SUMSUB_BA d, 1, 6, 4 ; m1=out13, m6=out18 1787 SUMSUB_BA d, 2, 5, 4 ; m2=out14, m5=out17 1788 mova m4, [rsp+264*mmsize] ; t15 1789 SCRATCH 7, 8, rsp+275*mmsize 1790 SUMSUB_BA d, 3, 4, 7 ; m3=out15, m4=out16 1791 1792 %if %1 == 1 1793 TRANSPOSE4x4D 0, 1, 2, 3, 7 1794 mova [ptrq+ 3*mmsize], m0 1795 mova [ptrq+11*mmsize], m1 1796 mova [ptrq+19*mmsize], m2 1797 mova [ptrq+27*mmsize], m3 1798 %else ; %1 == 2 1799 %if ARCH_X86_64 1800 SWAP 7, 9 1801 lea dstq, [dstbakq+stride3q*4] 1802 %else ; x86-32 1803 pxor m7, m7 1804 mov dstq, dstm 1805 lea dstq, [dstq+stride3q*4] 1806 %endif 1807 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6 1808 %endif 1809 UNSCRATCH 0, 8, rsp+275*mmsize ; out19 1810 %if %1 == 1 1811 TRANSPOSE4x4D 4, 5, 6, 0, 7 1812 mova [ptrq+ 4*mmsize], m4 1813 mova [ptrq+12*mmsize], m5 1814 mova [ptrq+20*mmsize], m6 1815 mova [ptrq+28*mmsize], m0 1816 %else ; %1 == 2 1817 lea dstq, [dstq+strideq*4] 1818 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6 1819 %endif 1820 1821 ; end of last stage + store for out12-19 1822 1823 %if ARCH_X86_64 1824 SWAP 7, 8 1825 %endif 1826 mova m7, [rsp+257*mmsize] ; t0 1827 mova m6, [rsp+258*mmsize] ; t1 1828 mova m5, [rsp+259*mmsize] ; t2 1829 mova m4, [rsp+260*mmsize] ; t3 1830 mova m0, [rsp+274*mmsize] ; t31 1831 mova m1, [rsp+273*mmsize] ; t30a 1832 UNSCRATCH 2, 15, rsp+282*mmsize ; t29 1833 SUMSUB_BA d, 0, 7, 3 ; m0=out0, m7=out31 1834 SUMSUB_BA d, 1, 6, 3 ; m1=out1, m6=out30 1835 SUMSUB_BA d, 2, 5, 3 ; m2=out2, m5=out29 1836 SCRATCH 0, 9, rsp+276*mmsize 1837 UNSCRATCH 3, 14, rsp+281*mmsize ; t28a 1838 SUMSUB_BA d, 3, 4, 0 ; m3=out3, m4=out28 1839 1840 %if %1 == 1 1841 TRANSPOSE4x4D 4, 5, 6, 7, 0 1842 mova [ptrq+ 7*mmsize], m4 1843 mova [ptrq+15*mmsize], m5 1844 mova [ptrq+23*mmsize], m6 1845 mova [ptrq+31*mmsize], m7 1846 %else ; %1 == 2 1847 %if ARCH_X86_64 1848 SWAP 0, 8 1849 %else ; x86-32 1850 pxor m0, m0 1851 %endif 1852 lea dstq, [dstq+stride3q*4] 1853 ROUND_AND_STORE_4x4 4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6 1854 %endif 1855 UNSCRATCH 7, 9, rsp+276*mmsize ; out0 1856 %if %1 == 1 1857 TRANSPOSE4x4D 7, 1, 2, 3, 0 1858 mova [ptrq+ 0*mmsize], m7 1859 mova [ptrq+ 8*mmsize], m1 1860 mova [ptrq+16*mmsize], m2 1861 mova [ptrq+24*mmsize], m3 1862 %else ; %1 == 2 1863 %if ARCH_X86_64 1864 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst 1865 %else ; x86-32 1866 mov dstq, dstm 1867 %endif 1868 ROUND_AND_STORE_4x4 7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6 1869 %if ARCH_X86_64 1870 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 1871 %endif 1872 %endif 1873 1874 ; end of last stage + store for out0-3 and out28-31 1875 1876 %if ARCH_X86_64 1877 SWAP 0, 8 1878 %endif 1879 mova m7, [rsp+265*mmsize] ; t4 1880 mova m6, [rsp+266*mmsize] ; t5a 1881 mova m5, [rsp+267*mmsize] ; t6a 1882 mova m4, [rsp+268*mmsize] ; t7 1883 mova m0, [rsp+270*mmsize] ; t27 1884 mova m1, [rsp+269*mmsize] ; t26a 1885 UNSCRATCH 2, 13, rsp+280*mmsize ; t25 1886 SUMSUB_BA d, 0, 7, 3 ; m0=out4, m7=out27 1887 SUMSUB_BA d, 1, 6, 3 ; m1=out5, m6=out26 1888 SUMSUB_BA d, 2, 5, 3 ; m2=out6, m5=out25 1889 UNSCRATCH 3, 12, rsp+279*mmsize ; t24a 1890 SCRATCH 7, 9, rsp+276*mmsize 1891 SUMSUB_BA d, 3, 4, 7 ; m3=out7, m4=out24 1892 1893 %if %1 == 1 1894 TRANSPOSE4x4D 0, 1, 2, 3, 7 1895 mova [ptrq+ 1*mmsize], m0 1896 mova [ptrq+ 9*mmsize], m1 1897 mova [ptrq+17*mmsize], m2 1898 mova [ptrq+25*mmsize], m3 1899 %else ; %1 
== 2 1900 %if ARCH_X86_64 1901 SWAP 7, 8 1902 lea dstq, [dstbakq+strideq*4] 1903 %else ; x86-32 1904 pxor m7, m7 1905 lea dstq, [dstq+strideq*4] 1906 %endif 1907 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6 1908 %endif 1909 UNSCRATCH 0, 9, rsp+276*mmsize ; out27 1910 %if %1 == 1 1911 TRANSPOSE4x4D 4, 5, 6, 0, 7 1912 mova [ptrq+ 6*mmsize], m4 1913 mova [ptrq+14*mmsize], m5 1914 mova [ptrq+22*mmsize], m6 1915 mova [ptrq+30*mmsize], m0 1916 %else ; %1 == 2 1917 %if ARCH_X86_64 1918 lea dstq, [dstbakq+stride3q*8] 1919 %else 1920 mov dstq, dstm 1921 lea dstq, [dstq+stride3q*8] 1922 %endif 1923 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6 1924 %endif 1925 1926 ; end of last stage + store for out4-7 and out24-27 1927 %endmacro 1928 1929 INIT_XMM sse2 1930 cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \ 1931 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 1932 dst, stride, block, eob 1933 mova m0, [pw_1023] 1934 cmp eobd, 1 1935 jg .idctfull 1936 1937 ; dc-only - the 10bit version can be done entirely in 32bit, since the max 1938 ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily 1939 ; fits in 32bit 1940 DEFINE_ARGS dst, stride, block, coef 1941 pxor m2, m2 1942 DC_ONLY 6, m2 1943 movd m1, coefd 1944 pshuflw m1, m1, q0000 1945 punpcklqdq m1, m1 1946 DEFINE_ARGS dst, stride, cnt 1947 mov cntd, 32 1948 .loop_dc: 1949 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize 1950 STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize 1951 add dstq, strideq 1952 dec cntd 1953 jg .loop_dc 1954 RET 1955 1956 .idctfull: 1957 mova [rsp+256*mmsize], m0 1958 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak 1959 %if ARCH_X86_64 1960 mov dstbakq, dstq 1961 movsxd cntq, cntd 1962 %endif 1963 %if PIC 1964 lea ptrq, [default_32x32] 1965 movzx cntd, byte [ptrq+cntq-1] 1966 %else 1967 movzx cntd, byte [default_32x32+cntq-1] 1968 %endif 1969 mov skipd, 8 1970 sub skipd, cntd 1971 mov ptrq, rsp 1972 .loop_1: 1973 IDCT32_1D 1, blockq 1974 1975 add ptrq, 32 * mmsize 1976 add blockq, mmsize 1977 dec cntd 1978 jg .loop_1 1979 1980 ; zero-pad the remainder (skipped cols) 1981 test skipd, skipd 1982 jz .end 1983 shl skipd, 2 1984 lea blockq, [blockq+skipq*(mmsize/4)] 1985 pxor m0, m0 1986 .loop_z: 1987 mova [ptrq+mmsize*0], m0 1988 mova [ptrq+mmsize*1], m0 1989 mova [ptrq+mmsize*2], m0 1990 mova [ptrq+mmsize*3], m0 1991 mova [ptrq+mmsize*4], m0 1992 mova [ptrq+mmsize*5], m0 1993 mova [ptrq+mmsize*6], m0 1994 mova [ptrq+mmsize*7], m0 1995 add ptrq, 8 * mmsize 1996 dec skipd 1997 jg .loop_z 1998 .end: 1999 2000 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak 2001 lea stride3q, [strideq*3] 2002 mov cntd, 8 2003 mov ptrq, rsp 2004 .loop_2: 2005 IDCT32_1D 2, ptrq 2006 2007 add ptrq, mmsize 2008 %if ARCH_X86_64 2009 add dstbakq, 8 2010 mov dstq, dstbakq 2011 %else 2012 add dword dstm, 8 2013 mov dstq, dstm 2014 %endif 2015 dec cntd 2016 jg .loop_2 2017 2018 ; m7 is still zero 2019 ZERO_BLOCK blockq-8*mmsize, 128, 32, m7 2020 RET 2021 2022 INIT_XMM sse2 2023 cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \ 2024 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \ 2025 dst, stride, block, eob 2026 mova m0, [pw_4095] 2027 cmp eobd, 1 2028 jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull 2029 2030 ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign 2031 ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies 2032 DEFINE_ARGS dst, stride, block, coef, coefl 2033 pxor m2, m2 2034 DC_ONLY_64BIT 6, m2 
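    ; DC_ONLY_64BIT above avoids the dword overflow by splitting each
    ; multiply; an illustrative sketch of its 32-bit branch, per pass:
    ;   (dc * 11585 + 8192) >> 14
    ;     == (dc >> 14) * 11585 + (((dc & 0x3fff) * 11585 + 8192) >> 14)
    ; so every intermediate product stays comfortably inside 32 bits.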
2035 movd m1, coefd 2036 pshuflw m1, m1, q0000 2037 punpcklqdq m1, m1 2038 DEFINE_ARGS dst, stride, cnt 2039 mov cntd, 32 2040 .loop_dc: 2041 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize 2042 STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize 2043 add dstq, strideq 2044 dec cntd 2045 jg .loop_dc 2046 RET