; h264_intrapred.asm (55163B)
1 ;****************************************************************************** 2 ;* H.264 intra prediction asm optimizations 3 ;* Copyright (c) 2010 Fiona Glaser 4 ;* Copyright (c) 2010 Holger Lubitz 5 ;* Copyright (c) 2010 Loren Merritt 6 ;* Copyright (c) 2010 Ronald S. Bultje 7 ;* 8 ;* This file is part of FFmpeg. 9 ;* 10 ;* FFmpeg is free software; you can redistribute it and/or 11 ;* modify it under the terms of the GNU Lesser General Public 12 ;* License as published by the Free Software Foundation; either 13 ;* version 2.1 of the License, or (at your option) any later version. 14 ;* 15 ;* FFmpeg is distributed in the hope that it will be useful, 16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of 17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 ;* Lesser General Public License for more details. 19 ;* 20 ;* You should have received a copy of the GNU Lesser General Public 21 ;* License along with FFmpeg; if not, write to the Free Software 22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 23 ;****************************************************************************** 24 25 %include "libavutil/x86/x86util.asm" 26 27 SECTION_RODATA 28 29 tm_shuf: times 8 db 0x03, 0x80 30 pw_ff00: times 8 dw 0xff00 31 plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 32 db 1, 2, 3, 4, 5, 6, 7, 8 33 plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 34 db 1, 2, 3, 4, 0, 0, 0, 0 35 pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 36 pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 37 pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 38 pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 39 40 SECTION .text 41 42 cextern pb_1 43 cextern pb_3 44 cextern pw_4 45 cextern pw_8 46 47 ;----------------------------------------------------------------------------- 48 ; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride) 49 ;----------------------------------------------------------------------------- 50 51 INIT_XMM sse 52 cglobal pred16x16_vertical_8, 2,3 53 
sub r0, r1 54 mov r2, 4 55 movaps xmm0, [r0] 56 .loop: 57 movaps [r0+r1*1], xmm0 58 movaps [r0+r1*2], xmm0 59 lea r0, [r0+r1*2] 60 movaps [r0+r1*1], xmm0 61 movaps [r0+r1*2], xmm0 62 lea r0, [r0+r1*2] 63 dec r2 64 jg .loop 65 RET 66 67 ;----------------------------------------------------------------------------- 68 ; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride) 69 ;----------------------------------------------------------------------------- 70 71 %macro PRED16x16_H 0 72 cglobal pred16x16_horizontal_8, 2,3 73 mov r2, 8 74 %if cpuflag(ssse3) 75 mova m2, [pb_3] 76 %endif 77 .loop: 78 movd m0, [r0+r1*0-4] 79 movd m1, [r0+r1*1-4] 80 81 %if cpuflag(ssse3) 82 pshufb m0, m2 83 pshufb m1, m2 84 %else 85 punpcklbw m0, m0 86 punpcklbw m1, m1 87 SPLATW m0, m0, 3 88 SPLATW m1, m1, 3 89 %endif 90 91 mova [r0+r1*0], m0 92 mova [r0+r1*1], m1 93 lea r0, [r0+r1*2] 94 dec r2 95 jg .loop 96 RET 97 %endmacro 98 99 INIT_XMM sse2 100 PRED16x16_H 101 INIT_XMM ssse3 102 PRED16x16_H 103 104 ;----------------------------------------------------------------------------- 105 ; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride) 106 ;----------------------------------------------------------------------------- 107 108 %macro PRED16x16_DC 0 109 cglobal pred16x16_dc_8, 2,7 110 mov r4, r0 111 sub r0, r1 112 pxor mm0, mm0 113 pxor mm1, mm1 114 psadbw mm0, [r0+0] 115 psadbw mm1, [r0+8] 116 dec r0 117 movzx r5d, byte [r0+r1*1] 118 paddw mm0, mm1 119 movd r6d, mm0 120 lea r0, [r0+r1*2] 121 %rep 7 122 movzx r2d, byte [r0+r1*0] 123 movzx r3d, byte [r0+r1*1] 124 add r5d, r2d 125 add r6d, r3d 126 lea r0, [r0+r1*2] 127 %endrep 128 movzx r2d, byte [r0+r1*0] 129 add r5d, r6d 130 lea r2d, [r2+r5+16] 131 shr r2d, 5 132 %if cpuflag(ssse3) 133 pxor m1, m1 134 %endif 135 SPLATB_REG m0, r2, m1 136 137 mov r3d, 4 138 .loop: 139 mova [r4+r1*0], m0 140 mova [r4+r1*1], m0 141 lea r4, [r4+r1*2] 142 mova [r4+r1*0], m0 143 mova [r4+r1*1], m0 144 lea r4, [r4+r1*2] 145 dec r3d 146 jg .loop 147 RET 
148 %endmacro 149 150 INIT_XMM sse2 151 PRED16x16_DC 152 INIT_XMM ssse3 153 PRED16x16_DC 154 155 ;----------------------------------------------------------------------------- 156 ; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride) 157 ;----------------------------------------------------------------------------- 158 159 INIT_XMM sse2 160 cglobal pred16x16_tm_vp8_8, 2,6,6 161 sub r0, r1 162 pxor xmm2, xmm2 163 movdqa xmm0, [r0] 164 movdqa xmm1, xmm0 165 punpcklbw xmm0, xmm2 166 punpckhbw xmm1, xmm2 167 movzx r4d, byte [r0-1] 168 mov r5d, 8 169 .loop: 170 movzx r2d, byte [r0+r1*1-1] 171 movzx r3d, byte [r0+r1*2-1] 172 sub r2d, r4d 173 sub r3d, r4d 174 movd xmm2, r2d 175 movd xmm4, r3d 176 pshuflw xmm2, xmm2, 0 177 pshuflw xmm4, xmm4, 0 178 punpcklqdq xmm2, xmm2 179 punpcklqdq xmm4, xmm4 180 movdqa xmm3, xmm2 181 movdqa xmm5, xmm4 182 paddw xmm2, xmm0 183 paddw xmm3, xmm1 184 paddw xmm4, xmm0 185 paddw xmm5, xmm1 186 packuswb xmm2, xmm3 187 packuswb xmm4, xmm5 188 movdqa [r0+r1*1], xmm2 189 movdqa [r0+r1*2], xmm4 190 lea r0, [r0+r1*2] 191 dec r5d 192 jg .loop 193 RET 194 195 %if HAVE_AVX2_EXTERNAL 196 INIT_YMM avx2 197 cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration 198 sub dstq, strideq 199 pmovzxbw m0, [dstq] 200 vpbroadcastb xm1, [r0-1] 201 pmovzxbw m1, xm1 202 psubw m0, m1 203 mov iterationd, 4 204 lea stride3q, [strideq*3] 205 .loop: 206 vpbroadcastb xm1, [dstq+strideq*1-1] 207 vpbroadcastb xm2, [dstq+strideq*2-1] 208 vpbroadcastb xm3, [dstq+stride3q-1] 209 vpbroadcastb xm4, [dstq+strideq*4-1] 210 pmovzxbw m1, xm1 211 pmovzxbw m2, xm2 212 pmovzxbw m3, xm3 213 pmovzxbw m4, xm4 214 paddw m1, m0 215 paddw m2, m0 216 paddw m3, m0 217 paddw m4, m0 218 vpackuswb m1, m1, m2 219 vpackuswb m3, m3, m4 220 vpermq m1, m1, q3120 221 vpermq m3, m3, q3120 222 movdqa [dstq+strideq*1], xm1 223 vextracti128 [dstq+strideq*2], m1, 1 224 movdqa [dstq+stride3q*1], xm3 225 vextracti128 [dstq+strideq*4], m3, 1 226 lea dstq, [dstq+strideq*4] 227 dec 
iterationd 228 jg .loop 229 RET 230 %endif 231 232 ;----------------------------------------------------------------------------- 233 ; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride) 234 ;----------------------------------------------------------------------------- 235 236 %macro H264_PRED16x16_PLANE 1 237 cglobal pred16x16_plane_%1_8, 2,9,7 238 mov r2, r1 ; +stride 239 neg r1 ; -stride 240 241 movh m0, [r0+r1 -1] 242 %if cpuflag(ssse3) 243 movhps m0, [r0+r1 +8] 244 pmaddubsw m0, [plane_shuf] ; H coefficients 245 %else ; sse2 246 pxor m2, m2 247 movh m1, [r0+r1 +8] 248 punpcklbw m0, m2 249 punpcklbw m1, m2 250 pmullw m0, [pw_m8tom1] 251 pmullw m1, [pw_1to8] 252 paddw m0, m1 253 %endif 254 movhlps m1, m0 255 paddw m0, m1 256 PSHUFLW m1, m0, 0xE 257 paddw m0, m1 258 PSHUFLW m1, m0, 0x1 259 paddw m0, m1 ; sum of H coefficients 260 261 lea r4, [r0+r2*8-1] 262 lea r3, [r0+r2*4-1] 263 add r4, r2 264 265 %if ARCH_X86_64 266 %define e_reg r8 267 %else 268 %define e_reg r0 269 %endif 270 271 movzx e_reg, byte [r3+r2*2 ] 272 movzx r5, byte [r4+r1 ] 273 sub r5, e_reg 274 275 movzx e_reg, byte [r3+r2 ] 276 movzx r6, byte [r4 ] 277 sub r6, e_reg 278 lea r5, [r5+r6*2] 279 280 movzx e_reg, byte [r3+r1 ] 281 movzx r6, byte [r4+r2*2 ] 282 sub r6, e_reg 283 lea r5, [r5+r6*4] 284 285 movzx e_reg, byte [r3 ] 286 %if ARCH_X86_64 287 movzx r7, byte [r4+r2 ] 288 sub r7, e_reg 289 %else 290 movzx r6, byte [r4+r2 ] 291 sub r6, e_reg 292 lea r5, [r5+r6*4] 293 sub r5, r6 294 %endif 295 296 lea e_reg, [r3+r1*4] 297 lea r3, [r4+r2*4] 298 299 movzx r4, byte [e_reg+r2 ] 300 movzx r6, byte [r3 ] 301 sub r6, r4 302 %if ARCH_X86_64 303 lea r6, [r7+r6*2] 304 lea r5, [r5+r6*2] 305 add r5, r6 306 %else 307 lea r5, [r5+r6*4] 308 lea r5, [r5+r6*2] 309 %endif 310 311 movzx r4, byte [e_reg ] 312 %if ARCH_X86_64 313 movzx r7, byte [r3 +r2 ] 314 sub r7, r4 315 sub r5, r7 316 %else 317 movzx r6, byte [r3 +r2 ] 318 sub r6, r4 319 lea r5, [r5+r6*8] 320 sub r5, r6 321 %endif 322 323 movzx r4, byte 
[e_reg+r1 ] 324 movzx r6, byte [r3 +r2*2] 325 sub r6, r4 326 %if ARCH_X86_64 327 add r6, r7 328 %endif 329 lea r5, [r5+r6*8] 330 331 movzx r4, byte [e_reg+r2*2] 332 movzx r6, byte [r3 +r1 ] 333 sub r6, r4 334 lea r5, [r5+r6*4] 335 add r5, r6 ; sum of V coefficients 336 337 %if ARCH_X86_64 == 0 338 mov r0, r0m 339 %endif 340 341 %ifidn %1, h264 342 lea r5, [r5*5+32] 343 sar r5, 6 344 %elifidn %1, rv40 345 lea r5, [r5*5] 346 sar r5, 6 347 %elifidn %1, svq3 348 test r5, r5 349 lea r6, [r5+3] 350 cmovs r5, r6 351 sar r5, 2 ; V/4 352 lea r5, [r5*5] ; 5*(V/4) 353 test r5, r5 354 lea r6, [r5+15] 355 cmovs r5, r6 356 sar r5, 4 ; (5*(V/4))/16 357 %endif 358 359 movzx r4, byte [r0+r1 +15] 360 movzx r3, byte [r3+r2*2 ] 361 lea r3, [r3+r4+1] 362 shl r3, 4 363 364 movd r1d, m0 365 movsx r1d, r1w 366 %ifnidn %1, svq3 367 %ifidn %1, h264 368 lea r1d, [r1d*5+32] 369 %else ; rv40 370 lea r1d, [r1d*5] 371 %endif 372 sar r1d, 6 373 %else ; svq3 374 test r1d, r1d 375 lea r4d, [r1d+3] 376 cmovs r1d, r4d 377 sar r1d, 2 ; H/4 378 lea r1d, [r1d*5] ; 5*(H/4) 379 test r1d, r1d 380 lea r4d, [r1d+15] 381 cmovs r1d, r4d 382 sar r1d, 4 ; (5*(H/4))/16 383 %endif 384 movd m0, r1d 385 386 add r1d, r5d 387 add r3d, r1d 388 shl r1d, 3 389 sub r3d, r1d ; a 390 391 movd m1, r5d 392 movd m3, r3d 393 SPLATW m0, m0, 0 ; H 394 SPLATW m1, m1, 0 ; V 395 SPLATW m3, m3, 0 ; a 396 %ifidn %1, svq3 397 SWAP 0, 1 398 %endif 399 mova m2, m0 400 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) 401 psllw m2, 3 402 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H 403 paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H 404 405 mov r4, 8 406 .loop: 407 mova m3, m0 ; b[0..7] 408 mova m4, m2 ; b[8..15] 409 psraw m3, 5 410 psraw m4, 5 411 packuswb m3, m4 412 mova [r0], m3 413 paddw m0, m1 414 paddw m2, m1 415 416 mova m3, m0 ; b[0..7] 417 mova m4, m2 ; b[8..15] 418 psraw m3, 5 419 psraw m4, 5 420 packuswb m3, m4 421 mova [r0+r2], m3 422 paddw m0, m1 423 paddw m2, m1 424 425 lea r0, [r0+r2*2] 426 dec r4 427 jg .loop 428 RET 429 %endmacro 
430 431 INIT_XMM sse2 432 H264_PRED16x16_PLANE h264 433 H264_PRED16x16_PLANE rv40 434 H264_PRED16x16_PLANE svq3 435 INIT_XMM ssse3 436 H264_PRED16x16_PLANE h264 437 H264_PRED16x16_PLANE rv40 438 H264_PRED16x16_PLANE svq3 439 440 ;----------------------------------------------------------------------------- 441 ; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride) 442 ;----------------------------------------------------------------------------- 443 444 %macro H264_PRED8x8_PLANE 0 445 cglobal pred8x8_plane_8, 2,9,7 446 mov r2, r1 ; +stride 447 neg r1 ; -stride 448 449 movd m0, [r0+r1 -1] 450 %if cpuflag(ssse3) 451 movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary 452 pmaddubsw m0, [plane8_shuf] ; H coefficients 453 %else ; sse2 454 pxor m2, m2 455 movd m1, [r0+r1 +4] 456 punpckldq m0, m1 457 punpcklbw m0, m2 458 pmullw m0, [pw_m4to4] 459 %endif 460 movhlps m1, m0 461 paddw m0, m1 462 463 %if notcpuflag(ssse3) 464 PSHUFLW m1, m0, 0xE 465 paddw m0, m1 466 %endif ; !ssse3 467 468 PSHUFLW m1, m0, 0x1 469 paddw m0, m1 ; sum of H coefficients 470 471 lea r4, [r0+r2*4-1] 472 lea r3, [r0 -1] 473 add r4, r2 474 475 %if ARCH_X86_64 476 %define e_reg r8 477 %else 478 %define e_reg r0 479 %endif 480 481 movzx e_reg, byte [r3+r2*2 ] 482 movzx r5, byte [r4+r1 ] 483 sub r5, e_reg 484 485 movzx e_reg, byte [r3 ] 486 %if ARCH_X86_64 487 movzx r7, byte [r4+r2 ] 488 sub r7, e_reg 489 sub r5, r7 490 %else 491 movzx r6, byte [r4+r2 ] 492 sub r6, e_reg 493 lea r5, [r5+r6*4] 494 sub r5, r6 495 %endif 496 497 movzx e_reg, byte [r3+r1 ] 498 movzx r6, byte [r4+r2*2 ] 499 sub r6, e_reg 500 %if ARCH_X86_64 501 add r6, r7 502 %endif 503 lea r5, [r5+r6*4] 504 505 movzx e_reg, byte [r3+r2 ] 506 movzx r6, byte [r4 ] 507 sub r6, e_reg 508 lea r6, [r5+r6*2] 509 510 lea r5, [r6*9+16] 511 lea r5, [r5+r6*8] 512 sar r5, 5 513 514 %if ARCH_X86_64 == 0 515 mov r0, r0m 516 %endif 517 518 movzx r3, byte [r4+r2*2 ] 519 movzx r4, byte [r0+r1 +7] 520 lea r3, [r3+r4+1] 521 shl r3, 4 522 movd 
r1d, m0 523 movsx r1d, r1w 524 imul r1d, 17 525 add r1d, 16 526 sar r1d, 5 527 movd m0, r1d 528 add r1d, r5d 529 sub r3d, r1d 530 add r1d, r1d 531 sub r3d, r1d ; a 532 533 movd m1, r5d 534 movd m3, r3d 535 SPLATW m0, m0, 0 ; H 536 SPLATW m1, m1, 0 ; V 537 SPLATW m3, m3, 0 ; a 538 pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) 539 paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H 540 541 mov r4, 4 542 ALIGN 16 543 .loop: 544 mova m3, m0 ; b[0..7] 545 paddw m0, m1 546 psraw m3, 5 547 mova m4, m0 ; V+b[0..7] 548 paddw m0, m1 549 psraw m4, 5 550 packuswb m3, m4 551 movh [r0], m3 552 movhps [r0+r2], m3 553 554 lea r0, [r0+r2*2] 555 dec r4 556 jg .loop 557 RET 558 %endmacro 559 560 INIT_XMM sse2 561 H264_PRED8x8_PLANE 562 INIT_XMM ssse3 563 H264_PRED8x8_PLANE 564 565 ;----------------------------------------------------------------------------- 566 ; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride) 567 ;----------------------------------------------------------------------------- 568 569 INIT_XMM sse2 570 cglobal pred8x8_vertical_8, 2,2 571 sub r0, r1 572 movq m0, [r0] 573 %rep 3 574 movq [r0+r1*1], m0 575 movq [r0+r1*2], m0 576 lea r0, [r0+r1*2] 577 %endrep 578 movq [r0+r1*1], m0 579 movq [r0+r1*2], m0 580 RET 581 582 ;----------------------------------------------------------------------------- 583 ; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride) 584 ;----------------------------------------------------------------------------- 585 586 %macro PRED8x8_H 0 587 cglobal pred8x8_horizontal_8, 2,3 588 mov r2, 4 589 %if cpuflag(ssse3) 590 mova m2, [pb_3] 591 %endif 592 .loop: 593 SPLATB_LOAD m0, r0+r1*0-1, m2 594 SPLATB_LOAD m1, r0+r1*1-1, m2 595 mova [r0+r1*0], m0 596 mova [r0+r1*1], m1 597 lea r0, [r0+r1*2] 598 dec r2 599 jg .loop 600 RET 601 %endmacro 602 603 INIT_MMX mmxext 604 PRED8x8_H 605 INIT_MMX ssse3 606 PRED8x8_H 607 608 ;----------------------------------------------------------------------------- 609 ; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, 
ptrdiff_t stride) 610 ;----------------------------------------------------------------------------- 611 INIT_MMX mmxext 612 cglobal pred8x8_top_dc_8, 2,5 613 sub r0, r1 614 movq mm0, [r0] 615 pxor mm1, mm1 616 pxor mm2, mm2 617 lea r2, [r0+r1*2] 618 punpckhbw mm1, mm0 619 punpcklbw mm0, mm2 620 psadbw mm1, mm2 ; s1 621 lea r3, [r2+r1*2] 622 psadbw mm0, mm2 ; s0 623 psrlw mm1, 1 624 psrlw mm0, 1 625 pavgw mm1, mm2 626 lea r4, [r3+r1*2] 627 pavgw mm0, mm2 628 pshufw mm1, mm1, 0 629 pshufw mm0, mm0, 0 ; dc0 (w) 630 packuswb mm0, mm1 ; dc0,dc1 (b) 631 movq [r0+r1*1], mm0 632 movq [r0+r1*2], mm0 633 lea r0, [r3+r1*2] 634 movq [r2+r1*1], mm0 635 movq [r2+r1*2], mm0 636 movq [r3+r1*1], mm0 637 movq [r3+r1*2], mm0 638 movq [r0+r1*1], mm0 639 movq [r0+r1*2], mm0 640 RET 641 642 ;----------------------------------------------------------------------------- 643 ; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride) 644 ;----------------------------------------------------------------------------- 645 646 INIT_MMX mmxext 647 cglobal pred8x8_dc_8, 2,5 648 sub r0, r1 649 pxor m7, m7 650 movd m0, [r0+0] 651 movd m1, [r0+4] 652 psadbw m0, m7 ; s0 653 mov r4, r0 654 psadbw m1, m7 ; s1 655 656 movzx r2d, byte [r0+r1*1-1] 657 movzx r3d, byte [r0+r1*2-1] 658 lea r0, [r0+r1*2] 659 add r2d, r3d 660 movzx r3d, byte [r0+r1*1-1] 661 add r2d, r3d 662 movzx r3d, byte [r0+r1*2-1] 663 add r2d, r3d 664 lea r0, [r0+r1*2] 665 movd m2, r2d ; s2 666 movzx r2d, byte [r0+r1*1-1] 667 movzx r3d, byte [r0+r1*2-1] 668 lea r0, [r0+r1*2] 669 add r2d, r3d 670 movzx r3d, byte [r0+r1*1-1] 671 add r2d, r3d 672 movzx r3d, byte [r0+r1*2-1] 673 add r2d, r3d 674 movd m3, r2d ; s3 675 676 punpcklwd m0, m1 677 mov r0, r4 678 punpcklwd m2, m3 679 punpckldq m0, m2 ; s0, s1, s2, s3 680 pshufw m3, m0, 11110110b ; s2, s1, s3, s3 681 lea r2, [r0+r1*2] 682 pshufw m0, m0, 01110100b ; s0, s1, s3, s1 683 paddw m0, m3 684 lea r3, [r2+r1*2] 685 psrlw m0, 2 686 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 687 lea r4, [r3+r1*2] 
688 packuswb m0, m0 689 punpcklbw m0, m0 690 movq m1, m0 691 punpcklbw m0, m0 692 punpckhbw m1, m1 693 movq [r0+r1*1], m0 694 movq [r0+r1*2], m0 695 movq [r2+r1*1], m0 696 movq [r2+r1*2], m0 697 movq [r3+r1*1], m1 698 movq [r3+r1*2], m1 699 movq [r4+r1*1], m1 700 movq [r4+r1*2], m1 701 RET 702 703 ;----------------------------------------------------------------------------- 704 ; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride) 705 ;----------------------------------------------------------------------------- 706 707 INIT_MMX mmxext 708 cglobal pred8x8_dc_rv40_8, 2,7 709 mov r4, r0 710 sub r0, r1 711 pxor mm0, mm0 712 psadbw mm0, [r0] 713 dec r0 714 movzx r5d, byte [r0+r1*1] 715 movd r6d, mm0 716 lea r0, [r0+r1*2] 717 %rep 3 718 movzx r2d, byte [r0+r1*0] 719 movzx r3d, byte [r0+r1*1] 720 add r5d, r2d 721 add r6d, r3d 722 lea r0, [r0+r1*2] 723 %endrep 724 movzx r2d, byte [r0+r1*0] 725 add r5d, r6d 726 lea r2d, [r2+r5+8] 727 shr r2d, 4 728 movd mm0, r2d 729 punpcklbw mm0, mm0 730 pshufw mm0, mm0, 0 731 mov r3d, 4 732 .loop: 733 movq [r4+r1*0], mm0 734 movq [r4+r1*1], mm0 735 lea r4, [r4+r1*2] 736 dec r3d 737 jg .loop 738 RET 739 740 ;----------------------------------------------------------------------------- 741 ; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride) 742 ;----------------------------------------------------------------------------- 743 744 INIT_XMM sse2 745 cglobal pred8x8_tm_vp8_8, 2,6,4 746 sub r0, r1 747 pxor xmm1, xmm1 748 movq xmm0, [r0] 749 punpcklbw xmm0, xmm1 750 movzx r4d, byte [r0-1] 751 mov r5d, 4 752 .loop: 753 movzx r2d, byte [r0+r1*1-1] 754 movzx r3d, byte [r0+r1*2-1] 755 sub r2d, r4d 756 sub r3d, r4d 757 movd xmm2, r2d 758 movd xmm3, r3d 759 pshuflw xmm2, xmm2, 0 760 pshuflw xmm3, xmm3, 0 761 punpcklqdq xmm2, xmm2 762 punpcklqdq xmm3, xmm3 763 paddw xmm2, xmm0 764 paddw xmm3, xmm0 765 packuswb xmm2, xmm3 766 movq [r0+r1*1], xmm2 767 movhps [r0+r1*2], xmm2 768 lea r0, [r0+r1*2] 769 dec r5d 770 jg .loop 771 RET 772 773 
INIT_XMM ssse3 774 cglobal pred8x8_tm_vp8_8, 2,3,6 775 sub r0, r1 776 movdqa xmm4, [tm_shuf] 777 pxor xmm1, xmm1 778 movq xmm0, [r0] 779 punpcklbw xmm0, xmm1 780 movd xmm5, [r0-4] 781 pshufb xmm5, xmm4 782 mov r2d, 4 783 .loop: 784 movd xmm2, [r0+r1*1-4] 785 movd xmm3, [r0+r1*2-4] 786 pshufb xmm2, xmm4 787 pshufb xmm3, xmm4 788 psubw xmm2, xmm5 789 psubw xmm3, xmm5 790 paddw xmm2, xmm0 791 paddw xmm3, xmm0 792 packuswb xmm2, xmm3 793 movq [r0+r1*1], xmm2 794 movhps [r0+r1*2], xmm2 795 lea r0, [r0+r1*2] 796 dec r2d 797 jg .loop 798 RET 799 800 ; dest, left, right, src, tmp 801 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 802 %macro PRED4x4_LOWPASS 5 803 mova %5, %2 804 pavgb %2, %3 805 pxor %3, %5 806 mova %1, %4 807 pand %3, [pb_1] 808 psubusb %2, %3 809 pavgb %1, %2 810 %endmacro 811 812 ;----------------------------------------------------------------------------- 813 ; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, 814 ; ptrdiff_t stride) 815 ;----------------------------------------------------------------------------- 816 %macro PRED8x8L_TOP_DC 0 817 cglobal pred8x8l_top_dc_8, 4,4 818 sub r0, r3 819 pxor mm7, mm7 820 movq mm0, [r0-8] 821 movq mm3, [r0] 822 movq mm1, [r0+8] 823 movq mm2, mm3 824 movq mm4, mm3 825 PALIGNR mm2, mm0, 7, mm0 826 PALIGNR mm1, mm4, 1, mm4 827 test r1d, r1d ; top_left 828 jz .fix_lt_2 829 test r2d, r2d ; top_right 830 jz .fix_tr_1 831 jmp .body 832 .fix_lt_2: 833 movq mm5, mm3 834 pxor mm5, mm2 835 psllq mm5, 56 836 psrlq mm5, 56 837 pxor mm2, mm5 838 test r2d, r2d ; top_right 839 jnz .body 840 .fix_tr_1: 841 movq mm5, mm3 842 pxor mm5, mm1 843 psrlq mm5, 56 844 psllq mm5, 56 845 pxor mm1, mm5 846 .body: 847 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 848 psadbw mm7, mm0 849 paddw mm7, [pw_4] 850 psrlw mm7, 3 851 pshufw mm7, mm7, 0 852 packuswb mm7, mm7 853 %rep 3 854 movq [r0+r3*1], mm7 855 movq [r0+r3*2], mm7 856 lea r0, [r0+r3*2] 857 %endrep 858 movq [r0+r3*1], mm7 859 movq [r0+r3*2], mm7 860 RET 861 
%endmacro 862 863 INIT_MMX mmxext 864 PRED8x8L_TOP_DC 865 INIT_MMX ssse3 866 PRED8x8L_TOP_DC 867 868 ;----------------------------------------------------------------------------- 869 ; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, 870 ; ptrdiff_t stride) 871 ;----------------------------------------------------------------------------- 872 873 %macro PRED8x8L_DC 0 874 cglobal pred8x8l_dc_8, 4,5 875 sub r0, r3 876 lea r4, [r0+r3*2] 877 movq mm0, [r0+r3*1-8] 878 punpckhbw mm0, [r0+r3*0-8] 879 movq mm1, [r4+r3*1-8] 880 punpckhbw mm1, [r0+r3*2-8] 881 mov r4, r0 882 punpckhwd mm1, mm0 883 lea r0, [r0+r3*4] 884 movq mm2, [r0+r3*1-8] 885 punpckhbw mm2, [r0+r3*0-8] 886 lea r0, [r0+r3*2] 887 movq mm3, [r0+r3*1-8] 888 punpckhbw mm3, [r0+r3*0-8] 889 punpckhwd mm3, mm2 890 punpckhdq mm3, mm1 891 lea r0, [r0+r3*2] 892 movq mm0, [r0+r3*0-8] 893 movq mm1, [r4] 894 mov r0, r4 895 movq mm4, mm3 896 movq mm2, mm3 897 PALIGNR mm4, mm0, 7, mm0 898 PALIGNR mm1, mm2, 1, mm2 899 test r1d, r1d 900 jnz .do_left 901 .fix_lt_1: 902 movq mm5, mm3 903 pxor mm5, mm4 904 psrlq mm5, 56 905 psllq mm5, 48 906 pxor mm1, mm5 907 jmp .do_left 908 .fix_lt_2: 909 movq mm5, mm3 910 pxor mm5, mm2 911 psllq mm5, 56 912 psrlq mm5, 56 913 pxor mm2, mm5 914 test r2d, r2d 915 jnz .body 916 .fix_tr_1: 917 movq mm5, mm3 918 pxor mm5, mm1 919 psrlq mm5, 56 920 psllq mm5, 56 921 pxor mm1, mm5 922 jmp .body 923 .do_left: 924 movq mm0, mm4 925 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 926 movq mm4, mm0 927 movq mm7, mm2 928 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 929 psllq mm1, 56 930 PALIGNR mm7, mm1, 7, mm3 931 movq mm0, [r0-8] 932 movq mm3, [r0] 933 movq mm1, [r0+8] 934 movq mm2, mm3 935 movq mm4, mm3 936 PALIGNR mm2, mm0, 7, mm0 937 PALIGNR mm1, mm4, 1, mm4 938 test r1d, r1d 939 jz .fix_lt_2 940 test r2d, r2d 941 jz .fix_tr_1 942 .body: 943 lea r1, [r0+r3*2] 944 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 945 pxor mm0, mm0 946 pxor mm1, mm1 947 lea r2, [r1+r3*2] 948 psadbw mm0, mm7 949 psadbw 
mm1, mm6 950 paddw mm0, [pw_8] 951 paddw mm0, mm1 952 lea r4, [r2+r3*2] 953 psrlw mm0, 4 954 pshufw mm0, mm0, 0 955 packuswb mm0, mm0 956 movq [r0+r3*1], mm0 957 movq [r0+r3*2], mm0 958 movq [r1+r3*1], mm0 959 movq [r1+r3*2], mm0 960 movq [r2+r3*1], mm0 961 movq [r2+r3*2], mm0 962 movq [r4+r3*1], mm0 963 movq [r4+r3*2], mm0 964 RET 965 %endmacro 966 967 INIT_MMX mmxext 968 PRED8x8L_DC 969 INIT_MMX ssse3 970 PRED8x8L_DC 971 972 ;----------------------------------------------------------------------------- 973 ; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft, 974 ; int has_topright, ptrdiff_t stride) 975 ;----------------------------------------------------------------------------- 976 977 %macro PRED8x8L_HORIZONTAL 0 978 cglobal pred8x8l_horizontal_8, 4,4 979 sub r0, r3 980 lea r2, [r0+r3*2] 981 movq mm0, [r0+r3*1-8] 982 test r1d, r1d 983 lea r1, [r0+r3] 984 cmovnz r1, r0 985 punpckhbw mm0, [r1+r3*0-8] 986 movq mm1, [r2+r3*1-8] 987 punpckhbw mm1, [r0+r3*2-8] 988 mov r2, r0 989 punpckhwd mm1, mm0 990 lea r0, [r0+r3*4] 991 movq mm2, [r0+r3*1-8] 992 punpckhbw mm2, [r0+r3*0-8] 993 lea r0, [r0+r3*2] 994 movq mm3, [r0+r3*1-8] 995 punpckhbw mm3, [r0+r3*0-8] 996 punpckhwd mm3, mm2 997 punpckhdq mm3, mm1 998 lea r0, [r0+r3*2] 999 movq mm0, [r0+r3*0-8] 1000 movq mm1, [r1+r3*0-8] 1001 mov r0, r2 1002 movq mm4, mm3 1003 movq mm2, mm3 1004 PALIGNR mm4, mm0, 7, mm0 1005 PALIGNR mm1, mm2, 1, mm2 1006 movq mm0, mm4 1007 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1008 movq mm4, mm0 1009 movq mm7, mm2 1010 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1011 psllq mm1, 56 1012 PALIGNR mm7, mm1, 7, mm3 1013 movq mm3, mm7 1014 lea r1, [r0+r3*2] 1015 movq mm7, mm3 1016 punpckhbw mm3, mm3 1017 punpcklbw mm7, mm7 1018 pshufw mm0, mm3, 0xff 1019 pshufw mm1, mm3, 0xaa 1020 lea r2, [r1+r3*2] 1021 pshufw mm2, mm3, 0x55 1022 pshufw mm3, mm3, 0x00 1023 pshufw mm4, mm7, 0xff 1024 pshufw mm5, mm7, 0xaa 1025 pshufw mm6, mm7, 0x55 1026 pshufw mm7, mm7, 0x00 1027 movq [r0+r3*1], mm0 1028 movq 
[r0+r3*2], mm1 1029 movq [r1+r3*1], mm2 1030 movq [r1+r3*2], mm3 1031 movq [r2+r3*1], mm4 1032 movq [r2+r3*2], mm5 1033 lea r0, [r2+r3*2] 1034 movq [r0+r3*1], mm6 1035 movq [r0+r3*2], mm7 1036 RET 1037 %endmacro 1038 1039 INIT_MMX mmxext 1040 PRED8x8L_HORIZONTAL 1041 INIT_MMX ssse3 1042 PRED8x8L_HORIZONTAL 1043 1044 ;----------------------------------------------------------------------------- 1045 ; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, 1046 ; ptrdiff_t stride) 1047 ;----------------------------------------------------------------------------- 1048 1049 %macro PRED8x8L_VERTICAL 0 1050 cglobal pred8x8l_vertical_8, 4,4 1051 sub r0, r3 1052 movq mm0, [r0-8] 1053 movq mm3, [r0] 1054 movq mm1, [r0+8] 1055 movq mm2, mm3 1056 movq mm4, mm3 1057 PALIGNR mm2, mm0, 7, mm0 1058 PALIGNR mm1, mm4, 1, mm4 1059 test r1d, r1d ; top_left 1060 jz .fix_lt_2 1061 test r2d, r2d ; top_right 1062 jz .fix_tr_1 1063 jmp .body 1064 .fix_lt_2: 1065 movq mm5, mm3 1066 pxor mm5, mm2 1067 psllq mm5, 56 1068 psrlq mm5, 56 1069 pxor mm2, mm5 1070 test r2d, r2d ; top_right 1071 jnz .body 1072 .fix_tr_1: 1073 movq mm5, mm3 1074 pxor mm5, mm1 1075 psrlq mm5, 56 1076 psllq mm5, 56 1077 pxor mm1, mm5 1078 .body: 1079 PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 1080 %rep 3 1081 movq [r0+r3*1], mm0 1082 movq [r0+r3*2], mm0 1083 lea r0, [r0+r3*2] 1084 %endrep 1085 movq [r0+r3*1], mm0 1086 movq [r0+r3*2], mm0 1087 RET 1088 %endmacro 1089 1090 INIT_MMX mmxext 1091 PRED8x8L_VERTICAL 1092 INIT_MMX ssse3 1093 PRED8x8L_VERTICAL 1094 1095 ;----------------------------------------------------------------------------- 1096 ; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft, 1097 ; int has_topright, ptrdiff_t stride) 1098 ;----------------------------------------------------------------------------- 1099 1100 %macro PRED8x8L_DOWN_LEFT 0 1101 cglobal pred8x8l_down_left_8, 4,4 1102 sub r0, r3 1103 movq mm0, [r0-8] 1104 movq mm3, [r0] 1105 movq mm1, [r0+8] 1106 movq mm2, 
mm3 1107 movq mm4, mm3 1108 PALIGNR mm2, mm0, 7, mm0 1109 PALIGNR mm1, mm4, 1, mm4 1110 test r1d, r1d ; top_left 1111 jz .fix_lt_2 1112 test r2d, r2d ; top_right 1113 jz .fix_tr_1 1114 jmp .do_top 1115 .fix_lt_2: 1116 movq mm5, mm3 1117 pxor mm5, mm2 1118 psllq mm5, 56 1119 psrlq mm5, 56 1120 pxor mm2, mm5 1121 test r2d, r2d ; top_right 1122 jnz .do_top 1123 .fix_tr_1: 1124 movq mm5, mm3 1125 pxor mm5, mm1 1126 psrlq mm5, 56 1127 psllq mm5, 56 1128 pxor mm1, mm5 1129 jmp .do_top 1130 .fix_tr_2: 1131 punpckhbw mm3, mm3 1132 pshufw mm1, mm3, 0xFF 1133 jmp .do_topright 1134 .do_top: 1135 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1136 movq2dq xmm3, mm4 1137 test r2d, r2d ; top_right 1138 jz .fix_tr_2 1139 movq mm0, [r0+8] 1140 movq mm5, mm0 1141 movq mm2, mm0 1142 movq mm4, mm0 1143 psrlq mm5, 56 1144 PALIGNR mm2, mm3, 7, mm3 1145 PALIGNR mm5, mm4, 1, mm4 1146 PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 1147 .do_topright: 1148 movq2dq xmm4, mm1 1149 psrlq mm1, 56 1150 movq2dq xmm5, mm1 1151 lea r1, [r0+r3*2] 1152 pslldq xmm4, 8 1153 por xmm3, xmm4 1154 movdqa xmm2, xmm3 1155 psrldq xmm2, 1 1156 pslldq xmm5, 15 1157 por xmm2, xmm5 1158 lea r2, [r1+r3*2] 1159 movdqa xmm1, xmm3 1160 pslldq xmm1, 1 1161 INIT_XMM cpuname 1162 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 1163 psrldq xmm0, 1 1164 movq [r0+r3*1], xmm0 1165 psrldq xmm0, 1 1166 movq [r0+r3*2], xmm0 1167 psrldq xmm0, 1 1168 lea r0, [r2+r3*2] 1169 movq [r1+r3*1], xmm0 1170 psrldq xmm0, 1 1171 movq [r1+r3*2], xmm0 1172 psrldq xmm0, 1 1173 movq [r2+r3*1], xmm0 1174 psrldq xmm0, 1 1175 movq [r2+r3*2], xmm0 1176 psrldq xmm0, 1 1177 movq [r0+r3*1], xmm0 1178 psrldq xmm0, 1 1179 movq [r0+r3*2], xmm0 1180 RET 1181 %endmacro 1182 1183 INIT_MMX sse2 1184 PRED8x8L_DOWN_LEFT 1185 INIT_MMX ssse3 1186 PRED8x8L_DOWN_LEFT 1187 1188 ;----------------------------------------------------------------------------- 1189 ; void ff_pred8x8l_down_right_8(uint8_t *src, int has_topleft, 1190 ; int has_topright, ptrdiff_t stride) 1191 
;----------------------------------------------------------------------------- 1192 1193 %macro PRED8x8L_DOWN_RIGHT 0 1194 cglobal pred8x8l_down_right_8, 4,5 1195 sub r0, r3 1196 lea r4, [r0+r3*2] 1197 movq mm0, [r0+r3*1-8] 1198 punpckhbw mm0, [r0+r3*0-8] 1199 movq mm1, [r4+r3*1-8] 1200 punpckhbw mm1, [r0+r3*2-8] 1201 mov r4, r0 1202 punpckhwd mm1, mm0 1203 lea r0, [r0+r3*4] 1204 movq mm2, [r0+r3*1-8] 1205 punpckhbw mm2, [r0+r3*0-8] 1206 lea r0, [r0+r3*2] 1207 movq mm3, [r0+r3*1-8] 1208 punpckhbw mm3, [r0+r3*0-8] 1209 punpckhwd mm3, mm2 1210 punpckhdq mm3, mm1 1211 lea r0, [r0+r3*2] 1212 movq mm0, [r0+r3*0-8] 1213 movq mm1, [r4] 1214 mov r0, r4 1215 movq mm4, mm3 1216 movq mm2, mm3 1217 PALIGNR mm4, mm0, 7, mm0 1218 PALIGNR mm1, mm2, 1, mm2 1219 test r1d, r1d 1220 jz .fix_lt_1 1221 jmp .do_left 1222 .fix_lt_1: 1223 movq mm5, mm3 1224 pxor mm5, mm4 1225 psrlq mm5, 56 1226 psllq mm5, 48 1227 pxor mm1, mm5 1228 jmp .do_left 1229 .fix_lt_2: 1230 movq mm5, mm3 1231 pxor mm5, mm2 1232 psllq mm5, 56 1233 psrlq mm5, 56 1234 pxor mm2, mm5 1235 test r2d, r2d 1236 jnz .do_top 1237 .fix_tr_1: 1238 movq mm5, mm3 1239 pxor mm5, mm1 1240 psrlq mm5, 56 1241 psllq mm5, 56 1242 pxor mm1, mm5 1243 jmp .do_top 1244 .do_left: 1245 movq mm0, mm4 1246 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1247 movq mm4, mm0 1248 movq mm7, mm2 1249 movq2dq xmm3, mm2 1250 PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 1251 psllq mm1, 56 1252 PALIGNR mm7, mm1, 7, mm3 1253 movq2dq xmm1, mm7 1254 movq mm0, [r0-8] 1255 movq mm3, [r0] 1256 movq mm1, [r0+8] 1257 movq mm2, mm3 1258 movq mm4, mm3 1259 PALIGNR mm2, mm0, 7, mm0 1260 PALIGNR mm1, mm4, 1, mm4 1261 test r1d, r1d 1262 jz .fix_lt_2 1263 test r2d, r2d 1264 jz .fix_tr_1 1265 .do_top: 1266 PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 1267 movq2dq xmm4, mm4 1268 lea r1, [r0+r3*2] 1269 movdqa xmm0, xmm3 1270 pslldq xmm4, 8 1271 por xmm3, xmm4 1272 lea r2, [r1+r3*2] 1273 pslldq xmm4, 1 1274 por xmm1, xmm4 1275 psrldq xmm0, 7 1276 pslldq xmm0, 15 1277 psrldq xmm0, 7 1278 
por xmm1, xmm0 1279 lea r0, [r2+r3*2] 1280 movdqa xmm2, xmm3 1281 psrldq xmm2, 1 1282 INIT_XMM cpuname 1283 PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 1284 movdqa xmm1, xmm0 1285 psrldq xmm1, 1 1286 movq [r0+r3*2], xmm0 1287 movq [r0+r3*1], xmm1 1288 psrldq xmm0, 2 1289 psrldq xmm1, 2 1290 movq [r2+r3*2], xmm0 1291 movq [r2+r3*1], xmm1 1292 psrldq xmm0, 2 1293 psrldq xmm1, 2 1294 movq [r1+r3*2], xmm0 1295 movq [r1+r3*1], xmm1 1296 psrldq xmm0, 2 1297 psrldq xmm1, 2 1298 movq [r4+r3*2], xmm0 1299 movq [r4+r3*1], xmm1 1300 RET 1301 %endmacro 1302 1303 INIT_MMX sse2 1304 PRED8x8L_DOWN_RIGHT 1305 INIT_MMX ssse3 1306 PRED8x8L_DOWN_RIGHT 1307 1308 ;----------------------------------------------------------------------------- 1309 ; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, 1310 ; int has_topright, ptrdiff_t stride) 1311 ;----------------------------------------------------------------------------- 1312 1313 %macro PRED8x8L_VERTICAL_RIGHT 0 1314 cglobal pred8x8l_vertical_right_8, 4,5,6 1315 sub r0, r3 1316 lea r4, [r0+r3*2] 1317 movq mm0, [r0+r3*1-8] 1318 punpckhbw mm0, [r0+r3*0-8] 1319 movq mm1, [r4+r3*1-8] 1320 punpckhbw mm1, [r0+r3*2-8] 1321 mov r4, r0 1322 punpckhwd mm1, mm0 1323 lea r0, [r0+r3*4] 1324 movq mm2, [r0+r3*1-8] 1325 punpckhbw mm2, [r0+r3*0-8] 1326 lea r0, [r0+r3*2] 1327 movq mm3, [r0+r3*1-8] 1328 punpckhbw mm3, [r0+r3*0-8] 1329 punpckhwd mm3, mm2 1330 punpckhdq mm3, mm1 1331 lea r0, [r0+r3*2] 1332 movq mm0, [r0+r3*0-8] 1333 movq mm1, [r4] 1334 mov r0, r4 1335 movq mm4, mm3 1336 movq mm2, mm3 1337 PALIGNR mm4, mm0, 7, mm0 1338 PALIGNR mm1, mm2, 1, mm2 1339 test r1d, r1d 1340 jnz .do_left 1341 .fix_lt_1: 1342 movq mm5, mm3 1343 pxor mm5, mm4 1344 psrlq mm5, 56 1345 psllq mm5, 48 1346 pxor mm1, mm5 1347 jmp .do_left 1348 .fix_lt_2: 1349 movq mm5, mm3 1350 pxor mm5, mm2 1351 psllq mm5, 56 1352 psrlq mm5, 56 1353 pxor mm2, mm5 1354 test r2d, r2d 1355 jnz .do_top 1356 .fix_tr_1: 1357 movq mm5, mm3 1358 pxor mm5, mm1 1359 psrlq mm5, 56 
1360 psllq mm5, 56 1361 pxor mm1, mm5 1362 jmp .do_top 1363 .do_left: 1364 movq mm0, mm4 1365 PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 1366 movq2dq xmm0, mm2 1367 movq mm0, [r0-8] 1368 movq mm3, [r0] 1369 movq mm1, [r0+8] 1370 movq mm2, mm3 1371 movq mm4, mm3 1372 PALIGNR mm2, mm0, 7, mm0 1373 PALIGNR mm1, mm4, 1, mm4 1374 test r1d, r1d 1375 jz .fix_lt_2 1376 test r2d, r2d 1377 jz .fix_tr_1 1378 .do_top: 1379 PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 1380 lea r1, [r0+r3*2] 1381 movq2dq xmm4, mm6 1382 pslldq xmm4, 8 1383 por xmm0, xmm4 1384 movdqa xmm1, xmm0 1385 lea r2, [r1+r3*2] 1386 movdqa xmm2, xmm0 1387 movdqa xmm3, xmm0 1388 pslldq xmm0, 1 1389 pslldq xmm1, 2 1390 pavgb xmm2, xmm0 1391 INIT_XMM cpuname 1392 PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5 1393 movdqa xmm0, [pw_ff00] 1394 pandn xmm0, xmm4 1395 movdqa xmm5, xmm4 1396 psrlw xmm4, 8 1397 packuswb xmm0, xmm4 1398 movhlps xmm4, xmm0 1399 movhps [r0+r3*2], xmm5 1400 movhps [r0+r3*1], xmm2 1401 psrldq xmm5, 4 1402 movss xmm5, xmm0 1403 psrldq xmm2, 4 1404 movss xmm2, xmm4 1405 lea r0, [r2+r3*2] 1406 psrldq xmm5, 1 1407 psrldq xmm2, 1 1408 movq [r0+r3*2], xmm5 1409 movq [r0+r3*1], xmm2 1410 psrldq xmm5, 1 1411 psrldq xmm2, 1 1412 movq [r2+r3*2], xmm5 1413 movq [r2+r3*1], xmm2 1414 psrldq xmm5, 1 1415 psrldq xmm2, 1 1416 movq [r1+r3*2], xmm5 1417 movq [r1+r3*1], xmm2 1418 RET 1419 %endmacro 1420 1421 INIT_MMX sse2 1422 PRED8x8L_VERTICAL_RIGHT 1423 INIT_MMX ssse3 1424 PRED8x8L_VERTICAL_RIGHT 1425 1426 ;----------------------------------------------------------------------------- 1427 ; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, 1428 ; int has_topright, ptrdiff_t stride) 1429 ;----------------------------------------------------------------------------- 1430 1431 %macro PRED8x8L_VERTICAL_LEFT 0 1432 cglobal pred8x8l_vertical_left_8, 4,4 1433 sub r0, r3 1434 movq mm0, [r0-8] 1435 movq mm3, [r0] 1436 movq mm1, [r0+8] 1437 movq mm2, mm3 1438 movq mm4, mm3 1439 PALIGNR mm2, mm0, 7, mm0 1440 
    PALIGNR     mm1, mm4, 1, mm4    ; mm1 = t1..t8
    test        r1d, r1d            ; has_topleft?
    jz .fix_lt_2
    test        r2d, r2d            ; has_topright?
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    ; Top-left unavailable: patch the low byte of mm2 with the edge byte of
    ; mm3 (XOR-isolate, mask, XOR back — avoids a reload).
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d
    jnz .do_top
.fix_tr_1:
    ; Top-right unavailable: patch the high byte of mm1 from mm3.
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; No top-right at all: replicate t7 across the whole topright qword.
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    ; Lowpass the top edge into xmm4's low half.
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm4, mm4
    test        r2d, r2d
    jz .fix_tr_2
    ; Lowpass the real top-right samples.
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    ; Combine filtered top (low 8) + topright (high 8) in xmm4, then derive
    ; the two interleaved predictor rows: pavgb (even) and lowpass (odd).
    movq2dq    xmm3, mm1
    lea          r1, [r0+r3*2]
    pslldq     xmm3, 8
    por        xmm4, xmm3
    movdqa     xmm2, xmm4
    movdqa     xmm1, xmm4
    movdqa     xmm3, xmm4
    psrldq     xmm2, 1
    pslldq     xmm1, 1
    pavgb      xmm3, xmm2
    lea          r2, [r1+r3*2]
INIT_XMM cpuname
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
    psrldq     xmm0, 1
    ; Write 8 rows; each pair of rows shifts both predictors right one byte.
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    lea          r0, [r2+r3*2]
    psrldq     xmm3, 1
    psrldq     xmm0, 1
    movq [r1+r3*1], xmm3
    movq [r1+r3*2], xmm0
    psrldq     xmm3, 1
    psrldq     xmm0, 1
    movq [r2+r3*1], xmm3
    movq [r2+r3*2], xmm0
    psrldq     xmm3, 1
    psrldq     xmm0, 1
    movq [r0+r3*1], xmm3
    movq [r0+r3*2], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_VERTICAL_LEFT
INIT_MMX ssse3
PRED8x8L_VERTICAL_LEFT

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
;                                  int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------

; r0 = src, r1d = has_topleft, r2d = has_topright, r3 = stride.
; Pure MMX implementation; writes rows from left-edge samples only.
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_8, 4,4
    sub          r0, r3             ; r0 -> top edge row
    ; Gather the left-edge column into mm3 via unpack transpose.  When
    ; has_topleft is zero, r1 is pointed one row down so the first sample is
    ; duplicated instead of reading the (unavailable) top-left row.
    lea          r2, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    test        r1d, r1d
    lea          r1, [r0+r3]
    cmovnz       r1, r0             ; r1 = has_topleft ? top row : row 1
    punpckhbw   mm0, [r1+r3*0-8]
    movq        mm1, [r2+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r2, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1            ; mm3 = left edge, 8 samples
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r1+r3*0-8]
    mov          r0, r2             ; restore r0
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    lea          r1, [r0+r3*2]
    ; Reverse the byte order of the filtered left edge (word swap + shuffle).
    pshufw      mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq       mm7, 56             ; l7 .. .. .. .. .. .. ..
    movq        mm2, mm0
    psllw       mm0, 8
    psrlw       mm2, 8
    por         mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
    movq        mm3, mm2
    movq        mm4, mm2
    movq        mm5, mm2
    psrlq       mm2, 8
    psrlq       mm3, 16
    lea          r2, [r1+r3*2]
    por         mm2, mm7            ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw   mm7, mm7
    por         mm3, mm7            ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb       mm4, mm2            ; average pairs of neighbours
    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
    ; Interleave averages and lowpass values; the 8 output rows are sliding
    ; 8-byte windows over the interleaved sequence, with the last-sample
    ; replication handled by the pshufw variants.
    movq        mm5, mm4
    punpcklbw   mm4, mm1            ; p4 p3 p2 p1
    punpckhbw   mm5, mm1            ; p8 p7 p6 p5
    movq        mm6, mm5
    movq        mm7, mm5
    movq        mm0, mm5
    PALIGNR     mm5, mm4, 2, mm1
    pshufw      mm1, mm6, 11111001b
    PALIGNR     mm6, mm4, 4, mm2
    pshufw      mm2, mm7, 11111110b
    PALIGNR     mm7, mm4, 6, mm3
    pshufw      mm3, mm0, 11111111b
    movq [r0+r3*1], mm4
    movq [r0+r3*2], mm5
    lea          r0, [r2+r3*2]
    movq [r1+r3*1], mm6
    movq [r1+r3*2], mm7
    movq [r2+r3*1], mm0
    movq [r2+r3*2], mm1
    movq [r0+r3*1], mm2
    movq [r0+r3*2], mm3
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_HORIZONTAL_UP
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_UP

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
;                                    int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------

; r0 = src, r1d = has_topleft, r2d = has_topright, r3 = stride.
; MMX edge gathering + fixups, then INIT_XMM for the 16-byte diagonal stage.
%macro PRED8x8L_HORIZONTAL_DOWN 0
cglobal pred8x8l_horizontal_down_8, 4,5
    sub          r0, r3             ; r0 -> top edge row
    ; Gather the left-edge column into mm3 via unpack transpose of the high
    ; bytes of each row's preceding qword.
    lea          r4, [r0+r3*2]
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1            ; mm3 = left edge, 8 samples
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]           ; top row
    mov          r0, r4             ; restore r0
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1d, r1d            ; has_topleft?
    jnz .do_left
.fix_lt_1:
    ; Top-left unavailable: splice the edge byte into mm1 (XOR/mask/XOR).
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    ; Patch the low byte of mm2 when top-left is missing.
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5
    test        r2d, r2d            ; has_topright?
    jnz .do_top
.fix_tr_1:
    ; Patch the high byte of mm1 when top-right is missing.
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5
    jmp .do_top
.fix_tr_2:
    ; No top-right: replicate the last top byte across the topright qword.
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_left:
    ; Filter left edge + top-left corner into xmm0 (left in high half,
    ; corner byte at position 7 via the 15/8 shift pair), then load top edge.
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq    xmm0, mm2
    pslldq     xmm0, 8
    movq        mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq    xmm2, mm1
    pslldq     xmm2, 15
    psrldq     xmm2, 8
    por        xmm0, xmm2
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1d, r1d
    jz .fix_lt_2
    test        r2d, r2d
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm1, mm4
    test        r2d, r2d
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    ; Full 16-byte edge assembled across xmm0/xmm1; build the interleaved
    ; avg/lowpass predictors and emit rows as sliding 2-byte-step windows.
    movq2dq    xmm5, mm1
    pslldq     xmm5, 8
    por        xmm1, xmm5
INIT_XMM cpuname
    lea          r2, [r4+r3*2]
    movdqa     xmm2, xmm1
    movdqa     xmm3, xmm1
    PALIGNR    xmm1, xmm0, 7, xmm4
    PALIGNR    xmm2, xmm0, 9, xmm5
    lea          r1, [r2+r3*2]
    PALIGNR    xmm3, xmm0, 8, xmm0
    movdqa     xmm4, xmm1
    pavgb      xmm4, xmm3
    lea          r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
    punpcklbw  xmm4, xmm0
    movhlps    xmm0, xmm4
    movq [r0+r3*2], xmm4
    movq [r2+r3*2], xmm0
    psrldq     xmm4, 2
    psrldq     xmm0, 2
    movq [r0+r3*1], xmm4
    movq [r2+r3*1], xmm0
    psrldq     xmm4, 2
    psrldq     xmm0, 2
    movq [r1+r3*2], xmm4
    movq [r4+r3*2], xmm0
    psrldq     xmm4, 2
    psrldq     xmm0, 2
    movq [r1+r3*1], xmm4
    movq [r4+r3*1], xmm0
    RET
%endmacro

INIT_MMX sse2
PRED8x8L_HORIZONTAL_DOWN
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_DOWN

;-------------------------------------------------------------------------------
; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright,
;                             ptrdiff_t stride)
;-------------------------------------------------------------------------------

; r0 = src, r1 = topright (unused here), r2 = stride.
; DC prediction: average of 4 top samples (via psadbw) + 4 left samples,
; rounded, then broadcast to all 16 pixels via imul 0x01010101.
INIT_MMX mmxext
cglobal pred4x4_dc_8, 3,5
    pxor        mm7, mm7
    mov          r4, r0             ; keep src for the first row store
    sub          r0, r2             ; r0 -> top edge
    movd        mm0, [r0]
    psadbw      mm0, mm7            ; sum of the 4 top bytes
    movzx       r1d, byte [r0+r2*1-1]
    movd        r3d, mm0
    add         r3d, r1d
    movzx       r1d, byte [r0+r2*2-1]
    lea          r0, [r0+r2*2]
    add         r3d, r1d
    movzx       r1d, byte [r0+r2*1-1]
    add         r3d, r1d
    movzx       r1d, byte [r0+r2*2-1]
    add         r3d, r1d
    add         r3d, 4              ; rounding
    shr         r3d, 3              ; (sum+4)>>3
    imul        r3d, 0x01010101     ; replicate DC into all 4 bytes
    mov   [r4+r2*0], r3d
    mov   [r0+r2*0], r3d
    mov   [r0+r2*1], r3d
    mov   [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                 ptrdiff_t stride)
;-----------------------------------------------------------------------------

; VP8 TrueMotion: pred[x][y] = clamp(top[x] + left[y] - topleft).
; r0 = src, r2 = stride; processes two rows per loop iteration.
INIT_MMX mmxext
cglobal pred4x4_tm_vp8_8, 3,6
    sub          r0, r2             ; r0 -> top edge
    pxor        mm7, mm7
    movd        mm0, [r0]
    punpcklbw   mm0, mm7            ; top samples widened to words
    movzx       r4d, byte [r0-1]    ; topleft
    mov         r5d, 2              ; 2 iterations x 2 rows
.loop:
    movzx       r1d, byte [r0+r2*1-1]   ; left sample, row n
    movzx       r3d, byte [r0+r2*2-1]   ; left sample, row n+1
    sub         r1d, r4d            ; left - topleft
    sub         r3d, r4d
    movd        mm2, r1d
    movd        mm4, r3d
    pshufw      mm2, mm2, 0         ; broadcast difference
    pshufw      mm4, mm4, 0
    paddw       mm2, mm0            ; + top
    paddw       mm4, mm0
    packuswb    mm2, mm2            ; saturate to [0,255]
    packuswb    mm4, mm4
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm4
    lea          r0, [r0+r2*2]
    dec         r5d
    jg .loop
    RET

; SSSE3 variant: pshufb with tm_shuf widens+broadcasts left/topleft bytes to
; words in one step, so all 4 rows are computed without a loop.
; NOTE: runs under INIT_XMM so mm* here map to xmm registers.
INIT_XMM ssse3
cglobal pred4x4_tm_vp8_8, 3,3
    sub          r0, r2
    movq        mm6, [tm_shuf]
    pxor        mm1, mm1
    movd        mm0, [r0]
    punpcklbw   mm0, mm1            ; top samples as words
    movd        mm7, [r0-4]
    pshufb      mm7, mm6            ; topleft broadcast
    lea          r1, [r0+r2*2]
    movd        mm2, [r0+r2*1-4]
    movd        mm3, [r0+r2*2-4]
    movd        mm4, [r1+r2*1-4]
    movd        mm5, [r1+r2*2-4]
    pshufb      mm2, mm6            ; left samples broadcast per row
    pshufb      mm3, mm6
    pshufb      mm4, mm6
    pshufb      mm5, mm6
    psubw       mm0, mm7            ; top - topleft
    paddw       mm2, mm0
    paddw       mm3, mm0
    paddw       mm4, mm0
    paddw       mm5, mm0
    packuswb    mm2, mm2
    packuswb    mm3, mm3
    packuswb    mm4, mm4
    packuswb    mm5, mm5
    movd [r0+r2*1], mm2
    movd [r0+r2*2], mm3
    movd [r1+r2*1], mm4
    movd [r1+r2*2], mm5
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                       ptrdiff_t stride)
;----------------------------------------------------------------------------- 1839 1840 INIT_MMX mmxext 1841 cglobal pred4x4_vertical_vp8_8, 3,3 1842 sub r0, r2 1843 movd m1, [r0-1] 1844 movd m0, [r0] 1845 mova m2, m0 ;t0 t1 t2 t3 1846 punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7 1847 lea r1, [r0+r2*2] 1848 psrlq m0, 8 ;t1 t2 t3 t4 1849 PRED4x4_LOWPASS m3, m1, m0, m2, m4 1850 movd [r0+r2*1], m3 1851 movd [r0+r2*2], m3 1852 movd [r1+r2*1], m3 1853 movd [r1+r2*2], m3 1854 RET 1855 1856 ;----------------------------------------------------------------------------- 1857 ; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, 1858 ; ptrdiff_t stride) 1859 ;----------------------------------------------------------------------------- 1860 INIT_MMX mmxext 1861 cglobal pred4x4_down_left_8, 3,3 1862 sub r0, r2 1863 movq m1, [r0] 1864 punpckldq m1, [r1] 1865 movq m2, m1 1866 movq m3, m1 1867 psllq m1, 8 1868 pxor m2, m1 1869 psrlq m2, 8 1870 pxor m2, m3 1871 PRED4x4_LOWPASS m0, m1, m2, m3, m4 1872 lea r1, [r0+r2*2] 1873 psrlq m0, 8 1874 movd [r0+r2*1], m0 1875 psrlq m0, 8 1876 movd [r0+r2*2], m0 1877 psrlq m0, 8 1878 movd [r1+r2*1], m0 1879 psrlq m0, 8 1880 movd [r1+r2*2], m0 1881 RET 1882 1883 ;------------------------------------------------------------------------------ 1884 ; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, 1885 ; ptrdiff_t stride) 1886 ;------------------------------------------------------------------------------ 1887 1888 INIT_MMX mmxext 1889 cglobal pred4x4_vertical_left_8, 3,3 1890 sub r0, r2 1891 movq m1, [r0] 1892 punpckldq m1, [r1] 1893 movq m3, m1 1894 movq m2, m1 1895 psrlq m3, 8 1896 psrlq m2, 16 1897 movq m4, m3 1898 pavgb m4, m1 1899 PRED4x4_LOWPASS m0, m1, m2, m3, m5 1900 lea r1, [r0+r2*2] 1901 movh [r0+r2*1], m4 1902 movh [r0+r2*2], m0 1903 psrlq m4, 8 1904 psrlq m0, 8 1905 movh [r1+r2*1], m4 1906 movh [r1+r2*2], m0 1907 RET 1908 1909 
;------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                        ptrdiff_t stride)
;------------------------------------------------------------------------------

; r0 = src, r2 = stride.  Horizontal-up: built purely from the left edge;
; the last left sample is replicated past the end, rows interleave the
; 2-tap averages with the 3-tap lowpass values.
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_8, 3,3
    sub          r0, r2             ; r0 -> top edge row
    lea          r1, [r0+r2*2]
    ; Transpose the 4 left samples into the high dword of m0.
    movd         m0, [r0+r2*1-4]
    punpcklbw    m0, [r0+r2*2-4]
    movd         m1, [r1+r2*1-4]
    punpcklbw    m1, [r1+r2*2-4]
    punpckhwd    m0, m1
    movq         m1, m0
    punpckhbw    m1, m1
    pshufw       m1, m1, 0xFF       ; broadcast l3 (replication past the edge)
    punpckhdq    m0, m1             ; l0 l1 l2 l3 l3 l3 l3 l3
    movq         m2, m0
    movq         m3, m0
    movq         m7, m0
    psrlq        m2, 16
    psrlq        m3, 8
    pavgb        m7, m3             ; pairwise averages
    PRED4x4_LOWPASS m4, m0, m2, m3, m5
    punpcklbw    m7, m4             ; interleave avg/lowpass
    movd [r0+r2*1], m7
    psrlq        m7, 16             ; slide window 2 bytes per row
    movd [r0+r2*2], m7
    psrlq        m7, 16
    movd [r1+r2*1], m7
    movd [r1+r2*2], m1              ; last row: all l3
    RET

;------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
;                                          const uint8_t *topright,
;                                          ptrdiff_t stride)
;------------------------------------------------------------------------------

; r0 = src, r2 = stride.  Horizontal-down: combines left edge, top-left and
; top edge; rows are sliding windows over interleaved avg/lowpass samples.
INIT_MMX mmxext
cglobal pred4x4_horizontal_down_8, 3,3
    sub          r0, r2             ; r0 -> top edge row
    lea          r1, [r0+r2*2]
    movh         m0, [r0-4]      ; lt ..
    punpckldq    m0, [r0]        ; t3 t2 t1 t0 lt .. .. ..
    psllq        m0, 8           ; t2 t1 t0 lt .. .. .. ..
    movd         m1, [r1+r2*2-4] ; l3
    punpcklbw    m1, [r1+r2*1-4] ; l2 l3
    movd         m2, [r0+r2*2-4] ; l1
    punpcklbw    m2, [r0+r2*1-4] ; l0 l1
    punpckhwd    m1, m2          ; l0 l1 l2 l3
    punpckhdq    m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    movq         m0, m1
    movq         m2, m1
    movq         m5, m1
    psrlq        m0, 16          ; .. .. t2 t1 t0 lt l0 l1
    psrlq        m2, 8           ; .. t2 t1 t0 lt l0 l1 l2
    pavgb        m5, m2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    punpcklbw    m5, m3          ; interleave avg/lowpass pairs
    psrlq        m3, 32
    PALIGNR      m3, m5, 6, m4   ; top row needs the filtered top samples
    movh [r1+r2*2], m5
    psrlq        m5, 16
    movh [r1+r2*1], m5
    psrlq        m5, 16
    movh [r0+r2*2], m5
    movh [r0+r2*1], m3
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
;                                         const uint8_t *topright,
;                                         ptrdiff_t stride)
;-----------------------------------------------------------------------------

; r0 = src, r2 = stride.  Vertical-right: even rows from pavgb of the top
; edge, odd rows from the lowpass of top+left; lower rows prepend left
; samples by shifting them in from m1 via PALIGNR.
INIT_MMX mmxext
cglobal pred4x4_vertical_right_8, 3,3
    sub          r0, r2             ; r0 -> top edge row
    lea          r1, [r0+r2*2]
    movh         m0, [r0]                    ; ........t3t2t1t0
    movq         m5, m0
    PALIGNR      m0, [r0-8], 7, m1           ; ......t3t2t1t0lt
    pavgb        m5, m0                      ; rows 0,2 predictor
    PALIGNR      m0, [r0+r2*1-8], 7, m1      ; ....t3t2t1t0ltl0
    movq         m1, m0
    PALIGNR      m0, [r0+r2*2-8], 7, m2      ; ..t3t2t1t0ltl0l1
    movq         m2, m0
    PALIGNR      m0, [r1+r2*1-8], 7, m3      ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4       ; rows 1,3 predictor
    movq         m1, m3
    psrlq        m3, 16
    psllq        m1, 48
    movh [r0+r2*1], m5
    movh [r0+r2*2], m3
    PALIGNR      m5, m1, 7, m2               ; shift in filtered left sample
    psllq        m1, 8
    movh [r1+r2*1], m5
    PALIGNR      m3, m1, 7, m1
    movh [r1+r2*2], m3
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                     ptrdiff_t stride)
;-----------------------------------------------------------------------------

; r0 = src, r2 = stride.  Diagonal down-right: assemble the full edge
; (left + topleft + top) in one register, lowpass it, and emit each row as
; the filtered vector shifted right one byte per row (bottom row first).
INIT_MMX mmxext
cglobal pred4x4_down_right_8, 3,3
    sub          r0, r2             ; r0 -> top edge row
    lea          r1, [r0+r2*2]
    movq         m1, [r1-8]
    movq         m2, [r0+r2*1-8]
    punpckhbw    m2, [r0-8]         ; gather lt/l0 pair
    movh         m3, [r0]           ; top samples
    punpckhwd    m1, m2
    PALIGNR      m3, m1, 5, m1      ; t3 t2 t1 t0 lt l0 l1 l2 (partial)
    movq         m1, m3
    PALIGNR      m3, [r1+r2*1-8], 7, m4 ; shift in l2
    movq         m2, m3
    PALIGNR      m3, [r1+r2*2-8], 7, m4 ; shift in l3
    PRED4x4_LOWPASS m0, m3, m1, m2, m4
    movh [r1+r2*2], m0
    psrlq        m0, 8
    movh [r1+r2*1], m0
    psrlq        m0, 8
    movh [r0+r2*2], m0
    psrlq        m0, 8
    movh [r0+r2*1], m0
    RET