; subpel_variance_ssse3.asm (42539B)
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding constant: eight words of 8, added before the >>4 shift of the
; 4-bit bilinear filter sums below.
pw_8: times 8 dw 8

; SSSE3 bilinear filter table, indexed by (offset << filter_idx_shift).
; Each 16-byte row holds eight interleaved (a, b) byte pairs for
; pmaddubsw, where a = 16 - offset*2 and b = offset*2 (offset in 0..7).
bilin_filter_m_ssse3: times 8 db 16, 0
                      times 8 db 14, 2
                      times 8 db 12, 4
                      times 8 db 10, 6
                      times 16 db 8
                      times 8 db 6, 10
                      times 8 db 4, 12
                      times 8 db 2, 14

SECTION .text

; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *dst, ptrdiff_t dst_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.

; Accumulate sum and sum-of-squares of the (src - dst) differences.
; %1/%3: src words (clobbered)   %2/%4: dst words
; %5: running sum accumulator (packed words)
; %6: running SSE accumulator (packed dwords)
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
  psubw                %3, %4
  psubw                %1, %2
  paddw                %5, %3
  pmaddwd              %3, %3
  paddw                %5, %1
  pmaddwd              %1, %1
  paddd                %6, %3
  paddd                %6, %1
%endmacro

; Horizontally reduce the sum (m6) and SSE (m7) accumulators, store the SSE
; through the 'sse' pointer argument, and return the (signed) sum in rax/eax.
; %1 = block width; the 4xh case only has meaningful data in the low halves.
%macro STORE_AND_RET 1
%if %1 > 4
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputing to a dword.
  pcmpgtw              m5, m6           ; mask for 0 > x
  movhlps              m3, m7
  punpcklwd            m4, m6, m5
  punpckhwd            m6, m5           ; sign-extend m6 word->dword
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  pshufd               m4, m6, 0x1
  movd               [r1], m7           ; store sse
  paddd                m6, m4
  movd               raxd, m6           ; store sum as return value
%else ; 4xh
  pshuflw              m4, m6, 0xe
  pshuflw              m3, m7, 0xe
  paddw                m6, m4
  paddd                m7, m3
  pcmpgtw              m5, m6           ; mask for 0 > x
  mov                  r1, ssem         ; r1 = unsigned int *sse
  punpcklwd            m6, m5           ; sign-extend m6 word->dword
  movd               [r1], m7           ; store sse
  pshuflw              m4, m6, 0xe
  paddd                m6, m4
  movd               raxd, m6           ; store sum as return value
%endif
  RET
%endmacro

; Advance srcq by one source row. On 32-bit PIC builds the stride register
; was repurposed for the GOT, so src_stride must be reloaded from the stack.
%macro INC_SRC_BY_SRC_STRIDE 0
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
%else
  add                srcq, src_strideq
%endif
%endmacro

; Emit one sub-pixel variance function (and its avg variant).
; %1 = block width (4, 8 or 16); %2 = 1 to emit the *_avg_* variant that
; first averages the filtered prediction with a second predictor (secq).
; Rows are bilinearly filtered according to x_offset/y_offset (0, half, or
; an arbitrary 1/8-pel phase) before SUM_SSE accumulation.
%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64

%if AOM_ARCH_X86_64
  %if %2 == 1 ; avg
    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                        x_offset, y_offset, dst, dst_stride, \
                                        sec, sec_stride, height, sse
    %define sec_str sec_strideq
  %else
    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                    x_offset, y_offset, dst, dst_stride, \
                                    height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                          x_offset, y_offset, dst, dst_stride, \
                                          sec, sec_stride, height, sse
      %define block_height dword heightm
      %define sec_str sec_stridemp
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                      x_offset, y_offset, dst, dst_stride, \
                                      height, sse
      %define block_height heightd
    %endif

    ; reuse argument stack space
    %define g_bilin_filterm x_offsetm
    %define g_pw_8m y_offsetm

    ;Store bilin_filter and pw_8 location in stack
    %if GET_GOT_DEFINED == 1
      GET_GOT eax
      add esp, 4                ; restore esp
    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1           ; load eax, ecx back
  %else
    %if %2 == 1 ; avg
      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                          x_offset, y_offset, \
                                          dst, dst_stride, sec, sec_stride, \
                                          height, sse
      %define block_height dword heightm
      %define sec_str sec_stridemp
    %else
      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                      x_offset, y_offset, dst, dst_stride, \
                                      height, sse
      %define block_height heightd
    %endif
    %define bilin_filter bilin_filter_m
  %endif
%endif

; 4-wide rows load a dword, wider rows load a qword.
%if %1 == 4
  %define movx movd
%else
  %define movx movh
%endif

  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps use it for something more productive then
  pxor                 m5, m5           ; dedicated zero register
%if %1 < 16
  ; narrow blocks process two rows per iteration
  sar                  block_height, 1
%if %2 == 1 ; avg
  shl                  sec_str, 1
%endif
%endif

  ; FIXME(rbultje) replace by jumptable?
  test                 x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test                 y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  mova                 m1, [dstq]
%if %2 == 1 ; avg
  pavgb                m0, [secq]
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5

%if %2 == 0 ; !avg
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m0, [srcq+src_strideq]
%else ; 4xh
  movx                 m1, [srcq+src_strideq]
  punpckldq            m0, m1
%endif
%else ; !avg
  movx                 m2, [srcq+src_strideq]
%endif

  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]

%if %2 == 1 ; avg
%if %1 > 4
  pavgb                m0, [secq]
%else
  movh                 m2, [secq]
  pavgb                m0, m2
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%if %1 > 4
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                  block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET %1

.x_zero_y_nonzero:
  cmp                  y_offsetd, 4
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
  pavgb                m0, m4           ; vertical half-pel via byte average
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m2, [srcq+src_strideq]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m2, [srcq+src_strideq*2]
%else ; 4xh
  movx                 m1, [srcq+src_strideq*2]
  punpckldq            m2, m1
%endif
  movx                 m1, [dstq]
%if %1 > 4
  movlhps              m0, m2
%else ; 4xh
  punpckldq            m0, m2
%endif
  movx                 m3, [dstq+dst_strideq]
  pavgb                m0, m2
  punpcklbw            m1, m5
%if %1 > 4
  pavgb                m0, [secq]
  punpcklbw            m3, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m4, [secq]
  pavgb                m0, m4
  punpcklbw            m3, m5
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m4, [srcq+src_strideq*2]
  movx                 m1, [dstq]
  pavgb                m0, m2
  movx                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                  block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET %1

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if AOM_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+src_strideq]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m2, [srcq+src_strideq]
  movx                 m4, [srcq+src_strideq*2]
  movx                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movx                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movx                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                  block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1

.x_nonzero:
  cmp                  x_offsetd, 4
  jne .x_nonhalf
  ; x_offset == 0.5
  test                 y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m0, m4           ; horizontal half-pel via byte average
  punpckhbw            m3, m1, m5
%if %2 == 1 ; avg
  pavgb                m0, [secq]
%endif
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m4, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m0, [srcq+src_strideq]
  movhps               m4, [srcq+src_strideq+1]
%else ; 4xh
  movx                 m1, [srcq+src_strideq]
  punpckldq            m0, m1
  movx                 m2, [srcq+src_strideq+1]
  punpckldq            m4, m2
%endif
  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]
  pavgb                m0, m4
  punpcklbw            m3, m5
%if %1 > 4
  pavgb                m0, [secq]
  punpcklbw            m1, m5
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else ; 4xh
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m1, m5
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m2, [srcq+src_strideq]
  movx                 m1, [dstq]
  pavgb                m0, m4
  movx                 m4, [srcq+src_strideq+1]
  movx                 m3, [dstq+dst_strideq]
  pavgb                m2, m4
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                  block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET %1

.x_half_y_nonzero:
  cmp                  y_offsetd, 4
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
  ; m0 carries the previous row's horizontally-averaged pixels across
  ; iterations so each source row is only loaded once.
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m3
  punpckhbw            m3, m1, m5
  pavgb                m0, m4
%if %2 == 1 ; avg
  punpcklbw            m1, m5
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_half_loop:
  movx                 m2, [srcq]
  movx                 m3, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
  movhps               m2, [srcq+src_strideq]
  movhps               m3, [srcq+src_strideq+1]
%else
  movx                 m1, [srcq+src_strideq]
  punpckldq            m2, m1
  movx                 m1, [srcq+src_strideq+1]
  punpckldq            m3, m1
%endif
  pavgb                m2, m3
%if %1 > 4
  movlhps              m0, m2
  movhlps              m4, m2
%else ; 4xh
  punpckldq            m0, m2
  pshuflw              m4, m2, 0xe
%endif
  movx                 m1, [dstq]
  pavgb                m0, m2
  movx                 m3, [dstq+dst_strideq]
%if %1 > 4
  pavgb                m0, [secq]
%else
  movh                 m2, [secq]
  pavgb                m0, m2
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%if %1 > 4
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%else ; !avg
  movx                 m4, [srcq+src_strideq]
  movx                 m1, [srcq+src_strideq+1]
  pavgb                m2, m3
  pavgb                m4, m1
  pavgb                m0, m2
  pavgb                m2, m4
  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]
  punpcklbw            m0, m5
  punpcklbw            m2, m5
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                  block_height
  jg .x_half_y_half_loop
  STORE_AND_RET %1

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if AOM_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+y_offsetq+16]
%endif
  mova                m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ;x86_32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
.x_half_y_other_loop:
  movu                 m4, [srcq]
  movu                 m2, [srcq+1]
  mova                 m1, [dstq]
  pavgb                m4, m2
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  pmullw               m2, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, filter_rnd
  punpcklbw            m0, m5
  paddw                m2, m3
  punpcklbw            m3, m4, m5
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
%endif
  punpckhbw            m3, m1, m5
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m3, [srcq+1]
  add                srcq, src_strideq
  pavgb                m0, m3
%if notcpuflag(ssse3)
  punpcklbw            m0, m5
%endif
.x_half_y_other_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]
  movx                 m4, [srcq+src_strideq]
  movx                 m3, [srcq+src_strideq+1]
  pavgb                m2, m1
  pavgb                m4, m3
  movx                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movx                 m1, [dstq]
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_y_a
  pmullw               m1, m2, filter_y_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  paddw                m0, m1
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m2, m1
  movx                 m1, [dstq]
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                  block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1

.x_nonhalf:
  test                 y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%if AOM_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
;y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m4, [srcq+1]
  mova                 m1, [dstq]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m4, m5
  punpcklbw            m0, m5
  punpcklbw            m4, m5
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, filter_rnd
  paddw                m2, m3
  paddw                m0, m4
%endif
  psraw                m2, 4
  psraw                m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
  movx                 m2, [srcq+src_strideq]
  movx                 m4, [srcq+src_strideq+1]
  movx                 m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  movx                 m1, [dstq]
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_x_a
  pmaddubsw            m2, filter_x_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  punpcklbw            m2, m5
  punpcklbw            m4, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m0, m1
  paddw                m2, filter_rnd
  movx                 m1, [dstq]
  paddw                m2, m4
%endif
  psraw                m0, 4
  psraw                m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                  block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1

.x_nonhalf_y_nonzero:
  cmp                  y_offsetd, 4
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%if AOM_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  ; prime m0 with the horizontally-filtered first row
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4
  add                srcq, src_strideq
  packuswb             m0, m2
.x_other_y_half_loop:
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
%if cpuflag(ssse3)
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  pavgb                m0, m4
  punpckhbw            m3, m1, m5
  punpcklbw            m1, m5
%else
  punpckhbw            m2, m4, m5
  punpckhbw            m1, m3, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  paddw                m4, m3
  paddw                m2, m1
  mova                 m1, [dstq]
  psraw                m4, 4
  psraw                m2, 4
  punpckhbw            m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb             m4, m2
  punpcklbw            m1, m5
  pavgb                m0, m4
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb                m0, [secq]
%endif
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  add                srcq, src_strideq
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  add                srcq, src_strideq
  psraw                m0, 4
.x_other_y_half_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]
  movx                 m4, [srcq+src_strideq]
  movx                 m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movx                 m1, [dstq]
  movx                 m3, [dstq+dst_strideq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  movx                 m1, [dstq]
  paddw                m4, m3
  movx                 m3, [dstq+dst_strideq]
%endif
  psraw                m2, 4
  psraw                m4, 4
  pavgw                m0, m2
  pavgw                m2, m4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  punpcklbw            m3, m5
  punpcklbw            m1, m5
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                  block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1

.x_nonhalf_y_nonhalf:
%if AOM_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl                  x_offsetd, filter_idx_shift
  shl                  y_offsetd, filter_idx_shift
%if AOM_ARCH_X86_64 && %1 > 4
  mova                 m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                 m9, [bilin_filter+x_offsetq+16]
%endif
  mova                m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova                m11, [bilin_filter+y_offsetq+16]
%endif
  mova                m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if AOM_ARCH_X86=1 && CONFIG_PIC=1
; In this case, there is NO unused register. Used src_stride register. Later,
; src_stride has to be loaded from stack when it is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add x_offsetq, tempq
  add y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add                  x_offsetq, bilin_filter
  add                  y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  ; prime m0 with the horizontally-filtered first row
  movu                 m0, [srcq]
  movu                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw            m2, m0, m1
  punpcklbw            m0, m1
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m0, filter_x_a
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
%else
  punpckhbw            m2, m0, m5
  punpckhbw            m3, m1, m5
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m2, filter_rnd
  paddw                m0, m1
  paddw                m2, m3
%endif
  psraw                m0, 4
  psraw                m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb             m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu                 m4, [srcq]
  movu                 m3, [srcq+1]
  mova                 m1, [dstq]
  punpckhbw            m2, m4, m3
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  punpckhbw            m3, m1, m5
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m4, m2
  punpckhbw            m2, m0, m4
  punpcklbw            m0, m4
  pmaddubsw            m2, filter_y_a
  pmaddubsw            m0, filter_y_a
  punpcklbw            m1, m5
  paddw                m2, filter_rnd
  paddw                m0, filter_rnd
  psraw                m2, 4
  psraw                m0, 4
%else
  movu                 m3, [srcq]
  movu                 m4, [srcq+1]
  punpckhbw            m1, m3, m5
  punpckhbw            m2, m4, m5
  punpcklbw            m3, m5
  punpcklbw            m4, m5
  pmullw               m3, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m3, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m1, filter_rnd
  paddw                m3, m4
  paddw                m1, m2
  psraw                m3, 4
  psraw                m1, 4
  packuswb             m4, m3, m1
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
  pmullw               m2, filter_y_a
  pmullw               m1, filter_y_b
  paddw                m2, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m2, m1
  mova                 m1, [dstq]
  paddw                m0, filter_rnd
  psraw                m2, 4
  paddw                m0, m3
  punpckhbw            m3, m1, m5
  psraw                m0, 4
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb             m0, m2
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  add                dstq, dst_strideq
%else ; %1 < 16
  movx                 m0, [srcq]
  movx                 m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw            m0, m1
  pmaddubsw            m0, filter_x_a
  paddw                m0, filter_rnd
%else
  punpcklbw            m0, m5
  punpcklbw            m1, m5
  pmullw               m0, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m1
%endif
  psraw                m0, 4
%if cpuflag(ssse3)
  packuswb             m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movx                 m2, [srcq]
  movx                 m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movx                 m4, [srcq]
  movx                 m3, [srcq+1]

%if cpuflag(ssse3)
  punpcklbw            m2, m1
  punpcklbw            m4, m3
  pmaddubsw            m2, filter_x_a
  pmaddubsw            m4, filter_x_a
  movx                 m3, [dstq+dst_strideq]
  movx                 m1, [dstq]
  paddw                m2, filter_rnd
  paddw                m4, filter_rnd
  psraw                m2, 4
  psraw                m4, 4
  packuswb             m2, m2
  packuswb             m4, m4
  punpcklbw            m0, m2
  punpcklbw            m2, m4
  pmaddubsw            m0, filter_y_a
  pmaddubsw            m2, filter_y_a
  punpcklbw            m3, m5
  paddw                m0, filter_rnd
  paddw                m2, filter_rnd
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m1, m5
%else
  punpcklbw            m2, m5
  punpcklbw            m1, m5
  punpcklbw            m4, m5
  punpcklbw            m3, m5
  pmullw               m2, filter_x_a
  pmullw               m1, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m4, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m4, filter_rnd
  paddw                m2, m1
  paddw                m4, m3
  psraw                m2, 4
  psraw                m4, 4
  pmullw               m0, filter_y_a
  pmullw               m3, m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m2, filter_y_a
  pmullw               m1, m4, filter_y_b
  paddw                m2, filter_rnd
  paddw                m0, m3
  movx                 m3, [dstq+dst_strideq]
  paddw                m2, m1
  movx                 m1, [dstq]
  psraw                m0, 4
  psraw                m2, 4
  punpcklbw            m3, m5
  punpcklbw            m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps              m0, m2
%endif
  packuswb             m0, m2
%if %1 > 4
  pavgb                m0, [secq]
  punpckhbw            m2, m0, m5
  punpcklbw            m0, m5
%else
  movh                 m2, [secq]
  pavgb                m0, m2
  punpcklbw            m0, m5
  movhlps              m2, m0
%endif
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7
  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE
  lea                dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add                secq, sec_str
%endif
  dec                  block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
%undef movx
  STORE_AND_RET %1
%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
; between the ssse3 and non-ssse3 version. It may make sense to merge their
; code in the sense that the ssse3 version would jump to the appropriate
; location in the sse/2 version, rather than duplicating that code in the
; binary.
; Instantiate the plain sub-pixel variance functions for 4xh, 8xh and 16xh
; blocks (aom_sub_pixel_variance{4,8,16}xh_ssse3).
INIT_XMM ssse3
SUBPEL_VARIANCE  4
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

; Instantiate the avg variants, which additionally average the filtered
; prediction with a second predictor (aom_sub_pixel_avg_variance*_ssse3).
INIT_XMM ssse3
SUBPEL_VARIANCE  4, 1
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1