; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4
wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
sgr_lshuf3: db 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
sgr_lshuf5: db 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

pb_right_ext_mask: times 24 db 0xff
                   times 8 db 0
pb_1: times 16 db 1
pw_256: times 8 dw 256
pw_2056: times 8 dw 2056
pw_m16380: times 8 dw -16380
pw_164_24: times 4 dw 164, 24
pw_455_24: times 4 dw 455, 24
pd_4096: times 4 dd 4096
pd_34816: times 4 dd 34816
pd_0xffff: times 4 dd 0xffff
pf_256: times 4 dd 256.0

SECTION .text

%macro movif64 2 ; dst, src
%if ARCH_X86_64
    mov %1, %2
%endif
%endmacro

%macro movif32 2 ; dst, src
%if ARCH_X86_32
    mov %1, %2
%endif
%endmacro

%if ARCH_X86_32
%define PIC_base_offset $$

%macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
%assign pic_reg_stk_off 4
%xdefine PIC_reg %1
%if %2 == 1
    mov [esp], %1
%endif
    LEA PIC_reg, PIC_base_offset
%if %3 == 1
    XCHG_PIC_REG
%endif
%endmacro

%macro XCHG_PIC_REG 0
    mov [esp+pic_reg_stk_off], PIC_reg
%assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
    mov PIC_reg, [esp+pic_reg_stk_off]
%endmacro

%define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)

%else
%macro XCHG_PIC_REG 0
%endmacro

%define PIC_sym(sym) (sym)
%endif

%macro WIENER 0
%if ARCH_X86_64
DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                    w, h, edge, flt, x
%define tmpstrideq strideq
%define base 0
    mov fltq, r6mp
    mov wd, wm
    movifnidn hd, hm
    mov edged, r7m
    movq m14, [fltq]
    add lpfq, wq
    movq m7, [fltq+16]
    add dstq, wq
    lea t1, [rsp+wq*2+16]
    mova m15, [pw_2056]
    neg wq
%if cpuflag(ssse3)
    pshufb m14, [wiener_init]
    mova m8, [wiener_shufA]
    pshufd m12, m14, q2222 ; x0 x0
    mova m9, [wiener_shufB]
    pshufd m13, m14, q3333 ; x1 x2
    mova m10, [wiener_shufC]
    punpcklqdq m14, m14    ; x3
    mova m11, [wiener_shufD]
%else
    mova m10, [pw_m16380]
    punpcklwd m14, m14
    pshufd m11, m14, q0000 ; x0
    pshufd m12, m14, q1111 ; x1
    pshufd m13, m14, q2222 ; x2
    pshufd m14, m14, q3333 ; x3
%endif
%else
DECLARE_REG_TMP 4, 0, _, 5
%if cpuflag(ssse3)
%define m10 [base+wiener_shufC]
%define m11 [base+wiener_shufD]
%define stk_off 96
%else
%define m10 [base+pw_m16380]
%define m11 [stk+96]
%define stk_off 112
%endif
cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride
%define base r6-pb_right_ext_mask-21
%define stk esp
%define dstq leftq
%define edgeb byte edged
%define edged [stk+ 8]
%define dstmp [stk+12]
%define hd dword [stk+16]
%define wq [stk+20]
%define strideq [stk+24]
%define leftmp [stk+28]
%define t2 [stk+32]
%define t4 [stk+36]
%define t5 [stk+40]
%define t6 [stk+44]
%define m8 [base+wiener_shufA]
%define m9 [base+wiener_shufB]
%define m12 [stk+48]
%define m13 [stk+64]
%define m14 [stk+80]
%define m15 [base+pw_2056]
    mov r1, r6m ; flt
    mov r0, r0m ; dst
    mov r4, r4m ; w
    mov lpfq, lpfm
    mov r2, r7m ; edge
    mov r5, r5m ; h
    movq m3, [r1+ 0]
    movq m7, [r1+16]
    add r0, r4
    mov r1, r1m ; stride
    add lpfq, r4
    mov edged, r2
    mov r2, r2m ; left
    mov dstmp, r0
    lea t1, [rsp+r4*2+stk_off]
    mov hd, r5
    neg r4
    LEA r6, pb_right_ext_mask+21
    mov wq, r4
    mov strideq, r1
    mov leftmp, r2
    mov r4, r1
%if cpuflag(ssse3)
    pshufb m3, [base+wiener_init]
    pshufd m1, m3, q2222
    pshufd m2, m3, q3333
    punpcklqdq m3, m3
%else
    punpcklwd m3, m3
    pshufd m0, m3, q0000
    pshufd m1, m3, q1111
    pshufd m2, m3, q2222
    pshufd m3, m3, q3333
    mova m11, m0
%endif
    mova m12, m1
    mova m13, m2
    mova m14, m3
%endif
    psllw m7, 5
    pshufd m6, m7, q0000 ; y0 y1
    pshufd m7, m7, q1111 ; y2 y3
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t6, t1
    mov t5, t1
    add t1, 384*2
    call .h_top
    lea t3, [lpfq+tmpstrideq*4]
    mov lpfq, dstmp
    add t3, tmpstrideq
    mov [rsp], t3 ; below
    mov t4, t1
    add t1, 384*2
    call .h
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
.main:
    lea t0, [t1+384*2]
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v3
    mov lpfq, [rsp]
    call .hv_bottom
    add lpfq, strideq
    call .hv_bottom
.v1:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    RET
.no_top:
    lea t3, [lpfq+tmpstrideq*4]
    mov lpfq, dstmp
    lea t3, [t3+tmpstrideq*2]
    mov [rsp], t3
    call .h
    mov t6, t1
    mov t5, t1
    mov t4, t1
    mov t3, t1
    mov t2, t1
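    ; (no top edge: all ring-buffer row pointers alias the single row
    ; filtered above, which replicates it upwards)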
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    mov t2, t1
    dec hd
    jz .v2
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v3
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v3
    add t0, 384*8
    call .hv
    dec hd
    jnz .main
.v3:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
.v2:
    call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
    jmp .v1
.extend_right:
    movd m2, [lpfq-1]
%if ARCH_X86_64
    push r0
    lea r0, [pb_right_ext_mask+21]
    movu m0, [r0+xq+0]
    movu m1, [r0+xq+8]
    pop r0
%else
    movu m0, [r6+xq+0]
    movu m1, [r6+xq+8]
%endif
%if cpuflag(ssse3)
    pxor m3, m3
    pshufb m2, m3
%else
    punpcklbw m2, m2
    pshuflw m2, m2, q0000
    punpcklqdq m2, m2
%endif
    pand m4, m0
    pand m5, m1
    pandn m0, m2
    pandn m1, m2
    por m4, m0
    por m5, m1
    ret
.h:
%define stk esp+4 ; offset due to call
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .h_main
.h_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, [base+wiener_l_shuf]
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .h_main
.h_top:
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m4, [lpfq+xq-4]
.h_main:
    movu m5, [lpfq+xq+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp xd, -18
    jl .h_have_right
    call .extend_right
.h_have_right:
%macro %%h7 0
%if cpuflag(ssse3)
    pshufb m0, m4, m8
    pmaddubsw m0, m12
    pshufb m1, m5, m8
    pmaddubsw m1, m12
    pshufb m2, m4, m9
    pmaddubsw m2, m13
    pshufb m3, m5, m9
    pmaddubsw m3, m13
    paddw m0, m2
    pshufb m2, m4, m10
    pmaddubsw m2, m13
    paddw m1, m3
    pshufb m3, m5, m10
    pmaddubsw m3, m13
    pshufb m4, m11
    paddw m0, m2
    pmullw m2, m14, m4
    pshufb m5, m11
    paddw m1, m3
    pmullw m3, m14, m5
    psllw m4, 7
    psllw m5, 7
    paddw m0, m2
    mova m2, [base+pw_m16380]
    paddw m1, m3
    paddw m4, m2
    paddw m5, m2
    paddsw m0, m4
    paddsw m1, m5
%else
    psrldq m0, m4, 1
    pslldq m1, m4, 1
    pxor m3, m3
    punpcklbw m0, m3
    punpckhbw m1, m3
    paddw m0, m1
    pmullw m0, m11
    psrldq m1, m4, 2
    pslldq m2, m4, 2
    punpcklbw m1, m3
    punpckhbw m2, m3
    paddw m1, m2
    pmullw m1, m12
    paddw m0, m1
    pshufd m2, m4, q0321
    punpcklbw m2, m3
    pmullw m1, m14, m2
    paddw m0, m1
    psrldq m1, m4, 3
    pslldq m4, 3
    punpcklbw m1, m3
    punpckhbw m4, m3
    paddw m1, m4
    pmullw m1, m13
    paddw m0, m1
    psllw m2, 7
    paddw m2, m10
    paddsw m0, m2
    psrldq m1, m5, 1
    pslldq m2, m5, 1
    punpcklbw m1, m3
    punpckhbw m2, m3
    paddw m1, m2
    pmullw m1, m11
    psrldq m2, m5, 2
    pslldq m4, m5, 2
    punpcklbw m2, m3
    punpckhbw m4, m3
    paddw m2, m4
    pmullw m2, m12
    paddw m1, m2
    pshufd m4, m5, q0321
    punpcklbw m4, m3
    pmullw m2, m14, m4
    paddw m1, m2
    psrldq m2, m5, 3
    pslldq m5, 3
    punpcklbw m2, m3
    punpckhbw m5, m3
    paddw m2, m5
    pmullw m2, m13
    paddw m1, m2
    psllw m4, 7
    paddw m4, m10
    paddsw m1, m4
%endif
%endmacro
    %%h7
    psraw m0, 3
    psraw m1, 3
    paddw m0, m15
    paddw m1, m15
    mova [t1+xq*2+ 0], m0
    mova [t1+xq*2+16], m1
    add xq, 16
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add lpfq, strideq
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .hv_main
.hv_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, [base+wiener_l_shuf]
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .hv_main
.hv_bottom:
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m4, [lpfq+xq-4]
.hv_main:
    movu m5, [lpfq+xq+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp xd, -18
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    %%h7
%if ARCH_X86_64
    mova m2, [t4+xq*2]
    paddw m2, [t2+xq*2]
%else
    mov r2, t4
    mova m2, [r2+xq*2]
    mov r2, t2
    paddw m2, [r2+xq*2]
    mov r2, t5
%endif
    mova m3, [t3+xq*2]
%if ARCH_X86_64
    mova m5, [t5+xq*2]
%else
    mova m5, [r2+xq*2]
    mov r2, t6
%endif
    paddw m5, [t1+xq*2]
    psraw m0, 3
    psraw m1, 3
    paddw m0, m15
    paddw m1, m15
%if ARCH_X86_64
    paddw m4, m0, [t6+xq*2]
%else
    paddw m4, m0, [r2+xq*2]
    mov r2, t4
%endif
    mova [t0+xq*2], m0
    punpcklwd m0, m2, m3
    pmaddwd m0, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m5
    pmaddwd m3, m6
    punpckhwd m4, m5
    pmaddwd m4, m6
    paddd m0, m3
    mova m3, [t3+xq*2+16]
    paddd m4, m2
%if ARCH_X86_64
    mova m2, [t4+xq*2+16]
    paddw m2, [t2+xq*2+16]
    mova m5, [t5+xq*2+16]
%else
    mova m2, [r2+xq*2+16]
    mov r2, t2
    paddw m2, [r2+xq*2+16]
    mov r2, t5
    mova m5, [r2+xq*2+16]
    mov r2, t6
%endif
    paddw m5, [t1+xq*2+16]
    packuswb m0, m4
%if ARCH_X86_64
    paddw m4, m1, [t6+xq*2+16]
%else
    paddw m4, m1, [r2+xq*2+16]
    mov dstq, dstmp
%endif
    mova [t0+xq*2+16], m1
    punpcklwd m1, m2, m3
    pmaddwd m1, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m5
    pmaddwd m3, m6
    punpckhwd m4, m5
    pmaddwd m4, m6
    paddd m1, m3
    paddd m2, m4
    packuswb m1, m2
    psrlw m0, 8
    psrlw m1, 8
    packuswb m0, m1
    mova [dstq+xq], m0
    add xq, 16
    jl .hv_loop
    add dstq, strideq
%if ARCH_X86_64
    mov t6, t5
    mov t5, t4
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t6
%else
    mov dstmp, dstq
    mov r1, t5
    mov r2, t4
    mov t6, r1
    mov t5, r2
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, r1
%endif
    ret
%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
.v:
    mov xq, wq
.v_loop:
%if ARCH_X86_64
    mova m1, [t4+xq*2]
    paddw m1, [t2+xq*2]
%else
    mov r2, t4
    mova m1, [r2+xq*2]
    mov r2, t2
    paddw m1, [r2+xq*2]
    mov r2, t6
%endif
    mova m2, [t3+xq*2]
    mova m4, [t1+xq*2]
%if ARCH_X86_64
    paddw m3, m4, [t6+xq*2]
    paddw m4, [t5+xq*2]
%else
    paddw m3, m4, [r2+xq*2]
    mov r2, t5
    paddw m4, [r2+xq*2]
    mov r2, t4
%endif
    punpcklwd m0, m1, m2
    pmaddwd m0, m7
    punpckhwd m1, m2
    pmaddwd m1, m7
    punpcklwd m2, m3, m4
    pmaddwd m2, m6
    punpckhwd m3, m4
    pmaddwd m3, m6
    paddd m0, m2
    paddd m1, m3
%if ARCH_X86_64
    mova m2, [t4+xq*2+16]
    paddw m2, [t2+xq*2+16]
%else
    mova m2, [r2+xq*2+16]
    mov r2, t2
    paddw m2, [r2+xq*2+16]
    mov r2, t6
%endif
    mova m3, [t3+xq*2+16]
    mova m5, [t1+xq*2+16]
%if ARCH_X86_64
    paddw m4, m5, [t6+xq*2+16]
    paddw m5, [t5+xq*2+16]
%else
    paddw m4, m5, [r2+xq*2+16]
    mov r2, t5
    paddw m5, [r2+xq*2+16]
    movifnidn dstq, dstmp
%endif
    packuswb m0, m1
    punpcklwd m1, m2, m3
    pmaddwd m1, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m5
    pmaddwd m3, m6
    punpckhwd m4, m5
    pmaddwd m4, m6
    paddd m1, m3
    paddd m2, m4
    packuswb m1, m2
    psrlw m0, 8
    psrlw m1, 8
    packuswb m0, m1
    mova [dstq+xq], m0
    add xq, 16
    jl .v_loop
    add dstq, strideq
%if ARCH_X86_64
    mov t6, t5
    mov t5, t4
%else
    mov dstmp, dstq
    mov r1, t5
    mov r2, t4
    mov t6, r1
    mov t5, r2
%endif
    mov t4, t3
    mov t3, t2
    mov t2, t1
    ret
%endif

%if ARCH_X86_64
cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
                                                  w, h, edge, flt, x
    mov fltq, r6mp
    mov wd, wm
    movifnidn hd, hm
    mov edged, r7m
    movq m14, [fltq]
    add lpfq, wq
    movq m7, [fltq+16]
    add dstq, wq
    mova m8, [pw_m16380]
    lea t1, [rsp+wq*2+16]
    mova m15, [pw_2056]
    neg wq
%if cpuflag(ssse3)
    pshufb m14, [wiener_init]
    mova m9, [wiener_shufB]
    pshufd m13, m14, q3333 ; x1 x2
    mova m10, [wiener_shufC]
    punpcklqdq m14, m14    ; x3
    mova m11, [wiener_shufD]
    mova m12, [wiener_l_shuf]
%else
    punpcklwd m14, m14
    pshufd m11, m14, q1111 ; x1
    pshufd m13, m14, q2222 ; x2
    pshufd m14, m14, q3333 ; x3
%endif
%else
%if cpuflag(ssse3)
%define stk_off 80
%else
%define m11 [stk+80]
%define stk_off 96
%endif
cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride
%define stk esp
%define leftmp [stk+28]
%define m8 [base+pw_m16380]
%define m12 [base+wiener_l_shuf]
%define m14 [stk+48]
    mov r1, r6m ; flt
    mov r0, r0m ; dst
    mov r4, r4m ; w
    mov lpfq, lpfm
    mov r2, r7m ; edge
    mov r5, r5m ; h
    movq m2, [r1+ 0]
    movq m7, [r1+16]
    add r0, r4
    mov r1, r1m ; stride
    add lpfq, r4
    mov edged, r2
    mov r2, r2m ; left
    mov dstmp, r0
    lea t1, [rsp+r4*2+stk_off]
    mov hd, r5
    neg r4
    LEA r6, pb_right_ext_mask+21
    mov wq, r4
    mov strideq, r1
    mov leftmp, r2
    mov r4, r1
%if cpuflag(ssse3)
    pshufb m2, [base+wiener_init]
    pshufd m1, m2, q3333
    punpcklqdq m2, m2
%else
    punpcklwd m2, m2
    pshufd m0, m2, q1111
    pshufd m1, m2, q2222
    pshufd m2, m2, q3333
    mova m11, m0
%endif
    mova m13, m1
    mova m14, m2
%endif
    psllw m7, 5
    pshufd m6, m7, q0000 ; __ y1
    pshufd m7, m7, q1111 ; y2 y3
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, strideq
    mov t4, t1
    add t1, 384*2
    call .h_top
    lea xq, [lpfq+tmpstrideq*4]
    mov lpfq, dstmp
    mov t3, t1
    add t1, 384*2
    add xq, tmpstrideq
    mov [rsp], xq ; below
    call .h
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v2
.main:
    mov t0, t4
.main_loop:
    call .hv
    dec hd
    jnz .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .v2
    mov lpfq, [rsp]
    call .hv_bottom
    add lpfq, strideq
    call .hv_bottom
.end:
    RET
.no_top:
    lea t3, [lpfq+tmpstrideq*4]
    mov lpfq, dstmp
    lea t3, [t3+tmpstrideq*2]
    mov [rsp], t3
    call .h
    mov t4, t1
    mov t3, t1
    mov t2, t1
    dec hd
    jz .v1
    add lpfq, strideq
    add t1, 384*2
    call .h
    dec hd
    jz .v2
    lea t0, [t1+384*2]
    call .hv
    dec hd
    jz .v2
    add t0, 384*6
    call .hv
    dec hd
    jnz .main
.v2:
    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
    add dstq, strideq
    mov t4, t3
    mov t3, t2
    mov t2, t1
    movifnidn dstmp, dstq
.v1:
    call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
    jmp .end
.h:
%define stk esp+4
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .h_main
.h_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, m12
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .h_main
.h_top:
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
.h_loop:
    movu m4, [lpfq+xq-4]
.h_main:
    movu m5, [lpfq+xq+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp xd, -17
    jl .h_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.h_have_right:
%macro %%h5 0
%if cpuflag(ssse3)
    pshufb m0, m4, m9
    pmaddubsw m0, m13
    pshufb m1, m5, m9
    pmaddubsw m1, m13
    pshufb m2, m4, m10
    pmaddubsw m2, m13
    pshufb m3, m5, m10
    pmaddubsw m3, m13
    pshufb m4, m11
    paddw m0, m2
    pmullw m2, m14, m4
    pshufb m5, m11
    paddw m1, m3
    pmullw m3, m14, m5
    psllw m4, 7
    psllw m5, 7
    paddw m4, m8
    paddw m5, m8
    paddw m0, m2
    paddw m1, m3
    paddsw m0, m4
    paddsw m1, m5
%else
    psrldq m0, m4, 2
    pslldq m1, m4, 2
    pxor m3, m3
    punpcklbw m0, m3
    punpckhbw m1, m3
    paddw m0, m1
    pmullw m0, m11
    pshufd m2, m4, q0321
    punpcklbw m2, m3
    pmullw m1, m14, m2
    paddw m0, m1
    psrldq m1, m4, 3
    pslldq m4, 3
    punpcklbw m1, m3
    punpckhbw m4, m3
    paddw m1, m4
    pmullw m1, m13
    paddw m0, m1
    psllw m2, 7
    paddw m2, m8
    paddsw m0, m2
    psrldq m1, m5, 2
    pslldq m4, m5, 2
    punpcklbw m1, m3
    punpckhbw m4, m3
    paddw m1, m4
    pmullw m1, m11
    pshufd m4, m5, q0321
    punpcklbw m4, m3
    pmullw m2, m14, m4
    paddw m1, m2
    psrldq m2, m5, 3
    pslldq m5, 3
    punpcklbw m2, m3
    punpckhbw m5, m3
    paddw m2, m5
    pmullw m2, m13
    paddw m1, m2
    psllw m4, 7
    paddw m4, m8
    paddsw m1, m4
%endif
%endmacro
    %%h5
    psraw m0, 3
    psraw m1, 3
    paddw m0, m15
    paddw m1, m15
    mova [t1+xq*2+ 0], m0
    mova [t1+xq*2+16], m1
    add xq, 16
    jl .h_loop
    ret
ALIGN function_align
.hv:
    add lpfq, strideq
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movifnidn leftq, leftmp
    mova m4, [lpfq+xq]
    movd m5, [leftq]
    add leftq, 4
    pslldq m4, 4
    por m4, m5
    movifnidn leftmp, leftq
    jmp .hv_main
.hv_extend_left:
%if cpuflag(ssse3)
    mova m4, [lpfq+xq]
    pshufb m4, m12
%else
    mova m5, [lpfq+xq]
    pshufd m4, m5, q2103
    punpcklbw m5, m5
    punpcklwd m5, m5
    movss m4, m5
%endif
    jmp .hv_main
.hv_bottom:
    mov xq, wq
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
.hv_loop:
    movu m4, [lpfq+xq-4]
.hv_main:
    movu m5, [lpfq+xq+4]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp xd, -17
    jl .hv_have_right
    call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
.hv_have_right:
    %%h5
    mova m2, [t3+xq*2]
    paddw m2, [t1+xq*2]
    psraw m0, 3
    psraw m1, 3
    paddw m0, m15
    paddw m1, m15
%if ARCH_X86_64
    mova m3, [t2+xq*2]
    paddw m4, m0, [t4+xq*2]
%else
    mov r2, t2
    mova m3, [r2+xq*2]
    mov r2, t4
    paddw m4, m0, [r2+xq*2]
%endif
    mova [t0+xq*2], m0
    punpcklwd m0, m2, m3
    pmaddwd m0, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m4
    pmaddwd m3, m6
    punpckhwd m4, m4
    pmaddwd m4, m6
    paddd m0, m3
    paddd m4, m2
    mova m2, [t3+xq*2+16]
    paddw m2, [t1+xq*2+16]
    packuswb m0, m4
%if ARCH_X86_64
    mova m3, [t2+xq*2+16]
    paddw m4, m1, [t4+xq*2+16]
%else
    paddw m4, m1, [r2+xq*2+16]
    mov r2, t2
    mova m3, [r2+xq*2+16]
    mov dstq, dstmp
%endif
    mova [t0+xq*2+16], m1
    punpcklwd m1, m2, m3
    pmaddwd m1, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4, m4
    pmaddwd m3, m6
    punpckhwd m4, m4
    pmaddwd m4, m6
    paddd m1, m3
    paddd m2, m4
    packuswb m1, m2
    psrlw m0, 8
    psrlw m1, 8
    packuswb m0, m1
    mova [dstq+xq], m0
    add xq, 16
    jl .hv_loop
    add dstq, strideq
    mov t4, t3
    mov t3, t2
    mov t2, t1
    mov t1, t0
    mov t0, t4
    movifnidn dstmp, dstq
    ret
%if cpuflag(ssse3)
.v:
    mov xq, wq
.v_loop:
    mova m3, [t1+xq*2]
    paddw m1, m3, [t3+xq*2]
%if ARCH_X86_64
    mova m2, [t2+xq*2]
    paddw m3, [t4+xq*2]
%else
    mov r2, t2
    mova m2, [r2+xq*2]
    mov r2, t4
    paddw m3, [r2+xq*2]
%endif
    punpcklwd m0, m1, m2
    pmaddwd m0, m7
    punpckhwd m1, m2
    pmaddwd m1, m7
    punpcklwd m2, m3
    pmaddwd m2, m6
    punpckhwd m3, m3
    pmaddwd m3, m6
    paddd m0, m2
    paddd m1, m3
    mova m4, [t1+xq*2+16]
    paddw m2, m4, [t3+xq*2+16]
%if ARCH_X86_64
    mova m3, [t2+xq*2+16]
    paddw m4, [t4+xq*2+16]
%else
    paddw m4, [r2+xq*2+16]
    mov r2, t2
    mova m3, [r2+xq*2+16]
    mov dstq, dstmp
%endif
    packuswb m0, m1
    punpcklwd m1, m2, m3
    pmaddwd m1, m7
    punpckhwd m2, m3
    pmaddwd m2, m7
    punpcklwd m3, m4
    pmaddwd m3, m6
    punpckhwd m4, m4
    pmaddwd m4, m6
    paddd m1, m3
    paddd m2, m4
    packuswb m1, m2
    psrlw m0, 8
    psrlw m1, 8
    packuswb m0, m1
    mova [dstq+xq], m0
    add xq, 16
    jl .v_loop
    ret
%endif
%endmacro

INIT_XMM sse2
WIENER

INIT_XMM ssse3
WIENER

;;;;;;;;;;;;;;;;;;;;;;;;;;
;;      self-guided     ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro MUL_32X16X2 6 ; dst[1-2], src[1-2], tmp[1-2]
    pmulhuw %5, %1, %3
    pmulhuw %6, %2, %4
    pmullw %1, %3
    pmullw %2, %4
    pslld %5, 16
    pslld %6, 16
    paddd %1, %5
    paddd %2, %6
%endmacro

%macro SGR_CALC_X 9 ; dst, tmp, b[1-2], an[1-2], s, b_mul, pf_256
    pmaddwd %1, %3, %3 ; b * b
    pmaddwd %2, %4, %4
    psubd %5, %1 ; p
    psubd %6, %2
    MUL_32X16X2 %5, %6, %7, %7, %1, %2 ; p * s
    pmaddwd %3, %8 ; b * b_mul
    pmaddwd %4, %8
    paddw %5, %8
    paddw %6, %8
    psrld %5, 20 ; z + 1
    psrld %6, 20
    cvtdq2ps %5, %5
    cvtdq2ps %6, %6
    rcpps %1, %5 ; 1 / (z + 1)
    rcpps %2, %6
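    ; (rcpps only yields an approximate reciprocal; that is sufficient here
    ; because the result is scaled to 256 / (z + 1) and clamped below)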
    cmpltps %5, %9
    cmpltps %6, %9
    mulps %1, %9 ; 256 / (z + 1)
    mulps %2, %9
    packssdw %5, %6
    cvtps2dq %1, %1
    cvtps2dq %2, %2
    psrlw %5, 8 ; z < 255 ? 255 : 0
    packssdw %1, %2
    pminsw %1, %5 ; x
%endmacro

%if ARCH_X86_32
DECLARE_REG_TMP 0, 1, 2, 3, 5
%if STACK_ALIGNMENT < 16
%assign extra_stack 5*16
%else
%assign extra_stack 3*16
%endif
cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
                             dst, stride, left, lpf, w
%if STACK_ALIGNMENT < 16
%define dstm dword [esp+calloff+16*0+4*6]
%define stridemp dword [esp+calloff+16*0+4*7]
%define leftm dword [esp+calloff+16*3+4*0]
%define lpfm dword [esp+calloff+16*3+4*1]
%define w0m dword [esp+calloff+16*3+4*2]
%define hd dword [esp+calloff+16*3+4*3]
%define edgeb byte [esp+calloff+16*3+4*4]
%define edged dword [esp+calloff+16*3+4*4]
%define leftmp leftm
%else
%define w0m wm
%define hd dword r5m
%define edgeb byte r7m
%define edged dword r7m
%endif
%define hvsrcm dword [esp+calloff+4*0]
%define w1m dword [esp+calloff+4*1]
%define t0m dword [esp+calloff+4*2]
%define t2m dword [esp+calloff+4*3]
%define t3m dword [esp+calloff+4*4]
%define t4m dword [esp+calloff+4*5]
%define m8 [base+pb_1]
%define m9 [esp+calloff+16*2]
%define m10 [base+pw_164_24]
%define m11 [base+sgr_lshuf5]
%define m12 [base+pd_34816]
%define m13 [base+pb_0to15]
%define m14 [base+pf_256]
%define r10 r4
%define base r6-pw_2056
%assign calloff 0
%if STACK_ALIGNMENT < 16
    mov strideq, [rstk+stack_offset+ 8]
    mov leftq, [rstk+stack_offset+12]
    mov lpfq, [rstk+stack_offset+16]
    mov wd, [rstk+stack_offset+20]
    mov dstm, dstq
    mov stridemp, strideq
    mov leftm, leftq
    mov r1, [rstk+stack_offset+24]
    mov r2, [rstk+stack_offset+32]
    mov lpfm, lpfq
    mov hd, r1
    mov edged, r2
%endif
%else
DECLARE_REG_TMP 8, 7, 9, 11, 12
cglobal sgr_filter_5x5_8bpc, 4, 13, 15, -400*24-16, dst, stride, left, lpf, \
                             w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    mov wd, wm
%endif
%if ARCH_X86_64
    mov paramsq, r6mp
    movifnidn hd, hm
    mov edged, r7m
    movu m9, [paramsq]
    add lpfq, wq
    mova m8, [pb_1]
    lea t1, [rsp+wq*2+20]
    mova m10, [pw_164_24]
    add dstq, wq
    lea t3, [rsp+wq*4+400*12+16]
    mova m12, [pd_34816] ; (1 << 11) + (1 << 15)
    lea t4, [rsp+wq*2+400*20+16]
    pshufhw m7, m9, q0000
    pshufb m9, [pw_256] ; s0
    punpckhqdq m7, m7   ; w0
    neg wq
    mova m13, [pb_0to15]
    pxor m6, m6
    mova m11, [sgr_lshuf5]
    psllw m7, 4
    movaps m14, [pf_256]
DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
%define lpfm [rsp]
%else
    mov r1, [rstk+stack_offset+28] ; params
    LEA r6, pw_2056
    movu m1, [r1]
    add lpfm, wq
    lea t1, [rsp+extra_stack+wq*2+20]
    add dstq, wq
    lea t3, [rsp+extra_stack+wq*4+400*12+16]
    mov dstm, dstq
    lea t4, [rsp+extra_stack+wq*2+400*20+16]
    mov t3m, t3
    pshufhw m7, m1, q0000
    mov t4m, t4
    pshufb m1, [base+pw_256] ; s0
    punpckhqdq m7, m7        ; w0
    psllw m7, 4
    neg wq
    mova m9, m1
    pxor m6, m6
    mov w1m, wd
    sub wd, 2
    mov lpfq, lpfm
    mov w0m, wd
%define strideq r5
%endif
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, stridemp
    mov t2, t1
    call .top_fixup
    add t1, 400*6
    call .h_top
    movif32 strideq, stridemp
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    add r10, strideq
    mov lpfm, r10 ; below
    movif32 t0m, t2
    mov t0, t2
    dec hd
    jz .height1
    or edged, 16
    call .h
.main:
    add lpfq, stridemp
    movif32 t4, t4m
    call .hv
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
%if ARCH_X86_64
    test hb, hb
%else
    mov r4, hd
    test r4, r4
%endif
    jz .odd_height
    call .h
    add lpfq, stridemp
    call .hv
    movif32 dstq, dstm
    call .n0
    call .n1
    sub hd, 2
    movif32 t0, t0m
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, lpfm
    call .h_top
    add lpfq, stridemp
    call .hv_bottom
.end:
    movif32 dstq, dstm
    call .n0
    call .n1
.end2:
    RET
.height1:
    movif32 t4, t4m
    call .hv
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .hv
    movif32 dstq, dstm
    call .n0
    call .n1
.odd_height_end:
    call .v
    movif32 dstq, dstm
    call .n0
    jmp .end2
.extend_bottom:
    call .v
    jmp .end
.no_top:
    movif32 strideq, stridemp
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov lpfm, r10
    call .h
    lea t2, [t1+400*6]
    call .top_fixup
    dec hd
    jz .no_top_height1
    or edged, 16
    mov t0, t1
    mov t1, t2
    movif32 t0m, t0
    jmp .main
.no_top_height1:
    movif32 t3, t3m
    movif32 t4, t4m
    call .v
    call .prep_n
    jmp .odd_height_end
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
    movd m1, wd
    movd m3, [lpfq-1]
    pshufb m1, m6
    pshufb m3, m6
    psubb m2, m8, m1
    pcmpgtb m2, m13
    pand m5, m2
    pandn m2, m3
    por m5, m2
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h: ; horizontal boxsum
%if ARCH_X86_64
    lea wq, [r4-2]
%else
%define leftq r4
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    add leftmp, 4
    palignr m5, m4, 13
    jmp .h_main
.h_extend_left:
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    pshufb m5, m11
    jmp .h_main
.h_top:
%if ARCH_X86_64
    lea wq, [r4-2]
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32 wq, w0m
.h_loop:
    movu m5, [lpfq+wq-1]
.h_main:
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp wd, -10
    jl .h_have_right
    call .extend_right
.h_have_right:
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    palignr m2, m5, m4, 2
    paddw m0, m4, m2
    palignr m3, m5, m4, 6
    paddw m0, m3
    punpcklwd m1, m2, m3
    pmaddwd m1, m1
    punpckhwd m2, m3
    pmaddwd m2, m2
    palignr m5, m4, 8
    paddw m0, m5
    punpcklwd m3, m4, m5
    pmaddwd m3, m3
    paddd m1, m3
    punpckhwd m3, m4, m5
    pmaddwd m3, m3
    shufps m4, m5, q2121
    paddw m0, m4 ; sum
    punpcklwd m5, m4, m6
    pmaddwd m5, m5
    punpckhwd m4, m6
    pmaddwd m4, m4
    paddd m2, m3
    test edgeb, 16 ; y > 0
    jz .h_loop_end
    paddw m0, [t1+wq*2+400*0]
    paddd m1, [t1+wq*2+400*2]
    paddd m2, [t1+wq*2+400*4]
.h_loop_end:
    paddd m1, m5 ; sumsq
    paddd m2, m4
    mova [t1+wq*2+400*0], m0
    mova [t1+wq*2+400*2], m1
    mova [t1+wq*2+400*4], m2
    add wq, 8
    jl .h_loop
    ret
.top_fixup:
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov wd, w0m
%endif
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova m0, [t1+wq*2+400*0]
    mova m1, [t1+wq*2+400*2]
    mova m2, [t1+wq*2+400*4]
    paddw m0, m0
    paddd m1, m1
    paddd m2, m2
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m1
    mova [t2+wq*2+400*4], m2
    add wq, 8
    jl .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    add leftmp, 4
    palignr m5, m4, 13
    jmp .hv_main
.hv_extend_left:
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    pshufb m5, m11
    jmp .hv_main
.hv_bottom:
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv_extend_left
    movif32 wq, w0m
%if ARCH_X86_32
    jmp .hv_loop_start
%endif
.hv_loop:
    movif32 lpfq, hvsrcm
.hv_loop_start:
    movu m5, [lpfq+wq-1]
.hv_main:
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv_have_right
    cmp wd, -10
    jl .hv_have_right
    call .extend_right
.hv_have_right:
    movif32 t3, hd
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    palignr m3, m5, m4, 2
    paddw m0, m4, m3
    palignr m1, m5, m4, 6
    paddw m0, m1
    punpcklwd m2, m3, m1
    pmaddwd m2, m2
    punpckhwd m3, m1
    pmaddwd m3, m3
    palignr m5, m4, 8
    paddw m0, m5
    punpcklwd m1, m4, m5
    pmaddwd m1, m1
    paddd m2, m1
    punpckhwd m1, m4, m5
    pmaddwd m1, m1
    shufps m4, m5, q2121
    paddw m0, m4 ; h sum
    punpcklwd m5, m4, m6
    pmaddwd m5, m5
    punpckhwd m4, m6
    pmaddwd m4, m4
    paddd m3, m1
    paddd m2, m5 ; h sumsq
    paddd m3, m4
    paddw m1, m0, [t1+wq*2+400*0]
    paddd m4, m2, [t1+wq*2+400*2]
    paddd m5, m3, [t1+wq*2+400*4]
%if ARCH_X86_64
    test hd, hd
%else
    test t3, t3
%endif
    jz .hv_last_row
.hv_main2:
    paddw m1, [t2+wq*2+400*0] ; hv sum
    paddd m4, [t2+wq*2+400*2] ; hv sumsq
    paddd m5, [t2+wq*2+400*4]
    mova [t0+wq*2+400*0], m0
    pslld m0, m4, 4
    mova [t0+wq*2+400*2], m2
    mova [t0+wq*2+400*4], m3
    pslld m2, m4, 3
    paddd m4, m0
    pslld m0, m5, 4
    paddd m4, m2 ; a * 25
    pslld m2, m5, 3
    paddd m5, m0
    paddd m5, m2
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
    movif32 t3, t3m
    punpcklwd m2, m3, m3
    mova [t4+wq*2+4], m3
    punpckhwd m3, m3
    MUL_32X16X2 m0, m1, m2, m3, m4, m5
    paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd m1, m12
    psrld m0, 12 ; b
    psrld m1, 12
    mova [t3+wq*4+ 8], m0
    mova [t3+wq*4+24], m1
    add wq, 8
    jl .hv_loop
    mov t2, t1
    mov t1, t0
    mov t0, t2
    movif32 t0m, t0
    ret
.hv_last_row: ; esoteric edge case for odd heights
    mova [t1+wq*2+400*0], m1
    paddw m1, m0
    mova [t1+wq*2+400*2], m4
    paddd m4, m2
    mova [t1+wq*2+400*4], m5
    paddd m5, m3
    jmp .hv_main2
.v: ; vertical boxsum + ab
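    ; (no new source row is read here; the t1 sums are added in twice more
    ; below to stand in for the unavailable rows past the bottom edge)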
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov wd, w0m
%endif
.v_loop:
    mova m0, [t1+wq*2+400*0]
    mova m2, [t1+wq*2+400*2]
    mova m3, [t1+wq*2+400*4]
    paddw m1, m0, [t2+wq*2+400*0]
    paddd m4, m2, [t2+wq*2+400*2]
    paddd m5, m3, [t2+wq*2+400*4]
    paddw m0, m0
    paddd m2, m2
    paddd m3, m3
    paddw m1, m0 ; hv sum
    paddd m4, m2 ; hv sumsq
    pslld m0, m4, 4
    paddd m5, m3
    pslld m2, m4, 3
    paddd m4, m0
    pslld m0, m5, 4
    paddd m4, m2 ; a * 25
    pslld m2, m5, 3
    paddd m5, m0
    paddd m5, m2
    punpcklwd m0, m1, m6
    punpckhwd m1, m6
    SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
    punpcklwd m2, m3, m3
    mova [t4+wq*2+4], m3
    punpckhwd m3, m3
    MUL_32X16X2 m0, m1, m2, m3, m4, m5
    paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd m1, m12
    psrld m0, 12 ; b
    psrld m1, 12
    mova [t3+wq*4+ 8], m0
    mova [t3+wq*4+24], m1
    add wq, 8
    jl .v_loop
    ret
.prep_n: ; initial neighbor setup
    movif64 wq, r4
    movif32 wd, w1m
.prep_n_loop:
    movu m0, [t4+wq*2+ 2]
    movu m3, [t4+wq*2+ 4]
    movu m1, [t3+wq*4+ 4]
    movu m4, [t3+wq*4+ 8]
    movu m2, [t3+wq*4+20]
    movu m5, [t3+wq*4+24]
    paddw m3, m0
    paddd m4, m1
    paddd m5, m2
    paddw m3, [t4+wq*2+ 0]
    paddd m4, [t3+wq*4+ 0]
    paddd m5, [t3+wq*4+16]
    paddw m0, m3
    psllw m3, 2
    paddd m1, m4
    pslld m4, 2
    paddd m2, m5
    pslld m5, 2
    paddw m0, m3 ; a 565
    paddd m1, m4 ; b 565
    paddd m2, m5
    mova [t4+wq*2+400*2+ 0], m0
    mova [t3+wq*4+400*4+ 0], m1
    mova [t3+wq*4+400*4+16], m2
    add wq, 8
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64 wq, r4
    movif32 wd, w1m
.n0_loop:
    movu m0, [t4+wq*2+ 2]
    movu m3, [t4+wq*2+ 4]
    movu m1, [t3+wq*4+ 4]
    movu m4, [t3+wq*4+ 8]
    movu m2, [t3+wq*4+20]
    movu m5, [t3+wq*4+24]
    paddw m3, m0
    paddd m4, m1
    paddd m5, m2
    paddw m3, [t4+wq*2+ 0]
    paddd m4, [t3+wq*4+ 0]
    paddd m5, [t3+wq*4+16]
    paddw m0, m3
    psllw m3, 2
    paddd m1, m4
    pslld m4, 2
    paddd m2, m5
    pslld m5, 2
    paddw m0, m3 ; a 565
    paddd m1, m4 ; b 565
    paddd m2, m5
    paddw m3, m0, [t4+wq*2+400*2+ 0]
    paddd m4, m1, [t3+wq*4+400*4+ 0]
    paddd m5, m2, [t3+wq*4+400*4+16]
    mova [t4+wq*2+400*2+ 0], m0
    mova [t3+wq*4+400*4+ 0], m1
    mova [t3+wq*4+400*4+16], m2
    movq m0, [dstq+wq]
    punpcklbw m0, m6
    punpcklwd m1, m0, m6 ; src
    punpcklwd m2, m3, m6 ; a
    pmaddwd m2, m1       ; a * src
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    psubd m4, m2 ; b - a * src + (1 << 8)
    psubd m5, m3
    psrad m4, 9
    psrad m5, 9
    packssdw m4, m5
    pmulhrsw m4, m7
    paddw m0, m4
    packuswb m0, m0
    movq [dstq+wq], m0
    add wq, 8
    jl .n0_loop
    add dstq, stridemp
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64 wq, r4
    movif32 wd, w1m
.n1_loop:
    movq m0, [dstq+wq]
    mova m3, [t4+wq*2+400*2+ 0]
    mova m4, [t3+wq*4+400*4+ 0]
    mova m5, [t3+wq*4+400*4+16]
    punpcklbw m0, m6
    punpcklwd m1, m0, m6 ; src
    punpcklwd m2, m3, m6 ; a
    pmaddwd m2, m1       ; a * src
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    psubd m4, m2 ; b - a * src + (1 << 7)
    psubd m5, m3
    psrad m4, 8
    psrad m5, 8
    packssdw m4, m5
    pmulhrsw m4, m7
    paddw m0, m4
    packuswb m0, m0
    movq [dstq+wq], m0
    add wq, 8
    jl .n1_loop
    add dstq, stridemp
    movif32 dstm, dstq
    ret

%if ARCH_X86_32
%if STACK_ALIGNMENT < 16
%assign extra_stack 4*16
%else
%assign extra_stack 2*16
%endif
cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
                             dst, stride, left, lpf, w
%if STACK_ALIGNMENT < 16
%define dstm dword [esp+calloff+16*2+4*0]
%define stridemp dword [esp+calloff+16*2+4*1]
%define leftm dword [esp+calloff+16*2+4*2]
%define lpfm dword [esp+calloff+16*2+4*3]
%define w0m dword [esp+calloff+16*2+4*4]
%define hd dword [esp+calloff+16*2+4*5]
%define edgeb byte [esp+calloff+16*2+4*6]
%define edged dword [esp+calloff+16*2+4*6]
%define leftmp leftm
%else
%define w0m wm
%define hd dword r5m
%define edgeb byte r7m
%define edged dword r7m
%endif
%define hvsrcm dword [esp+calloff+4*0]
%define w1m dword [esp+calloff+4*1]
%define t3m dword [esp+calloff+4*2]
%define t4m dword [esp+calloff+4*3]
%define m8 [base+pb_0to15]
%define m9 [esp+calloff+16*1]
%define m10 [base+pw_455_24]
%define m11 [base+pd_34816]
%define m12 m6
%define m13 [base+sgr_lshuf3]
%define m14 [base+pf_256]
%define base r6-pw_2056
%assign calloff 0
%if STACK_ALIGNMENT < 16
    mov strideq, [rstk+stack_offset+ 8]
    mov leftq, [rstk+stack_offset+12]
    mov lpfq, [rstk+stack_offset+16]
    mov wd, [rstk+stack_offset+20]
    mov dstm, dstq
    mov stridemp, strideq
    mov leftm, leftq
    mov r1, [rstk+stack_offset+24]
    mov r2, [rstk+stack_offset+32]
    mov lpfm, lpfq
    mov hd, r1
    mov edged, r2
%endif
%else
cglobal sgr_filter_3x3_8bpc, 4, 13, 15, -400*42-8, dst, stride, left, lpf, \
                             w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    mov wd, wm
%endif
%if ARCH_X86_64
    mov paramsq, r6mp
    mov hd, hm
    mov edged, r7m
    movq m9, [paramsq+4]
    add lpfq, wq
    lea t1, [rsp+wq*2+12]
    mova m8, [pb_0to15]
    add dstq, wq
    lea t3, [rsp+wq*4+400*12+8]
    mova m10, [pw_455_24]
    lea t4, [rsp+wq*2+400*32+8]
    mova m11, [pd_34816]
    pshuflw m7, m9, q3333
    pshufb m9, [pw_256] ; s1
    punpcklqdq m7, m7   ; w1
    neg wq
    pxor m6, m6
    mova m13, [sgr_lshuf3]
    psllw m7, 4
    movaps m14, [pf_256]
DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
%define lpfm [rsp]
%else
    mov r1, [rstk+stack_offset+28] ; params
    LEA r6, pw_2056
    movq m1, [r1+4]
    add lpfm, wq
    lea t1, [rsp+extra_stack+wq*2+20]
    add dstq, wq
    lea t3, [rsp+extra_stack+wq*4+400*12+16]
    mov dstm, dstq
    lea t4, [rsp+extra_stack+wq*2+400*32+16]
    mov t3m, t3
    pshuflw m7, m1, q3333
    mov t4m, t4
    pshufb m1, [base+pw_256] ; s1
    punpcklqdq m7, m7        ; w1
    psllw m7, 4
    neg wq
    mova m9, m1
    pxor m6, m6
    mov w1m, wd
    sub wd, 2
    mov lpfq, lpfm
    mov w0m, wd
%define strideq r5
%endif
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, stridemp
    mov t2, t1
    add t1, 400*6
    call .h_top
    movif32 strideq, stridemp
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    add r10, strideq
    mov lpfm, r10 ; below
    movif32 t4, t4m
    call .hv0
.main:
    dec hd
    jz .height1
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv1
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv0
%if ARCH_X86_64
    test hb, hb
%else
    mov r4, hd
    test r4, r4
%endif
    jz .odd_height
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv1
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, lpfm
    call .hv0_bottom
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    movif32 strideq, stridemp
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov lpfm, r10
    call .h
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov wq, w0m
    mov hvsrcm, lpfq
%endif
    lea t2, [t1+400*6]
.top_fixup_loop:
    mova m0, [t1+wq*2+400*0]
    mova m1, [t1+wq*2+400*2]
    mova m2, [t1+wq*2+400*4]
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m1
    mova [t2+wq*2+400*4], m2
    add wq, 8
    jl .top_fixup_loop
    movif32 t3, t3m
    movif32 t4, t4m
    call .v0
    jmp .main
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
    movd m0, [lpfq-1]
    movd m1, wd
    mova m3, m8
    pshufb m0, m6
    pshufb m1, m6
    mova m2, m6
    psubb m2, m1
    pcmpgtb m2, m3
    pand m5, m2
    pandn m2, m0
    por m5, m2
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h: ; horizontal boxsum
%if ARCH_X86_64
    lea wq, [r4-2]
%else
%define leftq r4
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    add leftmp, 4
    palignr m5, m4, 14
    jmp .h_main
.h_extend_left:
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    pshufb m5, m13
    jmp .h_main
.h_top:
%if ARCH_X86_64
    lea wq, [r4-2]
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32 wq, w0m
.h_loop:
    movu m5, [lpfq+wq]
.h_main:
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp wd, -9
    jl .h_have_right
    call .extend_right
.h_have_right:
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    palignr m0, m5, m4, 2
    paddw m1, m4, m0
    punpcklwd m2, m4, m0
    pmaddwd m2, m2
    punpckhwd m3, m4, m0
    pmaddwd m3, m3
    palignr m5, m4, 4
    paddw m1, m5 ; sum
    punpcklwd m4, m5, m6
    pmaddwd m4, m4
    punpckhwd m5, m6
    pmaddwd m5, m5
    paddd m2, m4 ; sumsq
    paddd m3, m5
    mova [t1+wq*2+400*0], m1
    mova [t1+wq*2+400*2], m2
    mova [t1+wq*2+400*4], m3
    add wq, 8
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    add leftmp, 4
    palignr m5, m4, 14
    jmp .hv0_main
.hv0_extend_left:
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    pshufb m5, m13
    jmp .hv0_main
.hv0_bottom:
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32 wq, w0m
%if ARCH_X86_32
    jmp .hv0_loop_start
%endif
.hv0_loop:
    movif32 lpfq, hvsrcm
.hv0_loop_start:
    movu m5, [lpfq+wq]
.hv0_main:
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp wd, -9
    jl .hv0_have_right
    call .extend_right
.hv0_have_right:
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    palignr m0, m5, m4, 2
    paddw m1, m4, m0
    punpcklwd m2, m4, m0
    pmaddwd m2, m2
    punpckhwd m3, m4, m0
    pmaddwd m3, m3
    palignr m5, m4, 4
    paddw m1, m5 ; sum
    punpcklwd m4, m5, m6
    pmaddwd m4, m4
    punpckhwd m5, m6
    pmaddwd m5, m5
    paddd m2, m4 ; sumsq
    paddd m3, m5
    paddw m0, m1, [t1+wq*2+400*0]
    paddd m4, m2, [t1+wq*2+400*2]
    paddd m5, m3, [t1+wq*2+400*4]
    mova [t1+wq*2+400*0], m1
    mova [t1+wq*2+400*2], m2
    mova [t1+wq*2+400*4], m3
    paddw m1, m0, [t2+wq*2+400*0]
    paddd m2, m4, [t2+wq*2+400*2]
    paddd m3, m5, [t2+wq*2+400*4]
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m4
    mova [t2+wq*2+400*4], m5
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; a * 9
    paddd m5, m3
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
    movif32 t3, t3m
    punpcklwd m2, m3, m3
    mova [t4+wq*2+4], m3
    punpckhwd m3, m3
    MUL_32X16X2 m0, m1, m2, m3, m4, m5
    paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m11
    psrld m0, 12
    psrld m1, 12
    mova [t3+wq*4+ 8], m0
    mova [t3+wq*4+24], m1
    add wq, 8
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32 leftq, leftm
    movddup m4, [leftq-4]
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    add leftmp, 4
    palignr m5, m4, 14
    jmp .hv1_main
.hv1_extend_left:
    movif32 wq, w0m
    mova m5, [lpfq+wq+2]
    pshufb m5, m13
    jmp .hv1_main
.hv1_bottom:
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32 wq, w0m
%if ARCH_X86_32
    jmp .hv1_loop_start
%endif
.hv1_loop:
    movif32 lpfq, hvsrcm
.hv1_loop_start:
    movu m5, [lpfq+wq]
.hv1_main:
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp wd, -9
    jl .hv1_have_right
    call .extend_right
.hv1_have_right:
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    palignr m1, m5, m4, 2
    paddw m0, m4, m1
    punpcklwd m2, m4, m1
    pmaddwd m2, m2
    punpckhwd m3, m4, m1
    pmaddwd m3, m3
    palignr m5, m4, 4
    paddw m0, m5 ; h sum
    punpcklwd m1, m5, m6
    pmaddwd m1, m1
    punpckhwd m5, m6
    pmaddwd m5, m5
    paddd m2, m1 ; h sumsq
    paddd m3, m5
    paddw m1, m0, [t2+wq*2+400*0]
    paddd m4, m2, [t2+wq*2+400*2]
    paddd m5, m3, [t2+wq*2+400*4]
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m2
    mova [t2+wq*2+400*4], m3
    pslld m2, m4, 3
    pslld m3, m5, 3
    paddd m4, m2 ; a * 9
    paddd m5, m3
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
    movif32 t3, t3m
    punpcklwd m2, m3, m3
    mova [t4+wq*2+400*2+ 4], m3
    punpckhwd m3, m3
    MUL_32X16X2 m0, m1, m2, m3, m4, m5
    paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m11
    psrld m0, 12
    psrld m1, 12
    mova [t3+wq*4+400*4+ 8], m0
    mova [t3+wq*4+400*4+24], m1
    add wq, 8
    jl .hv1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.v0: ; vertical boxsums + ab (even rows)
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov wd, w0m
%endif
.v0_loop:
    mova m0, [t1+wq*2+400*0]
    mova m4, [t1+wq*2+400*2]
    mova m5, [t1+wq*2+400*4]
    paddw m0, m0
    paddd m4, m4
    paddd m5, m5
    paddw m1, m0, [t2+wq*2+400*0]
    paddd m2, m4, [t2+wq*2+400*2]
    paddd m3, m5, [t2+wq*2+400*4]
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m4
    mova [t2+wq*2+400*4], m5
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; a * 9
    paddd m5, m3
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
    punpcklwd m2, m3, m3
    mova [t4+wq*2+4], m3
    punpckhwd m3, m3
    MUL_32X16X2 m0, m1, m2, m3, m4, m5
    paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m11
    psrld m0, 12
    psrld m1, 12
    mova [t3+wq*4+ 8], m0
    mova [t3+wq*4+24], m1
    add wq, 8
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea wq, [r4-2]
%else
    mov wd, w0m
%endif
.v1_loop:
    mova m0, [t1+wq*2+400*0]
    mova m4, [t1+wq*2+400*2]
    mova m5, [t1+wq*2+400*4]
    paddw m1, m0, [t2+wq*2+400*0]
    paddd m2, m4, [t2+wq*2+400*2]
    paddd m3, m5, [t2+wq*2+400*4]
    mova [t2+wq*2+400*0], m0
    mova [t2+wq*2+400*2], m4
    mova [t2+wq*2+400*4], m5
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; a * 9
    paddd m5, m3
    punpcklwd m0, m1, m6 ; b
    punpckhwd m1, m6
    SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
    punpcklwd m2, m3, m3
    mova [t4+wq*2+400*2+ 4], m3
    punpckhwd m3, m3
    MUL_32X16X2 m0, m1, m2, m3, m4, m5
    paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
    paddd m1, m11
    psrld m0, 12
    psrld m1, 12
    mova [t3+wq*4+400*4+ 8], m0
    mova [t3+wq*4+400*4+24], m1
    add wq, 8
    jl .v1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.prep_n: ; initial neighbor setup
    movif64 wq, r4
    movif32 wd, w1m
.prep_n_loop:
    movu m0, [t4+wq*2+400*0+ 4]
    movu m1, [t3+wq*4+400*0+ 8]
    movu m2, [t3+wq*4+400*0+24]
    movu m3, [t4+wq*2+400*0+ 2]
    movu m4, [t3+wq*4+400*0+ 4]
    movu m5, [t3+wq*4+400*0+20]
    paddw m0, [t4+wq*2+400*0+ 0]
    paddd m1, [t3+wq*4+400*0+ 0]
    paddd m2, [t3+wq*4+400*0+16]
    paddw m3, m0
    paddd m4, m1
    paddd m5, m2
    psllw m3, 2 ; a[-1] 444
    pslld m4, 2 ; b[-1] 444
    pslld m5, 2
    psubw m3, m0 ; a[-1] 343
    psubd m4, m1 ; b[-1] 343
    psubd m5, m2
    mova [t4+wq*2+400*4], m3
    mova [t3+wq*4+400*8+ 0], m4
    mova [t3+wq*4+400*8+16], m5
    movu m0, [t4+wq*2+400*2+ 4]
    movu m1, [t3+wq*4+400*4+ 8]
    movu m2, [t3+wq*4+400*4+24]
    movu m3, [t4+wq*2+400*2+ 2]
    movu m4, [t3+wq*4+400*4+ 4]
    movu m5, [t3+wq*4+400*4+20]
    paddw m0, [t4+wq*2+400*2+ 0]
    paddd m1, [t3+wq*4+400*4+ 0]
    paddd m2, [t3+wq*4+400*4+16]
    paddw m3, m0
    paddd m4, m1
    paddd m5, m2
    psllw m3, 2 ; a[ 0] 444
    pslld m4, 2 ; b[ 0] 444
    pslld m5, 2
    mova [t4+wq*2+400* 6], m3
    mova [t3+wq*4+400*12+ 0], m4
    mova [t3+wq*4+400*12+16], m5
    psubw m3, m0 ; a[ 0] 343
    psubd m4, m1 ; b[ 0] 343
    psubd m5, m2
    mova [t4+wq*2+400* 8], m3
    mova [t3+wq*4+400*16+ 0], m4
    mova [t3+wq*4+400*16+16], m5
    add wq, 8
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64 wq, r4
    movif32 wd, w1m
.n0_loop:
    movu m3, [t4+wq*2+400*0+4]
    movu m1, [t4+wq*2+400*0+2]
    paddw m3, [t4+wq*2+400*0+0]
    paddw m1, m3
    psllw m1, 2          ; a[ 1] 444
    psubw m2, m1, m3     ; a[ 1] 343
    paddw m3, m2, [t4+wq*2+400*4]
    paddw m3, [t4+wq*2+400*6]
    mova [t4+wq*2+400*4], m2
    mova [t4+wq*2+400*6], m1
    movu m4, [t3+wq*4+400*0+8]
    movu m1, [t3+wq*4+400*0+4]
    paddd m4, [t3+wq*4+400*0+0]
    paddd m1, m4
    pslld m1, 2          ; b[ 1] 444
    psubd m2, m1, m4     ; b[ 1] 343
    paddd m4, m2, [t3+wq*4+400* 8+ 0]
    paddd m4, [t3+wq*4+400*12+ 0]
    mova [t3+wq*4+400* 8+ 0], m2
    mova [t3+wq*4+400*12+ 0], m1
    movu m5, [t3+wq*4+400*0+24]
    movu m1, [t3+wq*4+400*0+20]
    paddd m5, [t3+wq*4+400*0+16]
    paddd m1, m5
    pslld m1, 2
    psubd m2, m1, m5
    paddd m5, m2, [t3+wq*4+400* 8+16]
    paddd m5, [t3+wq*4+400*12+16]
    mova [t3+wq*4+400* 8+16], m2
    mova [t3+wq*4+400*12+16], m1
    movq m0, [dstq+wq]
    punpcklbw m0, m6
    punpcklwd m1, m0, m6
    punpcklwd m2, m3, m6
    pmaddwd m2, m1 ; a * src
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    psubd m4, m2 ; b - a * src + (1 << 8)
    psubd m5, m3
    psrad m4, 9
    psrad m5, 9
    packssdw m4, m5
    pmulhrsw m4, m7
    paddw m0, m4
    packuswb m0, m0
    movq [dstq+wq], m0
    add wq, 8
    jl .n0_loop
    add dstq, stridemp
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64 wq, r4
    movif32 wd, w1m
.n1_loop:
    movu m3, [t4+wq*2+400*2+4]
    movu m1, [t4+wq*2+400*2+2]
    paddw m3, [t4+wq*2+400*2+0]
    paddw m1, m3
    psllw m1, 2          ; a[ 1] 444
    psubw m2, m1, m3     ; a[ 1] 343
    paddw m3, m2, [t4+wq*2+400*6]
    paddw m3, [t4+wq*2+400*8]
    mova [t4+wq*2+400*6], m1
    mova [t4+wq*2+400*8], m2
    movu m4, [t3+wq*4+400*4+8]
    movu m1, [t3+wq*4+400*4+4]
    paddd m4, [t3+wq*4+400*4+0]
    paddd m1, m4
    pslld m1, 2          ; b[ 1] 444
    psubd m2, m1, m4     ; b[ 1] 343
    paddd m4, m2, [t3+wq*4+400*12+ 0]
    paddd m4, [t3+wq*4+400*16+ 0]
    mova [t3+wq*4+400*12+ 0], m1
    mova [t3+wq*4+400*16+ 0], m2
    movu m5, [t3+wq*4+400*4+24]
    movu m1, [t3+wq*4+400*4+20]
    paddd m5, [t3+wq*4+400*4+16]
    paddd m1, m5
    pslld m1, 2
    psubd m2, m1, m5
    paddd m5, m2, [t3+wq*4+400*12+16]
    paddd m5, [t3+wq*4+400*16+16]
    mova [t3+wq*4+400*12+16], m1
    mova [t3+wq*4+400*16+16], m2
    movq m0, [dstq+wq]
    punpcklbw m0, m6
    punpcklwd m1, m0, m6
    punpcklwd m2, m3, m6
    pmaddwd m2, m1 ; a * src
    punpckhwd m1, m0, m6
    punpckhwd m3, m6
    pmaddwd m3, m1
    psubd m4, m2 ; b - a * src + (1 << 8)
    psubd m5, m3
    psrad m4, 9
    psrad m5, 9
    packssdw m4, m5
    pmulhrsw m4, m7
    paddw m0, m4
    packuswb m0, m0
    movq [dstq+wq], m0
    add wq, 8
    jl .n1_loop
    add dstq, stridemp
    movif32 dstm, dstq
    ret

%if ARCH_X86_32
%if STACK_ALIGNMENT < 16
%assign extra_stack 10*16
%else
%if ARCH_X86_32
%if STACK_ALIGNMENT < 16
%assign extra_stack 10*16
%else
%assign extra_stack 8*16
%endif
cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
                             dst, stride, left, lpf, w
%if STACK_ALIGNMENT < 16
%define dstm     dword [esp+calloff+16*8+4*0]
%define stridemp dword [esp+calloff+16*8+4*1]
%define leftm    dword [esp+calloff+16*8+4*2]
%define lpfm     dword [esp+calloff+16*8+4*3]
%define w0m      dword [esp+calloff+16*8+4*4]
%define hd       dword [esp+calloff+16*8+4*5]
%define edgeb    byte  [esp+calloff+16*8+4*6]
%define edged    dword [esp+calloff+16*8+4*6]
%define leftmp leftm
%else
%define w0m wm
%define hd dword r5m
%define edgeb byte r7m
%define edged dword r7m
%endif
%define hvsrcm dword [esp+calloff+4*0]
%define w1m    dword [esp+calloff+4*1]
%define t3m    dword [esp+calloff+4*2]
%define t4m    dword [esp+calloff+4*3]
%xdefine m8 m6
%define m9  [base+pd_0xffff]
%define m10 [base+pd_34816]
%define m11 [base+pw_455_24]
%define m12 [base+pw_164_24]
%define m13 [esp+calloff+16*4]
%define m14 [esp+calloff+16*5]
%define m15 [esp+calloff+16*6]
%define m6  [esp+calloff+16*7]
%define base r6-pw_2056
%assign calloff 0
%if STACK_ALIGNMENT < 16
    mov          strideq, [rstk+stack_offset+ 8]
    mov          leftq, [rstk+stack_offset+12]
    mov          lpfq, [rstk+stack_offset+16]
    mov          wd, [rstk+stack_offset+20]
    mov          dstm, dstq
    mov          stridemp, strideq
    mov          leftm, leftq
    mov          r1, [rstk+stack_offset+24]
    mov          r2, [rstk+stack_offset+32]
    mov          lpfm, lpfq
    mov          hd, r1
    mov          edged, r2
%endif
%else
cglobal sgr_filter_mix_8bpc, 4, 13, 16, -400*66-40, dst, stride, left, lpf, \
                                                    w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    mov          wd, wm
%endif
%if ARCH_X86_64
    mov          paramsq, r6mp
    movifnidn    hd, hm
    mov          edged, r7m
    mova         m15, [paramsq]
    add          lpfq, wq
    mova         m9, [pd_0xffff]
    lea          t1, [rsp+wq*2+44]
    mova         m10, [pd_34816]
    add          dstq, wq
    lea          t3, [rsp+wq*4+400*24+40]
    mova         m11, [pw_455_24]
    lea          t4, [rsp+wq*2+400*52+40]
    mova         m12, [pw_164_24]
    neg          wq
    pshuflw      m13, m15, q0000
    pshuflw      m14, m15, q2222
    pshufhw      m15, m15, q1010
    punpcklqdq   m13, m13            ; s0
    punpcklqdq   m14, m14            ; s1
    punpckhqdq   m15, m15            ; w0 w1
    pxor         m6, m6
    psllw        m15, 2
    DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
%define lpfm [rsp]
%else
    mov          r1, [rstk+stack_offset+28] ; params
    LEA          r6, pw_2056
    mova         m2, [r1]
    add          lpfm, wq
    lea          t1, [rsp+extra_stack+wq*2+52]
    add          dstq, wq
    lea          t3, [rsp+extra_stack+wq*4+400*24+48]
    mov          dstm, dstq
    lea          t4, [rsp+extra_stack+wq*2+400*52+48]
    mov          t3m, t3
    mov          t4m, t4
    neg          wq
    pshuflw      m0, m2, q0000
    pshuflw      m1, m2, q2222
    pshufhw      m2, m2, q1010
    punpcklqdq   m0, m0              ; s0
    punpcklqdq   m1, m1              ; s1
    punpckhqdq   m2, m2              ; w0 w1
    mov          w1m, wd
    pxor         m3, m3
    psllw        m2, 2
    mova         m13, m0
    mova         m14, m1
    sub          wd, 2
    mova         m15, m2
    mova         m6, m3
    mov          lpfq, lpfm
    mov          w0m, wd
%define strideq r5
%endif
    test         edgeb, 4            ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add          lpfq, stridemp
    mov          t2, t1
%if ARCH_X86_64
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup
%else
    mov          wq, w0m
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop
%endif
    add          t1, 400*12
    call .h_top
    movif32      strideq, stridemp
    lea          r10, [lpfq+strideq*4]
    mov          lpfq, dstq
    add          r10, strideq
    mov          lpfm, r10           ; below
    movif32      t4, t4m
    call .hv0
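; Rows are consumed in pairs: .hv0 handles even rows (horizontal sums plus
; the 3x3 ab intermediates), .hv1 handles odd rows (which additionally
; finalize the 5x5 ab intermediates, as that filter is only evaluated on
; every second row), and .n0/.n1 each emit one filtered output row.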
.main:
    dec          hd
    jz .height1
    movif32      lpfq, hvsrcm
    add          lpfq, stridemp
    call .hv1
    call .prep_n
    sub          hd, 2
    jl .extend_bottom
.main_loop:
    movif32      lpfq, hvsrcm
    add          lpfq, stridemp
    call .hv0
%if ARCH_X86_64
    test         hd, hd
%else
    mov          r4, hd
    test         r4, r4
%endif
    jz .odd_height
    movif32      lpfq, hvsrcm
    add          lpfq, stridemp
    call .hv1
    call .n0
    call .n1
    sub          hd, 2
    jge .main_loop
    test         edgeb, 8            ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov          lpfq, lpfm
    call .hv0_bottom
    movif32      lpfq, hvsrcm
    add          lpfq, stridemp
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    movif32      strideq, stridemp
    lea          r10, [lpfq+strideq*4]
    mov          lpfq, dstq
    lea          r10, [r10+strideq*2]
    mov          lpfm, r10
    call .h
%if ARCH_X86_64
    lea          wq, [r4-2]
%else
    mov          wq, w0m
    mov          hvsrcm, lpfq
%endif
    lea          t2, [t1+400*12]
.top_fixup_loop:
    mova         m0, [t1+wq*2+400* 0]
    mova         m1, [t1+wq*2+400* 2]
    mova         m2, [t1+wq*2+400* 4]
    paddw        m0, m0
    mova         m3, [t1+wq*2+400* 6]
    paddd        m1, m1
    mova         m4, [t1+wq*2+400* 8]
    paddd        m2, m2
    mova         m5, [t1+wq*2+400*10]
    mova         [t2+wq*2+400* 0], m0
    mova         [t2+wq*2+400* 2], m1
    mova         [t2+wq*2+400* 4], m2
    mova         [t2+wq*2+400* 6], m3
    mova         [t2+wq*2+400* 8], m4
    mova         [t2+wq*2+400*10], m5
    add          wq, 8
    jl .top_fixup_loop
    movif32      t3, t3m
    movif32      t4, t4m
    call .v0
    jmp .main
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
%if ARCH_X86_64
    SWAP         m8, m6
%endif
    movd         m1, wd
    movd         m3, [lpfq-1]
    pshufb       m1, m8
    pshufb       m3, m8
    psubb        m2, [base+pb_1], m1
    pcmpgtb      m2, [base+pb_0to15]
    pand         m5, m2
    pandn        m2, m3
    por          m5, m2
%if ARCH_X86_64
    SWAP         m6, m8
%endif
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
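; .h builds the horizontal box sums of one input row: 3-tap sums (sum3)
; and sums of squares (sumsq3) for the 3x3 filter, which are then widened
; by the two outer taps into sum5/sumsq5 for the 5x5 filter.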
.h: ; horizontal boxsum
%if ARCH_X86_64
    lea          wq, [r4-2]
%else
%define leftq r4
%endif
    test         edgeb, 1            ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32      leftq, leftm
    movddup      m4, [leftq-4]
    movif32      wq, w0m
    mova         m5, [lpfq+wq+2]
    add          leftmp, 4
    palignr      m5, m4, 13
    jmp .h_main
.h_extend_left:
    movif32      wq, w0m
    mova         m5, [lpfq+wq+2]
    pshufb       m5, [base+sgr_lshuf5]
    jmp .h_main
.h_top:
%if ARCH_X86_64
    lea          wq, [r4-2]
%endif
    test         edgeb, 1            ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32      wq, w0m
.h_loop:
    movu         m5, [lpfq+wq-1]
.h_main:
    test         edgeb, 2            ; LR_HAVE_RIGHT
%if ARCH_X86_32
    pxor         m8, m8
%else
    SWAP         m8, m6
%endif
    jnz .h_have_right
    cmp          wd, -10
    jl .h_have_right
    call .extend_right
.h_have_right:
    punpcklbw    m4, m5, m8
    punpckhbw    m5, m8
    palignr      m3, m5, m4, 2
    palignr      m0, m5, m4, 4
    paddw        m1, m3, m0
    punpcklwd    m2, m3, m0
    pmaddwd      m2, m2
    punpckhwd    m3, m0
    pmaddwd      m3, m3
    palignr      m0, m5, m4, 6
    paddw        m1, m0              ; sum3
    punpcklwd    m7, m0, m8
    pmaddwd      m7, m7
    punpckhwd    m0, m8
    pmaddwd      m0, m0
%if ARCH_X86_64
    SWAP         m6, m8
%endif
    paddd        m2, m7              ; sumsq3
    palignr      m5, m4, 8
    punpcklwd    m7, m5, m4
    paddw        m8, m4, m5
    pmaddwd      m7, m7
    punpckhwd    m5, m4
    pmaddwd      m5, m5
    paddd        m3, m0
    mova         [t1+wq*2+400* 6], m1
    mova         [t1+wq*2+400* 8], m2
    mova         [t1+wq*2+400*10], m3
    paddw        m8, m1              ; sum5
    paddd        m7, m2              ; sumsq5
    paddd        m5, m3
    mova         [t1+wq*2+400* 0], m8
    mova         [t1+wq*2+400* 2], m7
    mova         [t1+wq*2+400* 4], m5
    add          wq, 8
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
%if ARCH_X86_64
    lea          wq, [r4-2]
%else
    mov          hvsrcm, lpfq
%endif
    test         edgeb, 1            ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32      leftq, leftm
    movddup      m4, [leftq-4]
    movif32      wq, w0m
    mova         m5, [lpfq+wq+2]
    add          leftmp, 4
    palignr      m5, m4, 13
    jmp .hv0_main
.hv0_extend_left:
    movif32      wq, w0m
    mova         m5, [lpfq+wq+2]
    pshufb       m5, [base+sgr_lshuf5]
    jmp .hv0_main
.hv0_bottom:
%if ARCH_X86_64
    lea          wq, [r4-2]
%else
    mov          hvsrcm, lpfq
%endif
    test         edgeb, 1            ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32      wq, w0m
%if ARCH_X86_32
    jmp .hv0_loop_start
%endif
.hv0_loop:
    movif32      lpfq, hvsrcm
.hv0_loop_start:
    movu         m5, [lpfq+wq-1]
.hv0_main:
    test         edgeb, 2            ; LR_HAVE_RIGHT
%if ARCH_X86_32
    pxor         m8, m8
%else
    SWAP         m8, m6
%endif
    jnz .hv0_have_right
    cmp          wd, -10
    jl .hv0_have_right
    call .extend_right
.hv0_have_right:
    punpcklbw    m4, m5, m8
    punpckhbw    m5, m8
    palignr      m3, m5, m4, 2
    palignr      m0, m5, m4, 4
    movif32      t3, t3m
    paddw        m1, m3, m0
    punpcklwd    m2, m3, m0
    pmaddwd      m2, m2
    punpckhwd    m3, m0
    pmaddwd      m3, m3
    palignr      m0, m5, m4, 6
    paddw        m1, m0              ; h sum3
    punpcklwd    m7, m0, m8
    pmaddwd      m7, m7
    punpckhwd    m0, m8
%if ARCH_X86_64
    SWAP         m6, m8
%endif
    pmaddwd      m0, m0
    paddd        m2, m7              ; h sumsq3
    palignr      m5, m4, 8
    punpcklwd    m7, m5, m4
    paddw        m8, m4, m5
    pmaddwd      m7, m7
    punpckhwd    m5, m4
    pmaddwd      m5, m5
    paddd        m3, m0
    paddw        m8, m1              ; h sum5
    paddd        m7, m2              ; h sumsq5
    paddd        m5, m3
    mova         [t3+wq*4+400*8+ 8], m8
    mova         [t3+wq*4+400*0+ 8], m7
    mova         [t3+wq*4+400*0+24], m5
    paddw        m8, [t1+wq*2+400* 0]
    paddd        m7, [t1+wq*2+400* 2]
    paddd        m5, [t1+wq*2+400* 4]
    mova         [t1+wq*2+400* 0], m8
    mova         [t1+wq*2+400* 2], m7
    mova         [t1+wq*2+400* 4], m5
    paddw        m0, m1, [t1+wq*2+400* 6]
    paddd        m4, m2, [t1+wq*2+400* 8]
    paddd        m5, m3, [t1+wq*2+400*10]
    mova         [t1+wq*2+400* 6], m1
    mova         [t1+wq*2+400* 8], m2
    mova         [t1+wq*2+400*10], m3
    paddw        m1, m0, [t2+wq*2+400* 6]
    paddd        m2, m4, [t2+wq*2+400* 8]
    paddd        m3, m5, [t2+wq*2+400*10]
    mova         [t2+wq*2+400* 6], m0
    mova         [t2+wq*2+400* 8], m4
    mova         [t2+wq*2+400*10], m5
    pslld        m4, m2, 3
    pslld        m5, m3, 3
    paddd        m4, m2              ; a3 * 9
    paddd        m5, m3
    movaps       m7, [base+pf_256]
%if ARCH_X86_32
    pxor         m2, m2
    punpcklwd    m0, m1, m2
    punpckhwd    m1, m2
%else
    punpcklwd    m0, m1, m6          ; b3
    punpckhwd    m1, m6
%endif
    SGR_CALC_X   m3, m2, m0, m1, m4, m5, m14, m11, m7
    punpcklwd    m2, m3, m3
    mova         [t4+wq*2+400*2+ 4], m3
    punpckhwd    m3, m3
    MUL_32X16X2  m0, m1, m2, m3, m4, m5
    paddd        m0, m10             ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd        m1, m10
    psrld        m0, 12
    psrld        m1, 12
    mova         [t3+wq*4+400*4+ 8], m0
    mova         [t3+wq*4+400*4+24], m1
    add          wq, 8
    jl .hv0_loop
    ret
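; .hv1 (odd rows) repeats the horizontal pass, but its vertical
; accumulation completes the 5-row totals, so this is where the 5x5
; surface (a5 * 25, b5) and its x5 lookup are produced alongside ab3.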
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea          wq, [r4-2]
%else
    mov          hvsrcm, lpfq
%endif
    test         edgeb, 1            ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32      leftq, leftm
    movddup      m4, [leftq-4]
    movif32      wq, w0m
    mova         m5, [lpfq+wq+2]
    add          leftmp, 4
    palignr      m5, m4, 13
    jmp .hv1_main
.hv1_extend_left:
    movif32      wq, w0m
    mova         m5, [lpfq+wq+2]
    pshufb       m5, [base+sgr_lshuf5]
    jmp .hv1_main
.hv1_bottom:
%if ARCH_X86_64
    lea          wq, [r4-2]
%else
    mov          hvsrcm, lpfq
%endif
    test         edgeb, 1            ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32      wq, w0m
%if ARCH_X86_32
    jmp .hv1_loop_start
%endif
.hv1_loop:
    movif32      lpfq, hvsrcm
.hv1_loop_start:
    movu         m5, [lpfq+wq-1]
.hv1_main:
    test         edgeb, 2            ; LR_HAVE_RIGHT
%if ARCH_X86_32
    pxor         m8, m8
%else
    SWAP         m8, m6
%endif
    jnz .hv1_have_right
    cmp          wd, -10
    jl .hv1_have_right
    call .extend_right
.hv1_have_right:
    punpcklbw    m4, m5, m8
    punpckhbw    m5, m8
    palignr      m7, m5, m4, 2
    palignr      m3, m5, m4, 4
    paddw        m2, m7, m3
    punpcklwd    m0, m7, m3
    pmaddwd      m0, m0
    punpckhwd    m7, m3
    pmaddwd      m7, m7
    palignr      m3, m5, m4, 6
    paddw        m2, m3              ; h sum3
    punpcklwd    m1, m3, m8
    pmaddwd      m1, m1
    punpckhwd    m3, m8
%if ARCH_X86_64
    SWAP         m6, m8
%endif
    pmaddwd      m3, m3
    paddd        m0, m1              ; h sumsq3
    palignr      m5, m4, 8
    punpckhwd    m1, m4, m5
    paddw        m8, m4, m5
    pmaddwd      m1, m1
    punpcklwd    m4, m5
    pmaddwd      m4, m4
    paddd        m7, m3
    paddw        m5, m2, [t2+wq*2+400* 6]
    mova         [t2+wq*2+400* 6], m2
    paddw        m8, m2              ; h sum5
    paddd        m2, m0, [t2+wq*2+400* 8]
    paddd        m3, m7, [t2+wq*2+400*10]
    mova         [t2+wq*2+400* 8], m0
    mova         [t2+wq*2+400*10], m7
    paddd        m4, m0              ; h sumsq5
    paddd        m1, m7
    pslld        m0, m2, 3
    pslld        m7, m3, 3
    paddd        m2, m0              ; a3 * 9
    paddd        m3, m7
%if ARCH_X86_32
    mova         [esp+20], m8
    pxor         m8, m8
%else
    SWAP         m8, m6
%endif
    punpcklwd    m0, m5, m8          ; b3
    punpckhwd    m5, m8
    SGR_CALC_X   m8, m7, m0, m5, m2, m3, m14, m11, [base+pf_256]
    movif32      t3, t3m
    punpcklwd    m2, m8, m8
    mova         [t4+wq*2+400*4+ 4], m8
    punpckhwd    m8, m8
    MUL_32X16X2  m0, m5, m2, m8, m3, m7
    paddd        m0, m10             ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd        m5, m10
    psrld        m0, 12
    psrld        m5, 12
    mova         [t3+wq*4+400*8+ 8], m0
    mova         [t3+wq*4+400*8+24], m5
%if ARCH_X86_32
    mova         m8, [esp+20]
%else
    SWAP         m6, m8
    pxor         m6, m6
%endif
    paddw        m5, m8, [t2+wq*2+400*0]
    paddd        m2, m4, [t2+wq*2+400*2]
    paddd        m3, m1, [t2+wq*2+400*4]
    paddw        m5, [t1+wq*2+400*0]
    paddd        m2, [t1+wq*2+400*2]
    paddd        m3, [t1+wq*2+400*4]
    mova         [t2+wq*2+400*0], m8
    pslld        m0, m2, 4
    mova         [t2+wq*2+400*2], m4
    pslld        m8, m3, 4
    mova         [t2+wq*2+400*4], m1
    pslld        m4, m2, 3
    paddd        m2, m0
    pslld        m0, m3, 3
    paddd        m3, m8
    paddd        m2, m4              ; a5 * 25
    paddd        m3, m0
%if ARCH_X86_32
    pxor         m7, m7
    punpcklwd    m0, m5, m7
    punpckhwd    m5, m7
%else
    punpcklwd    m0, m5, m6          ; b5
    punpckhwd    m5, m6
%endif
    movaps       m8, [base+pf_256]
    SGR_CALC_X   m1, m4, m0, m5, m2, m3, m13, m12, m8
    punpcklwd    m2, m1, m1
    mova         [t4+wq*2+4], m1
    punpckhwd    m1, m1
    MUL_32X16X2  m0, m5, m2, m1, m3, m4
    paddd        m0, m10             ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd        m5, m10
    psrld        m0, 12
    psrld        m5, 12
    mova         [t3+wq*4+ 8], m0
    mova         [t3+wq*4+24], m5
    add          wq, 8
    jl .hv1_loop
    mov          r10, t2
    mov          t2, t1
    mov          t1, r10
    ret
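; .v0/.v1 are the vertical-only variants used when no new input row is
; read (top/bottom edge replication, odd heights): they reuse the buffered
; horizontal sums, with the current row's sums counted twice.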
.v0: ; vertical boxsums + ab3 (even rows)
%if ARCH_X86_64
    lea          wq, [r4-2]
%else
    mov          wd, w0m
%endif
    movaps       m8, [base+pf_256]
.v0_loop:
    mova         m0, [t1+wq*2+400* 6]
    mova         m4, [t1+wq*2+400* 8]
    mova         m5, [t1+wq*2+400*10]
    paddw        m0, m0
    paddd        m4, m4
    paddd        m5, m5
    paddw        m1, m0, [t2+wq*2+400* 6]
    paddd        m2, m4, [t2+wq*2+400* 8]
    paddd        m3, m5, [t2+wq*2+400*10]
    mova         [t2+wq*2+400* 6], m0
    mova         [t2+wq*2+400* 8], m4
    mova         [t2+wq*2+400*10], m5
    pslld        m4, m2, 3
    pslld        m5, m3, 3
    paddd        m4, m2              ; a3 * 9
    paddd        m5, m3
%if ARCH_X86_32
    pxor         m7, m7
    punpcklwd    m0, m1, m7
    punpckhwd    m1, m7
%else
    punpcklwd    m0, m1, m6          ; b3
    punpckhwd    m1, m6
%endif
    SGR_CALC_X   m3, m2, m0, m1, m4, m5, m14, m11, m8
    punpcklwd    m2, m3, m3
    mova         [t4+wq*2+400*2+4], m3
    punpckhwd    m3, m3
    MUL_32X16X2  m0, m1, m2, m3, m4, m5
    paddd        m0, m10             ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd        m1, m10
    psrld        m0, 12
    psrld        m1, 12
    mova         m3, [t1+wq*2+400*0]
    mova         m4, [t1+wq*2+400*2]
    mova         m5, [t1+wq*2+400*4]
    mova         [t3+wq*4+400*8+ 8], m3
    mova         [t3+wq*4+400*0+ 8], m4
    mova         [t3+wq*4+400*0+24], m5
    paddw        m3, m3              ; cc5
    paddd        m4, m4
    paddd        m5, m5
    mova         [t1+wq*2+400*0], m3
    mova         [t1+wq*2+400*2], m4
    mova         [t1+wq*2+400*4], m5
    mova         [t3+wq*4+400*4+ 8], m0
    mova         [t3+wq*4+400*4+24], m1
    add          wq, 8
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea          wq, [r4-2]
%else
    mov          wd, w0m
%endif
.v1_loop:
    mova         m4, [t1+wq*2+400* 6]
    mova         m5, [t1+wq*2+400* 8]
    mova         m7, [t1+wq*2+400*10]
    paddw        m8, m4, [t2+wq*2+400* 6]
    paddd        m2, m5, [t2+wq*2+400* 8]
    paddd        m3, m7, [t2+wq*2+400*10]
    mova         [t2+wq*2+400* 6], m4
    mova         [t2+wq*2+400* 8], m5
    mova         [t2+wq*2+400*10], m7
    pslld        m4, m2, 3
    pslld        m5, m3, 3
    paddd        m4, m2              ; ((a3 + 8) >> 4) * 9
    paddd        m5, m3
    movaps       m1, [base+pf_256]
%if ARCH_X86_32
    pxor         m7, m7
    punpcklwd    m0, m8, m7
    punpckhwd    m8, m7
%else
    punpcklwd    m0, m8, m6          ; b3
    punpckhwd    m8, m6
%endif
    SGR_CALC_X   m3, m2, m0, m8, m4, m5, m14, m11, m1
    punpcklwd    m2, m3, m3
    mova         [t4+wq*2+400*4+4], m3
    punpckhwd    m3, m3
    MUL_32X16X2  m0, m8, m2, m3, m4, m5
    paddd        m0, m10             ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd        m8, m10
    psrld        m0, 12
    psrld        m8, 12
    mova         m4, [t3+wq*4+400*8+ 8]
    mova         m5, [t3+wq*4+400*0+ 8]
    mova         m7, [t3+wq*4+400*0+24]
    paddw        m1, m4, [t2+wq*2+400*0]
    paddd        m2, m5, [t2+wq*2+400*2]
    paddd        m3, m7, [t2+wq*2+400*4]
    paddw        m1, [t1+wq*2+400*0]
    paddd        m2, [t1+wq*2+400*2]
    paddd        m3, [t1+wq*2+400*4]
    mova         [t2+wq*2+400*0], m4
    mova         [t2+wq*2+400*2], m5
    mova         [t2+wq*2+400*4], m7
    pslld        m4, m2, 4
    mova         [t3+wq*4+400*8+ 8], m0
    pslld        m5, m3, 4
    mova         [t3+wq*4+400*8+24], m8
    pslld        m7, m2, 3
    paddd        m2, m4
    pslld        m4, m3, 3
    paddd        m3, m5
    paddd        m2, m7              ; a5 * 25
    paddd        m3, m4
    movaps       m8, [base+pf_256]
%if ARCH_X86_32
    pxor         m7, m7
    punpcklwd    m0, m1, m7
    punpckhwd    m1, m7
%else
    punpcklwd    m0, m1, m6          ; b5
    punpckhwd    m1, m6
%endif
    SGR_CALC_X   m5, m4, m0, m1, m2, m3, m13, m12, m8
    punpcklwd    m4, m5, m5
    mova         [t4+wq*2+4], m5
    punpckhwd    m5, m5
    MUL_32X16X2  m0, m1, m4, m5, m2, m3
    paddd        m0, m10             ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd        m1, m10
    psrld        m0, 12
    psrld        m1, 12
    mova         [t3+wq*4+ 8], m0
    mova         [t3+wq*4+24], m1
    add          wq, 8
    jl .v1_loop
    mov          r10, t2
    mov          t2, t1
    mov          t1, r10
    ret
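; .prep_n seeds the neighbor sums for the first output row pair: the 5x5
; a/b planes are combined with 5-6-5 column weights, the 3x3 planes with
; the alternating 4-4-4 / 3-4-3 weights used by the two-row output pattern.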
.prep_n: ; initial neighbor setup
    movif64      wq, r4
    movif32      wd, w1m
.prep_n_loop:
    movu         m0, [t4+wq*2+400*0+ 2]
    movu         m1, [t3+wq*4+400*0+ 4]
    movu         m2, [t3+wq*4+400*0+20]
    movu         m7, [t4+wq*2+400*0+ 4]
    movu         m8, [t3+wq*4+400*0+ 8]
    paddw        m3, m0, [t4+wq*2+400*0+ 0]
    paddd        m4, m1, [t3+wq*4+400*0+ 0]
    paddd        m5, m2, [t3+wq*4+400*0+16]
    paddw        m3, m7
    paddd        m4, m8
    movu         m7, [t3+wq*4+400*0+24]
    paddw        m0, m3
    paddd        m1, m4
    psllw        m3, 2
    pslld        m4, 2
    paddd        m5, m7
    paddd        m2, m5
    pslld        m5, 2
    paddw        m0, m3              ; a5 565
    paddd        m1, m4              ; b5 565
    paddd        m2, m5
    mova         [t4+wq*2+400* 6+ 0], m0
    mova         [t3+wq*4+400*12+ 0], m1
    mova         [t3+wq*4+400*12+16], m2
    movu         m0, [t4+wq*2+400*2+ 4]
    movu         m1, [t3+wq*4+400*4+ 8]
    movu         m2, [t3+wq*4+400*4+24]
    movu         m3, [t4+wq*2+400*2+ 2]
    movu         m4, [t3+wq*4+400*4+ 4]
    movu         m5, [t3+wq*4+400*4+20]
    paddw        m0, [t4+wq*2+400*2+ 0]
    paddd        m1, [t3+wq*4+400*4+ 0]
    paddd        m2, [t3+wq*4+400*4+16]
    paddw        m3, m0
    paddd        m4, m1
    paddd        m5, m2
    psllw        m3, 2               ; a3[-1] 444
    pslld        m4, 2               ; b3[-1] 444
    pslld        m5, 2
    psubw        m3, m0              ; a3[-1] 343
    psubd        m4, m1              ; b3[-1] 343
    psubd        m5, m2
    mova         [t4+wq*2+400* 8+ 0], m3
    mova         [t3+wq*4+400*16+ 0], m4
    mova         [t3+wq*4+400*16+16], m5
    movu         m0, [t4+wq*2+400*4+ 4]
    movu         m1, [t3+wq*4+400*8+ 8]
    movu         m2, [t3+wq*4+400*8+24]
    movu         m3, [t4+wq*2+400*4+ 2]
    movu         m4, [t3+wq*4+400*8+ 4]
    movu         m5, [t3+wq*4+400*8+20]
    paddw        m0, [t4+wq*2+400*4+ 0]
    paddd        m1, [t3+wq*4+400*8+ 0]
    paddd        m2, [t3+wq*4+400*8+16]
    paddw        m3, m0
    paddd        m4, m1
    paddd        m5, m2
    psllw        m3, 2               ; a3[ 0] 444
    pslld        m4, 2               ; b3[ 0] 444
    pslld        m5, 2
    mova         [t4+wq*2+400*10+ 0], m3
    mova         [t3+wq*4+400*20+ 0], m4
    mova         [t3+wq*4+400*20+16], m5
    psubw        m3, m0              ; a3[ 0] 343
    psubd        m4, m1              ; b3[ 0] 343
    psubd        m5, m2
    mova         [t4+wq*2+400*12+ 0], m3
    mova         [t3+wq*4+400*24+ 0], m4
    mova         [t3+wq*4+400*24+16], m5
    add          wq, 8
    jl .prep_n_loop
    ret
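; .n0 (even output rows): finish the neighbor-weighted a/b for both box
; sizes, then blend. Each filter's (b - a * src) term is reduced to 16
; bits and the two terms are packed into one dword (pd_0xffff mask), so a
; single pmaddwd against m15 applies the w0/w1 weights before rounding
; (pd_4096) and adding the result back onto the source pixels.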
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64      wq, r4
    movif32      wd, w1m
.n0_loop:
    movu         m0, [t4+wq*2+ 4]
    movu         m2, [t4+wq*2+ 2]
    paddw        m0, [t4+wq*2+ 0]
    paddw        m0, m2
    paddw        m2, m0
    psllw        m0, 2
    paddw        m0, m2              ; a5
    movu         m4, [t3+wq*4+ 8]
    movu         m5, [t3+wq*4+24]
    movu         m1, [t3+wq*4+ 4]
    movu         m3, [t3+wq*4+20]
    paddd        m4, [t3+wq*4+ 0]
    paddd        m5, [t3+wq*4+16]
    paddd        m4, m1
    paddd        m5, m3
    paddd        m1, m4
    paddd        m3, m5
    pslld        m4, 2
    pslld        m5, 2
    paddd        m4, m1              ; b5
    paddd        m5, m3
    movu         m2, [t4+wq*2+400* 6]
    paddw        m2, m0
    mova         [t4+wq*2+400* 6], m0
    paddd        m0, m4, [t3+wq*4+400*12+ 0]
    paddd        m1, m5, [t3+wq*4+400*12+16]
    mova         [t3+wq*4+400*12+ 0], m4
    mova         [t3+wq*4+400*12+16], m5
    mova         [rsp+16+ARCH_X86_32*4], m1
    movu         m3, [t4+wq*2+400*2+4]
    movu         m5, [t4+wq*2+400*2+2]
    paddw        m3, [t4+wq*2+400*2+0]
    paddw        m5, m3
    psllw        m5, 2               ; a3[ 1] 444
    psubw        m4, m5, m3          ; a3[ 1] 343
    movu         m3, [t4+wq*2+400* 8]
    paddw        m3, [t4+wq*2+400*10]
    paddw        m3, m4
    mova         [t4+wq*2+400* 8], m4
    mova         [t4+wq*2+400*10], m5
    movu         m1, [t3+wq*4+400*4+ 8]
    movu         m5, [t3+wq*4+400*4+ 4]
    movu         m7, [t3+wq*4+400*4+24]
    movu         m8, [t3+wq*4+400*4+20]
    paddd        m1, [t3+wq*4+400*4+ 0]
    paddd        m7, [t3+wq*4+400*4+16]
    paddd        m5, m1
    paddd        m8, m7
    pslld        m5, 2               ; b3[ 1] 444
    pslld        m8, 2
    psubd        m4, m5, m1          ; b3[ 1] 343
%if ARCH_X86_32
    mova         [esp+52], m8
    psubd        m8, m7
%else
    psubd        m6, m8, m7
    SWAP         m8, m6
%endif
    paddd        m1, m4, [t3+wq*4+400*16+ 0]
    paddd        m7, m8, [t3+wq*4+400*16+16]
    paddd        m1, [t3+wq*4+400*20+ 0]
    paddd        m7, [t3+wq*4+400*20+16]
    mova         [t3+wq*4+400*16+ 0], m4
    mova         [t3+wq*4+400*16+16], m8
    mova         [t3+wq*4+400*20+ 0], m5
%if ARCH_X86_32
    mova         m8, [esp+52]
%else
    SWAP         m8, m6
    pxor         m6, m6
%endif
    mova         [t3+wq*4+400*20+16], m8
    mova         [rsp+32+ARCH_X86_32*4], m7
    movq         m4, [dstq+wq]
    punpcklbw    m4, m6
    punpcklwd    m5, m4, m6
    punpcklwd    m7, m2, m6
    pmaddwd      m7, m5              ; a5 * src
    punpcklwd    m8, m3, m6
    pmaddwd      m8, m5              ; a3 * src
    punpckhwd    m5, m4, m6
    punpckhwd    m2, m6
    pmaddwd      m2, m5
    punpckhwd    m3, m6
    pmaddwd      m3, m5
    psubd        m0, m7              ; b5 - a5 * src + (1 << 8) - (src << 13)
    psubd        m1, m8              ; b3 - a3 * src + (1 << 8) - (src << 13)
    psrld        m0, 9
    pslld        m1, 7
    pand         m0, m9
    pandn        m8, m9, m1
    por          m0, m8
    mova         m1, [rsp+16+ARCH_X86_32*4]
    psubd        m1, m2
    mova         m2, [rsp+32+ARCH_X86_32*4]
    psubd        m2, m3
    mova         m3, [base+pd_4096]
    psrld        m1, 9
    pslld        m2, 7
    pand         m1, m9
    pandn        m5, m9, m2
    por          m1, m5
    pmaddwd      m0, m15
    pmaddwd      m1, m15
    paddd        m0, m3
    paddd        m1, m3
    psrad        m0, 13
    psrad        m1, 13
    packssdw     m0, m1
    paddw        m0, m4
    packuswb     m0, m0
    movq         [dstq+wq], m0
    add          wq, 8
    jl .n0_loop
    add          dstq, stridemp
    ret
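; .n1 (odd output rows): the 5x5 a/b computed on the preceding even row is
; reused from its ring buffers (t4+400*6, t3+400*12), so only the 3x3
; neighbor sums are updated before the same w0/w1 blend as in .n0.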
ALIGN function_align
.n1: ; neighbor + output (odd rows)
    movif64      wq, r4
    movif32      wd, w1m
.n1_loop:
    movu         m3, [t4+wq*2+400*4+4]
    movu         m5, [t4+wq*2+400*4+2]
    paddw        m3, [t4+wq*2+400*4+0]
    paddw        m5, m3
    psllw        m5, 2               ; a3[ 1] 444
    psubw        m4, m5, m3          ; a3[ 1] 343
    paddw        m3, m4, [t4+wq*2+400*12]
    paddw        m3, [t4+wq*2+400*10]
    mova         [t4+wq*2+400*10], m5
    mova         [t4+wq*2+400*12], m4
    movu         m1, [t3+wq*4+400*8+ 8]
    movu         m5, [t3+wq*4+400*8+ 4]
    movu         m7, [t3+wq*4+400*8+24]
    movu         m8, [t3+wq*4+400*8+20]
    paddd        m1, [t3+wq*4+400*8+ 0]
    paddd        m7, [t3+wq*4+400*8+16]
    paddd        m5, m1
    paddd        m8, m7
    pslld        m5, 2               ; b3[ 1] 444
    pslld        m8, 2
    psubd        m4, m5, m1          ; b3[ 1] 343
    psubd        m0, m8, m7
    paddd        m1, m4, [t3+wq*4+400*24+ 0]
    paddd        m7, m0, [t3+wq*4+400*24+16]
    paddd        m1, [t3+wq*4+400*20+ 0]
    paddd        m7, [t3+wq*4+400*20+16]
    mova         [t3+wq*4+400*20+ 0], m5
    mova         [t3+wq*4+400*20+16], m8
    mova         [t3+wq*4+400*24+ 0], m4
    mova         [t3+wq*4+400*24+16], m0
    movq         m5, [dstq+wq]
    mova         m2, [t4+wq*2+400* 6]
    punpcklbw    m5, m6
    punpcklwd    m4, m5, m6
    punpcklwd    m8, m2, m6
    pmaddwd      m8, m4              ; a5 * src
    punpcklwd    m0, m3, m6
    pmaddwd      m0, m4              ; a3 * src
    punpckhwd    m4, m5, m6
    punpckhwd    m2, m6
    pmaddwd      m2, m4
    punpckhwd    m3, m6
    pmaddwd      m3, m4
    psubd        m1, m0              ; b3 - a3 * src + (1 << 8) - (src << 13)
    mova         m0, [t3+wq*4+400*12+ 0]
    psubd        m0, m8              ; b5 - a5 * src + (1 << 8) - (src << 13)
    mova         m4, [t3+wq*4+400*12+16]
    psubd        m4, m2
    psubd        m7, m3
    pslld        m1, 7
    psrld        m0, 8
    psrld        m4, 8
    pslld        m7, 7
    pandn        m3, m9, m1
    pand         m0, m9
    por          m0, m3
    pand         m4, m9
    pandn        m2, m9, m7
    por          m2, m4
    mova         m1, [base+pd_4096]
    pmaddwd      m0, m15
    pmaddwd      m2, m15
    paddd        m0, m1
    paddd        m2, m1
    psrad        m0, 13
    psrad        m2, 13
    packssdw     m0, m2
    paddw        m0, m5
    packuswb     m0, m0
    movq         [dstq+wq], m0
    add          wq, 8
    jl .n1_loop
    add          dstq, stridemp
    movif32      dstm, dstq
    ret