looprestoration16_sse.asm (102502B)
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

wiener_shufA:  db  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11
wiener_shufB:  db  6,  7,  4,  5,  8,  9,  6,  7, 10, 11,  8,  9, 12, 13, 10, 11
wiener_shufC:  db  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
wiener_shufD:  db  2,  3, -1, -1,  4,  5, -1, -1,  6,  7, -1, -1,  8,  9, -1, -1
wiener_shufE:  db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
wiener_lshuf5: db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
wiener_lshuf7: db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
sgr_lshuf3:    db  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_lshuf5:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9
pb_0to15:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15

pb_m14_m13:    times 8 db -14,-13
pb_m10_m9:     times 8 db -10, -9
pb_m6_m5:      times 8 db  -6, -5
pb_m2_m1:      times 8 db  -2, -1
pb_2_3:        times 8 db   2,  3
pb_6_7:        times 8 db   6,  7
pw_256:        times 8 dw 256
pw_1023:       times 8 dw 1023
pw_164_24:     times 4 dw 164, 24
pw_455_24:     times 4 dw 455, 24
pd_8:          times 4 dd 8
pd_4096:       times 4 dd 4096
pd_34816:      times 4 dd 34816
pd_m262128:    times 4 dd -262128
pd_0xffff:     times 4 dd 0xffff
pd_0xfffffff0: times 4 dd 0xfffffff0
pf_256:        times 4 dd 256.0

wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
wiener_round:  dd 1049600, 1048832

SECTION .text

%macro movif64 2 ; dst, src
%if ARCH_X86_64
    mov %1, %2
%endif
%endmacro

%macro movif32 2 ; dst, src
%if ARCH_X86_32
    mov %1, %2
%endif
%endmacro

INIT_XMM ssse3
%if ARCH_X86_32
DECLARE_REG_TMP 5, 6
%if STACK_ALIGNMENT < 16
%assign extra_stack 13*16
%else
%assign extra_stack 12*16
%endif
cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \
                              dst, stride, left, lpf, w, flt
%if STACK_ALIGNMENT < 16
%define lpfm  dword [esp+calloff+16*12+ 0]
%define wm    dword [esp+calloff+16*12+ 4]
%define hd    dword [esp+calloff+16*12+ 8]
%define edgeb byte  [esp+calloff+16*12+12]
%define edged dword [esp+calloff+16*12+12]
%else
%define hd dword r5m
%define edgeb byte r7m
%endif
%define PICmem dword [esp+calloff+4*0]
%define t0m    dword [esp+calloff+4*1] ; wiener ring buffer pointers
%define t1m    dword [esp+calloff+4*2]
%define t2m    dword [esp+calloff+4*3]
%define t3m    dword [esp+calloff+4*4]
%define t4m    dword [esp+calloff+4*5]
%define t5m    dword [esp+calloff+4*6]
%define t6m    dword [esp+calloff+4*7]
%define t2 t2m
%define t3 t3m
%define t4 t4m
%define t5 t5m
%define t6 t6m
%define m8  [esp+calloff+16*2]
%define m9  [esp+calloff+16*3]
%define m10 [esp+calloff+16*4]
%define m11 [esp+calloff+16*5]
%define m12 [esp+calloff+16*6]
%define m13 [esp+calloff+16*7]
%define m14 [esp+calloff+16*8]
%define m15 [esp+calloff+16*9]
%define r10 r4
%define base t0-wiener_shifts
%assign calloff 0
%if STACK_ALIGNMENT < 16
    mov       wd, [rstk+stack_offset+20]
    mov       wm, wd
    mov       r5, [rstk+stack_offset+24]
    mov       hd, r5
    mov       r5, [rstk+stack_offset+32]
    mov       edged, r5 ; edge
%endif
%else
DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
                                                     w, h, edge, flt
%define base
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
    movifnidn wd, wm
%endif
%if ARCH_X86_64
    mov       fltq, r6mp
    movifnidn hd, hm
    mov       edged, r7m
    mov       t3d, r8m ; pixel_max
    movq      m13, [fltq]
    movq      m15, [fltq+16]
%else
%if STACK_ALIGNMENT < 16
    mov       t0, [rstk+stack_offset+28]
    mov       t1, [rstk+stack_offset+36] ; pixel_max
    movq      m1, [t0]    ; fx
    movq      m3, [t0+16] ; fy
    LEA       t0, wiener_shifts
%else
    mov       fltq, r6m
    movq      m1, [fltq]
    movq      m3, [fltq+16]
    LEA       t0, wiener_shifts
    mov       t1, r8m ; pixel_max
%endif
    mov       PICmem, t0
%endif
    mova      m6, [base+wiener_shufA]
    mova      m7, [base+wiener_shufB]
%if ARCH_X86_64
    lea       t4, [wiener_shifts]
    add       wd, wd
    pshufd    m12, m13, q0000 ; x0 x1
    pshufd    m13, m13, q1111 ; x2 x3
    pshufd    m14, m15, q0000 ; y0 y1
    pshufd    m15, m15, q1111 ; y2 y3
    mova      m8, [wiener_shufC]
    mova      m9, [wiener_shufD]
    add       lpfq, wq
    lea       t1, [rsp+wq+16]
    add       dstq, wq
    neg       wq
    shr       t3d, 11
%define base t4-wiener_shifts
    movd      m10, [base+wiener_round+t3*4]
    movq      m11, [base+wiener_shifts+t3*8]
    pshufd    m10, m10, q0000
    pshufd    m0, m11, q0000
    pshufd    m11, m11, q1111
    pmullw    m12, m0 ; upshift filter coefs to make the
    pmullw    m13, m0 ; horizontal downshift constant
DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
%define lpfm [rsp]
%define base
%define wiener_lshuf7_mem [wiener_lshuf7]
%define pd_m262128_mem [pd_m262128]
%else
    add       wd, wd
    mova      m4, [base+wiener_shufC]
    mova      m5, [base+wiener_shufD]
    pshufd    m0, m1, q0000
    pshufd    m1, m1, q1111
    pshufd    m2, m3, q0000
    pshufd    m3, m3, q1111
    mova      m8, m4
    mova      m9, m5
    mova      m14, m2
    mova      m15, m3
    shr       t1, 11
    add       lpfq, wq
    mova      m3, [base+pd_m262128]
    movd      m4, [base+wiener_round+t1*4]
    movq      m5, [base+wiener_shifts+t1*8]
    lea       t1, [esp+extra_stack+wq+16]
    add       dstq, wq
    neg       wq
    pshufd    m4, m4, q0000
    pshufd    m2, m5, q0000
    pshufd    m5, m5, q1111
    mov       wm, wq
    pmullw    m0, m2
    pmullw    m1, m2
    mova      m2, [base+wiener_lshuf7]
%define pd_m262128_mem [esp+calloff+16*10]
    mova      pd_m262128_mem, m3
    mova      m10, m4
    mova      m11, m5
    mova      m12, m0
    mova      m13, m1
%define wiener_lshuf7_mem [esp+calloff+16*11]
    mova      wiener_lshuf7_mem, m2
%endif
    test      edgeb, 4 ; LR_HAVE_TOP
    jz        .no_top
    call      .h_top
    add       lpfq, strideq
    mov       t6, t1
    mov       t5, t1
    add       t1, 384*2
    call      .h_top
    lea       r10, [lpfq+strideq*4]
    mov       lpfq, dstq
    mov       t4, t1
    add       t1, 384*2
    add       r10, strideq
    mov       lpfm, r10 ; below
    call      .h
    mov       t3, t1
    mov       t2, t1
    dec       hd
    jz        .v1
    add       lpfq, strideq
    add       t1, 384*2
    call      .h
    mov       t2, t1
    dec       hd
    jz        .v2
    add       lpfq, strideq
    add       t1, 384*2
    call      .h
    dec       hd
    jz        .v3
.main:
    lea       t0, [t1+384*2]
.main_loop:
    call      .hv
    dec       hd
    jnz       .main_loop
    test      edgeb, 8 ; LR_HAVE_BOTTOM
    jz        .v3
    mov       lpfq, lpfm
    call      .hv_bottom
    add       lpfq, strideq
    call      .hv_bottom
.v1:
    call      .v
    RET
.no_top:
    lea       r10, [lpfq+strideq*4]
    mov       lpfq, dstq
    lea       r10, [r10+strideq*2]
    mov       lpfm, r10
    call      .h
    mov       t6, t1
    mov       t5, t1
    mov       t4, t1
    mov       t3, t1
    mov       t2, t1
    dec       hd
    jz        .v1
    add       lpfq, strideq
    add       t1, 384*2
    call      .h
    mov       t2, t1
    dec       hd
    jz        .v2
    add       lpfq, strideq
    add       t1, 384*2
    call      .h
    dec       hd
    jz        .v3
    lea       t0, [t1+384*2]
    call      .hv
    dec       hd
    jz        .v3
    add       t0, 384*8
    call      .hv
    dec       hd
    jnz       .main
.v3:
    call      .v
    movif32   wq, wm
.v2:
    call      .v
    movif32   wq, wm
    jmp       .v1
.extend_right:
%assign stack_offset stack_offset+8
%assign calloff 8
    movif32   t0, PICmem
    pxor      m0, m0
    movd      m1, wd
    mova      m2, [base+pb_0to15]
    pshufb    m1, m0
    mova      m0, [base+pb_6_7]
    psubb     m0, m1
    pminub    m0, m2
    pshufb    m3, m0
    mova      m0, [base+pb_m2_m1]
    psubb     m0, m1
    pminub    m0, m2
    pshufb    m4, m0
    mova      m0, [base+pb_m10_m9]
    psubb     m0, m1
    pminub    m0, m2
    pshufb    m5, m0
    movif32   t0, t0m
    ret
%assign stack_offset stack_offset-4
%assign calloff 4
.h:
    movif64   wq, r4
    movif32   wq, wm
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .h_extend_left
    movq      m3, [leftq]
    movhps    m3, [lpfq+wq]
    add       leftq, 8
    jmp       .h_main
.h_extend_left:
    mova      m3, [lpfq+wq]         ; avoid accessing memory located
    pshufb    m3, wiener_lshuf7_mem ; before the start of the buffer
    jmp       .h_main
.h_top:
    movif64   wq, r4
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .h_extend_left
.h_loop:
    movu      m3, [lpfq+wq-8]
.h_main:
    mova      m4, [lpfq+wq+0]
    movu      m5, [lpfq+wq+8]
    test      edgeb, 2 ; LR_HAVE_RIGHT
    jnz       .h_have_right
    cmp       wd, -20
    jl        .h_have_right
    call      .extend_right
.h_have_right:
    pshufb    m0, m3, m6
    pshufb    m1, m4, m7
    paddw     m0, m1
    pshufb    m3, m8
    pmaddwd   m0, m12
    pshufb    m1, m4, m9
    paddw     m3, m1
    pshufb    m1, m4, m6
    pmaddwd   m3, m13
    pshufb    m2, m5, m7
    paddw     m1, m2
    mova      m2, pd_m262128_mem ; (1 << 4) - (1 << 18)
    pshufb    m4, m8
    pmaddwd   m1, m12
    pshufb    m5, m9
    paddw     m4, m5
    pmaddwd   m4, m13
    paddd     m0, m2
    paddd     m1, m2
    paddd     m0, m3
    paddd     m1, m4
    psrad     m0, 4
    psrad     m1, 4
    packssdw  m0, m1
    psraw     m0, 1
    mova      [t1+wq], m0
    add       wq, 16
    jl        .h_loop
    movif32   wq, wm
    ret
ALIGN function_align
.hv:
    add       lpfq, strideq
    movif64   wq, r4
    movif32   t0m, t0
    movif32   t1m, t1
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .hv_extend_left
    movq      m3, [leftq]
    movhps    m3, [lpfq+wq]
    add       leftq, 8
    jmp       .hv_main
.hv_extend_left:
    mova      m3, [lpfq+wq]
    pshufb    m3, wiener_lshuf7_mem
    jmp       .hv_main
.hv_bottom:
    movif64   wq, r4
    movif32   t0m, t0
    movif32   t1m, t1
    test      edgeb, 1 ; LR_HAVE_LEFT
    jz        .hv_extend_left
.hv_loop:
    movu      m3, [lpfq+wq-8]
.hv_main:
    mova      m4, [lpfq+wq+0]
    movu      m5, [lpfq+wq+8]
    test      edgeb, 2 ; LR_HAVE_RIGHT
    jnz       .hv_have_right
    cmp       wd, -20
    jl        .hv_have_right
    call      .extend_right
.hv_have_right:
    movif32   t1, t4m
    movif32   t0, t2m
    pshufb    m0, m3, m6
    pshufb    m1, m4, m7
    paddw     m0, m1
    pshufb    m3, m8
    pmaddwd   m0, m12
    pshufb    m1, m4, m9
    paddw     m3, m1
    pshufb    m1, m4, m6
    pmaddwd   m3, m13
    pshufb    m2, m5, m7
    paddw     m1, m2
    mova      m2, pd_m262128_mem
    pshufb    m4, m8
    pmaddwd   m1, m12
    pshufb    m5, m9
    paddw     m4, m5
    pmaddwd   m4, m13
    paddd     m0, m2
    paddd     m1, m2
%if ARCH_X86_64
    mova      m2, [t4+wq]
    paddw     m2, [t2+wq]
    mova      m5, [t3+wq]
%else
    mova      m2, [t1+wq]
    paddw     m2, [t0+wq]
    mov       t1, t3m
    mov       t0, t5m
    mova      m5, [t1+wq]
    mov       t1, t1m
%endif
    paddd     m0, m3
    paddd     m1, m4
    psrad     m0, 4
    psrad     m1, 4
    packssdw  m0, m1
%if ARCH_X86_64
    mova      m4, [t5+wq]
    paddw     m4, [t1+wq]
    psraw     m0, 1
    paddw     m3, m0, [t6+wq]
%else
    mova      m4, [t0+wq]
    paddw     m4, [t1+wq]
    mov       t0, t0m
    mov       t1, t6m
    psraw     m0, 1
    paddw     m3, m0, [t1+wq]
%endif
    mova      [t0+wq], m0
    punpcklwd m0, m2, m5
    pmaddwd   m0, m15
    punpckhwd m2, m5
    pmaddwd   m2, m15
    punpcklwd m1, m3, m4
    pmaddwd   m1, m14
    punpckhwd m3, m4
    pmaddwd   m3, m14
    paddd     m0, m10
    paddd     m2, m10
    paddd     m0, m1
    paddd     m2, m3
    psrad     m0, 6
    psrad     m2, 6
    packssdw  m0, m2
    pmulhw    m0, m11
    pxor      m1, m1
    pmaxsw    m0, m1
    mova      [dstq+wq], m0
    add       wq, 16
    jl        .hv_loop
%if ARCH_X86_64
    mov       t6, t5
    mov       t5, t4
    mov       t4, t3
    mov       t3, t2
    mov       t2, t1
    mov       t1, t0
    mov       t0, t6
%else
    mov       r4, t5m
    mov       t1, t4m
    mov       t6m, r4
    mov       t5m, t1
    mov       r4, t3m
    mov       t1, t2m
    mov       t4m, r4
    mov       t3m, t1
    mov       r4, t1m
    mov       t1, t0
    mov       t2m, r4
    mov       t0, t6m
    mov       wq, wm
%endif
    add       dstq, strideq
    ret
.v:
    movif64   wq, r4
    movif32   t0m, t0
    movif32   t1m, t1
.v_loop:
%if ARCH_X86_64
    mova      m1, [t4+wq]
    paddw     m1, [t2+wq]
    mova      m2, [t3+wq]
    mova      m4, [t1+wq]
    paddw     m3, m4, [t6+wq]
    paddw     m4, [t5+wq]
%else
    mov       t0, t4m
    mov       t1, t2m
    mova      m1, [t0+wq]
    paddw     m1, [t1+wq]
    mov       t0, t3m
    mov       t1, t1m
    mova      m2, [t0+wq]
    mova      m4, [t1+wq]
    mov       t0, t6m
    mov       t1, t5m
    paddw     m3, m4, [t0+wq]
    paddw     m4, [t1+wq]
%endif
    punpcklwd m0, m1, m2
    pmaddwd   m0, m15
    punpckhwd m1, m2
    pmaddwd   m1, m15
    punpcklwd m2, m3, m4
    pmaddwd   m2, m14
    punpckhwd m3, m4
    pmaddwd   m3, m14
    paddd     m0, m10
    paddd     m1, m10
    paddd     m0, m2
    paddd     m1, m3
    psrad     m0, 6
    psrad     m1, 6
    packssdw  m0, m1
    pmulhw    m0, m11
    pxor      m1, m1
    pmaxsw    m0, m1
    mova      [dstq+wq], m0
    add       wq, 16
    jl        .v_loop
%if ARCH_X86_64
    mov       t6, t5
    mov       t5, t4
    mov       t4, t3
    mov       t3, t2
    mov       t2, t1
%else
    mov       t0, t5m
    mov       t1, t4m
    mov       r4, t3m
    mov       t6m, t0
    mov       t5m, t1
    mov       t4m, r4
    mov       r4, t2m
    mov       t1, t1m
    mov       t0, t0m
    mov       t3m, r4
    mov       t2m, t1
%endif
    add       dstq, strideq
    ret

%if ARCH_X86_32
%if STACK_ALIGNMENT < 16
%assign stack_size 12*16+384*8
%else
%assign stack_size 11*16+384*8
%endif
cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \
                              lpf,
w, flt 600 %if STACK_ALIGNMENT < 16 601 %define lpfm dword [esp+calloff+4*6] 602 %define wm dword [esp+calloff+4*7] 603 %define hd dword [esp+calloff+16*10+0] 604 %define edgeb byte [esp+calloff+16*10+4] 605 %define edged dword [esp+calloff+16*10+4] 606 %else 607 %define hd dword r5m 608 %define edgeb byte r7m 609 %endif 610 %define PICmem dword [esp+calloff+4*0] 611 %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers 612 %define t1m dword [esp+calloff+4*2] 613 %define t2m dword [esp+calloff+4*3] 614 %define t3m dword [esp+calloff+4*4] 615 %define t4m dword [esp+calloff+4*5] 616 %define t2 t2m 617 %define t3 t3m 618 %define t4 t4m 619 %define m8 [esp+calloff+16*2] 620 %define m9 [esp+calloff+16*3] 621 %define m10 [esp+calloff+16*4] 622 %define m11 [esp+calloff+16*5] 623 %define m12 [esp+calloff+16*6] 624 %define m13 [esp+calloff+16*7] 625 %define m14 [esp+calloff+16*8] 626 %define m15 [esp+calloff+16*9] 627 %define base t0-wiener_shifts 628 %assign calloff 0 629 %if STACK_ALIGNMENT < 16 630 mov wd, [rstk+stack_offset+20] 631 mov wm, wd 632 mov r5, [rstk+stack_offset+24] 633 mov hd, r5 634 mov r5, [rstk+stack_offset+32] 635 mov edged, r5 ; edge 636 %endif 637 %else 638 cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \ 639 w, h, edge, flt 640 %define base 641 %endif 642 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 643 movifnidn wd, wm 644 %endif 645 %if ARCH_X86_64 646 mov fltq, r6mp 647 movifnidn hd, hm 648 mov edged, r7m 649 mov t3d, r8m ; pixel_max 650 movq m12, [fltq] 651 movq m14, [fltq+16] 652 %else 653 %if STACK_ALIGNMENT < 16 654 mov t0, [rstk+stack_offset+28] 655 mov t1, [rstk+stack_offset+36] ; pixel_max 656 movq m1, [t0] ; fx 657 movq m3, [t0+16] ; fy 658 LEA t0, wiener_shifts 659 %else 660 mov fltq, r6m 661 movq m1, [fltq] 662 movq m3, [fltq+16] 663 LEA t0, wiener_shifts 664 mov t1, r8m ; pixel_max 665 %endif 666 mov PICmem, t0 667 %endif 668 mova m5, [base+wiener_shufE] 669 mova m6, [base+wiener_shufB] 670 mova m7, [base+wiener_shufD] 671 %if ARCH_X86_64 672 lea t4, [wiener_shifts] 673 add wd, wd 674 punpcklwd m11, m12, m12 675 pshufd m11, m11, q1111 ; x1 676 pshufd m12, m12, q1111 ; x2 x3 677 punpcklwd m13, m14, m14 678 pshufd m13, m13, q1111 ; y1 679 pshufd m14, m14, q1111 ; y2 y3 680 shr t3d, 11 681 mova m8, [pd_m262128] ; (1 << 4) - (1 << 18) 682 add lpfq, wq 683 lea t1, [rsp+wq+16] 684 add dstq, wq 685 neg wq 686 %define base t4-wiener_shifts 687 movd m9, [base+wiener_round+t3*4] 688 movq m10, [base+wiener_shifts+t3*8] 689 pshufd m9, m9, q0000 690 pshufd m0, m10, q0000 691 pshufd m10, m10, q1111 692 mova m15, [wiener_lshuf5] 693 pmullw m11, m0 694 pmullw m12, m0 695 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 696 %define lpfm [rsp] 697 %define base 698 %else 699 add wd, wd 700 punpcklwd m0, m1, m1 701 pshufd m0, m0, q1111 ; x1 702 pshufd m1, m1, q1111 ; x2 x3 703 punpcklwd m2, m3, m3 704 pshufd m2, m2, q1111 ; y1 705 pshufd m3, m3, q1111 ; y2 y3 706 mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18) 707 mova m13, m2 708 mova m14, m3 709 mova m8, m4 710 shr t1, 11 711 add lpfq, wq 712 movd m2, [base+wiener_round+t1*4] 713 movq m3, [base+wiener_shifts+t1*8] 714 %if STACK_ALIGNMENT < 16 715 lea t1, [esp+16*11+wq+16] 716 %else 717 lea t1, [esp+16*10+wq+16] 718 %endif 719 add dstq, wq 720 neg wq 721 pshufd m2, m2, q0000 722 pshufd m4, m3, q0000 723 pshufd m3, m3, q1111 724 mov wm, wq 725 pmullw m0, m4 726 pmullw m1, m4 727 mova m4, [base+wiener_lshuf5] 728 mova m9, m2 729 mova m10, m3 730 mova m11, m0 731 mova m12, m1 732 mova 
m15, m4 733 %endif 734 test edgeb, 4 ; LR_HAVE_TOP 735 jz .no_top 736 call .h_top 737 add lpfq, strideq 738 mov t4, t1 739 add t1, 384*2 740 call .h_top 741 lea r10, [lpfq+strideq*4] 742 mov lpfq, dstq 743 mov t3, t1 744 add t1, 384*2 745 add r10, strideq 746 mov lpfm, r10 ; below 747 call .h 748 mov t2, t1 749 dec hd 750 jz .v1 751 add lpfq, strideq 752 add t1, 384*2 753 call .h 754 dec hd 755 jz .v2 756 .main: 757 mov t0, t4 758 .main_loop: 759 call .hv 760 dec hd 761 jnz .main_loop 762 test edgeb, 8 ; LR_HAVE_BOTTOM 763 jz .v2 764 mov lpfq, lpfm 765 call .hv_bottom 766 add lpfq, strideq 767 call .hv_bottom 768 .end: 769 RET 770 .no_top: 771 lea r10, [lpfq+strideq*4] 772 mov lpfq, dstq 773 lea r10, [r10+strideq*2] 774 mov lpfm, r10 775 call .h 776 mov t4, t1 777 mov t3, t1 778 mov t2, t1 779 dec hd 780 jz .v1 781 add lpfq, strideq 782 add t1, 384*2 783 call .h 784 dec hd 785 jz .v2 786 lea t0, [t1+384*2] 787 call .hv 788 dec hd 789 jz .v2 790 add t0, 384*6 791 call .hv 792 dec hd 793 jnz .main 794 .v2: 795 call .v 796 %if ARCH_X86_64 797 mov t4, t3 798 mov t3, t2 799 mov t2, t1 800 %else 801 mov t0, t3m 802 mov r4, t2m 803 mov t1, t1m 804 mov t4m, t0 805 mov t3m, r4 806 mov t2m, t1 807 mov wq, wm 808 %endif 809 add dstq, strideq 810 .v1: 811 call .v 812 jmp .end 813 .extend_right: 814 %assign stack_offset stack_offset+8 815 %assign calloff 8 816 movif32 t0, PICmem 817 pxor m1, m1 818 movd m2, wd 819 mova m0, [base+pb_2_3] 820 pshufb m2, m1 821 mova m1, [base+pb_m6_m5] 822 psubb m0, m2 823 psubb m1, m2 824 mova m2, [base+pb_0to15] 825 pminub m0, m2 826 pminub m1, m2 827 pshufb m3, m0 828 pshufb m4, m1 829 ret 830 %assign stack_offset stack_offset-4 831 %assign calloff 4 832 .h: 833 movif64 wq, r4 834 movif32 wq, wm 835 test edgeb, 1 ; LR_HAVE_LEFT 836 jz .h_extend_left 837 mova m4, [lpfq+wq] 838 movd m3, [leftq+4] 839 pslldq m4, 4 840 por m3, m4 841 add leftq, 8 842 jmp .h_main 843 .h_extend_left: 844 mova m3, [lpfq+wq] ; avoid accessing memory located 845 pshufb m3, m15 ; before the start of the buffer 846 jmp .h_main 847 .h_top: 848 movif64 wq, r4 849 movif32 wq, wm 850 test edgeb, 1 ; LR_HAVE_LEFT 851 jz .h_extend_left 852 .h_loop: 853 movu m3, [lpfq+wq-4] 854 .h_main: 855 movu m4, [lpfq+wq+4] 856 test edgeb, 2 ; LR_HAVE_RIGHT 857 jnz .h_have_right 858 cmp wd, -18 859 jl .h_have_right 860 call .extend_right 861 .h_have_right: 862 pshufb m0, m3, m5 863 pmaddwd m0, m11 864 pshufb m1, m4, m5 865 pmaddwd m1, m11 866 pshufb m2, m3, m6 867 pshufb m3, m7 868 paddw m2, m3 869 pshufb m3, m4, m6 870 pmaddwd m2, m12 871 pshufb m4, m7 872 paddw m3, m4 873 pmaddwd m3, m12 874 paddd m0, m8 875 paddd m1, m8 876 paddd m0, m2 877 paddd m1, m3 878 psrad m0, 4 879 psrad m1, 4 880 packssdw m0, m1 881 psraw m0, 1 882 mova [t1+wq], m0 883 add wq, 16 884 jl .h_loop 885 movif32 wq, wm 886 ret 887 ALIGN function_align 888 .hv: 889 add lpfq, strideq 890 movif64 wq, r4 891 movif32 t0m, t0 892 movif32 t1m, t1 893 test edgeb, 1 ; LR_HAVE_LEFT 894 jz .hv_extend_left 895 mova m4, [lpfq+wq] 896 movd m3, [leftq+4] 897 pslldq m4, 4 898 por m3, m4 899 add leftq, 8 900 jmp .hv_main 901 .hv_extend_left: 902 mova m3, [lpfq+wq] 903 pshufb m3, m15 904 jmp .hv_main 905 .hv_bottom: 906 movif64 wq, r4 907 movif32 t0m, t0 908 movif32 t1m, t1 909 test edgeb, 1 ; LR_HAVE_LEFT 910 jz .hv_extend_left 911 .hv_loop: 912 movu m3, [lpfq+wq-4] 913 .hv_main: 914 movu m4, [lpfq+wq+4] 915 test edgeb, 2 ; LR_HAVE_RIGHT 916 jnz .hv_have_right 917 cmp wd, -18 918 jl .hv_have_right 919 call .extend_right 920 .hv_have_right: 921 movif32 t1, t1m 
922 movif32 t0, t3m 923 pshufb m0, m3, m5 924 pmaddwd m0, m11 925 pshufb m1, m4, m5 926 pmaddwd m1, m11 927 pshufb m2, m3, m6 928 pshufb m3, m7 929 paddw m2, m3 930 pshufb m3, m4, m6 931 pmaddwd m2, m12 932 pshufb m4, m7 933 paddw m3, m4 934 pmaddwd m3, m12 935 paddd m0, m8 936 paddd m1, m8 937 paddd m0, m2 938 %if ARCH_X86_64 939 mova m2, [t3+wq] 940 paddw m2, [t1+wq] 941 paddd m1, m3 942 mova m4, [t2+wq] 943 %else 944 mova m2, [t0+wq] 945 mov t0, t2m 946 paddw m2, [t1+wq] 947 mov t1, t4m 948 paddd m1, m3 949 mova m4, [t0+wq] 950 mov t0, t0m 951 %endif 952 punpckhwd m3, m2, m4 953 pmaddwd m3, m14 954 punpcklwd m2, m4 955 %if ARCH_X86_64 956 mova m4, [t4+wq] 957 %else 958 mova m4, [t1+wq] 959 %endif 960 psrad m0, 4 961 psrad m1, 4 962 packssdw m0, m1 963 pmaddwd m2, m14 964 psraw m0, 1 965 mova [t0+wq], m0 966 punpckhwd m1, m0, m4 967 pmaddwd m1, m13 968 punpcklwd m0, m4 969 pmaddwd m0, m13 970 paddd m3, m9 971 paddd m2, m9 972 paddd m1, m3 973 paddd m0, m2 974 psrad m1, 6 975 psrad m0, 6 976 packssdw m0, m1 977 pmulhw m0, m10 978 pxor m1, m1 979 pmaxsw m0, m1 980 mova [dstq+wq], m0 981 add wq, 16 982 jl .hv_loop 983 %if ARCH_X86_64 984 mov t4, t3 985 mov t3, t2 986 mov t2, t1 987 mov t1, t0 988 mov t0, t4 989 %else 990 mov r4, t3m 991 mov t1, t2m 992 mov t4m, r4 993 mov t3m, t1 994 mov r4, t1m 995 mov t1, t0 996 mov t2m, r4 997 mov t0, t4m 998 mov wq, wm 999 %endif 1000 add dstq, strideq 1001 ret 1002 .v: 1003 movif64 wq, r4 1004 movif32 t1m, t1 1005 .v_loop: 1006 %if ARCH_X86_64 1007 mova m0, [t1+wq] 1008 paddw m2, m0, [t3+wq] 1009 mova m1, [t2+wq] 1010 mova m4, [t4+wq] 1011 %else 1012 mov t0, t3m 1013 mova m0, [t1+wq] 1014 mov t1, t2m 1015 paddw m2, m0, [t0+wq] 1016 mov t0, t4m 1017 mova m1, [t1+wq] 1018 mova m4, [t0+wq] 1019 %endif 1020 punpckhwd m3, m2, m1 1021 pmaddwd m3, m14 1022 punpcklwd m2, m1 1023 pmaddwd m2, m14 1024 punpckhwd m1, m0, m4 1025 pmaddwd m1, m13 1026 punpcklwd m0, m4 1027 pmaddwd m0, m13 1028 paddd m3, m9 1029 paddd m2, m9 1030 paddd m1, m3 1031 paddd m0, m2 1032 psrad m1, 6 1033 psrad m0, 6 1034 packssdw m0, m1 1035 pmulhw m0, m10 1036 pxor m1, m1 1037 pmaxsw m0, m1 1038 mova [dstq+wq], m0 1039 add wq, 16 1040 %if ARCH_X86_64 1041 jl .v_loop 1042 %else 1043 jge .v_end 1044 mov t1, t1m 1045 jmp .v_loop 1046 .v_end: 1047 %endif 1048 ret 1049 1050 %macro MUL_32X16X2 6 ; dst[1-2], src[1-2], tmp[1-2] 1051 pmulhuw %5, %1, %3 1052 pmulhuw %6, %2, %4 1053 pmullw %1, %3 1054 pmullw %2, %4 1055 pslld %5, 16 1056 pslld %6, 16 1057 paddd %1, %5 1058 paddd %2, %6 1059 %endmacro 1060 1061 %macro SGR_CALC_X 10 ; BB_dst, BB_src, b, tmp, an[1-2], zero, s, b_mul, pf_256 1062 punpcklwd %4, %3, %7 1063 punpckhwd %3, %7 1064 pmaddwd %4, %4 ; b * b 1065 pmaddwd %3, %3 1066 punpcklwd %1, %2, %7 ; BB 1067 punpckhwd %2, %7 1068 psubd %5, %4 ; a * n - b * b 1069 psubd %6, %3 1070 pcmpgtd %4, %5, %7 1071 pcmpgtd %3, %6, %7 1072 pand %5, %4 ; p 1073 pand %6, %3 1074 MUL_32X16X2 %5, %6, %8, %8, %4, %3 ; p * s 1075 paddw %5, %9 1076 paddw %6, %9 1077 psrld %5, 20 ; z + 1 1078 psrld %6, 20 1079 cvtdq2ps %5, %5 1080 cvtdq2ps %6, %6 1081 pmaddwd %1, %9 ; BB * 164 1082 pmaddwd %2, %9 1083 rcpps %3, %5 ; 1 / (z + 1) 1084 rcpps %4, %6 1085 cmpltps %5, %10 1086 cmpltps %6, %10 1087 mulps %3, %10 ; 256 / (z + 1) 1088 mulps %4, %10 1089 packssdw %5, %6 1090 cvtps2dq %3, %3 1091 cvtps2dq %4, %4 1092 psrlw %5, 8 ; z < 255 ? 
255 : 0 1093 packssdw %3, %4 1094 pminsw %3, %5 ; x 1095 %endmacro 1096 1097 %if ARCH_X86_32 1098 DECLARE_REG_TMP 0, 1, 2, 3, 5 1099 %if STACK_ALIGNMENT < 16 1100 %assign extra_stack 5*16 1101 %else 1102 %assign extra_stack 3*16 1103 %endif 1104 cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \ 1105 dst, stride, left, lpf, w 1106 %if STACK_ALIGNMENT < 16 1107 %define dstm dword [esp+calloff+16*0+4*6] 1108 %define stridemp dword [esp+calloff+16*0+4*7] 1109 %define leftm dword [esp+calloff+16*3+4*0] 1110 %define lpfm dword [esp+calloff+16*3+4*1] 1111 %define w0m dword [esp+calloff+16*3+4*2] 1112 %define hd dword [esp+calloff+16*3+4*3] 1113 %define edgeb byte [esp+calloff+16*3+4*4] 1114 %define edged dword [esp+calloff+16*3+4*4] 1115 %define leftmp leftm 1116 %else 1117 %define w0m wm 1118 %define hd dword r5m 1119 %define edgeb byte r7m 1120 %define edged dword r7m 1121 %endif 1122 %define hvsrcm dword [esp+calloff+4*0] 1123 %define w1m dword [esp+calloff+4*1] 1124 %define t0m dword [esp+calloff+4*2] 1125 %define t2m dword [esp+calloff+4*3] 1126 %define t3m dword [esp+calloff+4*4] 1127 %define t4m dword [esp+calloff+4*5] 1128 %define m8 [base+pd_8] 1129 %define m9 [base+pd_0xfffffff0] 1130 %define m10 [esp+calloff+16*2] 1131 %define m11 [base+pw_164_24] 1132 %define m12 [base+sgr_lshuf5] 1133 %define m13 [base+pd_34816] 1134 %define m14 [base+pw_1023] 1135 %define m15 [base+pf_256] 1136 %define r10 r4 1137 %define base r6-pw_455_24 1138 %assign calloff 0 1139 %if STACK_ALIGNMENT < 16 1140 mov strideq, [rstk+stack_offset+ 8] 1141 mov leftq, [rstk+stack_offset+12] 1142 mov lpfq, [rstk+stack_offset+16] 1143 mov wd, [rstk+stack_offset+20] 1144 mov dstm, dstq 1145 mov stridemp, strideq 1146 mov leftm, leftq 1147 mov r1, [rstk+stack_offset+24] 1148 mov r2, [rstk+stack_offset+32] 1149 mov lpfm, lpfq 1150 mov hd, r1 1151 mov edged, r2 1152 %endif 1153 %else 1154 cglobal sgr_filter_5x5_16bpc, 4, 13, 16, -400*24-16, dst, stride, left, lpf, \ 1155 w, h, edge, params 1156 %endif 1157 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1158 movifnidn wd, wm 1159 %endif 1160 %if ARCH_X86_64 1161 mov paramsq, r6mp 1162 movifnidn hd, hm 1163 add wd, wd 1164 mov edged, r7m 1165 movu m10, [paramsq] 1166 mova m12, [sgr_lshuf5] 1167 add lpfq, wq 1168 mova m8, [pd_8] 1169 lea t1, [rsp+wq+20] 1170 mova m9, [pd_0xfffffff0] 1171 add dstq, wq 1172 lea t3, [rsp+wq*2+400*12+16] 1173 mova m11, [pw_164_24] 1174 lea t4, [rsp+wq+400*20+16] 1175 pshufhw m7, m10, q0000 1176 pshufb m10, [pw_256] ; s0 1177 punpckhqdq m7, m7 ; w0 1178 neg wq 1179 mova m13, [pd_34816] ; (1 << 11) + (1 << 15) 1180 pxor m6, m6 1181 mova m14, [pw_1023] 1182 psllw m7, 4 1183 movaps m15, [pf_256] 1184 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 1185 %define lpfm [rsp] 1186 %else 1187 mov r1, [rstk+stack_offset+28] ; params 1188 LEA r6, pw_455_24 1189 add wd, wd 1190 movu m1, [r1] 1191 add lpfm, wq 1192 lea t1, [rsp+extra_stack+wq+20] 1193 add dstq, wq 1194 lea t3, [rsp+extra_stack+wq*2+400*12+16] 1195 mov dstm, dstq 1196 lea t4, [rsp+extra_stack+wq+400*20+16] 1197 mov t3m, t3 1198 pshufhw m7, m1, q0000 1199 mov t4m, t4 1200 pshufb m1, [base+pw_256] ; s0 1201 punpckhqdq m7, m7 ; w0 1202 psllw m7, 4 1203 neg wq 1204 mova m10, m1 1205 pxor m6, m6 1206 mov w1m, wd 1207 sub wd, 4 1208 mov lpfq, lpfm 1209 mov w0m, wd 1210 %define strideq r5 1211 %endif 1212 test edgeb, 4 ; LR_HAVE_TOP 1213 jz .no_top 1214 call .h_top 1215 add lpfq, stridemp 1216 mov t2, t1 1217 call .top_fixup 1218 add t1, 400*6 1219 call .h_top 1220 movif32 strideq, 
stridemp 1221 lea r10, [lpfq+strideq*4] 1222 mov lpfq, dstq 1223 add r10, strideq 1224 mov lpfm, r10 ; below 1225 movif32 t0m, t2 1226 mov t0, t2 1227 dec hd 1228 jz .height1 1229 or edged, 16 1230 call .h 1231 .main: 1232 add lpfq, stridemp 1233 movif32 t4, t4m 1234 call .hv 1235 call .prep_n 1236 sub hd, 2 1237 jl .extend_bottom 1238 .main_loop: 1239 movif32 lpfq, hvsrcm 1240 add lpfq, stridemp 1241 %if ARCH_X86_64 1242 test hb, hb 1243 %else 1244 mov r4, hd 1245 test r4, r4 1246 %endif 1247 jz .odd_height 1248 call .h 1249 add lpfq, stridemp 1250 call .hv 1251 movif32 dstq, dstm 1252 call .n0 1253 call .n1 1254 sub hd, 2 1255 movif32 t0, t0m 1256 jge .main_loop 1257 test edgeb, 8 ; LR_HAVE_BOTTOM 1258 jz .extend_bottom 1259 mov lpfq, lpfm 1260 call .h_top 1261 add lpfq, stridemp 1262 call .hv_bottom 1263 .end: 1264 movif32 dstq, dstm 1265 call .n0 1266 call .n1 1267 .end2: 1268 RET 1269 .height1: 1270 movif32 t4, t4m 1271 call .hv 1272 call .prep_n 1273 jmp .odd_height_end 1274 .odd_height: 1275 call .hv 1276 movif32 dstq, dstm 1277 call .n0 1278 call .n1 1279 .odd_height_end: 1280 call .v 1281 movif32 dstq, dstm 1282 call .n0 1283 jmp .end2 1284 .extend_bottom: 1285 call .v 1286 jmp .end 1287 .no_top: 1288 movif32 strideq, stridemp 1289 lea r10, [lpfq+strideq*4] 1290 mov lpfq, dstq 1291 lea r10, [r10+strideq*2] 1292 mov lpfm, r10 1293 call .h 1294 lea t2, [t1+400*6] 1295 call .top_fixup 1296 dec hd 1297 jz .no_top_height1 1298 or edged, 16 1299 mov t0, t1 1300 mov t1, t2 1301 movif32 t0m, t0 1302 jmp .main 1303 .no_top_height1: 1304 movif32 t3, t3m 1305 movif32 t4, t4m 1306 call .v 1307 call .prep_n 1308 jmp .odd_height_end 1309 .extend_right: 1310 movd m0, wd 1311 movd m1, [lpfq-2] 1312 mova m2, [base+pw_256] 1313 mova m3, [base+pb_m14_m13] 1314 pshufb m0, m6 1315 pshufb m1, m2 1316 psubb m2, m0 1317 psubb m3, m0 1318 mova m0, [base+pb_0to15] 1319 pcmpgtb m2, m0 1320 pcmpgtb m3, m0 1321 pand m4, m2 1322 pand m5, m3 1323 pandn m2, m1 1324 pandn m3, m1 1325 por m4, m2 1326 por m5, m3 1327 ret 1328 %assign stack_offset stack_offset+4 1329 %assign calloff 4 1330 .h: ; horizontal boxsum 1331 %if ARCH_X86_64 1332 lea wq, [r4-4] 1333 %else 1334 %define leftq r4 1335 %endif 1336 test edgeb, 1 ; LR_HAVE_LEFT 1337 jz .h_extend_left 1338 movif32 leftq, leftm 1339 movddup m5, [leftq] 1340 movif32 wq, w0m 1341 mova m4, [lpfq+wq+4] 1342 add leftmp, 8 1343 palignr m4, m5, 10 1344 jmp .h_main 1345 .h_extend_left: 1346 movif32 wq, w0m 1347 mova m4, [lpfq+wq+4] 1348 pshufb m4, m12 1349 jmp .h_main 1350 .h_top: 1351 %if ARCH_X86_64 1352 lea wq, [r4-4] 1353 %endif 1354 test edgeb, 1 ; LR_HAVE_LEFT 1355 jz .h_extend_left 1356 movif32 wq, w0m 1357 .h_loop: 1358 movu m4, [lpfq+wq- 2] 1359 .h_main: 1360 movu m5, [lpfq+wq+14] 1361 test edgeb, 2 ; LR_HAVE_RIGHT 1362 jnz .h_have_right 1363 cmp wd, -20 1364 jl .h_have_right 1365 call .extend_right 1366 .h_have_right: 1367 palignr m2, m5, m4, 2 1368 paddw m0, m4, m2 1369 palignr m3, m5, m4, 6 1370 paddw m0, m3 1371 punpcklwd m1, m2, m3 1372 pmaddwd m1, m1 1373 punpckhwd m2, m3 1374 pmaddwd m2, m2 1375 palignr m5, m4, 8 1376 paddw m0, m5 1377 punpcklwd m3, m4, m5 1378 pmaddwd m3, m3 1379 paddd m1, m3 1380 punpckhwd m3, m4, m5 1381 pmaddwd m3, m3 1382 shufps m4, m5, q2121 1383 paddw m0, m4 ; sum 1384 punpcklwd m5, m4, m6 1385 pmaddwd m5, m5 1386 punpckhwd m4, m6 1387 pmaddwd m4, m4 1388 paddd m2, m3 1389 test edgeb, 16 ; y > 0 1390 jz .h_loop_end 1391 paddw m0, [t1+wq+400*0] 1392 paddd m1, [t1+wq+400*2] 1393 paddd m2, [t1+wq+400*4] 1394 .h_loop_end: 1395 paddd m1, 
m5 ; sumsq 1396 paddd m2, m4 1397 mova [t1+wq+400*0], m0 1398 mova [t1+wq+400*2], m1 1399 mova [t1+wq+400*4], m2 1400 add wq, 16 1401 jl .h_loop 1402 ret 1403 .top_fixup: 1404 %if ARCH_X86_64 1405 lea wq, [r4-4] 1406 %else 1407 mov wd, w0m 1408 %endif 1409 .top_fixup_loop: ; the sums of the first row needs to be doubled 1410 mova m0, [t1+wq+400*0] 1411 mova m1, [t1+wq+400*2] 1412 mova m2, [t1+wq+400*4] 1413 paddw m0, m0 1414 paddd m1, m1 1415 paddd m2, m2 1416 mova [t2+wq+400*0], m0 1417 mova [t2+wq+400*2], m1 1418 mova [t2+wq+400*4], m2 1419 add wq, 16 1420 jl .top_fixup_loop 1421 ret 1422 ALIGN function_align 1423 .hv: ; horizontal boxsum + vertical boxsum + ab 1424 %if ARCH_X86_64 1425 lea wq, [r4-4] 1426 %else 1427 mov hvsrcm, lpfq 1428 %endif 1429 test edgeb, 1 ; LR_HAVE_LEFT 1430 jz .hv_extend_left 1431 movif32 leftq, leftm 1432 movddup m5, [leftq] 1433 movif32 wq, w0m 1434 mova m4, [lpfq+wq+4] 1435 add leftmp, 8 1436 palignr m4, m5, 10 1437 jmp .hv_main 1438 .hv_extend_left: 1439 movif32 wq, w0m 1440 mova m4, [lpfq+wq+4] 1441 pshufb m4, m12 1442 jmp .hv_main 1443 .hv_bottom: 1444 %if ARCH_X86_64 1445 lea wq, [r4-4] 1446 %else 1447 mov hvsrcm, lpfq 1448 %endif 1449 test edgeb, 1 ; LR_HAVE_LEFT 1450 jz .hv_extend_left 1451 movif32 wq, w0m 1452 %if ARCH_X86_32 1453 jmp .hv_loop_start 1454 %endif 1455 .hv_loop: 1456 movif32 lpfq, hvsrcm 1457 .hv_loop_start: 1458 movu m4, [lpfq+wq- 2] 1459 .hv_main: 1460 movu m5, [lpfq+wq+14] 1461 test edgeb, 2 ; LR_HAVE_RIGHT 1462 jnz .hv_have_right 1463 cmp wd, -20 1464 jl .hv_have_right 1465 call .extend_right 1466 .hv_have_right: 1467 movif32 t3, hd 1468 palignr m3, m5, m4, 2 1469 paddw m0, m4, m3 1470 palignr m1, m5, m4, 6 1471 paddw m0, m1 1472 punpcklwd m2, m3, m1 1473 pmaddwd m2, m2 1474 punpckhwd m3, m1 1475 pmaddwd m3, m3 1476 palignr m5, m4, 8 1477 paddw m0, m5 1478 punpcklwd m1, m4, m5 1479 pmaddwd m1, m1 1480 paddd m2, m1 1481 punpckhwd m1, m4, m5 1482 pmaddwd m1, m1 1483 shufps m4, m5, q2121 1484 paddw m0, m4 ; h sum 1485 punpcklwd m5, m4, m6 1486 pmaddwd m5, m5 1487 punpckhwd m4, m6 1488 pmaddwd m4, m4 1489 paddd m3, m1 1490 paddd m2, m5 ; h sumsq 1491 paddd m3, m4 1492 paddw m1, m0, [t1+wq+400*0] 1493 paddd m4, m2, [t1+wq+400*2] 1494 paddd m5, m3, [t1+wq+400*4] 1495 %if ARCH_X86_64 1496 test hd, hd 1497 %else 1498 test t3, t3 1499 %endif 1500 jz .hv_last_row 1501 .hv_main2: 1502 paddw m1, [t2+wq+400*0] ; hv sum 1503 paddd m4, [t2+wq+400*2] ; hv sumsq 1504 paddd m5, [t2+wq+400*4] 1505 mova [t0+wq+400*0], m0 1506 mova [t0+wq+400*2], m2 1507 mova [t0+wq+400*4], m3 1508 psrlw m3, m1, 1 1509 paddd m4, m8 1510 pavgw m3, m6 ; (b + 2) >> 2 1511 paddd m5, m8 1512 pand m4, m9 ; ((a + 8) >> 4) << 4 1513 pand m5, m9 1514 psrld m2, m4, 4 1515 psrld m0, m5, 4 1516 paddd m2, m4 1517 psrld m4, 1 1518 paddd m0, m5 1519 psrld m5, 1 1520 paddd m4, m2 ; a * 25 1521 paddd m5, m0 1522 movif32 t3, t3m 1523 SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m10, m11, m15 1524 punpcklwd m2, m3, m3 1525 mova [t4+wq+4], m3 1526 punpckhwd m3, m3 1527 MUL_32X16X2 m0, m1, m2, m3, m4, m5 1528 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) 1529 paddd m1, m13 1530 psrld m0, 12 ; b 1531 psrld m1, 12 1532 mova [t3+wq*2+ 8], m0 1533 mova [t3+wq*2+24], m1 1534 add wq, 16 1535 jl .hv_loop 1536 mov t2, t1 1537 mov t1, t0 1538 mov t0, t2 1539 movif32 t0m, t0 1540 ret 1541 .hv_last_row: ; esoteric edge case for odd heights 1542 mova [t1+wq+400*0], m1 1543 paddw m1, m0 1544 mova [t1+wq+400*2], m4 1545 paddd m4, m2 1546 mova [t1+wq+400*4], m5 1547 paddd m5, m3 1548 jmp .hv_main2 1549 .v: 
; vertical boxsum + ab 1550 %if ARCH_X86_64 1551 lea wq, [r4-4] 1552 %else 1553 mov wd, w0m 1554 %endif 1555 .v_loop: 1556 mova m0, [t1+wq+400*0] 1557 mova m2, [t1+wq+400*2] 1558 mova m3, [t1+wq+400*4] 1559 paddw m1, m0, [t2+wq+400*0] 1560 paddd m4, m2, [t2+wq+400*2] 1561 paddd m5, m3, [t2+wq+400*4] 1562 paddw m0, m0 1563 paddd m2, m2 1564 paddd m3, m3 1565 paddw m1, m0 ; hv sum 1566 paddd m4, m2 ; hv sumsq 1567 paddd m5, m3 1568 psrlw m3, m1, 1 1569 paddd m4, m8 1570 pavgw m3, m6 ; (b + 2) >> 2 1571 paddd m5, m8 1572 pand m4, m9 ; ((a + 8) >> 4) << 4 1573 pand m5, m9 1574 psrld m2, m4, 4 1575 psrld m0, m5, 4 1576 paddd m2, m4 1577 psrld m4, 1 1578 paddd m0, m5 1579 psrld m5, 1 1580 paddd m4, m2 ; a * 25 1581 paddd m5, m0 1582 SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m10, m11, m15 1583 punpcklwd m2, m3, m3 1584 mova [t4+wq+4], m3 1585 punpckhwd m3, m3 1586 MUL_32X16X2 m0, m1, m2, m3, m4, m5 1587 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) 1588 paddd m1, m13 1589 psrld m0, 12 ; b 1590 psrld m1, 12 1591 mova [t3+wq*2+ 8], m0 1592 mova [t3+wq*2+24], m1 1593 add wq, 16 1594 jl .v_loop 1595 ret 1596 .prep_n: ; initial neighbor setup 1597 movif64 wq, r4 1598 movif32 wd, w1m 1599 .prep_n_loop: 1600 movu m0, [t4+wq*1+ 2] 1601 movu m3, [t4+wq*1+ 4] 1602 movu m1, [t3+wq*2+ 4] 1603 movu m4, [t3+wq*2+ 8] 1604 movu m2, [t3+wq*2+20] 1605 movu m5, [t3+wq*2+24] 1606 paddw m3, m0 1607 paddd m4, m1 1608 paddd m5, m2 1609 paddw m3, [t4+wq*1+ 0] 1610 paddd m4, [t3+wq*2+ 0] 1611 paddd m5, [t3+wq*2+16] 1612 paddw m0, m3 1613 psllw m3, 2 1614 paddd m1, m4 1615 pslld m4, 2 1616 paddd m2, m5 1617 pslld m5, 2 1618 paddw m0, m3 ; a 565 1619 paddd m1, m4 ; b 565 1620 paddd m2, m5 1621 mova [t4+wq*1+400*2+ 0], m0 1622 mova [t3+wq*2+400*4+ 0], m1 1623 mova [t3+wq*2+400*4+16], m2 1624 add wq, 16 1625 jl .prep_n_loop 1626 ret 1627 ALIGN function_align 1628 .n0: ; neighbor + output (even rows) 1629 movif64 wq, r4 1630 movif32 wd, w1m 1631 .n0_loop: 1632 movu m0, [t4+wq*1+ 2] 1633 movu m3, [t4+wq*1+ 4] 1634 movu m1, [t3+wq*2+ 4] 1635 movu m4, [t3+wq*2+ 8] 1636 movu m2, [t3+wq*2+20] 1637 movu m5, [t3+wq*2+24] 1638 paddw m3, m0 1639 paddd m4, m1 1640 paddd m5, m2 1641 paddw m3, [t4+wq*1+ 0] 1642 paddd m4, [t3+wq*2+ 0] 1643 paddd m5, [t3+wq*2+16] 1644 paddw m0, m3 1645 psllw m3, 2 1646 paddd m1, m4 1647 pslld m4, 2 1648 paddd m2, m5 1649 pslld m5, 2 1650 paddw m0, m3 ; a 565 1651 paddd m1, m4 ; b 565 1652 paddd m2, m5 1653 paddw m3, m0, [t4+wq*1+400*2+ 0] 1654 paddd m4, m1, [t3+wq*2+400*4+ 0] 1655 paddd m5, m2, [t3+wq*2+400*4+16] 1656 mova [t4+wq*1+400*2+ 0], m0 1657 mova [t3+wq*2+400*4+ 0], m1 1658 mova [t3+wq*2+400*4+16], m2 1659 mova m0, [dstq+wq] 1660 punpcklwd m1, m0, m6 ; src 1661 punpcklwd m2, m3, m6 ; a 1662 pmaddwd m2, m1 ; a * src 1663 punpckhwd m1, m0, m6 1664 punpckhwd m3, m6 1665 pmaddwd m3, m1 1666 psubd m4, m2 ; b - a * src + (1 << 8) 1667 psubd m5, m3 1668 psrad m4, 9 1669 psrad m5, 9 1670 packssdw m4, m5 1671 pmulhrsw m4, m7 1672 paddw m0, m4 1673 pmaxsw m0, m6 1674 pminsw m0, m14 1675 mova [dstq+wq], m0 1676 add wq, 16 1677 jl .n0_loop 1678 add dstq, stridemp 1679 ret 1680 ALIGN function_align 1681 .n1: ; neighbor + output (odd rows) 1682 movif64 wq, r4 1683 movif32 wd, w1m 1684 .n1_loop: 1685 mova m0, [dstq+wq] 1686 mova m3, [t4+wq*1+400*2+ 0] 1687 mova m4, [t3+wq*2+400*4+ 0] 1688 mova m5, [t3+wq*2+400*4+16] 1689 punpcklwd m1, m0, m6 ; src 1690 punpcklwd m2, m3, m6 ; a 1691 pmaddwd m2, m1 1692 punpckhwd m1, m0, m6 1693 punpckhwd m3, m6 1694 pmaddwd m3, m1 1695 psubd m4, m2 ; b - a * src + (1 << 7) 
1696 psubd m5, m3 1697 psrad m4, 8 1698 psrad m5, 8 1699 packssdw m4, m5 1700 pmulhrsw m4, m7 1701 paddw m0, m4 1702 pmaxsw m0, m6 1703 pminsw m0, m14 1704 mova [dstq+wq], m0 1705 add wq, 16 1706 jl .n1_loop 1707 add dstq, stridemp 1708 movif32 dstm, dstq 1709 ret 1710 1711 %if ARCH_X86_32 1712 %if STACK_ALIGNMENT < 16 1713 %assign extra_stack 4*16 1714 %else 1715 %assign extra_stack 2*16 1716 %endif 1717 cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \ 1718 dst, stride, left, lpf, w 1719 %if STACK_ALIGNMENT < 16 1720 %define dstm dword [esp+calloff+16*2+4*0] 1721 %define stridemp dword [esp+calloff+16*2+4*1] 1722 %define leftm dword [esp+calloff+16*2+4*2] 1723 %define lpfm dword [esp+calloff+16*2+4*3] 1724 %define w0m dword [esp+calloff+16*2+4*4] 1725 %define hd dword [esp+calloff+16*2+4*5] 1726 %define edgeb byte [esp+calloff+16*2+4*6] 1727 %define edged dword [esp+calloff+16*2+4*6] 1728 %define leftmp leftm 1729 %else 1730 %define w0m wm 1731 %define hd dword r5m 1732 %define edgeb byte r7m 1733 %define edged dword r7m 1734 %endif 1735 %define hvsrcm dword [esp+calloff+4*0] 1736 %define w1m dword [esp+calloff+4*1] 1737 %define t3m dword [esp+calloff+4*2] 1738 %define t4m dword [esp+calloff+4*3] 1739 %define m8 [base+pd_8] 1740 %define m9 [esp+calloff+16*1] 1741 %define m10 [base+pw_455_24] 1742 %define m11 [base+pd_34816] 1743 %define m12 [base+sgr_lshuf3] 1744 %define m13 [base+pw_1023] 1745 %define m14 [base+pf_256] 1746 %define base r6-pw_455_24 1747 %assign calloff 0 1748 %if STACK_ALIGNMENT < 16 1749 mov strideq, [rstk+stack_offset+ 8] 1750 mov leftq, [rstk+stack_offset+12] 1751 mov lpfq, [rstk+stack_offset+16] 1752 mov wd, [rstk+stack_offset+20] 1753 mov dstm, dstq 1754 mov stridemp, strideq 1755 mov leftm, leftq 1756 mov r1, [rstk+stack_offset+24] 1757 mov r2, [rstk+stack_offset+32] 1758 mov lpfm, lpfq 1759 mov hd, r1 1760 mov edged, r2 1761 %endif 1762 %else 1763 cglobal sgr_filter_3x3_16bpc, 4, 13, 15, -400*42-8, dst, stride, left, lpf, \ 1764 w, h, edge, params 1765 %endif 1766 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 1767 movifnidn wd, wm 1768 %endif 1769 %if ARCH_X86_64 1770 mov paramsq, r6mp 1771 movifnidn hd, hm 1772 add wd, wd 1773 mov edged, r7m 1774 movq m9, [paramsq+4] 1775 add lpfq, wq 1776 lea t1, [rsp+wq+12] 1777 mova m8, [pd_8] 1778 add dstq, wq 1779 lea t3, [rsp+wq*2+400*12+8] 1780 mova m10, [pw_455_24] 1781 lea t4, [rsp+wq+400*32+8] 1782 mova m11, [pd_34816] 1783 pshuflw m7, m9, q3333 1784 pshufb m9, [pw_256] ; s1 1785 punpcklqdq m7, m7 ; w1 1786 neg wq 1787 pxor m6, m6 1788 mova m13, [pw_1023] 1789 psllw m7, 4 1790 mova m12, [sgr_lshuf3] 1791 movaps m14, [pf_256] 1792 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 1793 %define lpfm [rsp] 1794 %else 1795 mov r1, [rstk+stack_offset+28] ; params 1796 LEA r6, pw_455_24 1797 add wd, wd 1798 movq m1, [r1+4] 1799 add lpfm, wq 1800 lea t1, [rsp+extra_stack+wq+20] 1801 add dstq, wq 1802 lea t3, [rsp+extra_stack+wq*2+400*12+16] 1803 mov dstm, dstq 1804 lea t4, [rsp+extra_stack+wq+400*32+16] 1805 mov t3m, t3 1806 pshuflw m7, m1, q3333 1807 mov t4m, t4 1808 pshufb m1, [base+pw_256] ; s1 1809 punpcklqdq m7, m7 ; w1 1810 psllw m7, 4 1811 neg wq 1812 mova m9, m1 1813 pxor m6, m6 1814 mov w1m, wd 1815 sub wd, 4 1816 mov lpfq, lpfm 1817 mov w0m, wd 1818 %define strideq r5 1819 %endif 1820 test edgeb, 4 ; LR_HAVE_TOP 1821 jz .no_top 1822 call .h_top 1823 add lpfq, stridemp 1824 mov t2, t1 1825 add t1, 400*6 1826 call .h_top 1827 movif32 strideq, stridemp 1828 lea r10, [lpfq+strideq*4] 1829 mov lpfq, 
dstq 1830 add r10, strideq 1831 mov lpfm, r10 ; below 1832 movif32 t4, t4m 1833 call .hv0 1834 .main: 1835 dec hd 1836 jz .height1 1837 movif32 lpfq, hvsrcm 1838 add lpfq, stridemp 1839 call .hv1 1840 call .prep_n 1841 sub hd, 2 1842 jl .extend_bottom 1843 .main_loop: 1844 movif32 lpfq, hvsrcm 1845 add lpfq, stridemp 1846 call .hv0 1847 %if ARCH_X86_64 1848 test hb, hb 1849 %else 1850 mov r4, hd 1851 test r4, r4 1852 %endif 1853 jz .odd_height 1854 movif32 lpfq, hvsrcm 1855 add lpfq, stridemp 1856 call .hv1 1857 call .n0 1858 call .n1 1859 sub hd, 2 1860 jge .main_loop 1861 test edgeb, 8 ; LR_HAVE_BOTTOM 1862 jz .extend_bottom 1863 mov lpfq, lpfm 1864 call .hv0_bottom 1865 movif32 lpfq, hvsrcm 1866 add lpfq, stridemp 1867 call .hv1_bottom 1868 .end: 1869 call .n0 1870 call .n1 1871 .end2: 1872 RET 1873 .height1: 1874 call .v1 1875 call .prep_n 1876 jmp .odd_height_end 1877 .odd_height: 1878 call .v1 1879 call .n0 1880 call .n1 1881 .odd_height_end: 1882 call .v0 1883 call .v1 1884 call .n0 1885 jmp .end2 1886 .extend_bottom: 1887 call .v0 1888 call .v1 1889 jmp .end 1890 .no_top: 1891 movif32 strideq, stridemp 1892 lea r10, [lpfq+strideq*4] 1893 mov lpfq, dstq 1894 lea r10, [r10+strideq*2] 1895 mov lpfm, r10 1896 call .h 1897 %if ARCH_X86_64 1898 lea wq, [r4-4] 1899 %else 1900 mov wq, w0m 1901 mov hvsrcm, lpfq 1902 %endif 1903 lea t2, [t1+400*6] 1904 .top_fixup_loop: 1905 mova m0, [t1+wq+400*0] 1906 mova m1, [t1+wq+400*2] 1907 mova m2, [t1+wq+400*4] 1908 mova [t2+wq+400*0], m0 1909 mova [t2+wq+400*2], m1 1910 mova [t2+wq+400*4], m2 1911 add wq, 16 1912 jl .top_fixup_loop 1913 movif32 t3, t3m 1914 movif32 t4, t4m 1915 call .v0 1916 jmp .main 1917 .extend_right: 1918 movd m1, wd 1919 movd m5, [lpfq-2] 1920 mova m2, [base+pw_256] 1921 mova m3, [base+pb_0to15] 1922 pshufb m1, m6 1923 pshufb m5, m2 1924 psubb m2, m1 1925 pcmpgtb m2, m3 1926 pand m4, m2 1927 pandn m2, m5 1928 por m4, m2 1929 ret 1930 %assign stack_offset stack_offset+4 1931 %assign calloff 4 1932 .h: ; horizontal boxsum 1933 %if ARCH_X86_64 1934 lea wq, [r4-4] 1935 %else 1936 %define leftq r4 1937 %endif 1938 test edgeb, 1 ; LR_HAVE_LEFT 1939 jz .h_extend_left 1940 movif32 leftq, leftm 1941 movddup m5, [leftq] 1942 movif32 wq, w0m 1943 mova m4, [lpfq+wq+4] 1944 add leftmp, 8 1945 palignr m4, m5, 12 1946 jmp .h_main 1947 .h_extend_left: 1948 movif32 wq, w0m 1949 mova m4, [lpfq+wq+4] 1950 pshufb m4, m12 1951 jmp .h_main 1952 .h_top: 1953 %if ARCH_X86_64 1954 lea wq, [r4-4] 1955 %endif 1956 test edgeb, 1 ; LR_HAVE_LEFT 1957 jz .h_extend_left 1958 movif32 wq, w0m 1959 .h_loop: 1960 movu m4, [lpfq+wq+ 0] 1961 .h_main: 1962 movu m5, [lpfq+wq+16] 1963 test edgeb, 2 ; LR_HAVE_RIGHT 1964 jnz .h_have_right 1965 cmp wd, -18 1966 jl .h_have_right 1967 call .extend_right 1968 .h_have_right: 1969 palignr m0, m5, m4, 2 1970 paddw m1, m4, m0 1971 punpcklwd m2, m4, m0 1972 pmaddwd m2, m2 1973 punpckhwd m3, m4, m0 1974 pmaddwd m3, m3 1975 palignr m5, m4, 4 1976 paddw m1, m5 ; sum 1977 punpcklwd m4, m5, m6 1978 pmaddwd m4, m4 1979 punpckhwd m5, m6 1980 pmaddwd m5, m5 1981 paddd m2, m4 ; sumsq 1982 paddd m3, m5 1983 mova [t1+wq+400*0], m1 1984 mova [t1+wq+400*2], m2 1985 mova [t1+wq+400*4], m3 1986 add wq, 16 1987 jl .h_loop 1988 ret 1989 ALIGN function_align 1990 .hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) 1991 %if ARCH_X86_64 1992 lea wq, [r4-4] 1993 %else 1994 mov hvsrcm, lpfq 1995 %endif 1996 test edgeb, 1 ; LR_HAVE_LEFT 1997 jz .hv0_extend_left 1998 movif32 leftq, leftm 1999 movddup m5, [leftq] 2000 movif32 wq, w0m 2001 mova 
m4, [lpfq+wq+4] 2002 add leftmp, 8 2003 palignr m4, m5, 12 2004 jmp .hv0_main 2005 .hv0_extend_left: 2006 movif32 wq, w0m 2007 mova m4, [lpfq+wq+4] 2008 pshufb m4, m12 2009 jmp .hv0_main 2010 .hv0_bottom: 2011 %if ARCH_X86_64 2012 lea wq, [r4-4] 2013 %else 2014 mov hvsrcm, lpfq 2015 %endif 2016 test edgeb, 1 ; LR_HAVE_LEFT 2017 jz .hv0_extend_left 2018 movif32 wq, w0m 2019 %if ARCH_X86_32 2020 jmp .hv0_loop_start 2021 %endif 2022 .hv0_loop: 2023 movif32 lpfq, hvsrcm 2024 .hv0_loop_start: 2025 movu m4, [lpfq+wq+ 0] 2026 .hv0_main: 2027 movu m5, [lpfq+wq+16] 2028 test edgeb, 2 ; LR_HAVE_RIGHT 2029 jnz .hv0_have_right 2030 cmp wd, -18 2031 jl .hv0_have_right 2032 call .extend_right 2033 .hv0_have_right: 2034 palignr m0, m5, m4, 2 2035 paddw m1, m4, m0 2036 punpcklwd m2, m4, m0 2037 pmaddwd m2, m2 2038 punpckhwd m3, m4, m0 2039 pmaddwd m3, m3 2040 palignr m5, m4, 4 2041 paddw m1, m5 ; sum 2042 punpcklwd m4, m5, m6 2043 pmaddwd m4, m4 2044 punpckhwd m5, m6 2045 pmaddwd m5, m5 2046 paddd m2, m4 ; sumsq 2047 paddd m3, m5 2048 paddw m0, m1, [t1+wq+400*0] 2049 paddd m4, m2, [t1+wq+400*2] 2050 paddd m5, m3, [t1+wq+400*4] 2051 mova [t1+wq+400*0], m1 2052 mova [t1+wq+400*2], m2 2053 mova [t1+wq+400*4], m3 2054 paddw m1, m0, [t2+wq+400*0] 2055 paddd m2, m4, [t2+wq+400*2] 2056 paddd m3, m5, [t2+wq+400*4] 2057 mova [t2+wq+400*0], m0 2058 mova [t2+wq+400*2], m4 2059 mova [t2+wq+400*4], m5 2060 paddd m2, m8 2061 paddd m3, m8 2062 psrld m2, 4 ; (a + 8) >> 4 2063 psrld m3, 4 2064 pslld m4, m2, 3 2065 pslld m5, m3, 3 2066 paddd m4, m2 ; ((a + 8) >> 4) * 9 2067 paddd m5, m3 2068 psrlw m3, m1, 1 2069 pavgw m3, m6 ; (b + 2) >> 2 2070 movif32 t3, t3m 2071 SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m9, m10, m14 2072 punpcklwd m2, m3, m3 2073 mova [t4+wq+4], m3 2074 punpckhwd m3, m3 2075 MUL_32X16X2 m0, m1, m2, m3, m4, m5 2076 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2077 paddd m1, m11 2078 psrld m0, 12 2079 psrld m1, 12 2080 mova [t3+wq*2+ 8], m0 2081 mova [t3+wq*2+24], m1 2082 add wq, 16 2083 jl .hv0_loop 2084 ret 2085 ALIGN function_align 2086 .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 2087 %if ARCH_X86_64 2088 lea wq, [r4-4] 2089 %else 2090 mov hvsrcm, lpfq 2091 %endif 2092 test edgeb, 1 ; LR_HAVE_LEFT 2093 jz .hv1_extend_left 2094 movif32 leftq, leftm 2095 movddup m5, [leftq] 2096 movif32 wq, w0m 2097 mova m4, [lpfq+wq+4] 2098 add leftmp, 8 2099 palignr m4, m5, 12 2100 jmp .hv1_main 2101 .hv1_extend_left: 2102 movif32 wq, w0m 2103 mova m4, [lpfq+wq+4] 2104 pshufb m4, m12 2105 jmp .hv1_main 2106 .hv1_bottom: 2107 %if ARCH_X86_64 2108 lea wq, [r4-4] 2109 %else 2110 mov hvsrcm, lpfq 2111 %endif 2112 test edgeb, 1 ; LR_HAVE_LEFT 2113 jz .hv1_extend_left 2114 movif32 wq, w0m 2115 %if ARCH_X86_32 2116 jmp .hv1_loop_start 2117 %endif 2118 .hv1_loop: 2119 movif32 lpfq, hvsrcm 2120 .hv1_loop_start: 2121 movu m4, [lpfq+wq+ 0] 2122 .hv1_main: 2123 movu m5, [lpfq+wq+16] 2124 test edgeb, 2 ; LR_HAVE_RIGHT 2125 jnz .hv1_have_right 2126 cmp wd, -18 2127 jl .hv1_have_right 2128 call .extend_right 2129 .hv1_have_right: 2130 palignr m1, m5, m4, 2 2131 paddw m0, m4, m1 2132 punpcklwd m2, m4, m1 2133 pmaddwd m2, m2 2134 punpckhwd m3, m4, m1 2135 pmaddwd m3, m3 2136 palignr m5, m4, 4 2137 paddw m0, m5 ; h sum 2138 punpcklwd m1, m5, m6 2139 pmaddwd m1, m1 2140 punpckhwd m5, m6 2141 pmaddwd m5, m5 2142 paddd m2, m1 ; h sumsq 2143 paddd m3, m5 2144 paddw m1, m0, [t2+wq+400*0] 2145 paddd m4, m2, [t2+wq+400*2] 2146 paddd m5, m3, [t2+wq+400*4] 2147 mova [t2+wq+400*0], m0 2148 mova [t2+wq+400*2], m2 2149 mova 
[t2+wq+400*4], m3 2150 paddd m4, m8 2151 paddd m5, m8 2152 psrld m4, 4 ; (a + 8) >> 4 2153 psrld m5, 4 2154 pslld m2, m4, 3 2155 pslld m3, m5, 3 2156 paddd m4, m2 ; ((a + 8) >> 4) * 9 2157 paddd m5, m3 2158 psrlw m3, m1, 1 2159 pavgw m3, m6 ; (b + 2) >> 2 2160 movif32 t3, t3m 2161 SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m9, m10, m14 2162 punpcklwd m2, m3, m3 2163 mova [t4+wq*1+400*2 +4], m3 2164 punpckhwd m3, m3 2165 MUL_32X16X2 m0, m1, m2, m3, m4, m5 2166 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2167 paddd m1, m11 2168 psrld m0, 12 2169 psrld m1, 12 2170 mova [t3+wq*2+400*4+ 8], m0 2171 mova [t3+wq*2+400*4+24], m1 2172 add wq, 16 2173 jl .hv1_loop 2174 mov r10, t2 2175 mov t2, t1 2176 mov t1, r10 2177 ret 2178 .v0: ; vertical boxsums + ab (even rows) 2179 %if ARCH_X86_64 2180 lea wq, [r4-4] 2181 %else 2182 mov wd, w0m 2183 %endif 2184 .v0_loop: 2185 mova m0, [t1+wq+400*0] 2186 mova m4, [t1+wq+400*2] 2187 mova m5, [t1+wq+400*4] 2188 paddw m0, m0 2189 paddd m4, m4 2190 paddd m5, m5 2191 paddw m1, m0, [t2+wq+400*0] 2192 paddd m2, m4, [t2+wq+400*2] 2193 paddd m3, m5, [t2+wq+400*4] 2194 mova [t2+wq+400*0], m0 2195 mova [t2+wq+400*2], m4 2196 mova [t2+wq+400*4], m5 2197 paddd m2, m8 2198 paddd m3, m8 2199 psrld m2, 4 ; (a + 8) >> 4 2200 psrld m3, 4 2201 pslld m4, m2, 3 2202 pslld m5, m3, 3 2203 paddd m4, m2 ; ((a + 8) >> 4) * 9 2204 paddd m5, m3 2205 psrlw m3, m1, 1 2206 pavgw m3, m6 ; (b + 2) >> 2 2207 SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m9, m10, m14 2208 punpcklwd m2, m3, m3 2209 mova [t4+wq*1+400*0+ 4], m3 2210 punpckhwd m3, m3 2211 MUL_32X16X2 m0, m1, m2, m3, m4, m5 2212 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2213 paddd m1, m11 2214 psrld m0, 12 2215 psrld m1, 12 2216 mova [t3+wq*2+400*0+ 8], m0 2217 mova [t3+wq*2+400*0+24], m1 2218 add wq, 16 2219 jl .v0_loop 2220 ret 2221 .v1: ; vertical boxsums + ab (odd rows) 2222 %if ARCH_X86_64 2223 lea wq, [r4-4] 2224 %else 2225 mov wd, w0m 2226 %endif 2227 .v1_loop: 2228 mova m0, [t1+wq+400*0] 2229 mova m4, [t1+wq+400*2] 2230 mova m5, [t1+wq+400*4] 2231 paddw m1, m0, [t2+wq+400*0] 2232 paddd m2, m4, [t2+wq+400*2] 2233 paddd m3, m5, [t2+wq+400*4] 2234 mova [t2+wq+400*0], m0 2235 mova [t2+wq+400*2], m4 2236 mova [t2+wq+400*4], m5 2237 paddd m2, m8 2238 paddd m3, m8 2239 psrld m2, 4 ; (a + 8) >> 4 2240 psrld m3, 4 2241 pslld m4, m2, 3 2242 pslld m5, m3, 3 2243 paddd m4, m2 ; ((a + 8) >> 4) * 9 2244 paddd m5, m3 2245 psrlw m3, m1, 1 2246 pavgw m3, m6 ; (b + 2) >> 2 2247 SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m9, m10, m14 2248 punpcklwd m2, m3, m3 2249 mova [t4+wq*1+400*2+ 4], m3 2250 punpckhwd m3, m3 2251 MUL_32X16X2 m0, m1, m2, m3, m4, m5 2252 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) 2253 paddd m1, m11 2254 psrld m0, 12 2255 psrld m1, 12 2256 mova [t3+wq*2+400*4+ 8], m0 2257 mova [t3+wq*2+400*4+24], m1 2258 add wq, 16 2259 jl .v1_loop 2260 mov r10, t2 2261 mov t2, t1 2262 mov t1, r10 2263 ret 2264 .prep_n: ; initial neighbor setup 2265 movif64 wq, r4 2266 movif32 wd, w1m 2267 .prep_n_loop: 2268 movu m0, [t4+wq*1+400*0+ 4] 2269 movu m1, [t3+wq*2+400*0+ 8] 2270 movu m2, [t3+wq*2+400*0+24] 2271 movu m3, [t4+wq*1+400*0+ 2] 2272 movu m4, [t3+wq*2+400*0+ 4] 2273 movu m5, [t3+wq*2+400*0+20] 2274 paddw m0, [t4+wq*1+400*0+ 0] 2275 paddd m1, [t3+wq*2+400*0+ 0] 2276 paddd m2, [t3+wq*2+400*0+16] 2277 paddw m3, m0 2278 paddd m4, m1 2279 paddd m5, m2 2280 psllw m3, 2 ; a[-1] 444 2281 pslld m4, 2 ; b[-1] 444 2282 pslld m5, 2 2283 psubw m3, m0 ; a[-1] 343 2284 psubd m4, m1 ; b[-1] 343 2285 psubd m5, m2 2286 mova [t4+wq*1+400*4], m3 
2287 mova [t3+wq*2+400*8+ 0], m4 2288 mova [t3+wq*2+400*8+16], m5 2289 movu m0, [t4+wq*1+400*2+ 4] 2290 movu m1, [t3+wq*2+400*4+ 8] 2291 movu m2, [t3+wq*2+400*4+24] 2292 movu m3, [t4+wq*1+400*2+ 2] 2293 movu m4, [t3+wq*2+400*4+ 4] 2294 movu m5, [t3+wq*2+400*4+20] 2295 paddw m0, [t4+wq*1+400*2+ 0] 2296 paddd m1, [t3+wq*2+400*4+ 0] 2297 paddd m2, [t3+wq*2+400*4+16] 2298 paddw m3, m0 2299 paddd m4, m1 2300 paddd m5, m2 2301 psllw m3, 2 ; a[ 0] 444 2302 pslld m4, 2 ; b[ 0] 444 2303 pslld m5, 2 2304 mova [t4+wq*1+400* 6], m3 2305 mova [t3+wq*2+400*12+ 0], m4 2306 mova [t3+wq*2+400*12+16], m5 2307 psubw m3, m0 ; a[ 0] 343 2308 psubd m4, m1 ; b[ 0] 343 2309 psubd m5, m2 2310 mova [t4+wq*1+400* 8], m3 2311 mova [t3+wq*2+400*16+ 0], m4 2312 mova [t3+wq*2+400*16+16], m5 2313 add wq, 16 2314 jl .prep_n_loop 2315 ret 2316 ALIGN function_align 2317 .n0: ; neighbor + output (even rows) 2318 movif64 wq, r4 2319 movif32 wd, w1m 2320 .n0_loop: 2321 movu m3, [t4+wq*1+400*0+4] 2322 movu m1, [t4+wq*1+400*0+2] 2323 paddw m3, [t4+wq*1+400*0+0] 2324 paddw m1, m3 2325 psllw m1, 2 ; a[ 1] 444 2326 psubw m2, m1, m3 ; a[ 1] 343 2327 paddw m3, m2, [t4+wq*1+400*4] 2328 paddw m3, [t4+wq*1+400*6] 2329 mova [t4+wq*1+400*4], m2 2330 mova [t4+wq*1+400*6], m1 2331 movu m4, [t3+wq*2+400*0+8] 2332 movu m1, [t3+wq*2+400*0+4] 2333 paddd m4, [t3+wq*2+400*0+0] 2334 paddd m1, m4 2335 pslld m1, 2 ; b[ 1] 444 2336 psubd m2, m1, m4 ; b[ 1] 343 2337 paddd m4, m2, [t3+wq*2+400* 8+ 0] 2338 paddd m4, [t3+wq*2+400*12+ 0] 2339 mova [t3+wq*2+400* 8+ 0], m2 2340 mova [t3+wq*2+400*12+ 0], m1 2341 movu m5, [t3+wq*2+400*0+24] 2342 movu m1, [t3+wq*2+400*0+20] 2343 paddd m5, [t3+wq*2+400*0+16] 2344 paddd m1, m5 2345 pslld m1, 2 2346 psubd m2, m1, m5 2347 paddd m5, m2, [t3+wq*2+400* 8+16] 2348 paddd m5, [t3+wq*2+400*12+16] 2349 mova [t3+wq*2+400* 8+16], m2 2350 mova [t3+wq*2+400*12+16], m1 2351 mova m0, [dstq+wq] 2352 punpcklwd m1, m0, m6 2353 punpcklwd m2, m3, m6 2354 pmaddwd m2, m1 ; a * src 2355 punpckhwd m1, m0, m6 2356 punpckhwd m3, m6 2357 pmaddwd m3, m1 2358 psubd m4, m2 ; b - a * src + (1 << 8) 2359 psubd m5, m3 2360 psrad m4, 9 2361 psrad m5, 9 2362 packssdw m4, m5 2363 pmulhrsw m4, m7 2364 paddw m0, m4 2365 pmaxsw m0, m6 2366 pminsw m0, m13 2367 mova [dstq+wq], m0 2368 add wq, 16 2369 jl .n0_loop 2370 add dstq, stridemp 2371 ret 2372 ALIGN function_align 2373 .n1: ; neighbor + output (odd rows) 2374 movif64 wq, r4 2375 movif32 wd, w1m 2376 .n1_loop: 2377 movu m3, [t4+wq*1+400*2+4] 2378 movu m1, [t4+wq*1+400*2+2] 2379 paddw m3, [t4+wq*1+400*2+0] 2380 paddw m1, m3 2381 psllw m1, 2 ; a[ 1] 444 2382 psubw m2, m1, m3 ; a[ 1] 343 2383 paddw m3, m2, [t4+wq*1+400*6] 2384 paddw m3, [t4+wq*1+400*8] 2385 mova [t4+wq*1+400*6], m1 2386 mova [t4+wq*1+400*8], m2 2387 movu m4, [t3+wq*2+400*4+8] 2388 movu m1, [t3+wq*2+400*4+4] 2389 paddd m4, [t3+wq*2+400*4+0] 2390 paddd m1, m4 2391 pslld m1, 2 ; b[ 1] 444 2392 psubd m2, m1, m4 ; b[ 1] 343 2393 paddd m4, m2, [t3+wq*2+400*12+ 0] 2394 paddd m4, [t3+wq*2+400*16+ 0] 2395 mova [t3+wq*2+400*12+ 0], m1 2396 mova [t3+wq*2+400*16+ 0], m2 2397 movu m5, [t3+wq*2+400*4+24] 2398 movu m1, [t3+wq*2+400*4+20] 2399 paddd m5, [t3+wq*2+400*4+16] 2400 paddd m1, m5 2401 pslld m1, 2 2402 psubd m2, m1, m5 2403 paddd m5, m2, [t3+wq*2+400*12+16] 2404 paddd m5, [t3+wq*2+400*16+16] 2405 mova [t3+wq*2+400*12+16], m1 2406 mova [t3+wq*2+400*16+16], m2 2407 mova m0, [dstq+wq] 2408 punpcklwd m1, m0, m6 2409 punpcklwd m2, m3, m6 2410 pmaddwd m2, m1 ; a * src 2411 punpckhwd m1, m0, m6 2412 punpckhwd m3, m6 2413 pmaddwd m3, m1 2414 psubd 
m4, m2 ; b - a * src + (1 << 8) 2415 psubd m5, m3 2416 psrad m4, 9 2417 psrad m5, 9 2418 packssdw m4, m5 2419 pmulhrsw m4, m7 2420 paddw m0, m4 2421 pmaxsw m0, m6 2422 pminsw m0, m13 2423 mova [dstq+wq], m0 2424 add wq, 16 2425 jl .n1_loop 2426 add dstq, stridemp 2427 movif32 dstm, dstq 2428 ret 2429 2430 %if ARCH_X86_32 2431 %if STACK_ALIGNMENT < 16 2432 %assign extra_stack 10*16 2433 %else 2434 %assign extra_stack 8*16 2435 %endif 2436 cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \ 2437 dst, stride, left, lpf, w 2438 %if STACK_ALIGNMENT < 16 2439 %define dstm dword [esp+calloff+16*8+4*0] 2440 %define stridemp dword [esp+calloff+16*8+4*1] 2441 %define leftm dword [esp+calloff+16*8+4*2] 2442 %define lpfm dword [esp+calloff+16*8+4*3] 2443 %define w0m dword [esp+calloff+16*8+4*4] 2444 %define hd dword [esp+calloff+16*8+4*5] 2445 %define edgeb byte [esp+calloff+16*8+4*6] 2446 %define edged dword [esp+calloff+16*8+4*6] 2447 %define leftmp leftm 2448 %else 2449 %define w0m wm 2450 %define hd dword r5m 2451 %define edgeb byte r7m 2452 %define edged dword r7m 2453 %endif 2454 %define hvsrcm dword [esp+calloff+4*0] 2455 %define w1m dword [esp+calloff+4*1] 2456 %define t3m dword [esp+calloff+4*2] 2457 %define t4m dword [esp+calloff+4*3] 2458 %xdefine m8 m6 2459 %define m9 [base+pd_8] 2460 %define m10 [base+pd_34816] 2461 %define m11 [base+pw_455_24] 2462 %define m12 [base+pw_164_24] 2463 %define m13 [esp+calloff+16*4] 2464 %define m14 [esp+calloff+16*5] 2465 %define m15 [esp+calloff+16*6] 2466 %define m6 [esp+calloff+16*7] 2467 %define base r6-pw_455_24 2468 %assign calloff 0 2469 %if STACK_ALIGNMENT < 16 2470 mov strideq, [rstk+stack_offset+ 8] 2471 mov leftq, [rstk+stack_offset+12] 2472 mov lpfq, [rstk+stack_offset+16] 2473 mov wd, [rstk+stack_offset+20] 2474 mov dstm, dstq 2475 mov stridemp, strideq 2476 mov leftm, leftq 2477 mov r1, [rstk+stack_offset+24] 2478 mov r2, [rstk+stack_offset+32] 2479 mov lpfm, lpfq 2480 mov hd, r1 2481 mov edged, r2 2482 %endif 2483 %else 2484 cglobal sgr_filter_mix_16bpc, 4, 13, 16, -400*66-40, dst, stride, left, lpf, \ 2485 w, h, edge, params 2486 %endif 2487 %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 2488 movifnidn wd, wm 2489 %endif 2490 %if ARCH_X86_64 2491 mov paramsq, r6mp 2492 movifnidn hd, hm 2493 add wd, wd 2494 mov edged, r7m 2495 mova m14, [paramsq] 2496 add lpfq, wq 2497 mova m9, [pd_8] 2498 lea t1, [rsp+wq+44] 2499 mova m10, [pd_34816] 2500 add dstq, wq 2501 mova m11, [pw_455_24] 2502 lea t3, [rsp+wq*2+400*24+40] 2503 mova m12, [pw_164_24] 2504 lea t4, [rsp+wq+400*52+40] 2505 neg wq 2506 pshufd m15, m14, q2222 ; w0 w1 2507 punpcklwd m14, m14 2508 pshufd m13, m14, q0000 ; s0 2509 pshufd m14, m14, q2222 ; s1 2510 pxor m6, m6 2511 psllw m15, 2 2512 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w 2513 %define lpfm [rsp] 2514 %else 2515 mov r1, [rstk+stack_offset+28] ; params 2516 LEA r6, pw_455_24 2517 add wd, wd 2518 mova m2, [r1] 2519 add lpfm, wq 2520 lea t1, [rsp+extra_stack+wq+52] 2521 add dstq, wq 2522 lea t3, [rsp+extra_stack+wq*2+400*24+48] 2523 mov dstm, dstq 2524 lea t4, [rsp+extra_stack+wq+400*52+48] 2525 mov t3m, t3 2526 mov t4m, t4 2527 neg wq 2528 pshuflw m0, m2, q0000 2529 pshuflw m1, m2, q2222 2530 pshufhw m2, m2, q1010 2531 punpcklqdq m0, m0 ; s0 2532 punpcklqdq m1, m1 ; s1 2533 punpckhqdq m2, m2 ; w0 w1 2534 mov w1m, wd 2535 pxor m3, m3 2536 psllw m2, 2 2537 mova m13, m0 2538 mova m14, m1 2539 sub wd, 4 2540 mova m15, m2 2541 mova m6, m3 2542 mov lpfq, lpfm 2543 mov w0m, wd 2544 %define strideq r5 2545 %endif 
    test edgeb, 4 ; LR_HAVE_TOP
    jz .no_top
    call .h_top
    add lpfq, stridemp
    mov t2, t1
%if ARCH_X86_64
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup
%else
    mov wq, w0m
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop
%endif
    add t1, 400*12
    call .h_top
    movif32 strideq, stridemp
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    add r10, strideq
    mov lpfm, r10 ; below
    movif32 t4, t4m
    call .hv0
.main:
    dec hd
    jz .height1
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv1
    call .prep_n
    sub hd, 2
    jl .extend_bottom
.main_loop:
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv0
%if ARCH_X86_64
    test hd, hd
%else
    mov r4, hd
    test r4, r4
%endif
    jz .odd_height
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv1
    call .n0
    call .n1
    sub hd, 2
    jge .main_loop
    test edgeb, 8 ; LR_HAVE_BOTTOM
    jz .extend_bottom
    mov lpfq, lpfm
    call .hv0_bottom
    movif32 lpfq, hvsrcm
    add lpfq, stridemp
    call .hv1_bottom
.end:
    call .n0
    call .n1
.end2:
    RET
.height1:
    call .v1
    call .prep_n
    jmp .odd_height_end
.odd_height:
    call .v1
    call .n0
    call .n1
.odd_height_end:
    call .v0
    call .v1
    call .n0
    jmp .end2
.extend_bottom:
    call .v0
    call .v1
    jmp .end
.no_top:
    movif32 strideq, stridemp
    lea r10, [lpfq+strideq*4]
    mov lpfq, dstq
    lea r10, [r10+strideq*2]
    mov lpfm, r10
    call .h
%if ARCH_X86_64
    lea wq, [r4-4]
%else
    mov wq, w0m
    mov hvsrcm, lpfq
%endif
    lea t2, [t1+400*12]
.top_fixup_loop:
    mova m0, [t1+wq+400* 0]
    mova m1, [t1+wq+400* 2]
    mova m2, [t1+wq+400* 4]
    paddw m0, m0
    mova m3, [t1+wq+400* 6]
    paddd m1, m1
    mova m4, [t1+wq+400* 8]
    paddd m2, m2
    mova m5, [t1+wq+400*10]
    mova [t2+wq+400* 0], m0
    mova [t2+wq+400* 2], m1
    mova [t2+wq+400* 4], m2
    mova [t2+wq+400* 6], m3
    mova [t2+wq+400* 8], m4
    mova [t2+wq+400*10], m5
    add wq, 16
    jl .top_fixup_loop
    movif32 t3, t3m
    movif32 t4, t4m
    call .v0
    jmp .main
.h: ; horizontal boxsum
%assign stack_offset stack_offset+4
%assign calloff 4
%if ARCH_X86_64
    lea wq, [r4-4]
%else
%define leftq r4
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32 leftq, leftm
    movddup m5, [leftq]
    movif32 wq, w0m
    mova m4, [lpfq+wq+4]
    add leftmp, 8
    palignr m4, m5, 10
    jmp .h_main
.h_extend_left:
    movif32 wq, w0m
    mova m4, [lpfq+wq+4]
    pshufb m4, [base+sgr_lshuf5]
    jmp .h_main
.h_top:
%if ARCH_X86_64
    lea wq, [r4-4]
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .h_extend_left
    movif32 wq, w0m
.h_loop:
    movu m4, [lpfq+wq- 2]
.h_main:
    movu m5, [lpfq+wq+14]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .h_have_right
    cmp wd, -20
    jl .h_have_right
%if ARCH_X86_32
    pxor m8, m8
%endif
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
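; With the row loaded (and edge-extended where needed), the block below builds
; the horizontal box sums: the three centre taps give sum3/sumsq3, and the two
; outer taps are then folded in to form sum5/sumsq5, matching the "sum3"/"sum5"
; comments. Both sets are stored to t1 for the vertical passes.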
.h_have_right:
    palignr m3, m5, m4, 2
    palignr m0, m5, m4, 4
    paddw m1, m3, m0
    punpcklwd m2, m3, m0
    pmaddwd m2, m2
    punpckhwd m3, m0
    pmaddwd m3, m3
    palignr m0, m5, m4, 6
    paddw m1, m0 ; sum3
    punpcklwd m7, m0, m6
    pmaddwd m7, m7
    punpckhwd m0, m6
    pmaddwd m0, m0
    paddd m2, m7 ; sumsq3
    palignr m5, m4, 8
    punpcklwd m7, m5, m4
    paddd m3, m0
    paddw m0, m4, m5
    pmaddwd m7, m7
    punpckhwd m5, m4
    pmaddwd m5, m5
    mova [t1+wq+400* 6], m1
    mova [t1+wq+400* 8], m2
    mova [t1+wq+400*10], m3
    paddw m0, m1 ; sum5
    paddd m7, m2 ; sumsq5
    paddd m5, m3
    mova [t1+wq+400* 0], m0
    mova [t1+wq+400* 2], m7
    mova [t1+wq+400* 4], m5
    add wq, 16
    jl .h_loop
    ret
ALIGN function_align
.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
%if ARCH_X86_64
    lea wq, [r4-4]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32 leftq, leftm
    movddup m5, [leftq]
    movif32 wq, w0m
    mova m4, [lpfq+wq+4]
    add leftmp, 8
    palignr m4, m5, 10
    jmp .hv0_main
.hv0_extend_left:
    movif32 wq, w0m
    mova m4, [lpfq+wq+4]
    pshufb m4, [base+sgr_lshuf5]
    jmp .hv0_main
.hv0_bottom:
%if ARCH_X86_64
    lea wq, [r4-4]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    movif32 wq, w0m
%if ARCH_X86_32
    jmp .hv0_loop_start
%endif
.hv0_loop:
    movif32 lpfq, hvsrcm
.hv0_loop_start:
    movu m4, [lpfq+wq- 2]
.hv0_main:
    movu m5, [lpfq+wq+14]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp wd, -20
    jl .hv0_have_right
%if ARCH_X86_32
    pxor m8, m8
%endif
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
.hv0_have_right:
    palignr m3, m5, m4, 2
    palignr m0, m5, m4, 4
    movif32 t3, t3m
    paddw m1, m3, m0
    punpcklwd m2, m3, m0
    pmaddwd m2, m2
    punpckhwd m3, m0
    pmaddwd m3, m3
    palignr m0, m5, m4, 6
    paddw m1, m0 ; h sum3
    punpcklwd m7, m0, m6
    pmaddwd m7, m7
    punpckhwd m0, m6
    pmaddwd m0, m0
    paddd m2, m7 ; h sumsq3
    palignr m5, m4, 8
    punpcklwd m7, m5, m4
    paddd m3, m0
    paddw m0, m4, m5
    pmaddwd m7, m7
    punpckhwd m5, m4
    pmaddwd m5, m5
    paddw m0, m1 ; h sum5
    paddd m7, m2 ; h sumsq5
    paddd m5, m3
    mova [t3+wq*2+400*8+ 8], m0
    mova [t3+wq*2+400*0+ 8], m7
    mova [t3+wq*2+400*0+24], m5
    paddw m0, [t1+wq+400* 0]
    paddd m7, [t1+wq+400* 2]
    paddd m5, [t1+wq+400* 4]
    mova [t1+wq+400* 0], m0
    mova [t1+wq+400* 2], m7
    mova [t1+wq+400* 4], m5
    paddw m0, m1, [t1+wq+400* 6]
    paddd m4, m2, [t1+wq+400* 8]
    paddd m5, m3, [t1+wq+400*10]
    mova [t1+wq+400* 6], m1
    mova [t1+wq+400* 8], m2
    mova [t1+wq+400*10], m3
    paddw m1, m0, [t2+wq+400* 6]
    paddd m2, m4, [t2+wq+400* 8]
    paddd m3, m5, [t2+wq+400*10]
    mova [t2+wq+400* 6], m0
    mova [t2+wq+400* 8], m4
    mova [t2+wq+400*10], m5
    paddd m2, m9
    paddd m3, m9
    movaps m8, [base+pf_256]
    psrld m2, 4 ; (a3 + 8) >> 4
    psrld m3, 4
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd m5, m3
    psrlw m3, m1, 1
%if ARCH_X86_32
    pxor m7, m7
    pavgw m3, m7
    SGR_CALC_X m0, m1, m3, m2, m4, m5, m7, m14, m11, m8
%else
    pavgw m3, m6 ; (b3 + 2) >> 2
    SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m14, m11, m8
%endif
    punpcklwd m2, m3, m3
    mova [t4+wq*1+400*2+ 4], m3
    punpckhwd m3, m3
    MUL_32X16X2 m0, m1, m2, m3, m4, m5
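; The constants used next are fixed-point reciprocals of the box areas:
; 455 ~= 4096/9 for the 3x3 box and 164 ~= 4096/25 for the 5x5 box, and
; pd_34816 = (1 << 11) + (1 << 15) is the rounding/bias term added before the
; final >> 12, as spelled out in the instruction comments.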
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m10
    psrld m0, 12
    psrld m1, 12
    mova [t3+wq*2+400*4+ 8], m0
    mova [t3+wq*2+400*4+24], m1
    add wq, 16
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea wq, [r4-4]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32 leftq, leftm
    movddup m5, [leftq]
    movif32 wq, w0m
    mova m4, [lpfq+wq+4]
    add leftmp, 8
    palignr m4, m5, 10
    jmp .hv1_main
.hv1_extend_left:
    movif32 wq, w0m
    mova m4, [lpfq+wq+4]
    pshufb m4, [base+sgr_lshuf5]
    jmp .hv1_main
.hv1_bottom:
%if ARCH_X86_64
    lea wq, [r4-4]
%else
    mov hvsrcm, lpfq
%endif
    test edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    movif32 wq, w0m
%if ARCH_X86_32
    jmp .hv1_loop_start
%endif
.hv1_loop:
    movif32 lpfq, hvsrcm
.hv1_loop_start:
    movu m4, [lpfq+wq- 2]
.hv1_main:
    movu m5, [lpfq+wq+14]
    test edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp wd, -20
    jl .hv1_have_right
%if ARCH_X86_32
    pxor m8, m8
%endif
    call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
.hv1_have_right:
    palignr m7, m5, m4, 2
    palignr m3, m5, m4, 4
    paddw m2, m7, m3
    punpcklwd m0, m7, m3
    pmaddwd m0, m0
    punpckhwd m7, m3
    pmaddwd m7, m7
    palignr m3, m5, m4, 6
    paddw m2, m3 ; h sum3
    punpcklwd m1, m3, m6
    pmaddwd m1, m1
    punpckhwd m3, m6
    pmaddwd m3, m3
    paddd m0, m1 ; h sumsq3
    palignr m5, m4, 8
    punpckhwd m1, m4, m5
    paddw m8, m4, m5
    pmaddwd m1, m1
    punpcklwd m4, m5
    pmaddwd m4, m4
    paddd m7, m3
    paddw m5, m2, [t2+wq+400* 6]
    mova [t2+wq+400* 6], m2
    paddw m8, m2 ; h sum5
    paddd m2, m0, [t2+wq+400* 8]
    paddd m3, m7, [t2+wq+400*10]
    mova [t2+wq+400* 8], m0
    mova [t2+wq+400*10], m7
    paddd m4, m0 ; h sumsq5
    paddd m1, m7
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a3 + 8) >> 4
    psrld m3, 4
    pslld m0, m2, 3
    pslld m7, m3, 3
    paddd m2, m0 ; ((a3 + 8) >> 4) * 9
    paddd m3, m7
    psrlw m7, m5, 1
    pavgw m7, m6 ; (b3 + 2) >> 2
%if ARCH_X86_32
    mova [esp+20], m8
    mov t3, t3m
    SGR_CALC_X m0, m5, m7, m8, m2, m3, m6, m14, m11, [base+pf_256]
%else
    SGR_CALC_X m0, m5, m7, m12, m2, m3, m6, m14, m11, [base+pf_256]
%endif
    punpcklwd m2, m7, m7
    mova [t4+wq*1+400*4+4], m7
    punpckhwd m7, m7
%if ARCH_X86_32
    MUL_32X16X2 m0, m5, m2, m7, m3, m8
    mova m8, [esp+20]
%else
    MUL_32X16X2 m0, m5, m2, m7, m3, m12
    mova m12, [pw_164_24]
%endif
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m5, m10
    psrld m0, 12
    psrld m5, 12
    mova [t3+wq*2+400*8+ 8], m0
    mova [t3+wq*2+400*8+24], m5
    paddw m5, m8, [t2+wq+400*0]
    paddd m2, m4, [t2+wq+400*2]
    paddd m3, m1, [t2+wq+400*4]
    paddw m5, [t1+wq+400*0]
    paddd m2, [t1+wq+400*2]
    paddd m3, [t1+wq+400*4]
    mova [t2+wq+400*0], m8
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a5 + 8) >> 4
    psrld m3, 4
    mova [t2+wq+400*2], m4
    pslld m8, m2, 4
    mova [t2+wq+400*4], m1
    pslld m4, m3, 4
    paddd m8, m2
    pslld m2, 3
    paddd m4, m3
    pslld m3, 3
    paddd m2, m8 ; ((a5 + 8) >> 4) * 25
    paddd m3, m4
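; ((a5 + 8) >> 4) * 25 above is built without a multiply: t*16 + t gives t*17,
; and adding t*8 completes t*25. The next instructions likewise form
; (b5 + 2) >> 2 from a 1-bit shift followed by pavgw against zero
; (pavgw(x, 0) = (x + 1) >> 1), before SGR_CALC_X derives the 5x5 coefficient.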
    psrlw m1, m5, 1
%if ARCH_X86_32
    pxor m7, m7
    pavgw m1, m7
    SGR_CALC_X m0, m5, m1, m4, m2, m3, m7, m13, m12, [base+pf_256]
%else
    movaps m8, [base+pf_256]
    pavgw m1, m6 ; (b5 + 2) >> 2
    SGR_CALC_X m0, m5, m1, m4, m2, m3, m6, m13, m12, m8
%endif
    punpcklwd m2, m1, m1
    mova [t4+wq*1+400*0+ 4], m1
    punpckhwd m1, m1
    MUL_32X16X2 m0, m5, m2, m1, m3, m4
    paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m5, m10
    psrld m0, 12
    psrld m5, 12
    mova [t3+wq*2+400*0+ 8], m0
    mova [t3+wq*2+400*0+24], m5
    add wq, 16
    jl .hv1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
%if ARCH_X86_64
    lea wq, [r4-4]
%else
    mov wd, w0m
%endif
    movaps m8, [base+pf_256]
.v0_loop:
    mova m0, [t1+wq+400* 6]
    mova m4, [t1+wq+400* 8]
    mova m5, [t1+wq+400*10]
    paddw m0, m0
    paddd m4, m4
    paddd m5, m5
    paddw m1, m0, [t2+wq+400* 6]
    paddd m2, m4, [t2+wq+400* 8]
    paddd m3, m5, [t2+wq+400*10]
    mova [t2+wq+400* 6], m0
    mova [t2+wq+400* 8], m4
    mova [t2+wq+400*10], m5
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a3 + 8) >> 4
    psrld m3, 4
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd m5, m3
    psrlw m3, m1, 1
%if ARCH_X86_32
    pxor m7, m7
    pavgw m3, m7
    SGR_CALC_X m0, m1, m3, m2, m4, m5, m7, m14, m11, m8
%else
    pavgw m3, m6 ; (b3 + 2) >> 2
    SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m14, m11, m8
%endif
    punpcklwd m2, m3, m3
    mova [t4+wq*1+400*2+4], m3
    punpckhwd m3, m3
    MUL_32X16X2 m0, m1, m2, m3, m4, m5
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m10
    psrld m0, 12
    psrld m1, 12
    mova m3, [t1+wq+400*0]
    mova m4, [t1+wq+400*2]
    mova m5, [t1+wq+400*4]
    mova [t3+wq*2+400*8+ 8], m3
    mova [t3+wq*2+400*0+ 8], m4
    mova [t3+wq*2+400*0+24], m5
    paddw m3, m3 ; cc5
    paddd m4, m4
    paddd m5, m5
    mova [t1+wq+400*0], m3
    mova [t1+wq+400*2], m4
    mova [t1+wq+400*4], m5
    mova [t3+wq*2+400*4+ 8], m0
    mova [t3+wq*2+400*4+24], m1
    add wq, 16
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
%if ARCH_X86_64
    lea wq, [r4-4]
%else
    mov wd, w0m
%endif
    movaps m8, [base+pf_256]
.v1_loop:
    mova m4, [t1+wq+400* 6]
    mova m5, [t1+wq+400* 8]
    mova m7, [t1+wq+400*10]
    paddw m1, m4, [t2+wq+400* 6]
    paddd m2, m5, [t2+wq+400* 8]
    paddd m3, m7, [t2+wq+400*10]
    mova [t2+wq+400* 6], m4
    mova [t2+wq+400* 8], m5
    mova [t2+wq+400*10], m7
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a3 + 8) >> 4
    psrld m3, 4
    pslld m4, m2, 3
    pslld m5, m3, 3
    paddd m4, m2 ; ((a3 + 8) >> 4) * 9
    paddd m5, m3
    psrlw m3, m1, 1
%if ARCH_X86_32
    pxor m7, m7
    pavgw m3, m7
    SGR_CALC_X m0, m1, m3, m2, m4, m5, m7, m14, m11, m8
%else
    pavgw m3, m6 ; (b3 + 2) >> 2
    SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m14, m11, m8
%endif
    punpcklwd m2, m3, m3
    mova [t4+wq*1+400*4+4], m3
    punpckhwd m3, m3
    MUL_32X16X2 m0, m1, m2, m3, m4, m5
    paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd m1, m10
    psrld m0, 12
    psrld m1, 12
    mova m4, [t3+wq*2+400*8+ 8]
    mova m5, [t3+wq*2+400*0+ 8]
    mova m7, [t3+wq*2+400*0+24]
    mova [t3+wq*2+400*8+ 8], m0
    mova [t3+wq*2+400*8+24], m1
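; For odd rows the 5x5 totals are completed here by adding the even-row
; horizontal sums that .hv0/.v0 stashed in t3 (just reloaded into m4/m5/m7)
; to the row buffers in t2 and t1, before deriving the a5/b5 coefficients.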
    paddw m1, m4, [t2+wq+400*0]
    paddd m2, m5, [t2+wq+400*2]
    paddd m3, m7, [t2+wq+400*4]
    paddw m1, [t1+wq+400*0]
    paddd m2, [t1+wq+400*2]
    paddd m3, [t1+wq+400*4]
    mova [t2+wq+400*0], m4
    mova [t2+wq+400*2], m5
    mova [t2+wq+400*4], m7
    paddd m2, m9
    paddd m3, m9
    psrld m2, 4 ; (a5 + 8) >> 4
    psrld m3, 4
    pslld m4, m2, 4
    pslld m5, m3, 4
    paddd m4, m2
    pslld m2, 3
    paddd m5, m3
    pslld m3, 3
    paddd m2, m4
    paddd m3, m5
    psrlw m5, m1, 1
%if ARCH_X86_32
    pxor m7, m7
    pavgw m5, m7
    SGR_CALC_X m0, m1, m5, m4, m2, m3, m7, m13, m12, m8
%else
    pavgw m5, m6 ; (b5 + 2) >> 2
    SGR_CALC_X m0, m1, m5, m4, m2, m3, m6, m13, m12, m8
%endif
    punpcklwd m4, m5, m5
    mova [t4+wq*1+400*0+ 4], m5
    punpckhwd m5, m5
    MUL_32X16X2 m0, m1, m4, m5, m2, m3
    paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd m1, m10
    psrld m0, 12
    psrld m1, 12
    mova [t3+wq*2+400*0+ 8], m0
    mova [t3+wq*2+400*0+24], m1
    add wq, 16
    jl .v1_loop
    mov r10, t2
    mov t2, t1
    mov t1, r10
    ret
.prep_n: ; initial neighbor setup
    movif64 wq, r4
    movif32 wd, w1m
.prep_n_loop:
    movu m0, [t4+wq*1+400*0+ 2]
    movu m1, [t3+wq*2+400*0+ 4]
    movu m2, [t3+wq*2+400*0+20]
    movu m3, [t4+wq*1+400*0+ 4]
    movu m4, [t3+wq*2+400*0+ 8]
    paddw m3, [t4+wq*1+400*0+ 0]
    paddd m4, [t3+wq*2+400*0+ 0]
    paddd m5, m2, [t3+wq*2+400*0+16]
    movu m7, [t3+wq*2+400*0+24]
    paddw m3, m0
    paddd m4, m1
    paddd m5, m7
    paddw m0, m3
    paddd m1, m4
    paddd m2, m5
    psllw m3, 2
    pslld m4, 2
    pslld m5, 2
    paddw m0, m3 ; a5 565
    paddd m1, m4 ; b5 565
    paddd m2, m5
    mova [t4+wq*1+400* 6+ 0], m0
    mova [t3+wq*2+400*12+ 0], m1
    mova [t3+wq*2+400*12+16], m2
    movu m0, [t4+wq*1+400*2+ 4]
    movu m1, [t3+wq*2+400*4+ 8]
    movu m2, [t3+wq*2+400*4+24]
    movu m3, [t4+wq*1+400*2+ 2]
    movu m4, [t3+wq*2+400*4+ 4]
    movu m5, [t3+wq*2+400*4+20]
    paddw m0, [t4+wq*1+400*2+ 0]
    paddd m1, [t3+wq*2+400*4+ 0]
    paddd m2, [t3+wq*2+400*4+16]
    paddw m3, m0
    paddd m4, m1
    paddd m5, m2
    psllw m3, 2 ; a3[-1] 444
    pslld m4, 2 ; b3[-1] 444
    pslld m5, 2
    psubw m3, m0 ; a3[-1] 343
    psubd m4, m1 ; b3[-1] 343
    psubd m5, m2
    mova [t4+wq*1+400* 8+ 0], m3
    mova [t3+wq*2+400*16+ 0], m4
    mova [t3+wq*2+400*16+16], m5
    movu m0, [t4+wq*1+400*4+ 4]
    movu m1, [t3+wq*2+400*8+ 8]
    movu m2, [t3+wq*2+400*8+24]
    movu m3, [t4+wq*1+400*4+ 2]
    movu m4, [t3+wq*2+400*8+ 4]
    movu m5, [t3+wq*2+400*8+20]
    paddw m0, [t4+wq*1+400*4+ 0]
    paddd m1, [t3+wq*2+400*8+ 0]
    paddd m2, [t3+wq*2+400*8+16]
    paddw m3, m0
    paddd m4, m1
    paddd m5, m2
    psllw m3, 2 ; a3[ 0] 444
    pslld m4, 2 ; b3[ 0] 444
    pslld m5, 2
    mova [t4+wq*1+400*10+ 0], m3
    mova [t3+wq*2+400*20+ 0], m4
    mova [t3+wq*2+400*20+16], m5
    psubw m3, m0 ; a3[ 0] 343
    psubd m4, m1 ; b3[ 0] 343
    psubd m5, m2
    mova [t4+wq*1+400*12+ 0], m3
    mova [t3+wq*2+400*24+ 0], m4
    mova [t3+wq*2+400*24+16], m5
    add wq, 16
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
    movif64 wq, r4
    movif32 wd, w1m
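; In .n0 each 5x5 coefficient gets the 5:6:5 horizontal neighbor weighting:
; four times (l + c + r) plus (l + 2c + r), i.e. 5l + 6c + 5r (l/c/r are
; informal names for the left/centre/right columns), giving the "a5"/"b5"
; terms, while the 3x3 coefficients cycle through the 343/444 weights noted
; in the comments; the output is then formed with the packed w0/w1 pair in m15.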
.n0_loop:
    movu m0, [t4+wq*1+ 4]
    movu m2, [t4+wq*1+ 2]
    paddw m0, [t4+wq*1+ 0]
    paddw m0, m2
    paddw m2, m0
    psllw m0, 2
    paddw m0, m2 ; a5
    movu m4, [t3+wq*2+ 8]
    movu m5, [t3+wq*2+24]
    movu m1, [t3+wq*2+ 4]
    movu m3, [t3+wq*2+20]
    paddd m4, [t3+wq*2+ 0]
    paddd m5, [t3+wq*2+16]
    paddd m4, m1
    paddd m5, m3
    paddd m1, m4
    paddd m3, m5
    pslld m4, 2
    pslld m5, 2
    paddd m4, m1 ; b5
    paddd m5, m3
    movu m2, [t4+wq*1+400* 6]
    paddw m2, m0
    mova [t4+wq*1+400* 6], m0
    paddd m0, m4, [t3+wq*2+400*12+ 0]
    paddd m1, m5, [t3+wq*2+400*12+16]
    mova [t3+wq*2+400*12+ 0], m4
    mova [t3+wq*2+400*12+16], m5
    mova [rsp+16+ARCH_X86_32*4], m1
    movu m3, [t4+wq*1+400*2+4]
    movu m5, [t4+wq*1+400*2+2]
    paddw m3, [t4+wq*1+400*2+0]
    paddw m5, m3
    psllw m5, 2 ; a3[ 1] 444
    psubw m4, m5, m3 ; a3[ 1] 343
    movu m3, [t4+wq*1+400* 8]
    paddw m3, [t4+wq*1+400*10]
    paddw m3, m4
    mova [t4+wq*1+400* 8], m4
    mova [t4+wq*1+400*10], m5
    movu m1, [t3+wq*2+400*4+ 8]
    movu m5, [t3+wq*2+400*4+ 4]
    movu m7, [t3+wq*2+400*4+24]
    movu m8, [t3+wq*2+400*4+20]
    paddd m1, [t3+wq*2+400*4+ 0]
    paddd m7, [t3+wq*2+400*4+16]
    paddd m5, m1
    paddd m8, m7
    pslld m5, 2 ; b3[ 1] 444
    pslld m8, 2
    psubd m4, m5, m1 ; b3[ 1] 343
%if ARCH_X86_32
    mova [esp+52], m8
    psubd m8, m7
%else
    psubd m6, m8, m7
    SWAP m8, m6
%endif
    paddd m1, m4, [t3+wq*2+400*16+ 0]
    paddd m7, m8, [t3+wq*2+400*16+16]
    paddd m1, [t3+wq*2+400*20+ 0]
    paddd m7, [t3+wq*2+400*20+16]
    mova [t3+wq*2+400*16+ 0], m4
    mova [t3+wq*2+400*16+16], m8
    mova [t3+wq*2+400*20+ 0], m5
%if ARCH_X86_32
    mova m8, [esp+52]
%else
    SWAP m8, m6
    pxor m6, m6
%endif
    mova [t3+wq*2+400*20+16], m8
    mova [rsp+32+ARCH_X86_32*4], m7
    movu m5, [dstq+wq]
    punpcklwd m4, m5, m6
    punpcklwd m7, m2, m6
    pmaddwd m7, m4 ; a5 * src
    punpcklwd m8, m3, m6
    pmaddwd m8, m4 ; a3 * src
    punpckhwd m5, m6
    punpckhwd m2, m6
    pmaddwd m2, m5
    punpckhwd m3, m6
    pmaddwd m3, m5
    pslld m4, 13
    pslld m5, 13
    psubd m0, m7 ; b5 - a5 * src + (1 << 8)
    psubd m1, m8 ; b3 - a3 * src + (1 << 8)
    mova m7, [base+pd_0xffff]
    psrld m0, 9
    pslld m1, 7
    pand m0, m7
    pandn m8, m7, m1
    por m0, m8
    mova m1, [rsp+16+ARCH_X86_32*4]
    mova m8, [rsp+32+ARCH_X86_32*4]
    psubd m1, m2
    psubd m8, m3
    mova m2, [base+pd_4096]
    psrld m1, 9
    pslld m8, 7
    pand m1, m7
    pandn m7, m8
    por m1, m7
    pmaddwd m0, m15
    pmaddwd m1, m15
%if ARCH_X86_32
    pxor m7, m7
%else
    SWAP m7, m6
%endif
    paddd m4, m2
    paddd m5, m2
    paddd m0, m4
    paddd m1, m5
    psrad m0, 8
    psrad m1, 8
    packssdw m0, m1 ; clip
    pmaxsw m0, m7
    psrlw m0, 5
    mova [dstq+wq], m0
    add wq, 16
    jl .n0_loop
    add dstq, stridemp
    ret
%if ARCH_X86_64
    SWAP m6, m7
%endif
ALIGN function_align
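; .n1 below handles the odd output rows: only the 3x3 pass has fresh 343/444
; neighbor terms here, while the 5x5 a/b values computed during .n0 (stored at
; t4+400*6 and t3+400*12) are reloaded and reused, so the second row of each
; pair avoids recomputing the 565-weighted sums.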
.n1: ; neighbor + output (odd rows)
    movif64 wq, r4
    movif32 wd, w1m
.n1_loop:
    movu m3, [t4+wq*1+400*4+4]
    movu m5, [t4+wq*1+400*4+2]
    paddw m3, [t4+wq*1+400*4+0]
    paddw m5, m3
    psllw m5, 2 ; a3[ 1] 444
    psubw m4, m5, m3 ; a3[ 1] 343
    paddw m3, m4, [t4+wq*1+400*12]
    paddw m3, [t4+wq*1+400*10]
    mova [t4+wq*1+400*10], m5
    mova [t4+wq*1+400*12], m4
    movu m1, [t3+wq*2+400*8+ 8]
    movu m5, [t3+wq*2+400*8+ 4]
    movu m7, [t3+wq*2+400*8+24]
    movu m8, [t3+wq*2+400*8+20]
    paddd m1, [t3+wq*2+400*8+ 0]
    paddd m7, [t3+wq*2+400*8+16]
    paddd m5, m1
    paddd m8, m7
    pslld m5, 2 ; b3[ 1] 444
    pslld m8, 2
    psubd m4, m5, m1 ; b3[ 1] 343
    psubd m0, m8, m7
    paddd m1, m4, [t3+wq*2+400*24+ 0]
    paddd m7, m0, [t3+wq*2+400*24+16]
    paddd m1, [t3+wq*2+400*20+ 0]
    paddd m7, [t3+wq*2+400*20+16]
    mova [t3+wq*2+400*20+ 0], m5
    mova [t3+wq*2+400*20+16], m8
    mova [t3+wq*2+400*24+ 0], m4
    mova [t3+wq*2+400*24+16], m0
    mova m5, [dstq+wq]
    mova m2, [t4+wq*1+400* 6]
    punpcklwd m4, m5, m6
    punpcklwd m8, m2, m6
    pmaddwd m8, m4 ; a5 * src
    punpcklwd m0, m3, m6
    pmaddwd m0, m4 ; a3 * src
    punpckhwd m5, m6
    punpckhwd m2, m6
    pmaddwd m2, m5
    punpckhwd m3, m6
    pmaddwd m3, m5
    psubd m1, m0 ; b3 - a3 * src + (1 << 8)
    pslld m4, 13
    pslld m5, 13
    mova m0, [t3+wq*2+400*12+ 0]
    psubd m0, m8 ; b5 - a5 * src + (1 << 8)
    mova m8, [t3+wq*2+400*12+16]
    psubd m8, m2
    psubd m7, m3
    mova m2, [base+pd_0xffff]
    pslld m1, 7
    psrld m0, 8
    psrld m8, 8
    pslld m7, 7
    pand m0, m2
    pandn m3, m2, m1
    por m0, m3
    pand m8, m2
    pandn m2, m7
    por m2, m8
    mova m1, [base+pd_4096]
    pmaddwd m0, m15
    pmaddwd m2, m15
%if ARCH_X86_64
    SWAP m7, m6
%endif
    pxor m7, m7
    paddd m4, m1
    paddd m5, m1
    paddd m0, m4
    paddd m2, m5
    psrad m0, 8
    psrad m2, 8
    packssdw m0, m2 ; clip
    pmaxsw m0, m7
    psrlw m0, 5
    mova [dstq+wq], m0
    add wq, 16
    jl .n1_loop
    add dstq, stridemp
    movif32 dstm, dstq
    ret
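; Rough scalar model of the combine performed in .n0/.n1 above (a sketch
; inferred from the instruction comments, not a restatement of the reference
; C code; t5/t3 are informal names):
;   t5  = b5 - a5*src, t3 = b3 - a3*src          ; per-pixel residual of each pass
;   dst = max(0, (src*2^13 + w0'*t5' + w1'*t3' + (1 << 12)) >> 13)
; where t5'/t3' are the shifted/packed forms built with the pd_0xffff masks
; and w0'/w1' are the pre-shifted weights held in m15.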