;******************************************************************************
;* VP9 loop filter SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_511:   times 16 dw   511
pw_2047:  times 16 dw  2047
pw_16384: times 16 dw 16384
pw_m512:  times 16 dw  -512
pw_m2048: times 16 dw -2048

cextern pw_1
cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_256
cextern pw_1023
cextern pw_4095
cextern pw_m1

SECTION .text

%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova              [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

%macro PRELOAD 2-3
%if ARCH_X86_64
    mova               m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro

; calculate p or q portion of flat8out
%macro FLAT8OUT_HALF 0
    psubw               m4, m0                      ; q4-q0
    psubw               m5, m0                      ; q5-q0
    psubw               m6, m0                      ; q6-q0
    psubw               m7, m0                      ; q7-q0
    ABS2                m4, m5, m2, m3              ; abs(q4-q0) | abs(q5-q0)
    ABS2                m6, m7, m2, m3              ; abs(q6-q0) | abs(q7-q0)
    pcmpgtw             m4, reg_F                   ; abs(q4-q0) > F
    pcmpgtw             m5, reg_F                   ; abs(q5-q0) > F
    pcmpgtw             m6, reg_F                   ; abs(q6-q0) > F
    pcmpgtw             m7, reg_F                   ; abs(q7-q0) > F
    por                 m5, m4
    por                 m7, m6
    por                 m7, m5                      ; !flat8out, q portion
%endmacro
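; In scalar terms, the macro above computes one half of the flat8out
; condition, roughly (an illustrative per-pixel sketch; the SIMD code does
; eight words at once):
;   !flat8out_q = |q4-q0| > F || |q5-q0| > F || |q6-q0| > F || |q7-q0| > F
; Running the same macro on p4-p7 (with p0 in m0) yields the p half, and the
; caller ORs the two halves together.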
; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
%macro FLAT8IN_HALF 1
%if %1 > 4
    psubw               m4, m3, m0                  ; q3-q0
    psubw               m5, m2, m0                  ; q2-q0
    ABS2                m4, m5, m6, m7              ; abs(q3-q0) | abs(q2-q0)
    pcmpgtw             m4, reg_F                   ; abs(q3-q0) > F
    pcmpgtw             m5, reg_F                   ; abs(q2-q0) > F
%endif
    psubw               m3, m2                      ; q3-q2
    psubw               m2, m1                      ; q2-q1
    ABS2                m3, m2, m6, m7              ; abs(q3-q2) | abs(q2-q1)
    pcmpgtw             m3, reg_I                   ; abs(q3-q2) > I
    pcmpgtw             m2, reg_I                   ; abs(q2-q1) > I
%if %1 > 4
    por                 m4, m5
%endif
    por                 m2, m3
    psubw               m3, m1, m0                  ; q1-q0
    ABS1                m3, m5                      ; abs(q1-q0)
%if %1 > 4
    pcmpgtw             m6, m3, reg_F               ; abs(q1-q0) > F
%endif
    pcmpgtw             m7, m3, reg_H               ; abs(q1-q0) > H
    pcmpgtw             m3, reg_I                   ; abs(q1-q0) > I
%if %1 > 4
    por                 m4, m6
%endif
    por                 m2, m3
%endmacro

; one step in filter_14/filter_6
;
; take sum $reg, downshift, apply mask and write into dst
;
; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
; step's sum $reg. This is omitted for the last row in each filter.
;
; if dont_store is set, don't write the result into memory, instead keep the
; values in register so we can write it out later
%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
                                      ; src/sub1, sub2, add1, add2, dont_store
    psrlw               %1, %2, %4
    psubw               %1, %6                      ; abs->delta
%ifnidn %7, ""
    psubw               %2, %6
    psubw               %2, %7
    paddw               %2, %8
    paddw               %2, %9
%endif
    pand                %1, reg_%3                  ; apply mask
%if %10 == 1
    paddw               %6, %1                      ; delta->abs
%else
    paddw               %1, %6                      ; delta->abs
    mova              [%5], %1
%endif
%endmacro
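; Illustrative scalar sketch of one FILTER_STEP (pseudo-C, names taken from
; the parameter list above):
;   tmp  = (sum >> shift) - src;      // filtered value as a delta vs. input
;   tmp &= mask;                      // only apply where filtering is on
;   dst  = src + tmp;                 // back to absolute; stored, or kept in
;                                     // the src register when dont_store=1
;   sum += add1 + add2 - src - sub2;  // slide the window to the next tap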
; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}

%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]

%if ARCH_X86_64
%if %2 == 16
%assign %%num_xmm_regs 16
%elif %2 == 8
%assign %%num_xmm_regs 15
%else ; %2 == 4
%assign %%num_xmm_regs 14
%endif ; %2
%assign %%bak_mem 0
%else ; ARCH_X86_32
%assign %%num_xmm_regs 8
%if %2 == 16
%assign %%bak_mem 7
%elif %2 == 8
%assign %%bak_mem 6
%else ; %2 == 4
%assign %%bak_mem 5
%endif ; %2
%endif ; ARCH_X86_64/32

%if %2 == 16
%ifidn %1, v
%assign %%num_gpr_regs 6
%else ; %1 == h
%assign %%num_gpr_regs 5
%endif ; %1
%assign %%wd_mem 6
%else ; %2 == 8/4
%assign %%num_gpr_regs 5
%if ARCH_X86_32 && %2 == 8
%assign %%wd_mem 2
%else ; ARCH_X86_64 || %2 == 4
%assign %%wd_mem 0
%endif ; ARCH_X86_64/32 etc.
%endif ; %2

%ifidn %1, v
%assign %%tsp_mem 0
%elif %2 == 16 ; && %1 == h
%assign %%tsp_mem 16
%else ; %1 == h && %2 == 8/4
%assign %%tsp_mem 8
%endif ; %1/%2

%assign %%off %%wd_mem
%assign %%tspoff %%bak_mem+%%wd_mem
%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)

%if %3 == 10
%define %%maxsgn 511
%define %%minsgn m512
%define %%maxusgn 1023
%define %%maxf 4
%else ; %3 == 12
%define %%maxsgn 2047
%define %%minsgn m2048
%define %%maxusgn 4095
%define %%maxf 16
%endif ; %3

cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
    ; prepare E, I and H masks
    shl                 Ed, %3-8
    shl                 Id, %3-8
    shl                 Hd, %3-8
%if cpuflag(ssse3)
    mova                m0, [pw_256]
%endif
    movd                m1, Ed
    movd                m2, Id
    movd                m3, Hd
%if cpuflag(ssse3)
    pshufb              m1, m0                      ; E << (bit_depth - 8)
    pshufb              m2, m0                      ; I << (bit_depth - 8)
    pshufb              m3, m0                      ; H << (bit_depth - 8)
%else
    punpcklwd           m1, m1
    punpcklwd           m2, m2
    punpcklwd           m3, m3
    pshufd              m1, m1, q0000
    pshufd              m2, m2, q0000
    pshufd              m3, m3, q0000
%endif
    SCRATCH              1,  8, rsp+(%%off+0)*mmsize, E
    SCRATCH              2,  9, rsp+(%%off+1)*mmsize, I
    SCRATCH              3, 10, rsp+(%%off+2)*mmsize, H
%if %2 > 4
    PRELOAD             11, pw_ %+ %%maxf, F
%endif
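; Illustrative note on the mask setup above: E, I and H are the 8-bit-range
; loop-filter thresholds from the VP9 spec; shifting them left by bpp-8
; rescales them to the 10/12-bit pixel range (e.g. a hypothetical E=32
; becomes 32<<4 = 512 at 12 bpp, matching pixels in [0,4095]). Each
; threshold is then broadcast to all eight words of an xmm register for the
; vector compares.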
    ; set up variables to load data
%ifidn %1, v
DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
    lea           stride3q, [strideq*3]
    neg            strideq
%if %2 == 16
    lea              dst0q, [dst8q+strideq*8]
%else
    lea              dst4q, [dst8q+strideq*4]
%endif
    neg            strideq
%if %2 == 16
    lea             dst12q, [dst8q+strideq*4]
    lea              dst4q, [dst0q+strideq*4]
%endif

%if %2 == 16
%define %%p7 dst0q
%define %%p6 dst0q+strideq
%define %%p5 dst0q+strideq*2
%define %%p4 dst0q+stride3q
%endif
%define %%p3 dst4q
%define %%p2 dst4q+strideq
%define %%p1 dst4q+strideq*2
%define %%p0 dst4q+stride3q
%define %%q0 dst8q
%define %%q1 dst8q+strideq
%define %%q2 dst8q+strideq*2
%define %%q3 dst8q+stride3q
%if %2 == 16
%define %%q4 dst12q
%define %%q5 dst12q+strideq
%define %%q6 dst12q+strideq*2
%define %%q7 dst12q+stride3q
%endif
%else ; %1 == h
DEFINE_ARGS dst0, stride, stride3, dst4
    lea           stride3q, [strideq*3]
    lea              dst4q, [dst0q+strideq*4]

%define %%p3 rsp+(%%tspoff+0)*mmsize
%define %%p2 rsp+(%%tspoff+1)*mmsize
%define %%p1 rsp+(%%tspoff+2)*mmsize
%define %%p0 rsp+(%%tspoff+3)*mmsize
%define %%q0 rsp+(%%tspoff+4)*mmsize
%define %%q1 rsp+(%%tspoff+5)*mmsize
%define %%q2 rsp+(%%tspoff+6)*mmsize
%define %%q3 rsp+(%%tspoff+7)*mmsize

%if %2 < 16
    movu                m0, [dst0q+strideq*0-8]
    movu                m1, [dst0q+strideq*1-8]
    movu                m2, [dst0q+strideq*2-8]
    movu                m3, [dst0q+stride3q -8]
    movu                m4, [dst4q+strideq*0-8]
    movu                m5, [dst4q+strideq*1-8]
    movu                m6, [dst4q+strideq*2-8]
    movu                m7, [dst4q+stride3q -8]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
%endif

    mova            [%%p3], m0
    mova            [%%p2], m1
    mova            [%%p1], m2
    mova            [%%p0], m3
%if ARCH_X86_64
    mova            [%%q0], m4
%endif
    mova            [%%q1], m5
    mova            [%%q2], m6
    mova            [%%q3], m7

; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
; order here accordingly
%else ; %2 == 16

%define %%p7 rsp+(%%tspoff+ 8)*mmsize
%define %%p6 rsp+(%%tspoff+ 9)*mmsize
%define %%p5 rsp+(%%tspoff+10)*mmsize
%define %%p4 rsp+(%%tspoff+11)*mmsize
%define %%q4 rsp+(%%tspoff+12)*mmsize
%define %%q5 rsp+(%%tspoff+13)*mmsize
%define %%q6 rsp+(%%tspoff+14)*mmsize
%define %%q7 rsp+(%%tspoff+15)*mmsize

    mova                m0, [dst0q+strideq*0-16]
    mova                m1, [dst0q+strideq*1-16]
    mova                m2, [dst0q+strideq*2-16]
    mova                m3, [dst0q+stride3q -16]
    mova                m4, [dst4q+strideq*0-16]
    mova                m5, [dst4q+strideq*1-16]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2-16]
%endif
    mova                m7, [dst4q+stride3q -16]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
%endif

    mova            [%%p7], m0
    mova            [%%p6], m1
    mova            [%%p5], m2
    mova            [%%p4], m3
%if ARCH_X86_64
    mova            [%%p3], m4
%endif
    mova            [%%p2], m5
    mova            [%%p1], m6
    mova            [%%p0], m7

    mova                m0, [dst0q+strideq*0]
    mova                m1, [dst0q+strideq*1]
    mova                m2, [dst0q+strideq*2]
    mova                m3, [dst0q+stride3q ]
    mova                m4, [dst4q+strideq*0]
    mova                m5, [dst4q+strideq*1]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2]
%endif
    mova                m7, [dst4q+stride3q ]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
%endif

    mova            [%%q0], m0
    mova            [%%q1], m1
    mova            [%%q2], m2
    mova            [%%q3], m3
%if ARCH_X86_64
    mova            [%%q4], m4
%endif
    mova            [%%q5], m5
    mova            [%%q6], m6
    mova            [%%q7], m7

; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
; order here accordingly
%endif ; %2
%endif ; %1

    ; load q0|q4-7 data
    mova                m0, [%%q0]
%if %2 == 16
    mova                m4, [%%q4]
    mova                m5, [%%q5]
    mova                m6, [%%q6]
    mova                m7, [%%q7]

    ; flat8out q portion
    FLAT8OUT_HALF
    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

    ; load q1-3 data
    mova                m1, [%%q1]
    mova                m2, [%%q2]
    mova                m3, [%%q3]

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; m12-14=free
    ; m0-3=q0-q3
    ; m4-7=free

    ; flat8in|fm|hev q portion
    FLAT8IN_HALF        %2
    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
%if %2 > 4
    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8I
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; m2=!fm[q]
    ; m0,1=q0-q1
    ; m3-7=free
    ; m12=free

    ; load p0-1
    mova                m3, [%%p0]
    mova                m4, [%%p1]

    ; fm mb_edge portion
    psubw               m5, m3, m0                  ; q0-p0
    psubw               m6, m4, m1                  ; q1-p1
%if ARCH_X86_64
    ABS2                m5, m6, m7, m12             ; abs(q0-p0) | abs(q1-p1)
%else
    ABS1                m5, m7                      ; abs(q0-p0)
    ABS1                m6, m7                      ; abs(q1-p1)
%endif
    paddw               m5, m5
    psraw               m6, 1
    paddw               m6, m5                      ; abs(q0-p0)*2+(abs(q1-p1)>>1)
    pcmpgtw             m6, reg_E
    por                 m2, m6
    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, FM

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m3-4=p0-1
    ; m0-2/5-7=free

    ; load p4-7 data
    SWAP                 3, 0                       ; p0
    SWAP                 4, 1                       ; p1
%if %2 == 16
    mova                m7, [%%p7]
    mova                m6, [%%p6]
    mova                m5, [%%p5]
    mova                m4, [%%p4]

    ; flat8out p portion
    FLAT8OUT_HALF
    por                 m7, reg_F8O
    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m0,1=p0-p1
    ; m2-7=free

    ; load p2-3 data
    mova                m2, [%%p2]
    mova                m3, [%%p3]

    ; flat8in|fm|hev p portion
    FLAT8IN_HALF        %2
    por                 m7, reg_HEV
%if %2 > 4
    por                 m4, reg_F8I
%endif
    por                 m2, reg_FM
%if %2 > 4
    por                 m4, m2                      ; !flat8|!fm
%if %2 == 16
    por                 m5, m4, reg_F8O             ; !flat16|!fm
    pandn               m2, m4                      ; filter4_mask
    pandn               m4, m5                      ; filter8_mask
    pxor                m5, [pw_m1]                 ; filter16_mask
    SCRATCH              5, 15, rsp+(%%off+6)*mmsize, F16M
%else
    pandn               m2, m4                      ; filter4_mask
    pxor                m4, [pw_m1]                 ; filter8_mask
%endif
    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8M
%else
    pxor                m2, [pw_m1]                 ; filter4_mask
%endif
    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, F4M

    ; r9[m15]=filter16_mask
    ; r10[m13]=hev
    ; r11[m14]=filter8_mask
    ; r12[m12]=filter4_mask
    ; m0,1=p0-p1
    ; m2-7=free
    ; m8-11=free

%if %2 > 4
%if %2 == 16
    ; filter_14
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m6, [%%p5]
    mova                m7, [%%p4]
    PRELOAD              8, %%p3, P3
    PRELOAD              9, %%p2, P2
%endif
    PRELOAD             10, %%q0, Q0
    PRELOAD             11, %%q1, Q1
%if %2 == 16
    psllw               m4, m2, 3
    paddw               m5, m3, m3
    paddw               m4, m6
    paddw               m5, m7
    paddw               m4, reg_P3
    paddw               m5, reg_P2
    paddw               m4, m1
    paddw               m5, m0
    paddw               m4, reg_Q0                  ; q0+p1+p3+p5+p7*8
    psubw               m5, m2                      ; p0+p2+p4+p6*2-p7
    paddw               m4, [pw_8]
    paddw               m5, m4                      ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8
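; Illustrative note: m5 now holds the 16-tap seed sum
; p7*7 + p6*2 + p5+p4+p3+p2+p1+p0 + q0 + 8. Each FILTER_STEP below emits
; (sum >> 4) where the mask is set, then slides the window by subtracting
; the outgoing source and oldest tap and adding the two incoming taps
; (e.g. for the p6 step: sum += p5 + q1 - p6 - p7).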
; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
; at the end of the filter

    mova    [rsp+0*mmsize], m3
    FILTER_STEP         m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1
%endif
    mova                m3, [%%q2]
%if %2 == 16
    mova    [rsp+1*mmsize], m6
    FILTER_STEP         m4, m5, F16M, 4, %%p5, m6, m2, m7, m3
%endif
    mova                m6, [%%q3]
%if %2 == 16
    mova    [rsp+2*mmsize], m7
    FILTER_STEP         m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6
    mova                m7, [%%q4]
%if ARCH_X86_64
    mova    [rsp+3*mmsize], reg_P3
%else
    mova                m4, reg_P3
    mova    [rsp+3*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7
    PRELOAD              8, %%q5, Q5
%if ARCH_X86_64
    mova    [rsp+4*mmsize], reg_P2
%else
    mova                m4, reg_P2
    mova    [rsp+4*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5
    PRELOAD              9, %%q6, Q6
    mova    [rsp+5*mmsize], m1
    FILTER_STEP         m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6
    mova                m1, [%%q7]
    FILTER_STEP         m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1
    FILTER_STEP         m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1
    FILTER_STEP         m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q6, reg_Q6

    mova                m7, [%%p1]
%else
    SWAP                 1, 7
%endif

    mova                m2, [%%p3]
    mova                m1, [%%p2]

    ; reg_Q0-1 (m10-m11)
    ; m0=p0
    ; m1=p2
    ; m2=p3
    ; m3=q2
    ; m4-5=free
    ; m6=q3
    ; m7=p1
    ; m8-9 unused

    ; filter_6
    psllw               m4, m2, 2
    paddw               m5, m1, m1
    paddw               m4, m7
    psubw               m5, m2
    paddw               m4, m0
    paddw               m5, reg_Q0
    paddw               m4, [pw_4]
    paddw               m5, m4

%if ARCH_X86_64
    mova                m8, m1
    mova                m9, m7
%else
    mova    [rsp+0*mmsize], m1
    mova    [rsp+1*mmsize], m7
%endif
%ifidn %1, v
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1
%else
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1
    FILTER_STEP         m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1
%if ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64
%else
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%q2, m3

    UNSCRATCH            2, 10, %%q0
    UNSCRATCH            6, 11, %%q1
%else
    SWAP                 1, 7
    mova                m2, [%%q0]
    mova                m6, [%%q1]
%endif
    UNSCRATCH            3, 13, rsp+(%%off+4)*mmsize, HEV
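; Scalar sketch of the filter_4 sequence below (illustrative pseudo-C;
; C() clips to the signed range used here, i.e. [-512,511] at 10 bpp and
; [-2048,2047] at 12 bpp, and clip_px() clips to [0, max_pixel]):
;   f  = C(p1 - q1) & hev_mask;
;   f  = C(3 * (q0 - p0) + f) & filter4_mask;
;   f1 = min(f + 4, maxsgn) >> 3;   f2 = min(f + 3, maxsgn) >> 3;
;   q0 = clip_px(q0 - f1);          p0 = clip_px(p0 + f2);
;   f  = ((f1 + 1) >> 1) & ~hev_mask;
;   p1 = clip_px(p1 + f);           q1 = clip_px(q1 - f);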
    ; m0=p0
    ; m1=p2
    ; m2=q0
    ; m3=hev_mask
    ; m4-5=free
    ; m6=q1
    ; m7=p1

    ; filter_4
    psubw               m4, m7, m6                  ; p1-q1
    psubw               m5, m2, m0                  ; q0-p0
    pand                m4, m3
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]       ; clip_intp2(p1-q1, 9) -> f
    paddw               m4, m5
    paddw               m5, m5
    paddw               m4, m5                      ; 3*(q0-p0)+f
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]       ; clip_intp2(3*(q0-p0)+f, 9) -> f
    pand                m4, reg_F4M
    paddw               m5, m4, [pw_4]
    paddw               m4, [pw_3]
    pminsw              m5, [pw_ %+ %%maxsgn]
    pminsw              m4, [pw_ %+ %%maxsgn]
    psraw               m5, 3                       ; min_intp2(f+4, 9)>>3 -> f1
    psraw               m4, 3                       ; min_intp2(f+3, 9)>>3 -> f2
    psubw               m2, m5                      ; q0-f1
    paddw               m0, m4                      ; p0+f2
    pandn               m3, m5                      ; f1 & !hev (for p1/q1 adj)
    pxor                m4, m4
    mova                m5, [pw_ %+ %%maxusgn]
    pmaxsw              m2, m4
    pmaxsw              m0, m4
    pminsw              m2, m5
    pminsw              m0, m5
%if cpuflag(ssse3)
    pmulhrsw            m3, [pw_16384]              ; (f1+1)>>1
%else
    paddw               m3, [pw_1]
    psraw               m3, 1
%endif
    paddw               m7, m3                      ; p1+f
    psubw               m6, m3                      ; q1-f
    pmaxsw              m7, m4
    pmaxsw              m6, m4
    pminsw              m7, m5
    pminsw              m6, m5

    ; store
%ifidn %1, v
    mova            [%%p1], m7
    mova            [%%p0], m0
    mova            [%%q0], m2
    mova            [%%q1], m6
%else ; %1 == h
%if %2 == 4
    TRANSPOSE4x4W        7, 0, 2, 6, 1
    movh   [dst0q+strideq*0-4], m7
    movhps [dst0q+strideq*1-4], m7
    movh   [dst0q+strideq*2-4], m0
    movhps [dst0q+stride3q -4], m0
    movh   [dst4q+strideq*0-4], m2
    movhps [dst4q+strideq*1-4], m2
    movh   [dst4q+strideq*2-4], m6
    movhps [dst4q+stride3q -4], m6
%elif %2 == 8
    mova                m3, [%%p3]
    mova                m4, [%%q2]
    mova                m5, [%%q3]

%if ARCH_X86_64
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, 8
%else
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
    mova                m2, [%%q0]
%endif

    movu   [dst0q+strideq*0-8], m3
    movu   [dst0q+strideq*1-8], m1
    movu   [dst0q+strideq*2-8], m7
    movu   [dst0q+stride3q -8], m0
    movu   [dst4q+strideq*0-8], m2
    movu   [dst4q+strideq*1-8], m6
    movu   [dst4q+strideq*2-8], m4
    movu   [dst4q+stride3q -8], m5
%else ; %2 == 16
    SCRATCH              2, 8, %%q0
    SCRATCH              6, 9, %%q1
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m4, [%%p5]
    mova                m5, [%%p4]
    mova                m6, [%%p3]

%if ARCH_X86_64
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, 10
%else
    mova            [%%p1], m7
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
%endif

    mova  [dst0q+strideq*0-16], m2
    mova  [dst0q+strideq*1-16], m3
    mova  [dst0q+strideq*2-16], m4
    mova  [dst0q+stride3q -16], m5
%if ARCH_X86_64
    mova  [dst4q+strideq*0-16], m6
%endif
    mova  [dst4q+strideq*1-16], m1
    mova  [dst4q+strideq*2-16], m7
    mova  [dst4q+stride3q -16], m0

    UNSCRATCH            2, 8, %%q0
    UNSCRATCH            6, 9, %%q1
    mova                m0, [%%q2]
    mova                m1, [%%q3]
    mova                m3, [%%q4]
    mova                m4, [%%q5]
%if ARCH_X86_64
    mova                m5, [%%q6]
%endif
    mova                m7, [%%q7]

%if ARCH_X86_64
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, 8
%else
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
%endif

    mova  [dst0q+strideq*0], m2
    mova  [dst0q+strideq*1], m6
    mova  [dst0q+strideq*2], m0
    mova  [dst0q+stride3q ], m1
%if ARCH_X86_64
    mova  [dst4q+strideq*0], m3
%endif
    mova  [dst4q+strideq*1], m4
    mova  [dst4q+strideq*2], m5
    mova  [dst4q+stride3q ], m7
%endif ; %2
%endif ; %1
    RET
%endmacro

%macro LOOP_FILTER_CPUSETS 3
INIT_XMM sse2
LOOP_FILTER %1, %2, %3
INIT_XMM ssse3
LOOP_FILTER %1, %2, %3
INIT_XMM avx
LOOP_FILTER %1, %2, %3
%endmacro

%macro LOOP_FILTER_WDSETS 2
LOOP_FILTER_CPUSETS %1, 4, %2
LOOP_FILTER_CPUSETS %1, 8, %2
LOOP_FILTER_CPUSETS %1, 16, %2
%endmacro

LOOP_FILTER_WDSETS h, 10
LOOP_FILTER_WDSETS v, 10
LOOP_FILTER_WDSETS h, 12
LOOP_FILTER_WDSETS v, 12
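; Illustrative expansion count: the four WDSETS invocations above produce
; 2 (dir) x 3 (wd) x 2 (bpp) x 3 (cpuset) = 36 loop-filter functions, each
; named vp9_loop_filter_<dir>_<wd>_<bpp> plus the cpuset suffix and whatever
; symbol prefix cglobal is configured with (the prefix is an assumption;
; in FFmpeg it is conventionally ff_).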