; float_dsp.asm — x86/x86-64 SIMD implementations of FFmpeg's float DSP routines
1 ;***************************************************************************** 2 ;* x86-optimized Float DSP functions 3 ;* 4 ;* Copyright 2006 Loren Merritt 5 ;* 6 ;* This file is part of FFmpeg. 7 ;* 8 ;* FFmpeg is free software; you can redistribute it and/or 9 ;* modify it under the terms of the GNU Lesser General Public 10 ;* License as published by the Free Software Foundation; either 11 ;* version 2.1 of the License, or (at your option) any later version. 12 ;* 13 ;* FFmpeg is distributed in the hope that it will be useful, 14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of 15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 ;* Lesser General Public License for more details. 17 ;* 18 ;* You should have received a copy of the GNU Lesser General Public 19 ;* License along with FFmpeg; if not, write to the Free Software 20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 ;****************************************************************************** 22 23 %include "libavutil/x86/x86util.asm" 24 25 SECTION_RODATA 32 26 pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0 27 28 SECTION .text 29 30 ;----------------------------------------------------------------------------- 31 ; void vector_fmul(float *dst, const float *src0, const float *src1, int len) 32 ;----------------------------------------------------------------------------- 33 %macro VECTOR_FMUL 0 34 cglobal vector_fmul, 4,4,2, dst, src0, src1, len 35 lea lenq, [lend*4 - 64] 36 ALIGN 16 37 .loop: 38 %assign a 0 39 %rep 32/mmsize 40 mova m0, [src0q + lenq + (a+0)*mmsize] 41 mova m1, [src0q + lenq + (a+1)*mmsize] 42 mulps m0, m0, [src1q + lenq + (a+0)*mmsize] 43 mulps m1, m1, [src1q + lenq + (a+1)*mmsize] 44 mova [dstq + lenq + (a+0)*mmsize], m0 45 mova [dstq + lenq + (a+1)*mmsize], m1 46 %assign a a+2 47 %endrep 48 49 sub lenq, 64 50 jge .loop 51 RET 52 %endmacro 53 54 INIT_XMM sse 55 VECTOR_FMUL 56 %if HAVE_AVX_EXTERNAL 57 INIT_YMM avx 58 
VECTOR_FMUL 59 %endif 60 61 ;----------------------------------------------------------------------------- 62 ; void vector_dmul(double *dst, const double *src0, const double *src1, int len) 63 ;----------------------------------------------------------------------------- 64 %macro VECTOR_DMUL 0 65 cglobal vector_dmul, 4,4,4, dst, src0, src1, len 66 lea lend, [lenq*8 - mmsize*4] 67 ALIGN 16 68 .loop: 69 movaps m0, [src0q + lenq + 0*mmsize] 70 movaps m1, [src0q + lenq + 1*mmsize] 71 movaps m2, [src0q + lenq + 2*mmsize] 72 movaps m3, [src0q + lenq + 3*mmsize] 73 mulpd m0, m0, [src1q + lenq + 0*mmsize] 74 mulpd m1, m1, [src1q + lenq + 1*mmsize] 75 mulpd m2, m2, [src1q + lenq + 2*mmsize] 76 mulpd m3, m3, [src1q + lenq + 3*mmsize] 77 movaps [dstq + lenq + 0*mmsize], m0 78 movaps [dstq + lenq + 1*mmsize], m1 79 movaps [dstq + lenq + 2*mmsize], m2 80 movaps [dstq + lenq + 3*mmsize], m3 81 82 sub lenq, mmsize*4 83 jge .loop 84 RET 85 %endmacro 86 87 INIT_XMM sse2 88 VECTOR_DMUL 89 %if HAVE_AVX_EXTERNAL 90 INIT_YMM avx 91 VECTOR_DMUL 92 %endif 93 94 ;------------------------------------------------------------------------------ 95 ; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len) 96 ;------------------------------------------------------------------------------ 97 98 %macro VECTOR_FMAC_SCALAR 0 99 %if UNIX64 100 cglobal vector_fmac_scalar, 3,3,5, dst, src, len 101 %else 102 cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len 103 %endif 104 %if ARCH_X86_32 105 VBROADCASTSS m0, mulm 106 %else 107 %if WIN64 108 SWAP 0, 2 109 %endif 110 shufps xm0, xm0, 0 111 %if cpuflag(avx) 112 vinsertf128 m0, m0, xm0, 1 113 %endif 114 %endif 115 lea lenq, [lend*4-64] 116 .loop: 117 %if cpuflag(fma3) 118 mova m1, [dstq+lenq] 119 mova m2, [dstq+lenq+1*mmsize] 120 fmaddps m1, m0, [srcq+lenq], m1 121 fmaddps m2, m0, [srcq+lenq+1*mmsize], m2 122 %else ; cpuflag 123 mulps m1, m0, [srcq+lenq] 124 mulps m2, m0, [srcq+lenq+1*mmsize] 125 %if mmsize < 32 126 mulps m3, m0, 
[srcq+lenq+2*mmsize] 127 mulps m4, m0, [srcq+lenq+3*mmsize] 128 %endif ; mmsize 129 addps m1, m1, [dstq+lenq] 130 addps m2, m2, [dstq+lenq+1*mmsize] 131 %if mmsize < 32 132 addps m3, m3, [dstq+lenq+2*mmsize] 133 addps m4, m4, [dstq+lenq+3*mmsize] 134 %endif ; mmsize 135 %endif ; cpuflag 136 mova [dstq+lenq], m1 137 mova [dstq+lenq+1*mmsize], m2 138 %if mmsize < 32 139 mova [dstq+lenq+2*mmsize], m3 140 mova [dstq+lenq+3*mmsize], m4 141 %endif ; mmsize 142 sub lenq, 64 143 jge .loop 144 RET 145 %endmacro 146 147 INIT_XMM sse 148 VECTOR_FMAC_SCALAR 149 %if HAVE_AVX_EXTERNAL 150 INIT_YMM avx 151 VECTOR_FMAC_SCALAR 152 %endif 153 %if HAVE_FMA3_EXTERNAL 154 INIT_YMM fma3 155 VECTOR_FMAC_SCALAR 156 %endif 157 158 ;------------------------------------------------------------------------------ 159 ; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len) 160 ;------------------------------------------------------------------------------ 161 162 %macro VECTOR_FMUL_SCALAR 0 163 %if UNIX64 164 cglobal vector_fmul_scalar, 3,3,2, dst, src, len 165 %else 166 cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len 167 %endif 168 %if ARCH_X86_32 169 movss m0, mulm 170 %elif WIN64 171 SWAP 0, 2 172 %endif 173 shufps m0, m0, 0 174 lea lenq, [lend*4-mmsize] 175 .loop: 176 mova m1, [srcq+lenq] 177 mulps m1, m0 178 mova [dstq+lenq], m1 179 sub lenq, mmsize 180 jge .loop 181 RET 182 %endmacro 183 184 INIT_XMM sse 185 VECTOR_FMUL_SCALAR 186 187 ;------------------------------------------------------------------------------ 188 ; void ff_vector_dmac_scalar(double *dst, const double *src, double mul, 189 ; int len) 190 ;------------------------------------------------------------------------------ 191 192 %macro VECTOR_DMAC_SCALAR 0 193 %if ARCH_X86_32 194 cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr 195 mov lenq, lenaddrm 196 VBROADCASTSD m0, mulm 197 %else 198 %if UNIX64 199 cglobal vector_dmac_scalar, 3,3,5, dst, src, len 200 %else 201 cglobal 
vector_dmac_scalar, 4,4,5, dst, src, mul, len 202 SWAP 0, 2 203 %endif 204 movlhps xm0, xm0 205 %if cpuflag(avx) 206 vinsertf128 m0, m0, xm0, 1 207 %endif 208 %endif 209 lea lenq, [lend*8-mmsize*4] 210 .loop: 211 %if cpuflag(fma3) 212 movaps m1, [dstq+lenq] 213 movaps m2, [dstq+lenq+1*mmsize] 214 movaps m3, [dstq+lenq+2*mmsize] 215 movaps m4, [dstq+lenq+3*mmsize] 216 fmaddpd m1, m0, [srcq+lenq], m1 217 fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2 218 fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3 219 fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4 220 %else ; cpuflag 221 mulpd m1, m0, [srcq+lenq] 222 mulpd m2, m0, [srcq+lenq+1*mmsize] 223 mulpd m3, m0, [srcq+lenq+2*mmsize] 224 mulpd m4, m0, [srcq+lenq+3*mmsize] 225 addpd m1, m1, [dstq+lenq] 226 addpd m2, m2, [dstq+lenq+1*mmsize] 227 addpd m3, m3, [dstq+lenq+2*mmsize] 228 addpd m4, m4, [dstq+lenq+3*mmsize] 229 %endif ; cpuflag 230 movaps [dstq+lenq], m1 231 movaps [dstq+lenq+1*mmsize], m2 232 movaps [dstq+lenq+2*mmsize], m3 233 movaps [dstq+lenq+3*mmsize], m4 234 sub lenq, mmsize*4 235 jge .loop 236 RET 237 %endmacro 238 239 INIT_XMM sse2 240 VECTOR_DMAC_SCALAR 241 %if HAVE_AVX_EXTERNAL 242 INIT_YMM avx 243 VECTOR_DMAC_SCALAR 244 %endif 245 %if HAVE_FMA3_EXTERNAL 246 INIT_YMM fma3 247 VECTOR_DMAC_SCALAR 248 %endif 249 250 ;------------------------------------------------------------------------------ 251 ; void ff_vector_dmul_scalar(double *dst, const double *src, double mul, 252 ; int len) 253 ;------------------------------------------------------------------------------ 254 255 %macro VECTOR_DMUL_SCALAR 0 256 %if ARCH_X86_32 257 cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr 258 mov lenq, lenaddrm 259 %elif UNIX64 260 cglobal vector_dmul_scalar, 3,3,3, dst, src, len 261 %else 262 cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len 263 %endif 264 %if ARCH_X86_32 265 VBROADCASTSD m0, mulm 266 %else 267 %if WIN64 268 SWAP 0, 2 269 %endif 270 movlhps xm0, xm0 271 %if cpuflag(avx) 272 vinsertf128 ym0, ym0, xm0, 1 273 
%endif 274 %endif 275 lea lenq, [lend*8-2*mmsize] 276 .loop: 277 mulpd m1, m0, [srcq+lenq ] 278 mulpd m2, m0, [srcq+lenq+mmsize] 279 movaps [dstq+lenq ], m1 280 movaps [dstq+lenq+mmsize], m2 281 sub lenq, 2*mmsize 282 jge .loop 283 RET 284 %endmacro 285 286 INIT_XMM sse2 287 VECTOR_DMUL_SCALAR 288 %if HAVE_AVX_EXTERNAL 289 INIT_YMM avx 290 VECTOR_DMUL_SCALAR 291 %endif 292 293 ;----------------------------------------------------------------------------- 294 ; vector_fmul_window(float *dst, const float *src0, 295 ; const float *src1, const float *win, int len); 296 ;----------------------------------------------------------------------------- 297 INIT_XMM sse 298 cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1 299 shl lend, 2 300 lea len1q, [lenq - mmsize] 301 add src0q, lenq 302 add dstq, lenq 303 add winq, lenq 304 neg lenq 305 .loop: 306 mova m0, [winq + lenq] 307 mova m4, [src0q + lenq] 308 mova m1, [winq + len1q] 309 mova m5, [src1q + len1q] 310 shufps m1, m1, 0x1b 311 shufps m5, m5, 0x1b 312 mova m2, m0 313 mova m3, m1 314 mulps m2, m4 315 mulps m3, m5 316 mulps m1, m4 317 mulps m0, m5 318 addps m2, m3 319 subps m1, m0 320 shufps m2, m2, 0x1b 321 mova [dstq + lenq], m1 322 mova [dstq + len1q], m2 323 sub len1q, mmsize 324 add lenq, mmsize 325 jl .loop 326 RET 327 328 ;----------------------------------------------------------------------------- 329 ; vector_fmul_add(float *dst, const float *src0, const float *src1, 330 ; const float *src2, int len) 331 ;----------------------------------------------------------------------------- 332 %macro VECTOR_FMUL_ADD 0 333 cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len 334 lea lenq, [lend*4 - 2*mmsize] 335 ALIGN 16 336 .loop: 337 mova m0, [src0q + lenq] 338 mova m1, [src0q + lenq + mmsize] 339 %if cpuflag(fma3) 340 mova m2, [src2q + lenq] 341 mova m3, [src2q + lenq + mmsize] 342 fmaddps m0, m0, [src1q + lenq], m2 343 fmaddps m1, m1, [src1q + lenq + mmsize], m3 344 %else 345 mulps m0, m0, 
[src1q + lenq] 346 mulps m1, m1, [src1q + lenq + mmsize] 347 addps m0, m0, [src2q + lenq] 348 addps m1, m1, [src2q + lenq + mmsize] 349 %endif 350 mova [dstq + lenq], m0 351 mova [dstq + lenq + mmsize], m1 352 353 sub lenq, 2*mmsize 354 jge .loop 355 RET 356 %endmacro 357 358 INIT_XMM sse 359 VECTOR_FMUL_ADD 360 %if HAVE_AVX_EXTERNAL 361 INIT_YMM avx 362 VECTOR_FMUL_ADD 363 %endif 364 %if HAVE_FMA3_EXTERNAL 365 INIT_YMM fma3 366 VECTOR_FMUL_ADD 367 %endif 368 369 ;----------------------------------------------------------------------------- 370 ; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, 371 ; int len) 372 ;----------------------------------------------------------------------------- 373 %macro VECTOR_FMUL_REVERSE 0 374 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len 375 %if cpuflag(avx2) 376 movaps m2, [pd_reverse] 377 %endif 378 lea lenq, [lend*4 - 2*mmsize] 379 ALIGN 16 380 .loop: 381 %if cpuflag(avx2) 382 vpermps m0, m2, [src1q] 383 vpermps m1, m2, [src1q+mmsize] 384 %elif cpuflag(avx) 385 vmovaps xmm0, [src1q + 16] 386 vinsertf128 m0, m0, [src1q], 1 387 vshufps m0, m0, m0, q0123 388 vmovaps xmm1, [src1q + mmsize + 16] 389 vinsertf128 m1, m1, [src1q + mmsize], 1 390 vshufps m1, m1, m1, q0123 391 %else 392 mova m0, [src1q] 393 mova m1, [src1q + mmsize] 394 shufps m0, m0, q0123 395 shufps m1, m1, q0123 396 %endif 397 mulps m0, m0, [src0q + lenq + mmsize] 398 mulps m1, m1, [src0q + lenq] 399 movaps [dstq + lenq + mmsize], m0 400 movaps [dstq + lenq], m1 401 add src1q, 2*mmsize 402 sub lenq, 2*mmsize 403 jge .loop 404 RET 405 %endmacro 406 407 INIT_XMM sse 408 VECTOR_FMUL_REVERSE 409 %if HAVE_AVX_EXTERNAL 410 INIT_YMM avx 411 VECTOR_FMUL_REVERSE 412 %endif 413 %if HAVE_AVX2_EXTERNAL 414 INIT_YMM avx2 415 VECTOR_FMUL_REVERSE 416 %endif 417 418 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len) 419 INIT_XMM sse 420 cglobal scalarproduct_float, 3,3,2, v1, v2, offset 421 shl offsetd, 2 422 add v1q, 
offsetq 423 add v2q, offsetq 424 neg offsetq 425 xorps xmm0, xmm0 426 .loop: 427 movaps xmm1, [v1q+offsetq] 428 mulps xmm1, [v2q+offsetq] 429 addps xmm0, xmm1 430 add offsetq, 16 431 js .loop 432 movhlps xmm1, xmm0 433 addps xmm0, xmm1 434 movss xmm1, xmm0 435 shufps xmm0, xmm0, 1 436 addss xmm0, xmm1 437 %if ARCH_X86_64 == 0 438 movss r0m, xmm0 439 fld dword r0m 440 %endif 441 RET 442 443 INIT_YMM fma3 444 cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset 445 xor offsetq, offsetq 446 xorps m0, m0, m0 447 shl sized, 2 448 mov lenq, sizeq 449 cmp lenq, 32 450 jl .l16 451 cmp lenq, 64 452 jl .l32 453 xorps m1, m1, m1 454 cmp lenq, 128 455 jl .l64 456 and lenq, ~127 457 xorps m2, m2, m2 458 xorps m3, m3, m3 459 .loop128: 460 movups m4, [v1q+offsetq] 461 movups m5, [v1q+offsetq + 32] 462 movups m6, [v1q+offsetq + 64] 463 movups m7, [v1q+offsetq + 96] 464 fmaddps m0, m4, [v2q+offsetq ], m0 465 fmaddps m1, m5, [v2q+offsetq + 32], m1 466 fmaddps m2, m6, [v2q+offsetq + 64], m2 467 fmaddps m3, m7, [v2q+offsetq + 96], m3 468 add offsetq, 128 469 cmp offsetq, lenq 470 jl .loop128 471 addps m0, m0, m2 472 addps m1, m1, m3 473 mov lenq, sizeq 474 and lenq, 127 475 cmp lenq, 64 476 jge .l64 477 addps m0, m0, m1 478 cmp lenq, 32 479 jge .l32 480 vextractf128 xmm2, m0, 1 481 addps xmm0, xmm2 482 cmp lenq, 16 483 jge .l16 484 movhlps xmm1, xmm0 485 addps xmm0, xmm1 486 movss xmm1, xmm0 487 shufps xmm0, xmm0, 1 488 addss xmm0, xmm1 489 %if ARCH_X86_64 == 0 490 movss r0m, xm0 491 fld dword r0m 492 %endif 493 RET 494 .l64: 495 and lenq, ~63 496 add lenq, offsetq 497 .loop64: 498 movups m4, [v1q+offsetq] 499 movups m5, [v1q+offsetq + 32] 500 fmaddps m0, m4, [v2q+offsetq], m0 501 fmaddps m1, m5, [v2q+offsetq + 32], m1 502 add offsetq, 64 503 cmp offsetq, lenq 504 jl .loop64 505 addps m0, m0, m1 506 mov lenq, sizeq 507 and lenq, 63 508 cmp lenq, 32 509 jge .l32 510 vextractf128 xmm2, m0, 1 511 addps xmm0, xmm2 512 cmp lenq, 16 513 jge .l16 514 movhlps xmm1, xmm0 515 addps 
xmm0, xmm1 516 movss xmm1, xmm0 517 shufps xmm0, xmm0, 1 518 addss xmm0, xmm1 519 %if ARCH_X86_64 == 0 520 movss r0m, xm0 521 fld dword r0m 522 %endif 523 RET 524 .l32: 525 and lenq, ~31 526 add lenq, offsetq 527 .loop32: 528 movups m4, [v1q+offsetq] 529 fmaddps m0, m4, [v2q+offsetq], m0 530 add offsetq, 32 531 cmp offsetq, lenq 532 jl .loop32 533 vextractf128 xmm2, m0, 1 534 addps xmm0, xmm2 535 mov lenq, sizeq 536 and lenq, 31 537 cmp lenq, 16 538 jge .l16 539 movhlps xmm1, xmm0 540 addps xmm0, xmm1 541 movss xmm1, xmm0 542 shufps xmm0, xmm0, 1 543 addss xmm0, xmm1 544 %if ARCH_X86_64 == 0 545 movss r0m, xm0 546 fld dword r0m 547 %endif 548 RET 549 .l16: 550 and lenq, ~15 551 add lenq, offsetq 552 .loop16: 553 movaps xmm1, [v1q+offsetq] 554 mulps xmm1, [v2q+offsetq] 555 addps xmm0, xmm1 556 add offsetq, 16 557 cmp offsetq, lenq 558 jl .loop16 559 movhlps xmm1, xmm0 560 addps xmm0, xmm1 561 movss xmm1, xmm0 562 shufps xmm0, xmm0, 1 563 addss xmm0, xmm1 564 %if ARCH_X86_64 == 0 565 movss r0m, xm0 566 fld dword r0m 567 %endif 568 RET 569 570 ;--------------------------------------------------------------------------------- 571 ; double scalarproduct_double(const double *v1, const double *v2, size_t len) 572 ;--------------------------------------------------------------------------------- 573 %macro SCALARPRODUCT_DOUBLE 0 574 cglobal scalarproduct_double, 3,3,8, v1, v2, offset 575 shl offsetq, 3 576 add v1q, offsetq 577 add v2q, offsetq 578 neg offsetq 579 xorpd m0, m0 580 xorpd m1, m1 581 movapd m2, m0 582 movapd m3, m1 583 align 16 584 .loop: 585 movapd m4, [v1q+offsetq+mmsize*0] 586 movapd m5, [v1q+offsetq+mmsize*1] 587 movapd m6, [v1q+offsetq+mmsize*2] 588 movapd m7, [v1q+offsetq+mmsize*3] 589 mulpd m4, [v2q+offsetq+mmsize*0] 590 mulpd m5, [v2q+offsetq+mmsize*1] 591 mulpd m6, [v2q+offsetq+mmsize*2] 592 mulpd m7, [v2q+offsetq+mmsize*3] 593 addpd m0, m4 594 addpd m1, m5 595 addpd m2, m6 596 addpd m3, m7 597 add offsetq, mmsize*4 598 jl .loop 599 addpd m0, m1 600 
addpd m2, m3 601 addpd m0, m2 602 %if mmsize == 32 603 vextractf128 xm1, m0, 1 604 addpd xm0, xm1 605 %endif 606 movhlps xm1, xm0 607 addsd xm0, xm1 608 %if ARCH_X86_64 == 0 609 movsd r0m, xm0 610 fld qword r0m 611 %endif 612 RET 613 %endmacro 614 615 INIT_XMM sse2 616 SCALARPRODUCT_DOUBLE 617 %if HAVE_AVX_EXTERNAL 618 INIT_YMM avx 619 SCALARPRODUCT_DOUBLE 620 %endif 621 622 ;----------------------------------------------------------------------------- 623 ; void ff_butterflies_float(float *src0, float *src1, int len); 624 ;----------------------------------------------------------------------------- 625 INIT_XMM sse 626 cglobal butterflies_float, 3,3,3, src0, src1, len 627 shl lend, 2 628 add src0q, lenq 629 add src1q, lenq 630 neg lenq 631 .loop: 632 mova m0, [src0q + lenq] 633 mova m1, [src1q + lenq] 634 subps m2, m0, m1 635 addps m0, m0, m1 636 mova [src1q + lenq], m2 637 mova [src0q + lenq], m0 638 add lenq, mmsize 639 jl .loop 640 RET