x86inc.asm (59540B)
1 ;***************************************************************************** 2 ;* x86inc.asm: x86 abstraction layer 3 ;***************************************************************************** 4 ;* Copyright (C) 2005-2024 x264 project 5 ;* 6 ;* Authors: Loren Merritt <lorenm@u.washington.edu> 7 ;* Henrik Gramner <henrik@gramner.com> 8 ;* Anton Mitrofanov <BugMaster@narod.ru> 9 ;* Fiona Glaser <fiona@x264.com> 10 ;* 11 ;* Permission to use, copy, modify, and/or distribute this software for any 12 ;* purpose with or without fee is hereby granted, provided that the above 13 ;* copyright notice and this permission notice appear in all copies. 14 ;* 15 ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 16 ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 17 ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 18 ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 20 ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 21 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 22 ;***************************************************************************** 23 24 ; This is a header file for the x86inc.asm assembly language, which uses 25 ; NASM/YASM syntax combined with a large number of macros to provide easy 26 ; abstraction between different calling conventions (x86_32, win64, linux64). 27 ; It also has various other useful features to simplify writing the kind of 28 ; DSP functions that are most often used. 
; Build-time configuration: symbol prefixes, stack-alignment assumptions,
; target ABI (WIN64 vs UNIX64) and object-format detection. All of this is
; pure preprocessor state evaluated once at assembly time.

%ifndef private_prefix
    %error private_prefix not defined
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

; STACK_ALIGNMENT = the alignment the stack is known to have on function
; entry. If the build system guarantees an aligned stack, assume 16 bytes;
; otherwise fall back to the ABI minimum (16 on x86-64, 4 on x86-32).
%if HAVE_ALIGNED_STACK
    %define STACK_ALIGNMENT 16
%endif
%ifndef STACK_ALIGNMENT
    %if ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif

; Select the 64-bit calling convention from the output format. Note that the
; "win32" format is also treated as WIN64 here because this branch is only
; reached when ARCH_X86_64 is set.
%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%define FORMAT_ELF 0
%define FORMAT_MACHO 0
%ifidn __OUTPUT_FORMAT__,elf
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,macho
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho32
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho64
    %define FORMAT_MACHO 1
%endif

; C symbol mangling: prepend an underscore on platforms that require it
; (PREFIX is set by the build system, e.g. for Mach-O or Win32).
%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Use VEX-encoding even in non-AVX functions
%ifndef FORCE_VEX_ENCODING
    %define FORCE_VEX_ENCODING 0
%endif

; aout does not support align=
; NOTE: This section is out of sync with x264, in order to
; keep supporting OS/2.
; Emit a read-only data section with the requested alignment (default 16),
; falling back to .text for formats without a proper rodata section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %elifidn __OUTPUT_FORMAT__,coff
        SECTION .text
    %elifidn __OUTPUT_FORMAT__,win32
        SECTION .rdata align=%1
    %elif WIN64
        SECTION .rdata align=%1
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

%if ARCH_X86_64
    %define PIC 1 ; always use PIC on x86-64
    default rel   ; make all memory operands RIP-relative by default
%elifidn __OUTPUT_FORMAT__,win32
    %define PIC 0 ; PIC isn't used on 32-bit Windows
%elifndef PIC
    %define PIC 0
%endif

; NASM's :private_extern symbol visibility for Mach-O requires NASM >= 2.14.
%define HAVE_PRIVATE_EXTERN 1
%ifdef __NASM_VERSION_ID__
    %use smartalign
    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
        %define HAVE_PRIVATE_EXTERN 0
    %endif
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most use cases.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

; Define the full r<N>{q,d,w,b,h,m,mp} family for one argument register.
; %1 = argument index, %2 = native register name,
; %3 = (optional) stack offset of the argument's home location; when omitted
;      the argument lives in a register and rNm/rNmp alias that register.
%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

; Map the classic x86 register names onto the same naming scheme as above
; (e.g. raxq/eaxd/...); on x86-32 the native-size name collapses to the
; 32-bit register.
%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null ; sil/dil/bpl have no high-8-bit counterpart
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex
; than just function arguments

; Map t0..tN onto an arbitrary list of argument registers, for kernels whose
; register allocation differs between architectures.
%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

; Define the size-suffixed variants (tNq/tNd/...) of the t# aliases.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

; Load the address of %2 into %1, in a PIC-safe way on 32-bit.
%macro LEA 2
%if ARCH_X86_64
    lea %1, [%2]
%elif PIC
    call $+5 ; special-cased to not affect the RSB on most CPU:s
    pop %1
    add %1, (%2)-$+1
%else
    mov %1, %2
%endif
%endmacro

; Repeats an instruction/operation for multiple arguments.
; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3"
%macro REPX 2-* ; operation, args
    %xdefine %%f(x) %1 ; %%f substitutes each later arg for "x" in the operation
    %rep %0 - 1
        %rotate 1
        %%f(%1)
    %endrep
%endmacro

; PUSH/POP/SUB/ADD wrappers that keep stack_offset (the distance between rsp
; and the return address) in sync, so rNm stack-argument references stay valid.
; Tracking only happens while rstk == rsp, i.e. no manual realignment active.
%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

; Push/pop/load argument registers with index >= the number actually used is
; skipped; used by PROLOGUE/RET to only save the callee-saved regs in use.
%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

; mov that is omitted entirely when source and destination are identical.
%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%if ARCH_X86_64 == 0
    %define movsxd movifnidn ; no sign-extension needed on 32-bit
%endif

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

; Assembly-time assertion; fails the build if the expression is zero.
%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro

; Give readable names to the argument registers (dstq, srcd, ...), undefining
; any names set up by a previous DEFINE_ARGS invocation first.
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

; Stack alignment required by the current vector size, rounded up to >= 16.
%define required_stack_alignment ((mmsize + 15) & ~15)
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))

; Large stack allocations on Windows need to use stack probing in order
; to guarantee that all stack memory is committed before accessing it.
; This is done by ensuring that the guard page(s) at the end of the
; currently committed pages are touched prior to any pages beyond that.
%if WIN64
    %assign STACK_PROBE_SIZE 8192
%elifidn __OUTPUT_FORMAT__, win32
    %assign STACK_PROBE_SIZE 4096
%else
    %assign STACK_PROBE_SIZE 0 ; non-Windows targets don't need probing
%endif

; Touch every guard-page interval below rsp before the real allocation, so
; Windows commits the pages. No-op where STACK_PROBE_SIZE is 0.
%macro PROBE_STACK 1 ; stack_size
    %if STACK_PROBE_SIZE
        %assign %%i STACK_PROBE_SIZE
        %rep %1 / STACK_PROBE_SIZE
            mov eax, [rsp-%%i]
            %assign %%i %%i+STACK_PROBE_SIZE
        %endrep
    %endif
%endmacro

; Reset all per-function stack bookkeeping. If a manual-alignment register
; (rstk != rsp) was active, re-point rstk at rsp instead of adjusting
; stack_offset, since the padding was never reflected in stack_offset.
%macro RESET_STACK_STATE 0
    %ifidn rstk, rsp
        %assign stack_offset stack_offset - stack_size_padded
    %else
        %xdefine rstk rsp
    %endif
    %assign stack_size 0
    %assign stack_size_padded 0
    %assign xmm_regs_used 0
%endmacro

; Allocate local stack space (and WIN64 xmm spill space), aligning the stack
; as required. A negative stack_size requests that the original rsp be saved
; on the stack instead of occupying an extra GPR (see PROLOGUE docs).
%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs
    RESET_STACK_STATE
    %ifnum %2
        %if mmsize != 8
            %assign xmm_regs_used %2
        %endif
    %endif
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if xmm_regs_used > 8
                    %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                PROBE_STACK stack_size_padded
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                PROBE_STACK stack_size_padded
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

; If manual stack realignment will be needed, bump regs_used so that a spare
; register is available to hold the original stack pointer.
%macro SETUP_STACK_POINTER 0-1 0
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                ; Reserve an additional register for storing the original stack pointer, but avoid using
                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                %assign regs_used (regs_used + 1)
                %if ARCH_X86_64 && regs_used == 7
                    %assign regs_used 8
                %elif ARCH_X86_64 == 0 && regs_used == 1
                    %assign regs_used 2
                %endif
            %endif
            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
                %assign regs_used 5 + UNIX64 * 3
            %endif
        %endif
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

; Argument registers per the Microsoft x64 calling convention; offsets are
; the stack home locations of arguments 4+ (relative to the return address).
DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5 ; %4 was an arg name, not a stack size
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

; Push XMM registers to the stack. If no argument is specified all used register
; will be pushed, otherwise only push previously unpushed registers.
%macro WIN64_PUSH_XMM 0-2 ; new_xmm_regs_used, xmm_regs_pushed
    %if mmsize != 8
        %if %0 == 2
            %assign %%pushed %2
            %assign xmm_regs_used %1
        %elif %0 == 1
            %assign %%pushed xmm_regs_used
            %assign xmm_regs_used %1
        %else
            %assign %%pushed 0
        %endif
        ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
        %if %%pushed <= 6 + high_mm_regs && xmm_regs_used > 6 + high_mm_regs
            movaps [rstk + stack_offset +  8], xmm6
        %endif
        %if %%pushed <= 7 + high_mm_regs && xmm_regs_used > 7 + high_mm_regs
            movaps [rstk + stack_offset + 24], xmm7
        %endif
        %assign %%pushed %%pushed - high_mm_regs - 8
        %if %%pushed < 0
            %assign %%pushed 0
        %endif
        %assign %%regs_to_push xmm_regs_used - %%pushed - high_mm_regs - 8
        %if %%regs_to_push > 0
            ; verify that enough spill space was allocated by ALLOC_STACK/WIN64_SPILL_XMM
            ASSERT (%%regs_to_push + %%pushed) * 16 <= stack_size_padded - stack_size - 32
            %assign %%i %%pushed + 8
            %rep %%regs_to_push
                movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
                %assign %%i %%i+1
            %endrep
        %endif
    %endif
%endmacro

; Allocated stack space for XMM registers and push all, or a subset, of those
%macro WIN64_SPILL_XMM 1-2 ; xmm_regs_used, xmm_regs_reserved
    RESET_STACK_STATE
    %if mmsize != 8
        %assign xmm_regs_used %1
        ASSERT xmm_regs_used <= 16 + high_mm_regs
        %if %0 == 2
            ASSERT %2 >= %1
            %assign %%xmm_regs_on_stack %2 - high_mm_regs - 8
        %else
            %assign %%xmm_regs_on_stack %1 - high_mm_regs - 8
        %endif
        %if %%xmm_regs_on_stack > 0
            ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
            %assign %%pad %%xmm_regs_on_stack*16 + 32
            %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
            SUB rsp, stack_size_padded
        %endif
        WIN64_PUSH_XMM
    %endif
%endmacro

; Restore callee-saved xmm registers and deallocate stack space, without
; resetting the bookkeeping (so RET can still consult it).
%macro WIN64_RESTORE_XMM_INTERNAL 0
    %assign %%pad_size 0
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i xmm_regs_used - high_mm_regs
        %rep %%xmm_regs_on_stack
            %assign %%i %%i-1
            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    ; xmm6/xmm7 live in the caller-provided shadow space
    %if xmm_regs_used > 7 + high_mm_regs
        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6 + high_mm_regs
        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 0
    WIN64_RESTORE_XMM_INTERNAL
    RESET_STACK_STATE
%endmacro

%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

; Argument registers per the System V AMD64 ABI; offsets are the stack
; locations of arguments 6+ (relative to the return address).
DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R14, 48
DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

; On x86-32 all arguments are on the stack; offsets are relative to the
; return address.
DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

; Stack-only argument locations for args beyond the 7 GPRs.
%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ; only 7 GPRs exist; extra args stay on the stack
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4, %3
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

; No-op (bookkeeping-only) versions of the WIN64 xmm macros for other ABIs,
; so callers can use them unconditionally.
%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1-2
        RESET_STACK_STATE
        %if mmsize != 8
            %assign xmm_regs_used %1
        %endif
    %endmacro
    %macro WIN64_RESTORE_XMM 0
        RESET_STACK_STATE
    %endmacro
    %macro WIN64_PUSH_XMM 0-2
        %if mmsize != 8 && %0 >= 1
            %assign xmm_regs_used %1
        %endif
    %endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue || cpuflag(ssse3)
        RET
    %else
        rep ret
    %endif
    annotate_function_size
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %if notcpuflag(ssse3)
        ; emits a rep prefix only when this ret directly follows a branch
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %endif
    ret
    annotate_function_size
%endmacro

; Redefine every conditional-branch mnemonic to record the address after the
; branch, so AUTO_REP_RET can detect "ret directly after a branch".
%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %if notcpuflag(ssse3)
                %%branch_instr equ $
                %xdefine last_branch_adr %%branch_instr
            %endif
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

; Tail-call %1; falls back to call+RET when an epilogue is needed. Set the
; second argument to 0 when the callee is placed directly after this function
; to simply fall through.
%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
    annotate_function_size
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
; Like cglobal, but the symbol gets default (public) visibility.
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
; %1 = hidden-visibility flag, %2 = mangled+suffixed name, %3 = PROLOGUE args
%macro cglobal_internal 2-3+
    annotate_function_size ; finalize the size annotation of the previous function
    %ifndef cglobaled_%2
        %if %1
            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
        %else
            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
        %endif
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    %if FORMAT_ELF
        %if %1
            global %2:function hidden
        %else
            global %2:function
        %endif
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
        global %2:private_extern
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

; Create a global symbol from a local label with the correct name mangling and type
%macro cglobal_label 1
    %if FORMAT_ELF
        global current_function %+ %1:function hidden
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global current_function %+ %1:private_extern
    %else
        global current_function %+ %1
    %endif
%1:
%endmacro

; Declare an external symbol with the private prefix and mangling applied.
%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; Define a global data symbol with proper mangling and visibility;
; any extra arguments become the initializer on the label's line.
%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %if FORMAT_ELF
        global %1:data hidden
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global %1:private_extern
    %else
        global %1
    %endif
%1: %2
%endmacro

%if FORMAT_ELF
    ; The GNU linker assumes the stack is executable by default.
    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]

    %ifdef __NASM_VERSION_ID__
        %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03
            %if ARCH_X86_64
                ; Control-flow Enforcement Technology (CET) properties.
                [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize]
                dd 0x00000004  ; n_namesz
                dd gprsize + 8 ; n_descsz
                dd 0x00000005  ; n_type = NT_GNU_PROPERTY_TYPE_0
                db "GNU",0     ; n_name
                dd 0xc0000002  ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
                dd 0x00000004  ; pr_datasz
                dd 0x00000002  ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK
                dd 0x00000000  ; pr_padding
            %endif
        %endif
    %endif
%endif

; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
%macro annotate_function_size 0
    %ifdef __YASM_VER__ ; only YASM supports the "size" directive
        %ifdef current_function
            %if FORMAT_ELF
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags
; Each flag implies the flags it is OR'd with, so testing a flag also
; matches any superset feature level.

%assign cpuflags_mmx       (1<<0)
%assign cpuflags_mmx2      (1<<1)  | cpuflags_mmx
%assign cpuflags_3dnow     (1<<2)  | cpuflags_mmx
%assign cpuflags_3dnowext  (1<<3)  | cpuflags_3dnow
%assign cpuflags_sse       (1<<4)  | cpuflags_mmx2
%assign cpuflags_sse2      (1<<5)  | cpuflags_sse
%assign cpuflags_sse2slow  (1<<6)  | cpuflags_sse2
%assign cpuflags_lzcnt     (1<<7)  | cpuflags_sse2
%assign cpuflags_sse3      (1<<8)  | cpuflags_sse2
%assign cpuflags_ssse3     (1<<9)  | cpuflags_sse3
%assign cpuflags_sse4      (1<<10) | cpuflags_ssse3
%assign cpuflags_sse42     (1<<11) | cpuflags_sse4
%assign cpuflags_aesni     (1<<12) | cpuflags_sse42
%assign cpuflags_clmul     (1<<13) | cpuflags_sse42
%assign cpuflags_gfni      (1<<14) | cpuflags_aesni|cpuflags_clmul
%assign cpuflags_avx       (1<<15) | cpuflags_sse42
%assign cpuflags_xop       (1<<16) | cpuflags_avx
%assign cpuflags_fma4      (1<<17) | cpuflags_avx
%assign cpuflags_fma3      (1<<18) | cpuflags_avx
%assign cpuflags_bmi1      (1<<19) | cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2      (1<<20) | cpuflags_bmi1
%assign cpuflags_avx2      (1<<21) | cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512    (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL
%assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ

%assign cpuflags_cache32   (1<<24)
%assign cpuflags_cache64   (1<<25)
%assign cpuflags_aligned   (1<<26) ; not a cpu feature, but a function variant
%assign cpuflags_atom      (1<<27)

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
; Evaluates to 1 iff ALL bits of cpuflags_x are set in cpuflags, computed
; branchlessly: (cpuflags & mask) ^ mask is 0 only on a full match, whose
; -1 has the sign bit set, which the shift-and-mask extracts.
%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1 ; concatenate multiple flag names
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        ; use the float forms of the mov instructions when the integer forms
        ; of the current vector size aren't available
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova ; caller guarantees alignment
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    ; Pick the NOP-padding strategy: long NOPs on CPUs that handle them well.
    %if ARCH_X86_64 || cpuflag(sse2)
        %ifdef __NASM_VERSION_ID__
            ALIGNMODE p6
        %else
            CPU amdnop
        %endif
    %else
        %ifdef __NASM_VERSION_ID__
            ALIGNMODE nop
        %else
            CPU basicnop
        %endif
    %endif
%endmacro

; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
; Token-pasting helpers: define/undefine the single-line macro named %1%2.
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

; Define m0..m(num_mmregs-1) as aliases for the given register type (mm/xmm/
; ymm/zmm) and the reverse nn<type><reg> -> index mapping used by SWAP/PERMUTE.
%macro DEFINE_MMREGS 1 ; mmtype
    %assign %%prev_mmregs 0
    %ifdef num_mmregs
        %assign %%prev_mmregs num_mmregs
    %endif

    ; 8 regs in 32-bit mode; 16 on x86-64; 32 with AVX-512 (or zmm-sized regs).
    %assign num_mmregs 8
    %if ARCH_X86_64 && mmsize >= 16
        %assign num_mmregs 16
        %if cpuflag(avx512) || mmsize == 64
            %assign num_mmregs 32
        %endif
    %endif

    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1 %+ %%i
        CAT_XDEFINE nn%1, %%i, %%i
        %assign %%i %%i+1
    %endrep
    ; If the previous mode exposed more registers, drop the now-stale aliases.
    ; Note: mmtype still holds the *previous* type here; it is updated below.
    %if %%prev_mmregs > num_mmregs
        %rep %%prev_mmregs - num_mmregs
            CAT_UNDEF m, %%i
            CAT_UNDEF nn %+ mmtype, %%i
            %assign %%i %%i+1
        %endrep
    %endif
    %xdefine mmtype %1
%endmacro

; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
    %if ARCH_X86_64 && cpuflag(avx512)
        %assign %%i %1
        %rep 16-%1
            %assign %%i_high %%i+16
            SWAP %%i, %%i_high
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; INIT_MMX/XMM/YMM/ZMM select the working SIMD register size and (re)define the
; generic mov* aliases accordingly. Extra arguments are cpuflags forwarded to
; INIT_CPUFLAGS (which may further override mova/movu/movnta).
%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS mm
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled FORCE_VEX_ENCODING
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS xmm
    %if WIN64
        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
    %endif
    ; EVEX embedded-broadcast suffixes for the current element count.
    %xdefine bcstw 1to8
    %xdefine bcstd 1to4
    %xdefine bcstq 1to2
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS ymm
    AVX512_MM_PERMUTATION
    %xdefine bcstw 1to16
    %xdefine bcstd 1to8
    %xdefine bcstq 1to4
%endmacro

%macro INIT_ZMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_ZMM %1
    %define mmsize 64
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS zmm
    AVX512_MM_PERMUTATION
    %xdefine bcstw 1to32
    %xdefine bcstd 1to16
    %xdefine bcstq 1to8
%endmacro

; Default state: 128-bit SSE.
INIT_XMM

; Define the cast aliases for register %1: <totype><fromtype>N maps a register
; of one type to the same-numbered register of another (mm regs have no wider
; view, so mm<x/y/z>mmN stays mmN), and xm#/ym#/zm# resolve through the current
; m# permutation.
%macro DECLARE_MMCAST 1
    %define mmmm%1 mm%1
    %define mmxmm%1 mm%1
    %define mmymm%1 mm%1
    %define mmzmm%1 mm%1
    %define xmmmm%1 mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define xmmzmm%1 xmm%1
    %define ymmmm%1 mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define ymmzmm%1 ymm%1
    %define zmmmm%1 mm%1
    %define zmmxmm%1 xmm%1
    %define zmmymm%1 ymm%1
    %define zmmzmm%1 zmm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
    %define zm%1 zmm %+ m%1
%endmacro

%assign i 0
%rep 32
    DECLARE_MMCAST i
    %assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
    ; First pass: snapshot the current name of every destination slot...
    %rep %0/2
        %xdefine %%tmp%2 m%2
        %rotate 2
    %endrep
    ; ...second pass: reassign each m<dst> from the snapshot and refresh the
    ; reverse nn -> index mapping.
    %rep %0/2
        %xdefine m%1 %%tmp%2
        CAT_XDEFINE nn, m%1, %1
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
    %ifnum %1 ; SWAP 0, 1, ...
        SWAP_INTERNAL_NUM %1, %2
    %else ; SWAP m0, m1, ...
        SWAP_INTERNAL_NAME %1, %2
    %endif
%endmacro

; Pairwise-swap along the chain of register *indices*, updating both the m# and
; nn# mappings at each step.
%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

; Translate register *names* to indices via nn#, then defer to the numeric form.
%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
; Record the current m# -> physical register permutation under
; <name>_m0, <name>_m1, ... (defaulting to the current function's name),
; so LOAD_MM_PERMUTATION can restore it at call sites.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        %xdefine %%tmp m %+ %%i
        CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    ; Only proceed if a permutation was actually saved (<name>_m0 is numeric).
    %xdefine %%tmp %%f %+ 0
    %ifnum %%tmp
        ; Reset to the identity mapping first, then apply the saved indices.
        DEFINE_MMREGS mmtype
        %assign %%i 0
        %rep num_mmregs
            %xdefine %%tmp %%f %+ %%i
            CAT_XDEFINE %%m, %%i, m %+ %%tmp
            %assign %%i %%i+1
        %endrep
        %rep num_mmregs
            %assign %%i %%i-1
            CAT_XDEFINE m, %%i, %%m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    %ifid %1
        call_internal %1 %+ SUFFIX, %1
    %else
        call %1
    %endif
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    ; Pick up any permutation the callee saved via SAVE_MM_PERMUTATION.
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
; (+128 doesn't fit in a signed imm8, but -(-128) does).
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

; Per-register-name lookup tables: sizeof<reg> (bytes) and regnumof<reg>.
; %ifnum sizeof<x> also doubles as an "is this a SIMD register?" test.
%assign i 0
%rep 32
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
        CAT_XDEFINE regnumofmm, i, i
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    CAT_XDEFINE sizeofzmm, i, 64
    CAT_XDEFINE regnumofxmm, i, i
    CAT_XDEFINE regnumofymm, i, i
    CAT_XDEFINE regnumofzmm, i, i
    %assign i i+1
%endrep
%undef i

; Error out if the destination aliases any source other than src1: the SSE
; emulation below (mov dst, src1 / op dst, src2) would clobber it.
%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%6+: operands
%macro RUN_AVX_INSTR 6-9+
    ; Determine the operating register size from the operands (falling back to
    ; the current mmsize for memory-only forms).
    %ifnum sizeof%7
        %assign __sizeofreg sizeof%7
    %elifnum sizeof%6
        %assign __sizeofreg sizeof%6
    %else
        %assign __sizeofreg mmsize
    %endif
    ; Non-AVX mode with a distinct destination operand requires emulating the
    ; non-destructive 3/4-operand form with a mov + destructive op.
    %assign __emulate_avx 0
    %if avx_enabled && __sizeofreg >= 16
        %xdefine __instr v%1
    %else
        %xdefine __instr %1
        %if %0 >= 8+%4
            %assign __emulate_avx 1
        %endif
    %endif
    ; Compile-time ISA checking: reject instructions not covered by the cpuflags
    ; the current function was declared with.
    %ifnidn %2, fnord
        %ifdef cpuname
            %if notcpuflag(%2)
                %error use of ``%1'' %2 instruction in cpuname function: current_function
            %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
                %error use of ``%1'' sse2 instruction in cpuname function: current_function
            %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
                %error use of ``%1'' avx2 instruction in cpuname function: current_function
            %elif __sizeofreg == 16 && notcpuflag(sse)
                %error use of ``%1'' sse instruction in cpuname function: current_function
            %elif __sizeofreg == 32 && notcpuflag(avx)
                %error use of ``%1'' avx instruction in cpuname function: current_function
            %elif __sizeofreg == 64 && notcpuflag(avx512)
                %error use of ``%1'' avx512 instruction in cpuname function: current_function
            %elifidn %1, pextrw ; special case because the base instruction is mmx2,
                %ifnid %6       ; but sse4 is required for memory operands
                    %if notcpuflag(sse4)
                        %error use of ``%1'' sse4 instruction in cpuname function: current_function
                    %endif
                %endif
            %endif
        %endif
    %endif

    %if __emulate_avx
        %xdefine __src1 %7
        %xdefine __src2 %8
        %if %5 && %4 == 0
            %ifnidn %6, %7
                %ifidn %6, %8
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %elifnnum sizeof%8
                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                    ; So, if the instruction is commutative with a memory arg, swap them.
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %endif
            %endif
        %endif
        %ifnidn %6, __src1
            ; dst != src1: emit the mov, after verifying dst doesn't alias src2.
            %if %0 >= 9
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
            %else
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
            %endif
            %if __sizeofreg == 8
                MOVQ %6, __src1
            %elif %3
                MOVAPS %6, __src1
            %else
                MOVDQA %6, __src1
            %endif
        %endif
        %if %0 >= 9
            %1 %6, __src2, %9
        %else
            %1 %6, __src2
        %endif
    %elif %0 >= 9
        ; 4-operand AVX form; if src1 is not a register, load it first.
        %if avx_enabled && __sizeofreg >= 16 && %4 == 1
            %ifnnum regnumof%7
                %if %3
                    vmovaps %6, %7
                %else
                    vmovdqa %6, %7
                %endif
                __instr %6, %6, %8, %9
            %else
                __instr %6, %7, %8, %9
            %endif
        %else
            __instr %6, %7, %8, %9
        %endif
    %elif %0 == 8
        %if avx_enabled && __sizeofreg >= 16 && %4 == 0
            %xdefine __src1 %7
            %xdefine __src2 %8
            %if %5
                %ifnum regnumof%7
                    %ifnum regnumof%8
                        %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
                            ; Most VEX-encoded instructions require an additional byte to encode when
                            ; src2 is a high register (e.g. m8..15). If the instruction is commutative
                            ; we can swap src1 and src2 when doing so reduces the instruction length.
                            %xdefine __src1 %8
                            %xdefine __src2 %7
                        %endif
                    %endif
                %elifnum regnumof%8 ; put memory operands in src2 when possible
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %else
                    %assign __emulate_avx 1
                %endif
            %elifnnum regnumof%7
                ; EVEX allows imm8 shift instructions to be used with memory operands,
                ; but VEX does not. This handles those special cases.
                %ifnnum %8
                    %assign __emulate_avx 1
                %elif notcpuflag(avx512)
                    %assign __emulate_avx 1
                %endif
            %endif
            %if __emulate_avx ; a separate load is required
                %if %3
                    vmovaps %6, %7
                %else
                    vmovdqa %6, %7
                %endif
                __instr %6, %6, %8
            %else
                __instr %6, __src1, __src2
            %endif
        %else
            __instr %6, %7, %8
        %endif
    %elif %0 == 7
        ; 2-operand form used as dst-is-src1 3-operand; apply the same
        ; high-register swap trick for commutative instructions.
        %if avx_enabled && __sizeofreg >= 16 && %5
            %xdefine __src1 %6
            %xdefine __src2 %7
            %ifnum regnumof%6
                %ifnum regnumof%7
                    %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
                        %xdefine __src1 %7
                        %xdefine __src2 %6
                    %endif
                %endif
            %endif
            __instr %6, __src1, __src2
        %else
            __instr %6, %7
        %endif
    %else
        __instr %6
    %endif
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
; Defines a macro shadowing the bare instruction mnemonic that forwards its
; operands (%1..%5 of the *inner* macro) plus the table metadata to
; RUN_AVX_INSTR, counting how many operands were actually supplied.
%macro AVX_INSTR 1-5 fnord, 0, 255, 0
    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
AVX_INSTR addsd, sse2, 1, 0, 0
AVX_INSTR addss, sse, 1, 0, 0
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
AVX_INSTR aesdec, aesni, 0, 0, 0
AVX_INSTR aesdeclast, aesni, 0, 0, 0
AVX_INSTR aesenc, aesni, 0, 0, 0
AVX_INSTR aesenclast, aesni, 0, 0, 0
AVX_INSTR aesimc, aesni
AVX_INSTR aeskeygenassist, aesni
AVX_INSTR andnpd, sse2, 1, 0, 0
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
AVX_INSTR andps, sse, 1, 0, 1
AVX_INSTR blendpd, sse4, 1, 1, 0
AVX_INSTR blendps, sse4, 1, 1, 0
AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR cmpeqpd, sse2, 1, 0, 1
AVX_INSTR cmpeqps, sse, 1, 0, 1
AVX_INSTR cmpeqsd, sse2, 1, 0, 0
AVX_INSTR cmpeqss, sse, 1, 0, 0
AVX_INSTR cmplepd, sse2, 1, 0, 0
AVX_INSTR cmpleps, sse, 1, 0, 0
AVX_INSTR cmplesd, sse2, 1, 0, 0
AVX_INSTR cmpless, sse, 1, 0, 0
AVX_INSTR cmpltpd, sse2, 1, 0, 0
AVX_INSTR cmpltps, sse, 1, 0, 0
AVX_INSTR cmpltsd, sse2, 1, 0, 0
AVX_INSTR cmpltss, sse, 1, 0, 0
; (continuation of the VEX/EVEX + legacy instruction table declared by
; AVX_INSTR above: instruction, minimal ISA, float?, emulation type, commutative?)
AVX_INSTR cmpneqpd, sse2, 1, 0, 1
AVX_INSTR cmpneqps, sse, 1, 0, 1
AVX_INSTR cmpneqsd, sse2, 1, 0, 0
AVX_INSTR cmpneqss, sse, 1, 0, 0
AVX_INSTR cmpnlepd, sse2, 1, 0, 0
AVX_INSTR cmpnleps, sse, 1, 0, 0
AVX_INSTR cmpnlesd, sse2, 1, 0, 0
AVX_INSTR cmpnless, sse, 1, 0, 0
AVX_INSTR cmpnltpd, sse2, 1, 0, 0
AVX_INSTR cmpnltps, sse, 1, 0, 0
AVX_INSTR cmpnltsd, sse2, 1, 0, 0
AVX_INSTR cmpnltss, sse, 1, 0, 0
; NOTE(review): the four cmpord* entries below were missing the comma after the
; instruction-set argument ("sse2 1" parsed as a single macro argument), which
; shifted the float/emulation/commutative flags relative to every sibling entry
; (compare cmpunord* below). Commas restored to match the intended metadata.
AVX_INSTR cmpordpd, sse2, 1, 0, 1
AVX_INSTR cmpordps, sse, 1, 0, 1
AVX_INSTR cmpordsd, sse2, 1, 0, 0
AVX_INSTR cmpordss, sse, 1, 0, 0
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
AVX_INSTR cmpss, sse, 1, 1, 0
AVX_INSTR cmpunordpd, sse2, 1, 0, 1
AVX_INSTR cmpunordps, sse, 1, 0, 1
AVX_INSTR cmpunordsd, sse2, 1, 0, 0
AVX_INSTR cmpunordss, sse, 1, 0, 0
AVX_INSTR comisd, sse2, 1
AVX_INSTR comiss, sse, 1
AVX_INSTR cvtdq2pd, sse2, 1
AVX_INSTR cvtdq2ps, sse2, 1
AVX_INSTR cvtpd2dq, sse2, 1
AVX_INSTR cvtpd2ps, sse2, 1
AVX_INSTR cvtps2dq, sse2, 1
AVX_INSTR cvtps2pd, sse2, 1
AVX_INSTR cvtsd2si, sse2, 1
AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
AVX_INSTR cvtsi2ss, sse, 1, 0, 0
AVX_INSTR cvtss2sd, sse2, 1, 0, 0
AVX_INSTR cvtss2si, sse, 1
AVX_INSTR cvttpd2dq, sse2, 1
AVX_INSTR cvttps2dq, sse2, 1
AVX_INSTR cvttsd2si, sse2, 1
AVX_INSTR cvttss2si, sse, 1
AVX_INSTR divpd, sse2, 1, 0, 0
AVX_INSTR divps, sse, 1, 0, 0
AVX_INSTR divsd, sse2, 1, 0, 0
AVX_INSTR divss, sse, 1, 0, 0
AVX_INSTR dppd, sse4, 1, 1, 0
AVX_INSTR dpps, sse4, 1, 1, 0
AVX_INSTR extractps, sse4, 1
AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
AVX_INSTR haddpd, sse3, 1, 0, 0
AVX_INSTR haddps, sse3, 1, 0, 0
AVX_INSTR hsubpd, sse3, 1, 0, 0
AVX_INSTR hsubps, sse3, 1, 0, 0
AVX_INSTR insertps, sse4, 1, 1, 0
AVX_INSTR lddqu, sse3
AVX_INSTR ldmxcsr, sse, 1
AVX_INSTR maskmovdqu, sse2
AVX_INSTR maxpd, sse2, 1, 0, 1
AVX_INSTR maxps, sse, 1, 0, 1
AVX_INSTR maxsd, sse2, 1, 0, 0
AVX_INSTR maxss, sse, 1, 0, 0
AVX_INSTR minpd, sse2, 1, 0, 1
AVX_INSTR minps, sse, 1, 0, 1
AVX_INSTR minsd, sse2, 1, 0, 0
AVX_INSTR minss, sse, 1, 0, 0
AVX_INSTR movapd, sse2, 1
AVX_INSTR movaps, sse, 1
AVX_INSTR movd, mmx
AVX_INSTR movddup, sse3, 1
AVX_INSTR movdqa, sse2
AVX_INSTR movdqu, sse2
AVX_INSTR movhlps, sse, 1, 0, 0
AVX_INSTR movhpd, sse2, 1, 0, 0
AVX_INSTR movhps, sse, 1, 0, 0
AVX_INSTR movlhps, sse, 1, 0, 0
AVX_INSTR movlpd, sse2, 1, 0, 0
AVX_INSTR movlps, sse, 1, 0, 0
AVX_INSTR movmskpd, sse2, 1
AVX_INSTR movmskps, sse, 1
AVX_INSTR movntdq, sse2
AVX_INSTR movntdqa, sse4
AVX_INSTR movntpd, sse2, 1
AVX_INSTR movntps, sse, 1
AVX_INSTR movq, mmx
AVX_INSTR movsd, sse2, 1, 0, 0
AVX_INSTR movshdup, sse3, 1
AVX_INSTR movsldup, sse3, 1
AVX_INSTR movss, sse, 1, 0, 0
AVX_INSTR movupd, sse2, 1
AVX_INSTR movups, sse, 1
AVX_INSTR mpsadbw, sse4, 0, 1, 0
AVX_INSTR mulpd, sse2, 1, 0, 1
AVX_INSTR mulps, sse, 1, 0, 1
AVX_INSTR mulsd, sse2, 1, 0, 0
AVX_INSTR mulss, sse, 1, 0, 0
AVX_INSTR orpd, sse2, 1, 0, 1
AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
AVX_INSTR packssdw, mmx, 0, 0, 0
AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packusdw, sse4, 0, 0, 0
AVX_INSTR packuswb, mmx, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3, 0, 1, 0
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR pblendw, sse4, 0, 1, 0
AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0
AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0
AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0
AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0
AVX_INSTR pclmulqdq, clmul, 0, 1, 0
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpestri, sse42
AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpistri, sse42
AVX_INSTR pcmpistrm, sse42
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4, 0, 1, 0
AVX_INSTR pinsrd, sse4, 0, 1, 0
AVX_INSTR pinsrq, sse4, 0, 1, 0
AVX_INSTR pinsrw, mmx2, 0, 1, 0
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaxsb, sse4, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
AVX_INSTR pshufb, ssse3, 0, 0, 0
AVX_INSTR pshufd, sse2
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
AVX_INSTR rcpps, sse, 1
AVX_INSTR rcpss, sse, 1, 0, 0
AVX_INSTR roundpd, sse4, 1
AVX_INSTR roundps, sse4, 1
AVX_INSTR roundsd, sse4, 1, 1, 0
AVX_INSTR roundss, sse4, 1, 1, 0
AVX_INSTR rsqrtps, sse, 1
AVX_INSTR rsqrtss, sse, 1, 0, 0
AVX_INSTR shufpd, sse2, 1, 1, 0
AVX_INSTR shufps, sse, 1, 1, 0
AVX_INSTR sqrtpd, sse2, 1
AVX_INSTR sqrtps, sse, 1
AVX_INSTR sqrtsd, sse2, 1, 0, 0
AVX_INSTR sqrtss, sse, 1, 0, 0
AVX_INSTR stmxcsr, sse, 1
AVX_INSTR subpd, sse2, 1, 0, 0
AVX_INSTR subps, sse, 1, 0, 0
AVX_INSTR subsd, sse2, 1, 0, 0
AVX_INSTR subss, sse, 1, 0, 0
AVX_INSTR ucomisd, sse2, 1
AVX_INSTR ucomiss, sse, 1
AVX_INSTR unpckhpd, sse2, 1, 0, 0
AVX_INSTR unpckhps, sse, 1, 0, 0
AVX_INSTR unpcklpd, sse2, 1, 0, 0
AVX_INSTR unpcklps, sse, 1, 0, 0
AVX_INSTR xorpd, sse2, 1, 0, 1
AVX_INSTR xorps, sse, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DNow!
AVX_INSTR pfadd, 3dnow, 1, 0, 1
AVX_INSTR pfmul, 3dnow, 1, 0, 1
AVX_INSTR pfsub, 3dnow, 1, 0, 0

;%1 == instruction
;%2 == minimal instruction set
; Wraps a 2-3 operand general-purpose instruction so that using it in a
; function whose declared cpuflags don't include %2 is a compile-time error.
%macro GPR_INSTR 2
    %macro %1 2-5 fnord, %1, %2
        %ifdef cpuname
            %if notcpuflag(%5)
                %error use of ``%4'' %5 instruction in cpuname function: current_function
            %endif
        %endif
        %ifidn %3, fnord
            %4 %1, %2
        %else
            %4 %1, %2, %3
        %endif
    %endmacro
%endmacro

GPR_INSTR andn, bmi1
GPR_INSTR bextr, bmi1
GPR_INSTR blsi, bmi1
GPR_INSTR blsmsk, bmi1
GPR_INSTR blsr, bmi1
GPR_INSTR bzhi, bmi2
GPR_INSTR crc32, sse42
GPR_INSTR mulx, bmi2
GPR_INSTR pdep, bmi2
GPR_INSTR pext, bmi2
GPR_INSTR popcnt, sse42
GPR_INSTR rorx, bmi2
GPR_INSTR sarx, bmi2
GPR_INSTR shlx, bmi2
GPR_INSTR shrx, bmi2

; base-4 constants for shuffles
; Defines q0000..q3333 so that e.g. q3120 is the imm8 selecting elements
; 3,1,2,0 (each base-4 digit maps to a 2-bit field of the immediate).
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    ; Zero-pad the macro name to four digits (the decimal form of j drops
    ; leading zeroes).
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j

; Emulate an XOP fused multiply-accumulate (%1) with a multiply (%2) followed
; by an add (%3) when the xop cpuflag is absent. Emulation requires that the
; destination not alias the accumulator operand unless they are identical.
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd

; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
; FMA3 is only possible if dst is the same as one of the src registers.
; Either src2 or src3 can be a memory operand.
%macro FMA4_INSTR 2-*
    %push fma4_instr
    %xdefine %$prefix %1
    ; Generate one macro per suffix (e.g. fmaddpd, fmaddps, ...); the FMA3
    ; variant (132/213/231) is chosen by which source the destination aliases.
    %rep %0 - 1
        %macro %$prefix%2 4-6 %$prefix, %2
            %if notcpuflag(fma3) && notcpuflag(fma4)
                %error use of ``%5%6'' fma instruction in cpuname function: current_function
            %elif cpuflag(fma4)
                v%5%6 %1, %2, %3, %4
            %elifidn %1, %2
                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
                %ifnum sizeof%3
                    v%{5}213%6 %2, %3, %4
                %else
                    v%{5}132%6 %2, %4, %3
                %endif
            %elifidn %1, %3
                v%{5}213%6 %3, %2, %4
            %elifidn %1, %4
                v%{5}231%6 %4, %2, %3
            %else
                ; dst is distinct from all sources: impossible with FMA3.
                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
            %endif
        %endmacro
        %rotate 1
    %endrep
    %pop
%endmacro

FMA4_INSTR fmadd, pd, ps, sd, ss
FMA4_INSTR fmaddsub, pd, ps
FMA4_INSTR fmsub, pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss

; Macros for converting VEX instructions to equivalent EVEX ones.
; The EVEX form (%2) is emitted when an operand requires it (register 16-31 or
; a zmm-sized register) or when prefer_evex is set and avx512 is enabled;
; otherwise the shorter VEX form (%1) is used.
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
    %macro %1 2-7 fnord, fnord, %1, %2, %3
        ; Rebuild the operand list with however many operands were supplied.
        %ifidn %3, fnord
            %define %%args %1, %2
        %elifidn %4, fnord
            %define %%args %1, %2, %3
        %else
            %define %%args %1, %2, %3, %4
        %endif
        %assign %%evex_required cpuflag(avx512) & %7
        %ifnum regnumof%1
            %if regnumof%1 >= 16 || sizeof%1 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%2
            %if regnumof%2 >= 16 || sizeof%2 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%3
            %if regnumof%3 >= 16 || sizeof%3 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %if %%evex_required
            %6 %%args
        %else
            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
        %endif
    %endmacro
%endmacro

EVEX_INSTR vbroadcastf128, vbroadcastf32x4
EVEX_INSTR vbroadcasti128, vbroadcasti32x4
EVEX_INSTR vextractf128, vextractf32x4
EVEX_INSTR vextracti128, vextracti32x4
EVEX_INSTR vinsertf128, vinsertf32x4
EVEX_INSTR vinserti128, vinserti32x4
EVEX_INSTR vmovdqa, vmovdqa32
EVEX_INSTR vmovdqu, vmovdqu32
EVEX_INSTR vpand, vpandd
EVEX_INSTR vpandn, vpandnd
EVEX_INSTR vpor, vpord
EVEX_INSTR vpxor, vpxord
EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
EVEX_INSTR vrcpss, vrcp14ss, 1
EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
EVEX_INSTR vrsqrtss, vrsqrt14ss, 1