;*****************************************************************************
;* x86inc.asm: x86 abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2024 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Henrik Gramner <henrik@gramner.com>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x86inc.asm assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used.
%ifndef private_prefix
    %error private_prefix not defined
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

; Stack alignment guaranteed by the environment at function entry.
; Can be overridden by the build system; defaults to the ABI minimum.
%ifndef STACK_ALIGNMENT
    %if ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif

; WIN32 is set for any Windows target (both 32-bit and 64-bit, since it refers
; to the Win32 API); WIN64 is set only for 64-bit Windows targets.
%define WIN32  0
%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN32  1
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN32  1
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN32  1
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%else
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN32  1
    %endif
%endif

%define FORMAT_ELF 0
%define FORMAT_MACHO 0
%ifidn __OUTPUT_FORMAT__,elf
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,macho
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho32
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho64
    %define FORMAT_MACHO 1
%endif

; Platforms that prepend an underscore to C symbols define PREFIX.
%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Use VEX-encoding even in non-AVX functions
%ifndef FORCE_VEX_ENCODING
    %define FORCE_VEX_ENCODING 0
%endif

; Emit a read-only data section with the requested alignment (default 16).
; Windows targets use .rdata; everything else uses .rodata.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,win32
        SECTION .rdata align=%1
    %elif WIN64
        SECTION .rdata align=%1
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

%if ARCH_X86_64
    %define PIC 1 ; always use PIC on x86-64
    default rel   ; make [symbol] addressing RIP-relative by default
%elifidn __OUTPUT_FORMAT__,win32
    %define PIC 0 ; PIC isn't used on 32-bit Windows
%elifndef PIC
    %define PIC 0
%endif

; NASM before 2.14 can't emit the Mach-O private_extern attribute used below.
%define HAVE_PRIVATE_EXTERN 1
%ifdef __NASM_VERSION_ID__
    %use smartalign ; use efficient multi-byte NOPs for ALIGN
    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
        %define HAVE_PRIVATE_EXTERN 0
    %endif
%endif
; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most use cases.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.
; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

; DECLARE_REG n, reg[, offset]
; Defines the rN* aliases for argument N. With two arguments the argument is
; passed in a register; with three, %3 is the offset of the argument's stack
; slot relative to the return address (accessed via rstk + stack_offset).
%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

; DECLARE_REG_SIZE name, low_byte, high_byte
; Defines size aliases for a legacy GPR (e.g. rax/eax/ax/al/ah); %3 is the
; high-byte register name, or "null" for registers that don't have one.
%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1 ; on x86-32 the "native" size is 32 bits
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

; size in bytes of a general-purpose register
%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

; Load the address of a symbol into a register, in a PIC-safe manner where
; required. On x86-32 PIC this uses a call/pop pair to read EIP.
%macro LEA 2
%if ARCH_X86_64
        lea %1, [%2]
%elif PIC
        call $+5 ; special-cased to not affect the RSB on most CPU:s
        pop %1
        add %1, -$+1+%2
%else
        mov %1, %2
%endif
%endmacro

; Repeats an instruction/operation for multiple arguments.
; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3"
%macro REPX 2-* ; operation, args
    %xdefine %%f(x) %1
    %rep %0 - 1
        %rotate 1
        %%f(%1)
    %endrep
%endmacro

; push/pop that also keep stack_offset in sync, but only while rsp itself is
; the active stack pointer (i.e. no manual realignment is in effect).
%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

; Load stack-passed arguments into their assigned registers.
%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

; sub/add wrappers that track stack_offset when adjusting the stack pointer.
%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%if ARCH_X86_64 == 0
    %define movsxd movifnidn ; no sign extension needed on x86-32
%endif

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

; Compile-time assertion: errors out if the expression evaluates to 0.
%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro

; Give symbolic names to the numbered argument registers, e.g.
; "DEFINE_ARGS dst, src" makes dstq an alias of r0q etc. Undefines any
; previously defined argument names first.
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

; mmsize rounded up to a multiple of 16: the alignment needed to spill the
; current vector register size to the stack.
%define required_stack_alignment ((mmsize + 15) & ~15)
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))

; Large stack allocations on Windows need to use stack probing in order
; to guarantee that all stack memory is committed before accessing it.
; This is done by ensuring that the guard page(s) at the end of the
; currently committed pages are touched prior to any pages beyond that.
%if WIN64
    %assign STACK_PROBE_SIZE 8192
%elifidn __OUTPUT_FORMAT__, win32
    %assign STACK_PROBE_SIZE 4096
%else
    %assign STACK_PROBE_SIZE 0 ; probing not needed on non-Windows targets
%endif

; Touch the pending allocation at STACK_PROBE_SIZE intervals (in increasing
; depth order) so that every guard page is hit before memory beyond it is used.
%macro PROBE_STACK 1 ; stack_size
    %if STACK_PROBE_SIZE
        %assign %%i STACK_PROBE_SIZE
        %rep %1 / STACK_PROBE_SIZE
            mov eax, [rsp-%%i]
            %assign %%i %%i+STACK_PROBE_SIZE
        %endrep
    %endif
%endmacro

; Reset all per-function stack bookkeeping to "nothing allocated".
%macro RESET_STACK_STATE 0
    %ifidn rstk, rsp
        %assign stack_offset stack_offset - stack_size_padded
    %else
        %xdefine rstk rsp
    %endif
    %assign stack_size 0
    %assign stack_size_padded 0
    %assign xmm_regs_used 0
%endmacro

; Allocate (and, if required_stack_alignment exceeds the known alignment,
; manually align) stack space. A negative stack_size stores the original rsp
; on the stack instead of keeping it in a dedicated register.
%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs
    RESET_STACK_STATE
    %ifnum %2
        %if mmsize != 8
            %assign xmm_regs_used %2
        %endif
    %endif
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if xmm_regs_used > 8
                    %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                PROBE_STACK stack_size_padded
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                PROBE_STACK stack_size_padded
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

; Bump regs_used when an extra register is needed to hold the original stack
; pointer for manual stack alignment.
%macro SETUP_STACK_POINTER 0-1 0
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                ; Reserve an additional register for storing the original stack pointer, but avoid using
                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                %assign regs_used (regs_used + 1)
                %if ARCH_X86_64 && regs_used == 7
                    %assign regs_used 8
                %elif ARCH_X86_64 == 0 && regs_used == 1
                    %assign regs_used 2
                %endif
            %endif
            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
                %assign regs_used 5 + UNIX64 * 3
            %endif
        %endif
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

; Args 0-3 are passed in registers; the stack offsets for args 4+ account for
; the return address (8) plus the 32-byte shadow space.
DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 ; callee-saved GPRs on win64
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

; Push XMM registers to the stack. If no argument is specified all used register
; will be pushed, otherwise only push previously unpushed registers.
%macro WIN64_PUSH_XMM 0-2 ; new_xmm_regs_used, xmm_regs_pushed
    %if mmsize != 8
        %if %0 == 2
            %assign %%pushed %2
            %assign xmm_regs_used %1
        %elif %0 == 1
            %assign %%pushed xmm_regs_used
            %assign xmm_regs_used %1
        %else
            %assign %%pushed 0
        %endif
        ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
        %if %%pushed <= 6 + high_mm_regs && xmm_regs_used > 6 + high_mm_regs
            movaps [rstk + stack_offset + 8], xmm6
        %endif
        %if %%pushed <= 7 + high_mm_regs && xmm_regs_used > 7 + high_mm_regs
            movaps [rstk + stack_offset + 24], xmm7
        %endif
        %assign %%pushed %%pushed - high_mm_regs - 8
        %if %%pushed < 0
            %assign %%pushed 0
        %endif
        %assign %%regs_to_push xmm_regs_used - %%pushed - high_mm_regs - 8
        %if %%regs_to_push > 0
            ASSERT (%%regs_to_push + %%pushed) * 16 <= stack_size_padded - stack_size - 32
            %assign %%i %%pushed + 8
            %rep %%regs_to_push
                movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
                %assign %%i %%i+1
            %endrep
        %endif
    %endif
%endmacro

; Allocated stack space for XMM registers and push all, or a subset, of those
%macro WIN64_SPILL_XMM 1-2 ; xmm_regs_used, xmm_regs_reserved
    RESET_STACK_STATE
    %if mmsize != 8
        %assign xmm_regs_used %1
        ASSERT xmm_regs_used <= 16 + high_mm_regs
        %if %0 == 2
            ASSERT %2 >= %1
            %assign %%xmm_regs_on_stack %2 - high_mm_regs - 8
        %else
            %assign %%xmm_regs_on_stack %1 - high_mm_regs - 8
        %endif
        %if %%xmm_regs_on_stack > 0
            ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
            %assign %%pad %%xmm_regs_on_stack*16 + 32
            %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
            SUB rsp, stack_size_padded
        %endif
        WIN64_PUSH_XMM
    %endif
%endmacro

; Restore spilled XMM registers and deallocate stack space, without resetting
; the bookkeeping state (RET needs it afterwards for the GPR pops).
%macro WIN64_RESTORE_XMM_INTERNAL 0
    %assign %%pad_size 0
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i xmm_regs_used - high_mm_regs
        %rep %%xmm_regs_on_stack
            %assign %%i %%i-1
            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    ; xmm6/xmm7 come from the shadow space (see WIN64_PUSH_XMM)
    %if xmm_regs_used > 7 + high_mm_regs
        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6 + high_mm_regs
        movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 0
    WIN64_RESTORE_XMM_INTERNAL
    RESET_STACK_STATE
%endmacro

; True when RET needs to do more work than a bare ret instruction.
%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

; Args 0-5 are passed in registers; stack args start right above the return
; address (offset 8).
DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R14, 48
DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14 ; callee-saved GPRs on SysV
    ALLOC_STACK %4, %3
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

; All arguments are passed on the stack; the offsets account for the return
; address (4) plus each argument's 4-byte slot.
DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

; Define stack-only argument locations for args beyond the 7 register slots.
%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7 ; only 7 GPRs available; extra args stay on the stack
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6 ; ebx/esi/edi/ebp are callee-saved
    ALLOC_STACK %4, %3
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

; No-op/bookkeeping-only stubs so code can use the WIN64_* macros
; unconditionally on all platforms.
%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1-2
        RESET_STACK_STATE
        %if mmsize != 8
            %assign xmm_regs_used %1
        %endif
    %endmacro
    %macro WIN64_RESTORE_XMM 0
        RESET_STACK_STATE
    %endmacro
    %macro WIN64_PUSH_XMM 0-2
        %if mmsize != 8 && %0 >= 1
            %assign xmm_regs_used %1
        %endif
    %endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue || cpuflag(ssse3)
        RET
    %else
        rep ret
    %endif
    annotate_function_size
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %if notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %endif
    ret
    annotate_function_size
%endmacro

; Redefine every conditional-branch mnemonic as a macro that records the
; address right after the branch in last_branch_adr, so AUTO_REP_RET can
; detect a ret that immediately follows a branch.
%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %if notcpuflag(ssse3)
                %%branch_instr equ $
                %xdefine last_branch_adr %%branch_instr
            %endif
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
    annotate_function_size
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
; %1 = 1 for internal (hidden visibility, private_prefix), 0 for public
; (default visibility, public_prefix); %2 = mangled name; %3 = PROLOGUE args.
%macro cglobal_internal 2-3+
    annotate_function_size ; finalize the size of the previous function
    %ifndef cglobaled_%2
        %if %1
            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
        %else
            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
        %endif
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    %if FORMAT_ELF
        %if %1
            global %2:function hidden
        %else
            global %2:function
        %endif
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
        global %2:private_extern
    %else
        global %2
    %endif
    %if WIN32 && !%1
        %ifdef BUILDING_DLL
            export %2 ; add to the DLL export table
        %endif
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

; Create a global symbol from a local label with the correct name mangling and type
%macro cglobal_label 1
    %if FORMAT_ELF
        global current_function %+ %1:function hidden
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global current_function %+ %1:private_extern
    %else
        global current_function %+ %1
    %endif
    %1:
%endmacro

; Declare an external symbol that uses this project's private prefix.
%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 2
    extern %1
%endmacro

; Like cextern, but without the prefix. This should be used for symbols from external libraries.
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 3
    extern %1
%endmacro

; Define a global data symbol: "const name, dd 1, 2, 3".
%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %if FORMAT_ELF
        global %1:data hidden
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global %1:private_extern
    %else
        global %1
    %endif
    %1: %2
%endmacro

%if FORMAT_ELF
    ; The GNU linker assumes the stack is executable by default.
    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]

    %ifdef __NASM_VERSION_ID__
        %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03
            %if ARCH_X86_64
                ; Control-flow Enforcement Technology (CET) properties.
                [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize]
                dd 0x00000004  ; n_namesz
                dd gprsize + 8 ; n_descsz
                dd 0x00000005  ; n_type = NT_GNU_PROPERTY_TYPE_0
                db "GNU",0     ; n_name
                dd 0xc0000002  ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
                dd 0x00000004  ; pr_datasz
                dd 0x00000002  ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK
                dd 0x00000000  ; pr_padding
            %endif
        %endif
    %endif
%endif

; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
; yasm-only: the "size" directive used here isn't available in nasm.
%macro annotate_function_size 0
    %ifdef __YASM_VER__
        %ifdef current_function
            %if FORMAT_ELF
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags
; Each flag implies (includes the bits of) the flags it is OR:ed with.

%assign cpuflags_mmx       (1<<0)
%assign cpuflags_mmx2      (1<<1)  | cpuflags_mmx
%assign cpuflags_3dnow     (1<<2)  | cpuflags_mmx
%assign cpuflags_3dnowext  (1<<3)  | cpuflags_3dnow
%assign cpuflags_sse       (1<<4)  | cpuflags_mmx2
%assign cpuflags_sse2      (1<<5)  | cpuflags_sse
%assign cpuflags_sse2slow  (1<<6)  | cpuflags_sse2
%assign cpuflags_lzcnt     (1<<7)  | cpuflags_sse2
%assign cpuflags_sse3      (1<<8)  | cpuflags_sse2
%assign cpuflags_ssse3     (1<<9)  | cpuflags_sse3
%assign cpuflags_sse4      (1<<10) | cpuflags_ssse3
%assign cpuflags_sse42     (1<<11) | cpuflags_sse4
%assign cpuflags_aesni     (1<<12) | cpuflags_sse42
%assign cpuflags_clmul     (1<<13) | cpuflags_sse42
%assign cpuflags_gfni      (1<<14) | cpuflags_aesni|cpuflags_clmul
%assign cpuflags_avx       (1<<15) | cpuflags_sse42
%assign cpuflags_xop       (1<<16) | cpuflags_avx
%assign cpuflags_fma4      (1<<17) | cpuflags_avx
%assign cpuflags_fma3      (1<<18) | cpuflags_avx
%assign cpuflags_bmi1      (1<<19) | cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2      (1<<20) | cpuflags_bmi1
%assign cpuflags_avx2      (1<<21) | cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512    (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL
%assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ

%assign cpuflags_cache32   (1<<24)
%assign cpuflags_cache64   (1<<25)
%assign cpuflags_aligned   (1<<26) ; not a cpu feature, but a function variant
%assign cpuflags_atom      (1<<27)

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
; Evaluates to 1 only when every bit of cpuflags_x is set in cpuflags
; (i.e. the flag plus everything it implies).
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        ; Fall back to the single-precision forms when the integer forms
        ; (movdqa etc.) aren't available for the selected vector size.
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    ; Select the NOP encodings used for alignment padding.
    %if ARCH_X86_64 || cpuflag(sse2)
        %ifdef __NASM_VERSION_ID__
            ALIGNMODE p6
        %else
            CPU amdnop
        %endif
    %else
        %ifdef __NASM_VERSION_ID__
            ALIGNMODE nop
        %else
            CPU basicnop
        %endif
    %endif
%endmacro

; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
; Define the single-line macro %1%2 as %3 (token-pasting helper).
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

; Undefine the single-line macro %1%2 (token-pasting helper).
%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

; (Re)define m0..m<N-1> to the given register class and set num_mmregs.
; Also removes any leftover m# aliases when switching to a smaller register set.
%macro DEFINE_MMREGS 1 ; mmtype
    %assign %%prev_mmregs 0
    %ifdef num_mmregs
        %assign %%prev_mmregs num_mmregs
    %endif

    %assign num_mmregs 8
    %if ARCH_X86_64 && mmsize >= 16
        %assign num_mmregs 16
        %if cpuflag(avx512) || mmsize == 64
            %assign num_mmregs 32
        %endif
    %endif

    %assign %%i 0
    %rep num_mmregs
        ; m<i> -> <mmtype><i>, and nn<mmtype><i> -> i (reverse lookup).
        CAT_XDEFINE m, %%i, %1 %+ %%i
        CAT_XDEFINE nn%1, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %if %%prev_mmregs > num_mmregs
        ; Drop stale aliases from the previous (larger) register set;
        ; note: uses the old mmtype, which hasn't been updated yet.
        %rep %%prev_mmregs - num_mmregs
            CAT_UNDEF m, %%i
            CAT_UNDEF nn %+ mmtype, %%i
            %assign %%i %%i+1
        %endrep
    %endif
    %xdefine mmtype %1
%endmacro

; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
    %if ARCH_X86_64 && cpuflag(avx512)
        %assign %%i %1
        %rep 16-%1
            %assign %%i_high %%i+16
            SWAP %%i, %%i_high
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Select 64-bit mmx registers; optional args are cpuflags for INIT_CPUFLAGS.
%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS mm
%endmacro

; Select 128-bit xmm registers; optional args are cpuflags for INIT_CPUFLAGS.
%macro INIT_XMM 0-1+
    %assign avx_enabled FORCE_VEX_ENCODING
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS xmm
    %if WIN64
        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
    %endif
    ; EVEX broadcast modifiers for the current register width.
    %xdefine bcstw 1to8
    %xdefine bcstd 1to4
    %xdefine bcstq 1to2
%endmacro

; Select 256-bit ymm registers; optional args are cpuflags for INIT_CPUFLAGS.
%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define mova movdqa
    %define movu movdqu
    %undef movh ; no 128-bit "half" load at this width
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS ymm
    AVX512_MM_PERMUTATION
    %xdefine bcstw 1to16
    %xdefine bcstd 1to8
    %xdefine bcstq 1to4
%endmacro

; Select 512-bit zmm registers; optional args are cpuflags for INIT_CPUFLAGS.
%macro INIT_ZMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_ZMM %1
    %define mmsize 64
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS zmm
    AVX512_MM_PERMUTATION
    %xdefine bcstw 1to32
    %xdefine bcstd 1to16
    %xdefine bcstq 1to8
%endmacro

INIT_XMM

; Define casts between register classes for register %1: e.g. ymmxmm3 -> xmm3,
; plus xm#/ym#/zm# views of the abstract m# register (tracked through SWAPs).
%macro DECLARE_MMCAST 1
    %define  mmmm%1   mm%1
    %define  mmxmm%1  mm%1
    %define  mmymm%1  mm%1
    %define  mmzmm%1  mm%1
    %define xmmmm%1   mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define xmmzmm%1 xmm%1
    %define ymmmm%1   mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define ymmzmm%1 ymm%1
    %define zmmmm%1   mm%1
    %define zmmxmm%1 xmm%1
    %define zmmymm%1 ymm%1
    %define zmmzmm%1 zmm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
    %define zm%1 zmm %+ m%1
%endmacro

%assign i 0
%rep 32
    DECLARE_MMCAST i
    %assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
    ; First pass: snapshot the current mapping of every destination.
    %rep %0/2
        %xdefine %%tmp%2 m%2
        %rotate 2
    %endrep
    ; Second pass: apply the permutation and refresh the reverse lookup.
    %rep %0/2
        %xdefine m%1 %%tmp%2
        CAT_XDEFINE nn, m%1, %1
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
    %ifnum %1 ; SWAP 0, 1, ...
        SWAP_INTERNAL_NUM %1, %2
    %else ; SWAP m0, m1, ...
        SWAP_INTERNAL_NAME %1, %2
    %endif
%endmacro

; Swap by register index: each adjacent pair in the list is exchanged in turn.
%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

; Swap by register name: translate names to indices via nn#, then delegate.
%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
; Record the current m# -> physical register mapping under <name>_m#
; (defaults to the current function's name) for later LOAD_MM_PERMUTATION.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        %xdefine %%tmp m %+ %%i
        CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
        %assign %%i %%i+1
    %endrep
%endmacro

; Restore a register permutation previously stored by SAVE_MM_PERMUTATION.
; No-op if no permutation was saved under that name.
%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %xdefine %%tmp %%f %+ 0
    %ifnum %%tmp ; only if a saved permutation actually exists
        DEFINE_MMREGS mmtype
        %assign %%i 0
        %rep num_mmregs
            %xdefine %%tmp %%f %+ %%i
            CAT_XDEFINE %%m, %%i, m %+ %%tmp
            %assign %%i %%i+1
        %endrep
        ; Apply in reverse so earlier defines don't clobber later lookups.
        %rep num_mmregs
            %assign %%i %%i-1
            CAT_XDEFINE m, %%i, %%m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    %ifid %1
        call_internal %1 %+ SUFFIX, %1
    %else
        call %1
    %endif
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %define %%j %%i
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %elif FORMAT_ELF
        %if ARCH_X86_64
            %if cglobaled_%2 >= 2
                ; Always emit PLT relocations when calling external functions,
                ; the linker will eliminate unnecessary PLT indirections anyway.
                %define %%j %%i wrt ..plt
            %endif
        %elif PIC && cglobaled_%2 == 3
            ; Go through the GOT for functions declared using cextern_naked with
            ; PIC, as such functions presumably exist in external libraries.
            extern _GLOBAL_OFFSET_TABLE_
            LEA eax, $$+_GLOBAL_OFFSET_TABLE_ wrt ..gotpc
            %define %%j [eax+%%i wrt ..got]
        %endif
    %endif
    call %%j
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
; (+128 has no imm8 encoding, but -128 does, so swap add/sub for that value).
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

; Per-register metadata: sizeof<reg> and regnumof<reg> for every mm/xmm/ymm/zmm.
%assign i 0
%rep 32
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
        CAT_XDEFINE regnumofmm, i, i
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    CAT_XDEFINE sizeofzmm, i, 64
    CAT_XDEFINE regnumofxmm, i, i
    CAT_XDEFINE regnumofymm, i, i
    CAT_XDEFINE regnumofzmm, i, i
    %assign i i+1
%endrep
%undef i

; Error out if the destination aliases any of the listed source operands,
; since non-AVX emulation would then clobber a source before reading it.
%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%6+: operands
%macro RUN_AVX_INSTR 6-9+
    ; Determine the operating register width from the operands (src1 preferred).
    %ifnum sizeof%7
        %assign __sizeofreg sizeof%7
    %elifnum sizeof%6
        %assign __sizeofreg sizeof%6
    %else
        %assign __sizeofreg mmsize
    %endif
    %assign __emulate_avx 0
    %if avx_enabled && __sizeofreg >= 16
        %xdefine __instr v%1
    %else
        %xdefine __instr %1
        ; Non-destructive (3/4-operand) form requested without AVX: emulate.
        %if %0 >= 8+%4
            %assign __emulate_avx 1
        %endif
    %endif
    ; Diagnose use of instructions not covered by the function's cpuflags.
    %ifnidn %2, fnord
        %ifdef cpuname
            %if notcpuflag(%2)
                %error use of ``%1'' %2 instruction in cpuname function: current_function
            %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
                %error use of ``%1'' sse2 instruction in cpuname function: current_function
            %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
                %error use of ``%1'' avx2 instruction in cpuname function: current_function
            %elif __sizeofreg == 16 && notcpuflag(sse)
                %error use of ``%1'' sse instruction in cpuname function: current_function
            %elif __sizeofreg == 32 && notcpuflag(avx)
                %error use of ``%1'' avx instruction in cpuname function: current_function
            %elif __sizeofreg == 64 && notcpuflag(avx512)
                %error use of ``%1'' avx512 instruction in cpuname function: current_function
            %elifidn %1, pextrw ; special case because the base instruction is mmx2,
                %ifnid %6       ; but sse4 is required for memory operands
                    %if notcpuflag(sse4)
                        %error use of ``%1'' sse4 instruction in cpuname function: current_function
                    %endif
                %endif
            %endif
        %endif
    %endif

    %if __emulate_avx
        %xdefine __src1 %7
        %xdefine __src2 %8
        %if %5 && %4 == 0
            %ifnidn %6, %7
                %ifidn %6, %8
                    ; dst aliases src2: swap sources so the mov below is avoided.
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %elifnnum sizeof%8
                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                    ; So, if the instruction is commutative with a memory arg, swap them.
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %endif
            %endif
        %endif
        %ifnidn %6, __src1
            %if %0 >= 9
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
            %else
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
            %endif
            ; Copy src1 into dst, then issue the destructive 2-operand form.
            %if __sizeofreg == 8
                MOVQ %6, __src1
            %elif %3
                MOVAPS %6, __src1
            %else
                MOVDQA %6, __src1
            %endif
        %endif
        %if %0 >= 9
            %1 %6, __src2, %9
        %else
            %1 %6, __src2
        %endif
    %elif %0 >= 9
        %if avx_enabled && __sizeofreg >= 16 && %4 == 1
            %ifnnum regnumof%7
                ; src1 is a memory operand in a 4-operand form: load it first.
                %if %3
                    vmovaps %6, %7
                %else
                    vmovdqa %6, %7
                %endif
                __instr %6, %6, %8, %9
            %else
                __instr %6, %7, %8, %9
            %endif
        %else
            __instr %6, %7, %8, %9
        %endif
    %elif %0 == 8
        %if avx_enabled && __sizeofreg >= 16 && %4 == 0
            %xdefine __src1 %7
            %xdefine __src2 %8
            %if %5
                %ifnum regnumof%7
                    %ifnum regnumof%8
                        %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
                            ; Most VEX-encoded instructions require an additional byte to encode when
                            ; src2 is a high register (e.g. m8..15). If the instruction is commutative
                            ; we can swap src1 and src2 when doing so reduces the instruction length.
                            %xdefine __src1 %8
                            %xdefine __src2 %7
                        %endif
                    %endif
                %elifnum regnumof%8 ; put memory operands in src2 when possible
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %else
                    %assign __emulate_avx 1
                %endif
            %elifnnum regnumof%7
                ; EVEX allows imm8 shift instructions to be used with memory operands,
                ; but VEX does not. This handles those special cases.
                %ifnnum %8
                    %assign __emulate_avx 1
                %elif notcpuflag(avx512)
                    %assign __emulate_avx 1
                %endif
            %endif
            %if __emulate_avx ; a separate load is required
                %if %3
                    vmovaps %6, %7
                %else
                    vmovdqa %6, %7
                %endif
                __instr %6, %6, %8
            %else
                __instr %6, __src1, __src2
            %endif
        %else
            __instr %6, %7, %8
        %endif
    %elif %0 == 7
        %if avx_enabled && __sizeofreg >= 16 && %5
            ; Same high-register encoding-size optimization for 2-src forms.
            %xdefine __src1 %6
            %xdefine __src2 %7
            %ifnum regnumof%6
                %ifnum regnumof%7
                    %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
                        %xdefine __src1 %7
                        %xdefine __src2 %6
                    %endif
                %endif
            %endif
            __instr %6, __src1, __src2
        %else
            __instr %6, %7
        %endif
    %else
        __instr %6
    %endif
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-5 fnord, 0, 255, 0
    ; Defines a macro named after the instruction that forwards its operands
    ; (%1..%5 of the inner macro) plus this declaration's metadata to RUN_AVX_INSTR.
    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
AVX_INSTR addsd, sse2, 1, 0, 0
AVX_INSTR addss, sse, 1, 0, 0
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
AVX_INSTR aesdec, aesni, 0, 0, 0
AVX_INSTR aesdeclast, aesni, 0, 0, 0
AVX_INSTR aesenc, aesni, 0, 0, 0
AVX_INSTR aesenclast, aesni, 0, 0, 0
AVX_INSTR aesimc, aesni
AVX_INSTR aeskeygenassist, aesni
AVX_INSTR andnpd, sse2, 1, 0, 0
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
AVX_INSTR andps, sse, 1, 0, 1
AVX_INSTR blendpd, sse4, 1, 1, 0
AVX_INSTR blendps, sse4, 1, 1, 0
AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR cmpeqpd, sse2, 1, 0, 1
AVX_INSTR cmpeqps, sse, 1, 0, 1
AVX_INSTR cmpeqsd, sse2, 1, 0, 0
AVX_INSTR cmpeqss, sse, 1, 0, 0
AVX_INSTR cmplepd, sse2, 1, 0, 0
AVX_INSTR cmpleps, sse, 1, 0, 0
AVX_INSTR cmplesd, sse2, 1, 0, 0
AVX_INSTR cmpless, sse, 1, 0, 0
AVX_INSTR cmpltpd, sse2, 1, 0, 0
AVX_INSTR cmpltps, sse, 1, 0, 0
AVX_INSTR cmpltsd, sse2, 1, 0, 0
AVX_INSTR cmpltss, sse, 1, 0, 0
; (continued AVX instruction table: name, minimum cpuflag, float, 4-op emu, commutative)
; Fixed: the four cmpord* entries were missing the comma after the cpuflag
; argument, which merged it with the float flag and shifted all metadata.
AVX_INSTR cmpneqpd, sse2, 1, 0, 1
AVX_INSTR cmpneqps, sse, 1, 0, 1
AVX_INSTR cmpneqsd, sse2, 1, 0, 0
AVX_INSTR cmpneqss, sse, 1, 0, 0
AVX_INSTR cmpnlepd, sse2, 1, 0, 0
AVX_INSTR cmpnleps, sse, 1, 0, 0
AVX_INSTR cmpnlesd, sse2, 1, 0, 0
AVX_INSTR cmpnless, sse, 1, 0, 0
AVX_INSTR cmpnltpd, sse2, 1, 0, 0
AVX_INSTR cmpnltps, sse, 1, 0, 0
AVX_INSTR cmpnltsd, sse2, 1, 0, 0
AVX_INSTR cmpnltss, sse, 1, 0, 0
AVX_INSTR cmpordpd, sse2, 1, 0, 1
AVX_INSTR cmpordps, sse, 1, 0, 1
AVX_INSTR cmpordsd, sse2, 1, 0, 0
AVX_INSTR cmpordss, sse, 1, 0, 0
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
AVX_INSTR cmpss, sse, 1, 1, 0
AVX_INSTR cmpunordpd, sse2, 1, 0, 1
AVX_INSTR cmpunordps, sse, 1, 0, 1
AVX_INSTR cmpunordsd, sse2, 1, 0, 0
AVX_INSTR cmpunordss, sse, 1, 0, 0
AVX_INSTR comisd, sse2, 1
AVX_INSTR comiss, sse, 1
AVX_INSTR cvtdq2pd, sse2, 1
AVX_INSTR cvtdq2ps, sse2, 1
AVX_INSTR cvtpd2dq, sse2, 1
AVX_INSTR cvtpd2ps, sse2, 1
AVX_INSTR cvtps2dq, sse2, 1
AVX_INSTR cvtps2pd, sse2, 1
AVX_INSTR cvtsd2si, sse2, 1
AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
AVX_INSTR cvtsi2ss, sse, 1, 0, 0
AVX_INSTR cvtss2sd, sse2, 1, 0, 0
AVX_INSTR cvtss2si, sse, 1
AVX_INSTR cvttpd2dq, sse2, 1
AVX_INSTR cvttps2dq, sse2, 1
AVX_INSTR cvttsd2si, sse2, 1
AVX_INSTR cvttss2si, sse, 1
AVX_INSTR divpd, sse2, 1, 0, 0
AVX_INSTR divps, sse, 1, 0, 0
AVX_INSTR divsd, sse2, 1, 0, 0
AVX_INSTR divss, sse, 1, 0, 0
AVX_INSTR dppd, sse4, 1, 1, 0
AVX_INSTR dpps, sse4, 1, 1, 0
AVX_INSTR extractps, sse4, 1
AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
AVX_INSTR haddpd, sse3, 1, 0, 0
AVX_INSTR haddps, sse3, 1, 0, 0
AVX_INSTR hsubpd, sse3, 1, 0, 0
AVX_INSTR hsubps, sse3, 1, 0, 0
AVX_INSTR insertps, sse4, 1, 1, 0
AVX_INSTR lddqu, sse3
AVX_INSTR ldmxcsr, sse, 1
AVX_INSTR maskmovdqu, sse2
AVX_INSTR maxpd, sse2, 1, 0, 1
AVX_INSTR maxps, sse, 1, 0, 1
AVX_INSTR maxsd, sse2, 1, 0, 0
AVX_INSTR maxss, sse, 1, 0, 0
AVX_INSTR minpd, sse2, 1, 0, 1
AVX_INSTR minps, sse, 1, 0, 1
AVX_INSTR minsd, sse2, 1, 0, 0
AVX_INSTR minss, sse, 1, 0, 0
AVX_INSTR movapd, sse2, 1
AVX_INSTR movaps, sse, 1
AVX_INSTR movd, mmx
AVX_INSTR movddup, sse3, 1
AVX_INSTR movdqa, sse2
AVX_INSTR movdqu, sse2
AVX_INSTR movhlps, sse, 1, 0, 0
AVX_INSTR movhpd, sse2, 1, 0, 0
AVX_INSTR movhps, sse, 1, 0, 0
AVX_INSTR movlhps, sse, 1, 0, 0
AVX_INSTR movlpd, sse2, 1, 0, 0
AVX_INSTR movlps, sse, 1, 0, 0
AVX_INSTR movmskpd, sse2, 1
AVX_INSTR movmskps, sse, 1
AVX_INSTR movntdq, sse2
AVX_INSTR movntdqa, sse4
AVX_INSTR movntpd, sse2, 1
AVX_INSTR movntps, sse, 1
AVX_INSTR movq, mmx
AVX_INSTR movsd, sse2, 1, 0, 0
AVX_INSTR movshdup, sse3, 1
AVX_INSTR movsldup, sse3, 1
AVX_INSTR movss, sse, 1, 0, 0
AVX_INSTR movupd, sse2, 1
AVX_INSTR movups, sse, 1
AVX_INSTR mpsadbw, sse4, 0, 1, 0
AVX_INSTR mulpd, sse2, 1, 0, 1
AVX_INSTR mulps, sse, 1, 0, 1
AVX_INSTR mulsd, sse2, 1, 0, 0
AVX_INSTR mulss, sse, 1, 0, 0
AVX_INSTR orpd, sse2, 1, 0, 1
AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
AVX_INSTR packssdw, mmx, 0, 0, 0
AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packusdw, sse4, 0, 0, 0
AVX_INSTR packuswb, mmx, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3, 0, 1, 0
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR pblendw, sse4, 0, 1, 0
AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0
AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0
AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0
AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0
AVX_INSTR pclmulqdq, clmul, 0, 1, 0
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpestri, sse42
AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpistri, sse42
AVX_INSTR pcmpistrm, sse42
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4, 0, 1, 0
AVX_INSTR pinsrd, sse4, 0, 1, 0
AVX_INSTR pinsrq, sse4, 0, 1, 0
AVX_INSTR pinsrw, mmx2, 0, 1, 0
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaxsb, sse4, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
AVX_INSTR pshufb, ssse3, 0, 0, 0
AVX_INSTR pshufd, sse2
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
AVX_INSTR rcpps, sse, 1
AVX_INSTR rcpss, sse, 1, 0, 0
AVX_INSTR roundpd, sse4, 1
AVX_INSTR roundps, sse4, 1
AVX_INSTR roundsd, sse4, 1, 1, 0
AVX_INSTR roundss, sse4, 1, 1, 0
AVX_INSTR rsqrtps, sse, 1
AVX_INSTR rsqrtss, sse, 1, 0, 0
AVX_INSTR shufpd, sse2, 1, 1, 0
AVX_INSTR shufps, sse, 1, 1, 0
AVX_INSTR sqrtpd, sse2, 1
AVX_INSTR sqrtps, sse, 1
AVX_INSTR sqrtsd, sse2, 1, 0, 0
AVX_INSTR sqrtss, sse, 1, 0, 0
AVX_INSTR stmxcsr, sse, 1
AVX_INSTR subpd, sse2, 1, 0, 0
AVX_INSTR subps, sse, 1, 0, 0
AVX_INSTR subsd, sse2, 1, 0, 0
AVX_INSTR subss, sse, 1, 0, 0
AVX_INSTR ucomisd, sse2, 1
AVX_INSTR ucomiss, sse, 1
AVX_INSTR unpckhpd, sse2, 1, 0, 0
AVX_INSTR unpckhps, sse, 1, 0, 0
AVX_INSTR unpcklpd, sse2, 1, 0, 0
AVX_INSTR unpcklps, sse, 1, 0, 0
AVX_INSTR xorpd, sse2, 1, 0, 1
AVX_INSTR xorps, sse, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 3dnow, 1, 0, 1
AVX_INSTR pfmul, 3dnow, 1, 0, 1
AVX_INSTR pfsub, 3dnow, 1, 0, 0

;%1 == instruction
;%2 == minimal instruction set
; Defines a GPR instruction wrapper that only checks the function's cpuflags
; and forwards the (2- or 3-) operand list unchanged.
%macro GPR_INSTR 2
    %macro %1 2-5 fnord, %1, %2
        %ifdef cpuname
            %if notcpuflag(%5)
                %error use of ``%4'' %5 instruction in cpuname function: current_function
            %endif
        %endif
        %ifidn %3, fnord
            %4 %1, %2
        %else
            %4 %1, %2, %3
        %endif
    %endmacro
%endmacro

GPR_INSTR andn, bmi1
GPR_INSTR bextr, bmi1
GPR_INSTR blsi, bmi1
GPR_INSTR blsmsk, bmi1
GPR_INSTR blsr, bmi1
GPR_INSTR bzhi, bmi2
GPR_INSTR crc32, sse42
GPR_INSTR mulx, bmi2
GPR_INSTR pdep, bmi2
GPR_INSTR pext, bmi2
GPR_INSTR popcnt, sse42
GPR_INSTR rorx, bmi2
GPR_INSTR sarx, bmi2
GPR_INSTR shlx, bmi2
GPR_INSTR shrx, bmi2

; base-4 constants for shuffles
; e.g. q3120 expands to the imm8 whose 2-bit fields select lanes 3,1,2,0.
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j

; Define %1 as an XOP-style fused multiply-accumulate (dst, src1, src2, acc):
; uses the real XOP instruction when available, otherwise emulates it with
; the given multiply (%2) and add (%3) pair when dst doesn't alias acc.
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd

; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
; FMA3 is only possible if dst is the same as one of the src registers.
; Either src2 or src3 can be a memory operand.
%macro FMA4_INSTR 2-*
    %push fma4_instr
    %xdefine %$prefix %1
    ; Define one macro per suffix (e.g. fmaddpd, fmaddps, ...).
    %rep %0 - 1
        %macro %$prefix%2 4-6 %$prefix, %2
            %if notcpuflag(fma3) && notcpuflag(fma4)
                %error use of ``%5%6'' fma instruction in cpuname function: current_function
            %elif cpuflag(fma4)
                v%5%6 %1, %2, %3, %4
            %elifidn %1, %2
                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
                %ifnum sizeof%3
                    v%{5}213%6 %2, %3, %4
                %else
                    v%{5}132%6 %2, %4, %3
                %endif
            %elifidn %1, %3
                v%{5}213%6 %3, %2, %4
            %elifidn %1, %4
                v%{5}231%6 %4, %2, %3
            %else
                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
            %endif
        %endmacro
        %rotate 1
    %endrep
    %pop
%endmacro

FMA4_INSTR fmadd, pd, ps, sd, ss
FMA4_INSTR fmaddsub, pd, ps
FMA4_INSTR fmsub, pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss

; Macros for converting VEX instructions to equivalent EVEX ones.
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
    %macro %1 2-7 fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            %define %%args %1, %2
        %elifidn %4, fnord
            %define %%args %1, %2, %3
        %else
            %define %%args %1, %2, %3, %4
        %endif
        ; EVEX is mandatory when any operand is zmm or a register >= 16,
        ; or when prefer_evex was requested and avx512 is available.
        %assign %%evex_required cpuflag(avx512) & %7
        %ifnum regnumof%1
            %if regnumof%1 >= 16 || sizeof%1 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%2
            %if regnumof%2 >= 16 || sizeof%2 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%3
            %if regnumof%3 >= 16 || sizeof%3 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %if %%evex_required
            %6 %%args
        %else
            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
        %endif
    %endmacro
%endmacro

EVEX_INSTR vbroadcastf128, vbroadcastf32x4
EVEX_INSTR vbroadcasti128, vbroadcasti32x4
EVEX_INSTR vextractf128, vextractf32x4
EVEX_INSTR vextracti128, vextracti32x4
EVEX_INSTR vinsertf128, vinsertf32x4
EVEX_INSTR vinserti128, vinserti32x4
EVEX_INSTR vmovdqa, vmovdqa32
EVEX_INSTR vmovdqu, vmovdqu32
EVEX_INSTR vpand, vpandd
EVEX_INSTR vpandn, vpandnd
EVEX_INSTR vpor, vpord
EVEX_INSTR vpxor, vpxord
EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
EVEX_INSTR vrcpss, vrcp14ss, 1
EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
EVEX_INSTR vrsqrtss, vrsqrt14ss, 1