tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

x86inc.asm (60248B)


      1 ;*****************************************************************************
      2 ;* x86inc.asm: x86 abstraction layer
      3 ;*****************************************************************************
      4 ;* Copyright (C) 2005-2024 x264 project
      5 ;*
      6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
      7 ;*          Henrik Gramner <henrik@gramner.com>
      8 ;*          Anton Mitrofanov <BugMaster@narod.ru>
      9 ;*          Fiona Glaser <fiona@x264.com>
     10 ;*
     11 ;* Permission to use, copy, modify, and/or distribute this software for any
     12 ;* purpose with or without fee is hereby granted, provided that the above
     13 ;* copyright notice and this permission notice appear in all copies.
     14 ;*
     15 ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     16 ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     17 ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     18 ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     19 ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     20 ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     21 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     22 ;*****************************************************************************
     23 
; This is the x86inc.asm abstraction-layer header, which uses NASM/YASM
; syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
     27 ; It also has various other useful features to simplify writing the kind of
     28 ; DSP functions that are most often used.
     29 
; Required configuration: the including file must define private_prefix
; (the symbol prefix used for internal functions) before this point.
%ifndef private_prefix
    %error private_prefix not defined
%endif

; public_prefix is used for externally visible symbols; default to the
; private prefix when the includer doesn't distinguish the two.
%ifndef public_prefix
    %define public_prefix private_prefix
%endif

; Minimum stack alignment guaranteed by the calling environment:
; 16 bytes on x86-64 ABIs; assume only 4 on 32-bit unless overridden.
%ifndef STACK_ALIGNMENT
    %if ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif
     45 
     46 %define WIN32  0
     47 %define WIN64  0
     48 %define UNIX64 0
     49 %if ARCH_X86_64
     50    %ifidn __OUTPUT_FORMAT__,win32
     51        %define WIN32  1
     52        %define WIN64  1
     53    %elifidn __OUTPUT_FORMAT__,win64
     54        %define WIN32  1
     55        %define WIN64  1
     56    %elifidn __OUTPUT_FORMAT__,x64
     57        %define WIN32  1
     58        %define WIN64  1
     59    %else
     60        %define UNIX64 1
     61    %endif
     62 %else
     63    %ifidn __OUTPUT_FORMAT__,win32
     64        %define WIN32  1
     65    %endif
     66 %endif
     67 
; Object-format detection: collapse the elf/macho format variants into
; boolean flags used later for symbol visibility and GNU-stack notes.
%define FORMAT_ELF 0
%define FORMAT_MACHO 0
%ifidn __OUTPUT_FORMAT__,elf
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,macho
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho32
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho64
    %define FORMAT_MACHO 1
%endif

; C symbol mangling: prepend an underscore when the build system says the
; target requires it (PREFIX defined externally).
%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Use VEX-encoding even in non-AVX functions
%ifndef FORCE_VEX_ENCODING
    %define FORCE_VEX_ENCODING 0
%endif
     94 
     95 %macro SECTION_RODATA 0-1 16
     96    %ifidn __OUTPUT_FORMAT__,win32
     97        SECTION .rdata align=%1
     98    %elif WIN64
     99        SECTION .rdata align=%1
    100    %else
    101        SECTION .rodata align=%1
    102    %endif
    103 %endmacro
    104 
%if ARCH_X86_64
    %define PIC 1 ; always use PIC on x86-64
    default rel   ; make plain [symbol] addressing RIP-relative
%elifidn __OUTPUT_FORMAT__,win32
    %define PIC 0 ; PIC isn't used on 32-bit Windows
%elifndef PIC
    %define PIC 0
%endif

; NASM-specific setup: enable smart alignment, and detect whether the
; :private_extern symbol type is available (added in NASM 2.14).
%define HAVE_PRIVATE_EXTERN 1
%ifdef __NASM_VERSION_ID__
    %use smartalign
    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
        %define HAVE_PRIVATE_EXTERN 0
    %endif
%endif
    121 
    122 ; Macros to eliminate most code duplication between x86_32 and x86_64:
    123 ; Currently this works only for leaf functions which load all their arguments
    124 ; into registers at the start, and make no other use of the stack. Luckily that
    125 ; covers most use cases.
    126 
    127 ; PROLOGUE:
    128 ; %1 = number of arguments. loads them from stack if needed.
    129 ; %2 = number of registers used. pushes callee-saved regs if needed.
    130 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
    131 ; %4 = (optional) stack size to be allocated. The stack will be aligned before
    132 ;      allocating the specified stack size. If the required stack alignment is
    133 ;      larger than the known stack alignment the stack will be manually aligned
    134 ;      and an extra register will be allocated to hold the original stack
    135 ;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
    136 ;      register as stack pointer, request a negative stack size.
    137 ; %4+/%5+ = list of names to define to registers
    138 ; PROLOGUE can also be invoked by adding the same options to cglobal
    139 
    140 ; e.g.
    141 ; cglobal foo, 2,3,7,0x40, dst, src, tmp
    142 ; declares a function (foo) that automatically loads two arguments (dst and
    143 ; src) into registers, uses one additional register (tmp) plus 7 vector
    144 ; registers (m0-m6) and allocates 0x40 bytes of stack space.
    145 
    146 ; TODO Some functions can use some args directly from the stack. If they're the
    147 ; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.
    149 
    150 ; RET:
    151 ; Pops anything that was pushed by PROLOGUE, and returns.
    152 
    153 ; REP_RET:
    154 ; Use this instead of RET if it's a branch target.
    155 
    156 ; registers:
    157 ; rN and rNq are the native-size register holding function argument N
    158 ; rNd, rNw, rNb are dword, word, and byte size
    159 ; rNh is the high 8 bits of the word size
    160 ; rNm is the original location of arg N (a register or on the stack), dword
    161 ; rNmp is native size
    162 
; DECLARE_REG n, reg[, stack_offset]
; Defines the r<n>* alias family for function argument n:
; r<n>q/d/w/b/h are the different widths of the underlying register, and
; r<n>m / r<n>mp name the argument's original home — the register itself
; when no %3 is given, otherwise its stack slot at the given offset
; relative to the return address.
%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1  %2
%endmacro

; DECLARE_REG_SIZE name, low8, high8
; Defines the size aliases (q/d/w/h/b) for a legacy named register so
; that e.g. raxd/eaxw resolve to the proper sub-register. On 32-bit the
; "r" form falls back to the 32-bit register.
%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null ; si/di/bp have no high-8 sub-register
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null
    206 
; t# defines for when per-arch register allocation is more complex than just function arguments

; DECLARE_REG_TMP n1, n2, ...: map t0, t1, ... onto the given r-registers.
%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

; Define the size variants (q/d/w/h/b) for each t-register alias.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

; Size of a general-purpose register in bytes.
%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif
    236 
; LEA reg, symbol: position-independent address load. On x86-64 this is a
; RIP-relative lea; on 32-bit PIC the address is derived from the return
; address of a call; otherwise a plain mov of the absolute address.
%macro LEA 2
%if ARCH_X86_64
    lea %1, [%2]
%elif PIC
    call $+5 ; special-cased to not affect the RSB on most CPUs
    pop %1
    add %1, -$+1+%2
%else
    mov %1, %2
%endif
%endmacro

; Repeats an instruction/operation for multiple arguments.
; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3"
%macro REPX 2-* ; operation, args
    %xdefine %%f(x) %1
    %rep %0 - 1
        %rotate 1
        %%f(%1)
    %endrep
%endmacro
    258 
; push/pop wrappers that keep stack_offset in sync while rsp is still the
; active stack pointer (i.e. before any manual realignment).
%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

; Push each listed register number only if the function actually uses it.
%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

; Load each listed argument from its original home into its register.
%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro
    299 
; sub/add wrappers that track stack_offset when the stack pointer itself
; is being adjusted.
%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

; mov, unless source and destination are the identical token.
%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

; movsxd doesn't exist in 32-bit mode; a plain conditional mov suffices.
%if ARCH_X86_64 == 0
    %define movsxd movifnidn
%endif

; movsxd, unless source and destination are the identical token.
%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

; Assemble-time assertion: error out if the expression evaluates to 0.
%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro
    335 
; DEFINE_ARGS name1, name2, ...
; Gives symbolic names to the argument registers: defines <name>q/d/w/h/b/m/mp
; aliases for r0, r1, ... in order, undefining any names created by a
; previous invocation first.
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

; Stack alignment needed for the current vector size (16 minimum).
%define required_stack_alignment ((mmsize + 15) & ~15)
; Whether the epilogue must issue vzeroupper before returning.
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
; Number of extra vector registers available with AVX-512 (xmm16-xmm31).
%define high_mm_regs (16*cpuflag(avx512))
    374 
; Large stack allocations on Windows need to use stack probing in order
; to guarantee that all stack memory is committed before accessing it.
; This is done by ensuring that the guard page(s) at the end of the
; currently committed pages are touched prior to any pages beyond that.
%if WIN64
    %assign STACK_PROBE_SIZE 8192
%elifidn __OUTPUT_FORMAT__, win32
    %assign STACK_PROBE_SIZE 4096
%else
    %assign STACK_PROBE_SIZE 0
%endif

; PROBE_STACK stack_size: touch one word per probe-sized step below rsp
; so that each guard page is committed before the following allocation.
%macro PROBE_STACK 1 ; stack_size
    %if STACK_PROBE_SIZE
        %assign %%i STACK_PROBE_SIZE
        %rep %1 / STACK_PROBE_SIZE
            mov eax, [rsp-%%i]
            %assign %%i %%i+STACK_PROBE_SIZE
        %endrep
    %endif
%endmacro
    396 
; Reset the per-function stack bookkeeping to "nothing allocated".
; If rstk had been redirected to another register (manual realignment),
; point it back at rsp; otherwise rewind stack_offset by the amount that
; was previously allocated.
%macro RESET_STACK_STATE 0
    %ifidn rstk, rsp
        %assign stack_offset stack_offset - stack_size_padded
    %else
        %xdefine rstk rsp
    %endif
    %assign stack_size 0
    %assign stack_size_padded 0
    %assign xmm_regs_used 0
%endmacro
    407 
; ALLOC_STACK [stack_size[, n_xmm_regs]]
; Allocates stack space, realigning the stack manually when the required
; alignment exceeds the known alignment. A negative stack_size stores the
; original stack pointer on the stack (rstkm is a memory location)
; instead of dedicating a register to it for the whole function.
%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs
    RESET_STACK_STATE
    %ifnum %2
        %if mmsize != 8
            %assign xmm_regs_used %2
        %endif
    %endif
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if xmm_regs_used > 8
                    %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                PROBE_STACK stack_size_padded
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                PROBE_STACK stack_size_padded
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro
    457 
; Bump regs_used so that a spare GPR is available for holding the
; original stack pointer when manual stack realignment will be needed.
%macro SETUP_STACK_POINTER 0-1 0
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                ; Reserve an additional register for storing the original stack pointer, but avoid using
                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                %assign regs_used (regs_used + 1)
                %if ARCH_X86_64 && regs_used == 7
                    %assign regs_used 8
                %elif ARCH_X86_64 == 0 && regs_used == 1
                    %assign regs_used 2
                %endif
            %endif
            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
                %assign regs_used 5 + UNIX64 * 3
            %endif
        %endif
    %endif
%endmacro
    479 
%if WIN64 ; Windows x64 ;=================================================

; Microsoft x64 calling convention: integer args in rcx, rdx, r8, r9.
; The numeric third operand is the stack offset of the argument's home
; location relative to the return address (the 32-byte shadow space
; precedes the fifth argument, hence the first offset of 40).
DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 ; r7+ map to callee-saved GPRs
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro
    520 
; Push XMM registers to the stack. If no argument is specified all used
; registers will be pushed, otherwise only push previously unpushed registers.
%macro WIN64_PUSH_XMM 0-2 ; new_xmm_regs_used, xmm_regs_pushed
    %if mmsize != 8
        %if %0 == 2
            %assign %%pushed %2
            %assign xmm_regs_used %1
        %elif %0 == 1
            %assign %%pushed xmm_regs_used
            %assign xmm_regs_used %1
        %else
            %assign %%pushed 0
        %endif
        ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
        %if %%pushed <= 6 + high_mm_regs && xmm_regs_used > 6 + high_mm_regs
            movaps [rstk + stack_offset +  8], xmm6
        %endif
        %if %%pushed <= 7 + high_mm_regs && xmm_regs_used > 7 + high_mm_regs
            movaps [rstk + stack_offset + 24], xmm7
        %endif
        ; Remaining registers (xmm8+) go into the allocated stack area.
        %assign %%pushed %%pushed - high_mm_regs - 8
        %if %%pushed < 0
            %assign %%pushed 0
        %endif
        %assign %%regs_to_push xmm_regs_used - %%pushed - high_mm_regs - 8
        %if %%regs_to_push > 0
            ASSERT (%%regs_to_push + %%pushed) * 16 <= stack_size_padded - stack_size - 32
            %assign %%i %%pushed + 8
            %rep %%regs_to_push
                movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
                %assign %%i %%i+1
            %endrep
        %endif
    %endif
%endmacro
    556 
; Allocate stack space for XMM registers and push all, or a subset, of those
%macro WIN64_SPILL_XMM 1-2 ; xmm_regs_used, xmm_regs_reserved
    RESET_STACK_STATE
    %if mmsize != 8
        %assign xmm_regs_used %1
        ASSERT xmm_regs_used <= 16 + high_mm_regs
        %if %0 == 2
            ASSERT %2 >= %1
            %assign %%xmm_regs_on_stack %2 - high_mm_regs - 8
        %else
            %assign %%xmm_regs_on_stack %1 - high_mm_regs - 8
        %endif
        %if %%xmm_regs_on_stack > 0
            ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
            %assign %%pad %%xmm_regs_on_stack*16 + 32
            %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
            SUB rsp, stack_size_padded
        %endif
        WIN64_PUSH_XMM
    %endif
%endmacro
    578 
; Restore callee-saved xmm registers and release allocated stack space,
; without resetting the bookkeeping state (used by RET).
%macro WIN64_RESTORE_XMM_INTERNAL 0
    %assign %%pad_size 0
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i xmm_regs_used - high_mm_regs
        %rep %%xmm_regs_on_stack
            %assign %%i %%i-1
            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    ; xmm6/xmm7 live in the shadow space above the return address.
    %if xmm_regs_used > 7 + high_mm_regs
        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6 + high_mm_regs
        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 0
    WIN64_RESTORE_XMM_INTERNAL
    RESET_STACK_STATE
%endmacro

%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro
    620 
%elif ARCH_X86_64 ; *nix x64 ;=============================================

; System V AMD64 calling convention: integer args in rdi, rsi, rdx, rcx,
; r8, r9. The numeric third operand is the stack offset of the argument's
; home location relative to the return address.
DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R14, 48
DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14 ; r9+ map to callee-saved GPRs
    ALLOC_STACK %4, %3
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro
    675 
%else ; X86_32 ;==============================================================

; x86-32 cdecl: all arguments are passed on the stack at 4-byte offsets
; past the return address; only 7 general-purpose registers are usable.
DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

; Define stack-only homes for arguments 7-14 (no registers left).
%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6 ; ebx/esi/edi/ebp are callee-saved
    ALLOC_STACK %4, %3
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================
    741 
; Stubs so that code written against the WIN64 xmm spill/restore API
; assembles unchanged on ABIs without callee-saved xmm registers.
%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1-2
        RESET_STACK_STATE
        %if mmsize != 8
            %assign xmm_regs_used %1
        %endif
    %endmacro
    %macro WIN64_RESTORE_XMM 0
        RESET_STACK_STATE
    %endmacro
    %macro WIN64_PUSH_XMM 0-2
        %if mmsize != 8 && %0 >= 1
            %assign xmm_regs_used %1
        %endif
    %endmacro
%endif
    758 
; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue || cpuflag(ssse3)
        RET
    %else
        rep ret
    %endif
    annotate_function_size
%endmacro

; last_branch_adr tracks the address of the most recently emitted branch;
; AUTO_REP_RET emits "rep ret" only when the ret directly follows one.
%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %if notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %endif
    ret
    annotate_function_size
%endmacro

; Wrap every conditional-branch mnemonic in a macro that records the
; branch location for AUTO_REP_RET's benefit.
%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %if notcpuflag(ssse3)
                %%branch_instr equ $
                %xdefine last_branch_adr %%branch_instr
            %endif
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
    795 
; Tail-call helper: jump straight to the callee when the current function
; has no epilogue to run; otherwise call it and run the normal epilogue.
%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
    annotate_function_size
%endmacro
    805 
    806 ;=============================================================================
    807 ; arch-independent part
    808 ;=============================================================================
    809 
%assign function_align 16 ; alignment applied to every function entry point

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
; Like cglobal, but the symbol keeps default (non-hidden) visibility.
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
; Shared implementation behind cglobal/cvisible.
; %1 = 1 for hidden (internal) visibility, 0 for public visibility
; %2 = mangled function name, %3 = optional PROLOGUE arguments
%macro cglobal_internal 2-3+
    annotate_function_size
    %ifndef cglobaled_%2
        %if %1
            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
        %else
            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
        %endif
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    %if FORMAT_ELF
        %if %1
            global %2:function hidden
        %else
            global %2:function
        %endif
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
        global %2:private_extern
    %else
        global %2
    %endif
    %if WIN32 && !%1
        %ifdef BUILDING_DLL
            export %2
        %endif
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro
    865 
    866 ; Create a global symbol from a local label with the correct name mangling and type
    867 %macro cglobal_label 1
; %1 == local label (e.g. ".foo"); exported as current_function%1 with the same
; visibility rules as cglobal_internal uses for function symbols.
    868    %if FORMAT_ELF
    869        global current_function %+ %1:function hidden
    870    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
    871        global current_function %+ %1:private_extern
    872    %else
    873        global current_function %+ %1
    874    %endif
    875    %1:
    876 %endmacro
    877 
    878 %macro cextern 1
; Declare an external symbol with the project's private prefix.
; cglobaled_ == 2 marks "external, prefixed" (see call_internal's PLT handling).
    879    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    880    CAT_XDEFINE cglobaled_, %1, 2
    881    extern %1
    882 %endmacro
    883 
    884 ; Like cextern, but without the prefix. This should be used for symbols from external libraries.
    885 %macro cextern_naked 1
; cglobaled_ == 3 marks "external, unprefixed" (call_internal routes such calls
; through the GOT when building 32-bit PIC).
    886    %ifdef PREFIX
    887        %xdefine %1 mangle(%1)
    888    %endif
    889    CAT_XDEFINE cglobaled_, %1, 3
    890    extern %1
    891 %endmacro
    892 
    893 %macro const 1-2+
; Define a global data symbol: %1 == name, %2+ == initializer (e.g. "dd 1, 2").
; Visibility handling mirrors cglobal_internal's hidden/private_extern logic.
    894    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    895    %if FORMAT_ELF
    896        global %1:data hidden
    897    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
    898        global %1:private_extern
    899    %else
    900        global %1
    901    %endif
    902    %1: %2
    903 %endmacro
    904 
; ELF-only housekeeping sections: mark the stack non-executable, and on x86-64
; with NASM >= 2.14.03 emit a GNU property note advertising shadow-stack (CET
; SHSTK) compatibility so the linker doesn't strip the feature from the binary.
    905 %if FORMAT_ELF
    906    ; The GNU linker assumes the stack is executable by default.
    907    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
    908
    909    %ifdef __NASM_VERSION_ID__
    910        %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03
    911            %if ARCH_X86_64
    912                ; Control-flow Enforcement Technology (CET) properties.
    913                [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize]
    914                dd 0x00000004  ; n_namesz
    915                dd gprsize + 8 ; n_descsz
    916                dd 0x00000005  ; n_type = NT_GNU_PROPERTY_TYPE_0
    917                db "GNU",0     ; n_name
    918                dd 0xc0000002  ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
    919                dd 0x00000004  ; pr_datasz
    920                dd 0x00000002  ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK
    921                dd 0x00000000  ; pr_padding
    922            %endif
    923        %endif
    924    %endif
    925 %endif
    926 
    927 ; Tell debuggers how large the function was.
    928 ; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
    929 ; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
    930 ; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
    931 ; then its size might be unspecified.
    932 %macro annotate_function_size 0
; YASM/ELF only: emit a "size" directive for current_function covering
; everything up to the current position ($), switching back to the section
; the function was defined in first.
    933    %ifdef __YASM_VER__
    934        %ifdef current_function
    935            %if FORMAT_ELF
    936                current_function_section
    937                %%ecf equ $
    938                size current_function %%ecf - current_function
    939                __SECT__
    940            %endif
    941        %endif
    942    %endif
    943 %endmacro
    944 
    945 ; cpuflags
    946 
; Each flag ORs in the flags of the instruction sets it implies, so testing
; e.g. cpuflag(sse2) succeeds for any superset (ssse3, avx, ...).
    947 %assign cpuflags_mmx       (1<<0)
    948 %assign cpuflags_mmx2      (1<<1)  | cpuflags_mmx
    949 %assign cpuflags_3dnow     (1<<2)  | cpuflags_mmx
    950 %assign cpuflags_3dnowext  (1<<3)  | cpuflags_3dnow
    951 %assign cpuflags_sse       (1<<4)  | cpuflags_mmx2
    952 %assign cpuflags_sse2      (1<<5)  | cpuflags_sse
    953 %assign cpuflags_sse2slow  (1<<6)  | cpuflags_sse2
    954 %assign cpuflags_lzcnt     (1<<7)  | cpuflags_sse2
    955 %assign cpuflags_sse3      (1<<8)  | cpuflags_sse2
    956 %assign cpuflags_ssse3     (1<<9)  | cpuflags_sse3
    957 %assign cpuflags_sse4      (1<<10) | cpuflags_ssse3
    958 %assign cpuflags_sse42     (1<<11) | cpuflags_sse4
    959 %assign cpuflags_aesni     (1<<12) | cpuflags_sse42
    960 %assign cpuflags_clmul     (1<<13) | cpuflags_sse42
    961 %assign cpuflags_gfni      (1<<14) | cpuflags_aesni|cpuflags_clmul
    962 %assign cpuflags_avx       (1<<15) | cpuflags_sse42
    963 %assign cpuflags_xop       (1<<16) | cpuflags_avx
    964 %assign cpuflags_fma4      (1<<17) | cpuflags_avx
    965 %assign cpuflags_fma3      (1<<18) | cpuflags_avx
    966 %assign cpuflags_bmi1      (1<<19) | cpuflags_avx|cpuflags_lzcnt
    967 %assign cpuflags_bmi2      (1<<20) | cpuflags_bmi1
    968 %assign cpuflags_avx2      (1<<21) | cpuflags_fma3|cpuflags_bmi2
    969 %assign cpuflags_avx512    (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL
    970 %assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
    971 
; Non-ISA flags: cache line size hints and function-variant markers.
    972 %assign cpuflags_cache32   (1<<24)
    973 %assign cpuflags_cache64   (1<<25)
    974 %assign cpuflags_aligned   (1<<26) ; not a cpu feature, but a function variant
    975 %assign cpuflags_atom      (1<<27)
    976 
    977 ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
; Branchless set-membership test: ((flags & mask) ^ mask) is 0 iff all required
; bits are present; subtracting 1 and shifting extracts that as a 0/1 boolean.
    978 %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
    979 %define notcpuflag(x) (cpuflag(x) ^ 1)
    980 
    981 ; Takes an arbitrary number of cpuflags from the above list.
    982 ; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
    983 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
    984 %macro INIT_CPUFLAGS 0-*
; Accumulates the given cpuflags into `cpuflags`, builds the function-name
; SUFFIX (e.g. _sse2_lzcnt), and picks mov/alignment strategies accordingly.
    985    %xdefine SUFFIX
    986    %undef cpuname
    987    %assign cpuflags 0
    988
    989    %if %0 >= 1
    990        %rep %0
    991            %ifdef cpuname
    992                %xdefine cpuname cpuname %+ _%1
    993            %else
    994                %xdefine cpuname %1
    995            %endif
    996            %assign cpuflags cpuflags | cpuflags_%1
    997            %rotate 1
    998        %endrep
    999        %xdefine SUFFIX _ %+ cpuname
   1000
   1001        %if cpuflag(avx)
   1002            %assign avx_enabled 1
   1003        %endif
        ; Fall back to the float forms of mova/movu/movnta when the integer
        ; (movdqa/movdqu/movntdq) forms aren't available at this vector width.
   1004        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
   1005            %define mova movaps
   1006            %define movu movups
   1007            %define movnta movntps
   1008        %endif
   1009        %if cpuflag(aligned)
   1010            %define movu mova
   1011        %elif cpuflag(sse3) && notcpuflag(ssse3)
   1012            %define movu lddqu
   1013        %endif
   1014    %endif
   1015
   1016    %if ARCH_X86_64 || cpuflag(sse2)
   1017        %ifdef __NASM_VERSION_ID__
   1018            ALIGNMODE p6
   1019        %else
   1020            CPU amdnop
   1021        %endif
   1022    %else
   1023        %ifdef __NASM_VERSION_ID__
   1024            ALIGNMODE nop
   1025        %else
   1026            CPU basicnop
   1027        %endif
   1028    %endif
   1029 %endmacro
   1030 
   1031 ; Merge mmx, sse*, and avx*
   1032 ; m# is a simd register of the currently selected size
   1033 ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
   1034 ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
   1035 ; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
   1036 ; (All 4 remain in sync through SWAP.)
   1037 
   1038 %macro CAT_XDEFINE 3
; Define the concatenation of %1 and %2 as %3 (workaround for %xdefine not
; expanding a concatenated name directly).
   1039    %xdefine %1%2 %3
   1040 %endmacro
   1041 
   1042 %macro CAT_UNDEF 2
; Undefine the concatenation of %1 and %2 (counterpart of CAT_XDEFINE).
   1043    %undef %1%2
   1044 %endmacro
   1045 
   1046 %macro DEFINE_MMREGS 1 ; mmtype
; Define m0..m(num_mmregs-1) as registers of the given type (mm/xmm/ymm/zmm)
; and the reverse nn<type>N -> N lookups. If the previous register set was
; larger, the now-invalid upper aliases are undefined.
   1047    %assign %%prev_mmregs 0
   1048    %ifdef num_mmregs
   1049        %assign %%prev_mmregs num_mmregs
   1050    %endif
   1051
   1052    %assign num_mmregs 8
   1053    %if ARCH_X86_64 && mmsize >= 16
   1054        %assign num_mmregs 16
   1055        %if cpuflag(avx512) || mmsize == 64
   1056            %assign num_mmregs 32
   1057        %endif
   1058    %endif
   1059
   1060    %assign %%i 0
   1061    %rep num_mmregs
   1062        CAT_XDEFINE m, %%i, %1 %+ %%i
   1063        CAT_XDEFINE nn%1, %%i, %%i
   1064        %assign %%i %%i+1
   1065    %endrep
   1066    %if %%prev_mmregs > num_mmregs
   1067        %rep %%prev_mmregs - num_mmregs
   1068            CAT_UNDEF m, %%i
            ; note: uses the *previous* mmtype on purpose — %xdefine below runs last
   1069            CAT_UNDEF nn %+ mmtype, %%i
   1070            %assign %%i %%i+1
   1071        %endrep
   1072    %endif
   1073    %xdefine mmtype %1
   1074 %endmacro
   1075 
   1076 ; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
   1077 %macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
; Swap m%1..m15 with m(%1+16)..m31 so code preferentially uses regs 16-31,
; which have no legacy-SSE upper state and thus need no vzeroupper.
   1078    %if ARCH_X86_64 && cpuflag(avx512)
   1079        %assign %%i %1
   1080        %rep 16-%1
   1081            %assign %%i_high %%i+16
   1082            SWAP %%i, %%i_high
   1083            %assign %%i %%i+1
   1084        %endrep
   1085    %endif
   1086 %endmacro
   1087 
   1088 %macro INIT_MMX 0-1+
; Select 64-bit MMX registers; optional args are cpuflags for INIT_CPUFLAGS.
   1089    %assign avx_enabled 0
   1090    %define RESET_MM_PERMUTATION INIT_MMX %1
   1091    %define mmsize 8
   1092    %define mova movq
   1093    %define movu movq
   1094    %define movh movd
   1095    %define movnta movntq
   1096    INIT_CPUFLAGS %1
   1097    DEFINE_MMREGS mm
   1098 %endmacro
   1099 
   1100 %macro INIT_XMM 0-1+
; Select 128-bit XMM registers; optional args are cpuflags for INIT_CPUFLAGS.
   1101    %assign avx_enabled FORCE_VEX_ENCODING
   1102    %define RESET_MM_PERMUTATION INIT_XMM %1
   1103    %define mmsize 16
   1104    %define mova movdqa
   1105    %define movu movdqu
   1106    %define movh movq
   1107    %define movnta movntdq
   1108    INIT_CPUFLAGS %1
   1109    DEFINE_MMREGS xmm
   1110    %if WIN64
   1111        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
   1112    %endif
   1113    %xdefine bcstw 1to8
   1114    %xdefine bcstd 1to4
   1115    %xdefine bcstq 1to2
   1116 %endmacro
   1117 
   1118 %macro INIT_YMM 0-1+
; Select 256-bit YMM registers; movh is undefined since there is no 128-bit
; "half move" counterpart at this width.
   1119    %assign avx_enabled 1
   1120    %define RESET_MM_PERMUTATION INIT_YMM %1
   1121    %define mmsize 32
   1122    %define mova movdqa
   1123    %define movu movdqu
   1124    %undef movh
   1125    %define movnta movntdq
   1126    INIT_CPUFLAGS %1
   1127    DEFINE_MMREGS ymm
   1128    AVX512_MM_PERMUTATION
   1129    %xdefine bcstw 1to16
   1130    %xdefine bcstd 1to8
   1131    %xdefine bcstq 1to4
   1132 %endmacro
   1133 
   1134 %macro INIT_ZMM 0-1+
; Select 512-bit ZMM registers (AVX-512).
   1135    %assign avx_enabled 1
   1136    %define RESET_MM_PERMUTATION INIT_ZMM %1
   1137    %define mmsize 64
   1138    %define mova movdqa
   1139    %define movu movdqu
   1140    %undef movh
   1141    %define movnta movntdq
   1142    INIT_CPUFLAGS %1
   1143    DEFINE_MMREGS zmm
   1144    AVX512_MM_PERMUTATION
   1145    %xdefine bcstw 1to32
   1146    %xdefine bcstd 1to16
   1147    %xdefine bcstq 1to8
   1148 %endmacro
   1149 
; Default register configuration until the first explicit INIT_* invocation.
   1150 INIT_XMM
   1151 
   1152 %macro DECLARE_MMCAST 1
; For register number %1, define <dsttype><srctype>N cast aliases so that
; xm#/ym#/zm# below resolve to the same physical register as m# at the
; requested width (clamped: casting an mm reg to any wider type stays mm).
   1153    %define  mmmm%1   mm%1
   1154    %define  mmxmm%1  mm%1
   1155    %define  mmymm%1  mm%1
   1156    %define  mmzmm%1  mm%1
   1157    %define xmmmm%1   mm%1
   1158    %define xmmxmm%1 xmm%1
   1159    %define xmmymm%1 xmm%1
   1160    %define xmmzmm%1 xmm%1
   1161    %define ymmmm%1   mm%1
   1162    %define ymmxmm%1 xmm%1
   1163    %define ymmymm%1 ymm%1
   1164    %define ymmzmm%1 ymm%1
   1165    %define zmmmm%1   mm%1
   1166    %define zmmxmm%1 xmm%1
   1167    %define zmmymm%1 ymm%1
   1168    %define zmmzmm%1 zmm%1
   1169    %define xm%1 xmm %+ m%1
   1170    %define ym%1 ymm %+ m%1
   1171    %define zm%1 zmm %+ m%1
   1172 %endmacro
   1173 
; Instantiate the cast aliases for all 32 possible register numbers.
   1174 %assign i 0
   1175 %rep 32
   1176    DECLARE_MMCAST i
   1177    %assign i i+1
   1178 %endrep
   1179 
   1180 ; I often want to use macros that permute their arguments. e.g. there's no
   1181 ; efficient way to implement butterfly or transpose or dct without swapping some
   1182 ; arguments.
   1183 ;
   1184 ; I would like to not have to manually keep track of the permutations:
   1185 ; If I insert a permutation in the middle of a function, it should automatically
   1186 ; change everything that follows. For more complex macros I may also have multiple
   1187 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
   1188 ;
   1189 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
   1190 ; permutes its arguments. It's equivalent to exchanging the contents of the
   1191 ; registers, except that this way you exchange the register names instead, so it
   1192 ; doesn't cost any cycles.
   1193 
   1194 %macro PERMUTE 2-* ; takes a list of pairs to swap
; First pass snapshots every source register, second pass rebinds the
; destinations, so overlapping pairs behave as a simultaneous permutation.
   1195    %rep %0/2
   1196        %xdefine %%tmp%2 m%2
   1197        %rotate 2
   1198    %endrep
   1199    %rep %0/2
   1200        %xdefine m%1 %%tmp%2
   1201        CAT_XDEFINE nn, m%1, %1
   1202        %rotate 2
   1203    %endrep
   1204 %endmacro
   1205 
   1206 %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
; Dispatch on whether arguments are numeric indices or register names.
   1207    %ifnum %1 ; SWAP 0, 1, ...
   1208        SWAP_INTERNAL_NUM %1, %2
   1209    %else ; SWAP m0, m1, ...
   1210        SWAP_INTERNAL_NAME %1, %2
   1211    %endif
   1212 %endmacro
   1213 
   1214 %macro SWAP_INTERNAL_NUM 2-*
; Pairwise-swap along the chain: (m%1,m%2), then (m%2,m%3), etc., updating
; the reverse nn lookups after each exchange.
   1215    %rep %0-1
   1216        %xdefine %%tmp m%1
   1217        %xdefine m%1 m%2
   1218        %xdefine m%2 %%tmp
   1219        CAT_XDEFINE nn, m%1, %1
   1220        CAT_XDEFINE nn, m%2, %2
   1221        %rotate 1
   1222    %endrep
   1223 %endmacro
   1224 
   1225 %macro SWAP_INTERNAL_NAME 2-*
; Translate register names to numbers via the nn lookups, then delegate to
; SWAP_INTERNAL_NUM.
   1226    %xdefine %%args nn %+ %1
   1227    %rep %0-1
   1228        %xdefine %%args %%args, nn %+ %2
   1229        %rotate 1
   1230    %endrep
   1231    SWAP_INTERNAL_NUM %%args
   1232 %endmacro
   1233 
   1234 ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
   1235 ; calls to that function will automatically load the permutation, so values can
   1236 ; be returned in mmregs.
   1237 %macro SAVE_MM_PERMUTATION 0-1
; Record the current m#->register mapping under <name>_m0..N (default name:
; current_function) so `call` sites can restore it via LOAD_MM_PERMUTATION.
   1238    %if %0
   1239        %xdefine %%f %1_m
   1240    %else
   1241        %xdefine %%f current_function %+ _m
   1242    %endif
   1243    %assign %%i 0
   1244    %rep num_mmregs
   1245        %xdefine %%tmp m %+ %%i
   1246        CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
   1247        %assign %%i %%i+1
   1248    %endrep
   1249 %endmacro
   1250 
   1251 %macro LOAD_MM_PERMUTATION 0-1 ; name to load from
; Restore a permutation saved by SAVE_MM_PERMUTATION; a no-op if nothing was
; saved under that name (checked by probing <name>_m0 for a numeric value).
   1252    %if %0
   1253        %xdefine %%f %1_m
   1254    %else
   1255        %xdefine %%f current_function %+ _m
   1256    %endif
   1257    %xdefine %%tmp %%f %+ 0
   1258    %ifnum %%tmp
   1259        DEFINE_MMREGS mmtype
   1260        %assign %%i 0
        ; Two passes: first resolve all targets, then rebind, so in-place
        ; permutations don't read partially-updated definitions.
   1261        %rep num_mmregs
   1262            %xdefine %%tmp %%f %+ %%i
   1263            CAT_XDEFINE %%m, %%i, m %+ %%tmp
   1264            %assign %%i %%i+1
   1265        %endrep
   1266        %rep num_mmregs
   1267            %assign %%i %%i-1
   1268            CAT_XDEFINE m, %%i, %%m %+ %%i
   1269            CAT_XDEFINE nn, m %+ %%i, %%i
   1270        %endrep
   1271    %endif
   1272 %endmacro
   1273 
   1274 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
   1275 %macro call 1
; Only identifiers get the SUFFIX/permutation treatment; anything else
; (e.g. register or memory operands) is passed through to the plain call.
   1276    %ifid %1
   1277        call_internal %1 %+ SUFFIX, %1
   1278    %else
   1279        call %1
   1280    %endif
   1281 %endmacro
   1282 %macro call_internal 2
; %1 == suffixed name, %2 == plain name. Prefers the suffixed variant only
; when it is cglobal'd and the plain one isn't, then applies PLT/GOT
; indirection as needed and reloads the callee's saved register permutation.
   1283    %xdefine %%i %2
   1284    %define %%j %%i
   1285    %ifndef cglobaled_%2
   1286        %ifdef cglobaled_%1
   1287            %xdefine %%i %1
   1288        %endif
   1289    %elif FORMAT_ELF
   1290        %if ARCH_X86_64
   1291            %if cglobaled_%2 >= 2
   1292                ; Always emit PLT relocations when calling external functions,
   1293                ; the linker will eliminate unnecessary PLT indirections anyway.
   1294                %define %%j %%i wrt ..plt
   1295            %endif
   1296        %elif PIC && cglobaled_%2 == 3
   1297            ; Go through the GOT for functions declared using cextern_naked with
   1298            ; PIC, as such functions presumably exist in external libraries.
   1299            extern _GLOBAL_OFFSET_TABLE_
   1300            LEA eax, $$+_GLOBAL_OFFSET_TABLE_ wrt ..gotpc
   1301            %define %%j [eax+%%i wrt ..got]
   1302        %endif
   1303    %endif
   1304    call %%j
   1305    LOAD_MM_PERMUTATION %%i
   1306 %endmacro
   1307 
   1308 ; Substitutions that reduce instruction size but are functionally equivalent
   1309 %macro add 2
; add reg, 128 needs an imm32 encoding, but sub reg, -128 fits in imm8.
   1310    %ifnum %2
   1311        %if %2==128
   1312            sub %1, -128
   1313        %else
   1314            add %1, %2
   1315        %endif
   1316    %else
   1317        add %1, %2
   1318    %endif
   1319 %endmacro
   1320 
   1321 %macro sub 2
; Mirror of the add macro: sub reg, 128 becomes the shorter add reg, -128.
   1322    %ifnum %2
   1323        %if %2==128
   1324            add %1, -128
   1325        %else
   1326            sub %1, %2
   1327        %endif
   1328    %else
   1329        sub %1, %2
   1330    %endif
   1331 %endmacro
   1332 
   1333 ;=============================================================================
   1334 ; AVX abstraction layer
   1335 ;=============================================================================
   1336 
; Per-register lookup tables: sizeof<reg> (bytes) and regnumof<reg> (index),
; used by RUN_AVX_INSTR to classify operands. Only mm0-mm7 exist.
   1337 %assign i 0
   1338 %rep 32
   1339    %if i < 8
   1340        CAT_XDEFINE sizeofmm, i, 8
   1341        CAT_XDEFINE regnumofmm, i, i
   1342    %endif
   1343    CAT_XDEFINE sizeofxmm, i, 16
   1344    CAT_XDEFINE sizeofymm, i, 32
   1345    CAT_XDEFINE sizeofzmm, i, 64
   1346    CAT_XDEFINE regnumofxmm, i, i
   1347    CAT_XDEFINE regnumofymm, i, i
   1348    CAT_XDEFINE regnumofzmm, i, i
   1349    %assign i i+1
   1350 %endrep
   1351 %undef i
   1352 
   1353 %macro CHECK_AVX_INSTR_EMU 3-*
; Error out if the destination (%2) aliases any of the remaining source
; operands (%3+), since non-AVX emulation overwrites the destination first.
   1354    %xdefine %%opcode %1
   1355    %xdefine %%dst %2
   1356    %rep %0-2
   1357        %ifidn %%dst, %3
   1358            %error non-avx emulation of ``%%opcode'' is not supported
   1359        %endif
   1360        %rotate 1
   1361    %endrep
   1362 %endmacro
   1363 
   1364 ;%1 == instruction
   1365 ;%2 == minimal instruction set
   1366 ;%3 == 1 if float, 0 if int
   1367 ;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
   1368 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
   1369 ;%6+: operands
   1370 %macro RUN_AVX_INSTR 6-9+
; Core AVX-abstraction dispatcher (see the parameter key in the comment block
; above). Chooses between the VEX-encoded v-prefixed form and the legacy form
; with an explicit mov-based emulation of the non-destructive 3/4-operand
; syntax, plus cpuflag sanity checking and operand-order optimizations.
   1371    %ifnum sizeof%7
   1372        %assign __sizeofreg sizeof%7
   1373    %elifnum sizeof%6
   1374        %assign __sizeofreg sizeof%6
   1375    %else
   1376        %assign __sizeofreg mmsize
   1377    %endif
   1378    %assign __emulate_avx 0
   1379    %if avx_enabled && __sizeofreg >= 16
   1380        %xdefine __instr v%1
   1381    %else
   1382        %xdefine __instr %1
   1383        %if %0 >= 8+%4
   1384            %assign __emulate_avx 1
   1385        %endif
   1386    %endif
    ; Compile-time guard: reject instructions above the function's declared
    ; cpuflags (only active when inside a cpuname'd function).
   1387    %ifnidn %2, fnord
   1388        %ifdef cpuname
   1389            %if notcpuflag(%2)
   1390                %error use of ``%1'' %2 instruction in cpuname function: current_function
   1391            %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
   1392                %error use of ``%1'' sse2 instruction in cpuname function: current_function
   1393            %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
   1394                %error use of ``%1'' avx2 instruction in cpuname function: current_function
   1395            %elif __sizeofreg == 16 && notcpuflag(sse)
   1396                %error use of ``%1'' sse instruction in cpuname function: current_function
   1397            %elif __sizeofreg == 32 && notcpuflag(avx)
   1398                %error use of ``%1'' avx instruction in cpuname function: current_function
   1399            %elif __sizeofreg == 64 && notcpuflag(avx512)
   1400                %error use of ``%1'' avx512 instruction in cpuname function: current_function
   1401            %elifidn %1, pextrw ; special case because the base instruction is mmx2,
   1402                %ifnid %6       ; but sse4 is required for memory operands
   1403                    %if notcpuflag(sse4)
   1404                        %error use of ``%1'' sse4 instruction in cpuname function: current_function
   1405                    %endif
   1406                %endif
   1407            %endif
   1408        %endif
   1409    %endif
   1410
   1411    %if __emulate_avx
   1412        %xdefine __src1 %7
   1413        %xdefine __src2 %8
   1414        %if %5 && %4 == 0
   1415            %ifnidn %6, %7
   1416                %ifidn %6, %8
   1417                    %xdefine __src1 %8
   1418                    %xdefine __src2 %7
   1419                %elifnnum sizeof%8
   1420                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
   1421                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
   1422                    ; So, if the instruction is commutative with a memory arg, swap them.
   1423                    %xdefine __src1 %8
   1424                    %xdefine __src2 %7
   1425                %endif
   1426            %endif
   1427        %endif
   1428        %ifnidn %6, __src1
   1429            %if %0 >= 9
   1430                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
   1431            %else
   1432                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
   1433            %endif
   1434            %if __sizeofreg == 8
   1435                MOVQ %6, __src1
   1436            %elif %3
   1437                MOVAPS %6, __src1
   1438            %else
   1439                MOVDQA %6, __src1
   1440            %endif
   1441        %endif
   1442        %if %0 >= 9
   1443            %1 %6, __src2, %9
   1444        %else
   1445            %1 %6, __src2
   1446        %endif
   1447    %elif %0 >= 9
   1448        %if avx_enabled && __sizeofreg >= 16 && %4 == 1
   1449            %ifnnum regnumof%7
   1450                %if %3
   1451                    vmovaps %6, %7
   1452                %else
   1453                    vmovdqa %6, %7
   1454                %endif
   1455                __instr %6, %6, %8, %9
   1456            %else
   1457                __instr %6, %7, %8, %9
   1458            %endif
   1459        %else
   1460            __instr %6, %7, %8, %9
   1461        %endif
   1462    %elif %0 == 8
   1463        %if avx_enabled && __sizeofreg >= 16 && %4 == 0
   1464            %xdefine __src1 %7
   1465            %xdefine __src2 %8
   1465            %if %5
   1467                %ifnum regnumof%7
   1468                    %ifnum regnumof%8
   1469                        %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
   1470                            ; Most VEX-encoded instructions require an additional byte to encode when
   1471                            ; src2 is a high register (e.g. m8..15). If the instruction is commutative
   1472                            ; we can swap src1 and src2 when doing so reduces the instruction length.
   1473                            %xdefine __src1 %8
   1474                            %xdefine __src2 %7
   1475                        %endif
   1476                    %endif
   1477                %elifnum regnumof%8 ; put memory operands in src2 when possible
   1478                    %xdefine __src1 %8
   1479                    %xdefine __src2 %7
   1480                %else
   1481                    %assign __emulate_avx 1
   1482                %endif
   1483            %elifnnum regnumof%7
   1484                ; EVEX allows imm8 shift instructions to be used with memory operands,
   1485                ; but VEX does not. This handles those special cases.
   1486                %ifnnum %8
   1487                    %assign __emulate_avx 1
   1488                %elif notcpuflag(avx512)
   1489                    %assign __emulate_avx 1
   1490                %endif
   1491            %endif
   1492            %if __emulate_avx ; a separate load is required
   1493                %if %3
   1494                    vmovaps %6, %7
   1495                %else
   1496                    vmovdqa %6, %7
   1497                %endif
   1498                __instr %6, %6, %8
   1499            %else
   1500                __instr %6, __src1, __src2
   1501            %endif
   1502        %else
   1503            __instr %6, %7, %8
   1504        %endif
   1505    %elif %0 == 7
   1506        %if avx_enabled && __sizeofreg >= 16 && %5
   1507            %xdefine __src1 %6
   1508            %xdefine __src2 %7
   1509            %ifnum regnumof%6
   1510                %ifnum regnumof%7
   1511                    %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
   1512                        %xdefine __src1 %7
   1513                        %xdefine __src2 %6
   1514                    %endif
   1515                %endif
   1516            %endif
   1517            __instr %6, __src1, __src2
   1518        %else
   1519            __instr %6, %7
   1520        %endif
   1521    %else
   1522        __instr %6
   1523    %endif
   1524 %endmacro
   1525 
   1526 ;%1 == instruction
   1527 ;%2 == minimal instruction set
   1528 ;%3 == 1 if float, 0 if int
   1529 ;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
   1530 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
   1531 %macro AVX_INSTR 1-5 fnord, 0, 255, 0
; Define an instruction-named wrapper macro that forwards to RUN_AVX_INSTR,
; using fnord sentinels to detect how many attributes were supplied.
   1532    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
   1533        %ifidn %2, fnord
   1534            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
   1535        %elifidn %3, fnord
   1536            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
   1537        %elifidn %4, fnord
   1538            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
   1539        %elifidn %5, fnord
   1540            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
   1541        %else
   1542            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
   1543        %endif
   1544    %endmacro
   1545 %endmacro
   1546 
   1547 ; Instructions with both VEX/EVEX and legacy encodings
   1548 ; Non-destructive instructions are written without parameters
; Attribute key: name, minimal cpuflag, float(1)/int(0), 4-op(1)/3-op(0)/no(255) emulation, commutative(1/0).
   1549 AVX_INSTR addpd, sse2, 1, 0, 1
   1550 AVX_INSTR addps, sse, 1, 0, 1
   1551 AVX_INSTR addsd, sse2, 1, 0, 0
   1552 AVX_INSTR addss, sse, 1, 0, 0
   1553 AVX_INSTR addsubpd, sse3, 1, 0, 0
   1554 AVX_INSTR addsubps, sse3, 1, 0, 0
   1555 AVX_INSTR aesdec, aesni, 0, 0, 0
   1556 AVX_INSTR aesdeclast, aesni, 0, 0, 0
   1557 AVX_INSTR aesenc, aesni, 0, 0, 0
   1558 AVX_INSTR aesenclast, aesni, 0, 0, 0
   1559 AVX_INSTR aesimc, aesni
   1560 AVX_INSTR aeskeygenassist, aesni
   1561 AVX_INSTR andnpd, sse2, 1, 0, 0
   1562 AVX_INSTR andnps, sse, 1, 0, 0
   1563 AVX_INSTR andpd, sse2, 1, 0, 1
   1564 AVX_INSTR andps, sse, 1, 0, 1
   1565 AVX_INSTR blendpd, sse4, 1, 1, 0
   1566 AVX_INSTR blendps, sse4, 1, 1, 0
   1567 AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
   1568 AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
   1569 AVX_INSTR cmpeqpd, sse2, 1, 0, 1
   1570 AVX_INSTR cmpeqps, sse, 1, 0, 1
   1571 AVX_INSTR cmpeqsd, sse2, 1, 0, 0
   1572 AVX_INSTR cmpeqss, sse, 1, 0, 0
   1573 AVX_INSTR cmplepd, sse2, 1, 0, 0
   1574 AVX_INSTR cmpleps, sse, 1, 0, 0
   1575 AVX_INSTR cmplesd, sse2, 1, 0, 0
   1576 AVX_INSTR cmpless, sse, 1, 0, 0
   1577 AVX_INSTR cmpltpd, sse2, 1, 0, 0
   1578 AVX_INSTR cmpltps, sse, 1, 0, 0
   1579 AVX_INSTR cmpltsd, sse2, 1, 0, 0
   1580 AVX_INSTR cmpltss, sse, 1, 0, 0
   1581 AVX_INSTR cmpneqpd, sse2, 1, 0, 1
   1582 AVX_INSTR cmpneqps, sse, 1, 0, 1
   1583 AVX_INSTR cmpneqsd, sse2, 1, 0, 0
   1584 AVX_INSTR cmpneqss, sse, 1, 0, 0
   1585 AVX_INSTR cmpnlepd, sse2, 1, 0, 0
   1586 AVX_INSTR cmpnleps, sse, 1, 0, 0
   1587 AVX_INSTR cmpnlesd, sse2, 1, 0, 0
   1588 AVX_INSTR cmpnless, sse, 1, 0, 0
   1589 AVX_INSTR cmpnltpd, sse2, 1, 0, 0
   1590 AVX_INSTR cmpnltps, sse, 1, 0, 0
   1591 AVX_INSTR cmpnltsd, sse2, 1, 0, 0
   1592 AVX_INSTR cmpnltss, sse, 1, 0, 0
   1593 AVX_INSTR cmpordpd, sse2 1, 0, 1
   1594 AVX_INSTR cmpordps, sse 1, 0, 1
   1595 AVX_INSTR cmpordsd, sse2 1, 0, 0
   1596 AVX_INSTR cmpordss, sse 1, 0, 0
; Attribute key: name, minimal cpuflag, float(1)/int(0), 4-op(1)/3-op(0)/no(255) emulation, commutative(1/0).
   1597 AVX_INSTR cmppd, sse2, 1, 1, 0
   1598 AVX_INSTR cmpps, sse, 1, 1, 0
   1599 AVX_INSTR cmpsd, sse2, 1, 1, 0
   1600 AVX_INSTR cmpss, sse, 1, 1, 0
   1601 AVX_INSTR cmpunordpd, sse2, 1, 0, 1
   1602 AVX_INSTR cmpunordps, sse, 1, 0, 1
   1603 AVX_INSTR cmpunordsd, sse2, 1, 0, 0
   1604 AVX_INSTR cmpunordss, sse, 1, 0, 0
   1605 AVX_INSTR comisd, sse2, 1
   1606 AVX_INSTR comiss, sse, 1
   1607 AVX_INSTR cvtdq2pd, sse2, 1
   1608 AVX_INSTR cvtdq2ps, sse2, 1
   1609 AVX_INSTR cvtpd2dq, sse2, 1
   1610 AVX_INSTR cvtpd2ps, sse2, 1
   1611 AVX_INSTR cvtps2dq, sse2, 1
   1612 AVX_INSTR cvtps2pd, sse2, 1
   1613 AVX_INSTR cvtsd2si, sse2, 1
   1614 AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
   1615 AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
   1616 AVX_INSTR cvtsi2ss, sse, 1, 0, 0
   1617 AVX_INSTR cvtss2sd, sse2, 1, 0, 0
   1618 AVX_INSTR cvtss2si, sse, 1
   1619 AVX_INSTR cvttpd2dq, sse2, 1
   1620 AVX_INSTR cvttps2dq, sse2, 1
   1621 AVX_INSTR cvttsd2si, sse2, 1
   1622 AVX_INSTR cvttss2si, sse, 1
   1623 AVX_INSTR divpd, sse2, 1, 0, 0
   1624 AVX_INSTR divps, sse, 1, 0, 0
   1625 AVX_INSTR divsd, sse2, 1, 0, 0
   1626 AVX_INSTR divss, sse, 1, 0, 0
   1627 AVX_INSTR dppd, sse4, 1, 1, 0
   1628 AVX_INSTR dpps, sse4, 1, 1, 0
   1629 AVX_INSTR extractps, sse4, 1
   1630 AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
   1631 AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
   1632 AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
   1633 AVX_INSTR haddpd, sse3, 1, 0, 0
   1634 AVX_INSTR haddps, sse3, 1, 0, 0
   1635 AVX_INSTR hsubpd, sse3, 1, 0, 0
   1636 AVX_INSTR hsubps, sse3, 1, 0, 0
   1637 AVX_INSTR insertps, sse4, 1, 1, 0
   1638 AVX_INSTR lddqu, sse3
   1639 AVX_INSTR ldmxcsr, sse, 1
   1640 AVX_INSTR maskmovdqu, sse2
   1641 AVX_INSTR maxpd, sse2, 1, 0, 1
   1642 AVX_INSTR maxps, sse, 1, 0, 1
   1643 AVX_INSTR maxsd, sse2, 1, 0, 0
   1644 AVX_INSTR maxss, sse, 1, 0, 0
   1645 AVX_INSTR minpd, sse2, 1, 0, 1
   1646 AVX_INSTR minps, sse, 1, 0, 1
   1647 AVX_INSTR minsd, sse2, 1, 0, 0
   1648 AVX_INSTR minss, sse, 1, 0, 0
   1649 AVX_INSTR movapd, sse2, 1
   1650 AVX_INSTR movaps, sse, 1
   1651 AVX_INSTR movd, mmx
   1652 AVX_INSTR movddup, sse3, 1
   1653 AVX_INSTR movdqa, sse2
   1654 AVX_INSTR movdqu, sse2
   1655 AVX_INSTR movhlps, sse, 1, 0, 0
   1656 AVX_INSTR movhpd, sse2, 1, 0, 0
   1657 AVX_INSTR movhps, sse, 1, 0, 0
   1658 AVX_INSTR movlhps, sse, 1, 0, 0
   1659 AVX_INSTR movlpd, sse2, 1, 0, 0
   1660 AVX_INSTR movlps, sse, 1, 0, 0
   1661 AVX_INSTR movmskpd, sse2, 1
   1662 AVX_INSTR movmskps, sse, 1
   1663 AVX_INSTR movntdq, sse2
   1664 AVX_INSTR movntdqa, sse4
   1665 AVX_INSTR movntpd, sse2, 1
   1666 AVX_INSTR movntps, sse, 1
   1667 AVX_INSTR movq, mmx
   1668 AVX_INSTR movsd, sse2, 1, 0, 0
   1669 AVX_INSTR movshdup, sse3, 1
   1670 AVX_INSTR movsldup, sse3, 1
   1671 AVX_INSTR movss, sse, 1, 0, 0
   1672 AVX_INSTR movupd, sse2, 1
   1673 AVX_INSTR movups, sse, 1
   1674 AVX_INSTR mpsadbw, sse4, 0, 1, 0
   1675 AVX_INSTR mulpd, sse2, 1, 0, 1
   1676 AVX_INSTR mulps, sse, 1, 0, 1
   1677 AVX_INSTR mulsd, sse2, 1, 0, 0
   1678 AVX_INSTR mulss, sse, 1, 0, 0
   1679 AVX_INSTR orpd, sse2, 1, 0, 1
   1680 AVX_INSTR orps, sse, 1, 0, 1
   1681 AVX_INSTR pabsb, ssse3
   1682 AVX_INSTR pabsd, ssse3
   1683 AVX_INSTR pabsw, ssse3
   1684 AVX_INSTR packssdw, mmx, 0, 0, 0
   1685 AVX_INSTR packsswb, mmx, 0, 0, 0
   1686 AVX_INSTR packusdw, sse4, 0, 0, 0
   1687 AVX_INSTR packuswb, mmx, 0, 0, 0
   1688 AVX_INSTR paddb, mmx, 0, 0, 1
   1689 AVX_INSTR paddd, mmx, 0, 0, 1
   1690 AVX_INSTR paddq, sse2, 0, 0, 1
   1691 AVX_INSTR paddsb, mmx, 0, 0, 1
   1692 AVX_INSTR paddsw, mmx, 0, 0, 1
   1693 AVX_INSTR paddusb, mmx, 0, 0, 1
   1694 AVX_INSTR paddusw, mmx, 0, 0, 1
   1695 AVX_INSTR paddw, mmx, 0, 0, 1
   1696 AVX_INSTR palignr, ssse3, 0, 1, 0
   1697 AVX_INSTR pand, mmx, 0, 0, 1
   1698 AVX_INSTR pandn, mmx, 0, 0, 0
   1699 AVX_INSTR pavgb, mmx2, 0, 0, 1
   1700 AVX_INSTR pavgw, mmx2, 0, 0, 1
   1701 AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
   1702 AVX_INSTR pblendw, sse4, 0, 1, 0
   1703 AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0
   1704 AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0
   1705 AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0
   1706 AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0
   1707 AVX_INSTR pclmulqdq, clmul, 0, 1, 0
   1708 AVX_INSTR pcmpeqb, mmx, 0, 0, 1
   1709 AVX_INSTR pcmpeqd, mmx, 0, 0, 1
   1710 AVX_INSTR pcmpeqq, sse4, 0, 0, 1
   1711 AVX_INSTR pcmpeqw, mmx, 0, 0, 1
   1712 AVX_INSTR pcmpestri, sse42
   1713 AVX_INSTR pcmpestrm, sse42
   1714 AVX_INSTR pcmpgtb, mmx, 0, 0, 0
   1715 AVX_INSTR pcmpgtd, mmx, 0, 0, 0
   1716 AVX_INSTR pcmpgtq, sse42, 0, 0, 0
   1717 AVX_INSTR pcmpgtw, mmx, 0, 0, 0
   1718 AVX_INSTR pcmpistri, sse42
   1719 AVX_INSTR pcmpistrm, sse42
   1720 AVX_INSTR pextrb, sse4
   1721 AVX_INSTR pextrd, sse4
   1722 AVX_INSTR pextrq, sse4
   1723 AVX_INSTR pextrw, mmx2
   1724 AVX_INSTR phaddd, ssse3, 0, 0, 0
   1725 AVX_INSTR phaddsw, ssse3, 0, 0, 0
   1726 AVX_INSTR phaddw, ssse3, 0, 0, 0
   1727 AVX_INSTR phminposuw, sse4
   1728 AVX_INSTR phsubd, ssse3, 0, 0, 0
   1729 AVX_INSTR phsubsw, ssse3, 0, 0, 0
   1730 AVX_INSTR phsubw, ssse3, 0, 0, 0
   1731 AVX_INSTR pinsrb, sse4, 0, 1, 0
   1732 AVX_INSTR pinsrd, sse4, 0, 1, 0
   1733 AVX_INSTR pinsrq, sse4, 0, 1, 0
   1734 AVX_INSTR pinsrw, mmx2, 0, 1, 0
   1735 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
   1736 AVX_INSTR pmaddwd, mmx, 0, 0, 1
   1737 AVX_INSTR pmaxsb, sse4, 0, 0, 1
   1738 AVX_INSTR pmaxsd, sse4, 0, 0, 1
   1739 AVX_INSTR pmaxsw, mmx2, 0, 0, 1
   1740 AVX_INSTR pmaxub, mmx2, 0, 0, 1
   1741 AVX_INSTR pmaxud, sse4, 0, 0, 1
   1742 AVX_INSTR pmaxuw, sse4, 0, 0, 1
   1743 AVX_INSTR pminsb, sse4, 0, 0, 1
   1744 AVX_INSTR pminsd, sse4, 0, 0, 1
   1745 AVX_INSTR pminsw, mmx2, 0, 0, 1
   1746 AVX_INSTR pminub, mmx2, 0, 0, 1
   1747 AVX_INSTR pminud, sse4, 0, 0, 1
   1748 AVX_INSTR pminuw, sse4, 0, 0, 1
   1749 AVX_INSTR pmovmskb, mmx2
   1750 AVX_INSTR pmovsxbd, sse4
   1751 AVX_INSTR pmovsxbq, sse4
   1752 AVX_INSTR pmovsxbw, sse4
   1753 AVX_INSTR pmovsxdq, sse4
   1754 AVX_INSTR pmovsxwd, sse4
   1755 AVX_INSTR pmovsxwq, sse4
   1756 AVX_INSTR pmovzxbd, sse4
   1757 AVX_INSTR pmovzxbq, sse4
   1758 AVX_INSTR pmovzxbw, sse4
   1759 AVX_INSTR pmovzxdq, sse4
   1760 AVX_INSTR pmovzxwd, sse4
   1761 AVX_INSTR pmovzxwq, sse4
   1762 AVX_INSTR pmuldq, sse4, 0, 0, 1
   1763 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
   1764 AVX_INSTR pmulhuw, mmx2, 0, 0, 1
   1765 AVX_INSTR pmulhw, mmx, 0, 0, 1
   1766 AVX_INSTR pmulld, sse4, 0, 0, 1
   1767 AVX_INSTR pmullw, mmx, 0, 0, 1
   1768 AVX_INSTR pmuludq, sse2, 0, 0, 1
   1769 AVX_INSTR por, mmx, 0, 0, 1
   1770 AVX_INSTR psadbw, mmx2, 0, 0, 1
   1771 AVX_INSTR pshufb, ssse3, 0, 0, 0
   1772 AVX_INSTR pshufd, sse2
   1773 AVX_INSTR pshufhw, sse2
   1774 AVX_INSTR pshuflw, sse2
   1775 AVX_INSTR psignb, ssse3, 0, 0, 0
   1776 AVX_INSTR psignd, ssse3, 0, 0, 0
   1777 AVX_INSTR psignw, ssse3, 0, 0, 0
   1778 AVX_INSTR pslld, mmx, 0, 0, 0
   1779 AVX_INSTR pslldq, sse2, 0, 0, 0
   1780 AVX_INSTR psllq, mmx, 0, 0, 0
   1781 AVX_INSTR psllw, mmx, 0, 0, 0
   1782 AVX_INSTR psrad, mmx, 0, 0, 0
   1783 AVX_INSTR psraw, mmx, 0, 0, 0
   1784 AVX_INSTR psrld, mmx, 0, 0, 0
   1785 AVX_INSTR psrldq, sse2, 0, 0, 0
   1786 AVX_INSTR psrlq, mmx, 0, 0, 0
   1787 AVX_INSTR psrlw, mmx, 0, 0, 0
   1788 AVX_INSTR psubb, mmx, 0, 0, 0
   1789 AVX_INSTR psubd, mmx, 0, 0, 0
   1790 AVX_INSTR psubq, sse2, 0, 0, 0
   1791 AVX_INSTR psubsb, mmx, 0, 0, 0
   1792 AVX_INSTR psubsw, mmx, 0, 0, 0
   1793 AVX_INSTR psubusb, mmx, 0, 0, 0
   1794 AVX_INSTR psubusw, mmx, 0, 0, 0
   1795 AVX_INSTR psubw, mmx, 0, 0, 0
   1796 AVX_INSTR ptest, sse4
   1797 AVX_INSTR punpckhbw, mmx, 0, 0, 0
   1798 AVX_INSTR punpckhdq, mmx, 0, 0, 0
   1799 AVX_INSTR punpckhqdq, sse2, 0, 0, 0
   1800 AVX_INSTR punpckhwd, mmx, 0, 0, 0
   1801 AVX_INSTR punpcklbw, mmx, 0, 0, 0
   1802 AVX_INSTR punpckldq, mmx, 0, 0, 0
   1803 AVX_INSTR punpcklqdq, sse2, 0, 0, 0
   1804 AVX_INSTR punpcklwd, mmx, 0, 0, 0
   1805 AVX_INSTR pxor, mmx, 0, 0, 1
   1806 AVX_INSTR rcpps, sse, 1
   1807 AVX_INSTR rcpss, sse, 1, 0, 0
   1808 AVX_INSTR roundpd, sse4, 1
   1809 AVX_INSTR roundps, sse4, 1
   1810 AVX_INSTR roundsd, sse4, 1, 1, 0
   1811 AVX_INSTR roundss, sse4, 1, 1, 0
   1812 AVX_INSTR rsqrtps, sse, 1
   1813 AVX_INSTR rsqrtss, sse, 1, 0, 0
   1814 AVX_INSTR shufpd, sse2, 1, 1, 0
   1815 AVX_INSTR shufps, sse, 1, 1, 0
   1816 AVX_INSTR sqrtpd, sse2, 1
   1817 AVX_INSTR sqrtps, sse, 1
   1818 AVX_INSTR sqrtsd, sse2, 1, 0, 0
   1819 AVX_INSTR sqrtss, sse, 1, 0, 0
   1820 AVX_INSTR stmxcsr, sse, 1
   1821 AVX_INSTR subpd, sse2, 1, 0, 0
   1822 AVX_INSTR subps, sse, 1, 0, 0
   1823 AVX_INSTR subsd, sse2, 1, 0, 0
   1824 AVX_INSTR subss, sse, 1, 0, 0
   1825 AVX_INSTR ucomisd, sse2, 1
   1826 AVX_INSTR ucomiss, sse, 1
   1827 AVX_INSTR unpckhpd, sse2, 1, 0, 0
   1828 AVX_INSTR unpckhps, sse, 1, 0, 0
   1829 AVX_INSTR unpcklpd, sse2, 1, 0, 0
   1830 AVX_INSTR unpcklps, sse, 1, 0, 0
   1831 AVX_INSTR xorpd, sse2, 1, 0, 1
   1832 AVX_INSTR xorps, sse, 1, 0, 1
   1833 
; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
AVX_INSTR pfadd, 3dnow, 1, 0, 1
AVX_INSTR pfmul, 3dnow, 1, 0, 1
AVX_INSTR pfsub, 3dnow, 1, 0, 0
   1838 
;%1 == instruction
;%2 == minimal instruction set
; Defines a wrapper macro named %1 that forwards its 2-3 operands unchanged to
; the real instruction, but raises an assembly-time error when the required
; instruction set %2 is not part of the current cpu flags.  The check only
; fires when cpuname is defined -- NOTE(review): cpuname is presumably set
; per-function by cglobal elsewhere in this file; confirm.
%macro GPR_INSTR 2
    %macro %1 2-5 fnord, %1, %2
        ; Defaulted trailing params: %4 == instruction name, %5 == cpu flag.
        %ifdef cpuname
            %if notcpuflag(%5)
                %error use of ``%4'' %5 instruction in cpuname function: current_function
            %endif
        %endif
        ; %3 left at the fnord sentinel means the 2-operand form was used.
        %ifidn %3, fnord
            %4 %1, %2
        %else
            %4 %1, %2, %3
        %endif
    %endmacro
%endmacro
   1855 
; General-purpose-register instructions gated on newer ISA extensions
; (BMI1, BMI2, SSE4.2); using one inside a function declared for an older
; instruction set trips the error check in GPR_INSTR.
GPR_INSTR andn, bmi1
GPR_INSTR bextr, bmi1
GPR_INSTR blsi, bmi1
GPR_INSTR blsmsk, bmi1
GPR_INSTR blsr, bmi1
GPR_INSTR bzhi, bmi2
GPR_INSTR crc32, sse42
GPR_INSTR mulx, bmi2
GPR_INSTR pdep, bmi2
GPR_INSTR pext, bmi2
GPR_INSTR popcnt, sse42
GPR_INSTR rorx, bmi2
GPR_INSTR sarx, bmi2
GPR_INSTR shlx, bmi2
GPR_INSTR shrx, bmi2
   1871 
; base-4 constants for shuffles
; Defines the 256 symbols q0000 ... q3333, where each base-4 digit of the
; name is one 2-bit field of the 8-bit shuffle immediate the symbol expands
; to: q<dcba> == (d << 6) | (c << 4) | (b << 2) | a, e.g. q3120 == 0xD8.
; The q000/q00/q0/q prefix split merely left-pads j with zeroes to four
; digits (CAT_XDEFINE, defined earlier in this file, concatenates the prefix
; with j and defines the result as i).
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j
   1889 
; Wraps a 4-operand XOP multiply-accumulate instruction so the same mnemonic
; can also be assembled on non-XOP targets using a separate multiply and add.
;%1 == xop instruction (emitted with a "v" prefix when cpuflag(xop))
;%2 == multiply instruction used for the non-xop emulation
;%3 == addition instruction used for the non-xop emulation
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            ; dst = src1 * src2, then dst += src3.  The destination is
            ; overwritten by the multiply first, so this emulation is only
            ; valid when dst is not also the accumulator operand %4.
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR pmacsdd,  pmulld,  paddd ; sse4 emulation
FMA_INSTR pmacsdql, pmuldq,  paddq ; sse4 emulation
FMA_INSTR pmacsww,  pmullw,  paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
   1907 
; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
; FMA3 is only possible if dst is the same as one of the src registers.
; Either src2 or src3 can be a memory operand.
;%1 == instruction family prefix (e.g. fmadd)
;%2... == operand-type suffixes (pd, ps, sd, ss) to instantiate wrappers for
%macro FMA4_INSTR 2-*
    %push fma4_instr
    %xdefine %$prefix %1
    %rep %0 - 1
        ; Inner wrapper: %1 = dst, %2 = src1, %3 = src2, %4 = src3;
        ; defaulted %5/%6 carry the prefix and suffix forming the mnemonic.
        ; The FMA3 132/213/231 variant is picked so that dst aliases the
        ; correct source and any memory operand lands in the last slot.
        %macro %$prefix%2 4-6 %$prefix, %2
            %if notcpuflag(fma3) && notcpuflag(fma4)
                %error use of ``%5%6'' fma instruction in cpuname function: current_function
            %elif cpuflag(fma4)
                v%5%6 %1, %2, %3, %4
            %elifidn %1, %2
                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
                ; sizeof%3 is numeric only when %3 is a register.
                %ifnum sizeof%3
                    v%{5}213%6 %2, %3, %4
                %else
                    v%{5}132%6 %2, %4, %3
                %endif
            %elifidn %1, %3
                v%{5}213%6 %3, %2, %4
            %elifidn %1, %4
                v%{5}231%6 %4, %2, %3
            %else
                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
            %endif
        %endmacro
        %rotate 1
    %endrep
    %pop
%endmacro
   1939 
; Instantiate fmaddpd/fmaddps/... etc. wrappers for each family/suffix pair.
FMA4_INSTR fmadd,    pd, ps, sd, ss
FMA4_INSTR fmaddsub, pd, ps
FMA4_INSTR fmsub,    pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd,   pd, ps, sd, ss
FMA4_INSTR fnmsub,   pd, ps, sd, ss
   1946 
; Macros for converting VEX instructions to equivalent EVEX ones.
;%1 == vex instruction name (also the name of the generated wrapper)
;%2 == evex instruction name
;%3 == 1 to prefer the EVEX form whenever avx512 is enabled
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
    %macro %1 2-7 fnord, fnord, %1, %2, %3
        ; Collect the 2-4 user-supplied operands (fnord marks absent ones).
        %ifidn %3, fnord
            %define %%args %1, %2
        %elifidn %4, fnord
            %define %%args %1, %2, %3
        %else
            %define %%args %1, %2, %3, %4
        %endif
        ; EVEX is mandatory when an operand can't be VEX-encoded: register
        ; number >= 16 (xmm16+) or register wider than 32 bytes (zmm).
        %assign %%evex_required cpuflag(avx512) & %7
        %ifnum regnumof%1
            %if regnumof%1 >= 16 || sizeof%1 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%2
            %if regnumof%2 >= 16 || sizeof%2 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%3
            %if regnumof%3 >= 16 || sizeof%3 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %if %%evex_required
            %6 %%args
        %else
            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
        %endif
    %endmacro
%endmacro
   1980 
; VEX mnemonics transparently promoted to their EVEX equivalents when an
; operand requires EVEX encoding (see EVEX_INSTR above), or always under
; avx512 for the entries marked with prefer_evex == 1.
EVEX_INSTR vbroadcastf128, vbroadcastf32x4
EVEX_INSTR vbroadcasti128, vbroadcasti32x4
EVEX_INSTR vextractf128,   vextractf32x4
EVEX_INSTR vextracti128,   vextracti32x4
EVEX_INSTR vinsertf128,    vinsertf32x4
EVEX_INSTR vinserti128,    vinserti32x4
EVEX_INSTR vmovdqa,        vmovdqa32
EVEX_INSTR vmovdqu,        vmovdqu32
EVEX_INSTR vpand,          vpandd
EVEX_INSTR vpandn,         vpandnd
EVEX_INSTR vpor,           vpord
EVEX_INSTR vpxor,          vpxord
EVEX_INSTR vrcpps,         vrcp14ps,   1 ; EVEX versions have higher precision
EVEX_INSTR vrcpss,         vrcp14ss,   1
EVEX_INSTR vrsqrtps,       vrsqrt14ps, 1
EVEX_INSTR vrsqrtss,       vrsqrt14ss, 1