tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

x86inc.asm (59540B)


      1 ;*****************************************************************************
      2 ;* x86inc.asm: x86 abstraction layer
      3 ;*****************************************************************************
      4 ;* Copyright (C) 2005-2024 x264 project
      5 ;*
      6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
      7 ;*          Henrik Gramner <henrik@gramner.com>
      8 ;*          Anton Mitrofanov <BugMaster@narod.ru>
      9 ;*          Fiona Glaser <fiona@x264.com>
     10 ;*
     11 ;* Permission to use, copy, modify, and/or distribute this software for any
     12 ;* purpose with or without fee is hereby granted, provided that the above
     13 ;* copyright notice and this permission notice appear in all copies.
     14 ;*
     15 ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     16 ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     17 ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     18 ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     19 ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     20 ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     21 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     22 ;*****************************************************************************
     23 
     24 ; This is a header file for the x86inc.asm assembly language, which uses
     25 ; NASM/YASM syntax combined with a large number of macros to provide easy
     26 ; abstraction between different calling conventions (x86_32, win64, linux64).
     27 ; It also has various other useful features to simplify writing the kind of
     28 ; DSP functions that are most often used.
     29 
         ; private_prefix (the symbol namespace for internal functions) must be
         ; supplied by the build system; there is no sane default.
      30 %ifndef private_prefix
      31    %error private_prefix not defined
      32 %endif
      33 
         ; Externally visible symbols fall back to the private prefix unless the
         ; build system overrides public_prefix.
      34 %ifndef public_prefix
      35    %define public_prefix private_prefix
      36 %endif
      37 
         ; STACK_ALIGNMENT = alignment the incoming stack pointer can be assumed
         ; to have on function entry: 16 when guaranteed by the environment
         ; (HAVE_ALIGNED_STACK, or any x86-64 ABI), otherwise only 4 on x86-32.
      38 %if HAVE_ALIGNED_STACK
      39    %define STACK_ALIGNMENT 16
      40 %endif
      41 %ifndef STACK_ALIGNMENT
      42    %if ARCH_X86_64
      43        %define STACK_ALIGNMENT 16
      44    %else
      45        %define STACK_ALIGNMENT 4
      46    %endif
      47 %endif
      48 
         ; Classify the 64-bit calling convention from the object format: any
         ; Windows format implies the Microsoft x64 ABI, everything else SysV.
      49 %define WIN64  0
      50 %define UNIX64 0
      51 %if ARCH_X86_64
      52    %ifidn __OUTPUT_FORMAT__,win32
      53        %define WIN64  1
      54    %elifidn __OUTPUT_FORMAT__,win64
      55        %define WIN64  1
      56    %elifidn __OUTPUT_FORMAT__,x64
      57        %define WIN64  1
      58    %else
      59        %define UNIX64 1
      60    %endif
      61 %endif
     62 
         ; Object-format family flags, used later when choosing symbol
         ; visibility directives (ELF "hidden", Mach-O "private_extern").
      63 %define FORMAT_ELF 0
      64 %define FORMAT_MACHO 0
      65 %ifidn __OUTPUT_FORMAT__,elf
      66    %define FORMAT_ELF 1
      67 %elifidn __OUTPUT_FORMAT__,elf32
      68    %define FORMAT_ELF 1
      69 %elifidn __OUTPUT_FORMAT__,elf64
      70    %define FORMAT_ELF 1
      71 %elifidn __OUTPUT_FORMAT__,macho
      72    %define FORMAT_MACHO 1
      73 %elifidn __OUTPUT_FORMAT__,macho32
      74    %define FORMAT_MACHO 1
      75 %elifidn __OUTPUT_FORMAT__,macho64
      76    %define FORMAT_MACHO 1
      77 %endif
      78 
         ; C symbol mangling: prepend an underscore to C-visible names when the
         ; build system defines PREFIX (platform-ABI dependent).
      79 %ifdef PREFIX
      80    %define mangle(x) _ %+ x
      81 %else
      82    %define mangle(x) x
      83 %endif
      84 
      85 ; Use VEX-encoding even in non-AVX functions
      86 %ifndef FORCE_VEX_ENCODING
      87    %define FORCE_VEX_ENCODING 0
      88 %endif
     89 
      90 ; aout does not support align=
      91 ; NOTE: This section is out of sync with x264, in order to
      92 ; keep supporting OS/2.
         ; SECTION_RODATA [align=16]: switch to the read-only data section,
         ; falling back to .text on formats without a usable rodata section.
      93 %macro SECTION_RODATA 0-1 16
      94    %ifidn __OUTPUT_FORMAT__,aout
      95        SECTION .text
      96    %elifidn __OUTPUT_FORMAT__,coff
      97        SECTION .text
      98    %elifidn __OUTPUT_FORMAT__,win32
      99        SECTION .rdata align=%1
     100    %elif WIN64
     101        SECTION .rdata align=%1
     102    %else
     103        SECTION .rodata align=%1
     104    %endif
     105 %endmacro
    106 
         ; On x86-64 everything is assembled position-independent (RIP-relative
         ; addressing via "default rel"); on 32-bit, PIC is opt-in.
     107 %if ARCH_X86_64
     108    %define PIC 1 ; always use PIC on x86-64
     109    default rel
     110 %elifidn __OUTPUT_FORMAT__,win32
     111    %define PIC 0 ; PIC isn't used on 32-bit Windows
     112 %elifndef PIC
     113    %define PIC 0
     114 %endif
     115 
         ; NASM-specific features: enable smartalign, and note that the
         ; private_extern visibility keyword needs NASM >= 2.14.
     116 %define HAVE_PRIVATE_EXTERN 1
     117 %ifdef __NASM_VERSION_ID__
     118    %use smartalign
     119    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
     120        %define HAVE_PRIVATE_EXTERN 0
     121    %endif
     122 %endif
    123 
    124 ; Macros to eliminate most code duplication between x86_32 and x86_64:
    125 ; Currently this works only for leaf functions which load all their arguments
    126 ; into registers at the start, and make no other use of the stack. Luckily that
    127 ; covers most use cases.
    128 
    129 ; PROLOGUE:
    130 ; %1 = number of arguments. loads them from stack if needed.
    131 ; %2 = number of registers used. pushes callee-saved regs if needed.
    132 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
    133 ; %4 = (optional) stack size to be allocated. The stack will be aligned before
    134 ;      allocating the specified stack size. If the required stack alignment is
    135 ;      larger than the known stack alignment the stack will be manually aligned
    136 ;      and an extra register will be allocated to hold the original stack
    137 ;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
    138 ;      register as stack pointer, request a negative stack size.
    139 ; %4+/%5+ = list of names to define to registers
    140 ; PROLOGUE can also be invoked by adding the same options to cglobal
    141 
    142 ; e.g.
    143 ; cglobal foo, 2,3,7,0x40, dst, src, tmp
    144 ; declares a function (foo) that automatically loads two arguments (dst and
    145 ; src) into registers, uses one additional register (tmp) plus 7 vector
    146 ; registers (m0-m6) and allocates 0x40 bytes of stack space.
    147 
    148 ; TODO Some functions can use some args directly from the stack. If they're the
    149 ; last args then you can just not declare them, but if they're in the middle
    150 ; we need more flexible macro.
    151 
    152 ; RET:
    153 ; Pops anything that was pushed by PROLOGUE, and returns.
    154 
    155 ; REP_RET:
    156 ; Use this instead of RET if it's a branch target.
    157 
    158 ; registers:
    159 ; rN and rNq are the native-size register holding function argument N
    160 ; rNd, rNw, rNb are dword, word, and byte size
    161 ; rNh is the high 8 bits of the word size
    162 ; rNm is the original location of arg N (a register or on the stack), dword
    163 ; rNmp is native size
    164 
         ; DECLARE_REG n, reg[, offset]
         ; Binds argument n to a physical register and defines the size aliases
         ; rNq/rNd/rNw/rNb/rNh. When an offset is given, rNm/rNmp address the
         ; argument's home slot at [rstk + stack_offset + offset]; otherwise
         ; they alias the register itself.
     165 %macro DECLARE_REG 2-3
     166    %define r%1q %2
     167    %define r%1d %2d
     168    %define r%1w %2w
     169    %define r%1b %2b
     170    %define r%1h %2h
     171    %define %2q %2
     172    %if %0 == 2
     173        %define r%1m  %2d
     174        %define r%1mp %2
     175    %elif ARCH_X86_64 ; memory
     176        %define r%1m [rstk + stack_offset + %3]
     177        %define r%1mp qword r %+ %1 %+ m
     178    %else
     179        %define r%1m [rstk + stack_offset + %3]
     180        %define r%1mp dword r %+ %1 %+ m
     181    %endif
     182    %define r%1  %2
     183 %endmacro
    184 
         ; DECLARE_REG_SIZE name, low8, high8
         ; Size aliases for a legacy GPR (e.g. ax -> rax/eax/ax/al/ah); "null"
         ; is passed as the high-8 name for registers that have none. On
         ; x86-32 the bare rN name falls back to the 32-bit eN register.
     185 %macro DECLARE_REG_SIZE 3
     186    %define r%1q r%1
     187    %define e%1q r%1
     188    %define r%1d e%1
     189    %define e%1d e%1
     190    %define r%1w %1
     191    %define e%1w %1
     192    %define r%1h %3
     193    %define e%1h %3
     194    %define r%1b %2
     195    %define e%1b %2
     196    %if ARCH_X86_64 == 0
     197        %define r%1 e%1
     198    %endif
     199 %endmacro
     200 
     201 DECLARE_REG_SIZE ax, al, ah
     202 DECLARE_REG_SIZE bx, bl, bh
     203 DECLARE_REG_SIZE cx, cl, ch
     204 DECLARE_REG_SIZE dx, dl, dh
     205 DECLARE_REG_SIZE si, sil, null
     206 DECLARE_REG_SIZE di, dil, null
     207 DECLARE_REG_SIZE bp, bpl, null
    208 
     209 ; t# defines for when per-arch register allocation is more complex than just function arguments
     210 
         ; DECLARE_REG_TMP a, b, ...: map t0, t1, ... onto the listed r#
         ; argument registers.
     211 %macro DECLARE_REG_TMP 1-*
     212    %assign %%i 0
     213    %rep %0
     214        CAT_XDEFINE t, %%i, r%1
     215        %assign %%i %%i+1
     216        %rotate 1
     217    %endrep
     218 %endmacro
     219 
         ; Define size aliases (tNq/tNd/tNw/tNh/tNb) for the t# temporaries.
     220 %macro DECLARE_REG_TMP_SIZE 0-*
     221    %rep %0
     222        %define t%1q t%1 %+ q
     223        %define t%1d t%1 %+ d
     224        %define t%1w t%1 %+ w
     225        %define t%1h t%1 %+ h
     226        %define t%1b t%1 %+ b
     227        %rotate 1
     228    %endrep
     229 %endmacro
     230 
     231 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
     232 
         ; Native general-purpose register size in bytes.
     233 %if ARCH_X86_64
     234    %define gprsize 8
     235 %else
     236    %define gprsize 4
     237 %endif
    238 
         ; LEA reg, addr: load an address position-independently. x86-64 uses a
         ; RIP-relative lea; 32-bit PIC materializes EIP with a call/pop pair
         ; (no RIP-relative addressing exists there); non-PIC 32-bit uses a
         ; plain absolute mov.
     239 %macro LEA 2
     240 %if ARCH_X86_64
     241    lea %1, [%2]
     242 %elif PIC
     243    call $+5 ; special-cased to not affect the RSB on most CPU:s
     244    pop %1
     245    add %1, (%2)-$+1
     246 %else
     247    mov %1, %2
     248 %endif
     249 %endmacro
     250 
     251 ; Repeats an instruction/operation for multiple arguments.
     252 ; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3"
     253 %macro REPX 2-* ; operation, args
     254    %xdefine %%f(x) %1
     255    %rep %0 - 1
     256        %rotate 1
     257        %%f(%1)
     258    %endrep
     259 %endmacro
    260 
         ; push/pop wrappers that keep stack_offset (the tracked distance from
         ; rsp to the return address) accurate while rsp itself is the stack
         ; pointer (i.e. while rstk == rsp).
     261 %macro PUSH 1
     262    push %1
     263    %ifidn rstk, rsp
     264        %assign stack_offset stack_offset+gprsize
     265    %endif
     266 %endmacro
     267 
     268 %macro POP 1
     269    pop %1
     270    %ifidn rstk, rsp
     271        %assign stack_offset stack_offset-gprsize
     272    %endif
     273 %endmacro
     274 
         ; Push (with stack_offset tracking) each listed register index that the
         ; function actually uses (index < regs_used).
     275 %macro PUSH_IF_USED 1-*
     276    %rep %0
     277        %if %1 < regs_used
     278            PUSH r%1
     279        %endif
     280        %rotate 1
     281    %endrep
     282 %endmacro
     283 
         ; Pop each listed register index that was pushed by PUSH_IF_USED.
     284 %macro POP_IF_USED 1-*
     285    %rep %0
     286        %if %1 < regs_used
     287            pop r%1
     288        %endif
     289        %rotate 1
     290    %endrep
     291 %endmacro
     292 
         ; Load each listed argument register from its stack home slot if the
         ; function declares that many arguments (index < num_args).
     293 %macro LOAD_IF_USED 1-*
     294    %rep %0
     295        %if %1 < num_args
     296            mov r%1, r %+ %1 %+ mp
     297        %endif
     298        %rotate 1
     299    %endrep
     300 %endmacro
    301 
         ; sub/add wrappers that keep stack_offset in sync when the operand is
         ; rstk (the tracked stack pointer).
     302 %macro SUB 2
     303    sub %1, %2
     304    %ifidn %1, rstk
     305        %assign stack_offset stack_offset+(%2)
     306    %endif
     307 %endmacro
     308 
     309 %macro ADD 2
     310    add %1, %2
     311    %ifidn %1, rstk
     312        %assign stack_offset stack_offset-(%2)
     313    %endif
     314 %endmacro
     315 
         ; mov unless source and destination are identical tokens (avoids
         ; emitting no-op movs from macro-expanded code).
     316 %macro movifnidn 2
     317    %ifnidn %1, %2
     318        mov %1, %2
     319    %endif
     320 %endmacro
     321 
         ; movsxd only exists on x86-64; degrade it to a conditional mov on
         ; 32-bit, where registers are already 32 bits wide.
     322 %if ARCH_X86_64 == 0
     323    %define movsxd movifnidn
     324 %endif
     325 
     326 %macro movsxdifnidn 2
     327    %ifnidn %1, %2
     328        movsxd %1, %2
     329    %endif
     330 %endmacro
     331 
         ; Compile-time assertion: aborts assembly with %error if %1 is zero.
     332 %macro ASSERT 1
     333    %if (%1) == 0
     334        %error assertion ``%1'' failed
     335    %endif
     336 %endmacro
    337 
         ; DEFINE_ARGS name0, name1, ...
         ; Gives symbolic names to the argument registers: for each name,
         ; nameq/named/namew/nameh/nameb/namem/namemp alias the corresponding
         ; rN forms. Any names from a previous invocation are undefined first.
     338 %macro DEFINE_ARGS 0-*
     339    %ifdef n_arg_names
     340        %assign %%i 0
     341        %rep n_arg_names
     342            CAT_UNDEF arg_name %+ %%i, q
     343            CAT_UNDEF arg_name %+ %%i, d
     344            CAT_UNDEF arg_name %+ %%i, w
     345            CAT_UNDEF arg_name %+ %%i, h
     346            CAT_UNDEF arg_name %+ %%i, b
     347            CAT_UNDEF arg_name %+ %%i, m
     348            CAT_UNDEF arg_name %+ %%i, mp
     349            CAT_UNDEF arg_name, %%i
     350            %assign %%i %%i+1
     351        %endrep
     352    %endif
     353 
     354    %xdefine %%stack_offset stack_offset
     355    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
     356    %assign %%i 0
     357    %rep %0
     358        %xdefine %1q r %+ %%i %+ q
     359        %xdefine %1d r %+ %%i %+ d
     360        %xdefine %1w r %+ %%i %+ w
     361        %xdefine %1h r %+ %%i %+ h
     362        %xdefine %1b r %+ %%i %+ b
     363        %xdefine %1m r %+ %%i %+ m
     364        %xdefine %1mp r %+ %%i %+ mp
     365        CAT_XDEFINE arg_name, %%i, %1
     366        %assign %%i %%i+1
     367        %rotate 1
     368    %endrep
     369    %xdefine stack_offset %%stack_offset
     370    %assign n_arg_names %0
     371 %endmacro
    372 
         ; Stack alignment required by the current vector size (minimum 16).
     373 %define required_stack_alignment ((mmsize + 15) & ~15)
         ; Whether the epilogue must execute vzeroupper after 256-bit registers
         ; have been used (consumed by the RET macros).
     374 %define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
         ; Number of extra vector registers (xmm16-31) available with AVX-512.
     375 %define high_mm_regs (16*cpuflag(avx512))
     376 
     377 ; Large stack allocations on Windows need to use stack probing in order
     378 ; to guarantee that all stack memory is committed before accessing it.
     379 ; This is done by ensuring that the guard page(s) at the end of the
     380 ; currently committed pages are touched prior to any pages beyond that.
     381 %if WIN64
     382    %assign STACK_PROBE_SIZE 8192
     383 %elifidn __OUTPUT_FORMAT__, win32
     384    %assign STACK_PROBE_SIZE 4096
     385 %else
     386    %assign STACK_PROBE_SIZE 0
     387 %endif
     388 
         ; Touch one dword per probe interval below rsp so every page of the
         ; upcoming allocation gets committed (no-op on non-Windows targets).
     389 %macro PROBE_STACK 1 ; stack_size
     390    %if STACK_PROBE_SIZE
     391        %assign %%i STACK_PROBE_SIZE
     392        %rep %1 / STACK_PROBE_SIZE
     393            mov eax, [rsp-%%i]
     394            %assign %%i %%i+STACK_PROBE_SIZE
     395        %endrep
     396    %endif
     397 %endmacro
    398 
         ; Reset per-function stack bookkeeping: if rsp was the stack pointer,
         ; remove the allocated padding from stack_offset; otherwise just point
         ; rstk back at rsp. Clears stack_size, stack_size_padded and
         ; xmm_regs_used.
     399 %macro RESET_STACK_STATE 0
     400    %ifidn rstk, rsp
     401        %assign stack_offset stack_offset - stack_size_padded
     402    %else
     403        %xdefine rstk rsp
     404    %endif
     405    %assign stack_size 0
     406    %assign stack_size_padded 0
     407    %assign xmm_regs_used 0
     408 %endmacro
    409 
         ; ALLOC_STACK [stack_size[, n_xmm_regs]]
         ; Allocate stack space and declare xmm register usage. Adds WIN64
         ; shadow space and callee-saved xmm spill space as needed. A negative
         ; stack_size requests that the original rsp be stored on the stack
         ; (rstkm) instead of in a dedicated register when manual alignment is
         ; required.
     410 %macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs
     411    RESET_STACK_STATE
     412    %ifnum %2
     413        %if mmsize != 8
     414            %assign xmm_regs_used %2
     415        %endif
     416    %endif
     417    %ifnum %1
     418        %if %1 != 0
     419            %assign %%pad 0
     420            %assign stack_size %1
     421            %if stack_size < 0
     422                %assign stack_size -stack_size
     423            %endif
     424            %if WIN64
     425                %assign %%pad %%pad + 32 ; shadow space
     426                %if xmm_regs_used > 8
     427                    %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
     428                %endif
     429            %endif
     430            %if required_stack_alignment <= STACK_ALIGNMENT
     431                ; maintain the current stack alignment
     432                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
     433                PROBE_STACK stack_size_padded
     434                SUB rsp, stack_size_padded
     435            %else
     436                %assign %%reg_num (regs_used - 1)
     437                %xdefine rstk r %+ %%reg_num
     438                ; align stack, and save original stack location directly above
     439                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
     440                ; stack in a single instruction (i.e. mov rsp, rstk or mov
     441                ; rsp, [rsp+stack_size_padded])
     442                %if %1 < 0 ; need to store rsp on stack
     443                    %xdefine rstkm [rsp + stack_size + %%pad]
     444                    %assign %%pad %%pad + gprsize
     445                %else ; can keep rsp in rstk during whole function
     446                    %xdefine rstkm rstk
     447                %endif
     448                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
     449                PROBE_STACK stack_size_padded
     450                mov rstk, rsp
     451                and rsp, ~(required_stack_alignment-1)
     452                sub rsp, stack_size_padded
     453                movifnidn rstkm, rstk
     454            %endif
     455            WIN64_PUSH_XMM
     456        %endif
     457    %endif
     458 %endmacro
    459 
         ; Runs before registers are pushed: when manual stack alignment will
         ; be needed, bump regs_used so a register is available to hold the
         ; original stack pointer (and so no argument registers get clobbered).
     460 %macro SETUP_STACK_POINTER 0-1 0
     461    %ifnum %1
     462        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
     463            %if %1 > 0
     464                ; Reserve an additional register for storing the original stack pointer, but avoid using
     465                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
     466                %assign regs_used (regs_used + 1)
     467                %if ARCH_X86_64 && regs_used == 7
     468                    %assign regs_used 8
     469                %elif ARCH_X86_64 == 0 && regs_used == 1
     470                    %assign regs_used 2
     471                %endif
     472            %endif
     473            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
     474                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
     475                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
     476                %assign regs_used 5 + UNIX64 * 3
     477            %endif
     478        %endif
     479    %endif
     480 %endmacro
    481 
     482 %if WIN64 ; Windows x64 ;=================================================
     483 
         ; Microsoft x64 convention: integer args in rcx, rdx, r8, r9. The third
         ; DECLARE_REG operand is the rNm stack-home offset relative to the
         ; return address (8 bytes return address + 32 bytes shadow space for
         ; arguments 4 and up).
     484 DECLARE_REG 0,  rcx
     485 DECLARE_REG 1,  rdx
     486 DECLARE_REG 2,  R8
     487 DECLARE_REG 3,  R9
     488 DECLARE_REG 4,  R10, 40
     489 DECLARE_REG 5,  R11, 48
     490 DECLARE_REG 6,  rax, 56
     491 DECLARE_REG 7,  rdi, 64
     492 DECLARE_REG 8,  rsi, 72
     493 DECLARE_REG 9,  rbx, 80
     494 DECLARE_REG 10, rbp, 88
     495 DECLARE_REG 11, R14, 96
     496 DECLARE_REG 12, R15, 104
     497 DECLARE_REG 13, R12, 112
     498 DECLARE_REG 14, R13, 120
     499 
         ; Standard function prologue for WIN64; pushes callee-saved GPRs
         ; (r7-r14 map to rdi/rsi/rbx/rbp/r12-r15 here), allocates stack and
         ; spills xmm regs as requested. See the PROLOGUE documentation near
         ; the top of this file for the argument meanings.
     500 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     501    %assign num_args %1
     502    %assign regs_used %2
     503    ASSERT regs_used >= num_args
     504    SETUP_STACK_POINTER %4
     505    ASSERT regs_used <= 15
     506    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
     507    ALLOC_STACK %4, %3
     508    %if mmsize != 8 && stack_size == 0
     509        WIN64_SPILL_XMM %3
     510    %endif
     511    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
     512    %if %0 > 4
     513        %ifnum %4
     514            DEFINE_ARGS %5
     515        %else
     516            DEFINE_ARGS %4, %5
     517        %endif
     518    %elifnnum %4
     519        DEFINE_ARGS %4
     520    %endif
     521 %endmacro
    522 
     523 ; Push XMM registers to the stack. If no argument is specified all used register
     524 ; will be pushed, otherwise only push previously unpushed registers.
         ; %1 = new total xmm_regs_used, %2 = number already pushed. With one
         ; argument, only registers beyond the previous xmm_regs_used count are
         ; stored; with none, everything currently declared is stored.
     525 %macro WIN64_PUSH_XMM 0-2 ; new_xmm_regs_used, xmm_regs_pushed
     526    %if mmsize != 8
     527        %if %0 == 2
     528            %assign %%pushed %2
     529            %assign xmm_regs_used %1
     530        %elif %0 == 1
     531            %assign %%pushed xmm_regs_used
     532            %assign xmm_regs_used %1
     533        %else
     534            %assign %%pushed 0
     535        %endif
     536        ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
     537        %if %%pushed <= 6 + high_mm_regs && xmm_regs_used > 6 + high_mm_regs
     538            movaps [rstk + stack_offset +  8], xmm6
     539        %endif
     540        %if %%pushed <= 7 + high_mm_regs && xmm_regs_used > 7 + high_mm_regs
     541            movaps [rstk + stack_offset + 24], xmm7
     542        %endif
     543        %assign %%pushed %%pushed - high_mm_regs - 8
     544        %if %%pushed < 0
     545            %assign %%pushed 0
     546        %endif
     547        %assign %%regs_to_push xmm_regs_used - %%pushed - high_mm_regs - 8
     548        %if %%regs_to_push > 0
     549            ASSERT (%%regs_to_push + %%pushed) * 16 <= stack_size_padded - stack_size - 32
     550            %assign %%i %%pushed + 8
     551            %rep %%regs_to_push
     552                movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
     553                %assign %%i %%i+1
     554            %endrep
     555        %endif
     556    %endif
     557 %endmacro
    558 
     559 ; Allocated stack space for XMM registers and push all, or a subset, of those
         ; %1 = number of xmm registers used, %2 (optional) = number of xmm
         ; registers to reserve stack space for (must be >= %1).
     560 %macro WIN64_SPILL_XMM 1-2 ; xmm_regs_used, xmm_regs_reserved
     561    RESET_STACK_STATE
     562    %if mmsize != 8
     563        %assign xmm_regs_used %1
     564        ASSERT xmm_regs_used <= 16 + high_mm_regs
     565        %if %0 == 2
     566            ASSERT %2 >= %1
     567            %assign %%xmm_regs_on_stack %2 - high_mm_regs - 8
     568        %else
     569            %assign %%xmm_regs_on_stack %1 - high_mm_regs - 8
     570        %endif
     571        %if %%xmm_regs_on_stack > 0
     572            ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
     573            %assign %%pad %%xmm_regs_on_stack*16 + 32
     574            %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
     575            SUB rsp, stack_size_padded
     576        %endif
     577        WIN64_PUSH_XMM
     578    %endif
     579 %endmacro
    580 
         ; Restore callee-saved xmm registers and free the stack allocation
         ; without resetting the bookkeeping state (used directly by RET).
     581 %macro WIN64_RESTORE_XMM_INTERNAL 0
     582    %assign %%pad_size 0
     583    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
     584    %if %%xmm_regs_on_stack > 0
     585        %assign %%i xmm_regs_used - high_mm_regs
     586        %rep %%xmm_regs_on_stack
     587            %assign %%i %%i-1
     588            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
     589        %endrep
     590    %endif
     591    %if stack_size_padded > 0
     592        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
     593            mov rsp, rstkm
     594        %else
     595            add rsp, stack_size_padded
     596            %assign %%pad_size stack_size_padded
     597        %endif
     598    %endif
     599    %if xmm_regs_used > 7 + high_mm_regs
     600        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
     601    %endif
     602    %if xmm_regs_used > 6 + high_mm_regs
     603        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
     604    %endif
     605 %endmacro
     606 
         ; Same as above, but also resets the per-function stack state.
     607 %macro WIN64_RESTORE_XMM 0
     608    WIN64_RESTORE_XMM_INTERNAL
     609    RESET_STACK_STATE
     610 %endmacro
     611 
         ; True when RET expands to more than a bare ret (consumed by REP_RET
         ; and TAIL_CALL).
     612 %define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
     613 
         ; Function epilogue: restore xmm registers, pop saved GPRs, vzeroupper
         ; if 256-bit registers were in use, then (rep) ret.
     614 %macro RET 0
     615    WIN64_RESTORE_XMM_INTERNAL
     616    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
     617    %if vzeroupper_required
     618        vzeroupper
     619    %endif
     620    AUTO_REP_RET
     621 %endmacro
    622 
     623 %elif ARCH_X86_64 ; *nix x64 ;=============================================
     624 
         ; System V AMD64 convention: integer args in rdi, rsi, rdx, rcx, r8,
         ; r9. Stack-home offsets (third operand) start directly above the
         ; return address for args beyond the six register slots.
     625 DECLARE_REG 0,  rdi
     626 DECLARE_REG 1,  rsi
     627 DECLARE_REG 2,  rdx
     628 DECLARE_REG 3,  rcx
     629 DECLARE_REG 4,  R8
     630 DECLARE_REG 5,  R9
     631 DECLARE_REG 6,  rax, 8
     632 DECLARE_REG 7,  R10, 16
     633 DECLARE_REG 8,  R11, 24
     634 DECLARE_REG 9,  rbx, 32
     635 DECLARE_REG 10, rbp, 40
     636 DECLARE_REG 11, R14, 48
     637 DECLARE_REG 12, R15, 56
     638 DECLARE_REG 13, R12, 64
     639 DECLARE_REG 14, R13, 72
     640 
         ; Standard prologue for SysV x86-64: only r9-r14 (rbx/rbp/r12-r15)
         ; are callee-saved here, and no xmm registers need spilling.
     641 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     642    %assign num_args %1
     643    %assign regs_used %2
     644    ASSERT regs_used >= num_args
     645    SETUP_STACK_POINTER %4
     646    ASSERT regs_used <= 15
     647    PUSH_IF_USED 9, 10, 11, 12, 13, 14
     648    ALLOC_STACK %4, %3
     649    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
     650    %if %0 > 4
     651        %ifnum %4
     652            DEFINE_ARGS %5
     653        %else
     654            DEFINE_ARGS %4, %5
     655        %endif
     656    %elifnnum %4
     657        DEFINE_ARGS %4
     658    %endif
     659 %endmacro
     660 
     661 %define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
     662 
         ; Epilogue: undo stack allocation, pop callee-saved GPRs, vzeroupper
         ; if needed, then (rep) ret.
     663 %macro RET 0
     664    %if stack_size_padded > 0
     665        %if required_stack_alignment > STACK_ALIGNMENT
     666            mov rsp, rstkm
     667        %else
     668            add rsp, stack_size_padded
     669        %endif
     670    %endif
     671    POP_IF_USED 14, 13, 12, 11, 10, 9
     672    %if vzeroupper_required
     673        vzeroupper
     674    %endif
     675    AUTO_REP_RET
     676 %endmacro
    677 
     678 %else ; X86_32 ;==============================================================
     679 
         ; x86-32: all arguments are passed on the stack; the third DECLARE_REG
         ; operand is the argument's offset relative to the return address.
         ; Registers 3-6 (ebx/esi/edi/ebp) are the callee-saved set pushed by
         ; PROLOGUE below.
     680 DECLARE_REG 0, eax, 4
     681 DECLARE_REG 1, ecx, 8
     682 DECLARE_REG 2, edx, 12
     683 DECLARE_REG 3, ebx, 16
     684 DECLARE_REG 4, esi, 20
     685 DECLARE_REG 5, edi, 24
     686 DECLARE_REG 6, ebp, 28
     687 %define rsp esp
     688 
         ; Define stack-home accessors for arguments beyond the 7 that can be
         ; held in registers on 32-bit.
     689 %macro DECLARE_ARG 1-*
     690    %rep %0
     691        %define r%1m [rstk + stack_offset + 4*%1 + 4]
     692        %define r%1mp dword r%1m
     693        %rotate 1
     694    %endrep
     695 %endmacro
     696 
     697 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     698 
         ; Standard prologue for x86-32: only 7 GPRs are available, so argument
         ; and register counts are clamped to 7.
     699 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     700    %assign num_args %1
     701    %assign regs_used %2
     702    ASSERT regs_used >= num_args
     703    %if num_args > 7
     704        %assign num_args 7
     705    %endif
     706    %if regs_used > 7
     707        %assign regs_used 7
     708    %endif
     709    SETUP_STACK_POINTER %4
     710    ASSERT regs_used <= 7
     711    PUSH_IF_USED 3, 4, 5, 6
     712    ALLOC_STACK %4, %3
     713    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
     714    %if %0 > 4
     715        %ifnum %4
     716            DEFINE_ARGS %5
     717        %else
     718            DEFINE_ARGS %4, %5
     719        %endif
     720    %elifnnum %4
     721        DEFINE_ARGS %4
     722    %endif
     723 %endmacro
     724 
     725 %define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
     726 
         ; Epilogue: undo stack allocation, pop callee-saved GPRs, vzeroupper
         ; if needed, then (rep) ret.
     727 %macro RET 0
     728    %if stack_size_padded > 0
     729        %if required_stack_alignment > STACK_ALIGNMENT
     730            mov rsp, rstkm
     731        %else
     732            add rsp, stack_size_padded
     733        %endif
     734    %endif
     735    POP_IF_USED 6, 5, 4, 3
     736    %if vzeroupper_required
     737        vzeroupper
     738    %endif
     739    AUTO_REP_RET
     740 %endmacro
    741 
    742 %endif ;======================================================================
    743 
         ; Stub versions of the WIN64 xmm spill/restore macros for other ABIs,
         ; where no xmm registers are callee-saved: they only track
         ; xmm_regs_used (needed for vzeroupper_required) so callers can
         ; invoke them unconditionally.
     744 %if WIN64 == 0
     745    %macro WIN64_SPILL_XMM 1-2
     746        RESET_STACK_STATE
     747        %if mmsize != 8
     748            %assign xmm_regs_used %1
     749        %endif
     750    %endmacro
     751    %macro WIN64_RESTORE_XMM 0
     752        RESET_STACK_STATE
     753    %endmacro
     754    %macro WIN64_PUSH_XMM 0-2
     755        %if mmsize != 8 && %0 >= 1
     756            %assign xmm_regs_used %1
     757        %endif
     758    %endmacro
     759 %endif
    760 
     761 ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
     762 ; a branch or a branch target. So switch to a 2-byte form of ret in that case.
     763 ; We can automatically detect "follows a branch", but not a branch target.
     764 ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
         ; Manual variant: use when the return site is known to be a branch
         ; target.
     765 %macro REP_RET 0
     766    %if has_epilogue || cpuflag(ssse3)
     767        RET
     768    %else
     769        rep ret
     770    %endif
     771    annotate_function_size
     772 %endmacro
     773 
         ; last_branch_adr holds the address of the most recent branch
         ; instruction (kept up to date by the BRANCH_INSTR wrappers).
     774 %define last_branch_adr $$
         ; Automatic variant: emits the rep prefix only when the instruction
         ; immediately before the ret was a branch.
     775 %macro AUTO_REP_RET 0
     776    %if notcpuflag(ssse3)
     777        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
     778    %endif
     779    ret
     780    annotate_function_size
     781 %endmacro
    782 
         ; Wrap every conditional-branch mnemonic in a macro that records the
         ; branch's address in last_branch_adr (consumed by AUTO_REP_RET).
     783 %macro BRANCH_INSTR 0-*
     784    %rep %0
     785        %macro %1 1-2 %1
     786            %2 %1
     787            %if notcpuflag(ssse3)
     788                %%branch_instr equ $
     789                %xdefine last_branch_adr %%branch_instr
     790            %endif
     791        %endmacro
     792        %rotate 1
     793    %endrep
     794 %endmacro
     795 
     796 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
     797 
         ; TAIL_CALL callee[, is_nonadjacent]: tail-jump to the callee when no
         ; epilogue is needed (falling through instead when the callee directly
         ; follows); otherwise call it and run the full epilogue.
     798 %macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
     799    %if has_epilogue
     800        call %1
     801        RET
     802    %elif %2
     803        jmp %1
     804    %endif
     805    annotate_function_size
     806 %endmacro
    807 
    808 ;=============================================================================
    809 ; arch-independent part
    810 ;=============================================================================
    811 
         ; Alignment applied to every function entry point.
     812 %assign function_align 16
     813 
     814 ; Begin a function.
     815 ; Applies any symbol mangling needed for C linkage, and sets up a define such that
     816 ; subsequent uses of the function name automatically refer to the mangled version.
     817 ; Appends cpuflags to the function name if cpuflags has been specified.
     818 ; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
     819 ; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
         ; cglobal -> hidden/internal linkage; cvisible -> public linkage.
     820 %macro cglobal 1-2+ "" ; name, [PROLOGUE args]
     821    cglobal_internal 1, %1 %+ SUFFIX, %2
     822 %endmacro
     823 %macro cvisible 1-2+ "" ; name, [PROLOGUE args]
     824    cglobal_internal 0, %1 %+ SUFFIX, %2
     825 %endmacro
         ; %1 = 1 for internal (hidden/private_extern) linkage, 0 for public.
         ; Mangles the name once, emits the appropriate global directive,
         ; aligns, defines the label, resets per-function state, then invokes
         ; PROLOGUE if arguments were supplied.
     826 %macro cglobal_internal 2-3+
     827    annotate_function_size
     828    %ifndef cglobaled_%2
     829        %if %1
     830            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
     831        %else
     832            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
     833        %endif
     834        %xdefine %2.skip_prologue %2 %+ .skip_prologue
     835        CAT_XDEFINE cglobaled_, %2, 1
     836    %endif
     837    %xdefine current_function %2
     838    %xdefine current_function_section __SECT__
     839    %if FORMAT_ELF
     840        %if %1
     841            global %2:function hidden
     842        %else
     843            global %2:function
     844        %endif
     845    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
     846        global %2:private_extern
     847    %else
     848        global %2
     849    %endif
     850    align function_align
     851    %2:
     852    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
     853    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
     854    %assign stack_offset 0      ; stack pointer offset relative to the return address
     855    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
     856    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
     857    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
     858    %ifnidn %3, ""
     859        PROLOGUE %3
     860    %endif
     861 %endmacro
    862 
    863 ; Create a global symbol from a local label with the correct name mangling and type
        ; %1 is a local label (including the leading '.'); the exported global
        ; name is current_function concatenated with %1.
    864 %macro cglobal_label 1
    865    %if FORMAT_ELF
    866        global current_function %+ %1:function hidden
    867    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
    868        global current_function %+ %1:private_extern
    869    %else
    870        global current_function %+ %1
    871    %endif
    872    %1:
    873 %endmacro
    874 
        ; Declare an external private_prefix-mangled symbol; also records it in
        ; cglobaled_ so the `call` macro can resolve cpu-suffixed names to it.
    875 %macro cextern 1
    876    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    877    CAT_XDEFINE cglobaled_, %1, 1
    878    extern %1
    879 %endmacro
    880 
    881 ; like cextern, but without the prefix
        ; No private_prefix applied; mangle() is only used when PREFIX is defined
        ; (presumably targets that prepend an underscore to C symbols — see mangle).
    882 %macro cextern_naked 1
    883    %ifdef PREFIX
    884        %xdefine %1 mangle(%1)
    885    %endif
    886    CAT_XDEFINE cglobaled_, %1, 1
    887    extern %1
    888 %endmacro
    889 
        ; Define a global data symbol with hidden visibility where the object
        ; format supports it. %1 = name, %2+ = data directive(s) after the label.
    890 %macro const 1-2+
    891    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    892    %if FORMAT_ELF
    893        global %1:data hidden
    894    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
    895        global %1:private_extern
    896    %else
    897        global %1
    898    %endif
    899    %1: %2
    900 %endmacro
    901 
    902 %if FORMAT_ELF
    903    ; The GNU linker assumes the stack is executable by default.
    904    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
    905 
           ; Emit a GNU property note marking the object as shadow-stack (SHSTK)
           ; compatible; requires NASM >= 2.14.03 for the `note` section type.
    906    %ifdef __NASM_VERSION_ID__
    907        %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03
    908            %if ARCH_X86_64
    909                ; Control-flow Enforcement Technology (CET) properties.
    910                [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize]
    911                dd 0x00000004  ; n_namesz
    912                dd gprsize + 8 ; n_descsz
    913                dd 0x00000005  ; n_type = NT_GNU_PROPERTY_TYPE_0
    914                db "GNU",0     ; n_name
    915                dd 0xc0000002  ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
    916                dd 0x00000004  ; pr_datasz
    917                dd 0x00000002  ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK
    918                dd 0x00000000  ; pr_padding
    919            %endif
    920        %endif
    921    %endif
    922 %endif
    923 
    924 ; Tell debuggers how large the function was.
    925 ; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
    926 ; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
    927 ; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
    928 ; then its size might be unspecified.
        ; Only does anything when assembling with YASM for ELF (guards below);
        ; temporarily switches back to the function's section to emit the size.
    929 %macro annotate_function_size 0
    930    %ifdef __YASM_VER__
    931        %ifdef current_function
    932            %if FORMAT_ELF
    933                current_function_section
    934                %%ecf equ $
    935                size current_function %%ecf - current_function
    936                __SECT__
    937            %endif
    938        %endif
    939    %endif
    940 %endmacro
    941 
    942 ; cpuflags
    943 
        ; Each flag also implies the flags of the ISAs it extends (OR-ed in),
        ; so cpuflag(sse2) is true for any sse2-or-later cpu selection.
    944 %assign cpuflags_mmx       (1<<0)
    945 %assign cpuflags_mmx2      (1<<1)  | cpuflags_mmx
    946 %assign cpuflags_3dnow     (1<<2)  | cpuflags_mmx
    947 %assign cpuflags_3dnowext  (1<<3)  | cpuflags_3dnow
    948 %assign cpuflags_sse       (1<<4)  | cpuflags_mmx2
    949 %assign cpuflags_sse2      (1<<5)  | cpuflags_sse
    950 %assign cpuflags_sse2slow  (1<<6)  | cpuflags_sse2
    951 %assign cpuflags_lzcnt     (1<<7)  | cpuflags_sse2
    952 %assign cpuflags_sse3      (1<<8)  | cpuflags_sse2
    953 %assign cpuflags_ssse3     (1<<9)  | cpuflags_sse3
    954 %assign cpuflags_sse4      (1<<10) | cpuflags_ssse3
    955 %assign cpuflags_sse42     (1<<11) | cpuflags_sse4
    956 %assign cpuflags_aesni     (1<<12) | cpuflags_sse42
    957 %assign cpuflags_clmul     (1<<13) | cpuflags_sse42
    958 %assign cpuflags_gfni      (1<<14) | cpuflags_aesni|cpuflags_clmul
    959 %assign cpuflags_avx       (1<<15) | cpuflags_sse42
    960 %assign cpuflags_xop       (1<<16) | cpuflags_avx
    961 %assign cpuflags_fma4      (1<<17) | cpuflags_avx
    962 %assign cpuflags_fma3      (1<<18) | cpuflags_avx
    963 %assign cpuflags_bmi1      (1<<19) | cpuflags_avx|cpuflags_lzcnt
    964 %assign cpuflags_bmi2      (1<<20) | cpuflags_bmi1
    965 %assign cpuflags_avx2      (1<<21) | cpuflags_fma3|cpuflags_bmi2
    966 %assign cpuflags_avx512    (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL
    967 %assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
    968 
    969 %assign cpuflags_cache32   (1<<24)
    970 %assign cpuflags_cache64   (1<<25)
    971 %assign cpuflags_aligned   (1<<26) ; not a cpu feature, but a function variant
    972 %assign cpuflags_atom      (1<<27)
    973 
    974 ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
        ; (Branch-free: evaluates to 1 iff every bit of cpuflags_x is set in cpuflags.)
    975 %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
    976 %define notcpuflag(x) (cpuflag(x) ^ 1)
    977 
    978 ; Takes an arbitrary number of cpuflags from the above list.
    979 ; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
    980 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
        ; Accumulates the given cpuflags, builds the cpuname/SUFFIX used for
        ; function name mangling, and selects mov/alignment strategies.
    981 %macro INIT_CPUFLAGS 0-*
    982    %xdefine SUFFIX
    983    %undef cpuname
    984    %assign cpuflags 0
    985 
    986    %if %0 >= 1
    987        %rep %0
    988            %ifdef cpuname
    989                %xdefine cpuname cpuname %+ _%1
    990            %else
    991                %xdefine cpuname %1
    992            %endif
    993            %assign cpuflags cpuflags | cpuflags_%1
    994            %rotate 1
    995        %endrep
    996        %xdefine SUFFIX _ %+ cpuname
    997 
    998        %if cpuflag(avx)
    999            %assign avx_enabled 1
   1000        %endif
   1001        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
   1002            %define mova movaps
   1003            %define movu movups
   1004            %define movnta movntps
   1005        %endif
   1006        %if cpuflag(aligned)
   1007            %define movu mova
   1008        %elif cpuflag(sse3) && notcpuflag(ssse3)
   1009            %define movu lddqu
   1010        %endif
   1011    %endif
   1012 
           ; Choose how alignment padding is encoded (multi-byte vs. basic nops).
   1013    %if ARCH_X86_64 || cpuflag(sse2)
   1014        %ifdef __NASM_VERSION_ID__
   1015            ALIGNMODE p6
   1016        %else
   1017            CPU amdnop
   1018        %endif
   1019    %else
   1020        %ifdef __NASM_VERSION_ID__
   1021            ALIGNMODE nop
   1022        %else
   1023            CPU basicnop
   1024        %endif
   1025    %endif
   1026 %endmacro
   1027 
   1028 ; Merge mmx, sse*, and avx*
   1029 ; m# is a simd register of the currently selected size
   1030 ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
   1031 ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
   1032 ; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
   1033 ; (All 4 remain in sync through SWAP.)
   1034 
   1035 %macro CAT_XDEFINE 3
   1036    %xdefine %1%2 %3
   1037 %endmacro
   1038 
   1039 %macro CAT_UNDEF 2
   1040    %undef %1%2
   1041 %endmacro
   1042 
        ; (Re)define m0..m(N-1) aliases for the given register type (mm/xmm/ymm/zmm)
        ; plus the nn<type><i> reverse mappings; undefines leftover aliases when the
        ; previous register set was wider than the new one.
   1043 %macro DEFINE_MMREGS 1 ; mmtype
   1044    %assign %%prev_mmregs 0
   1045    %ifdef num_mmregs
   1046        %assign %%prev_mmregs num_mmregs
   1047    %endif
   1048 
   1049    %assign num_mmregs 8
   1050    %if ARCH_X86_64 && mmsize >= 16
   1051        %assign num_mmregs 16
   1052        %if cpuflag(avx512) || mmsize == 64
   1053            %assign num_mmregs 32
   1054        %endif
   1055    %endif
   1056 
   1057    %assign %%i 0
   1058    %rep num_mmregs
   1059        CAT_XDEFINE m, %%i, %1 %+ %%i
   1060        CAT_XDEFINE nn%1, %%i, %%i
   1061        %assign %%i %%i+1
   1062    %endrep
   1063    %if %%prev_mmregs > num_mmregs
   1064        %rep %%prev_mmregs - num_mmregs
   1065            CAT_UNDEF m, %%i
   1066            CAT_UNDEF nn %+ mmtype, %%i
   1067            %assign %%i %%i+1
   1068        %endrep
   1069    %endif
   1070    %xdefine mmtype %1
   1071 %endmacro
   1072 
   1073 ; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
        ; SWAPs m%1..m15 with m16..m31; only has an effect on 64-bit AVX-512 builds.
   1074 %macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
   1075    %if ARCH_X86_64 && cpuflag(avx512)
   1076        %assign %%i %1
   1077        %rep 16-%1
   1078            %assign %%i_high %%i+16
   1079            SWAP %%i, %%i_high
   1080            %assign %%i %%i+1
   1081        %endrep
   1082    %endif
   1083 %endmacro
   1084 
        ; Select 64-bit mm registers; optional args are cpuflags for INIT_CPUFLAGS.
   1085 %macro INIT_MMX 0-1+
   1086    %assign avx_enabled 0
   1087    %define RESET_MM_PERMUTATION INIT_MMX %1
   1088    %define mmsize 8
   1089    %define mova movq
   1090    %define movu movq
   1091    %define movh movd
   1092    %define movnta movntq
   1093    INIT_CPUFLAGS %1
   1094    DEFINE_MMREGS mm
   1095 %endmacro
   1096 
        ; Select 128-bit xmm registers; optional args are cpuflags for INIT_CPUFLAGS.
   1097 %macro INIT_XMM 0-1+
   1098    %assign avx_enabled FORCE_VEX_ENCODING
   1099    %define RESET_MM_PERMUTATION INIT_XMM %1
   1100    %define mmsize 16
   1101    %define mova movdqa
   1102    %define movu movdqu
   1103    %define movh movq
   1104    %define movnta movntdq
   1105    INIT_CPUFLAGS %1
   1106    DEFINE_MMREGS xmm
   1107    %if WIN64
   1108        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
   1109    %endif
   1110    %xdefine bcstw 1to8
   1111    %xdefine bcstd 1to4
   1112    %xdefine bcstq 1to2
   1113 %endmacro
   1114 
        ; Select 256-bit ymm registers; movh is undefined (no half-width move here).
   1115 %macro INIT_YMM 0-1+
   1116    %assign avx_enabled 1
   1117    %define RESET_MM_PERMUTATION INIT_YMM %1
   1118    %define mmsize 32
   1119    %define mova movdqa
   1120    %define movu movdqu
   1121    %undef movh
   1122    %define movnta movntdq
   1123    INIT_CPUFLAGS %1
   1124    DEFINE_MMREGS ymm
   1125    AVX512_MM_PERMUTATION
   1126    %xdefine bcstw 1to16
   1127    %xdefine bcstd 1to8
   1128    %xdefine bcstq 1to4
   1129 %endmacro
   1130 
        ; Select 512-bit zmm registers; movh is undefined (no half-width move here).
   1131 %macro INIT_ZMM 0-1+
   1132    %assign avx_enabled 1
   1133    %define RESET_MM_PERMUTATION INIT_ZMM %1
   1134    %define mmsize 64
   1135    %define mova movdqa
   1136    %define movu movdqu
   1137    %undef movh
   1138    %define movnta movntdq
   1139    INIT_CPUFLAGS %1
   1140    DEFINE_MMREGS zmm
   1141    AVX512_MM_PERMUTATION
   1142    %xdefine bcstw 1to32
   1143    %xdefine bcstd 1to16
   1144    %xdefine bcstq 1to8
   1145 %endmacro
   1146 
   1147 INIT_XMM ; default state: mmsize/mova/m# etc. are defined before any cglobal
   1148 
        ; Size-cast tables for register %1: <dsttype><srctype>%1 resolves to the
        ; smaller of the two register views; xm#/ym#/zm# give the x/y/zmm view of m#.
   1149 %macro DECLARE_MMCAST 1
   1150    %define  mmmm%1   mm%1
   1151    %define  mmxmm%1  mm%1
   1152    %define  mmymm%1  mm%1
   1153    %define  mmzmm%1  mm%1
   1154    %define xmmmm%1   mm%1
   1155    %define xmmxmm%1 xmm%1
   1156    %define xmmymm%1 xmm%1
   1157    %define xmmzmm%1 xmm%1
   1158    %define ymmmm%1   mm%1
   1159    %define ymmxmm%1 xmm%1
   1160    %define ymmymm%1 ymm%1
   1161    %define ymmzmm%1 ymm%1
   1162    %define zmmmm%1   mm%1
   1163    %define zmmxmm%1 xmm%1
   1164    %define zmmymm%1 ymm%1
   1165    %define zmmzmm%1 zmm%1
   1166    %define xm%1 xmm %+ m%1
   1167    %define ym%1 ymm %+ m%1
   1168    %define zm%1 zmm %+ m%1
   1169 %endmacro
   1170 
        ; Instantiate the cast aliases for registers 0..31.
   1171 %assign i 0
   1172 %rep 32
   1173    DECLARE_MMCAST i
   1174    %assign i i+1
   1175 %endrep
   1176 
   1177 ; I often want to use macros that permute their arguments. e.g. there's no
   1178 ; efficient way to implement butterfly or transpose or dct without swapping some
   1179 ; arguments.
   1180 ;
   1181 ; I would like to not have to manually keep track of the permutations:
   1182 ; If I insert a permutation in the middle of a function, it should automatically
   1183 ; change everything that follows. For more complex macros I may also have multiple
   1184 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
   1185 ;
   1186 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
   1187 ; permutes its arguments. It's equivalent to exchanging the contents of the
   1188 ; registers, except that this way you exchange the register names instead, so it
   1189 ; doesn't cost any cycles.
   1190 
        ; Each pair (%1, %2) assigns register name m%1 the old value of m%2;
        ; all old values are captured first so pairs may overlap.
   1191 %macro PERMUTE 2-* ; takes a list of pairs to swap
   1192    %rep %0/2
   1193        %xdefine %%tmp%2 m%2
   1194        %rotate 2
   1195    %endrep
   1196    %rep %0/2
   1197        %xdefine m%1 %%tmp%2
   1198        CAT_XDEFINE nn, m%1, %1
   1199        %rotate 2
   1200    %endrep
   1201 %endmacro
   1202 
        ; Dispatch on the first argument's form: numeric indices vs. m# names.
   1203 %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
   1204    %ifnum %1 ; SWAP 0, 1, ...
   1205        SWAP_INTERNAL_NUM %1, %2
   1206    %else ; SWAP m0, m1, ...
   1207        SWAP_INTERNAL_NAME %1, %2
   1208    %endif
   1209 %endmacro
   1210 
        ; Pairwise-swaps the m# aliases along the chain of numeric indices and
        ; keeps the nn reverse mapping in sync.
   1211 %macro SWAP_INTERNAL_NUM 2-*
   1212    %rep %0-1
   1213        %xdefine %%tmp m%1
   1214        %xdefine m%1 m%2
   1215        %xdefine m%2 %%tmp
   1216        CAT_XDEFINE nn, m%1, %1
   1217        CAT_XDEFINE nn, m%2, %2
   1218        %rotate 1
   1219    %endrep
   1220 %endmacro
   1221 
        ; Translates each m# name to its numeric index via nn<name>, then
        ; delegates to SWAP_INTERNAL_NUM.
   1222 %macro SWAP_INTERNAL_NAME 2-*
   1223    %xdefine %%args nn %+ %1
   1224    %rep %0-1
   1225        %xdefine %%args %%args, nn %+ %2
   1226        %rotate 1
   1227    %endrep
   1228    SWAP_INTERNAL_NUM %%args
   1229 %endmacro
   1230 
   1231 ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
   1232 ; calls to that function will automatically load the permutation, so values can
   1233 ; be returned in mmregs.
        ; Records the current m#->register mapping under <name>_m%i
        ; (%1 = name; defaults to current_function).
   1234 %macro SAVE_MM_PERMUTATION 0-1
   1235    %if %0
   1236        %xdefine %%f %1_m
   1237    %else
   1238        %xdefine %%f current_function %+ _m
   1239    %endif
   1240    %assign %%i 0
   1241    %rep num_mmregs
   1242        %xdefine %%tmp m %+ %%i
   1243        CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
   1244        %assign %%i %%i+1
   1245    %endrep
   1246 %endmacro
   1247 
        ; Restores a mapping saved by SAVE_MM_PERMUTATION; silently does nothing
        ; if no permutation was saved under that name (%%tmp not numeric).
   1248 %macro LOAD_MM_PERMUTATION 0-1 ; name to load from
   1249    %if %0
   1250        %xdefine %%f %1_m
   1251    %else
   1252        %xdefine %%f current_function %+ _m
   1253    %endif
   1254    %xdefine %%tmp %%f %+ 0
   1255    %ifnum %%tmp
   1256        DEFINE_MMREGS mmtype
   1257        %assign %%i 0
   1258        %rep num_mmregs
   1259            %xdefine %%tmp %%f %+ %%i
   1260            CAT_XDEFINE %%m, %%i, m %+ %%tmp
   1261            %assign %%i %%i+1
   1262        %endrep
   1263        %rep num_mmregs
   1264            %assign %%i %%i-1
   1265            CAT_XDEFINE m, %%i, %%m %+ %%i
   1266            CAT_XDEFINE nn, m %+ %%i, %%i
   1267        %endrep
   1268    %endif
   1269 %endmacro
   1270 
   1271 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
        ; Identifiers go through call_internal (suffix resolution + permutation
        ; reload); non-identifier targets (registers, memory) call directly.
   1272 %macro call 1
   1273    %ifid %1
   1274        call_internal %1 %+ SUFFIX, %1
   1275    %else
   1276        call %1
   1277    %endif
   1278 %endmacro
        ; %1 = suffixed name, %2 = plain name. Uses the suffixed name only when it
        ; was declared via cglobal/cextern and the plain name wasn't.
   1279 %macro call_internal 2
   1280    %xdefine %%i %2
   1281    %ifndef cglobaled_%2
   1282        %ifdef cglobaled_%1
   1283            %xdefine %%i %1
   1284        %endif
   1285    %endif
   1286    call %%i
   1287    LOAD_MM_PERMUTATION %%i
   1288 %endmacro
   1289 
   1290 ; Substitutions that reduce instruction size but are functionally equivalent
        ; add x, 128 is encoded as sub x, -128 so the immediate fits in a signed
        ; byte (imm8) instead of imm32.
   1291 %macro add 2
   1292    %ifnum %2
   1293        %if %2==128
   1294            sub %1, -128
   1295        %else
   1296            add %1, %2
   1297        %endif
   1298    %else
   1299        add %1, %2
   1300    %endif
   1301 %endmacro
   1302 
        ; Mirror of the add macro: sub x, 128 becomes add x, -128 (imm8 encoding).
   1303 %macro sub 2
   1304    %ifnum %2
   1305        %if %2==128
   1306            add %1, -128
   1307        %else
   1308            sub %1, %2
   1309        %endif
   1310    %else
   1311        sub %1, %2
   1312    %endif
   1313 %endmacro
   1314 
   1315 ;=============================================================================
   1316 ; AVX abstraction layer
   1317 ;=============================================================================
   1318 
        ; sizeof<reg> / regnumof<reg> lookup tables for mm0-7 and x/y/zmm0-31,
        ; used by RUN_AVX_INSTR to classify operands.
   1319 %assign i 0
   1320 %rep 32
   1321    %if i < 8
   1322        CAT_XDEFINE sizeofmm, i, 8
   1323        CAT_XDEFINE regnumofmm, i, i
   1324    %endif
   1325    CAT_XDEFINE sizeofxmm, i, 16
   1326    CAT_XDEFINE sizeofymm, i, 32
   1327    CAT_XDEFINE sizeofzmm, i, 64
   1328    CAT_XDEFINE regnumofxmm, i, i
   1329    CAT_XDEFINE regnumofymm, i, i
   1330    CAT_XDEFINE regnumofzmm, i, i
   1331    %assign i i+1
   1332 %endrep
   1333 %undef i
   1334 
        ; Errors out if the emulated destination (%2) aliases any of the
        ; remaining source operands (%3+), which 2-operand emulation can't handle.
   1335 %macro CHECK_AVX_INSTR_EMU 3-*
   1336    %xdefine %%opcode %1
   1337    %xdefine %%dst %2
   1338    %rep %0-2
   1339        %ifidn %%dst, %3
   1340            %error non-avx emulation of ``%%opcode'' is not supported
   1341        %endif
   1342        %rotate 1
   1343    %endrep
   1344 %endmacro
   1345 
   1346 ;%1 == instruction
   1347 ;%2 == minimal instruction set
   1348 ;%3 == 1 if float, 0 if int
   1349 ;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
   1350 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
   1351 ;%6+: operands
        ; Core dispatcher: picks the v-prefixed (VEX/EVEX) form when AVX is
        ; enabled, otherwise emits the legacy encoding, emulating 3/4-operand
        ; forms with an extra move when the destination differs from src1.
        ; Also validates that the instruction's ISA is allowed for the current cpu.
   1352 %macro RUN_AVX_INSTR 6-9+
   1353    %ifnum sizeof%7
   1354        %assign __sizeofreg sizeof%7
   1355    %elifnum sizeof%6
   1356        %assign __sizeofreg sizeof%6
   1357    %else
   1358        %assign __sizeofreg mmsize
   1359    %endif
   1360    %assign __emulate_avx 0
   1361    %if avx_enabled && __sizeofreg >= 16
   1362        %xdefine __instr v%1
   1363    %else
   1364        %xdefine __instr %1
   1365        %if %0 >= 8+%4
   1366            %assign __emulate_avx 1
   1367        %endif
   1368    %endif
   1369    %ifnidn %2, fnord
   1370        %ifdef cpuname
   1371            %if notcpuflag(%2)
   1372                %error use of ``%1'' %2 instruction in cpuname function: current_function
   1373            %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
   1374                %error use of ``%1'' sse2 instruction in cpuname function: current_function
   1375            %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
   1376                %error use of ``%1'' avx2 instruction in cpuname function: current_function
   1377            %elif __sizeofreg == 16 && notcpuflag(sse)
   1378                %error use of ``%1'' sse instruction in cpuname function: current_function
   1379            %elif __sizeofreg == 32 && notcpuflag(avx)
   1380                %error use of ``%1'' avx instruction in cpuname function: current_function
   1381            %elif __sizeofreg == 64 && notcpuflag(avx512)
   1382                %error use of ``%1'' avx512 instruction in cpuname function: current_function
   1383            %elifidn %1, pextrw ; special case because the base instruction is mmx2,
   1384                %ifnid %6       ; but sse4 is required for memory operands
   1385                    %if notcpuflag(sse4)
   1386                        %error use of ``%1'' sse4 instruction in cpuname function: current_function
   1387                    %endif
   1388                %endif
   1389            %endif
   1390        %endif
   1391    %endif
   1392 
   1393    %if __emulate_avx
   1394        %xdefine __src1 %7
   1395        %xdefine __src2 %8
   1396        %if %5 && %4 == 0
   1397            %ifnidn %6, %7
   1398                %ifidn %6, %8
   1399                    %xdefine __src1 %8
   1400                    %xdefine __src2 %7
   1401                %elifnnum sizeof%8
   1402                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
   1403                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
   1404                    ; So, if the instruction is commutative with a memory arg, swap them.
   1405                    %xdefine __src1 %8
   1406                    %xdefine __src2 %7
   1407                %endif
   1408            %endif
   1409        %endif
   1410        %ifnidn %6, __src1
   1411            %if %0 >= 9
   1412                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
   1413            %else
   1414                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
   1415            %endif
   1416            %if __sizeofreg == 8
   1417                MOVQ %6, __src1
   1418            %elif %3
   1419                MOVAPS %6, __src1
   1420            %else
   1421                MOVDQA %6, __src1
   1422            %endif
   1423        %endif
   1424        %if %0 >= 9
   1425            %1 %6, __src2, %9
   1426        %else
   1427            %1 %6, __src2
   1428        %endif
   1429    %elif %0 >= 9
   1430        %if avx_enabled && __sizeofreg >= 16 && %4 == 1
   1431            %ifnnum regnumof%7
   1432                %if %3
   1433                    vmovaps %6, %7
   1434                %else
   1435                    vmovdqa %6, %7
   1436                %endif
   1437                __instr %6, %6, %8, %9
   1438            %else
   1439                __instr %6, %7, %8, %9
   1440            %endif
   1441        %else
   1442            __instr %6, %7, %8, %9
   1443        %endif
   1444    %elif %0 == 8
   1445        %if avx_enabled && __sizeofreg >= 16 && %4 == 0
   1446            %xdefine __src1 %7
   1447            %xdefine __src2 %8
   1448            %if %5
   1449                %ifnum regnumof%7
   1450                    %ifnum regnumof%8
   1451                        %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
   1452                            ; Most VEX-encoded instructions require an additional byte to encode when
   1453                            ; src2 is a high register (e.g. m8..15). If the instruction is commutative
   1454                            ; we can swap src1 and src2 when doing so reduces the instruction length.
   1455                            %xdefine __src1 %8
   1456                            %xdefine __src2 %7
   1457                        %endif
   1458                    %endif
   1459                %elifnum regnumof%8 ; put memory operands in src2 when possible
   1460                    %xdefine __src1 %8
   1461                    %xdefine __src2 %7
   1462                %else
   1463                    %assign __emulate_avx 1
   1464                %endif
   1465            %elifnnum regnumof%7
   1466                ; EVEX allows imm8 shift instructions to be used with memory operands,
   1467                ; but VEX does not. This handles those special cases.
   1468                %ifnnum %8
   1469                    %assign __emulate_avx 1
   1470                %elif notcpuflag(avx512)
   1471                    %assign __emulate_avx 1
   1472                %endif
   1473            %endif
   1474            %if __emulate_avx ; a separate load is required
   1475                %if %3
   1476                    vmovaps %6, %7
   1477                %else
   1478                    vmovdqa %6, %7
   1479                %endif
   1480                __instr %6, %6, %8
   1481            %else
   1482                __instr %6, __src1, __src2
   1483            %endif
   1484        %else
   1485            __instr %6, %7, %8
   1486        %endif
   1487    %elif %0 == 7
   1488        %if avx_enabled && __sizeofreg >= 16 && %5
   1489            %xdefine __src1 %6
   1490            %xdefine __src2 %7
   1491            %ifnum regnumof%6
   1492                %ifnum regnumof%7
   1493                    %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
   1494                        %xdefine __src1 %7
   1495                        %xdefine __src2 %6
   1496                    %endif
   1497                %endif
   1498            %endif
   1499            __instr %6, __src1, __src2
   1500        %else
   1501            __instr %6, %7
   1502        %endif
   1503    %else
   1504        __instr %6
   1505    %endif
   1506 %endmacro
   1507 
   1508 ;%1 == instruction
   1509 ;%2 == minimal instruction set
   1510 ;%3 == 1 if float, 0 if int
   1511 ;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
   1512 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
        ; Defines %1 as a macro that forwards to RUN_AVX_INSTR with the instruction
        ; metadata (%1..%5) bound after the caller's operands; the fnord sentinels
        ; detect how many operands were actually passed.
   1513 %macro AVX_INSTR 1-5 fnord, 0, 255, 0
   1514    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
   1515        %ifidn %2, fnord
   1516            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
   1517        %elifidn %3, fnord
   1518            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
   1519        %elifidn %4, fnord
   1520            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
   1521        %elifidn %5, fnord
   1522            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
   1523        %else
   1524            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
   1525        %endif
   1526    %endmacro
   1527 %endmacro
   1528 
   1529 ; Instructions with both VEX/EVEX and legacy encodings
   1530 ; Non-destructive instructions are written without parameters
        ; Per line: instruction, minimal ISA, float(1)/int(0), 4-operand-imm
        ; emulation flag, commutative flag (see the AVX_INSTR parameter legend above).
   1531 AVX_INSTR addpd, sse2, 1, 0, 1
   1532 AVX_INSTR addps, sse, 1, 0, 1
   1533 AVX_INSTR addsd, sse2, 1, 0, 0
   1534 AVX_INSTR addss, sse, 1, 0, 0
   1535 AVX_INSTR addsubpd, sse3, 1, 0, 0
   1536 AVX_INSTR addsubps, sse3, 1, 0, 0
   1537 AVX_INSTR aesdec, aesni, 0, 0, 0
   1538 AVX_INSTR aesdeclast, aesni, 0, 0, 0
   1539 AVX_INSTR aesenc, aesni, 0, 0, 0
   1540 AVX_INSTR aesenclast, aesni, 0, 0, 0
   1541 AVX_INSTR aesimc, aesni
   1542 AVX_INSTR aeskeygenassist, aesni
   1543 AVX_INSTR andnpd, sse2, 1, 0, 0
   1544 AVX_INSTR andnps, sse, 1, 0, 0
   1545 AVX_INSTR andpd, sse2, 1, 0, 1
   1546 AVX_INSTR andps, sse, 1, 0, 1
   1547 AVX_INSTR blendpd, sse4, 1, 1, 0
   1548 AVX_INSTR blendps, sse4, 1, 1, 0
   1549 AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
   1550 AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
   1551 AVX_INSTR cmpeqpd, sse2, 1, 0, 1
   1552 AVX_INSTR cmpeqps, sse, 1, 0, 1
   1553 AVX_INSTR cmpeqsd, sse2, 1, 0, 0
   1554 AVX_INSTR cmpeqss, sse, 1, 0, 0
   1555 AVX_INSTR cmplepd, sse2, 1, 0, 0
   1556 AVX_INSTR cmpleps, sse, 1, 0, 0
   1557 AVX_INSTR cmplesd, sse2, 1, 0, 0
   1558 AVX_INSTR cmpless, sse, 1, 0, 0
   1559 AVX_INSTR cmpltpd, sse2, 1, 0, 0
   1560 AVX_INSTR cmpltps, sse, 1, 0, 0
   1561 AVX_INSTR cmpltsd, sse2, 1, 0, 0
   1562 AVX_INSTR cmpltss, sse, 1, 0, 0
   1563 AVX_INSTR cmpneqpd, sse2, 1, 0, 1
   1564 AVX_INSTR cmpneqps, sse, 1, 0, 1
   1565 AVX_INSTR cmpneqsd, sse2, 1, 0, 0
   1566 AVX_INSTR cmpneqss, sse, 1, 0, 0
   1567 AVX_INSTR cmpnlepd, sse2, 1, 0, 0
   1568 AVX_INSTR cmpnleps, sse, 1, 0, 0
   1569 AVX_INSTR cmpnlesd, sse2, 1, 0, 0
   1570 AVX_INSTR cmpnless, sse, 1, 0, 0
   1571 AVX_INSTR cmpnltpd, sse2, 1, 0, 0
   1572 AVX_INSTR cmpnltps, sse, 1, 0, 0
   1573 AVX_INSTR cmpnltsd, sse2, 1, 0, 0
   1574 AVX_INSTR cmpnltss, sse, 1, 0, 0
   1575 AVX_INSTR cmpordpd, sse2 1, 0, 1
   1576 AVX_INSTR cmpordps, sse 1, 0, 1
   1577 AVX_INSTR cmpordsd, sse2 1, 0, 0
   1578 AVX_INSTR cmpordss, sse 1, 0, 0
        ; Per line: instruction, minimal ISA, float(1)/int(0), 4-operand-imm
        ; emulation flag, commutative flag (see the AVX_INSTR parameter legend above).
   1579 AVX_INSTR cmppd, sse2, 1, 1, 0
   1580 AVX_INSTR cmpps, sse, 1, 1, 0
   1581 AVX_INSTR cmpsd, sse2, 1, 1, 0
   1582 AVX_INSTR cmpss, sse, 1, 1, 0
   1583 AVX_INSTR cmpunordpd, sse2, 1, 0, 1
   1584 AVX_INSTR cmpunordps, sse, 1, 0, 1
   1585 AVX_INSTR cmpunordsd, sse2, 1, 0, 0
   1586 AVX_INSTR cmpunordss, sse, 1, 0, 0
   1587 AVX_INSTR comisd, sse2, 1
   1588 AVX_INSTR comiss, sse, 1
   1589 AVX_INSTR cvtdq2pd, sse2, 1
   1590 AVX_INSTR cvtdq2ps, sse2, 1
   1591 AVX_INSTR cvtpd2dq, sse2, 1
   1592 AVX_INSTR cvtpd2ps, sse2, 1
   1593 AVX_INSTR cvtps2dq, sse2, 1
   1594 AVX_INSTR cvtps2pd, sse2, 1
   1595 AVX_INSTR cvtsd2si, sse2, 1
   1596 AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
   1597 AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
   1598 AVX_INSTR cvtsi2ss, sse, 1, 0, 0
   1599 AVX_INSTR cvtss2sd, sse2, 1, 0, 0
   1600 AVX_INSTR cvtss2si, sse, 1
   1601 AVX_INSTR cvttpd2dq, sse2, 1
   1602 AVX_INSTR cvttps2dq, sse2, 1
   1603 AVX_INSTR cvttsd2si, sse2, 1
   1604 AVX_INSTR cvttss2si, sse, 1
   1605 AVX_INSTR divpd, sse2, 1, 0, 0
   1606 AVX_INSTR divps, sse, 1, 0, 0
   1607 AVX_INSTR divsd, sse2, 1, 0, 0
   1608 AVX_INSTR divss, sse, 1, 0, 0
   1609 AVX_INSTR dppd, sse4, 1, 1, 0
   1610 AVX_INSTR dpps, sse4, 1, 1, 0
   1611 AVX_INSTR extractps, sse4, 1
   1612 AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
   1613 AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
   1614 AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
   1615 AVX_INSTR haddpd, sse3, 1, 0, 0
   1616 AVX_INSTR haddps, sse3, 1, 0, 0
   1617 AVX_INSTR hsubpd, sse3, 1, 0, 0
   1618 AVX_INSTR hsubps, sse3, 1, 0, 0
   1619 AVX_INSTR insertps, sse4, 1, 1, 0
   1620 AVX_INSTR lddqu, sse3
   1621 AVX_INSTR ldmxcsr, sse, 1
   1622 AVX_INSTR maskmovdqu, sse2
   1623 AVX_INSTR maxpd, sse2, 1, 0, 1
   1624 AVX_INSTR maxps, sse, 1, 0, 1
   1625 AVX_INSTR maxsd, sse2, 1, 0, 0
   1626 AVX_INSTR maxss, sse, 1, 0, 0
   1627 AVX_INSTR minpd, sse2, 1, 0, 1
   1628 AVX_INSTR minps, sse, 1, 0, 1
   1629 AVX_INSTR minsd, sse2, 1, 0, 0
   1630 AVX_INSTR minss, sse, 1, 0, 0
   1631 AVX_INSTR movapd, sse2, 1
   1632 AVX_INSTR movaps, sse, 1
   1633 AVX_INSTR movd, mmx
   1634 AVX_INSTR movddup, sse3, 1
   1635 AVX_INSTR movdqa, sse2
   1636 AVX_INSTR movdqu, sse2
   1637 AVX_INSTR movhlps, sse, 1, 0, 0
   1638 AVX_INSTR movhpd, sse2, 1, 0, 0
   1639 AVX_INSTR movhps, sse, 1, 0, 0
   1640 AVX_INSTR movlhps, sse, 1, 0, 0
   1641 AVX_INSTR movlpd, sse2, 1, 0, 0
   1642 AVX_INSTR movlps, sse, 1, 0, 0
   1643 AVX_INSTR movmskpd, sse2, 1
   1644 AVX_INSTR movmskps, sse, 1
   1645 AVX_INSTR movntdq, sse2
   1646 AVX_INSTR movntdqa, sse4
   1647 AVX_INSTR movntpd, sse2, 1
   1648 AVX_INSTR movntps, sse, 1
   1649 AVX_INSTR movq, mmx
   1650 AVX_INSTR movsd, sse2, 1, 0, 0
   1651 AVX_INSTR movshdup, sse3, 1
   1652 AVX_INSTR movsldup, sse3, 1
   1653 AVX_INSTR movss, sse, 1, 0, 0
   1654 AVX_INSTR movupd, sse2, 1
   1655 AVX_INSTR movups, sse, 1
   1656 AVX_INSTR mpsadbw, sse4, 0, 1, 0
   1657 AVX_INSTR mulpd, sse2, 1, 0, 1
   1658 AVX_INSTR mulps, sse, 1, 0, 1
   1659 AVX_INSTR mulsd, sse2, 1, 0, 0
   1660 AVX_INSTR mulss, sse, 1, 0, 0
   1661 AVX_INSTR orpd, sse2, 1, 0, 1
   1662 AVX_INSTR orps, sse, 1, 0, 1
   1663 AVX_INSTR pabsb, ssse3
   1664 AVX_INSTR pabsd, ssse3
   1665 AVX_INSTR pabsw, ssse3
   1666 AVX_INSTR packssdw, mmx, 0, 0, 0
   1667 AVX_INSTR packsswb, mmx, 0, 0, 0
   1668 AVX_INSTR packusdw, sse4, 0, 0, 0
   1669 AVX_INSTR packuswb, mmx, 0, 0, 0
   1670 AVX_INSTR paddb, mmx, 0, 0, 1
   1671 AVX_INSTR paddd, mmx, 0, 0, 1
   1672 AVX_INSTR paddq, sse2, 0, 0, 1
   1673 AVX_INSTR paddsb, mmx, 0, 0, 1
   1674 AVX_INSTR paddsw, mmx, 0, 0, 1
   1675 AVX_INSTR paddusb, mmx, 0, 0, 1
   1676 AVX_INSTR paddusw, mmx, 0, 0, 1
   1677 AVX_INSTR paddw, mmx, 0, 0, 1
   1678 AVX_INSTR palignr, ssse3, 0, 1, 0
   1679 AVX_INSTR pand, mmx, 0, 0, 1
   1680 AVX_INSTR pandn, mmx, 0, 0, 0
   1681 AVX_INSTR pavgb, mmx2, 0, 0, 1
   1682 AVX_INSTR pavgw, mmx2, 0, 0, 1
   1683 AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
   1684 AVX_INSTR pblendw, sse4, 0, 1, 0
   1685 AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0
   1686 AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0
   1687 AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0
   1688 AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0
   1689 AVX_INSTR pclmulqdq, clmul, 0, 1, 0
   1690 AVX_INSTR pcmpeqb, mmx, 0, 0, 1
   1691 AVX_INSTR pcmpeqd, mmx, 0, 0, 1
   1692 AVX_INSTR pcmpeqq, sse4, 0, 0, 1
   1693 AVX_INSTR pcmpeqw, mmx, 0, 0, 1
   1694 AVX_INSTR pcmpestri, sse42
   1695 AVX_INSTR pcmpestrm, sse42
   1696 AVX_INSTR pcmpgtb, mmx, 0, 0, 0
   1697 AVX_INSTR pcmpgtd, mmx, 0, 0, 0
   1698 AVX_INSTR pcmpgtq, sse42, 0, 0, 0
   1699 AVX_INSTR pcmpgtw, mmx, 0, 0, 0
   1700 AVX_INSTR pcmpistri, sse42
   1701 AVX_INSTR pcmpistrm, sse42
   1702 AVX_INSTR pextrb, sse4
   1703 AVX_INSTR pextrd, sse4
   1704 AVX_INSTR pextrq, sse4
   1705 AVX_INSTR pextrw, mmx2
   1706 AVX_INSTR phaddd, ssse3, 0, 0, 0
   1707 AVX_INSTR phaddsw, ssse3, 0, 0, 0
   1708 AVX_INSTR phaddw, ssse3, 0, 0, 0
   1709 AVX_INSTR phminposuw, sse4
   1710 AVX_INSTR phsubd, ssse3, 0, 0, 0
   1711 AVX_INSTR phsubsw, ssse3, 0, 0, 0
   1712 AVX_INSTR phsubw, ssse3, 0, 0, 0
   1713 AVX_INSTR pinsrb, sse4, 0, 1, 0
   1714 AVX_INSTR pinsrd, sse4, 0, 1, 0
   1715 AVX_INSTR pinsrq, sse4, 0, 1, 0
   1716 AVX_INSTR pinsrw, mmx2, 0, 1, 0
   1717 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
   1718 AVX_INSTR pmaddwd, mmx, 0, 0, 1
   1719 AVX_INSTR pmaxsb, sse4, 0, 0, 1
   1720 AVX_INSTR pmaxsd, sse4, 0, 0, 1
   1721 AVX_INSTR pmaxsw, mmx2, 0, 0, 1
   1722 AVX_INSTR pmaxub, mmx2, 0, 0, 1
   1723 AVX_INSTR pmaxud, sse4, 0, 0, 1
   1724 AVX_INSTR pmaxuw, sse4, 0, 0, 1
   1725 AVX_INSTR pminsb, sse4, 0, 0, 1
   1726 AVX_INSTR pminsd, sse4, 0, 0, 1
   1727 AVX_INSTR pminsw, mmx2, 0, 0, 1
   1728 AVX_INSTR pminub, mmx2, 0, 0, 1
   1729 AVX_INSTR pminud, sse4, 0, 0, 1
   1730 AVX_INSTR pminuw, sse4, 0, 0, 1
   1731 AVX_INSTR pmovmskb, mmx2
   1732 AVX_INSTR pmovsxbd, sse4
   1733 AVX_INSTR pmovsxbq, sse4
   1734 AVX_INSTR pmovsxbw, sse4
   1735 AVX_INSTR pmovsxdq, sse4
   1736 AVX_INSTR pmovsxwd, sse4
   1737 AVX_INSTR pmovsxwq, sse4
   1738 AVX_INSTR pmovzxbd, sse4
   1739 AVX_INSTR pmovzxbq, sse4
   1740 AVX_INSTR pmovzxbw, sse4
   1741 AVX_INSTR pmovzxdq, sse4
   1742 AVX_INSTR pmovzxwd, sse4
   1743 AVX_INSTR pmovzxwq, sse4
   1744 AVX_INSTR pmuldq, sse4, 0, 0, 1
   1745 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
   1746 AVX_INSTR pmulhuw, mmx2, 0, 0, 1
   1747 AVX_INSTR pmulhw, mmx, 0, 0, 1
   1748 AVX_INSTR pmulld, sse4, 0, 0, 1
   1749 AVX_INSTR pmullw, mmx, 0, 0, 1
   1750 AVX_INSTR pmuludq, sse2, 0, 0, 1
   1751 AVX_INSTR por, mmx, 0, 0, 1
   1752 AVX_INSTR psadbw, mmx2, 0, 0, 1
   1753 AVX_INSTR pshufb, ssse3, 0, 0, 0
   1754 AVX_INSTR pshufd, sse2
   1755 AVX_INSTR pshufhw, sse2
   1756 AVX_INSTR pshuflw, sse2
   1757 AVX_INSTR psignb, ssse3, 0, 0, 0
   1758 AVX_INSTR psignd, ssse3, 0, 0, 0
   1759 AVX_INSTR psignw, ssse3, 0, 0, 0
   1760 AVX_INSTR pslld, mmx, 0, 0, 0
   1761 AVX_INSTR pslldq, sse2, 0, 0, 0
   1762 AVX_INSTR psllq, mmx, 0, 0, 0
   1763 AVX_INSTR psllw, mmx, 0, 0, 0
   1764 AVX_INSTR psrad, mmx, 0, 0, 0
   1765 AVX_INSTR psraw, mmx, 0, 0, 0
   1766 AVX_INSTR psrld, mmx, 0, 0, 0
   1767 AVX_INSTR psrldq, sse2, 0, 0, 0
   1768 AVX_INSTR psrlq, mmx, 0, 0, 0
   1769 AVX_INSTR psrlw, mmx, 0, 0, 0
   1770 AVX_INSTR psubb, mmx, 0, 0, 0
   1771 AVX_INSTR psubd, mmx, 0, 0, 0
   1772 AVX_INSTR psubq, sse2, 0, 0, 0
   1773 AVX_INSTR psubsb, mmx, 0, 0, 0
   1774 AVX_INSTR psubsw, mmx, 0, 0, 0
   1775 AVX_INSTR psubusb, mmx, 0, 0, 0
   1776 AVX_INSTR psubusw, mmx, 0, 0, 0
   1777 AVX_INSTR psubw, mmx, 0, 0, 0
   1778 AVX_INSTR ptest, sse4
   1779 AVX_INSTR punpckhbw, mmx, 0, 0, 0
   1780 AVX_INSTR punpckhdq, mmx, 0, 0, 0
   1781 AVX_INSTR punpckhqdq, sse2, 0, 0, 0
   1782 AVX_INSTR punpckhwd, mmx, 0, 0, 0
   1783 AVX_INSTR punpcklbw, mmx, 0, 0, 0
   1784 AVX_INSTR punpckldq, mmx, 0, 0, 0
   1785 AVX_INSTR punpcklqdq, sse2, 0, 0, 0
   1786 AVX_INSTR punpcklwd, mmx, 0, 0, 0
   1787 AVX_INSTR pxor, mmx, 0, 0, 1
   1788 AVX_INSTR rcpps, sse, 1
   1789 AVX_INSTR rcpss, sse, 1, 0, 0
   1790 AVX_INSTR roundpd, sse4, 1
   1791 AVX_INSTR roundps, sse4, 1
   1792 AVX_INSTR roundsd, sse4, 1, 1, 0
   1793 AVX_INSTR roundss, sse4, 1, 1, 0
   1794 AVX_INSTR rsqrtps, sse, 1
   1795 AVX_INSTR rsqrtss, sse, 1, 0, 0
   1796 AVX_INSTR shufpd, sse2, 1, 1, 0
   1797 AVX_INSTR shufps, sse, 1, 1, 0
   1798 AVX_INSTR sqrtpd, sse2, 1
   1799 AVX_INSTR sqrtps, sse, 1
   1800 AVX_INSTR sqrtsd, sse2, 1, 0, 0
   1801 AVX_INSTR sqrtss, sse, 1, 0, 0
   1802 AVX_INSTR stmxcsr, sse, 1
   1803 AVX_INSTR subpd, sse2, 1, 0, 0
   1804 AVX_INSTR subps, sse, 1, 0, 0
   1805 AVX_INSTR subsd, sse2, 1, 0, 0
   1806 AVX_INSTR subss, sse, 1, 0, 0
   1807 AVX_INSTR ucomisd, sse2, 1
   1808 AVX_INSTR ucomiss, sse, 1
   1809 AVX_INSTR unpckhpd, sse2, 1, 0, 0
   1810 AVX_INSTR unpckhps, sse, 1, 0, 0
   1811 AVX_INSTR unpcklpd, sse2, 1, 0, 0
   1812 AVX_INSTR unpcklps, sse, 1, 0, 0
   1813 AVX_INSTR xorpd, sse2, 1, 0, 1
   1814 AVX_INSTR xorps, sse, 1, 0, 1
   1815 
   1816 ; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
        ; 3DNow! packed-float ops wrapped like the SSE ones above so the same
        ; macro code paths can target AVX, SSE and 3DNow builds.
   1817 AVX_INSTR pfadd, 3dnow, 1, 0, 1
   1818 AVX_INSTR pfmul, 3dnow, 1, 0, 1
   1819 AVX_INSTR pfsub, 3dnow, 1, 0, 0
   1820 
        ; GPR_INSTR: declare a wrapper macro named %1 for a general-purpose-
        ; register instruction gated on cpu feature %2. The wrapper forwards
        ; its operands unchanged, but raises an assembly-time error if the
        ; instruction is used inside a function (cpuname defined) whose
        ; declared cpuflags do not include the required feature.
   1821 ;%1 == instruction
   1822 ;%2 == minimal instruction set
   1823 %macro GPR_INSTR 2
        ; Inside the generated macro %1-%3 are the caller's operands (%3 is
        ; optional); the defaulted %4/%5 capture the outer macro's instruction
        ; name and cpu flag at definition time.
   1824    %macro %1 2-5 fnord, %1, %2
   1825        %ifdef cpuname
   1826            %if notcpuflag(%5)
   1827                %error use of ``%4'' %5 instruction in cpuname function: current_function
   1828            %endif
   1829        %endif
        ; Emit the real instruction with 2 or 3 operands depending on whether
        ; the optional third operand was supplied (fnord = sentinel default).
   1830        %ifidn %3, fnord
   1831            %4 %1, %2
   1832        %else
   1833            %4 %1, %2, %3
   1834        %endif
   1835    %endmacro
   1836 %endmacro
   1837 
        ; Feature-guarded declarations of BMI1/BMI2 and SSE4.2 scalar
        ; (general-purpose register) instructions.
   1838 GPR_INSTR andn, bmi1
   1839 GPR_INSTR bextr, bmi1
   1840 GPR_INSTR blsi, bmi1
   1841 GPR_INSTR blsmsk, bmi1
   1842 GPR_INSTR blsr, bmi1
   1843 GPR_INSTR bzhi, bmi2
   1844 GPR_INSTR crc32, sse42
   1845 GPR_INSTR mulx, bmi2
   1846 GPR_INSTR pdep, bmi2
   1847 GPR_INSTR pext, bmi2
   1848 GPR_INSTR popcnt, sse42
   1849 GPR_INSTR rorx, bmi2
   1850 GPR_INSTR sarx, bmi2
   1851 GPR_INSTR shlx, bmi2
   1852 GPR_INSTR shrx, bmi2
   1853 
   1854 ; base-4 constants for shuffles
        ; Defines q0000 through q3333: qABCD is the 8-bit shuffle immediate
        ; (A<<6 | B<<4 | C<<2 | D), letting shuffle orders for pshufd/shufps
        ; style instructions be written as readable base-4 digit names.
   1855 %assign i 0
   1856 %rep 256
        ; j = the four 2-bit fields of i spelled out as decimal digits,
        ; high field first.
   1857    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
        ; Zero-pad the symbol name to exactly four digits (q000N/q00NN/...).
   1858    %if j < 10
   1859        CAT_XDEFINE q000, j, i
   1860    %elif j < 100
   1861        CAT_XDEFINE q00, j, i
   1862    %elif j < 1000
   1863        CAT_XDEFINE q0, j, i
   1864    %else
   1865        CAT_XDEFINE q, j, i
   1866    %endif
   1867    %assign i i+1
   1868 %endrep
   1869 %undef i
   1870 %undef j
   1871 
        ; FMA_INSTR: declare a 4-operand multiply-accumulate wrapper %1.
        ; With XOP the single instruction v%1 dst, src1, src2, acc is used;
        ; otherwise it is emulated as "%2 (multiply) into dst, then %3 (add)
        ; the accumulator". Emulation cannot work when dst aliases the
        ; accumulator operand, since the multiply would clobber it first.
   1872 %macro FMA_INSTR 3
        ; Defaults: %5 = instruction name, %6 = multiply op, %7 = add op,
        ; captured from the outer macro at definition time.
   1873    %macro %1 4-7 %1, %2, %3
   1874        %if cpuflag(xop)
   1875            v%5 %1, %2, %3, %4
   1876        %elifnidn %1, %4
   1877            %6 %1, %2, %3
   1878            %7 %1, %4
   1879        %else
   1880            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
   1881        %endif
   1882    %endmacro
   1883 %endmacro
   1884 
        ; XOP multiply-accumulate instructions with non-XOP fallbacks.
        ; The pmulld/pmuldq fallbacks raise the minimum target to SSE4 even
        ; though pmullw/pmaddwd are available earlier (hence the comments).
   1885 FMA_INSTR pmacsdd,  pmulld,  paddd ; sse4 emulation
   1886 FMA_INSTR pmacsdql, pmuldq,  paddq ; sse4 emulation
   1887 FMA_INSTR pmacsww,  pmullw,  paddw
   1888 FMA_INSTR pmadcswd, pmaddwd, paddd
   1889 
   1890 ; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
   1891 ; FMA3 is only possible if dst is the same as one of the src registers.
   1892 ; Either src2 or src3 can be a memory operand.
   1893 %macro FMA4_INSTR 2-*
   1894    %push fma4_instr
   1895    %xdefine %$prefix %1
        ; Define one wrapper per element-type suffix, e.g. fmadd + pd ->
        ; a macro named fmaddpd.
   1896    %rep %0 - 1
        ; Defaults: %5 = base mnemonic, %6 = suffix, captured at definition.
   1897        %macro %$prefix%2 4-6 %$prefix, %2
   1898            %if notcpuflag(fma3) && notcpuflag(fma4)
   1899                %error use of ``%5%6'' fma instruction in cpuname function: current_function
   1900            %elif cpuflag(fma4)
   1901                v%5%6 %1, %2, %3, %4
   1902            %elifidn %1, %2
   1903                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
        ; sizeof%3 is numeric only for registers, so this selects the FMA3
        ; operand-order variant (132/213/231) that keeps a memory operand
        ; in the final position.
   1904                %ifnum sizeof%3
   1905                    v%{5}213%6 %2, %3, %4
   1906                %else
   1907                    v%{5}132%6 %2, %4, %3
   1908                %endif
   1909            %elifidn %1, %3
   1910                v%{5}213%6 %3, %2, %4
   1911            %elifidn %1, %4
   1912                v%{5}231%6 %4, %2, %3
   1913            %else
        ; FMA3 is destructive: dst must alias one of the three sources.
   1914                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
   1915            %endif
   1916        %endmacro
   1917        %rotate 1
   1918    %endrep
   1919    %pop
   1920 %endmacro
   1921 
        ; Instantiate the fused multiply-add wrappers: packed (pd/ps) forms
        ; for all, scalar (sd/ss) forms where the family provides them.
   1922 FMA4_INSTR fmadd,    pd, ps, sd, ss
   1923 FMA4_INSTR fmaddsub, pd, ps
   1924 FMA4_INSTR fmsub,    pd, ps, sd, ss
   1925 FMA4_INSTR fmsubadd, pd, ps
   1926 FMA4_INSTR fnmadd,   pd, ps, sd, ss
   1927 FMA4_INSTR fnmsub,   pd, ps, sd, ss
   1928 
   1929 ; Macros for converting VEX instructions to equivalent EVEX ones.
        ; EVEX_INSTR: redefine VEX mnemonic %1 as a macro that emits the EVEX
        ; mnemonic %2 whenever EVEX encoding is actually needed (or preferred
        ; via %3), and the shorter VEX encoding otherwise.
   1930 %macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
        ; Generated wrapper: %1-%4 are operands (2-4 of them, fnord used as
        ; the absent-operand sentinel); the defaulted %5/%6/%7 capture the
        ; VEX mnemonic, the EVEX mnemonic and the prefer_evex flag.
   1931    %macro %1 2-7 fnord, fnord, %1, %2, %3
   1932        %ifidn %3, fnord
   1933            %define %%args %1, %2
   1934        %elifidn %4, fnord
   1935            %define %%args %1, %2, %3
   1936        %else
   1937            %define %%args %1, %2, %3, %4
   1938        %endif
        ; EVEX is mandatory when prefer_evex was requested on an AVX-512
        ; target, or when any of the first three operands is register 16+ or
        ; wider than 32 bytes -- neither is encodable with a VEX prefix.
   1939        %assign %%evex_required cpuflag(avx512) & %7
   1940        %ifnum regnumof%1
   1941            %if regnumof%1 >= 16 || sizeof%1 > 32
   1942                %assign %%evex_required 1
   1943            %endif
   1944        %endif
   1945        %ifnum regnumof%2
   1946            %if regnumof%2 >= 16 || sizeof%2 > 32
   1947                %assign %%evex_required 1
   1948            %endif
   1949        %endif
   1950        %ifnum regnumof%3
   1951            %if regnumof%3 >= 16 || sizeof%3 > 32
   1952                %assign %%evex_required 1
   1953            %endif
   1954        %endif
   1955        %if %%evex_required
   1956            %6 %%args
   1957        %else
   1958            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
   1959        %endif
   1960    %endmacro
   1961 %endmacro
   1962 
        ; VEX -> EVEX instruction mappings: the 32x4- and d-suffixed forms are
        ; the AVX-512 spellings of VEX instructions that have no direct EVEX
        ; encoding. The trailing 1 marks mappings applied whenever avx512 is
        ; enabled (see prefer_evex above), not just when encoding requires it.
   1963 EVEX_INSTR vbroadcastf128, vbroadcastf32x4
   1964 EVEX_INSTR vbroadcasti128, vbroadcasti32x4
   1965 EVEX_INSTR vextractf128,   vextractf32x4
   1966 EVEX_INSTR vextracti128,   vextracti32x4
   1967 EVEX_INSTR vinsertf128,    vinsertf32x4
   1968 EVEX_INSTR vinserti128,    vinserti32x4
   1969 EVEX_INSTR vmovdqa,        vmovdqa32
   1970 EVEX_INSTR vmovdqu,        vmovdqu32
   1971 EVEX_INSTR vpand,          vpandd
   1972 EVEX_INSTR vpandn,         vpandnd
   1973 EVEX_INSTR vpor,           vpord
   1974 EVEX_INSTR vpxor,          vpxord
   1975 EVEX_INSTR vrcpps,         vrcp14ps,   1 ; EVEX versions have higher precision
   1976 EVEX_INSTR vrcpss,         vrcp14ss,   1
   1977 EVEX_INSTR vrsqrtps,       vrsqrt14ps, 1
   1978 EVEX_INSTR vrsqrtss,       vrsqrt14ss, 1