jsimdext.inc (15399B)
;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2010, 2016, 2018-2019, 2024, D. R. Commander.
; Copyright (C) 2018, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty.  In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; ==========================================================================
;  System-dependent configurations
;
;  The first of WIN32, WIN64, OBJ32, ELF, AOUT or MACHO that is defined on
;  the assembler command line selects the object-format settings below; a
;  generic fallback is used when none of them is defined.  Every branch
;  defines:
;    SEG_TEXT   - section name + attributes for code
;    SEG_CONST  - section name + attributes for read-only constants
;  and, depending on the format, may also define:
;    EXTN(name) - C symbol decoration (defaults to a leading '_', see below)
;    GOT_SYMBOL - enables the PIC helper macros further down
;    PIC        - forced on for Mach-O

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT   .text  align=32
%define SEG_CONST  .rdata align=32
%else
%define SEG_TEXT   .text  align=32 public use32 class=CODE
%define SEG_CONST  .rdata align=32 public use32 class=CONST
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=32
%define SEG_CONST   .rdata align=32
%else
%define SEG_TEXT    .text  align=32 public use64 class=CODE
%define SEG_CONST   .rdata align=32 public use64 class=CONST
%endif
%define EXTN(name)  name                ; foo() -> foo

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT   _text align=32 public use32 class=CODE
%define SEG_CONST  _data align=32 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; Emitted only when both __CET__ and __x86_64__ are defined.
; NOTE(review): presumably the GNU property note that marks the object as
; CET (IBT/shadow stack) compatible -- confirm the eight words against the
; x86-64 psABI / Linux gABI extension before changing them.
%ifdef __CET__
%ifdef __x86_64__
section .note.gnu.property note alloc noexec align=8
    dd 0x00000004, 0x00000010, 0x00000005, 0x00554e47
    dd 0xc0000002, 0x00000004, 0x00000003, 0x00000000
%endif
%endif

; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT   .text   progbits align=32
%define SEG_CONST  .rodata progbits align=32
%else
%define SEG_TEXT   .text   progbits alloc exec   nowrite align=32
%define SEG_CONST  .rodata progbits alloc noexec nowrite align=32
%endif

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_  ; ELF supports PIC
%define EXTN(name)  name                   ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT   .text
%define SEG_CONST  .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT   .text  ;align=32     ; nasm doesn't accept align=32. why?
%define SEG_CONST  .rodata align=32

; The generation of position-independent code (PIC) is the default on Darwin.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
%define SEG_TEXT   .text
%define SEG_CONST  .data

%endif          ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
;  POINTER / SIZEOF_POINTER / POINTER_BIT / resp / dp describe a pointer-sized
;  object, and r??p name the pointer-width view of each general register.
;  The 64-bit set is chosen only when __x86_64__ is defined and the output
;  format is not elfx32; otherwise (32-bit targets and the x32 ABI) raxp is
;  still undefined after the first block and the 32-bit set is chosen.
;
;  The SIZEOF_* / *_BIT names used on the right-hand sides are defined
;  further down; that is fine because %define bodies are expanded when the
;  macro is used, not when it is defined.
;
%ifdef __x86_64__
%ifnidn __OUTPUT_FORMAT__, elfx32
%define POINTER         qword               ; general pointer type
%define SIZEOF_POINTER  SIZEOF_QWORD        ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT           ; sizeof(POINTER)*BYTE_BIT
%define resp            resq
%define dp              dq
%define raxp            rax
%define rbxp            rbx
%define rcxp            rcx
%define rdxp            rdx
%define rsip            rsi
%define rdip            rdi
%define rbpp            rbp
%define rspp            rsp
%define r8p             r8
%define r9p             r9
%define r10p            r10
%define r11p            r11
%define r12p            r12
%define r13p            r13
%define r14p            r14
%define r15p            r15
%endif
%endif
%ifndef raxp
%define POINTER         dword               ; general pointer type
%define SIZEOF_POINTER  SIZEOF_DWORD        ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT           ; sizeof(POINTER)*BYTE_BIT
%define resp            resd
%define dp              dd
; x86_64 ILP32 ABI (x32)
%define raxp            eax
%define rbxp            ebx
%define rcxp            ecx
%define rdxp            edx
%define rsip            esi
%define rdip            edi
%define rbpp            ebp
%define rspp            esp
%define r8p             r8d
%define r9p             r9d
%define r10p            r10d
%define r11p            r11d
%define r12p            r12d
%define r13p            r13d
%define r14p            r14d
%define r15p            r15d
%endif

%define INT             dword               ; signed integer type
%define SIZEOF_INT      SIZEOF_DWORD        ; sizeof(INT)
%define INT_BIT         DWORD_BIT           ; sizeof(INT)*BYTE_BIT

%define FP32            dword               ; IEEE754 single
%define SIZEOF_FP32     SIZEOF_DWORD        ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT           ; sizeof(FP32)*BYTE_BIT

%define MMWORD          qword               ; int64  (MMX register)
%define SIZEOF_MMWORD   SIZEOF_QWORD        ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT           ; sizeof(MMWORD)*BYTE_BIT

; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD                             ; int128 (SSE register)
%define SIZEOF_XMMWORD  SIZEOF_OWORD        ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT           ; sizeof(XMMWORD)*BYTE_BIT

%define YMMWORD                             ; int256 (AVX register)
%define SIZEOF_YMMWORD  SIZEOF_YWORD        ; sizeof(YMMWORD)
%define YMMWORD_BIT     YWORD_BIT           ; sizeof(YMMWORD)*BYTE_BIT

; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD
%define XMM_MMWORD

%define SIZEOF_BYTE   1                 ; sizeof(byte)
%define SIZEOF_WORD   2                 ; sizeof(word)
%define SIZEOF_DWORD  4                 ; sizeof(dword)
%define SIZEOF_QWORD  8                 ; sizeof(qword)
%define SIZEOF_OWORD  16                ; sizeof(oword)
%define SIZEOF_YWORD  32                ; sizeof(yword)

%define BYTE_BIT      8                 ; CHAR_BIT in C
%define WORD_BIT      16                ; sizeof(word)*BYTE_BIT
%define DWORD_BIT     32                ; sizeof(dword)*BYTE_BIT
%define QWORD_BIT     64                ; sizeof(qword)*BYTE_BIT
%define OWORD_BIT     128               ; sizeof(oword)*BYTE_BIT
%define YWORD_BIT     256               ; sizeof(yword)*BYTE_BIT

; --------------------------------------------------------------------------
;  External Symbol Name
;
;  Formats that did not define EXTN above (WIN32, OBJ32, AOUT, MACHO and the
;  fallback case) get the leading-underscore decoration.
;
%ifndef EXTN
%define EXTN(name)  _ %+ name           ; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Hidden symbols
;
;  GLOBAL_FUNCTION(name) / GLOBAL_DATA(name) export EXTN(name).  ELF adds the
;  "hidden" visibility qualifier; Mach-O adds "private_extern" under YASM, or
;  under NASM when __NASM_VERSION_ID__ >= 0x020E0000.  Every other case falls
;  through to a plain "global".
;
%ifdef ELF      ; ----(nasm -felf[64] -DELF ...)--------
%define GLOBAL_FUNCTION(name)  global EXTN(name):function hidden
%define GLOBAL_DATA(name)      global EXTN(name):data hidden
%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
%ifdef __YASM_VER__
%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
%define GLOBAL_DATA(name)      global EXTN(name):private_extern
%else
%if __NASM_VERSION_ID__ >= 0x020E0000
%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
%define GLOBAL_DATA(name)      global EXTN(name):private_extern
%endif
%endif
%endif

%ifndef GLOBAL_FUNCTION
%define GLOBAL_FUNCTION(name)  global EXTN(name)
%endif
%ifndef GLOBAL_DATA
%define GLOBAL_DATA(name)      global EXTN(name)
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;    GOTOFF(got, sym)  - address expression for sym, relative to the base
;                        register "got" when PIC is active
;    GET_GOT reg       - load the PIC base into reg
;    PUSHPIC / POPPIC / MOVPIC - push / pop / mov that are emitted only when
;                        PIC is active
;
;  A -DPIC request is dropped for formats that did not define GOT_SYMBOL.
;
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC  ; -------------------------------------------

%ifidn GOT_SYMBOL, _MACHO_PIC_  ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.

    SECTION     SEG_CONST
const_base:

%define GOTOFF(got, sym)  (got) + (sym) - const_base

; GET_GOT reg (Mach-O): leaves the address of const_base in reg.
; The "lea" is assembled by hand: its first two bytes come from the "db", and
; the opcode + rel32 of the following "jmp near const_base" supply the rest
; of the encoding, so the jump displacement (const_base - %%ref) becomes the
; lea displacement.  ecx holds the run-time address of %%ref at that point
; and ebp is zeroed (and restored afterwards) so the index term adds nothing.
%imacro GET_GOT 1
    ; NOTE: this macro destroys ecx register.
    call        %%geteip
    add         ecx, byte (%%ref - $)
    jmp         short %%adjust
%%geteip:
    mov         ecx, POINTER [esp]
    ret
%%adjust:
    push        ebp
    xor         ebp, ebp                ; ebp = 0
%ifidni %1, ebx  ; (%1 == ebx)
    ; db 0x8D,0x9C + jmp near const_base =
    ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
    db          0x8D, 0x9C              ; 8D,9C
    jmp         near const_base         ; E9,(const_base-%%ref)
%%ref:
%else  ; (%1 != ebx)
    ; db 0x8D,0x8C + jmp near const_base =
    ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
    db          0x8D, 0x8C              ; 8D,8C
    jmp         near const_base         ; E9,(const_base-%%ref)
%%ref:
    mov         %1, ecx
%endif  ; (%1 == ebx)
    pop         ebp
%endmacro

%else     ; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff

; GET_GOT reg (ELF / a.out): fetch the return address of a local call into
; reg, then add the ..gotpc-relative offset of GOT_SYMBOL to it.
%imacro GET_GOT 1
    extern      GOT_SYMBOL
    call        %%geteip
    add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
    jmp         short %%done
%%geteip:
    mov         %1, POINTER [esp]
    ret
%%done:
%endmacro

%endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------

%imacro PUSHPIC 1.nolist
    push        %1
%endmacro
%imacro POPPIC  1.nolist
    pop         %1
%endmacro
%imacro MOVPIC  2.nolist
    mov         %1, %2
%endmacro

%else    ; !PIC -----------------------------------------

; Without PIC, symbols are addressed directly and the helpers expand to
; nothing.
%define GOTOFF(got, sym)  (sym)

%imacro GET_GOT 1.nolist
%endmacro
%imacro PUSHPIC 1.nolist
%endmacro
%imacro POPPIC  1.nolist
%endmacro
%imacro MOVPIC  2.nolist
%endmacro

%endif   ;  PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
;  ALIGNX n [, max]
;    n    - alignment in bytes (a power of two; FILLB masks with n-1)
;    max  - optional; padding is emitted only when the number of bytes needed
;           at the macro's start is <= max (default 0xFFFF)
;
;  Helper expressions:
;    MSKLE(x, y)  - non-zero bit mask when (x & 0xFFFF) <= (y & 0xFFFF),
;                   zero otherwise; used to switch a "times" count on or off
;    FILLB(b, n)  - number of bytes from position b up to the next n-byte
;                   boundary, measured from the section start ($$)
;
;  When 16 or more bytes are needed, the whole gap is filled with one-byte
;  nops.  Otherwise the gap is filled with the multi-byte filler encodings
;  listed below, longest first, each "times" count being recomputed from the
;  current position ($).
;
;  NOTE(review): the filler encodings are annotated as 32-bit instructions
;  (lea/mov on ebx/ebp).  In 64-bit code a write to a 32-bit register clears
;  the upper half of the full register, so confirm that ALIGNX is not placed
;  on a fall-through path of 64-bit code.
;
%define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b, n)  (($$-(b)) & ((n)-1))

%imacro ALIGNX 1-2.nolist 0xFFFF
%%bs: \
  times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
        db 0x90                                      ; nop
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
        db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00  ; lea ebx,[ebx+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
        db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00  ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
        db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00        ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
        db 0x8D, 0x6C, 0x25, 0x00                    ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
        db 0x8D, 0x6D, 0x00                          ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
        db 0x8B, 0xED                                ; mov ebp,ebp
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
        db 0x90                                      ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
; ALIGNZ n: pad with zero bytes up to the next n-byte boundary.
;
%imacro ALIGNZ 1.nolist
    align       %1, db 0                ; filling zeros
%endmacro

; --------------------------------------------------------------------------
;  64-bit argument collection
;
;  COLLECT_ARGS n copies the first n (1..6) integer arguments into r10..r15
;  so that function bodies can use the same registers under both the Win64
;  and the non-Win64 calling convention.  UNCOLLECT_ARGS n undoes the saves
;  in reverse order and must be given the same n.
;
%ifdef __x86_64__

%ifdef WIN64

; Win64: saves xmm6 and xmm7 on the stack, moves rcx, rdx, r8, r9 and the two
; stack arguments at [rbp+48] / [rbp+56] into r10..r15 (pushing r12..r15
; before they are overwritten), then pushes rsi and rdi.
; NOTE(review): the [rbp+48] / [rbp+56] loads assume rbp was set up as the
; frame pointer (push rbp / mov rbp, rsp) before the macro is used, and the
; movaps stores assume rsp is 16-byte aligned at that point -- confirm at the
; call sites.
%imacro COLLECT_ARGS 1
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm6
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm7
    mov         r10, rcx
%if %1 > 1
    mov         r11, rdx
%endif
%if %1 > 2
    push        r12
    mov         r12, r8
%endif
%if %1 > 3
    push        r13
    mov         r13, r9
%endif
%if %1 > 4
    push        r14
    mov         r14, [rbp+48]
%endif
%if %1 > 5
    push        r15
    mov         r15, [rbp+56]
%endif
    push        rsi
    push        rdi
%endmacro

; Win64: restores rdi, rsi, r15..r12 (as many as were pushed), then xmm7 and
; xmm6, releasing the stack space taken by COLLECT_ARGS.
%imacro UNCOLLECT_ARGS 1
    pop         rdi
    pop         rsi
%if %1 > 5
    pop         r15
%endif
%if %1 > 4
    pop         r14
%endif
%if %1 > 3
    pop         r13
%endif
%if %1 > 2
    pop         r12
%endif
    movaps      xmm7, XMMWORD [rsp]
    add         rsp, SIZEOF_XMMWORD
    movaps      xmm6, XMMWORD [rsp]
    add         rsp, SIZEOF_XMMWORD
%endmacro

; Win64: PUSH_XMM n reserves n * SIZEOF_XMMWORD bytes and saves xmm8 up to
; xmm11 (n = 1..4; xmm8 is always saved).  rsp must be 16-byte aligned for
; the movaps stores.
%imacro PUSH_XMM 1
    sub         rsp, %1 * SIZEOF_XMMWORD
    movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
%if %1 > 1
    movaps      XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
%endif
%if %1 > 2
    movaps      XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
%endif
%if %1 > 3
    movaps      XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
%endif
%endmacro

; Win64: POP_XMM n reloads the registers saved by PUSH_XMM n and releases the
; reserved stack space.
%imacro POP_XMM 1
    movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
%if %1 > 1
    movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
%endif
%if %1 > 2
    movaps      xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
%endif
%if %1 > 3
    movaps      xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
%endif
    add         rsp, %1 * SIZEOF_XMMWORD
%endmacro

%else

; Non-Win64: pushes each of r10..r15 that is about to be overwritten, then
; moves rdi, rsi, rdx, rcx, r8, r9 into r10..r15 (first n of them).
%imacro COLLECT_ARGS 1
    push        r10
    mov         r10, rdi
%if %1 > 1
    push        r11
    mov         r11, rsi
%endif
%if %1 > 2
    push        r12
    mov         r12, rdx
%endif
%if %1 > 3
    push        r13
    mov         r13, rcx
%endif
%if %1 > 4
    push        r14
    mov         r14, r8
%endif
%if %1 > 5
    push        r15
    mov         r15, r9
%endif
%endmacro

; Non-Win64: pops r15..r10 in the reverse of the COLLECT_ARGS push order.
%imacro UNCOLLECT_ARGS 1
%if %1 > 5
    pop         r15
%endif
%if %1 > 4
    pop         r14
%endif
%if %1 > 3
    pop         r13
%endif
%if %1 > 2
    pop         r12
%endif
%if %1 > 1
    pop         r11
%endif
    pop         r10
%endmacro

; Non-Win64: nothing to save, so both macros expand to nothing.
%imacro PUSH_XMM 1
%endmacro

%imacro POP_XMM 1
%endmacro

%endif

%endif

; ENDBR64 emits the four bytes F3 0F 1E FA (the dword below, stored
; little-endian), i.e. the ENDBR64 instruction encoding, when __CET__ is
; defined; otherwise it expands to nothing.
%ifdef __CET__

%imacro ENDBR64 0
    dd 0xfa1e0ff3
%endmacro

%else

%imacro ENDBR64 0
%endmacro

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------