// set_macros-inl.h
1 // Copyright 2020 Google LLC 2 // Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com> 3 // SPDX-License-Identifier: Apache-2.0 4 // SPDX-License-Identifier: BSD-3-Clause 5 // 6 // Licensed under the Apache License, Version 2.0 (the "License"); 7 // you may not use this file except in compliance with the License. 8 // You may obtain a copy of the License at 9 // 10 // http://www.apache.org/licenses/LICENSE-2.0 11 // 12 // Unless required by applicable law or agreed to in writing, software 13 // distributed under the License is distributed on an "AS IS" BASIS, 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 // See the License for the specific language governing permissions and 16 // limitations under the License. 17 18 // Sets macros based on HWY_TARGET. 19 20 // This include guard is toggled by foreach_target, so avoid the usual _H_ 21 // suffix to prevent copybara from renaming it. 22 #if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE) 23 #ifdef HWY_SET_MACROS_PER_TARGET 24 #undef HWY_SET_MACROS_PER_TARGET 25 #else 26 #define HWY_SET_MACROS_PER_TARGET 27 #endif 28 29 #endif // HWY_SET_MACROS_PER_TARGET 30 31 #include "hwy/detect_compiler_arch.h" // IWYU: export 32 #include "hwy/detect_targets.h" // IWYU: export 33 34 #undef HWY_NAMESPACE 35 #undef HWY_ALIGN 36 #undef HWY_MAX_BYTES 37 #undef HWY_LANES 38 39 #undef HWY_HAVE_SCALABLE 40 #undef HWY_HAVE_TUPLE 41 #undef HWY_HAVE_INTEGER64 42 #undef HWY_HAVE_FLOAT16 43 #undef HWY_HAVE_FLOAT64 44 #undef HWY_MEM_OPS_MIGHT_FAULT 45 #undef HWY_NATIVE_FMA 46 #undef HWY_NATIVE_DOT_BF16 47 #undef HWY_NATIVE_MASK 48 #undef HWY_CAP_GE256 49 #undef HWY_CAP_GE512 50 51 #undef HWY_TARGET_IS_SVE 52 #if HWY_TARGET & HWY_ALL_SVE 53 #define HWY_TARGET_IS_SVE 1 54 #else 55 #define HWY_TARGET_IS_SVE 0 56 #endif 57 58 #undef HWY_TARGET_IS_NEON 59 #if HWY_TARGET & HWY_ALL_NEON 60 #define HWY_TARGET_IS_NEON 1 61 #else 62 #define HWY_TARGET_IS_NEON 0 63 #endif 64 
// 1 if the current HWY_TARGET is one of the bits in HWY_ALL_PPC, else 0.
#undef HWY_TARGET_IS_PPC
#if HWY_TARGET & HWY_ALL_PPC
#define HWY_TARGET_IS_PPC 1
#else
#define HWY_TARGET_IS_PPC 0
#endif

#undef HWY_TARGET_IS_AVX10_2
#if HWY_TARGET == HWY_AVX10_2
#define HWY_TARGET_IS_AVX10_2 1
#else
#define HWY_TARGET_IS_AVX10_2 0
#endif

// Supported on all targets except RVV with older compilers (there it requires
// GCC 14+ or Clang 17+).
#if HWY_TARGET == HWY_RVV && \
    ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
     (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1700))
#define HWY_HAVE_TUPLE 0
#else
#define HWY_HAVE_TUPLE 1
#endif

// For internal use (clamping/validating N for Simd<>)
#undef HWY_MAX_N
#if HWY_TARGET == HWY_SCALAR
#define HWY_MAX_N 1
#else
#define HWY_MAX_N 65536
#endif

// For internal use (clamping kPow2 for Simd<>)
#undef HWY_MAX_POW2
// For HWY_TARGET == HWY_RVV, LMUL <= 8. Even on other targets, we want to
// support say Rebind<uint64_t, Simd<uint8_t, 1, 0>> d; whose kPow2 is also 3.
// However, those other targets do not actually support multiple vectors, and
// thus Lanes(d) must not exceed Lanes(ScalableTag<T>()).
#define HWY_MAX_POW2 3

// User-visible. Loose lower bound that guarantees HWY_MAX_BYTES >>
// (-HWY_MIN_POW2) <= 1. Useful for terminating compile-time recursions.
#undef HWY_MIN_POW2
#if HWY_TARGET == HWY_RVV
#define HWY_MIN_POW2 -16
#else
// Tighter bound for other targets, whose vectors are smaller, to potentially
// save compile time.
#define HWY_MIN_POW2 -8
#endif  // HWY_TARGET == HWY_RVV

#undef HWY_TARGET_STR

// Optional x86 feature groups: each expands to an empty string when the
// corresponding HWY_DISABLE_* macro is defined, so that it can always be
// concatenated into the target strings below.
#if defined(HWY_DISABLE_PCLMUL_AES)
#define HWY_TARGET_STR_PCLMUL_AES ""
#else
#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes"
#endif

#if defined(HWY_DISABLE_BMI2_FMA)
#define HWY_TARGET_STR_BMI2_FMA ""
#else
#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma"
#endif

#if defined(HWY_DISABLE_F16C)
#define HWY_TARGET_STR_F16C ""
#else
#define HWY_TARGET_STR_F16C ",f16c"
#endif

#define HWY_TARGET_STR_SSE2 "sse2"

#define HWY_TARGET_STR_SSSE3 "sse2,ssse3"

#define HWY_TARGET_STR_SSE4 \
  HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES
// Include previous targets, which are the half-vectors of the next target.
#define HWY_TARGET_STR_AVX2 \
  HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C

#ifndef HWY_HAVE_EVEX512  // allow override
// evex512 has been removed from clang 22, see
// https://github.com/llvm/llvm-project/pull/157034
#if (1400 <= HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1600) || \
    (1800 <= HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2200)
#define HWY_HAVE_EVEX512 1
#else
#define HWY_HAVE_EVEX512 0
#endif
#endif

// Expands to nothing (not even an empty string literal) when evex512 is
// unavailable; it is only ever used adjacent to another string literal.
#if (HWY_HAVE_EVEX512 == 1)
#define HWY_TARGET_STR_AVX3_VL512 ",evex512"
#else
#define HWY_TARGET_STR_AVX3_VL512
#endif

#define HWY_TARGET_STR_AVX3 \
  HWY_TARGET_STR_AVX2 \
  ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw" HWY_TARGET_STR_AVX3_VL512

#define HWY_TARGET_STR_AVX3_DL \
  HWY_TARGET_STR_AVX3 \
  ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
  "avx512vpopcntdq,gfni"

// Opt-out for compilers that do not properly support avx512bf16.
#ifndef HWY_AVX3_ENABLE_AVX512BF16  // allow override
// Default is to disable if the DISABLE macro is defined, or if old compiler.
// clang-cl 21.1.4 reportedly works; feel free to define this to 1 there.
#if defined(HWY_AVX3_DISABLE_AVX512BF16) || \
    (HWY_COMPILER_CLANGCL || \
     (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
     (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 900))
#define HWY_AVX3_ENABLE_AVX512BF16 0
#else
#define HWY_AVX3_ENABLE_AVX512BF16 1
#endif
#endif  // HWY_AVX3_ENABLE_AVX512BF16

#if HWY_AVX3_ENABLE_AVX512BF16
#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL ",avx512bf16"
#else
#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL
#endif

#if HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1400
#define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4 ",avx512fp16"
#else
#define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4
#endif

// Support for avx10.2-512 was removed between clang 22 and 23 without a
// feature test macro.
#if HWY_COMPILER_CLANG >= 2200 && HWY_HAVE_EVEX512
#define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR ",avx10.2-512"
// Recent compilers drop the -512 suffix because 512 bits are always available.
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 || HWY_COMPILER_CLANG >= 2200
#define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR ",avx10.2"
#else
#define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR
#endif

#if defined(HWY_DISABLE_PPC8_CRYPTO)
#define HWY_TARGET_STR_PPC8_CRYPTO ""
#else
#define HWY_TARGET_STR_PPC8_CRYPTO ",crypto"
#endif

#define HWY_TARGET_STR_PPC8 \
  "altivec,vsx,power8-vector" HWY_TARGET_STR_PPC8_CRYPTO
#define HWY_TARGET_STR_PPC9 HWY_TARGET_STR_PPC8 ",power9-vector"

#if HWY_COMPILER_CLANG
#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",power10-vector"
#else
// See #1707 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102059#c35.
// When the baseline is PPC 8 or 9, inlining functions such as PreventElision
// into PPC10 code fails because PPC10 defaults to no-htm and is thus worse than
// the baseline, which has htm. We cannot have pragma target on functions
// outside HWY_NAMESPACE such as those in base.h. It would be possible for users
// to set -mno-htm globally, but we can also work around this at the library
// level by claiming that PPC10 still has HTM, thus avoiding the mismatch. This
// seems to be safe because HTM uses builtins rather than modifying codegen, see
// https://gcc.gnu.org/legacy-ml/gcc-patches/2013-07/msg00167.html.
#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",cpu=power10,htm"
#endif

#define HWY_TARGET_STR_Z14 "arch=z14"
#define HWY_TARGET_STR_Z15 "arch=z15"

// The per-target sections below are deliberately not protected by an include
// guard, so HWY_TARGET_STR and the other per-target macros are redefined on
// each include, governed by the current HWY_TARGET.

//-----------------------------------------------------------------------------
// SSE2
#if HWY_TARGET == HWY_SSE2

#define HWY_NAMESPACE N_SSE2
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 0  // a few actually are
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_TARGET_STR HWY_TARGET_STR_SSE2

//-----------------------------------------------------------------------------
// SSSE3
#elif HWY_TARGET == HWY_SSSE3

#define HWY_NAMESPACE N_SSSE3
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 0  // a few actually are
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3

//-----------------------------------------------------------------------------
// SSE4
#elif HWY_TARGET == HWY_SSE4

#define HWY_NAMESPACE N_SSE4
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 0  // a few actually are
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_TARGET_STR HWY_TARGET_STR_SSE4

//-----------------------------------------------------------------------------
// AVX2
#elif HWY_TARGET == HWY_AVX2

#define HWY_NAMESPACE N_AVX2
#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1

// FMA is part of the optional BMI2/FMA group of the AVX2 target string.
#ifdef HWY_DISABLE_BMI2_FMA
#define HWY_NATIVE_FMA 0
#else
#define HWY_NATIVE_FMA 1
#endif
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 0  // a few actually are

#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0

#define HWY_TARGET_STR HWY_TARGET_STR_AVX2

//-----------------------------------------------------------------------------
// AVX3[_DL/ZEN4/SPR]/AVX10
#elif HWY_TARGET <= HWY_AVX3

#define HWY_ALIGN alignas(64)
#define HWY_MAX_BYTES 64
#define HWY_LANES(T) (64 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#if HWY_TARGET <= HWY_AVX3_SPR && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 2200) && \
    HWY_HAVE_SCALAR_F16_TYPE
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#endif
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#if (HWY_TARGET <= HWY_AVX3_ZEN4) && HWY_AVX3_ENABLE_AVX512BF16
#define HWY_NATIVE_DOT_BF16 1
#else
#define HWY_NATIVE_DOT_BF16 0
#endif
#define HWY_NATIVE_MASK 1
#define HWY_CAP_GE256 1

#if HWY_MAX_BYTES >= 64
#define HWY_CAP_GE512 1
#else
#define HWY_CAP_GE512 0
#endif

#if HWY_TARGET == HWY_AVX3

#define HWY_NAMESPACE N_AVX3
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3

#elif HWY_TARGET == HWY_AVX3_DL

#define HWY_NAMESPACE N_AVX3_DL
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL

#elif HWY_TARGET == HWY_AVX3_ZEN4

#define HWY_NAMESPACE N_AVX3_ZEN4
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_ZEN4

#elif HWY_TARGET == HWY_AVX3_SPR

#define HWY_NAMESPACE N_AVX3_SPR
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_SPR

#elif HWY_TARGET == HWY_AVX10_2

#define HWY_NAMESPACE N_AVX10_2
#define HWY_TARGET_STR HWY_TARGET_STR_AVX10_2

#else
#error "Logic error"
#endif  // HWY_TARGET

//-----------------------------------------------------------------------------
// PPC8, PPC9, PPC10
#elif HWY_TARGET_IS_PPC

#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 1
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_TARGET == HWY_PPC8

#define HWY_NAMESPACE N_PPC8
#define HWY_TARGET_STR HWY_TARGET_STR_PPC8

#elif HWY_TARGET == HWY_PPC9

#define HWY_NAMESPACE N_PPC9
#define HWY_TARGET_STR HWY_TARGET_STR_PPC9

#elif HWY_TARGET == HWY_PPC10

#define HWY_NAMESPACE N_PPC10
#define HWY_TARGET_STR HWY_TARGET_STR_PPC10

#else
#error "Logic error"
#endif  // HWY_TARGET

//-----------------------------------------------------------------------------
// Z14, Z15
#elif HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15

#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 1
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_TARGET == HWY_Z14

#define HWY_NAMESPACE N_Z14
#define HWY_TARGET_STR HWY_TARGET_STR_Z14

#elif HWY_TARGET == HWY_Z15

#define HWY_NAMESPACE N_Z15
#define HWY_TARGET_STR HWY_TARGET_STR_Z15

#else
#error "Logic error"
#endif  // HWY_TARGET == HWY_Z15

//-----------------------------------------------------------------------------
// NEON
#elif HWY_TARGET_IS_NEON

// Clang 17 crashes with bf16, see github.com/llvm/llvm-project/issues/64179.
#undef HWY_NEON_HAVE_BFLOAT16
#if HWY_HAVE_SCALAR_BF16_TYPE && \
    ((HWY_TARGET == HWY_NEON_BF16 && \
      (!HWY_COMPILER_CLANG || HWY_COMPILER_CLANG >= 1800)) || \
     defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC))
#define HWY_NEON_HAVE_BFLOAT16 1
#else
#define HWY_NEON_HAVE_BFLOAT16 0
#endif

// HWY_NEON_HAVE_F32_TO_BF16C is defined if NEON vcvt_bf16_f32 and
// vbfdot_f32 are available, even if the __bf16 type is disabled due to
// GCC/Clang bugs.
#undef HWY_NEON_HAVE_F32_TO_BF16C
#if HWY_NEON_HAVE_BFLOAT16 || HWY_TARGET == HWY_NEON_BF16 || \
    (defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
     (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
#define HWY_NEON_HAVE_F32_TO_BF16C 1
#else
#define HWY_NEON_HAVE_F32_TO_BF16C 0
#endif

#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || HWY_TARGET == HWY_NEON_BF16
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#endif

#if HWY_ARCH_ARM_A64
#define HWY_HAVE_FLOAT64 1
#else
#define HWY_HAVE_FLOAT64 0
#endif

#define HWY_MEM_OPS_MIGHT_FAULT 1

#if defined(__ARM_FEATURE_FMA) || defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
#define HWY_NATIVE_FMA 1
#else
#define HWY_NATIVE_FMA 0
#endif

#if HWY_NEON_HAVE_F32_TO_BF16C
#define HWY_NATIVE_DOT_BF16 1
#else
#define HWY_NATIVE_DOT_BF16 0
#endif

#define HWY_NATIVE_MASK 0

#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_TARGET == HWY_NEON_WITHOUT_AES
#define HWY_NAMESPACE N_NEON_WITHOUT_AES
#elif HWY_TARGET == HWY_NEON
#define HWY_NAMESPACE N_NEON
#elif HWY_TARGET == HWY_NEON_BF16
#define HWY_NAMESPACE N_NEON_BF16
#else
#error "Logic error, missing case"
#endif  // HWY_TARGET

// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_ARCH_ARM_V7

// The __attribute__((target(+neon-vfpv4)) was introduced in gcc >= 8.
#if HWY_COMPILER_GCC_ACTUAL >= 800
#define HWY_TARGET_STR "+neon-vfpv4"
#else  // GCC < 8, or not GCC
// Do not define HWY_TARGET_STR (no pragma).
#endif  // HWY_COMPILER_GCC_ACTUAL

#else  // !HWY_ARCH_ARM_V7

#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1300) || \
    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1300)
// GCC 12 or earlier and Clang 12 or earlier require +crypto be added to the
// target string to enable AArch64 AES intrinsics
#define HWY_TARGET_STR_NEON "+crypto"
#else
#define HWY_TARGET_STR_NEON "+aes"
#endif

// Clang >= 16 requires +fullfp16 instead of fp16, but Apple Clang 15 = 1600
// fails to parse unless the string starts with armv8, whereas 1700 refuses it.
#if HWY_COMPILER_CLANG >= 1700
#define HWY_TARGET_STR_FP16 "+fullfp16"
#elif HWY_COMPILER_CLANG >= 1600 && defined(__apple_build_version__)
#define HWY_TARGET_STR_FP16 "armv8.4-a+fullfp16"
#else
#define HWY_TARGET_STR_FP16 "+fp16"
#endif

#define HWY_TARGET_STR_I8MM "+i8mm"

#if HWY_TARGET == HWY_NEON_WITHOUT_AES
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
// Prevents inadvertent use of SVE by GCC 13.4 and earlier, see #2689.
#define HWY_TARGET_STR "+nosve"
#else
// Do not define HWY_TARGET_STR (no pragma).
#endif  // HWY_COMPILER_GCC_ACTUAL
#elif HWY_TARGET == HWY_NEON
#define HWY_TARGET_STR HWY_TARGET_STR_NEON
#elif HWY_TARGET == HWY_NEON_BF16
#define HWY_TARGET_STR \
  HWY_TARGET_STR_FP16 HWY_TARGET_STR_I8MM "+bf16+dotprod" HWY_TARGET_STR_NEON
#else
#error "Logic error, missing case"
#endif  // HWY_TARGET

#endif  // !HWY_ARCH_ARM_V7
#else   // !HWY_HAVE_RUNTIME_DISPATCH
// HWY_TARGET_STR remains undefined
#endif

//-----------------------------------------------------------------------------
// SVE[2]
#elif HWY_TARGET_IS_SVE

// SVE only requires lane alignment, not natural alignment of the entire vector.
#define HWY_ALIGN alignas(8)

// Value ensures MaxLanes() is the tightest possible upper bound to reduce
// overallocation. HWY_MAX_BYTES is defined further below (per SVE variant);
// that is fine because it is only expanded when HWY_LANES is used.
#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))

#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#if HWY_SVE_HAVE_BF16_FEATURE
#define HWY_NATIVE_DOT_BF16 1
#else
#define HWY_NATIVE_DOT_BF16 0
#endif
#define HWY_NATIVE_MASK 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

// The final #else covers the remaining SVE target (namespace N_SVE).
#if HWY_TARGET == HWY_SVE2
#define HWY_NAMESPACE N_SVE2
#define HWY_MAX_BYTES 256
#define HWY_HAVE_SCALABLE 1
#elif HWY_TARGET == HWY_SVE_256
#define HWY_NAMESPACE N_SVE_256
#define HWY_MAX_BYTES 32
#define HWY_HAVE_SCALABLE 0
#elif HWY_TARGET == HWY_SVE2_128
#define HWY_NAMESPACE N_SVE2_128
#define HWY_MAX_BYTES 16
#define HWY_HAVE_SCALABLE 0
#else
#define HWY_NAMESPACE N_SVE
#define HWY_MAX_BYTES 256
#define HWY_HAVE_SCALABLE 1
#endif

#define HWY_TARGET_STR_I8MM "+i8mm"

// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
// Static dispatch with -march=armv8-a+sve2+aes, or no baseline, hence dynamic
// dispatch, which checks for AES support at runtime.
#if defined(__ARM_FEATURE_SVE2_AES) || (HWY_BASELINE_SVE2 == 0)
#define HWY_TARGET_STR "+sve2+sve2-aes,+sve" HWY_TARGET_STR_I8MM
#else  // SVE2 without AES
#define HWY_TARGET_STR "+sve2,+sve" HWY_TARGET_STR_I8MM
#endif
#else  // not SVE2 target
#define HWY_TARGET_STR "+sve"
#endif
#else  // !HWY_HAVE_RUNTIME_DISPATCH
// HWY_TARGET_STR remains undefined
#endif

//-----------------------------------------------------------------------------
// WASM
#elif HWY_TARGET == HWY_WASM

#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_WASM

#define HWY_TARGET_STR "simd128"

//-----------------------------------------------------------------------------
// WASM_EMU256
#elif HWY_TARGET == HWY_WASM_EMU256

#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 0
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_WASM_EMU256

#define HWY_TARGET_STR "simd128"

//-----------------------------------------------------------------------------
// RVV
#elif HWY_TARGET == HWY_RVV

// RVV only requires lane alignment, not natural alignment of the entire vector,
// and the compiler already aligns builtin types, so nothing to do here.
#define HWY_ALIGN

// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
#define HWY_MAX_BYTES 65536

// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
// LMUL. This is the tightest possible upper bound.
#define HWY_LANES(T) (8192 / sizeof(T))

#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_RVV_HAVE_F16_VEC
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#endif

#define HWY_NAMESPACE N_RVV

#if HWY_COMPILER_CLANG >= 1900
// https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#181-zvl-minimum-vector-length-standard-extensions
#define HWY_TARGET_STR "arch=+v"
#else
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
#endif

//-----------------------------------------------------------------------------
// LSX/LASX
#elif HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX

// HWY_TARGET_STR is only defined here when the matching __loongarch_sx /
// __loongarch_asx macro is not already defined; otherwise it remains undefined
// so HWY_ATTR is a no-op.
#if HWY_TARGET == HWY_LSX
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#ifndef __loongarch_sx
#define HWY_TARGET_STR "lsx"
#endif
#else
#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
#ifndef __loongarch_asx
#define HWY_TARGET_STR "lsx,lasx"
#endif
#endif

#define HWY_LANES(T) (HWY_MAX_BYTES / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 1
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 0

#if HWY_TARGET == HWY_LSX
#define HWY_CAP_GE256 0
#else
#define HWY_CAP_GE256 1
#endif

#define HWY_CAP_GE512 0

#if HWY_TARGET == HWY_LSX
#define HWY_NAMESPACE N_LSX
#else
#define HWY_NAMESPACE N_LASX
#endif

//-----------------------------------------------------------------------------
// EMU128
#elif HWY_TARGET == HWY_EMU128

#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_EMU128

// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.

//-----------------------------------------------------------------------------
// SCALAR
#elif HWY_TARGET == HWY_SCALAR

#define HWY_ALIGN
#define HWY_MAX_BYTES 8
#define HWY_LANES(T) 1

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 0
#define HWY_NATIVE_DOT_BF16 0
#define HWY_NATIVE_MASK 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_SCALAR

// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.

#else
#pragma message("HWY_TARGET does not match any known target")
#endif  // HWY_TARGET

//-----------------------------------------------------------------------------

// Sanity check: if we have f16 vector support, then base.h should also be
// using a built-in type for f16 scalars.
#if HWY_HAVE_FLOAT16 && !HWY_HAVE_SCALAR_F16_TYPE
#error "Logic error: f16 vectors but no scalars"
#endif

// Override this to 1 in asan/msan builds, which will still fault.
#if HWY_IS_ASAN || HWY_IS_MSAN
#undef HWY_MEM_OPS_MIGHT_FAULT
#define HWY_MEM_OPS_MIGHT_FAULT 1
#endif

// Clang <9 requires this be invoked at file scope, before any namespace.
#undef HWY_BEFORE_NAMESPACE
#if defined(HWY_TARGET_STR)
#define HWY_BEFORE_NAMESPACE() \
  HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
  static_assert(true, "For requiring trailing semicolon")
#else
// avoids compiler warning if no HWY_TARGET_STR
#define HWY_BEFORE_NAMESPACE() \
  static_assert(true, "For requiring trailing semicolon")
#endif

// Clang <9 requires any namespaces be closed before this macro.
865 #undef HWY_AFTER_NAMESPACE 866 #if defined(HWY_TARGET_STR) 867 #define HWY_AFTER_NAMESPACE() \ 868 HWY_POP_ATTRIBUTES \ 869 static_assert(true, "For requiring trailing semicolon") 870 #else 871 // avoids compiler warning if no HWY_TARGET_STR 872 #define HWY_AFTER_NAMESPACE() \ 873 static_assert(true, "For requiring trailing semicolon") 874 #endif 875 876 #undef HWY_ATTR 877 #if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target) 878 #define HWY_ATTR __attribute__((target(HWY_TARGET_STR))) 879 #else 880 #define HWY_ATTR 881 #endif