detect_targets.h (36182B)
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
#define HIGHWAY_HWY_DETECT_TARGETS_H_

// Defines targets and chooses which to enable.

#include "hwy/detect_compiler_arch.h"

//------------------------------------------------------------------------------
// Optional configuration

// See g3doc/quick_reference.md for documentation of these macros.

// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)

// Uncomment to override the default blocklist:
// #define HWY_BROKEN_TARGETS HWY_AVX3

// Uncomment to definitely avoid generating those target(s):
// #define HWY_DISABLED_TARGETS HWY_SSE4

// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
// #define HWY_DISABLE_BMI2_FMA

// Uncomment to enable these on MSVC even if the predefined macros are not set.
// #define HWY_WANT_SSE2 1
// #define HWY_WANT_SSSE3 1
// #define HWY_WANT_SSE4 1

//------------------------------------------------------------------------------
// Targets

// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
//
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
// The C99 preprocessor evaluates #if expressions using intmax_t types. This
// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
// avoid overflow when computing HWY_TARGETS (subtracting one instead of
// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.

// --------------------------- x86: 15 targets (+ one fallback)
// Bits 0..2 reserved (3 targets)
#define HWY_AVX10_2 (1LL << 3)  // AVX10.2 with 512-bit vectors
// Baseline detection (HWY_BASELINE_AVX3_SPR) requires HWY_AVX3_DL plus
// __AVX512BF16__ and __AVX512FP16__, or __AVX10_2__.
#define HWY_AVX3_SPR (1LL << 4)
// Bit 5: reserved (1 target)
// Currently `HWY_AVX3_DL` plus `AVX512BF16` and a special case for
// `CompressStore` (10x as fast, still useful on Zen5). We may later also use
// `VPCONFLICT`. Note that `VP2INTERSECT` is available in Zen5.
#define HWY_AVX3_ZEN4 (1LL << 6)  // see HWY_WANT_AVX3_ZEN4 below

// Currently satisfiable by Ice Lake (`VNNI`, `VPCLMULQDQ`, `VPOPCNTDQ`,
// `VBMI`, `VBMI2`, `VAES`, `BITALG`, `GFNI`).
#define HWY_AVX3_DL (1LL << 7)
#define HWY_AVX3 (1LL << 8)  // HWY_AVX2 plus AVX-512F/BW/CD/DQ/VL
#define HWY_AVX2 (1LL << 9)  // HWY_SSE4 plus BMI2 + F16C + FMA
// Bit 10: reserved
#define HWY_SSE4 (1LL << 11)   // SSE4.2 plus AES + CLMUL
#define HWY_SSSE3 (1LL << 12)  // S-SSE3
// Bit 13: reserved for SSE3
#define HWY_SSE2 (1LL << 14)
// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 14

// --------------------------- Arm: 15 targets (+ one fallback)
// Bits 15..17 reserved (3 targets)
#define HWY_SVE2_128 (1LL << 18)  // specialized (e.g. Neoverse V2/N2/N3)
#define HWY_SVE_256 (1LL << 19)   // specialized (Neoverse V1)
// Bits 20-22 reserved for later SVE (3 targets)
#define HWY_SVE2 (1LL << 23)
#define HWY_SVE (1LL << 24)
// Bit 25 reserved for NEON
#define HWY_NEON_BF16 (1LL << 26)  // fp16/dot/bf16/i8mm (e.g. Neoverse V2/N2)
// Bit 27 reserved for NEON
#define HWY_NEON (1LL << 28)  // Implies support for AES
#define HWY_NEON_WITHOUT_AES (1LL << 29)
#define HWY_HIGHEST_TARGET_BIT_ARM 29

// Convenience masks covering every NEON / SVE target defined above.
#define HWY_ALL_NEON (HWY_NEON_WITHOUT_AES | HWY_NEON | HWY_NEON_BF16)
#define HWY_ALL_SVE (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)

// --------------------------- RISC-V: 9 targets (+ one fallback)
// Bits 30..36 reserved (7 targets)
#define HWY_RVV (1LL << 37)
// Bit 38 reserved
#define HWY_HIGHEST_TARGET_BIT_RVV 38

// --------------------------- LoongArch: 3 targets (+ one fallback)
// Bit 39 reserved (1 target)
#define HWY_LASX (1LL << 40)
#define HWY_LSX (1LL << 41)
#define HWY_HIGHEST_TARGET_BIT_LOONGARCH 41

// --------------------------- Future expansion: 1 target
// Bit 42 reserved

// --------------------------- IBM Power/ZSeries: 9 targets (+ one fallback)
// Bits 43..46 reserved (4 targets)
#define HWY_PPC10 (1LL << 47)  // v3.1
#define HWY_PPC9 (1LL << 48)   // v3.0
#define HWY_PPC8 (1LL << 49)   // v2.07
#define HWY_Z15 (1LL << 50)    // Z15
#define HWY_Z14 (1LL << 51)    // Z14
#define HWY_HIGHEST_TARGET_BIT_PPC 51

// Convenience mask covering every PPC target (excludes the Z14/Z15 targets).
#define HWY_ALL_PPC (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)

// --------------------------- WebAssembly: 9 targets (+ one fallback)
// Bits 52..57 reserved (6 targets)
#define HWY_WASM_EMU256 (1LL << 58)  // Experimental
#define HWY_WASM (1LL << 59)
// Bit 60 reserved
#define HWY_HIGHEST_TARGET_BIT_WASM 60

// --------------------------- Emulation: 2 targets

#define HWY_EMU128 (1LL << 61)
// We do not add/left-shift, so this will not overflow to a negative number.
#define HWY_SCALAR (1LL << 62)
#define HWY_HIGHEST_TARGET_BIT_SCALAR 62

// Do not use bit 63 - would be confusing to have negative numbers.

//------------------------------------------------------------------------------
// Set default blocklists

// Disabled means excluded from enabled at user's request. A separate config
// macro allows disabling without deactivating the blocklist below.
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS 0
#endif

// Broken means excluded from enabled due to known compiler issues. We define
// separate HWY_BROKEN_* and then OR them together (more than one might apply).
//
// Several blocklists below use the idiom (T | (T - 1)): because each target is
// a single bit, this selects T and every lower bit, i.e. T plus all targets
// with a lower ("better") bit value.

#ifndef HWY_BROKEN_CLANG6  // allow override
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
// SSE4 codegen (possibly only for msan), so disable all those targets.
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_BROKEN_CLANG6 (HWY_SSE4 | (HWY_SSE4 - 1))
// This entails a major speed reduction, so warn unless the user explicitly
// opts in to scalar-only.
#if !defined(HWY_COMPILE_ONLY_SCALAR)
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
#endif

#else
#define HWY_BROKEN_CLANG6 0
#endif
#endif  // HWY_BROKEN_CLANG6

#ifndef HWY_BROKEN_32BIT  // allow override
// 32-bit may fail to compile AVX2/3.
#if HWY_ARCH_X86_32
// GCC-13 is ok with AVX2, so only AVX3 and better are blocklisted there;
// otherwise AVX2 and better are blocklisted.
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1300)
#define HWY_BROKEN_32BIT (HWY_AVX3 | (HWY_AVX3 - 1))
#else
#define HWY_BROKEN_32BIT (HWY_AVX2 | (HWY_AVX2 - 1))
#endif
#else
#define HWY_BROKEN_32BIT 0
#endif
#endif  // HWY_BROKEN_32BIT

#ifndef HWY_BROKEN_MSVC  // allow override
// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
#if HWY_COMPILER_MSVC != 0
#define HWY_BROKEN_MSVC (HWY_AVX3 | (HWY_AVX3 - 1))
#else
#define HWY_BROKEN_MSVC 0
#endif
#endif  // HWY_BROKEN_MSVC

#ifndef HWY_BROKEN_AVX10_2  // allow override
// AVX10_2 requires clang >= 20.1 (postpone to 23 due to "avx10.2-512" remnant,
// only removed in https://github.com/llvm/llvm-project/pull/157034) or
// gcc >= 15.2 with binutils 2.44.
// NOTE(review): both comparisons use the raw macro values without a `!= 0`
// guard, so a compiler for which both macros are 0 (presumably anything that
// is neither Clang nor GCC) also gets AVX10_2 blocklisted - confirm intended.
#if (HWY_COMPILER_CLANG < 2300) && (HWY_COMPILER_GCC_ACTUAL < 1502)
#define HWY_BROKEN_AVX10_2 HWY_AVX10_2
#else
#define HWY_BROKEN_AVX10_2 0
#endif
#endif  // HWY_BROKEN_AVX10_2

#ifndef HWY_BROKEN_AVX3_DL_ZEN4  // allow override
// AVX3_DL and AVX3_ZEN4 require clang >= 7 (ensured above), gcc >= 8.1 or ICC
// 2021.
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 801) || \
    (HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
#define HWY_BROKEN_AVX3_DL_ZEN4 (HWY_AVX3_DL | HWY_AVX3_ZEN4)
#else
#define HWY_BROKEN_AVX3_DL_ZEN4 0
#endif
#endif  // HWY_BROKEN_AVX3_DL_ZEN4

#ifndef HWY_BROKEN_AVX3_SPR  // allow override
// AVX3_SPR requires clang >= 14, gcc >= 12, or ICC 2021.
#if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1400) ||       \
    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200) || \
    (HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
#define HWY_BROKEN_AVX3_SPR (HWY_AVX3_SPR)
#else
#define HWY_BROKEN_AVX3_SPR 0
#endif
#endif  // HWY_BROKEN_AVX3_SPR

#ifndef HWY_BROKEN_ARM7_BIG_ENDIAN  // allow override
// armv7be has not been tested and is not yet supported.
#if HWY_ARCH_ARM_V7 && HWY_IS_BIG_ENDIAN
#define HWY_BROKEN_ARM7_BIG_ENDIAN HWY_ALL_NEON
#else
#define HWY_BROKEN_ARM7_BIG_ENDIAN 0
#endif
#endif  // HWY_BROKEN_ARM7_BIG_ENDIAN

// Mirror of the compiler-provided __ARM_NEON_FP, or 0 if it is not defined, so
// that it can be used in the bitwise test below.
#ifdef __ARM_NEON_FP
#define HWY_HAVE_NEON_FP __ARM_NEON_FP
#else
#define HWY_HAVE_NEON_FP 0
#endif

#ifndef HWY_BROKEN_ARM7_WITHOUT_VFP4  // allow override
// armv7-a without a detected vfpv4 is not supported
// (for example Cortex-A8, Cortex-A9)
// vfpv4 always have neon half-float _and_ FMA.
#if HWY_ARCH_ARM_V7 && (__ARM_ARCH_PROFILE == 'A') && \
    !defined(__ARM_VFPV4__) &&                        \
    !((HWY_HAVE_NEON_FP & 0x2 /* half-float */) && (__ARM_FEATURE_FMA == 1))
#define HWY_BROKEN_ARM7_WITHOUT_VFP4 HWY_ALL_NEON
#else
#define HWY_BROKEN_ARM7_WITHOUT_VFP4 0
#endif
#endif  // HWY_BROKEN_ARM7_WITHOUT_VFP4

#ifndef HWY_BROKEN_NEON_BF16  // allow override
// Broken on older compilers:
#if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1700) ||           \
    (HWY_COMPILER_GCC_ACTUAL != 0 && HWY_COMPILER_GCC_ACTUAL < 1302) || \
    (defined(__apple_build_version__) && __apple_build_version__ <= 17000000)
#define HWY_BROKEN_NEON_BF16 (HWY_NEON_BF16)
#else
#define HWY_BROKEN_NEON_BF16 0
#endif
#endif  // HWY_BROKEN_NEON_BF16

// SVE[2] require recent clang or gcc versions.

#ifndef HWY_BROKEN_SVE  // allow override
// GCC 10+. Clang 22 still has test failures for SVE, including MSAN. No Apple
// CPU (at least up to and including M4 and A18) has SVE.
#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2300) ||           \
    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
    HWY_OS_APPLE
#define HWY_BROKEN_SVE (HWY_SVE | HWY_SVE_256)
#else
#define HWY_BROKEN_SVE 0
#endif
#endif  // HWY_BROKEN_SVE

#ifndef HWY_BROKEN_SVE2  // allow override
// Clang 21 still has test failures for SVE2, including MSAN.
// NOTE(review): the check below blocklists Clang < 23, the same threshold as
// HWY_BROKEN_SVE whose comment mentions Clang 22 - confirm whether this
// comment should also say 22.
#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2300) ||           \
    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
    HWY_OS_APPLE
#define HWY_BROKEN_SVE2 (HWY_SVE2 | HWY_SVE2_128)
#else
#define HWY_BROKEN_SVE2 0
#endif
#endif  // HWY_BROKEN_SVE2

#ifndef HWY_BROKEN_PPC10  // allow override
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1100)
// GCC 10 supports the -mcpu=power10 option but does not support the PPC10
// vector intrinsics
#define HWY_BROKEN_PPC10 (HWY_PPC10)
#elif HWY_ARCH_PPC && HWY_IS_BIG_ENDIAN &&                                  \
    ((HWY_COMPILER3_CLANG && HWY_COMPILER3_CLANG < 160001) ||               \
     (HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_COMPILER_GCC_ACTUAL <= 1203) || \
     (HWY_COMPILER_GCC_ACTUAL >= 1300 && HWY_COMPILER_GCC_ACTUAL <= 1301))
// GCC 12.0 through 12.3 and GCC 13.0 through 13.1 have a compiler bug where the
// vsldoi instruction is sometimes incorrectly optimized out (and this causes
// some of the Highway unit tests to fail on big-endian PPC10). Details about
// this compiler bug can be found at
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069, and this bug will be
// fixed in the upcoming GCC 12.4 and 13.2 releases.

// Clang 16.0.0 and earlier (but not Clang 16.0.1 and later) have a compiler
// bug in the LLVM DAGCombiner that causes a zero-extend followed by an
// element insert into a vector, followed by a vector shuffle to be incorrectly
// optimized on big-endian PPC (and which caused some of the Highway unit tests
// to fail on big-endian PPC10).

// Details about this bug, which has already been fixed in Clang 16.0.1 and
// later, can be found at https://github.com/llvm/llvm-project/issues/61315.
#define HWY_BROKEN_PPC10 (HWY_PPC10)
#else
#define HWY_BROKEN_PPC10 0
#endif
#endif  // HWY_BROKEN_PPC10

#ifndef HWY_BROKEN_PPC_32BIT  // allow override
// PPC8/PPC9/PPC10 targets may fail to compile on 32-bit PowerPC
#if HWY_ARCH_PPC && !HWY_ARCH_PPC_64
#define HWY_BROKEN_PPC_32BIT (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)
#else
#define HWY_BROKEN_PPC_32BIT 0
#endif
#endif  // HWY_BROKEN_PPC_32BIT

#ifndef HWY_BROKEN_RVV  // allow override
// HWY_RVV fails to compile with GCC < 13 or Clang < 16.
#if HWY_ARCH_RISCV &&                                     \
    ((HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600) || \
     (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1300))
#define HWY_BROKEN_RVV (HWY_RVV)
#else
#define HWY_BROKEN_RVV 0
#endif
#endif  // HWY_BROKEN_RVV

#ifndef HWY_BROKEN_LOONGARCH  // allow override
// Using __loongarch_sx and __loongarch_asx macros to
// check whether LSX/LASX targets are available.
// GCC does not work yet, see https://gcc.gnu.org/PR121875.
// With Clang >= 18 neither of the first two branches is taken, so nothing is
// blocklisted here regardless of the __loongarch_* macros.
#if !defined(__loongarch_sx) && \
    !(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1800)
#define HWY_BROKEN_LOONGARCH (HWY_LSX | HWY_LASX)
#elif !defined(__loongarch_asx) && \
    !(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1800)
#define HWY_BROKEN_LOONGARCH (HWY_LASX)
#else
#define HWY_BROKEN_LOONGARCH 0
#endif
#endif  // HWY_BROKEN_LOONGARCH

#ifndef HWY_BROKEN_Z14  // allow override
#if HWY_ARCH_S390X
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1900
// Clang 18 and earlier have bugs with some ZVector intrinsics
#define HWY_BROKEN_Z14 (HWY_Z14 | HWY_Z15)
#elif HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
// Z15 target requires GCC 9 or later
#define HWY_BROKEN_Z14 (HWY_Z15)
#else
#define HWY_BROKEN_Z14 0
#endif
#else  // !HWY_ARCH_S390X
#define HWY_BROKEN_Z14 0
#endif  // HWY_ARCH_S390X
#endif  // HWY_BROKEN_Z14

// Allow the user to override this without any guarantee of success.
#ifndef HWY_BROKEN_TARGETS

// Union of all per-issue blocklists defined above.
#define HWY_BROKEN_TARGETS                                               \
  (HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC |              \
   HWY_BROKEN_AVX10_2 | HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR |  \
   HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 |           \
   HWY_BROKEN_NEON_BF16 | HWY_BROKEN_SVE | HWY_BROKEN_SVE2 |             \
   HWY_BROKEN_PPC10 | HWY_BROKEN_PPC_32BIT | HWY_BROKEN_RVV |            \
   HWY_BROKEN_LOONGARCH | HWY_BROKEN_Z14)

#endif  // HWY_BROKEN_TARGETS

// Enabled means not disabled nor blocklisted.
// Evaluates to the subset of `targets` whose bits are in neither
// HWY_DISABLED_TARGETS nor HWY_BROKEN_TARGETS.
#define HWY_ENABLED(targets) \
  ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))

// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). An issue still
// remains with 13.2, see #1683. This is separate from HWY_BROKEN_TARGETS
// because it affects the fallback target, which must always be enabled. If 1,
// we instead choose HWY_SCALAR even without HWY_COMPILE_ONLY_SCALAR being set.
// The check below applies to every GCC version before 16, and to any build
// that defines HWY_NO_LIBCXX.
#if !defined(HWY_BROKEN_EMU128)  // allow overriding
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1600) || \
    defined(HWY_NO_LIBCXX)
#define HWY_BROKEN_EMU128 1
#else
#define HWY_BROKEN_EMU128 0
#endif
#endif  // HWY_BROKEN_EMU128

//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros

// Baseline means the targets for which the compiler is allowed to generate
// instructions, implying the target CPU would have to support them. This does
// not take the blocklist into account.

// Fallback target: HWY_SCALAR if requested or if EMU128 is considered broken,
// otherwise HWY_EMU128.
#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
#define HWY_BASELINE_SCALAR HWY_SCALAR
#else
#define HWY_BASELINE_SCALAR HWY_EMU128
#endif

// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_BASELINE_SCALAR.

#if HWY_ARCH_WASM && defined(__wasm_simd128__)
#if defined(HWY_WANT_WASM2)
#define HWY_BASELINE_WASM HWY_WASM_EMU256
#else
#define HWY_BASELINE_WASM HWY_WASM
#endif  // HWY_WANT_WASM2
#else
#define HWY_BASELINE_WASM 0
#endif

// GCC or Clang.
// PPC8 baseline: requires AltiVec, VSX and POWER8 vector support, plus crypto
// unless the user defined HWY_DISABLE_PPC8_CRYPTO.
#if HWY_ARCH_PPC && HWY_COMPILER_GCC && defined(__ALTIVEC__) && \
    defined(__VSX__) && defined(__POWER8_VECTOR__) &&           \
    (defined(__CRYPTO__) || defined(HWY_DISABLE_PPC8_CRYPTO))
#define HWY_BASELINE_PPC8 HWY_PPC8
#else
#define HWY_BASELINE_PPC8 0
#endif

// The PPC baselines are chained: PPC9 requires PPC8, and PPC10 requires PPC9.
#if HWY_BASELINE_PPC8 != 0 && defined(__POWER9_VECTOR__)
#define HWY_BASELINE_PPC9 HWY_PPC9
#else
#define HWY_BASELINE_PPC9 0
#endif

#if HWY_BASELINE_PPC9 != 0 && \
    (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
#define HWY_BASELINE_PPC10 HWY_PPC10
#else
#define HWY_BASELINE_PPC10 0
#endif

// Z14 requires __ARCH__ >= 12; Z15 additionally requires __ARCH__ >= 13.
#if HWY_ARCH_S390X && defined(__VEC__) && defined(__ARCH__) && __ARCH__ >= 12
#define HWY_BASELINE_Z14 HWY_Z14
#else
#define HWY_BASELINE_Z14 0
#endif

#if HWY_BASELINE_Z14 && __ARCH__ >= 13
#define HWY_BASELINE_Z15 HWY_Z15
#else
#define HWY_BASELINE_Z15 0
#endif

// Defaults for non-Arm builds; each is #undef'd and re-defined inside the
// HWY_ARCH_ARM section below when the corresponding feature is detected.
#define HWY_BASELINE_SVE2 0
#define HWY_BASELINE_SVE 0
#define HWY_BASELINE_NEON 0

#if HWY_ARCH_ARM

// Also check compiler version as done for HWY_ATTAINABLE_SVE2 because the
// static target (influenced here) must be one of the attainable targets.
#if defined(__ARM_FEATURE_SVE2) && \
    (HWY_COMPILER_CLANG >= 1400 || HWY_COMPILER_GCC_ACTUAL >= 1200)
#undef HWY_BASELINE_SVE2  // was 0, will be re-defined
// If user specified -msve-vector-bits=128, they assert the vector length is
// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
#define HWY_BASELINE_SVE2 HWY_SVE2_128
// Otherwise we're not sure what the vector length will be. The baseline must be
// unconditionally valid, so we can only assume HWY_SVE2. However, when running
// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
#else
#define HWY_BASELINE_SVE2 HWY_SVE2
#endif  // __ARM_FEATURE_SVE_BITS
#endif  // __ARM_FEATURE_SVE2

#if defined(__ARM_FEATURE_SVE) && \
    (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)
#undef HWY_BASELINE_SVE  // was 0, will be re-defined
// See above. If user-specified vector length matches our optimization, use it.
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
#define HWY_BASELINE_SVE HWY_SVE_256
#else
#define HWY_BASELINE_SVE HWY_SVE
#endif  // __ARM_FEATURE_SVE_BITS
#endif  // __ARM_FEATURE_SVE

// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#undef HWY_BASELINE_NEON  // was 0, will be re-defined
// Three tiers: all NEON targets when AES, FP16, dot-product, BF16 and int8
// matmul are all available; NEON with AES when only AES is available;
// otherwise only NEON without AES.
#if defined(__ARM_FEATURE_AES) &&                    \
    defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && \
    defined(__ARM_FEATURE_DOTPROD) &&                \
    defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
    defined(__ARM_FEATURE_MATMUL_INT8)
#define HWY_BASELINE_NEON HWY_ALL_NEON
#elif defined(__ARM_FEATURE_AES)
#define HWY_BASELINE_NEON (HWY_NEON_WITHOUT_AES | HWY_NEON)
#else
#define HWY_BASELINE_NEON (HWY_NEON_WITHOUT_AES)
#endif  // __ARM_FEATURE*
#endif  // __ARM_NEON

#endif  // HWY_ARCH_ARM

// Special handling for MSVC because it has fewer predefined macros:
#if HWY_COMPILER_MSVC

// SSE2: on 32-bit x86 it depends on _M_IX86_FP >= 2; on x86-64 it is always
// assumed; on any other architecture it is 0.
#if HWY_ARCH_X86_32
#if _M_IX86_FP >= 2
#define HWY_CHECK_SSE2 1
#else
#define HWY_CHECK_SSE2 0
#endif
#elif HWY_ARCH_X86_64
#define HWY_CHECK_SSE2 1
#else
#define HWY_CHECK_SSE2 0
#endif

// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
//    https://stackoverflow.com/questions/18563978/.
#if defined(__AVX__)
#define HWY_CHECK_SSSE3 1
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSSE3 0
#define HWY_CHECK_SSE4 0
#endif

// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
//    PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
#define HWY_CHECK_PCLMUL_AES 1
#define HWY_CHECK_BMI2_FMA 1
#define HWY_CHECK_F16C 1

#else  // non-MSVC

#if defined(__SSE2__)
#define HWY_CHECK_SSE2 1
#else
#define HWY_CHECK_SSE2 0
#endif

#if defined(__SSSE3__)
#define HWY_CHECK_SSSE3 1
#else
#define HWY_CHECK_SSSE3 0
#endif

#if defined(__SSE4_1__) && defined(__SSE4_2__)
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSE4 0
#endif

// If these are disabled, they should not gate the availability of SSE4/AVX2.
// Hence each HWY_CHECK_* below is also 1 when the matching HWY_DISABLE_* macro
// is defined.
#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
#define HWY_CHECK_PCLMUL_AES 1
#else
#define HWY_CHECK_PCLMUL_AES 0
#endif

#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
#define HWY_CHECK_BMI2_FMA 1
#else
#define HWY_CHECK_BMI2_FMA 0
#endif

#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
#define HWY_CHECK_F16C 1
#else
#define HWY_CHECK_F16C 0
#endif

#endif  // non-MSVC

// For SSE2/SSSE3/SSE4, a nonzero HWY_WANT_* from the user is accepted in place
// of the corresponding HWY_CHECK_* result.
#if HWY_ARCH_X86 && \
    ((defined(HWY_WANT_SSE2) && HWY_WANT_SSE2) || HWY_CHECK_SSE2)
#define HWY_BASELINE_SSE2 HWY_SSE2
#else
#define HWY_BASELINE_SSE2 0
#endif

#if HWY_ARCH_X86 && \
    ((defined(HWY_WANT_SSSE3) && HWY_WANT_SSSE3) || HWY_CHECK_SSSE3)
#define HWY_BASELINE_SSSE3 HWY_SSSE3
#else
#define HWY_BASELINE_SSSE3 0
#endif

#if HWY_ARCH_X86 && ((defined(HWY_WANT_SSE4) && HWY_WANT_SSE4) || \
                     (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
#endif

// AVX2 baseline: requires the SSE4 baseline, the BMI2/FMA and F16C checks
// (each of which also passes when explicitly disabled by the user), and the
// compiler's __AVX2__ macro.
#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
    defined(__AVX2__)
#define HWY_BASELINE_AVX2 HWY_AVX2
#else
#define HWY_BASELINE_AVX2 0
#endif

// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
// NOTE(review): the last clause ORs `HWY_COMPILER_GCC_ACTUAL < 1400` with
// `HWY_COMPILER_CLANG < 1800`. If each macro is 0 when the respective compiler
// is not in use (as the `!= 0` guards elsewhere in this file suggest), one of
// the two comparisons always holds, so __EVEX512__ would never actually gate
// this target - confirm whether that is intended.
#if HWY_BASELINE_AVX2 != 0 &&                                       \
    ((defined(__AVX512F__) && defined(__AVX512BW__) &&              \
      defined(__AVX512DQ__) && defined(__AVX512VL__)) ||            \
     defined(__AVX10_2__)) &&                                       \
    ((!HWY_COMPILER_GCC_ACTUAL && !HWY_COMPILER_CLANG) ||           \
     HWY_COMPILER_GCC_ACTUAL < 1400 || HWY_COMPILER_CLANG < 1800 || \
     defined(__EVEX512__))
#define HWY_BASELINE_AVX3 HWY_AVX3
#else
#define HWY_BASELINE_AVX3 0
#endif

// TODO(janwas): not yet known whether these will be set by MSVC
// AVX3_DL baseline: AVX3 plus either the full set of feature macros below, or
// __AVX10_2__.
#if HWY_BASELINE_AVX3 != 0 &&                                   \
    ((defined(__AVX512VNNI__) && defined(__VAES__) &&           \
      defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) &&     \
      defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
      defined(__AVX512BITALG__)) ||                             \
     defined(__AVX10_2__))
#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_BASELINE_AVX3_DL 0
#endif

// The ZEN4-optimized AVX3 target is numerically lower than AVX3_DL and is thus
// considered better. Do not enable it unless the user explicitly requests it -
// we do not want to choose the ZEN4 path on Intel because it could be slower.
// Opt-in only: requires HWY_WANT_AVX3_ZEN4 in addition to the AVX3_DL baseline.
#if defined(HWY_WANT_AVX3_ZEN4) && HWY_BASELINE_AVX3_DL != 0
#define HWY_BASELINE_AVX3_ZEN4 HWY_AVX3_ZEN4
#else
#define HWY_BASELINE_AVX3_ZEN4 0
#endif

// AVX3_SPR baseline: AVX3_DL plus either both BF16 and FP16, or __AVX10_2__.
#if HWY_BASELINE_AVX3_DL != 0 &&                               \
    ((defined(__AVX512BF16__) && defined(__AVX512FP16__)) || \
     defined(__AVX10_2__))
#define HWY_BASELINE_AVX3_SPR HWY_AVX3_SPR
#else
#define HWY_BASELINE_AVX3_SPR 0
#endif

#if HWY_BASELINE_AVX3_SPR != 0 && defined(__AVX10_2__)
#define HWY_BASELINE_AVX10_2 HWY_AVX10_2
#else
#define HWY_BASELINE_AVX10_2 0
#endif

// RVV requires intrinsics 0.11 or later, see #1156.

// Also check that the __riscv_v macro is defined as GCC or Clang will define
// the __riscv_v macro if the RISC-V "V" extension is enabled.

#if HWY_ARCH_RISCV && defined(__riscv_v) && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
#define HWY_BASELINE_RVV HWY_RVV
#else
#define HWY_BASELINE_RVV 0
#endif

// LoongArch baseline: both LSX and LASX when both feature macros are defined,
// only LSX when just __loongarch_sx is defined, otherwise none.
#if HWY_ARCH_LOONGARCH && defined(__loongarch_sx) && defined(__loongarch_asx)
#define HWY_BASELINE_LOONGARCH (HWY_LSX | HWY_LASX)
#elif HWY_ARCH_LOONGARCH && defined(__loongarch_sx)
#define HWY_BASELINE_LOONGARCH (HWY_LSX)
#else
#define HWY_BASELINE_LOONGARCH 0
#endif

// Workaround for libaom, which unconditionally defines HWY_BASELINE_TARGETS
// even when that would be disabled/broken. If so, at least use AVX2.
// Only the exact value HWY_AVX3_DL is rewritten; any other user-provided
// HWY_BASELINE_TARGETS is left as is.
#if defined(HWY_BASELINE_TARGETS)
#if HWY_BASELINE_TARGETS == HWY_AVX3_DL && \
    ((HWY_BROKEN_TARGETS | HWY_DISABLED_TARGETS) & HWY_AVX3_DL)
#undef HWY_BASELINE_TARGETS
#define HWY_BASELINE_TARGETS HWY_AVX2
#endif
#endif  // HWY_BASELINE_TARGETS

// Allow the user to override this without any guarantee of success. If the
// compiler invocation considers that target to be broken/disabled, then
// `HWY_ENABLED_BASELINE` will be 0 and users will have to check for that and
// skip their code.
701 #ifndef HWY_BASELINE_TARGETS 702 #define HWY_BASELINE_TARGETS \ 703 (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \ 704 HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10 | HWY_BASELINE_Z14 | \ 705 HWY_BASELINE_Z15 | HWY_BASELINE_SVE2 | HWY_BASELINE_SVE | \ 706 HWY_BASELINE_NEON | HWY_BASELINE_SSE2 | HWY_BASELINE_SSSE3 | \ 707 HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \ 708 HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3_ZEN4 | HWY_BASELINE_AVX3_SPR | \ 709 HWY_BASELINE_AVX10_2 | HWY_BASELINE_RVV | HWY_BASELINE_LOONGARCH) 710 #endif // HWY_BASELINE_TARGETS 711 712 //------------------------------------------------------------------------------ 713 // Choose target for static dispatch 714 715 #define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS) 716 #if HWY_ENABLED_BASELINE == 0 717 #pragma message \ 718 "All baseline targets are disabled or considered broken." \ 719 "This is typically due to very restrictive HWY_BASELINE_TARGETS, or " \ 720 "too expansive HWY_BROKEN_TARGETS or HWY_DISABLED_TAREGTS. User code " \ 721 "must also check for this and skip any usage of SIMD." 722 #endif 723 724 // Best baseline, used for static dispatch. This is the least-significant 1-bit 725 // within HWY_ENABLED_BASELINE and lower bit values imply "better". 726 #define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE) 727 728 // Start by assuming static dispatch. If we later use dynamic dispatch, this 729 // will be defined to other targets during the multiple-inclusion, and finally 730 // return to the initial value. Defining this outside begin/end_target ensures 731 // inl headers successfully compile by themselves (required by Bazel). 
#define HWY_TARGET HWY_STATIC_TARGET

//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies

// Each defined() evaluates to 0 or 1, so the sum counts how many of the
// mutually exclusive HWY_COMPILE_ONLY_* policies were requested.
#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
         defined(HWY_COMPILE_ONLY_STATIC))
#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
#endif
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.

// 1 if <asm/hwcap.h> is available, else 0.
#ifndef HWY_HAVE_ASM_HWCAP  // allow override
#ifdef TOOLCHAIN_MISS_ASM_HWCAP_H
#define HWY_HAVE_ASM_HWCAP 0  // CMake failed to find the header
#elif defined(__has_include)  // note: wrapper macro fails on Clang ~17
// clang-format off
#if __has_include(<asm/hwcap.h>)
// clang-format on
#define HWY_HAVE_ASM_HWCAP 1  // header present
#else
#define HWY_HAVE_ASM_HWCAP 0  // header not present
#endif  // __has_include
#else   // compiler lacks __has_include
#define HWY_HAVE_ASM_HWCAP 0
#endif
#endif  // HWY_HAVE_ASM_HWCAP

// 1 if <sys/auxv.h> is available, else 0.
#ifndef HWY_HAVE_AUXV  // allow override
#ifdef TOOLCHAIN_MISS_SYS_AUXV_H
#define HWY_HAVE_AUXV 0  // CMake failed to find the header
// glibc 2.16 added auxv, but checking for that requires features.h, and we do
// not want to include system headers here. Instead check for the header
// directly, which has been supported at least since GCC 5.4 and Clang 3.
#elif defined(__has_include)  // note: wrapper macro fails on Clang ~17
// clang-format off
#if __has_include(<sys/auxv.h>)
// clang-format on
#define HWY_HAVE_AUXV 1  // header present
#else
#define HWY_HAVE_AUXV 0  // header not present
#endif  // __has_include
#else   // compiler lacks __has_include
#define HWY_HAVE_AUXV 0
#endif
#endif  // HWY_HAVE_AUXV

#ifndef HWY_HAVE_RUNTIME_DISPATCH_RVV  // allow override
// The riscv_vector.h in Clang 16-18 requires compiler flags, and 19 still has
// some missing intrinsics, see
// https://github.com/llvm/llvm-project/issues/56592. GCC 13.3 also has an
// #error check, whereas 14.1 fails with "argument type 'vuint16m8_t' requires
// the V ISA extension": https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115325.
// The trailing `&& 0` makes this condition always false, so the default is 0
// unless the user overrides HWY_HAVE_RUNTIME_DISPATCH_RVV.
#if HWY_ARCH_RISCV && HWY_COMPILER_CLANG >= 1900 && 0
#define HWY_HAVE_RUNTIME_DISPATCH_RVV 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH_RVV 0
#endif
#endif  // HWY_HAVE_RUNTIME_DISPATCH_RVV

#ifndef HWY_HAVE_RUNTIME_DISPATCH_APPLE  // allow override
#if HWY_ARCH_ARM_A64 && HWY_OS_APPLE && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1700)
#define HWY_HAVE_RUNTIME_DISPATCH_APPLE 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH_APPLE 0
#endif
#endif  // HWY_HAVE_RUNTIME_DISPATCH_APPLE

#ifndef HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH  // allow override
// Requires auxv and Clang >= 18, and only applies when __loongarch_asx is not
// already defined.
#if HWY_ARCH_LOONGARCH && HWY_HAVE_AUXV && !defined(__loongarch_asx) && \
    HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1800
#define HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH 0
#endif
#endif  // HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH

#ifndef HWY_HAVE_RUNTIME_DISPATCH_LINUX  // allow override
#if (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X) && HWY_OS_LINUX && \
    (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1700) && HWY_HAVE_AUXV
#define HWY_HAVE_RUNTIME_DISPATCH_LINUX 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH_LINUX 0
#endif
#endif  // HWY_HAVE_RUNTIME_DISPATCH_LINUX

// Allow opting out, and without a guarantee of success, opting-in.
#ifndef HWY_HAVE_RUNTIME_DISPATCH
// Clang, GCC and MSVC allow OS-independent runtime dispatch on x86.
#if HWY_ARCH_X86 || HWY_HAVE_RUNTIME_DISPATCH_RVV ||                          \
    HWY_HAVE_RUNTIME_DISPATCH_APPLE || HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH || \
    HWY_HAVE_RUNTIME_DISPATCH_LINUX
#define HWY_HAVE_RUNTIME_DISPATCH 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH 0
#endif
#endif  // HWY_HAVE_RUNTIME_DISPATCH

// Per-platform attainable masks. On A64 with runtime dispatch, all NEON
// targets are attainable; otherwise on Arm only the NEON baseline is.
#if HWY_ARCH_ARM_A64 && HWY_HAVE_RUNTIME_DISPATCH
#define HWY_ATTAINABLE_NEON HWY_ALL_NEON
#elif HWY_ARCH_ARM  // static dispatch, or HWY_ARCH_ARM_V7
#define HWY_ATTAINABLE_NEON (HWY_BASELINE_NEON)
#else
#define HWY_ATTAINABLE_NEON 0
#endif

// SVE/SVE2 are attainable on A64 with a sufficiently recent compiler if either
// runtime dispatch is available or the enabled baseline already includes them.
#if HWY_ARCH_ARM_A64 &&                                              \
    (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800) && \
    (HWY_HAVE_RUNTIME_DISPATCH ||                                    \
     (HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
#define HWY_ATTAINABLE_SVE (HWY_SVE | HWY_SVE_256)
#else
#define HWY_ATTAINABLE_SVE 0
#endif

#if HWY_ARCH_ARM_A64 &&                                                \
    (HWY_COMPILER_CLANG >= 1400 || HWY_COMPILER_GCC_ACTUAL >= 1200) && \
    (HWY_HAVE_RUNTIME_DISPATCH ||                                      \
     (HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
#define HWY_ATTAINABLE_SVE2 (HWY_SVE2 | HWY_SVE2_128)
#else
#define HWY_ATTAINABLE_SVE2 0
#endif

#if HWY_ARCH_PPC && defined(__ALTIVEC__) && \
    (!HWY_COMPILER_CLANG || HWY_BASELINE_PPC8 != 0)

#if (HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10) && \
    !defined(HWY_SKIP_NON_BEST_BASELINE)
// On POWER with -m flags, we get compile errors (#1707) for targets older than
// the baseline specified via -m, so only generate the static target and better.
// Note that some Linux distros actually do set POWER9 as the baseline.
// This works by skipping case 3 below, so case 4 is reached.
#define HWY_SKIP_NON_BEST_BASELINE
#endif

#define HWY_ATTAINABLE_PPC (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)

#else
#define HWY_ATTAINABLE_PPC 0
#endif

#if HWY_ARCH_S390X && HWY_BASELINE_Z14 != 0
#define HWY_ATTAINABLE_S390X (HWY_Z14 | HWY_Z15)
#else
#define HWY_ATTAINABLE_S390X 0
#endif

// For RISC-V and LoongArch, fall back to the baseline mask when runtime
// dispatch is unavailable.
#if HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
#define HWY_ATTAINABLE_RISCV HWY_RVV
#else
#define HWY_ATTAINABLE_RISCV HWY_BASELINE_RVV
#endif

#if HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH
#define HWY_ATTAINABLE_LOONGARCH (HWY_LSX | HWY_LASX)
#else
#define HWY_ATTAINABLE_LOONGARCH HWY_BASELINE_LOONGARCH
#endif

#ifndef HWY_ATTAINABLE_TARGETS_X86  // allow override
#if HWY_COMPILER_MSVC && defined(HWY_SLOW_MSVC)
// Fewer targets for faster builds.
#define HWY_ATTAINABLE_TARGETS_X86 \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_STATIC_TARGET | HWY_AVX2)
#else  // !(HWY_COMPILER_MSVC && defined(HWY_SLOW_MSVC))
#define HWY_ATTAINABLE_TARGETS_X86                                    \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | \
              HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL | HWY_AVX3_ZEN4 |     \
              HWY_AVX3_SPR | HWY_AVX10_2)
#endif  // !(HWY_COMPILER_MSVC && defined(HWY_SLOW_MSVC))
#endif  // HWY_ATTAINABLE_TARGETS_X86

// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to auto-vectorize). Used in 3 and 4.
// HWY_ATTAINABLE_TARGETS: mask of attainable targets for the architecture
// being compiled. x86 uses HWY_ATTAINABLE_TARGETS_X86 directly. Every other
// known architecture ORs HWY_BASELINE_SCALAR with its HWY_ATTAINABLE_* mask(s)
// and passes the result through HWY_ENABLED. Unrecognized architectures fall
// back to HWY_ENABLED_BASELINE.
#if HWY_ARCH_X86
#define HWY_ATTAINABLE_TARGETS HWY_ATTAINABLE_TARGETS_X86
#elif HWY_ARCH_ARM
#define HWY_ATTAINABLE_TARGETS                                                 \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_NEON | HWY_ATTAINABLE_SVE | \
              HWY_ATTAINABLE_SVE2)
#elif HWY_ARCH_PPC
#define HWY_ATTAINABLE_TARGETS \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_PPC)
#elif HWY_ARCH_S390X
#define HWY_ATTAINABLE_TARGETS \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_S390X)
#elif HWY_ARCH_RISCV
#define HWY_ATTAINABLE_TARGETS \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_RISCV)
#elif HWY_ARCH_LOONGARCH
#define HWY_ATTAINABLE_TARGETS \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_LOONGARCH)
#else
#define HWY_ATTAINABLE_TARGETS (HWY_ENABLED_BASELINE)
#endif  // HWY_ARCH_*

// The chain below defines HWY_TARGETS. Its branches are tested in order, so
// the first matching policy wins; policies 1 and 1b also replace
// HWY_STATIC_TARGET.

// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
// Only taken if HWY_BROKEN_EMU128 is zero; otherwise 1b applies.
#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_EMU128  // override baseline
#define HWY_TARGETS HWY_EMU128

// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
// we currently still support it for backwards compatibility. This branch is
// also the fallback when HWY_COMPILE_ONLY_EMU128 is requested but
// HWY_BROKEN_EMU128 is non-zero.
#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
    (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR  // override baseline
#define HWY_TARGETS HWY_SCALAR

// 2) For forcing static dispatch without code changes (removing HWY_EXPORT).
// Unlike 1 and 1b, HWY_STATIC_TARGET is left unchanged.
#elif defined(HWY_COMPILE_ONLY_STATIC)
#define HWY_TARGETS HWY_STATIC_TARGET

// 3) For tests: include all attainable targets (in particular: scalar).
// Skipped when HWY_SKIP_NON_BEST_BASELINE is defined, so that 4 is used.
#elif (defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)) && \
    !defined(HWY_SKIP_NON_BEST_BASELINE)
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS

// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
// sets all lower bits (better targets), then we also include the static target.
#else
#define HWY_TARGETS \
  (HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))

#endif  // target policy

// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
// The check only fires when HWY_ENABLED_BASELINE is non-zero.
#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0 && HWY_ENABLED_BASELINE != 0
#error "Logic error: best baseline should be included in dynamic targets"
#endif

#endif  // HIGHWAY_HWY_DETECT_TARGETS_H_