tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

detect_targets.h (36182B)


      1 // Copyright 2021 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
     17 #define HIGHWAY_HWY_DETECT_TARGETS_H_
     18 
     19 // Defines targets and chooses which to enable.
     20 
     21 #include "hwy/detect_compiler_arch.h"
     22 
     23 //------------------------------------------------------------------------------
     24 // Optional configuration
     25 
     26 // See g3doc/quick_reference.md for documentation of these macros.
     27 
     28 // Uncomment to override the default baseline determined from predefined macros:
     29 // #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
     30 
     31 // Uncomment to override the default blocklist:
     32 // #define HWY_BROKEN_TARGETS HWY_AVX3
     33 
     34 // Uncomment to definitely avoid generating those target(s):
     35 // #define HWY_DISABLED_TARGETS HWY_SSE4
     36 
     37 // Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
     38 // AVX2 target for VMs which support AVX2 but not the other instruction sets)
     39 // #define HWY_DISABLE_BMI2_FMA
     40 
     41 // Uncomment to enable these on MSVC even if the predefined macros are not set.
     42 // #define HWY_WANT_SSE2 1
     43 // #define HWY_WANT_SSSE3 1
     44 // #define HWY_WANT_SSE4 1
     45 
     46 //------------------------------------------------------------------------------
     47 // Targets
     48 
     49 // Unique bit value for each target. A lower value is "better" (e.g. more lanes)
     50 // than a higher value within the same group/platform - see HWY_STATIC_TARGET.
     51 //
     52 // All values are unconditionally defined so we can test HWY_TARGETS without
     53 // first checking the HWY_ARCH_*.
     54 //
     55 // The C99 preprocessor evaluates #if expressions using intmax_t types. This
     56 // holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
     57 // 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
     58 // avoid overflow when computing HWY_TARGETS (subtracting one instead of
     59 // left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
     60 
     61 // --------------------------- x86: 15 targets (+ one fallback)
     62 // Bits 0..2 reserved (3 targets)
     63 #define HWY_AVX10_2 (1LL << 3)  // AVX10.2 with 512-bit vectors
     64 #define HWY_AVX3_SPR (1LL << 4)  // HWY_AVX3_DL plus AVX512BF16 + AVX512FP16
     65 // Bit 5: reserved (1 target)
     66 // Currently `HWY_AVX3_DL` plus `AVX512BF16` and a special case for
     67 // `CompressStore` (10x as fast, still useful on Zen5). We may later also use
     68 // `VPCONFLICT`. Note that `VP2INTERSECT` is available in Zen5.
     69 #define HWY_AVX3_ZEN4 (1LL << 6)  // see HWY_WANT_AVX3_ZEN4 below
     70 
     71 // Currently satisfiable by Ice Lake (`VNNI`, `VPCLMULQDQ`, `VPOPCNTDQ`,
     72 // `VBMI`, `VBMI2`, `VAES`, `BITALG`, `GFNI`).
     73 #define HWY_AVX3_DL (1LL << 7)
     74 #define HWY_AVX3 (1LL << 8)  // HWY_AVX2 plus AVX-512F/BW/CD/DQ/VL
     75 #define HWY_AVX2 (1LL << 9)  // HWY_SSE4 plus BMI2 + F16C + FMA
     76 // Bit 10: reserved
     77 #define HWY_SSE4 (1LL << 11)   // SSE4.2 plus AES + CLMUL
     78 #define HWY_SSSE3 (1LL << 12)  // S-SSE3
     79 // Bit 13: reserved for SSE3
     80 #define HWY_SSE2 (1LL << 14)
     81 // The highest bit in the HWY_TARGETS mask that an x86 target can have. Used for
     82 // dynamic dispatch. All x86 target bits must be lower or equal to
     83 // (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
     84 // HWY_MAX_DYNAMIC_TARGETS in total.
     85 #define HWY_HIGHEST_TARGET_BIT_X86 14
     86 
     87 // --------------------------- Arm: 15 targets (+ one fallback)
     88 // Bits 15..17 reserved (3 targets)
     89 #define HWY_SVE2_128 (1LL << 18)  // specialized (e.g. Neoverse V2/N2/N3)
     90 #define HWY_SVE_256 (1LL << 19)   // specialized (Neoverse V1)
     91 // Bits 20..22 reserved for later SVE (3 targets)
     92 #define HWY_SVE2 (1LL << 23)
     93 #define HWY_SVE (1LL << 24)
     94 // Bit 25 reserved for NEON
     95 #define HWY_NEON_BF16 (1LL << 26)  // fp16/dot/bf16/i8mm (e.g. Neoverse V2/N2)
     96 // Bit 27 reserved for NEON
     97 #define HWY_NEON (1LL << 28)  // Implies support for AES
     98 #define HWY_NEON_WITHOUT_AES (1LL << 29)
     99 #define HWY_HIGHEST_TARGET_BIT_ARM 29  // bit index of HWY_NEON_WITHOUT_AES
    100 
    101 #define HWY_ALL_NEON (HWY_NEON_WITHOUT_AES | HWY_NEON | HWY_NEON_BF16)
    102 #define HWY_ALL_SVE (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
    103 
    104 // --------------------------- RISC-V: 9 targets (+ one fallback)
    105 // Bits 30..36 reserved (7 targets)
    106 #define HWY_RVV (1LL << 37)  // RISC-V "V" extension
    107 // Bit 38 reserved
    108 #define HWY_HIGHEST_TARGET_BIT_RVV 38  // includes the reserved bit 38
    109 
    110 // --------------------------- LoongArch: 3 targets (+ one fallback)
    111 // Bit 39 reserved (1 target)
    112 #define HWY_LASX (1LL << 40)  // baseline needs __loongarch_sx + __loongarch_asx
    113 #define HWY_LSX (1LL << 41)   // baseline needs __loongarch_sx
    114 #define HWY_HIGHEST_TARGET_BIT_LOONGARCH 41
    115 
    116 // --------------------------- Future expansion: 1 target
    117 // Bit 42 reserved
    118 
    119 // --------------------------- IBM Power/ZSeries: 9 targets (+ one fallback)
    120 // Bits 43..46 reserved (4 targets)
    121 #define HWY_PPC10 (1LL << 47)  // v3.1
    122 #define HWY_PPC9 (1LL << 48)   // v3.0
    123 #define HWY_PPC8 (1LL << 49)   // v2.07
    124 #define HWY_Z15 (1LL << 50)    // Z15
    125 #define HWY_Z14 (1LL << 51)    // Z14
    126 #define HWY_HIGHEST_TARGET_BIT_PPC 51
    127 
    128 #define HWY_ALL_PPC (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)  // excludes Z14/Z15
    129 
    130 // --------------------------- WebAssembly: 9 targets (+ one fallback)
    131 // Bits 52..57 reserved (6 targets)
    132 #define HWY_WASM_EMU256 (1LL << 58)  // Experimental, see HWY_WANT_WASM2 below
    133 #define HWY_WASM (1LL << 59)
    134 // Bit 60 reserved
    135 #define HWY_HIGHEST_TARGET_BIT_WASM 60  // includes the reserved bit 60
    136 
    137 // --------------------------- Emulation: 2 targets
    138 
    139 #define HWY_EMU128 (1LL << 61)  // default fallback, see HWY_BASELINE_SCALAR
    140 // We do not add/left-shift, so this will not overflow to a negative number.
    141 #define HWY_SCALAR (1LL << 62)
    142 #define HWY_HIGHEST_TARGET_BIT_SCALAR 62
    143 
    144 // Do not use bit 63 - would be confusing to have negative numbers.
    145 
    146 //------------------------------------------------------------------------------
    147 // Set default blocklists
    148 
    149 // Disabled means excluded from enabled at user's request. A separate config
    150 // macro allows disabling without deactivating the blocklist below.
    151 #ifndef HWY_DISABLED_TARGETS
    152 #define HWY_DISABLED_TARGETS 0  // default: nothing disabled by the user
    153 #endif  // HWY_DISABLED_TARGETS
    154 
    155 // Broken means excluded from enabled due to known compiler issues. We define
    156 // separate HWY_BROKEN_* and then OR them together (more than one might apply).
    157 
    158 #ifndef HWY_BROKEN_CLANG6  // allow override
    159 // x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
    160 // SSE4 codegen (possibly only for msan), so disable SSE4 and all lower bits.
    161 #if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
    162 #define HWY_BROKEN_CLANG6 (HWY_SSE4 | (HWY_SSE4 - 1))
    163 // This entails a major speed reduction, so warn unless the user explicitly
    164 // opts in to scalar-only.
    165 #if !defined(HWY_COMPILE_ONLY_SCALAR)
    166 #pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
    167 #endif  // !HWY_COMPILE_ONLY_SCALAR
    168 
    169 #else  // not x86 Clang < 7
    170 #define HWY_BROKEN_CLANG6 0
    171 #endif  // x86 Clang < 7
    172 #endif  // HWY_BROKEN_CLANG6
    173 
    174 #ifndef HWY_BROKEN_32BIT  // allow override
    175 // 32-bit may fail to compile AVX2/3.
    176 #if HWY_ARCH_X86_32
    177 // GCC-13 is ok with AVX2, so only block AVX3 and the bits below it:
    178 #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1300)
    179 #define HWY_BROKEN_32BIT (HWY_AVX3 | (HWY_AVX3 - 1))
    180 #else  // any other compiler: also block AVX2
    181 #define HWY_BROKEN_32BIT (HWY_AVX2 | (HWY_AVX2 - 1))
    182 #endif  // GCC >= 13
    183 #else  // !HWY_ARCH_X86_32
    184 #define HWY_BROKEN_32BIT 0
    185 #endif  // HWY_ARCH_X86_32
    186 #endif  // HWY_BROKEN_32BIT
    187 
    188 #ifndef HWY_BROKEN_MSVC  // allow override
    189 // MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
    190 #if HWY_COMPILER_MSVC != 0
    191 #define HWY_BROKEN_MSVC (HWY_AVX3 | (HWY_AVX3 - 1))  // AVX3 and lower bits
    192 #else  // not MSVC
    193 #define HWY_BROKEN_MSVC 0
    194 #endif  // HWY_COMPILER_MSVC
    195 #endif  // HWY_BROKEN_MSVC
    196 
    197 #ifndef HWY_BROKEN_AVX10_2  // allow override
    198 // AVX10_2 requires clang >= 20.1 (postpone to 23 due to "avx10.2-512" remnant,
    199 // only removed in https://github.com/llvm/llvm-project/pull/157034) or
    200 // gcc >= 15.2 with binutils 2.44.
    201 #if (HWY_COMPILER_CLANG < 2300) && (HWY_COMPILER_GCC_ACTUAL < 1502)
    202 #define HWY_BROKEN_AVX10_2 HWY_AVX10_2
    203 #else  // Clang >= 23 or GCC >= 15.2
    204 #define HWY_BROKEN_AVX10_2 0
    205 #endif  // Clang < 23 and GCC < 15.2
    206 #endif  // HWY_BROKEN_AVX10_2
    207 
    208 #ifndef HWY_BROKEN_AVX3_DL_ZEN4  // allow override
    209 // AVX3_DL and AVX3_ZEN4 require clang >= 7 (ensured above), gcc >= 8.1 or ICC
    210 // 2021.
    211 #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 801) || \
    212    (HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
    213 #define HWY_BROKEN_AVX3_DL_ZEN4 (HWY_AVX3_DL | HWY_AVX3_ZEN4)
    214 #else  // neither an old GCC nor an old ICC
    215 #define HWY_BROKEN_AVX3_DL_ZEN4 0
    216 #endif  // GCC < 8.1 or ICC < 2021
    217 #endif  // HWY_BROKEN_AVX3_DL_ZEN4
    218 
    219 #ifndef HWY_BROKEN_AVX3_SPR  // allow override
    220 // AVX3_SPR requires clang >= 14, gcc >= 12, or ICC 2021.
    221 #if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1400) ||      \
    222    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200) || \
    223    (HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
    224 #define HWY_BROKEN_AVX3_SPR (HWY_AVX3_SPR)
    225 #else  // none of the old-compiler checks matched
    226 #define HWY_BROKEN_AVX3_SPR 0
    227 #endif  // Clang < 14, GCC < 12 or ICC < 2021
    228 #endif  // HWY_BROKEN_AVX3_SPR
    229 
    230 #ifndef HWY_BROKEN_ARM7_BIG_ENDIAN  // allow override
    231 // armv7be has not been tested and is not yet supported.
    232 #if HWY_ARCH_ARM_V7 && HWY_IS_BIG_ENDIAN
    233 #define HWY_BROKEN_ARM7_BIG_ENDIAN HWY_ALL_NEON
    234 #else  // not Armv7, or little-endian
    235 #define HWY_BROKEN_ARM7_BIG_ENDIAN 0
    236 #endif  // HWY_ARCH_ARM_V7 && HWY_IS_BIG_ENDIAN
    237 #endif  // HWY_BROKEN_ARM7_BIG_ENDIAN
    238 
    239 #ifdef __ARM_NEON_FP  // bit 0x2 is tested below to detect half-float
    240 #define HWY_HAVE_NEON_FP __ARM_NEON_FP
    241 #else  // macro absent: no bits set
    242 #define HWY_HAVE_NEON_FP 0
    243 #endif  // __ARM_NEON_FP
    244 
    245 #ifndef HWY_BROKEN_ARM7_WITHOUT_VFP4  // allow override
    246 // armv7-a without a detected vfpv4 is not supported
    247 // (for example Cortex-A8, Cortex-A9).
    248 // vfpv4 always has neon half-float _and_ FMA.
    249 #if HWY_ARCH_ARM_V7 && (__ARM_ARCH_PROFILE == 'A') && \
    250    !defined(__ARM_VFPV4__) &&                        \
    251    !((HWY_HAVE_NEON_FP & 0x2 /* half-float */) && (__ARM_FEATURE_FMA == 1))
    252 #define HWY_BROKEN_ARM7_WITHOUT_VFP4 HWY_ALL_NEON
    253 #else  // not Armv7-A, or vfpv4 (or half-float + FMA) was detected
    254 #define HWY_BROKEN_ARM7_WITHOUT_VFP4 0
    255 #endif  // Armv7-A without vfpv4
    256 #endif  // HWY_BROKEN_ARM7_WITHOUT_VFP4
    257 
    258 #ifndef HWY_BROKEN_NEON_BF16  // allow override
    259 // Broken on older compilers (Clang < 17, GCC < 13.2, Apple build <= 17000000):
    260 #if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1700) || \
    261    (HWY_COMPILER_GCC_ACTUAL != 0 && HWY_COMPILER_GCC_ACTUAL < 1302) || \
    262    (defined(__apple_build_version__) && __apple_build_version__ <= 17000000)
    263 #define HWY_BROKEN_NEON_BF16 (HWY_NEON_BF16)
    264 #else  // none of the old-compiler checks matched
    265 #define HWY_BROKEN_NEON_BF16 0
    266 #endif  // old Clang / GCC / Apple toolchain
    267 #endif  // HWY_BROKEN_NEON_BF16
    268 
    269 // SVE[2] require recent clang or gcc versions.
    270 
    271 #ifndef HWY_BROKEN_SVE  // allow override
    272 // GCC 10+. Clang 22 still has test failures for SVE, including MSAN. No Apple
    273 // CPU (at least up to and including M4 and A18) has SVE.
    274 #if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2300) ||           \
    275    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
    276    HWY_OS_APPLE
    277 #define HWY_BROKEN_SVE (HWY_SVE | HWY_SVE_256)
    278 #else  // not Clang < 23, not GCC < 10, and not Apple
    279 #define HWY_BROKEN_SVE 0
    280 #endif  // Clang < 23, GCC < 10 or Apple
    281 #endif  // HWY_BROKEN_SVE
    282 
    283 #ifndef HWY_BROKEN_SVE2  // allow override
    284 // Clang 21 still has test failures for SVE2, including MSAN.
    285 #if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2300) ||           \
    286    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
    287    HWY_OS_APPLE
    288 #define HWY_BROKEN_SVE2 (HWY_SVE2 | HWY_SVE2_128)
    289 #else  // not Clang < 23, not GCC < 10, and not Apple (same check as SVE)
    290 #define HWY_BROKEN_SVE2 0
    291 #endif  // Clang < 23, GCC < 10 or Apple
    292 #endif  // HWY_BROKEN_SVE2
    293 
    294 #ifndef HWY_BROKEN_PPC10  // allow override
    295 #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1100)
    296 // GCC 10 supports the -mcpu=power10 option but does not support the PPC10
    297 // vector intrinsics.
    298 #define HWY_BROKEN_PPC10 (HWY_PPC10)
    299 #elif HWY_ARCH_PPC && HWY_IS_BIG_ENDIAN &&                                   \
    300    ((HWY_COMPILER3_CLANG && HWY_COMPILER3_CLANG < 160001) ||                \
    301     (HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_COMPILER_GCC_ACTUAL <= 1203) || \
    302     (HWY_COMPILER_GCC_ACTUAL >= 1300 && HWY_COMPILER_GCC_ACTUAL <= 1301))
    303 // GCC 12.0 through 12.3 and GCC 13.0 through 13.1 have a compiler bug where the
    304 // vsldoi instruction is sometimes incorrectly optimized out (and this causes
    305 // some of the Highway unit tests to fail on big-endian PPC10). Details about
    306 // this compiler bug can be found at
    307 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069, and this bug will be
    308 // fixed in the upcoming GCC 12.4 and 13.2 releases.
    309 
    310 // Clang 16.0.0 and earlier (but not Clang 16.0.1 and later) have a compiler
    311 // bug in the LLVM DAGCombiner that causes a zero-extend followed by an
    312 // element insert into a vector, followed by a vector shuffle to be incorrectly
    313 // optimized on big-endian PPC (and which caused some of the Highway unit tests
    314 // to fail on big-endian PPC10).
    315 
    316 // Details about this bug, which has already been fixed in Clang 16.0.1 and
    317 // later, can be found at https://github.com/llvm/llvm-project/issues/61315.
    318 #define HWY_BROKEN_PPC10 (HWY_PPC10)
    319 #else  // neither GCC < 11 nor an affected big-endian PPC compiler
    320 #define HWY_BROKEN_PPC10 0
    321 #endif  // GCC < 11 / big-endian PPC compiler bugs
    322 #endif  // HWY_BROKEN_PPC10
    323 
    324 #ifndef HWY_BROKEN_PPC_32BIT  // allow override
    325 // PPC8/PPC9/PPC10 targets may fail to compile on 32-bit PowerPC.
    326 #if HWY_ARCH_PPC && !HWY_ARCH_PPC_64
    327 #define HWY_BROKEN_PPC_32BIT (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)
    328 #else  // not PPC, or 64-bit PPC
    329 #define HWY_BROKEN_PPC_32BIT 0
    330 #endif  // 32-bit PPC
    331 #endif  // HWY_BROKEN_PPC_32BIT
    332 
    333 #ifndef HWY_BROKEN_RVV  // allow override
    334 // HWY_RVV fails to compile with GCC < 13 or Clang < 16.
    335 #if HWY_ARCH_RISCV &&                                     \
    336    ((HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600) || \
    337     (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1300))
    338 #define HWY_BROKEN_RVV (HWY_RVV)
    339 #else  // not RISC-V, or the compiler is new enough
    340 #define HWY_BROKEN_RVV 0
    341 #endif  // RISC-V with Clang < 16 or GCC < 13
    342 #endif  // HWY_BROKEN_RVV
    343 
    344 #ifndef HWY_BROKEN_LOONGARCH  // allow override
    345 // Using __loongarch_sx and __loongarch_asx macros to
    346 // check whether LSX/LASX targets are available.
    347 // GCC does not work yet, see https://gcc.gnu.org/PR121875.
    348 #if !defined(__loongarch_sx) && \
    349    !(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1800)
    350 #define HWY_BROKEN_LOONGARCH (HWY_LSX | HWY_LASX)  // no LSX: block both
    351 #elif !defined(__loongarch_asx) && \
    352      !(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 1800)
    353 #define HWY_BROKEN_LOONGARCH (HWY_LASX)  // LSX only: block LASX
    354 #else  // both macros defined, or Clang >= 18
    355 #define HWY_BROKEN_LOONGARCH 0
    356 #endif  // __loongarch_sx / __loongarch_asx checks
    357 #endif  // HWY_BROKEN_LOONGARCH
    358 
    359 #ifndef HWY_BROKEN_Z14  // allow override
    360 #if HWY_ARCH_S390X
    361 #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1900
    362 // Clang 18 and earlier have bugs with some ZVector intrinsics
    363 #define HWY_BROKEN_Z14 (HWY_Z14 | HWY_Z15)
    364 #elif HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900
    365 // Z15 target requires GCC 9 or later
    366 #define HWY_BROKEN_Z14 (HWY_Z15)
    367 #else  // Clang >= 19, GCC >= 9, or another compiler
    368 #define HWY_BROKEN_Z14 0
    369 #endif  // compiler version checks
    370 #else  // !HWY_ARCH_S390X
    371 #define HWY_BROKEN_Z14 0
    372 #endif  // HWY_ARCH_S390X
    373 #endif  // HWY_BROKEN_Z14
    374 
    375 // Allow the user to override this without any guarantee of success.
    376 #ifndef HWY_BROKEN_TARGETS
    377 
    378 #define HWY_BROKEN_TARGETS                                              \
    379  (HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC |             \
    380   HWY_BROKEN_AVX10_2 | HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR | \
    381   HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 |          \
    382   HWY_BROKEN_NEON_BF16 | HWY_BROKEN_SVE | HWY_BROKEN_SVE2 |            \
    383   HWY_BROKEN_PPC10 | HWY_BROKEN_PPC_32BIT | HWY_BROKEN_RVV |           \
    384   HWY_BROKEN_LOONGARCH | HWY_BROKEN_Z14)
    385 
    386 #endif  // HWY_BROKEN_TARGETS
    387 
    388 // Enabled means not disabled nor blocklisted: clears from `targets` every bit
    389 // set in HWY_DISABLED_TARGETS or HWY_BROKEN_TARGETS.
    389 #define HWY_ENABLED(targets) \
    390  ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
    391 
    392 // Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
    393 // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). An issue still
    394 // remains with 13.2, see #1683. This is separate from HWY_BROKEN_TARGETS
    395 // because it affects the fallback target, which must always be enabled. If 1,
    396 // we instead choose HWY_SCALAR even without HWY_COMPILE_ONLY_SCALAR being set.
    397 #if !defined(HWY_BROKEN_EMU128)  // allow overriding
    398 #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1600) || \
    399    defined(HWY_NO_LIBCXX)
    400 #define HWY_BROKEN_EMU128 1
    401 #else  // not GCC < 16, and HWY_NO_LIBCXX is not defined
    402 #define HWY_BROKEN_EMU128 0
    403 #endif  // GCC < 16 or HWY_NO_LIBCXX
    404 #endif  // HWY_BROKEN_EMU128
    405 
    406 //------------------------------------------------------------------------------
    407 // Detect baseline targets using predefined macros
    408 
    409 // Baseline means the targets for which the compiler is allowed to generate
    410 // instructions, implying the target CPU would have to support them. This does
    411 // not take the blocklist into account.
    412 
    413 #if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
    414 #define HWY_BASELINE_SCALAR HWY_SCALAR
    415 #else  // default fallback is EMU128
    416 #define HWY_BASELINE_SCALAR HWY_EMU128
    417 #endif  // HWY_COMPILE_ONLY_SCALAR || HWY_BROKEN_EMU128
    418 
    419 // Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
    420 // HWY_TARGET == HWY_BASELINE_SCALAR.
    421 
    422 #if HWY_ARCH_WASM && defined(__wasm_simd128__)
    423 #if defined(HWY_WANT_WASM2)
    424 #define HWY_BASELINE_WASM HWY_WASM_EMU256
    425 #else  // default: plain 128-bit WASM target
    426 #define HWY_BASELINE_WASM HWY_WASM
    427 #endif  // HWY_WANT_WASM2
    428 #else  // not WASM, or __wasm_simd128__ is not defined
    429 #define HWY_BASELINE_WASM 0
    430 #endif  // HWY_ARCH_WASM && __wasm_simd128__
    431 
    432 // GCC or Clang. Each PPC level below requires the previous one.
    433 #if HWY_ARCH_PPC && HWY_COMPILER_GCC && defined(__ALTIVEC__) && \
    434    defined(__VSX__) && defined(__POWER8_VECTOR__) &&           \
    435    (defined(__CRYPTO__) || defined(HWY_DISABLE_PPC8_CRYPTO))
    436 #define HWY_BASELINE_PPC8 HWY_PPC8
    437 #else  // PPC8 prerequisites not met
    438 #define HWY_BASELINE_PPC8 0
    439 #endif  // PPC8 prerequisites
    440 
    441 #if HWY_BASELINE_PPC8 != 0 && defined(__POWER9_VECTOR__)
    442 #define HWY_BASELINE_PPC9 HWY_PPC9
    443 #else  // no PPC8 baseline, or __POWER9_VECTOR__ is not defined
    444 #define HWY_BASELINE_PPC9 0
    445 #endif  // PPC9 prerequisites
    446 
    447 #if HWY_BASELINE_PPC9 != 0 && \
    448    (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
    449 #define HWY_BASELINE_PPC10 HWY_PPC10
    450 #else  // no PPC9 baseline, or neither Power10 macro is defined
    451 #define HWY_BASELINE_PPC10 0
    452 #endif  // PPC10 prerequisites
    453 
    454 #if HWY_ARCH_S390X && defined(__VEC__) && defined(__ARCH__) && __ARCH__ >= 12
    455 #define HWY_BASELINE_Z14 HWY_Z14
    456 #else  // not S390X, no __VEC__, or __ARCH__ < 12
    457 #define HWY_BASELINE_Z14 0
    458 #endif  // Z14 prerequisites
    459 
    460 #if HWY_BASELINE_Z14 && __ARCH__ >= 13
    461 #define HWY_BASELINE_Z15 HWY_Z15
    462 #else  // no Z14 baseline, or __ARCH__ < 13
    463 #define HWY_BASELINE_Z15 0
    464 #endif  // Z15 prerequisites
    465 
    466 #define HWY_BASELINE_SVE2 0  // defaults; re-defined below on Arm if detected
    467 #define HWY_BASELINE_SVE 0
    468 #define HWY_BASELINE_NEON 0
    469 
    470 #if HWY_ARCH_ARM
    471 
    472 // Also check compiler version as done for HWY_ATTAINABLE_SVE2 because the
    473 // static target (influenced here) must be one of the attainable targets.
    474 #if defined(__ARM_FEATURE_SVE2) && \
    475    (HWY_COMPILER_CLANG >= 1400 || HWY_COMPILER_GCC_ACTUAL >= 1200)
    476 #undef HWY_BASELINE_SVE2  // was 0, will be re-defined
    477 // If user specified -msve-vector-bits=128, they assert the vector length is
    478 // 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
    479 #if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
    480 #define HWY_BASELINE_SVE2 HWY_SVE2_128
    481 // Otherwise we're not sure what the vector length will be. The baseline must be
    482 // unconditionally valid, so we can only assume HWY_SVE2. However, when running
    483 // on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
    484 // still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
    485 #else
    486 #define HWY_BASELINE_SVE2 HWY_SVE2
    487 #endif  // __ARM_FEATURE_SVE_BITS
    488 #endif  // __ARM_FEATURE_SVE2
    489 
    490 #if defined(__ARM_FEATURE_SVE) && \
    491    (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)
    492 #undef HWY_BASELINE_SVE  // was 0, will be re-defined
    493 // See above. If user-specified vector length matches our optimization, use it.
    494 #if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
    495 #define HWY_BASELINE_SVE HWY_SVE_256
    496 #else  // vector length unknown or not 256 bits
    497 #define HWY_BASELINE_SVE HWY_SVE
    498 #endif  // __ARM_FEATURE_SVE_BITS
    499 #endif  // __ARM_FEATURE_SVE
    500 
    501 // GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
    502 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
    503 #undef HWY_BASELINE_NEON  // was 0, will be re-defined
    504 #if defined(__ARM_FEATURE_AES) &&                    \
    505    defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && \
    506    defined(__ARM_FEATURE_DOTPROD) &&                \
    507    defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
    508    defined(__ARM_FEATURE_MATMUL_INT8)
    509 #define HWY_BASELINE_NEON HWY_ALL_NEON
    510 #elif defined(__ARM_FEATURE_AES)
    511 #define HWY_BASELINE_NEON (HWY_NEON_WITHOUT_AES | HWY_NEON)
    512 #else  // NEON without AES
    513 #define HWY_BASELINE_NEON (HWY_NEON_WITHOUT_AES)
    514 #endif  // __ARM_FEATURE*
    515 #endif  // __ARM_NEON
    516 
    517 #endif  // HWY_ARCH_ARM
    518 
    519 // Special handling for MSVC because it has fewer predefined macros:
    520 #if HWY_COMPILER_MSVC
    521 
    522 #if HWY_ARCH_X86_32
    523 #if _M_IX86_FP >= 2
    524 #define HWY_CHECK_SSE2 1
    525 #else  // _M_IX86_FP < 2
    526 #define HWY_CHECK_SSE2 0
    527 #endif  // _M_IX86_FP
    528 #elif HWY_ARCH_X86_64
    529 #define HWY_CHECK_SSE2 1
    530 #else  // not x86
    531 #define HWY_CHECK_SSE2 0
    532 #endif  // HWY_ARCH_X86_32 / HWY_ARCH_X86_64
    533 
    534 // 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
    535 //    https://stackoverflow.com/questions/18563978/.
    536 #if defined(__AVX__)
    537 #define HWY_CHECK_SSSE3 1
    538 #define HWY_CHECK_SSE4 1
    539 #else  // !__AVX__
    540 #define HWY_CHECK_SSSE3 0
    541 #define HWY_CHECK_SSE4 0
    542 #endif  // __AVX__
    543 
    544 // 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
    545 //    PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
    546 #define HWY_CHECK_PCLMUL_AES 1
    547 #define HWY_CHECK_BMI2_FMA 1
    548 #define HWY_CHECK_F16C 1
    549 
    550 #else  // non-MSVC
    551 
    552 #if defined(__SSE2__)
    553 #define HWY_CHECK_SSE2 1
    554 #else  // !__SSE2__
    555 #define HWY_CHECK_SSE2 0
    556 #endif  // __SSE2__
    557 
    558 #if defined(__SSSE3__)
    559 #define HWY_CHECK_SSSE3 1
    560 #else  // !__SSSE3__
    561 #define HWY_CHECK_SSSE3 0
    562 #endif  // __SSSE3__
    563 
    564 #if defined(__SSE4_1__) && defined(__SSE4_2__)
    565 #define HWY_CHECK_SSE4 1
    566 #else  // SSE4.1 or SSE4.2 missing
    567 #define HWY_CHECK_SSE4 0
    568 #endif  // __SSE4_1__ && __SSE4_2__
    569 
    570 // If these are disabled, they should not gate the availability of SSE4/AVX2.
    571 #if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
    572 #define HWY_CHECK_PCLMUL_AES 1
    573 #else  // required but not available
    574 #define HWY_CHECK_PCLMUL_AES 0
    575 #endif  // PCLMUL/AES
    576 
    577 #if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
    578 #define HWY_CHECK_BMI2_FMA 1
    579 #else  // required but not available
    580 #define HWY_CHECK_BMI2_FMA 0
    581 #endif  // BMI2/FMA
    582 
    583 #if defined(HWY_DISABLE_F16C) || defined(__F16C__)
    584 #define HWY_CHECK_F16C 1
    585 #else  // required but not available
    586 #define HWY_CHECK_F16C 0
    587 #endif  // F16C
    588 
    589 #endif  // non-MSVC
    590 
    591 #if HWY_ARCH_X86 && \
    592    ((defined(HWY_WANT_SSE2) && HWY_WANT_SSE2) || HWY_CHECK_SSE2)
    593 #define HWY_BASELINE_SSE2 HWY_SSE2
    594 #else  // not x86, or SSE2 neither requested nor detected
    595 #define HWY_BASELINE_SSE2 0
    596 #endif  // SSE2
    597 
    598 #if HWY_ARCH_X86 && \
    599    ((defined(HWY_WANT_SSSE3) && HWY_WANT_SSSE3) || HWY_CHECK_SSSE3)
    600 #define HWY_BASELINE_SSSE3 HWY_SSSE3
    601 #else  // not x86, or SSSE3 neither requested nor detected
    602 #define HWY_BASELINE_SSSE3 0
    603 #endif  // SSSE3
    604 
    605 #if HWY_ARCH_X86 && ((defined(HWY_WANT_SSE4) && HWY_WANT_SSE4) || \
    606                     (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
    607 #define HWY_BASELINE_SSE4 HWY_SSE4
    608 #else  // not x86, or SSE4 neither requested nor detected (with PCLMUL/AES)
    609 #define HWY_BASELINE_SSE4 0
    610 #endif  // SSE4
    611 
    612 #if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
    613    defined(__AVX2__)
    614 #define HWY_BASELINE_AVX2 HWY_AVX2
    615 #else  // no SSE4 baseline, or BMI2/FMA/F16C/AVX2 missing
    616 #define HWY_BASELINE_AVX2 0
    617 #endif  // AVX2
    618 
    619 // Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
        // NOTE(review): the compiler-version clause below looks always-true if
        // HWY_COMPILER_GCC_ACTUAL and HWY_COMPILER_CLANG are 0 for "not this
        // compiler" (on Clang, GCC_ACTUAL < 1400; on GCC, CLANG < 1800), so
        // __EVEX512__ would never gate AVX3 - confirm intent before changing.
    620 #if HWY_BASELINE_AVX2 != 0 &&                                       \
    621    ((defined(__AVX512F__) && defined(__AVX512BW__) &&              \
    622      defined(__AVX512DQ__) && defined(__AVX512VL__)) ||            \
    623     defined(__AVX10_2__)) &&                                       \
    624    ((!HWY_COMPILER_GCC_ACTUAL && !HWY_COMPILER_CLANG) ||           \
    625     HWY_COMPILER_GCC_ACTUAL < 1400 || HWY_COMPILER_CLANG < 1800 || \
    626     defined(__EVEX512__))
    627 #define HWY_BASELINE_AVX3 HWY_AVX3
    628 #else  // no AVX2 baseline, or AVX-512 flags missing
    629 #define HWY_BASELINE_AVX3 0
    630 #endif  // AVX3
    631 
    632 // TODO(janwas): not yet known whether these will be set by MSVC
    633 #if HWY_BASELINE_AVX3 != 0 &&                                     \
    634    ((defined(__AVX512VNNI__) && defined(__VAES__) &&             \
    635      defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) &&       \
    636      defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
    637      defined(__AVX512BITALG__)) ||                               \
    638     defined(__AVX10_2__))
    639 #define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
    640 #else  // no AVX3 baseline, or DL feature flags missing
    641 #define HWY_BASELINE_AVX3_DL 0
    642 #endif  // AVX3_DL
    643 
    644 // The ZEN4-optimized AVX3 target is numerically lower than AVX3_DL and is thus
    645 // considered better. Do not enable it unless the user explicitly requests it -
    646 // we do not want to choose the ZEN4 path on Intel because it could be slower.
    647 #if defined(HWY_WANT_AVX3_ZEN4) && HWY_BASELINE_AVX3_DL != 0
    648 #define HWY_BASELINE_AVX3_ZEN4 HWY_AVX3_ZEN4
    649 #else  // not requested, or no AVX3_DL baseline
    650 #define HWY_BASELINE_AVX3_ZEN4 0
    651 #endif  // AVX3_ZEN4
    652 
    653 #if HWY_BASELINE_AVX3_DL != 0 &&                             \
    654    ((defined(__AVX512BF16__) && defined(__AVX512FP16__)) || \
    655     defined(__AVX10_2__))
    656 #define HWY_BASELINE_AVX3_SPR HWY_AVX3_SPR
    657 #else  // no AVX3_DL baseline, or BF16/FP16 flags missing
    658 #define HWY_BASELINE_AVX3_SPR 0
    659 #endif  // AVX3_SPR
    660 
    661 #if HWY_BASELINE_AVX3_SPR != 0 && defined(__AVX10_2__)
    662 #define HWY_BASELINE_AVX10_2 HWY_AVX10_2
    663 #else  // no AVX3_SPR baseline, or __AVX10_2__ is not defined
    664 #define HWY_BASELINE_AVX10_2 0
    665 #endif  // AVX10_2
    666 
    667 // RVV requires intrinsics 0.11 or later, see #1156.
    668 
    669 // Also check that the __riscv_v macro is defined as GCC or Clang will define
    670 // the __riscv_v macro if the RISC-V "V" extension is enabled.
    671 
    672 #if HWY_ARCH_RISCV && defined(__riscv_v) && defined(__riscv_v_intrinsic) && \
    673    __riscv_v_intrinsic >= 11000
    674 #define HWY_BASELINE_RVV HWY_RVV
    675 #else  // not RISC-V, "V" not enabled, or intrinsics older than 0.11
    676 #define HWY_BASELINE_RVV 0
    677 #endif  // RVV
    678 
    679 #if HWY_ARCH_LOONGARCH && defined(__loongarch_sx) && defined(__loongarch_asx)
    680 #define HWY_BASELINE_LOONGARCH (HWY_LSX | HWY_LASX)
    681 #elif HWY_ARCH_LOONGARCH && defined(__loongarch_sx)
    682 #define HWY_BASELINE_LOONGARCH (HWY_LSX)
    683 #else  // not LoongArch, or __loongarch_sx is not defined
    684 #define HWY_BASELINE_LOONGARCH 0
    685 #endif  // LoongArch
    686 
    687 // Workaround for libaom, which unconditionally defines HWY_BASELINE_TARGETS
    688 // even when that would be disabled/broken. If so, at least use AVX2.
    689 #if defined(HWY_BASELINE_TARGETS)
    690 #if HWY_BASELINE_TARGETS == HWY_AVX3_DL && \
    691    ((HWY_BROKEN_TARGETS | HWY_DISABLED_TARGETS) & HWY_AVX3_DL)
    692 #undef HWY_BASELINE_TARGETS
    693 #define HWY_BASELINE_TARGETS HWY_AVX2
    694 #endif  // exactly HWY_AVX3_DL requested, but it is broken/disabled
    695 #endif  // HWY_BASELINE_TARGETS
    696 
    697 // Allow the user to override this without any guarantee of success. If the
    698 // compiler invocation considers that target to be broken/disabled, then
    699 // `HWY_ENABLED_BASELINE` will be 0 and users will have to check for that and
    700 // skip their code.
    701 #ifndef HWY_BASELINE_TARGETS
    702 #define HWY_BASELINE_TARGETS                                               \
    703  (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 |           \
    704   HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10 | HWY_BASELINE_Z14 |             \
    705   HWY_BASELINE_Z15 | HWY_BASELINE_SVE2 | HWY_BASELINE_SVE |               \
    706   HWY_BASELINE_NEON | HWY_BASELINE_SSE2 | HWY_BASELINE_SSSE3 |            \
    707   HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 |             \
    708   HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3_ZEN4 | HWY_BASELINE_AVX3_SPR | \
    709   HWY_BASELINE_AVX10_2 | HWY_BASELINE_RVV | HWY_BASELINE_LOONGARCH)
    710 #endif  // HWY_BASELINE_TARGETS
    711 
    712 //------------------------------------------------------------------------------
    713 // Choose target for static dispatch
    714 
    715 #define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
    716 #if HWY_ENABLED_BASELINE == 0  // nothing left after disabled/broken masking
    717 #pragma message                                                            \
    718    "All baseline targets are disabled or considered broken. "             \
    719    "This is typically due to very restrictive HWY_BASELINE_TARGETS, or "  \
    720    "too expansive HWY_BROKEN_TARGETS or HWY_DISABLED_TARGETS. User code " \
    721    "must also check for this and skip any usage of SIMD."
    722 #endif  // HWY_ENABLED_BASELINE == 0
    723 
    724 // Best baseline, used for static dispatch. `x & -x` keeps only the
    725 // least-significant 1-bit of HWY_ENABLED_BASELINE; lower bits imply "better".
    726 #define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
    727 
    728 // Start by assuming static dispatch. If we later use dynamic dispatch, this
    729 // is redefined to other targets during the multiple-inclusion, and finally
    730 // returns to this initial value. Defining it outside begin/end_target ensures
    731 // inl headers successfully compile by themselves (required by Bazel).
    732 #define HWY_TARGET HWY_STATIC_TARGET
    733 
    734 //------------------------------------------------------------------------------
    735 // Choose targets for dynamic dispatch according to one of four policies
    736 
    737 #if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
    738         defined(HWY_COMPILE_ONLY_STATIC))
    739 #error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
    740 #endif  // more than one HWY_COMPILE_ONLY_* macro is defined
    741 // Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
    742 
    743 #ifndef HWY_HAVE_ASM_HWCAP  // allow override via predefinition
    744 #ifdef TOOLCHAIN_MISS_ASM_HWCAP_H
    745 #define HWY_HAVE_ASM_HWCAP 0  // CMake failed to find the header
    746 #elif defined(__has_include)  // note: wrapper macro fails on Clang ~17
    747 // clang-format off
    748 #if __has_include(<asm/hwcap.h>)
    749 // clang-format on
    750 #define HWY_HAVE_ASM_HWCAP 1  // header present
    751 #else
    752 #define HWY_HAVE_ASM_HWCAP 0  // header not present
    753 #endif                        // __has_include
    754 #else                         // compiler lacks __has_include
    755 #define HWY_HAVE_ASM_HWCAP 0  // cannot probe, so treat the header as absent
    756 #endif  // TOOLCHAIN_MISS_ASM_HWCAP_H
    757 #endif  // HWY_HAVE_ASM_HWCAP
    758 
    759 #ifndef HWY_HAVE_AUXV  // allow override via predefinition
    760 #ifdef TOOLCHAIN_MISS_SYS_AUXV_H
    761 #define HWY_HAVE_AUXV 0  // CMake failed to find the header
    762 // glibc 2.16 added auxv, but checking for that requires features.h, and we do
    763 // not want to include system headers here. Instead check for the header
    764 // directly, which has been supported at least since GCC 5.4 and Clang 3.
    765 #elif defined(__has_include)  // note: wrapper macro fails on Clang ~17
    766 // clang-format off
    767 #if __has_include(<sys/auxv.h>)
    768 // clang-format on
    769 #define HWY_HAVE_AUXV 1  // header present
    770 #else
    771 #define HWY_HAVE_AUXV 0  // header not present
    772 #endif                   // __has_include
    773 #else                    // compiler lacks __has_include
    774 #define HWY_HAVE_AUXV 0  // cannot probe, so treat the header as absent
    775 #endif  // TOOLCHAIN_MISS_SYS_AUXV_H
    776 #endif  // HWY_HAVE_AUXV
    777 
    778 #ifndef HWY_HAVE_RUNTIME_DISPATCH_RVV  // allow override
    779 // The riscv_vector.h in Clang 16-18 requires compiler flags, and 19 still has
    780 // some missing intrinsics, see
    781 // https://github.com/llvm/llvm-project/issues/56592. GCC 13.3 also has an
    782 // #error check, whereas 14.1 fails with "argument type 'vuint16m8_t' requires
    783 // the V ISA extension": https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115325.
    784 #if HWY_ARCH_RISCV && HWY_COMPILER_CLANG >= 1900 && 0
    785 #define HWY_HAVE_RUNTIME_DISPATCH_RVV 1  // unreachable while `&& 0` remains
    786 #else
    787 #define HWY_HAVE_RUNTIME_DISPATCH_RVV 0  // always taken: `&& 0` above
    788 #endif
    789 #endif  // HWY_HAVE_RUNTIME_DISPATCH_RVV
    790 
    791 #ifndef HWY_HAVE_RUNTIME_DISPATCH_APPLE  // allow override
    792 #if HWY_ARCH_ARM_A64 && HWY_OS_APPLE && \
    793    (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1700)
    794 #define HWY_HAVE_RUNTIME_DISPATCH_APPLE 1
    795 #else  // not HWY_ARCH_ARM_A64 on Apple, or neither GCC_ACTUAL nor Clang >= 1700
    796 #define HWY_HAVE_RUNTIME_DISPATCH_APPLE 0
    797 #endif
    798 #endif  // HWY_HAVE_RUNTIME_DISPATCH_APPLE
    799 
    800 #if !defined(HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH)  // user may predefine
    801 #if HWY_COMPILER_CLANG >= 1800 && HWY_ARCH_LOONGARCH && HWY_HAVE_AUXV && \
    802    !defined(__loongarch_asx)
    803 #define HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH 1
    804 #else  // other compiler/arch, HWY_HAVE_AUXV is 0, or __loongarch_asx is set
    805 #define HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH 0
    806 #endif
    807 #endif  // !defined(HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH)
    808 
    809 #ifndef HWY_HAVE_RUNTIME_DISPATCH_LINUX  // allow override
    810 #if (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X) && HWY_OS_LINUX && \
    811    (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1700) && HWY_HAVE_AUXV
    812 #define HWY_HAVE_RUNTIME_DISPATCH_LINUX 1
    813 #else  // other arch/OS, unsupported compiler, or HWY_HAVE_AUXV is 0
    814 #define HWY_HAVE_RUNTIME_DISPATCH_LINUX 0
    815 #endif
    816 #endif  // HWY_HAVE_RUNTIME_DISPATCH_LINUX
    817 
    818 // Predefine to opt out or, without a guarantee of success, to opt in.
    819 #ifndef HWY_HAVE_RUNTIME_DISPATCH
    820 // Clang, GCC and MSVC allow OS-independent runtime dispatch on x86.
    821 #if HWY_ARCH_X86 || HWY_HAVE_RUNTIME_DISPATCH_RVV ||                          \
    822    HWY_HAVE_RUNTIME_DISPATCH_APPLE || HWY_HAVE_RUNTIME_DISPATCH_LOONGARCH || \
    823    HWY_HAVE_RUNTIME_DISPATCH_LINUX
    824 #define HWY_HAVE_RUNTIME_DISPATCH 1
    825 #else  // not x86 and none of the HWY_HAVE_RUNTIME_DISPATCH_* flags is set
    826 #define HWY_HAVE_RUNTIME_DISPATCH 0
    827 #endif
    828 #endif  // HWY_HAVE_RUNTIME_DISPATCH
    829 
    830 #if HWY_ARCH_ARM_A64 && HWY_HAVE_RUNTIME_DISPATCH
    831 #define HWY_ATTAINABLE_NEON HWY_ALL_NEON
    832 #elif HWY_ARCH_ARM  // static dispatch, or HWY_ARCH_ARM_V7
    833 #define HWY_ATTAINABLE_NEON (HWY_BASELINE_NEON)
    834 #else  // HWY_ARCH_ARM is 0
    835 #define HWY_ATTAINABLE_NEON 0
    836 #endif  // HWY_ATTAINABLE_NEON
    837 
    838 #if HWY_ARCH_ARM_A64 &&                                              \
    839    (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800) && \
    840    (HWY_HAVE_RUNTIME_DISPATCH ||                                    \
    841     (HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
    842 #define HWY_ATTAINABLE_SVE (HWY_SVE | HWY_SVE_256)
    843 #else  // not A64, compiler too old, or neither dispatch nor an SVE baseline
    844 #define HWY_ATTAINABLE_SVE 0
    845 #endif  // HWY_ATTAINABLE_SVE
    846 
    847 #if HWY_ARCH_ARM_A64 &&                                                \
    848    (HWY_COMPILER_CLANG >= 1400 || HWY_COMPILER_GCC_ACTUAL >= 1200) && \
    849    (HWY_HAVE_RUNTIME_DISPATCH ||                                      \
    850     (HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
    851 #define HWY_ATTAINABLE_SVE2 (HWY_SVE2 | HWY_SVE2_128)
    852 #else  // not A64, compiler too old, or neither dispatch nor an SVE2 baseline
    853 #define HWY_ATTAINABLE_SVE2 0
    854 #endif  // HWY_ATTAINABLE_SVE2
    855 
    856 #if HWY_ARCH_PPC && defined(__ALTIVEC__) && \
    857    (!HWY_COMPILER_CLANG || HWY_BASELINE_PPC8 != 0)
    858
    859 #if (HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10) && \
    860    !defined(HWY_SKIP_NON_BEST_BASELINE)
    861 // On POWER with -m flags, we get compile errors (#1707) for targets older than
    862 // the baseline specified via -m, so only generate the static target and better.
    863 // Note that some Linux distros actually do set POWER9 as the baseline.
    864 // This works by skipping case 3 below, so case 4 is reached.
    865 #define HWY_SKIP_NON_BEST_BASELINE
    866 #endif  // PPC9 or PPC10 baseline and HWY_SKIP_NON_BEST_BASELINE not yet set
    867
    868 #define HWY_ATTAINABLE_PPC (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)
    869
    870 #else  // not PPC, no __ALTIVEC__, or Clang without a PPC8 baseline
    871 #define HWY_ATTAINABLE_PPC 0
    872 #endif  // HWY_ATTAINABLE_PPC
    873 
    874 #if HWY_ARCH_S390X && HWY_BASELINE_Z14 != 0
    875 #define HWY_ATTAINABLE_S390X (HWY_Z14 | HWY_Z15)
    876 #else  // not S390X, or HWY_BASELINE_Z14 is 0
    877 #define HWY_ATTAINABLE_S390X 0
    878 #endif  // HWY_ATTAINABLE_S390X
    879 
    880 #if HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
    881 #define HWY_ATTAINABLE_RISCV HWY_RVV
    882 #else  // no runtime dispatch (or not RISC-V): only what the baseline has
    883 #define HWY_ATTAINABLE_RISCV HWY_BASELINE_RVV
    884 #endif  // HWY_ATTAINABLE_RISCV
    885 
    886 #if HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH
    887 #define HWY_ATTAINABLE_LOONGARCH (HWY_LSX | HWY_LASX)
    888 #else  // no runtime dispatch (or not LoongArch): only what the baseline has
    889 #define HWY_ATTAINABLE_LOONGARCH HWY_BASELINE_LOONGARCH
    890 #endif  // HWY_ATTAINABLE_LOONGARCH
    891 
    892 #ifndef HWY_ATTAINABLE_TARGETS_X86  // allow override
    893 #if HWY_COMPILER_MSVC && defined(HWY_SLOW_MSVC)
    894 // Fewer targets for faster builds: scalar, the static target and AVX2 only.
    895 #define HWY_ATTAINABLE_TARGETS_X86 \
    896  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_STATIC_TARGET | HWY_AVX2)
    897 #else  // !(HWY_COMPILER_MSVC && defined(HWY_SLOW_MSVC))
    898 #define HWY_ATTAINABLE_TARGETS_X86                                    \
    899  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | \
    900              HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL | HWY_AVX3_ZEN4 |     \
    901              HWY_AVX3_SPR | HWY_AVX10_2)
    902 #endif  // HWY_COMPILER_MSVC && defined(HWY_SLOW_MSVC)
    903 #endif  // HWY_ATTAINABLE_TARGETS_X86
    904 
    905 // Attainable means enabled and the compiler allows intrinsics (even when not
    906 // allowed to auto-vectorize). Used by policies 3 and 4 below.
    907 #if HWY_ARCH_X86
    908 #define HWY_ATTAINABLE_TARGETS HWY_ATTAINABLE_TARGETS_X86
    909 #elif HWY_ARCH_ARM
    910 #define HWY_ATTAINABLE_TARGETS                                                 \
    911  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_NEON | HWY_ATTAINABLE_SVE | \
    912              HWY_ATTAINABLE_SVE2)
    913 #elif HWY_ARCH_PPC
    914 #define HWY_ATTAINABLE_TARGETS \
    915  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_PPC)
    916 #elif HWY_ARCH_S390X
    917 #define HWY_ATTAINABLE_TARGETS \
    918  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_S390X)
    919 #elif HWY_ARCH_RISCV
    920 #define HWY_ATTAINABLE_TARGETS \
    921  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_RISCV)
    922 #elif HWY_ARCH_LOONGARCH
    923 #define HWY_ATTAINABLE_TARGETS \
    924  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_LOONGARCH)
    925 #else  // no dedicated case above: only the enabled baseline is attainable
    926 #define HWY_ATTAINABLE_TARGETS (HWY_ENABLED_BASELINE)
    927 #endif  // HWY_ARCH_*
    928 
    929 // 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
    930 #if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
    931 #undef HWY_STATIC_TARGET
    932 #define HWY_STATIC_TARGET HWY_EMU128  // override baseline
    933 #define HWY_TARGETS HWY_EMU128
    934
    935 // 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
    936 // we currently still support it for backwards compatibility.
    937 #elif defined(HWY_COMPILE_ONLY_SCALAR) || \
    938    (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)  // EMU128 fallback
    939 #undef HWY_STATIC_TARGET
    940 #define HWY_STATIC_TARGET HWY_SCALAR  // override baseline
    941 #define HWY_TARGETS HWY_SCALAR
    942
    943 // 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
    944 #elif defined(HWY_COMPILE_ONLY_STATIC)
    945 #define HWY_TARGETS HWY_STATIC_TARGET
    946
    947 // 3) For tests: include all attainable targets (in particular: scalar)
    948 #elif (defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)) && \
    949    !defined(HWY_SKIP_NON_BEST_BASELINE)  // if defined, fall through to 4)
    950 #define HWY_TARGETS HWY_ATTAINABLE_TARGETS
    951
    952 // 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
    953 // excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
    954 // may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
    955 // sets all lower bits (better targets), then we also include the static target.
    956 #else
    957 #define HWY_TARGETS \
    958  (HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
    959
    960 #endif  // target policy
    961 
    962 // HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
    963 // one of the dynamic targets. This also implies HWY_TARGETS != 0 and
    964 // (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0. Skipped if no baseline is enabled.
    965 #if (HWY_TARGETS & HWY_STATIC_TARGET) == 0 && HWY_ENABLED_BASELINE != 0
    966 #error "Logic error: best baseline should be included in dynamic targets"
    967 #endif  // static target missing from HWY_TARGETS
    968 
    969 #endif  // HIGHWAY_HWY_DETECT_TARGETS_H_