tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

targets.cc (25719B)


      1 // Copyright 2019 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #include "hwy/targets.h"
     17 
     18 #include <stdint.h>
     19 #include <stdio.h>
     20 
     21 #include "hwy/base.h"
     22 #include "hwy/detect_targets.h"
     23 #include "hwy/highway.h"
     24 #include "hwy/x86_cpuid.h"
     25 
     26 #if HWY_ARCH_X86
     27 #include <xmmintrin.h>
     28 
     29 #elif (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X || HWY_ARCH_RISCV || \
     30       HWY_ARCH_LOONGARCH) &&                                              \
     31    HWY_OS_LINUX
     32 // sys/auxv.h does not always include asm/hwcap.h, or define HWCAP*, hence we
     33 // still include this directly. See #1199.
     34 #if HWY_HAVE_ASM_HWCAP
     35 #include <asm/hwcap.h>
     36 #endif
     37 #if HWY_HAVE_AUXV
     38 #include <sys/auxv.h>
     39 #endif
     40 
     41 #endif  // HWY_ARCH_*
     42 
     43 #if HWY_OS_APPLE
     44 #include <sys/sysctl.h>
     45 #include <sys/utsname.h>
     46 #endif  // HWY_OS_APPLE
     47 
     48 namespace hwy {
     49 
     50 #if HWY_OS_APPLE
     51 static HWY_INLINE HWY_MAYBE_UNUSED bool HasCpuFeature(
     52    const char* feature_name) {
     53  int result = 0;
     54  size_t len = sizeof(int);
     55  return (sysctlbyname(feature_name, &result, &len, nullptr, 0) == 0 &&
     56          result != 0);
     57 }
     58 
     59 static HWY_INLINE HWY_MAYBE_UNUSED bool ParseU32(const char*& ptr,
     60                                                 uint32_t& parsed_val) {
     61  uint64_t parsed_u64 = 0;
     62 
     63  const char* start_ptr = ptr;
     64  for (char ch; (ch = (*ptr)) != '\0'; ++ptr) {
     65    unsigned digit = static_cast<unsigned>(static_cast<unsigned char>(ch)) -
     66                     static_cast<unsigned>(static_cast<unsigned char>('0'));
     67    if (digit > 9u) {
     68      break;
     69    }
     70 
     71    parsed_u64 = (parsed_u64 * 10u) + digit;
     72    if (parsed_u64 > 0xFFFFFFFFu) {
     73      return false;
     74    }
     75  }
     76 
     77  parsed_val = static_cast<uint32_t>(parsed_u64);
     78  return (ptr != start_ptr);
     79 }
     80 
     81 static HWY_INLINE HWY_MAYBE_UNUSED bool IsMacOs12_2OrLater() {
     82  utsname uname_buf;
     83  ZeroBytes(&uname_buf, sizeof(utsname));
     84 
     85  if ((uname(&uname_buf)) != 0) {
     86    return false;
     87  }
     88 
     89  const char* ptr = uname_buf.release;
     90  if (!ptr) {
     91    return false;
     92  }
     93 
     94  uint32_t major;
     95  uint32_t minor;
     96  if (!ParseU32(ptr, major)) {
     97    return false;
     98  }
     99 
    100  if (*ptr != '.') {
    101    return false;
    102  }
    103 
    104  ++ptr;
    105  if (!ParseU32(ptr, minor)) {
    106    return false;
    107  }
    108 
    109  // We are running on macOS 12.2 or later if the Darwin kernel version is 21.3
    110  // or later
    111  return (major > 21 || (major == 21 && minor >= 3));
    112 }
    113 #endif  // HWY_OS_APPLE
    114 
    115 #if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH
    116 namespace x86 {
    117 
    118 // Returns the lower 32 bits of extended control register 0.
    119 // Requires CPU support for "OSXSAVE" (see below).
    120 static uint32_t ReadXCR0() {
    121 #if HWY_COMPILER_MSVC
    122  return static_cast<uint32_t>(_xgetbv(0));
    123 #else   // HWY_COMPILER_MSVC
    124  uint32_t xcr0, xcr0_high;
    125  const uint32_t index = 0;
    126  asm volatile(".byte 0x0F, 0x01, 0xD0"
    127               : "=a"(xcr0), "=d"(xcr0_high)
    128               : "c"(index));
    129  return xcr0;
    130 #endif  // HWY_COMPILER_MSVC
    131 }
    132 
// Arbitrary bit indices indicating which instruction set extensions are
// supported. Use enum to ensure values are distinct. NOTE: enumerator order
// determines bit values, which the kGroup* constants below depend on via
// Bit(); do not reorder.
enum class FeatureIndex : uint32_t {
  // Baseline 128-bit SSE/SSE2.
  kSSE = 0,
  kSSE2,
  kSSE3,
  kSSSE3,

  // SSE4 group, plus carry-less multiply and AES-NI.
  kSSE41,
  kSSE42,
  kCLMUL,
  kAES,

  // AVX/AVX2 era features.
  kAVX,
  kAVX2,
  kF16C,
  kFMA,
  kLZCNT,
  kBMI,
  kBMI2,

  // AVX-512 foundation and extensions.
  kAVX512F,
  kAVX512VL,
  kAVX512CD,
  kAVX512DQ,
  kAVX512BW,
  kAVX512FP16,
  kAVX512BF16,

  // AVX-512 "DL"-era extensions.
  kVNNI,
  kVPCLMULQDQ,
  kVBMI,
  kVBMI2,
  kVAES,
  kPOPCNTDQ,
  kBITALG,
  kGFNI,

  kAVX10,
  kAPX,

  kSentinel  // count of entries; must remain last
};
// All feature bits must fit into the uint64_t returned by FlagsFromCPUID.
static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64,
              "Too many bits for u64");
    178 
    179 static HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) {
    180  return 1ull << static_cast<size_t>(index);
    181 }
    182 
// Returns bit array of FeatureIndex from CPUID feature flags.
// abcd[] receives the four CPUID output registers; the bit positions below
// follow the CPUID leaf layouts (abcd[1]/abcd[2]/abcd[3] presumably EBX/ECX/
// EDX per x86_cpuid.h — verify there).
static uint64_t FlagsFromCPUID() {
  uint64_t flags = 0;  // return value
  uint32_t abcd[4];
  // Leaf 0: abcd[0] is the highest supported standard leaf.
  Cpuid(0, 0, abcd);
  const uint32_t max_level = abcd[0];

  // Standard feature flags (leaf 1)
  Cpuid(1, 0, abcd);
  flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0;
  flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0;
  flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0;
  flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0;
  flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0;
  flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0;
  flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0;
  flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0;
  flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0;
  flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0;
  flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0;

  // Extended feature flags (leaf 0x80000001). NOTE(review): queried without
  // first checking the maximum extended leaf — confirm all supported CPUs
  // tolerate this.
  Cpuid(0x80000001U, 0, abcd);
  flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0;

  // Extended features (leaf 7), only if the CPU reports that leaf.
  if (max_level >= 7) {
    Cpuid(7, 0, abcd);
    flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0;
    flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0;
    flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0;

    flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0;
    flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0;
    flags |= IsBitSet(abcd[1], 28) ? Bit(FeatureIndex::kAVX512CD) : 0;
    flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
    flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;

    flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kVBMI) : 0;
    flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
    flags |= IsBitSet(abcd[2], 8) ? Bit(FeatureIndex::kGFNI) : 0;
    flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
    flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
    flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0;
    flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0;
    flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0;

    flags |= IsBitSet(abcd[3], 23) ? Bit(FeatureIndex::kAVX512FP16) : 0;

    // Leaf 7, sub-leaf 1: newer extensions (BF16, AVX10, APX).
    Cpuid(7, 1, abcd);
    flags |= IsBitSet(abcd[0], 5) ? Bit(FeatureIndex::kAVX512BF16) : 0;
    flags |= IsBitSet(abcd[3], 19) ? Bit(FeatureIndex::kAVX10) : 0;
    flags |= IsBitSet(abcd[3], 21) ? Bit(FeatureIndex::kAPX) : 0;
  }

  return flags;
}
    240 
// Each Highway target requires a 'group' of multiple features/flags.
// A target is only enabled when ALL bits of its group are present; groups
// nest (each includes its predecessor) so e.g. AVX2 implies SSE4.
static constexpr uint64_t kGroupSSE2 =
    Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2);

static constexpr uint64_t kGroupSSSE3 =
    Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3) | kGroupSSE2;

#ifdef HWY_DISABLE_PCLMUL_AES
// Allow HWY_SSE4 on CPUs/VMs lacking CLMUL/AES.
static constexpr uint64_t kGroupSSE4 =
    Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) | kGroupSSSE3;
#else
static constexpr uint64_t kGroupSSE4 =
    Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
    Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3;
#endif  // HWY_DISABLE_PCLMUL_AES

// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
// [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
// avoiding using and requiring these so AVX2 can still be used.
#ifdef HWY_DISABLE_BMI2_FMA
static constexpr uint64_t kGroupBMI2_FMA = 0;
#else
static constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) |
                                           Bit(FeatureIndex::kBMI2) |
                                           Bit(FeatureIndex::kFMA);
#endif

#ifdef HWY_DISABLE_F16C
static constexpr uint64_t kGroupF16C = 0;
#else
static constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
#endif

static constexpr uint64_t kGroupAVX2 =
    Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) |
    Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4;

static constexpr uint64_t kGroupAVX3 =
    Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) |
    Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) |
    Bit(FeatureIndex::kAVX512CD) | kGroupAVX2;

static constexpr uint64_t kGroupAVX3_DL =
    Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
    Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) |
    Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) |
    Bit(FeatureIndex::kBITALG) | Bit(FeatureIndex::kGFNI) | kGroupAVX3;

static constexpr uint64_t kGroupAVX3_ZEN4 =
    Bit(FeatureIndex::kAVX512BF16) | kGroupAVX3_DL;

static constexpr uint64_t kGroupAVX3_SPR =
    Bit(FeatureIndex::kAVX512FP16) | kGroupAVX3_ZEN4;

// AVX10 builds on AVX2 (not AVX3): the AVX-512 flags are replaced by the
// AVX10 version/vector-length checks in DetectTargets below.
static constexpr uint64_t kGroupAVX10 =
    Bit(FeatureIndex::kAVX10) | Bit(FeatureIndex::kAPX) |
    Bit(FeatureIndex::kVPCLMULQDQ) | Bit(FeatureIndex::kVAES) |
    Bit(FeatureIndex::kGFNI) | kGroupAVX2;
    300 
// Returns the HWY_* target bits supported by this x86 CPU and OS.
// Combines CPUID feature groups with XCR0/XSAVE checks so that a target is
// only reported when the OS also preserves the required register state
// across context switches.
static int64_t DetectTargets() {
  int64_t bits = 0;  // return value of supported targets.
  HWY_IF_CONSTEXPR(HWY_ARCH_X86_64) {
    bits |= HWY_SSE2;  // always present in x64
  }

  const uint64_t flags = FlagsFromCPUID();
  // Set target bit(s) if all their group's flags are all set.
  if ((flags & kGroupAVX3_SPR) == kGroupAVX3_SPR) {
    bits |= HWY_AVX3_SPR;
  }
  if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) {
    bits |= HWY_AVX3_DL;
  }
  if ((flags & kGroupAVX3) == kGroupAVX3) {
    bits |= HWY_AVX3;
  }
  if ((flags & kGroupAVX2) == kGroupAVX2) {
    bits |= HWY_AVX2;
  }
  if ((flags & kGroupSSE4) == kGroupSSE4) {
    bits |= HWY_SSE4;
  }
  if ((flags & kGroupSSSE3) == kGroupSSSE3) {
    bits |= HWY_SSSE3;
  }
  // On 32-bit x86, SSE2 must be detected rather than assumed.
  HWY_IF_CONSTEXPR(HWY_ARCH_X86_32) {
    if ((flags & kGroupSSE2) == kGroupSSE2) {
      bits |= HWY_SSE2;
    }
  }

  uint32_t abcd[4];

  // AVX10 CPUs report vector width/version via leaf 0x24 instead of the
  // AVX-512 feature flags.
  if ((flags & kGroupAVX10) == kGroupAVX10) {
    Cpuid(0x24, 0, abcd);

    // AVX10 version is in lower 8 bits of abcd[1]
    const uint32_t avx10_ver = abcd[1] & 0xFFu;

    // 512-bit vectors are supported if avx10_ver >= 1 is true and bit 18 of
    // abcd[1] is set
    const bool has_avx10_with_512bit_vectors =
        (avx10_ver >= 1) && IsBitSet(abcd[1], 18);

    if (has_avx10_with_512bit_vectors) {
      // AVX10.1 or later with support for 512-bit vectors implies support for
      // the AVX3/AVX3_DL/AVX3_SPR targets
      bits |= (HWY_AVX3_SPR | HWY_AVX3_DL | HWY_AVX3);

      if (avx10_ver >= 2) {
        // AVX10.2 is supported if avx10_ver >= 2 is true
        bits |= HWY_AVX10_2;
      }
    }
  }

  // Clear AVX2/AVX3 bits if the CPU or OS does not support XSAVE - otherwise,
  // YMM/ZMM registers are not preserved across context switches.

  // The lower 128 bits of XMM0-XMM15 are guaranteed to be preserved across
  // context switches on x86_64

  // The following OS's are known to preserve the lower 128 bits of XMM
  // registers across context switches on x86 CPUs that support SSE (even in
  // 32-bit mode):
  // - Windows 2000 or later
  // - Linux 2.4.0 or later
  // - Mac OS X 10.4 or later
  // - FreeBSD 4.4 or later
  // - NetBSD 1.6 or later
  // - OpenBSD 3.5 or later
  // - UnixWare 7 Release 7.1.1 or later
  // - Solaris 9 4/04 or later

  Cpuid(1, 0, abcd);
  const bool has_xsave = IsBitSet(abcd[2], 26);
  const bool has_osxsave = IsBitSet(abcd[2], 27);
  // Mask covering HWY_AVX2 and all numerically lower target bits (which
  // presumably includes the AVX3 targets — verify ordering in
  // detect_targets.h).
  constexpr int64_t min_avx2 = HWY_AVX2 | (HWY_AVX2 - 1);

  if (has_xsave && has_osxsave) {
#if HWY_OS_APPLE
    // On macOS, check for AVX3 XSAVE support by checking that we are running on
    // macOS 12.2 or later and HasCpuFeature("hw.optional.avx512f") returns true

    // There is a bug in macOS 12.1 or earlier that can cause ZMM16-ZMM31, the
    // upper 256 bits of the ZMM registers, and K0-K7 (the AVX512 mask
    // registers) to not be properly preserved across a context switch on
    // macOS 12.1 or earlier.

    // This bug on macOS 12.1 or earlier on x86_64 CPU's with AVX3 support is
    // described at
    // https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259,
    // https://github.com/golang/go/issues/49233, and
    // https://github.com/simdutf/simdutf/pull/236.

    // In addition to the bug that is there on macOS 12.1 or earlier, bits 5, 6,
    // and 7 can be set to 0 on x86_64 CPUs with AVX3 support on macOS until
    // the first AVX512 instruction is executed as macOS only preserves
    // ZMM16-ZMM31, the upper 256 bits of the ZMM registers, and K0-K7 across a
    // context switch on threads that have executed an AVX512 instruction.

    // Checking for AVX3 XSAVE support on macOS using
    // HasCpuFeature("hw.optional.avx512f") avoids false negative results
    // on x86_64 CPU's that have AVX3 support.
    const bool have_avx3_xsave_support =
        IsMacOs12_2OrLater() && HasCpuFeature("hw.optional.avx512f");
#endif

    const uint32_t xcr0 = ReadXCR0();
    constexpr int64_t min_avx3 = HWY_AVX3 | (HWY_AVX3 - 1);
    // XMM/YMM
    if (!IsBitSet(xcr0, 1) || !IsBitSet(xcr0, 2)) {
      // Clear the AVX2/AVX3 bits if XMM/YMM XSAVE is not enabled
      bits &= ~min_avx2;
    }

#if !HWY_OS_APPLE
    // On OS's other than macOS, check for AVX3 XSAVE support by checking that
    // bits 5, 6, and 7 of XCR0 are set.
    const bool have_avx3_xsave_support =
        IsBitSet(xcr0, 5) && IsBitSet(xcr0, 6) && IsBitSet(xcr0, 7);
#endif

    // opmask, ZMM lo/hi
    if (!have_avx3_xsave_support) {
      bits &= ~min_avx3;
    }
  } else {  // !has_xsave || !has_osxsave
    // Clear the AVX2/AVX3 bits if the CPU or OS does not support XSAVE
    bits &= ~min_avx2;
  }

  // This is mainly to work around the slow Zen4 CompressStore. It's unclear
  // whether subsequent AMD models will be affected; assume yes.
  if ((bits & HWY_AVX3_DL) && (flags & kGroupAVX3_ZEN4) == kGroupAVX3_ZEN4 &&
      IsAMD()) {
    bits |= HWY_AVX3_ZEN4;
  }

  return bits;
}
    443 
    444 }  // namespace x86
    445 #elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
    446 namespace arm {
    447 
// Fallback for older hwcap.h headers that do not define this bit.
#ifndef HWCAP2_I8MM
#define HWCAP2_I8MM (1 << 13)
#endif
    451 
    452 #if HWY_ARCH_ARM_A64 && !HWY_OS_APPLE &&        \
    453    (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
    454    ((HWY_TARGETS & HWY_ALL_SVE) != 0)
    455 HWY_PUSH_ATTRIBUTES("+sve")
    456 static int64_t DetectAdditionalSveTargets(int64_t detected_targets) {
    457  uint64_t sve_vec_len;
    458 
    459  // Use inline assembly instead of svcntb_pat(SV_ALL) as GCC or Clang might
    460  // possibly optimize a svcntb_pat(SV_ALL) call to a constant if the
    461  // -msve-vector-bits option is specified
    462  asm("cntb %0" : "=r"(sve_vec_len)::);
    463 
    464  return ((sve_vec_len == 32)
    465              ? HWY_SVE_256
    466              : (((detected_targets & HWY_SVE2) != 0 && sve_vec_len == 16)
    467                     ? HWY_SVE2_128
    468                     : 0));
    469 }
    470 HWY_POP_ATTRIBUTES
    471 #endif
    472 
// Returns the HWY_* target bits supported by this Arm CPU.
// On Linux/Android this reads AT_HWCAP/AT_HWCAP2; on Apple it queries
// sysctl feature keys instead (getauxval is unavailable there).
static int64_t DetectTargets() {
  int64_t bits = 0;  // return value of supported targets.

  using CapBits = unsigned long;  // NOLINT
#if HWY_OS_APPLE
  // Unused on Apple; sysctl is queried below instead.
  const CapBits hw = 0UL;
#else
  // For Android, this has been supported since API 20 (2014).
  const CapBits hw = getauxval(AT_HWCAP);
#endif
  (void)hw;

#if HWY_ARCH_ARM_A64
  bits |= HWY_NEON_WITHOUT_AES;  // aarch64 always has NEON and VFPv4..

#if HWY_OS_APPLE
  // HWY_NEON requires the AES extension.
  if (HasCpuFeature("hw.optional.arm.FEAT_AES")) {
    bits |= HWY_NEON;

    // Some macOS versions report AdvSIMD_HPFPCvt under a different key.
    // Check both known variants for compatibility.
    if ((HasCpuFeature("hw.optional.AdvSIMD_HPFPCvt") ||
         HasCpuFeature("hw.optional.arm.AdvSIMD_HPFPCvt")) &&
        HasCpuFeature("hw.optional.arm.FEAT_DotProd") &&
        HasCpuFeature("hw.optional.arm.FEAT_BF16") &&
        HasCpuFeature("hw.optional.arm.FEAT_I8MM")) {
      bits |= HWY_NEON_BF16;
    }
  }
#else  // !HWY_OS_APPLE
  // .. but not necessarily AES, which is required for HWY_NEON.
#if defined(HWCAP_AES)
  if (hw & HWCAP_AES) {
    bits |= HWY_NEON;

#if defined(HWCAP_ASIMDHP) && defined(HWCAP_ASIMDDP) && defined(HWCAP2_BF16)
    // NEON_BF16 additionally requires FP16 conversion, dot product (HWCAP)
    // and BF16 + I8MM (HWCAP2).
    const CapBits hw2 = getauxval(AT_HWCAP2);
    constexpr CapBits kGroupF16Dot = HWCAP_ASIMDHP | HWCAP_ASIMDDP;
    constexpr CapBits kGroupBF16 = HWCAP2_BF16 | HWCAP2_I8MM;
    if ((hw & kGroupF16Dot) == kGroupF16Dot &&
        (hw2 & kGroupBF16) == kGroupBF16) {
      bits |= HWY_NEON_BF16;
    }
#endif  // HWCAP_ASIMDHP && HWCAP_ASIMDDP && HWCAP2_BF16
  }
#endif  // HWCAP_AES

#if defined(HWCAP_SVE)
  if (hw & HWCAP_SVE) {
    bits |= HWY_SVE;
  }
#endif

// Fallbacks for older headers lacking these HWCAP2 bits.
#ifndef HWCAP2_SVE2
#define HWCAP2_SVE2 (1 << 1)
#endif
#ifndef HWCAP2_SVEAES
#define HWCAP2_SVEAES (1 << 2)
#endif
#ifndef HWCAP2_SVEI8MM
#define HWCAP2_SVEI8MM (1 << 9)
#endif
  // HWY_SVE2 requires SVE2 plus the AES and I8MM extensions.
  constexpr CapBits kGroupSVE2 =
      HWCAP2_SVE2 | HWCAP2_SVEAES | HWCAP2_SVEI8MM | HWCAP2_I8MM;
  const CapBits hw2 = getauxval(AT_HWCAP2);
  if ((hw2 & kGroupSVE2) == kGroupSVE2) {
    bits |= HWY_SVE2;
  }

#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
    ((HWY_TARGETS & HWY_ALL_SVE) != 0)
  // Refine to fixed-length SVE targets based on the runtime vector length.
  if ((bits & HWY_ALL_SVE) != 0) {
    bits |= DetectAdditionalSveTargets(bits);
  }
#endif  // (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) &&
        // ((HWY_TARGETS & HWY_ALL_SVE) != 0)

#endif  // HWY_OS_APPLE

#else  // !HWY_ARCH_ARM_A64

// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported.
#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
  if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) {
    bits |= HWY_NEON_WITHOUT_AES;
  }
#endif

  // aarch32 would check getauxval(AT_HWCAP2) & HWCAP2_AES, but we do not yet
  // support that platform, and Armv7 lacks AES entirely. Because HWY_NEON
  // requires native AES instructions, we do not enable that target here.

#endif  // HWY_ARCH_ARM_A64
  return bits;
}
    568 }  // namespace arm
    569 #elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH
    570 namespace ppc {
    571 
// Fallbacks for older kernel headers that do not define these feature bits.
#ifndef PPC_FEATURE_HAS_ALTIVEC
#define PPC_FEATURE_HAS_ALTIVEC 0x10000000
#endif

#ifndef PPC_FEATURE_HAS_VSX
#define PPC_FEATURE_HAS_VSX 0x00000080
#endif

#ifndef PPC_FEATURE2_ARCH_2_07
#define PPC_FEATURE2_ARCH_2_07 0x80000000
#endif

#ifndef PPC_FEATURE2_VEC_CRYPTO
#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
#endif

#ifndef PPC_FEATURE2_ARCH_3_00
#define PPC_FEATURE2_ARCH_3_00 0x00800000
#endif

#ifndef PPC_FEATURE2_ARCH_3_1
#define PPC_FEATURE2_ARCH_3_1 0x00040000
#endif

using CapBits = unsigned long;  // NOLINT

// For AT_HWCAP, the others are for AT_HWCAP2
static constexpr CapBits kGroupVSX =
    PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX;

#if defined(HWY_DISABLE_PPC8_CRYPTO)
static constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07;
#else
static constexpr CapBits kGroupPPC8 =
    PPC_FEATURE2_ARCH_2_07 | PPC_FEATURE2_VEC_CRYPTO;
#endif
// PPC9 = ISA 3.0, PPC10 = ISA 3.1 (per the HWCAP2 macro names); each group
// includes the previous one.
static constexpr CapBits kGroupPPC9 = kGroupPPC8 | PPC_FEATURE2_ARCH_3_00;
static constexpr CapBits kGroupPPC10 = kGroupPPC9 | PPC_FEATURE2_ARCH_3_1;
    610 
    611 static int64_t DetectTargets() {
    612  int64_t bits = 0;  // return value of supported targets.
    613 
    614 #if defined(AT_HWCAP) && defined(AT_HWCAP2)
    615  const CapBits hw = getauxval(AT_HWCAP);
    616 
    617  if ((hw & kGroupVSX) == kGroupVSX) {
    618    const CapBits hw2 = getauxval(AT_HWCAP2);
    619    if ((hw2 & kGroupPPC8) == kGroupPPC8) {
    620      bits |= HWY_PPC8;
    621    }
    622    if ((hw2 & kGroupPPC9) == kGroupPPC9) {
    623      bits |= HWY_PPC9;
    624    }
    625    if ((hw2 & kGroupPPC10) == kGroupPPC10) {
    626      bits |= HWY_PPC10;
    627    }
    628  }  // VSX
    629 #endif  // defined(AT_HWCAP) && defined(AT_HWCAP2)
    630 
    631  return bits;
    632 }
    633 }  // namespace ppc
    634 #elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH
    635 namespace s390x {
    636 
// Fallbacks for older kernel headers that do not define these HWCAP bits.
#ifndef HWCAP_S390_VX
#define HWCAP_S390_VX 2048
#endif

#ifndef HWCAP_S390_VXE
#define HWCAP_S390_VXE 8192
#endif

#ifndef HWCAP_S390_VXRS_EXT2
#define HWCAP_S390_VXRS_EXT2 32768
#endif

using CapBits = unsigned long;  // NOLINT

// Z14 requires the vector facility plus VXE; Z15 additionally requires
// VXRS_EXT2 (facility names per the HWCAP macros above).
static constexpr CapBits kGroupZ14 = HWCAP_S390_VX | HWCAP_S390_VXE;
static constexpr CapBits kGroupZ15 =
    HWCAP_S390_VX | HWCAP_S390_VXE | HWCAP_S390_VXRS_EXT2;
    654 
    655 static int64_t DetectTargets() {
    656  int64_t bits = 0;
    657 
    658 #if defined(AT_HWCAP)
    659  const CapBits hw = getauxval(AT_HWCAP);
    660 
    661  if ((hw & kGroupZ14) == kGroupZ14) {
    662    bits |= HWY_Z14;
    663  }
    664 
    665  if ((hw & kGroupZ15) == kGroupZ15) {
    666    bits |= HWY_Z15;
    667  }
    668 #endif
    669 
    670  return bits;
    671 }
    672 }  // namespace s390x
    673 #elif HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
    674 namespace rvv {
    675 
// Linux reports single-letter RISC-V ISA extensions as AT_HWCAP bits;
// 'V' is the vector extension.
// NOTE(review): if a header ever defined HWCAP_RVV, COMPAT_HWCAP_ISA_V below
// would remain undefined and DetectTargets would not compile — the guard
// looks misnamed; verify against the kernel headers in use.
#ifndef HWCAP_RVV
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
#endif

using CapBits = unsigned long;  // NOLINT
    681 
// Returns the HWY_* target bits supported by this RISC-V CPU. Requires the
// 'V' extension in AT_HWCAP, then probes the actual vector configuration
// with VSETVLI before enabling HWY_RVV.
static int64_t DetectTargets() {
  int64_t bits = 0;  // return value of supported targets.

  const CapBits hw = getauxval(AT_HWCAP);

  if ((hw & COMPAT_HWCAP_ISA_V) == COMPAT_HWCAP_ISA_V) {
    size_t e8m1_vec_len;
    // vtype is XLEN-wide; match the register width of the build.
#if HWY_ARCH_RISCV_64
    int64_t vtype_reg_val;
#else
    int32_t vtype_reg_val;
#endif

    // Check that a vuint8m1_t vector is at least 16 bytes and that tail
    // agnostic and mask agnostic mode are supported
    asm volatile(
        // Avoid compiler error on GCC or Clang if -march=rv64gcv1p0 or
        // -march=rv32gcv1p0 option is not specified on the command line
        ".option push\n\t"
        ".option arch, +v\n\t"
        "vsetvli %0, zero, e8, m1, ta, ma\n\t"
        "csrr %1, vtype\n\t"
        ".option pop"
        : "=r"(e8m1_vec_len), "=r"(vtype_reg_val));

    // The RVV target is supported if the VILL bit of VTYPE (the MSB bit of
    // VTYPE) is not set and the length of a vuint8m1_t vector is at least 16
    // bytes. (vtype_reg_val >= 0 tests the sign bit, i.e. VILL.)
    if (vtype_reg_val >= 0 && e8m1_vec_len >= 16) {
      bits |= HWY_RVV;
    }
  }

  return bits;
}
    717 }  // namespace rvv
    718 #elif HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH
    719 
    720 namespace loongarch {
    721 
// Fallbacks for older kernel headers that do not define these HWCAP bits
// (LSX = 128-bit SIMD, LASX = 256-bit SIMD).
#ifndef LA_HWCAP_LSX
#define LA_HWCAP_LSX (1u << 4)
#endif
#ifndef LA_HWCAP_LASX
#define LA_HWCAP_LASX (1u << 5)
#endif

using CapBits = unsigned long;  // NOLINT
    730 
    731 static int64_t DetectTargets() {
    732  int64_t bits = 0;
    733  const CapBits hw = getauxval(AT_HWCAP);
    734  if (hw & LA_HWCAP_LSX) bits |= HWY_LSX;
    735  if (hw & LA_HWCAP_LASX) bits |= HWY_LASX;
    736  return bits;
    737 }
    738 }  // namespace loongarch
    739 #endif  // HWY_ARCH_*
    740 
// Returns targets supported by the CPU, independently of DisableTargets.
// Factored out of SupportedTargets to make its structure more obvious. Note
// that x86 CPUID may take several hundred cycles.
static int64_t DetectTargets() {
  // Apps will use only one of these (the default is EMU128), but compile flags
  // for this TU may differ from that of the app, so allow both.
  int64_t bits = HWY_SCALAR | HWY_EMU128;

  // Delegate to the per-architecture detector when runtime dispatch is
  // available; otherwise fall back to the compile-time baseline below.
#if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH
  bits |= x86::DetectTargets();
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
  bits |= arm::DetectTargets();
#elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH
  bits |= ppc::DetectTargets();
#elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH
  bits |= s390x::DetectTargets();
#elif HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
  bits |= rvv::DetectTargets();
#elif HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH
  bits |= loongarch::DetectTargets();

#else
  // TODO(janwas): detect support for WASM.
  // This file is typically compiled without HWY_IS_TEST, but targets_test has
  // it set, and will expect all of its HWY_TARGETS (= all attainable) to be
  // supported.
  bits |= HWY_ENABLED_BASELINE;
#endif  // HWY_ARCH_*

  // Warn if the build's baseline requires targets the CPU lacks — presumably
  // the binary may then execute unsupported instructions; confirm intended
  // handling with callers.
  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
    const uint64_t bits_u = static_cast<uint64_t>(bits);
    const uint64_t enabled = static_cast<uint64_t>(HWY_ENABLED_BASELINE);
    HWY_WARN("CPU supports 0x%08x%08x, software requires 0x%08x%08x\n",
             static_cast<uint32_t>(bits_u >> 32),
             static_cast<uint32_t>(bits_u & 0xFFFFFFFF),
             static_cast<uint32_t>(enabled >> 32),
             static_cast<uint32_t>(enabled & 0xFFFFFFFF));
  }

  return bits;
}
    782 
// When running tests, this value can be set to the mocked supported targets
// mask. Only written to from a single thread before the test starts.
// 0 means "no mock active" (see SupportedTargets).
static int64_t supported_targets_for_test_ = 0;

// Mask of targets disabled at runtime with DisableTargets.
// All bits set initially, i.e. nothing disabled.
static int64_t supported_mask_ = LimitsMax<int64_t>();
    789 
    790 HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) {
    791  supported_mask_ = static_cast<int64_t>(~disabled_targets);
    792  // This will take effect on the next call to SupportedTargets, which is
    793  // called right before GetChosenTarget::Update. However, calling Update here
    794  // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want
    795  // to check in tests. We instead de-initialize such that the next
    796  // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache.
    797  GetChosenTarget().DeInit();
    798 }
    799 
    800 HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
    801  supported_targets_for_test_ = targets;
    802  GetChosenTarget().DeInit();  // see comment above
    803 }
    804 
    805 HWY_DLLEXPORT int64_t SupportedTargets() {
    806  int64_t targets = supported_targets_for_test_;
    807  if (HWY_LIKELY(targets == 0)) {
    808    // Mock not active. Re-detect instead of caching just in case we're on a
    809    // heterogeneous ISA (also requires some app support to pin threads). This
    810    // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to
    811    // DisableTargets or SetSupportedTargetsForTest.
    812    targets = DetectTargets();
    813 
    814    // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion,
    815    // first set up ChosenTarget. No need to Update() again afterwards with the
    816    // final targets - that will be done by a caller of this function.
    817    GetChosenTarget().Update(targets);
    818  }
    819 
    820  targets &= supported_mask_;
    821  return targets == 0 ? HWY_STATIC_TARGET : targets;
    822 }
    823 
    824 HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
    825  static ChosenTarget chosen_target;
    826  return chosen_target;
    827 }
    828 
    829 }  // namespace hwy