targets.cc (25719B)
1 // Copyright 2019 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include "hwy/targets.h" 17 18 #include <stdint.h> 19 #include <stdio.h> 20 21 #include "hwy/base.h" 22 #include "hwy/detect_targets.h" 23 #include "hwy/highway.h" 24 #include "hwy/x86_cpuid.h" 25 26 #if HWY_ARCH_X86 27 #include <xmmintrin.h> 28 29 #elif (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X || HWY_ARCH_RISCV || \ 30 HWY_ARCH_LOONGARCH) && \ 31 HWY_OS_LINUX 32 // sys/auxv.h does not always include asm/hwcap.h, or define HWCAP*, hence we 33 // still include this directly. See #1199. 
#if HWY_HAVE_ASM_HWCAP
#include <asm/hwcap.h>
#endif
#if HWY_HAVE_AUXV
#include <sys/auxv.h>
#endif

#endif  // HWY_ARCH_*

#if HWY_OS_APPLE
#include <sys/sysctl.h>
#include <sys/utsname.h>
#endif  // HWY_OS_APPLE

namespace hwy {

#if HWY_OS_APPLE
// Returns true if the sysctl entry named `feature_name` (e.g.
// "hw.optional.avx512f") exists and has a nonzero integer value. Returns
// false if the sysctlbyname call fails, i.e. unknown keys read as "feature
// absent".
static HWY_INLINE HWY_MAYBE_UNUSED bool HasCpuFeature(
    const char* feature_name) {
  int result = 0;
  size_t len = sizeof(int);
  return (sysctlbyname(feature_name, &result, &len, nullptr, 0) == 0 &&
          result != 0);
}

// Parses a run of decimal digits starting at `ptr` into `parsed_val` and
// advances `ptr` past the digits consumed. Returns false if no digit was
// consumed or the value exceeds 32 bits (in the overflow case `ptr` may
// already have been advanced).
static HWY_INLINE HWY_MAYBE_UNUSED bool ParseU32(const char*& ptr,
                                                 uint32_t& parsed_val) {
  // Accumulate in 64 bits so that a 32-bit overflow is detectable.
  uint64_t parsed_u64 = 0;

  const char* start_ptr = ptr;
  for (char ch; (ch = (*ptr)) != '\0'; ++ptr) {
    // Unsigned subtraction maps any non-digit character to a value > 9.
    unsigned digit = static_cast<unsigned>(static_cast<unsigned char>(ch)) -
                     static_cast<unsigned>(static_cast<unsigned char>('0'));
    if (digit > 9u) {
      break;
    }

    parsed_u64 = (parsed_u64 * 10u) + digit;
    if (parsed_u64 > 0xFFFFFFFFu) {
      return false;
    }
  }

  parsed_val = static_cast<uint32_t>(parsed_u64);
  // Require at least one digit to have been consumed.
  return (ptr != start_ptr);
}

// Returns true if running on macOS 12.2 or later, determined from the Darwin
// kernel version string reported by uname() ("major.minor..."). Returns false
// if uname fails or the release string cannot be parsed.
static HWY_INLINE HWY_MAYBE_UNUSED bool IsMacOs12_2OrLater() {
  utsname uname_buf;
  ZeroBytes(&uname_buf, sizeof(utsname));

  if ((uname(&uname_buf)) != 0) {
    return false;
  }

  const char* ptr = uname_buf.release;
  if (!ptr) {
    return false;
  }

  uint32_t major;
  uint32_t minor;
  if (!ParseU32(ptr, major)) {
    return false;
  }

  // A '.' must separate the major and minor components.
  if (*ptr != '.') {
    return false;
  }

  ++ptr;
  if (!ParseU32(ptr, minor)) {
    return false;
  }

  // We are running on macOS 12.2 or later if the Darwin kernel version is 21.3
  // or later
  return (major > 21 || (major == 21 && minor >= 3));
}
#endif  // HWY_OS_APPLE

#if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH
namespace x86 {

// Returns the lower 32 bits of extended control register 0.
// Requires CPU support for "OSXSAVE" (see below).
static uint32_t ReadXCR0() {
#if HWY_COMPILER_MSVC
  return static_cast<uint32_t>(_xgetbv(0));
#else   // HWY_COMPILER_MSVC
  uint32_t xcr0, xcr0_high;
  const uint32_t index = 0;
  // Raw opcode bytes for XGETBV (reads XCR[ECX] into EDX:EAX), used so that
  // assemblers without the mnemonic still accept this.
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(xcr0), "=d"(xcr0_high)
               : "c"(index));
  return xcr0;
#endif  // HWY_COMPILER_MSVC
}

// Arbitrary bit indices indicating which instruction set extensions are
// supported. Use enum to ensure values are distinct.
enum class FeatureIndex : uint32_t {
  kSSE = 0,
  kSSE2,
  kSSE3,
  kSSSE3,

  kSSE41,
  kSSE42,
  kCLMUL,
  kAES,

  kAVX,
  kAVX2,
  kF16C,
  kFMA,
  kLZCNT,
  kBMI,
  kBMI2,

  kAVX512F,
  kAVX512VL,
  kAVX512CD,
  kAVX512DQ,
  kAVX512BW,
  kAVX512FP16,
  kAVX512BF16,

  kVNNI,
  kVPCLMULQDQ,
  kVBMI,
  kVBMI2,
  kVAES,
  kPOPCNTDQ,
  kBITALG,
  kGFNI,

  kAVX10,
  kAPX,

  kSentinel
};
// All feature indices must fit into a single uint64_t bit array.
static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64,
              "Too many bits for u64");

// Returns a mask with only the bit corresponding to `index` set.
static HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) {
  return 1ull << static_cast<size_t>(index);
}

// Returns bit array of FeatureIndex from CPUID feature flags.
// abcd[0..3] receive EAX, EBX, ECX, EDX of the queried CPUID leaf.
static uint64_t FlagsFromCPUID() {
  uint64_t flags = 0;  // return value
  uint32_t abcd[4];
  // Leaf 0: EAX holds the highest supported standard leaf.
  Cpuid(0, 0, abcd);
  const uint32_t max_level = abcd[0];

  // Standard feature flags
  Cpuid(1, 0, abcd);
  flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0;
  flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0;
  flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0;
  flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0;
  flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0;
  flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0;
  flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0;
  flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0;
  flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0;
  flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0;
  flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0;

  // Extended feature flags (leaf 0x80000001): LZCNT/ABM.
  Cpuid(0x80000001U, 0, abcd);
  flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0;

  // Extended features (leaf 7, only if the CPU reports that leaf).
  if (max_level >= 7) {
    Cpuid(7, 0, abcd);
    flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0;
    flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0;
    flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0;

    flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0;
    flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0;
    flags |= IsBitSet(abcd[1], 28) ? Bit(FeatureIndex::kAVX512CD) : 0;
    flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
    flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;

    flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kVBMI) : 0;
    flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
    flags |= IsBitSet(abcd[2], 8) ? Bit(FeatureIndex::kGFNI) : 0;
    flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
    flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
    flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0;
    flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0;
    flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0;

    flags |= IsBitSet(abcd[3], 23) ? Bit(FeatureIndex::kAVX512FP16) : 0;

    // Leaf 7 sub-leaf 1: BF16, AVX10 and APX indicators.
    Cpuid(7, 1, abcd);
    flags |= IsBitSet(abcd[0], 5) ? Bit(FeatureIndex::kAVX512BF16) : 0;
    flags |= IsBitSet(abcd[3], 19) ? Bit(FeatureIndex::kAVX10) : 0;
    flags |= IsBitSet(abcd[3], 21) ? Bit(FeatureIndex::kAPX) : 0;
  }

  return flags;
}

// Each Highway target requires a 'group' of multiple features/flags.
static constexpr uint64_t kGroupSSE2 =
    Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2);

static constexpr uint64_t kGroupSSSE3 =
    Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3) | kGroupSSE2;

#ifdef HWY_DISABLE_PCLMUL_AES
static constexpr uint64_t kGroupSSE4 =
    Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) | kGroupSSSE3;
#else
static constexpr uint64_t kGroupSSE4 =
    Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
    Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3;
#endif  // HWY_DISABLE_PCLMUL_AES

// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
// [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
// avoiding using and requiring these so AVX2 can still be used.
#ifdef HWY_DISABLE_BMI2_FMA
static constexpr uint64_t kGroupBMI2_FMA = 0;
#else
static constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) |
                                           Bit(FeatureIndex::kBMI2) |
                                           Bit(FeatureIndex::kFMA);
#endif

#ifdef HWY_DISABLE_F16C
static constexpr uint64_t kGroupF16C = 0;
#else
static constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
#endif

static constexpr uint64_t kGroupAVX2 =
    Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) |
    Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4;

static constexpr uint64_t kGroupAVX3 =
    Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) |
    Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) |
    Bit(FeatureIndex::kAVX512CD) | kGroupAVX2;

static constexpr uint64_t kGroupAVX3_DL =
    Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
    Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) |
    Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) |
    Bit(FeatureIndex::kBITALG) | Bit(FeatureIndex::kGFNI) | kGroupAVX3;

static constexpr uint64_t kGroupAVX3_ZEN4 =
    Bit(FeatureIndex::kAVX512BF16) | kGroupAVX3_DL;

static constexpr uint64_t kGroupAVX3_SPR =
    Bit(FeatureIndex::kAVX512FP16) | kGroupAVX3_ZEN4;

static constexpr uint64_t kGroupAVX10 =
    Bit(FeatureIndex::kAVX10) | Bit(FeatureIndex::kAPX) |
    Bit(FeatureIndex::kVPCLMULQDQ) | Bit(FeatureIndex::kVAES) |
    Bit(FeatureIndex::kGFNI) | kGroupAVX2;

// Returns the bitmask of x86 Highway targets supported by this CPU + OS.
static int64_t DetectTargets() {
  int64_t bits = 0;  // return value of supported targets.
  HWY_IF_CONSTEXPR(HWY_ARCH_X86_64) {
    bits |= HWY_SSE2;  // always present in x64
  }

  const uint64_t flags = FlagsFromCPUID();
  // Set target bit(s) if all their group's flags are all set.
  if ((flags & kGroupAVX3_SPR) == kGroupAVX3_SPR) {
    bits |= HWY_AVX3_SPR;
  }
  if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) {
    bits |= HWY_AVX3_DL;
  }
  if ((flags & kGroupAVX3) == kGroupAVX3) {
    bits |= HWY_AVX3;
  }
  if ((flags & kGroupAVX2) == kGroupAVX2) {
    bits |= HWY_AVX2;
  }
  if ((flags & kGroupSSE4) == kGroupSSE4) {
    bits |= HWY_SSE4;
  }
  if ((flags & kGroupSSSE3) == kGroupSSSE3) {
    bits |= HWY_SSSE3;
  }
  // On 32-bit x86, SSE2 is not guaranteed and must be detected.
  HWY_IF_CONSTEXPR(HWY_ARCH_X86_32) {
    if ((flags & kGroupSSE2) == kGroupSSE2) {
      bits |= HWY_SSE2;
    }
  }

  uint32_t abcd[4];

  if ((flags & kGroupAVX10) == kGroupAVX10) {
    // Leaf 0x24 reports the AVX10 version and supported vector lengths.
    Cpuid(0x24, 0, abcd);

    // AVX10 version is in lower 8 bits of abcd[1]
    const uint32_t avx10_ver = abcd[1] & 0xFFu;

    // 512-bit vectors are supported if avx10_ver >= 1 is true and bit 18 of
    // abcd[1] is set
    const bool has_avx10_with_512bit_vectors =
        (avx10_ver >= 1) && IsBitSet(abcd[1], 18);

    if (has_avx10_with_512bit_vectors) {
      // AVX10.1 or later with support for 512-bit vectors implies support for
      // the AVX3/AVX3_DL/AVX3_SPR targets
      bits |= (HWY_AVX3_SPR | HWY_AVX3_DL | HWY_AVX3);

      if (avx10_ver >= 2) {
        // AVX10.2 is supported if avx10_ver >= 2 is true
        bits |= HWY_AVX10_2;
      }
    }
  }

  // Clear AVX2/AVX3 bits if the CPU or OS does not support XSAVE - otherwise,
  // YMM/ZMM registers are not preserved across context switches.

  // The lower 128 bits of XMM0-XMM15 are guaranteed to be preserved across
  // context switches on x86_64

  // The following OS's are known to preserve the lower 128 bits of XMM
  // registers across context switches on x86 CPUs that support SSE (even in
  // 32-bit mode):
  // - Windows 2000 or later
  // - Linux 2.4.0 or later
  // - Mac OS X 10.4 or later
  // - FreeBSD 4.4 or later
  // - NetBSD 1.6 or later
  // - OpenBSD 3.5 or later
  // - UnixWare 7 Release 7.1.1 or later
  // - Solaris 9 4/04 or later

  Cpuid(1, 0, abcd);
  const bool has_xsave = IsBitSet(abcd[2], 26);
  const bool has_osxsave = IsBitSet(abcd[2], 27);
  // Mask covering HWY_AVX2 and every "better" (numerically lower) target bit.
  constexpr int64_t min_avx2 = HWY_AVX2 | (HWY_AVX2 - 1);

  if (has_xsave && has_osxsave) {
#if HWY_OS_APPLE
    // On macOS, check for AVX3 XSAVE support by checking that we are running on
    // macOS 12.2 or later and HasCpuFeature("hw.optional.avx512f") returns true

    // There is a bug in macOS 12.1 or earlier that can cause ZMM16-ZMM31, the
    // upper 256 bits of the ZMM registers, and K0-K7 (the AVX512 mask
    // registers) to not be properly preserved across a context switch on
    // macOS 12.1 or earlier.

    // This bug on macOS 12.1 or earlier on x86_64 CPU's with AVX3 support is
    // described at
    // https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259,
    // https://github.com/golang/go/issues/49233, and
    // https://github.com/simdutf/simdutf/pull/236.

    // In addition to the bug that is there on macOS 12.1 or earlier, bits 5, 6,
    // and 7 can be set to 0 on x86_64 CPUs with AVX3 support on macOS until
    // the first AVX512 instruction is executed as macOS only preserves
    // ZMM16-ZMM31, the upper 256 bits of the ZMM registers, and K0-K7 across a
    // context switch on threads that have executed an AVX512 instruction.

    // Checking for AVX3 XSAVE support on macOS using
    // HasCpuFeature("hw.optional.avx512f") avoids false negative results
    // on x86_64 CPU's that have AVX3 support.
    const bool have_avx3_xsave_support =
        IsMacOs12_2OrLater() && HasCpuFeature("hw.optional.avx512f");
#endif

    const uint32_t xcr0 = ReadXCR0();
    // Mask covering HWY_AVX3 and every "better" target bit.
    constexpr int64_t min_avx3 = HWY_AVX3 | (HWY_AVX3 - 1);
    // XMM/YMM
    if (!IsBitSet(xcr0, 1) || !IsBitSet(xcr0, 2)) {
      // Clear the AVX2/AVX3 bits if XMM/YMM XSAVE is not enabled
      bits &= ~min_avx2;
    }

#if !HWY_OS_APPLE
    // On OS's other than macOS, check for AVX3 XSAVE support by checking that
    // bits 5, 6, and 7 of XCR0 are set.
    const bool have_avx3_xsave_support =
        IsBitSet(xcr0, 5) && IsBitSet(xcr0, 6) && IsBitSet(xcr0, 7);
#endif

    // opmask, ZMM lo/hi
    if (!have_avx3_xsave_support) {
      bits &= ~min_avx3;
    }
  } else {  // !has_xsave || !has_osxsave
    // Clear the AVX2/AVX3 bits if the CPU or OS does not support XSAVE
    bits &= ~min_avx2;
  }

  // This is mainly to work around the slow Zen4 CompressStore. It's unclear
  // whether subsequent AMD models will be affected; assume yes.
  if ((bits & HWY_AVX3_DL) && (flags & kGroupAVX3_ZEN4) == kGroupAVX3_ZEN4 &&
      IsAMD()) {
    bits |= HWY_AVX3_ZEN4;
  }

  return bits;
}

}  // namespace x86
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
namespace arm {

// Older hwcap.h may lack this constant; bit 13 of AT_HWCAP2 on aarch64.
#ifndef HWCAP2_I8MM
#define HWCAP2_I8MM (1 << 13)
#endif

#if HWY_ARCH_ARM_A64 && !HWY_OS_APPLE &&     \
    (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
    ((HWY_TARGETS & HWY_ALL_SVE) != 0)
HWY_PUSH_ATTRIBUTES("+sve")
// Returns additional fixed-vector-length SVE targets (HWY_SVE_256 /
// HWY_SVE2_128) based on the runtime SVE vector length in bytes.
static int64_t DetectAdditionalSveTargets(int64_t detected_targets) {
  uint64_t sve_vec_len;

  // Use inline assembly instead of svcntb_pat(SV_ALL) as GCC or Clang might
  // possibly optimize a svcntb_pat(SV_ALL) call to a constant if the
  // -msve-vector-bits option is specified
  asm("cntb %0" : "=r"(sve_vec_len)::);

  // 32 bytes => 256-bit SVE; 16 bytes with SVE2 => 128-bit SVE2.
  return ((sve_vec_len == 32)
              ? HWY_SVE_256
              : (((detected_targets & HWY_SVE2) != 0 && sve_vec_len == 16)
                     ? HWY_SVE2_128
                     : 0));
}
HWY_POP_ATTRIBUTES
#endif

// Returns the bitmask of Arm Highway targets supported by this CPU + OS.
static int64_t DetectTargets() {
  int64_t bits = 0;  // return value of supported targets.

  using CapBits = unsigned long;  // NOLINT
#if HWY_OS_APPLE
  // No getauxval on macOS; features are queried via sysctl below.
  const CapBits hw = 0UL;
#else
  // For Android, this has been supported since API 20 (2014).
  const CapBits hw = getauxval(AT_HWCAP);
#endif
  (void)hw;

#if HWY_ARCH_ARM_A64
  bits |= HWY_NEON_WITHOUT_AES;  // aarch64 always has NEON and VFPv4..

#if HWY_OS_APPLE
  if (HasCpuFeature("hw.optional.arm.FEAT_AES")) {
    bits |= HWY_NEON;

    // Some macOS versions report AdvSIMD_HPFPCvt under a different key.
    // Check both known variants for compatibility.
    // NOTE(review): the first clause accepts either the legacy
    // "hw.optional.AdvSIMD_HPFPCvt" key or the newer "hw.optional.arm." form;
    // confirm both spellings are intended (the other checks use only the
    // "hw.optional.arm.FEAT_*" form).
    if ((HasCpuFeature("hw.optional.AdvSIMD_HPFPCvt") ||
         HasCpuFeature("hw.optional.arm.AdvSIMD_HPFPCvt")) &&
        HasCpuFeature("hw.optional.arm.FEAT_DotProd") &&
        HasCpuFeature("hw.optional.arm.FEAT_BF16") &&
        HasCpuFeature("hw.optional.arm.FEAT_I8MM")) {
      bits |= HWY_NEON_BF16;
    }
  }
#else   // !HWY_OS_APPLE
  // .. but not necessarily AES, which is required for HWY_NEON.
#if defined(HWCAP_AES)
  if (hw & HWCAP_AES) {
    bits |= HWY_NEON;

#if defined(HWCAP_ASIMDHP) && defined(HWCAP_ASIMDDP) && defined(HWCAP2_BF16)
    // NEON_BF16 requires FP16 + DotProd (AT_HWCAP) and BF16 + I8MM
    // (AT_HWCAP2), all together.
    const CapBits hw2 = getauxval(AT_HWCAP2);
    constexpr CapBits kGroupF16Dot = HWCAP_ASIMDHP | HWCAP_ASIMDDP;
    constexpr CapBits kGroupBF16 = HWCAP2_BF16 | HWCAP2_I8MM;
    if ((hw & kGroupF16Dot) == kGroupF16Dot &&
        (hw2 & kGroupBF16) == kGroupBF16) {
      bits |= HWY_NEON_BF16;
    }
#endif  // HWCAP_ASIMDHP && HWCAP_ASIMDDP && HWCAP2_BF16
  }
#endif  // HWCAP_AES

#if defined(HWCAP_SVE)
  if (hw & HWCAP_SVE) {
    bits |= HWY_SVE;
  }
#endif

// Older hwcap.h may lack these AT_HWCAP2 bits; define the kernel values.
#ifndef HWCAP2_SVE2
#define HWCAP2_SVE2 (1 << 1)
#endif
#ifndef HWCAP2_SVEAES
#define HWCAP2_SVEAES (1 << 2)
#endif
#ifndef HWCAP2_SVEI8MM
#define HWCAP2_SVEI8MM (1 << 9)
#endif
  // HWY_SVE2 additionally requires the SVE AES and (SVE) I8MM extensions.
  constexpr CapBits kGroupSVE2 =
      HWCAP2_SVE2 | HWCAP2_SVEAES | HWCAP2_SVEI8MM | HWCAP2_I8MM;
  const CapBits hw2 = getauxval(AT_HWCAP2);
  if ((hw2 & kGroupSVE2) == kGroupSVE2) {
    bits |= HWY_SVE2;
  }

#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
    ((HWY_TARGETS & HWY_ALL_SVE) != 0)
  // Refine generic SVE/SVE2 into fixed-length targets where applicable.
  if ((bits & HWY_ALL_SVE) != 0) {
    bits |= DetectAdditionalSveTargets(bits);
  }
#endif  // (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) &&
        // ((HWY_TARGETS & HWY_ALL_SVE) != 0)

#endif  // HWY_OS_APPLE

#else  // !HWY_ARCH_ARM_A64

// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported.
#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
  // 32-bit Arm: require both NEON and VFPv4.
  if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) {
    bits |= HWY_NEON_WITHOUT_AES;
  }
#endif

  // aarch32 would check getauxval(AT_HWCAP2) & HWCAP2_AES, but we do not yet
  // support that platform, and Armv7 lacks AES entirely. Because HWY_NEON
  // requires native AES instructions, we do not enable that target here.

#endif  // HWY_ARCH_ARM_A64
  return bits;
}
}  // namespace arm
#elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH
namespace ppc {

// Fallback definitions of the kernel hwcap bits for older headers.
#ifndef PPC_FEATURE_HAS_ALTIVEC
#define PPC_FEATURE_HAS_ALTIVEC 0x10000000
#endif

#ifndef PPC_FEATURE_HAS_VSX
#define PPC_FEATURE_HAS_VSX 0x00000080
#endif

#ifndef PPC_FEATURE2_ARCH_2_07
#define PPC_FEATURE2_ARCH_2_07 0x80000000
#endif

#ifndef PPC_FEATURE2_VEC_CRYPTO
#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
#endif

#ifndef PPC_FEATURE2_ARCH_3_00
#define PPC_FEATURE2_ARCH_3_00 0x00800000
#endif

#ifndef PPC_FEATURE2_ARCH_3_1
#define PPC_FEATURE2_ARCH_3_1 0x00040000
#endif

using CapBits = unsigned long;  // NOLINT

// For AT_HWCAP, the others are for AT_HWCAP2
static constexpr CapBits kGroupVSX =
    PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX;

#if defined(HWY_DISABLE_PPC8_CRYPTO)
static constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07;
#else
static constexpr CapBits kGroupPPC8 =
    PPC_FEATURE2_ARCH_2_07 | PPC_FEATURE2_VEC_CRYPTO;
#endif
// Each newer PPC target is a superset of the previous one's requirements.
static constexpr CapBits kGroupPPC9 = kGroupPPC8 | PPC_FEATURE2_ARCH_3_00;
static constexpr CapBits kGroupPPC10 = kGroupPPC9 | PPC_FEATURE2_ARCH_3_1;

// Returns the bitmask of PPC Highway targets supported by this CPU + OS.
static int64_t DetectTargets() {
  int64_t bits = 0;  // return value of supported targets.
613 614 #if defined(AT_HWCAP) && defined(AT_HWCAP2) 615 const CapBits hw = getauxval(AT_HWCAP); 616 617 if ((hw & kGroupVSX) == kGroupVSX) { 618 const CapBits hw2 = getauxval(AT_HWCAP2); 619 if ((hw2 & kGroupPPC8) == kGroupPPC8) { 620 bits |= HWY_PPC8; 621 } 622 if ((hw2 & kGroupPPC9) == kGroupPPC9) { 623 bits |= HWY_PPC9; 624 } 625 if ((hw2 & kGroupPPC10) == kGroupPPC10) { 626 bits |= HWY_PPC10; 627 } 628 } // VSX 629 #endif // defined(AT_HWCAP) && defined(AT_HWCAP2) 630 631 return bits; 632 } 633 } // namespace ppc 634 #elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH 635 namespace s390x { 636 637 #ifndef HWCAP_S390_VX 638 #define HWCAP_S390_VX 2048 639 #endif 640 641 #ifndef HWCAP_S390_VXE 642 #define HWCAP_S390_VXE 8192 643 #endif 644 645 #ifndef HWCAP_S390_VXRS_EXT2 646 #define HWCAP_S390_VXRS_EXT2 32768 647 #endif 648 649 using CapBits = unsigned long; // NOLINT 650 651 static constexpr CapBits kGroupZ14 = HWCAP_S390_VX | HWCAP_S390_VXE; 652 static constexpr CapBits kGroupZ15 = 653 HWCAP_S390_VX | HWCAP_S390_VXE | HWCAP_S390_VXRS_EXT2; 654 655 static int64_t DetectTargets() { 656 int64_t bits = 0; 657 658 #if defined(AT_HWCAP) 659 const CapBits hw = getauxval(AT_HWCAP); 660 661 if ((hw & kGroupZ14) == kGroupZ14) { 662 bits |= HWY_Z14; 663 } 664 665 if ((hw & kGroupZ15) == kGroupZ15) { 666 bits |= HWY_Z15; 667 } 668 #endif 669 670 return bits; 671 } 672 } // namespace s390x 673 #elif HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH 674 namespace rvv { 675 676 #ifndef HWCAP_RVV 677 #define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A')) 678 #endif 679 680 using CapBits = unsigned long; // NOLINT 681 682 static int64_t DetectTargets() { 683 int64_t bits = 0; 684 685 const CapBits hw = getauxval(AT_HWCAP); 686 687 if ((hw & COMPAT_HWCAP_ISA_V) == COMPAT_HWCAP_ISA_V) { 688 size_t e8m1_vec_len; 689 #if HWY_ARCH_RISCV_64 690 int64_t vtype_reg_val; 691 #else 692 int32_t vtype_reg_val; 693 #endif 694 695 // Check that a vuint8m1_t vector is at least 16 bytes and that tail 696 // 
agnostic and mask agnostic mode are supported 697 asm volatile( 698 // Avoid compiler error on GCC or Clang if -march=rv64gcv1p0 or 699 // -march=rv32gcv1p0 option is not specified on the command line 700 ".option push\n\t" 701 ".option arch, +v\n\t" 702 "vsetvli %0, zero, e8, m1, ta, ma\n\t" 703 "csrr %1, vtype\n\t" 704 ".option pop" 705 : "=r"(e8m1_vec_len), "=r"(vtype_reg_val)); 706 707 // The RVV target is supported if the VILL bit of VTYPE (the MSB bit of 708 // VTYPE) is not set and the length of a vuint8m1_t vector is at least 16 709 // bytes 710 if (vtype_reg_val >= 0 && e8m1_vec_len >= 16) { 711 bits |= HWY_RVV; 712 } 713 } 714 715 return bits; 716 } 717 } // namespace rvv 718 #elif HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH 719 720 namespace loongarch { 721 722 #ifndef LA_HWCAP_LSX 723 #define LA_HWCAP_LSX (1u << 4) 724 #endif 725 #ifndef LA_HWCAP_LASX 726 #define LA_HWCAP_LASX (1u << 5) 727 #endif 728 729 using CapBits = unsigned long; // NOLINT 730 731 static int64_t DetectTargets() { 732 int64_t bits = 0; 733 const CapBits hw = getauxval(AT_HWCAP); 734 if (hw & LA_HWCAP_LSX) bits |= HWY_LSX; 735 if (hw & LA_HWCAP_LASX) bits |= HWY_LASX; 736 return bits; 737 } 738 } // namespace loongarch 739 #endif // HWY_ARCH_* 740 741 // Returns targets supported by the CPU, independently of DisableTargets. 742 // Factored out of SupportedTargets to make its structure more obvious. Note 743 // that x86 CPUID may take several hundred cycles. 744 static int64_t DetectTargets() { 745 // Apps will use only one of these (the default is EMU128), but compile flags 746 // for this TU may differ from that of the app, so allow both. 
  int64_t bits = HWY_SCALAR | HWY_EMU128;

#if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH
  bits |= x86::DetectTargets();
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
  bits |= arm::DetectTargets();
#elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH
  bits |= ppc::DetectTargets();
#elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH
  bits |= s390x::DetectTargets();
#elif HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
  bits |= rvv::DetectTargets();
#elif HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH
  bits |= loongarch::DetectTargets();

#else
  // TODO(janwas): detect support for WASM.
  // This file is typically compiled without HWY_IS_TEST, but targets_test has
  // it set, and will expect all of its HWY_TARGETS (= all attainable) to be
  // supported.
  bits |= HWY_ENABLED_BASELINE;
#endif  // HWY_ARCH_*

  // Warn if the CPU is missing features the build was compiled to require.
  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
    const uint64_t bits_u = static_cast<uint64_t>(bits);
    const uint64_t enabled = static_cast<uint64_t>(HWY_ENABLED_BASELINE);
    HWY_WARN("CPU supports 0x%08x%08x, software requires 0x%08x%08x\n",
             static_cast<uint32_t>(bits_u >> 32),
             static_cast<uint32_t>(bits_u & 0xFFFFFFFF),
             static_cast<uint32_t>(enabled >> 32),
             static_cast<uint32_t>(enabled & 0xFFFFFFFF));
  }

  return bits;
}

// When running tests, this value can be set to the mocked supported targets
// mask. Only written to from a single thread before the test starts.
static int64_t supported_targets_for_test_ = 0;

// Mask of targets disabled at runtime with DisableTargets.
static int64_t supported_mask_ = LimitsMax<int64_t>();

// Removes `disabled_targets` from the set SupportedTargets may return.
HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) {
  supported_mask_ = static_cast<int64_t>(~disabled_targets);
  // This will take effect on the next call to SupportedTargets, which is
  // called right before GetChosenTarget::Update. However, calling Update here
  // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want
  // to check in tests. We instead de-initialize such that the next
  // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache.
  GetChosenTarget().DeInit();
}

// Overrides detection with `targets` (0 disables the mock). Test-only.
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
  supported_targets_for_test_ = targets;
  GetChosenTarget().DeInit();  // see comment above
}

// Returns the (mockable, maskable) set of supported targets; never 0.
HWY_DLLEXPORT int64_t SupportedTargets() {
  int64_t targets = supported_targets_for_test_;
  if (HWY_LIKELY(targets == 0)) {
    // Mock not active. Re-detect instead of caching just in case we're on a
    // heterogeneous ISA (also requires some app support to pin threads). This
    // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to
    // DisableTargets or SetSupportedTargetsForTest.
    targets = DetectTargets();

    // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion,
    // first set up ChosenTarget. No need to Update() again afterwards with the
    // final targets - that will be done by a caller of this function.
    GetChosenTarget().Update(targets);
  }

  targets &= supported_mask_;
  // If masking removed everything, fall back to the static target, which is
  // always compiled in.
  return targets == 0 ? HWY_STATIC_TARGET : targets;
}

// Returns the process-wide ChosenTarget singleton (function-local static for
// thread-safe lazy initialization).
HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
  static ChosenTarget chosen_target;
  return chosen_target;
}

}  // namespace hwy