topology.cc (42877B)
1 // Copyright 2024 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include "hwy/contrib/thread_pool/topology.h" 17 18 #include <ctype.h> // isspace 19 #include <stddef.h> 20 #include <stdint.h> 21 #include <stdio.h> 22 #include <string.h> // strchr 23 24 #include <array> 25 #include <map> 26 #include <string> 27 #include <vector> 28 29 #include "hwy/base.h" // HWY_OS_WIN, HWY_WARN 30 31 #if HWY_OS_APPLE 32 #include <sys/sysctl.h> 33 34 #include "hwy/aligned_allocator.h" // HWY_ALIGNMENT 35 #endif 36 37 #if HWY_OS_WIN 38 #ifndef NOMINMAX 39 #define NOMINMAX 40 #endif 41 #ifndef WIN32_LEAN_AND_MEAN 42 #define WIN32_LEAN_AND_MEAN 43 #endif 44 #ifndef _WIN32_WINNT 45 #define _WIN32_WINNT 0x0601 // Windows 7 / Server 2008 46 #endif 47 #include <windows.h> 48 #endif // HWY_OS_WIN 49 50 #if HWY_OS_LINUX || HWY_OS_FREEBSD 51 #ifndef _GNU_SOURCE 52 #define _GNU_SOURCE 53 #endif 54 #include <errno.h> 55 #include <fcntl.h> 56 #include <pthread.h> 57 #include <sched.h> 58 #include <sys/stat.h> 59 #include <sys/syscall.h> 60 #include <sys/types.h> 61 #include <unistd.h> // sysconf 62 #endif // HWY_OS_LINUX || HWY_OS_FREEBSD 63 64 #if HWY_OS_FREEBSD 65 #include <sys/param.h> 66 // After param.h / types.h. 
#include <sys/cpuset.h>
#endif  // HWY_OS_FREEBSD

#if HWY_ARCH_WASM
#include <emscripten/threading.h>
#endif

namespace hwy {

// Returns whether threads can actually run concurrently; only WASM builds
// may lack threading support.
HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport() {
#if HWY_ARCH_WASM
  return emscripten_has_threading_support() != 0;
#else
  return true;
#endif
}

namespace {

// Returns `whole / part`, with a check that `part` evenly divides `whole`,
// which implies the result is exact. Aborts otherwise.
HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) {
  HWY_ASSERT(part != 0);
  const size_t div = whole / part;
  const size_t mul = div * part;
  if (mul != whole) {
    HWY_ABORT("%zu / %zu = %zu; *%zu = %zu\n", whole, part, div, part, mul);
  }
  return div;
}

#if HWY_OS_WIN

using SLPI = SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX;

// Calls `func(info)` for each variable-size SLPI record of relationship type
// `rel` reported by the OS. Returns false if the query is unsupported/fails.
template <class Func>
bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) {
  // Get required buffer size. The first call is expected to fail with
  // ERROR_INSUFFICIENT_BUFFER and report the size via `buf_bytes`.
  DWORD buf_bytes = 0;
  HWY_ASSERT(!GetLogicalProcessorInformationEx(rel, nullptr, &buf_bytes));
  // Observed when `rel` is not supported:
  if (HWY_UNLIKELY(buf_bytes == 0 && GetLastError() == ERROR_GEN_FAILURE)) {
    // These two relationships are known to be missing on older systems, so
    // only warn for the others.
    if (rel != RelationNumaNodeEx && rel != RelationProcessorDie) {
      HWY_WARN("Unexpected err %lx for GLPI relationship %d\n", GetLastError(),
               static_cast<int>(rel));
    }
    return false;
  }
  HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER);
  // Note: `buf_bytes` may be less than `sizeof(SLPI)`, which has padding.
  // `calloc` zero-initializes the `Reserved` field, part of which has been
  // repurposed into `GroupCount` in SDKs, 10.0.22000.0 or possibly earlier.
  uint8_t* buf = static_cast<uint8_t*>(calloc(1, buf_bytes));
  HWY_ASSERT(buf);

  // Fill the buffer.
  SLPI* info = reinterpret_cast<SLPI*>(buf);
  if (HWY_UNLIKELY(!GetLogicalProcessorInformationEx(rel, info, &buf_bytes))) {
    free(buf);
    return false;
  }

  // Iterate over each SLPI. `sizeof(SLPI)` is unreliable, see above; each
  // record carries its own `Size` field instead.
  uint8_t* pos = buf;
  while (pos < buf + buf_bytes) {
    info = reinterpret_cast<SLPI*>(pos);
    HWY_ASSERT(rel == RelationAll || info->Relationship == rel);
    func(*info);
    pos += info->Size;
  }
  // After the loop, `pos` should land exactly on the end of the buffer.
  if (pos != buf + buf_bytes) {
    HWY_WARN("unexpected pos %p, end %p, buf_bytes %lu, sizeof(SLPI) %zu\n",
             pos, buf + buf_bytes, buf_bytes, sizeof(SLPI));
  }

  free(buf);
  return true;
}

// Returns the total number of set bits across `num_groups` affinity masks.
// NOTE(review): copies sizeof(bits)=8 bytes from `Mask`, but KAFFINITY is
// pointer-sized (4 bytes on 32-bit Windows), so on 32-bit this would also
// read the adjacent `Group` field — confirm this path is 64-bit only.
size_t NumBits(size_t num_groups, const GROUP_AFFINITY* affinity) {
  size_t total_bits = 0;
  for (size_t i = 0; i < num_groups; ++i) {
    size_t bits = 0;
    hwy::CopyBytes<sizeof(bits)>(&affinity[i].Mask, &bits);
    total_bits += hwy::PopCount(bits);
  }
  return total_bits;
}

// Calls `func(lp, lps)` for each index `lp` in the set, after ensuring that
// `lp < lps.size()`. `line` is for debugging via Warn().
template <class Func>
void ForeachBit(size_t num_groups, const GROUP_AFFINITY* affinity,
                std::vector<Topology::LP>& lps, int line, const Func& func) {
  for (size_t group = 0; group < num_groups; ++group) {
    size_t bits = 0;
    hwy::CopyBytes<sizeof(bits)>(&affinity[group].Mask, &bits);
    while (bits != 0) {
      size_t lp = Num0BitsBelowLS1Bit_Nonzero64(bits);
      bits &= bits - 1;  // clear LSB
      // Out-of-range bits have been observed (e.g. in VMs); clamp rather
      // than index out of bounds.
      if (HWY_UNLIKELY(lp >= lps.size())) {
        Warn(__FILE__, line, "Clamping lp %zu to lps.size() %zu, groups %zu\n",
             lp, lps.size(), num_groups);
        lp = lps.size() - 1;
      }
      func(lp, lps);
    }
  }
}

#elif HWY_OS_APPLE

// Returns whether sysctlbyname() succeeded; if so, writes `val / div` to
// `out`, otherwise sets `err`.
181 template <typename T> 182 bool Sysctl(const char* name, size_t div, int& err, T* out) { 183 size_t val = 0; 184 size_t size = sizeof(val); 185 // Last two arguments are for updating the value, which we do not want. 186 const int ret = sysctlbyname(name, &val, &size, nullptr, 0); 187 if (HWY_UNLIKELY(ret != 0)) { 188 // Do not print warnings because some `name` are expected to fail. 189 err = ret; 190 return false; 191 } 192 *out = static_cast<T>(DivByFactor(val, div)); 193 return true; 194 } 195 196 #endif // HWY_OS_* 197 198 } // namespace 199 200 HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() { 201 size_t total_lps = 0; 202 #if HWY_ARCH_WASM 203 const int num_cores = emscripten_num_logical_cores(); 204 if (num_cores > 0) total_lps = static_cast<size_t>(num_cores); 205 #elif HWY_OS_WIN 206 // If there are multiple groups, this should return them all, rather than 207 // just the first 64, but VMs report less. 208 (void)ForEachSLPI(RelationProcessorCore, [&total_lps](const SLPI& info) { 209 const PROCESSOR_RELATIONSHIP& p = info.Processor; 210 total_lps += NumBits(p.GroupCount, p.GroupMask); 211 }); 212 #elif HWY_OS_LINUX 213 // Only check "online" because sysfs entries such as topology are missing for 214 // offline CPUs, which will cause `DetectPackages` to fail. 215 const long ret = sysconf(_SC_NPROCESSORS_ONLN); // NOLINT(runtime/int) 216 if (ret < 0) { 217 HWY_WARN("Unexpected _SC_NPROCESSORS_CONF = %d\n", static_cast<int>(ret)); 218 } else { 219 total_lps = static_cast<size_t>(ret); 220 } 221 #elif HWY_OS_APPLE 222 int err; 223 // Only report P processors. 224 if (!Sysctl("hw.perflevel0.logicalcpu", 1, err, &total_lps)) { 225 total_lps = 0; 226 } 227 #endif 228 229 if (HWY_UNLIKELY(total_lps == 0)) { // Failed to detect. 230 HWY_WARN( 231 "Unknown TotalLogicalProcessors, assuming 1. 
" 232 "HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n" 233 "HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n", 234 HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, HWY_ARCH_X86, 235 HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, HWY_ARCH_S390X); 236 return 1; 237 } 238 239 // Warn that we are clamping. 240 if (HWY_UNLIKELY(total_lps > kMaxLogicalProcessors)) { 241 HWY_WARN("OS reports %zu processors but clamping to %zu\n", total_lps, 242 kMaxLogicalProcessors); 243 total_lps = kMaxLogicalProcessors; 244 } 245 246 return total_lps; 247 } 248 249 // ------------------------------ Affinity 250 251 #if HWY_OS_LINUX || HWY_OS_FREEBSD 252 namespace { 253 254 #if HWY_OS_LINUX 255 using CpuSet = cpu_set_t; 256 #else 257 using CpuSet = cpuset_t; 258 #endif 259 260 // Helper functions reduce the number of #if in GetThreadAffinity. 261 int GetAffinity(CpuSet* set) { 262 // To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD. 263 #if defined(__ANDROID__) && __ANDROID_API__ < 12 264 return syscall(__NR_sched_getaffinity, 0, sizeof(CpuSet), set); 265 #elif HWY_OS_FREEBSD 266 return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet), 267 set); 268 #else // normal Linux 269 return sched_getaffinity(0, sizeof(CpuSet), set); 270 #endif 271 } 272 273 int SetAffinity(CpuSet* set) { 274 // To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD. 
275 #if defined(__ANDROID__) && __ANDROID_API__ < 12 276 return syscall(__NR_sched_setaffinity, 0, sizeof(CpuSet), set); 277 #elif HWY_OS_FREEBSD 278 return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet), 279 set); 280 #else // normal Linux 281 return sched_setaffinity(0, sizeof(CpuSet), set); 282 #endif 283 } 284 285 bool IsSet(size_t lp, const CpuSet* set) { 286 #if HWY_COMPILER_GCC_ACTUAL 287 // Workaround for GCC compiler warning with CPU_ISSET macro 288 HWY_DIAGNOSTICS(push) 289 HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion") 290 #endif 291 const int is_set = CPU_ISSET(static_cast<int>(lp), set); 292 #if HWY_COMPILER_GCC_ACTUAL 293 HWY_DIAGNOSTICS(pop) 294 #endif 295 return is_set != 0; 296 } 297 298 void Set(size_t lp, CpuSet* set) { 299 #if HWY_COMPILER_GCC_ACTUAL 300 // Workaround for GCC compiler warning with CPU_SET macro 301 HWY_DIAGNOSTICS(push) 302 HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion") 303 #endif 304 CPU_SET(static_cast<int>(lp), set); 305 #if HWY_COMPILER_GCC_ACTUAL 306 HWY_DIAGNOSTICS(pop) 307 #endif 308 } 309 310 } // namespace 311 #endif // HWY_OS_LINUX || HWY_OS_FREEBSD 312 313 HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps) { 314 #if HWY_OS_WIN 315 // Only support the first 64 because WINE does not support processor groups. 316 const HANDLE hThread = GetCurrentThread(); 317 const DWORD_PTR prev = SetThreadAffinityMask(hThread, ~DWORD_PTR(0)); 318 if (!prev) return false; 319 (void)SetThreadAffinityMask(hThread, prev); 320 lps = LogicalProcessorSet(); // clear all 321 lps.SetNonzeroBitsFrom64(prev); 322 return true; 323 #elif HWY_OS_LINUX || HWY_OS_FREEBSD 324 CpuSet set; 325 CPU_ZERO(&set); 326 const int err = GetAffinity(&set); 327 if (err != 0) return false; 328 for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) { 329 if (IsSet(lp, &set)) lps.Set(lp); 330 } 331 return true; 332 #else 333 // For HWY_OS_APPLE, affinity is not supported. 
Do not even set lp=0 to force 334 // callers to handle this case. 335 (void)lps; 336 return false; 337 #endif 338 } 339 340 HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps) { 341 #if HWY_OS_WIN 342 const HANDLE hThread = GetCurrentThread(); 343 const DWORD_PTR prev = SetThreadAffinityMask(hThread, lps.Get64()); 344 return prev != 0; 345 #elif HWY_OS_LINUX || HWY_OS_FREEBSD 346 CpuSet set; 347 CPU_ZERO(&set); 348 lps.Foreach([&set](size_t lp) { Set(lp, &set); }); 349 const int err = SetAffinity(&set); 350 if (err != 0) return false; 351 return true; 352 #else 353 // Apple THREAD_AFFINITY_POLICY is only an (often ignored) hint. 354 (void)lps; 355 return false; 356 #endif 357 } 358 359 namespace { 360 361 struct PackageSizes { 362 size_t num_clusters; 363 size_t num_cores; 364 }; 365 366 #if HWY_OS_LINUX 367 368 class File { 369 public: 370 explicit File(const char* path) { 371 for (;;) { 372 fd_ = open(path, O_RDONLY); 373 if (fd_ > 0) return; // success 374 if (errno == EINTR) continue; // signal: retry 375 if (errno == ENOENT) return; // not found, give up 376 HWY_WARN("Unexpected error opening %s: %d\n", path, errno); 377 return; // unknown error, give up 378 } 379 } 380 381 ~File() { 382 if (fd_ > 0) { 383 for (;;) { 384 const int ret = close(fd_); 385 if (ret == 0) break; // success 386 if (errno == EINTR) continue; // signal: retry 387 HWY_WARN("Unexpected error closing file: %d\n", errno); 388 return; // unknown error, ignore 389 } 390 } 391 } 392 393 // Returns number of bytes read or 0 on failure. 394 size_t Read(char* buf200) const { 395 if (fd_ < 0) return 0; 396 size_t pos = 0; 397 for (;;) { 398 // read instead of `pread`, which might not work for sysfs. 
399 const auto bytes_read = read(fd_, buf200 + pos, 200 - pos); 400 if (bytes_read == 0) { // EOF: done 401 buf200[pos++] = '\0'; 402 return pos; 403 } 404 if (bytes_read == -1) { 405 if (errno == EINTR) continue; // signal: retry 406 HWY_WARN("Unexpected error reading file: %d\n", errno); 407 return 0; 408 } 409 pos += static_cast<size_t>(bytes_read); 410 HWY_ASSERT(pos <= 200); 411 } 412 } 413 414 private: 415 int fd_; 416 }; 417 418 // Returns bytes read, or 0 on failure. 419 size_t ReadSysfs(const char* format, size_t lp, char* buf200) { 420 char path[200]; 421 const int bytes_written = snprintf(path, sizeof(path), format, lp); 422 HWY_ASSERT(0 < bytes_written && 423 bytes_written < static_cast<int>(sizeof(path) - 1)); 424 425 const File file(path); 426 return file.Read(buf200); 427 } 428 429 // Interprets [str + pos, str + end) as base-10 ASCII. Stops when any non-digit 430 // is found, or at end. Returns false if no digits found. 431 bool ParseDigits(const char* str, const size_t end, size_t& pos, size_t* out) { 432 HWY_ASSERT(pos <= end); 433 // 9 digits cannot overflow even 32-bit size_t. 434 const size_t stop = pos + 9; 435 *out = 0; 436 for (; pos < HWY_MIN(end, stop); ++pos) { 437 const int c = str[pos]; 438 if (c < '0' || c > '9') break; 439 *out *= 10; 440 *out += static_cast<size_t>(c - '0'); 441 } 442 if (pos == 0) { // No digits found 443 *out = 0; 444 return false; 445 } 446 return true; 447 } 448 449 // Number, plus optional K or M suffix, plus terminator. 
bool ParseNumberWithOptionalSuffix(const char* str, size_t len, size_t* out) {
  size_t pos = 0;
  if (!ParseDigits(str, len, pos, out)) return false;
  // 'K' and 'M' scale by 2^10 and 2^20 respectively (sysfs cache sizes).
  if (str[pos] == 'K') {
    *out <<= 10;
    ++pos;
  }
  if (str[pos] == 'M') {
    *out <<= 20;
    ++pos;
  }
  if (str[pos] != '\0' && str[pos] != '\n') {
    HWY_ABORT("Expected [suffix] terminator at %zu %s\n", pos, str);
  }
  return true;
}

// Reads the sysfs file selected by `format`/`lp` and parses it as a number
// with optional K/M suffix. Returns false on read or parse failure.
bool ReadNumberWithOptionalSuffix(const char* format, size_t lp, size_t* out) {
  char buf200[200];
  const size_t pos = ReadSysfs(format, lp, buf200);
  if (pos == 0) return false;
  return ParseNumberWithOptionalSuffix(buf200, pos, out);
}

// sysfs path templates; %zu is the LP (or node) index.
const char* kPackage =
    "/sys/devices/system/cpu/cpu%zu/topology/physical_package_id";
const char* kCluster = "/sys/devices/system/cpu/cpu%zu/cache/index3/id";
const char* kCore = "/sys/devices/system/cpu/cpu%zu/topology/core_id";
const char* kL2Size = "/sys/devices/system/cpu/cpu%zu/cache/index2/size";
const char* kL3Size = "/sys/devices/system/cpu/cpu%zu/cache/index3/size";
const char* kNode = "/sys/devices/system/node/node%zu/cpulist";

// sysfs values can be arbitrarily large, so store in a map and replace with
// indices in order of appearance.
class Remapper {
 public:
  // Returns false on error, or sets `out_index` to the index of the sysfs
  // value selected by `format` and `lp`.
  template <typename T>
  bool operator()(const char* format, size_t lp, T* HWY_RESTRICT out_index) {
    size_t opaque;
    if (!ReadNumberWithOptionalSuffix(format, lp, &opaque)) return false;

    const auto ib = indices_.insert({opaque, num_});
    num_ += ib.second;                      // increment if inserted
    const size_t index = ib.first->second;  // new or existing
    HWY_ASSERT(index < num_);
    HWY_ASSERT(index < hwy::LimitsMax<T>());
    *out_index = static_cast<T>(index);
    return true;
  }

  // Number of distinct values seen so far.
  size_t Num() const { return num_; }

 private:
  std::map<size_t, size_t> indices_;
  size_t num_ = 0;
};

// For internal use by `DetectPackages`.
struct PerPackage {
  Remapper clusters;
  Remapper cores;
  // We rely on this zero-init and increment it below.
  uint8_t smt_per_core[kMaxLogicalProcessors] = {0};
};

// Initializes `lps` and returns a PackageSizes vector (empty on failure)
// indicating the number of clusters and cores per package.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
  std::vector<PackageSizes> empty;

  // First pass: assign a dense package index to every LP.
  Remapper packages;
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    if (!packages(kPackage, lp, &lps[lp].package)) {
      HWY_WARN("Failed to read sysfs package for LP %zu\n", lp);
      return empty;
    }
  }
  std::vector<PerPackage> per_package(packages.Num());
  HWY_ASSERT(!per_package.empty());

  // Second pass: per-package cluster/core indices and SMT IDs.
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    PerPackage& pp = per_package[lps[lp].package];
    // Not a failure: some CPUs lack a (shared) L3 cache.
    if (!pp.clusters(kCluster, lp, &lps[lp].cluster)) {
      lps[lp].cluster = 0;
    }

    if (!pp.cores(kCore, lp, &lps[lp].core)) {
      HWY_WARN("Failed to read sysfs core for LP %zu\n", lp);
      return empty;
    }

    // SMT ID is how many LP we have already seen assigned to the same core.
    HWY_ASSERT(lps[lp].core < kMaxLogicalProcessors);
    lps[lp].smt = pp.smt_per_core[lps[lp].core]++;
    HWY_ASSERT(lps[lp].smt < 16);
  }

  std::vector<PackageSizes> package_sizes(per_package.size());
  for (size_t p = 0; p < package_sizes.size(); ++p) {
    // Was zero if the package has no shared L3, see above.
    package_sizes[p].num_clusters = HWY_MAX(1, per_package[p].clusters.Num());
    package_sizes[p].num_cores = per_package[p].cores.Num();
    HWY_ASSERT(package_sizes[p].num_cores != 0);
  }
  return package_sizes;
}

// Expands a sysfs cpulist such as "0-3,8,10-11" into a vector of LP indices.
// `list_end` is the buffer length; LPs are verified to be <= `max_lp`.
std::vector<size_t> ExpandList(const char* list, size_t list_end,
                               size_t max_lp) {
  std::vector<size_t> expanded;
  constexpr size_t kNotFound = ~size_t{0};
  size_t pos = 0;

  // Gracefully handle empty lists, happens on GH200 systems (#2668).
  // NOTE(review): `isspace` on a plain char is UB for negative values;
  // consider casting to unsigned char — confirm inputs are ASCII.
  if (isspace(list[0]) && list_end <= 2) return expanded;

  // Returns first `found_pos >= pos` where `list[found_pos] == c`, or
  // `kNotFound`.
  const auto find = [list, list_end, &pos](char c) -> size_t {
    const char* found_ptr = strchr(list + pos, c);
    if (found_ptr == nullptr) return kNotFound;
    const size_t found_pos = static_cast<size_t>(found_ptr - list);
    HWY_ASSERT(found_pos < list_end && list[found_pos] == c);
    return found_pos;
  };

  // Reads LP number and advances `pos`. `end` is for verifying we did not
  // read past a known terminator, or the end of string.
  const auto parse_lp = [list, list_end, &pos, max_lp](size_t end) -> size_t {
    end = HWY_MIN(end, list_end);
    size_t lp;
    HWY_ASSERT(ParseDigits(list, end, pos, &lp));
    HWY_IF_CONSTEXPR(HWY_ARCH_RISCV) {
      // On RISC-V, both TotalLogicalProcessors and GetThreadAffinity may
      // under-report the count, hence clamp.
      lp = HWY_MIN(lp, max_lp);
    }
    HWY_ASSERT(lp <= max_lp);
    HWY_ASSERT(pos <= end);
    return lp;
  };

  // Parse all [first-]last separated by commas.
  for (;;) {
    // Single number or first of range: ends with dash, comma, or end.
    const size_t lp_range_first = parse_lp(HWY_MIN(find('-'), find(',')));

    if (list[pos] == '-') {  // range
      ++pos;  // skip dash
      // Last of range ends with comma or end.
      const size_t lp_range_last = parse_lp(find(','));

      expanded.reserve(expanded.size() + lp_range_last - lp_range_first + 1);
      for (size_t lp = lp_range_first; lp <= lp_range_last; ++lp) {
        expanded.push_back(lp);
      }
    } else {  // single number
      expanded.push_back(lp_range_first);
    }

    // Done if reached end of string.
    if (pos == list_end || list[pos] == '\0' || list[pos] == '\n') {
      break;
    }
    // Comma means at least one more term is coming.
    if (list[pos] == ',') {
      ++pos;
      continue;
    }
    HWY_ABORT("Unexpected character at %zu in %s\n", pos, list);
  }  // for pos

  return expanded;
}

// Sets LP.node for all `lps`.
void SetNodes(std::vector<Topology::LP>& lps) {
  // For each NUMA node found via sysfs:
  for (size_t node = 0;; node++) {
    // Read its cpulist so we can scatter `node` to all its `lps`.
    char buf200[200];
    const size_t bytes_read = ReadSysfs(kNode, node, buf200);
    if (bytes_read == 0) break;
    const std::vector<size_t> list =
        ExpandList(buf200, bytes_read, lps.size() - 1);
    for (size_t lp : list) {
      lps[lp].node = static_cast<uint8_t>(node);
    }
  }
}

// Fills each cluster's private (L2) and shared (L3) cache sizes in KiB by
// reading sysfs for the first LP of the cluster.
void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
  for (size_t ip = 0; ip < packages.size(); ++ip) {
    Topology::Package& p = packages[ip];
    for (size_t ic = 0; ic < p.clusters.size(); ++ic) {
      Topology::Cluster& c = p.clusters[ic];
      const size_t lp = c.lps.First();
      size_t bytes;
      if (ReadNumberWithOptionalSuffix(kL2Size, lp, &bytes)) {
        c.private_kib = bytes >> 10;
      }
      if (ReadNumberWithOptionalSuffix(kL3Size, lp, &bytes)) {
        c.shared_kib = bytes >> 10;
      }
    }
  }
}

#elif HWY_OS_WIN

// See #2734. GroupCount was added around Windows 10, but SDK docs do not
// mention the actual version required. It is known to be absent in 8.1 and
// MinGW 5.0.1, and present in the 10.0.22000.0 SDK. However, the OS must also
// know about the field. Thus we zero-initialize the reserved field, assume it
// remains zero, and return 1 if zero (old style single GroupMask), otherwise
// the number of groups. There are two such structures, but note that
// `PROCESSOR_RELATIONSHIP` already had this field.
static size_t GroupCount(const CACHE_RELATIONSHIP& cr) {
  // Added as the last u16 in the reserved area before GroupMask. We only read
  // one byte because 256*64 processor bits are plenty.
  const uint8_t* pcount =
      reinterpret_cast<const uint8_t*>(&cr.GroupMask) - sizeof(uint16_t);
  return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1);
}

// Same trick for NUMA nodes; see the comment above.
static size_t GroupCount(const NUMA_NODE_RELATIONSHIP& nn) {
  const uint8_t* pcount =
      reinterpret_cast<const uint8_t*>(&nn.GroupMask) - sizeof(uint16_t);
  return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1);
}

// Returns the maximum number of LPs per core. Also sets LP.core and LP.smt.
size_t MaxLpsPerCore(std::vector<Topology::LP>& lps) {
  size_t max_lps_per_core = 0;
  size_t core_idx = 0;
  (void)ForEachSLPI(RelationProcessorCore, [&max_lps_per_core, &core_idx,
                                            &lps](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    const size_t lps_per_core = NumBits(p.GroupCount, p.GroupMask);
    max_lps_per_core = HWY_MAX(max_lps_per_core, lps_per_core);

    size_t smt = 0;
    ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__,
               [core_idx, &smt](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].core = static_cast<uint16_t>(core_idx);
                 lps[lp].smt = static_cast<uint8_t>(smt++);
               });
    ++core_idx;
  });
  HWY_ASSERT(max_lps_per_core != 0);
  return max_lps_per_core;
}

// Interprets cluster (typically a shared L3 cache) as a "processor die". Also
// sets LP.cluster.
size_t MaxCoresPerCluster(const size_t max_lps_per_core,
                          std::vector<Topology::LP>& lps) {
  size_t max_cores_per_cluster = 0;
  size_t cluster_idx = 0;
  // Shared between `foreach_die` and `foreach_l3`.
  const auto foreach_cluster = [&](size_t num_groups,
                                   const GROUP_AFFINITY* groups) {
    const size_t lps_per_cluster = NumBits(num_groups, groups);
    // `max_lps_per_core` is an upper bound, hence round up. It is not an error
    // if there is only one core per cluster - can happen for L3.
    const size_t cores_per_cluster = DivCeil(lps_per_cluster, max_lps_per_core);
    max_cores_per_cluster = HWY_MAX(max_cores_per_cluster, cores_per_cluster);

    ForeachBit(num_groups, groups, lps, __LINE__,
               [cluster_idx](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].cluster = static_cast<uint16_t>(cluster_idx);
               });
    ++cluster_idx;
  };

  // Passes group bits to `foreach_cluster`, depending on relationship type.
  const auto foreach_die = [&foreach_cluster](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    foreach_cluster(p.GroupCount, p.GroupMask);
  };
  const auto foreach_l3 = [&foreach_cluster](const SLPI& info) {
    const CACHE_RELATIONSHIP& cr = info.Cache;
    if (cr.Type != CacheUnified && cr.Type != CacheData) return;
    if (cr.Level != 3) return;
    foreach_cluster(GroupCount(cr), cr.GroupMasks);
  };

  if (!ForEachSLPI(RelationProcessorDie, foreach_die)) {
    // Has been observed to fail; also check for shared L3 caches.
    (void)ForEachSLPI(RelationCache, foreach_l3);
  }
  if (max_cores_per_cluster == 0) {
    HWY_WARN("All clusters empty, assuming 1 core each\n");
    max_cores_per_cluster = 1;
  }
  return max_cores_per_cluster;
}

// Initializes `lps` and returns a `PackageSizes` vector (empty on failure)
// indicating the number of clusters and cores per package.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
  const size_t max_lps_per_core = MaxLpsPerCore(lps);
  const size_t max_cores_per_cluster =
      MaxCoresPerCluster(max_lps_per_core, lps);

  std::vector<PackageSizes> packages;
  size_t package_idx = 0;
  (void)ForEachSLPI(RelationProcessorPackage, [&](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    const size_t lps_per_package = NumBits(p.GroupCount, p.GroupMask);
    PackageSizes ps;  // avoid designated initializers for MSVC
    // NOTE(review): num_clusters is set to the per-cluster core count, not a
    // cluster count derived from lps_per_package — confirm this is intended.
    ps.num_clusters = max_cores_per_cluster;
    // `max_lps_per_core` is an upper bound, hence round up.
    ps.num_cores = DivCeil(lps_per_package, max_lps_per_core);
    packages.push_back(ps);

    ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__,
               [package_idx](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].package = static_cast<uint8_t>(package_idx);
               });
    ++package_idx;
  });

  return packages;
}

// Sets LP.node for all `lps`.
void SetNodes(std::vector<Topology::LP>& lps) {
  // Zero-initialize all nodes in case the below fails.
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    lps[lp].node = 0;
  }

  // We want the full NUMA nodes, but Windows Server 2022 truncates the results
  // of `RelationNumaNode` to a single 64-LP group. To get the old, unlimited
  // behavior without using the new `RelationNumaNodeEx` symbol, use the old
  // `RelationAll` and filter the SLPI we want.
  (void)ForEachSLPI(RelationAll, [&](const SLPI& info) {
    if (info.Relationship != RelationNumaNode) return;
    const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode;
    // This field was previously reserved/zero. There is at least one group.
    const size_t num_groups = HWY_MAX(1, GroupCount(nn));
    const uint8_t node = static_cast<uint8_t>(nn.NodeNumber);
    ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__,
               [node](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].node = node;
               });
  });
}

#elif HWY_OS_APPLE

// Initializes `lps` and returns a `PackageSizes` vector (empty on failure)
// indicating the number of clusters and cores per package.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
  int err;

  // Only P cores are counted; see TotalLogicalProcessors.
  size_t total_cores = 0;
  if (!Sysctl("hw.perflevel0.physicalcpu", 1, err, &total_cores)) {
    HWY_WARN("Error %d detecting total_cores, assuming one per LP\n", err);
    total_cores = lps.size();
  }

  if (lps.size() % total_cores != 0) {
    HWY_WARN("LPs %zu not a multiple of total_cores %zu\n", lps.size(),
             total_cores);
  }
  const size_t lp_per_core = DivCeil(lps.size(), total_cores);

  // Cores sharing an L2 are treated as a cluster.
  size_t cores_per_cluster = 0;
  if (!Sysctl("hw.perflevel0.cpusperl2", 1, err, &cores_per_cluster)) {
    HWY_WARN("Error %d detecting cores_per_cluster\n", err);
    cores_per_cluster = HWY_MIN(4, total_cores);
  }

  if (total_cores % cores_per_cluster != 0) {
    HWY_WARN("total_cores %zu not a multiple of cores_per_cluster %zu\n",
             total_cores, cores_per_cluster);
  }

  // Assign LPs to cores/clusters by simple division; Apple does not expose
  // a per-LP topology, so a regular layout is assumed.
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    lps[lp].package = 0;  // single package
    lps[lp].core = static_cast<uint16_t>(lp / lp_per_core);
    lps[lp].smt = static_cast<uint8_t>(lp % lp_per_core);
    lps[lp].cluster = static_cast<uint16_t>(lps[lp].core / cores_per_cluster);
  }

  PackageSizes ps;
  ps.num_clusters = DivCeil(total_cores, cores_per_cluster);
  ps.num_cores = total_cores;
  return std::vector<PackageSizes>{ps};
}

// Sets LP.node for all `lps`.
void SetNodes(std::vector<Topology::LP>& lps) {
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    lps[lp].node = 0;  // no NUMA
  }
}

#endif  // HWY_OS_*

#if HWY_OS_WIN || HWY_OS_APPLE

// Fills each cluster's private/shared cache sizes from `DataCaches()`.
void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
  // Assumes clusters are homogeneous. Otherwise, we would have to scan
  // `RelationCache` again and find the corresponding package_idx.
  const Cache* caches = DataCaches();
  const size_t private_kib = caches ? caches[2].size_kib : 0;
  const size_t shared_kib = caches ? caches[3].size_kib : 0;

  for (size_t ip = 0; ip < packages.size(); ++ip) {
    Topology::Package& p = packages[ip];
    for (size_t ic = 0; ic < p.clusters.size(); ++ic) {
      Topology::Cluster& c = p.clusters[ic];
      c.private_kib = private_kib;
      c.shared_kib = shared_kib;
    }
  }
}

#endif  // HWY_OS_WIN || HWY_OS_APPLE

}  // namespace

// Detects packages/clusters/cores/LPs. On failure, `packages` remains empty,
// which callers use to detect that topology is unknown.
HWY_CONTRIB_DLLEXPORT Topology::Topology() {
#if HWY_OS_LINUX || HWY_OS_WIN || HWY_OS_APPLE
  lps.resize(TotalLogicalProcessors());
  // Lifetime-extended reference to the returned temporary.
  const std::vector<PackageSizes>& package_sizes = DetectPackages(lps);
  if (package_sizes.empty()) return;
  SetNodes(lps);

  // Allocate per-package/cluster/core vectors. This indicates to callers that
  // detection succeeded.
  packages.resize(package_sizes.size());
  for (size_t p = 0; p < packages.size(); ++p) {
    packages[p].clusters.resize(package_sizes[p].num_clusters);
    packages[p].cores.resize(package_sizes[p].num_cores);
  }

  // Populate the per-cluster/core sets of LP.
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    Package& p = packages[lps[lp].package];
    p.clusters[lps[lp].cluster].lps.Set(lp);
    p.cores[lps[lp].core].lps.Set(lp);
  }

  SetClusterCacheSizes(packages);
#endif  // HWY_OS_*
}

// ------------------------------ Cache detection

namespace {

// Index is the cache level (1-3); index 0 is unused.
using Caches = std::array<Cache, 4>;

// We assume homogeneous caches across all clusters because some OS APIs return
// a single value for a class of CPUs.

#if HWY_OS_LINUX
// Reads the sysfs cache attribute `name` for cpu0's cache `index`, with
// trailing newline/null stripped. Returns an empty string on failure.
std::string ReadString(const char* name, size_t index) {
  // First CPU is usually a P core.
  const std::string path("/sys/devices/system/cpu/cpu0/cache/index%zu/");
  char buf200[200];
  size_t end = ReadSysfs((path + name).c_str(), index, buf200);
  // Remove trailing newline/null to simplify string comparison.
  for (; end != 0; --end) {
    if (buf200[end - 1] != '\0' && buf200[end - 1] != '\n') break;
  }
  return std::string(buf200, buf200 + end);
}

// Reads the numeric sysfs cache attribute `name` into `out`.
// NOTE(review): despite the name, this reads from sysfs ("writes" to `out`).
template <typename T>
bool WriteSysfs(const char* name, size_t index, T* out) {
  const std::string str = ReadString(name, index);
  // Do not call `ParseNumberWithOptionalSuffix` because it acts on the
  // K suffix in "size", but we actually want KiB.
  size_t pos = 0;
  size_t val;
  if (!ParseDigits(str.c_str(), str.length(), pos, &val)) return false;
  HWY_ASSERT(pos <= str.length());
  *out = static_cast<T>(val);
  return true;
}

// Reading from sysfs is preferred because sysconf returns L3 associativity = 0
// on some CPUs, and does not indicate sharing across cores.
// https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu
bool InitCachesSysfs(Caches& caches) {
  // For computing shared cache sizes.
  std::vector<hwy::Topology::LP> lps(TotalLogicalProcessors());
  const std::vector<PackageSizes> package_sizes = DetectPackages(lps);
  // `package_sizes` is only used to check that `lps` were filled.
  if (package_sizes.empty()) {
    HWY_WARN("no packages, shared cache sizes may be incorrect\n");
    return false;
  }

  // Scan all cache index entries until one is missing.
  for (size_t i = 0;; ++i) {
    const std::string type = ReadString("type", i);
    if (type.empty()) break;  // done, no more entries
    if (type != "Data" && type != "Unified") continue;
    uint32_t level;
    if (!WriteSysfs("level", i, &level)) continue;
    if (level != 1 && level != 2 && level != 3) continue;
    Cache& c = caches[level];

    // Check before overwriting any fields.
    if (c.size_kib != 0) {
      HWY_WARN("ignoring another L%u, first size %u\n", level, c.size_kib);
      continue;
    }

    const bool ok = WriteSysfs("size", i, &c.size_kib) &&
                    WriteSysfs("ways_of_associativity", i, &c.associativity) &&
                    WriteSysfs("number_of_sets", i, &c.sets);
    if (HWY_UNLIKELY(!ok)) {
      HWY_WARN("skipping partially-detected L%u, error %d\n", level, errno);
      c = Cache();
      continue;
    }

    // Compute line size *before* adjusting the size for sharing. Note that
    // `coherency_line_size` exists, but we are not sure that is the line size.
    const size_t bytes = static_cast<size_t>(c.size_kib) * 1024;
    const size_t lines = c.associativity * c.sets;
    c.bytes_per_line = static_cast<uint16_t>(DivByFactor(bytes, lines));

    // Divide by number of *cores* sharing the cache.
    const std::string shared_str = ReadString("shared_cpu_list", i);
    if (HWY_UNLIKELY(shared_str.empty())) {
      HWY_WARN("no shared_cpu_list for L%u %s\n", level, type.c_str());
      c.cores_sharing = 1;
    } else {
      const std::vector<size_t> shared_lps =
          ExpandList(shared_str.c_str(), shared_str.length(), lps.size() - 1);
      size_t num_cores = 0;
      for (size_t lp : shared_lps) {
        if (HWY_LIKELY(lp < lps.size())) {
          // Count only one LP per core (smt == 0).
          num_cores += lps[lp].smt == 0;
        } else {
          HWY_WARN("out of bounds lp %zu of %zu from %s\n", lp, lps.size(),
                   shared_str.c_str());
        }
      }
      if (num_cores == 0) {
        HWY_WARN("no cores sharing L%u %s, setting to 1\n", level,
                 type.c_str());
        num_cores = 1;
      }
      c.cores_sharing = static_cast<uint16_t>(num_cores);
      // There exist CPUs for which L3 is not evenly divisible by `num_cores`,
      // hence do not use `DivByFactor`. It is safer to round down.
      c.size_kib = static_cast<uint32_t>(c.size_kib / num_cores);
      c.sets = static_cast<uint32_t>(c.sets / num_cores);
    }
  }

  // Require L1 and L2 cache.
  if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) {
    // Don't complain on Android because this is known to happen there. We are
    // unaware of good alternatives: `getauxval(AT_L1D_CACHEGEOMETRY)` and
    // `sysconf(_SC_LEVEL1_DCACHE_SIZE)` are unreliable, detecting via timing seems
    // difficult to do reliably, and we do not want to maintain lists of known CPUs
    // and their properties. It's OK to return false; callers are responsible for
    // assuming reasonable defaults.
#ifndef __ANDROID__
    HWY_WARN("sysfs detected L1=%u L2=%u, err %d\n", caches[1].size_kib,
             caches[2].size_kib, errno);
#endif
    return false;
  }

  // L3 is optional; if not found, its size is already zero from static init.
983 const std::string shared_str = ReadString("shared_cpu_list", i); 984 if (HWY_UNLIKELY(shared_str.empty())) { 985 HWY_WARN("no shared_cpu_list for L%u %s\n", level, type.c_str()); 986 c.cores_sharing = 1; 987 } else { 988 const std::vector<size_t> shared_lps = 989 ExpandList(shared_str.c_str(), shared_str.length(), lps.size() - 1); 990 size_t num_cores = 0; 991 for (size_t lp : shared_lps) { 992 if (HWY_LIKELY(lp < lps.size())) { 993 num_cores += lps[lp].smt == 0; 994 } else { 995 HWY_WARN("out of bounds lp %zu of %zu from %s\n", lp, lps.size(), 996 shared_str.c_str()); 997 } 998 } 999 if (num_cores == 0) { 1000 HWY_WARN("no cores sharing L%u %s, setting to 1\n", level, 1001 type.c_str()); 1002 num_cores = 1; 1003 } 1004 c.cores_sharing = static_cast<uint16_t>(num_cores); 1005 // There exist CPUs for which L3 is not evenly divisible by `num_cores`, 1006 // hence do not use `DivByFactor`. It is safer to round down. 1007 c.size_kib = static_cast<uint32_t>(c.size_kib / num_cores); 1008 c.sets = static_cast<uint32_t>(c.sets / num_cores); 1009 } 1010 } 1011 1012 // Require L1 and L2 cache. 1013 if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) { 1014 // Don't complain on Android because this is known to happen there. We are 1015 // unaware of good alternatives: `getauxval(AT_L1D_CACHEGEOMETRY)` and 1016 // `sysconf(_SC_LEVEL1_DCACHE_SIZE)` are unreliable, detecting via timing seems 1017 // difficult to do reliably, and we do not want to maintain lists of known CPUs 1018 // and their properties. It's OK to return false; callers are responsible for 1019 // assuming reasonable defaults. 1020 #ifndef __ANDROID__ 1021 HWY_WARN("sysfs detected L1=%u L2=%u, err %d\n", caches[1].size_kib, 1022 caches[2].size_kib, errno); 1023 #endif 1024 return false; 1025 } 1026 1027 // L3 is optional; if not found, its size is already zero from static init. 
1028 return true; 1029 } 1030 1031 #elif HWY_OS_WIN 1032 1033 bool InitCachesWin(Caches& caches) { 1034 std::vector<hwy::Topology::LP> lps(TotalLogicalProcessors()); 1035 const size_t max_lps_per_core = MaxLpsPerCore(lps); 1036 1037 (void)ForEachSLPI(RelationCache, [max_lps_per_core, 1038 &caches](const SLPI& info) { 1039 const CACHE_RELATIONSHIP& cr = info.Cache; 1040 if (cr.Type != CacheUnified && cr.Type != CacheData) return; 1041 if (1 <= cr.Level && cr.Level <= 3) { 1042 Cache& c = caches[cr.Level]; 1043 // If the size is non-zero then we (probably) have already detected this 1044 // cache and can skip the CR. 1045 if (c.size_kib > 0) return; 1046 c.size_kib = static_cast<uint32_t>(DivByFactor(cr.CacheSize, 1024)); 1047 c.bytes_per_line = static_cast<uint16_t>(cr.LineSize); 1048 c.associativity = (cr.Associativity == CACHE_FULLY_ASSOCIATIVE) 1049 ? Cache::kMaxAssociativity 1050 : cr.Associativity; 1051 1052 // How many cores share this cache? 1053 size_t shared_with = NumBits(GroupCount(cr), cr.GroupMasks); 1054 // Divide out hyperthreads. This core may have fewer than 1055 // `max_lps_per_core`, hence round up. 1056 shared_with = DivCeil(shared_with, max_lps_per_core); 1057 if (shared_with == 0) { 1058 HWY_WARN("no cores sharing L%u, setting to 1\n", cr.Level); 1059 shared_with = 1; 1060 } 1061 1062 // Update `size_kib` to *per-core* portion. 1063 // There exist CPUs for which L3 is not evenly divisible by `shared_with`, 1064 // hence do not use `DivByFactor`. It is safer to round down. 1065 c.size_kib = static_cast<uint32_t>(c.size_kib / shared_with); 1066 c.cores_sharing = static_cast<uint16_t>(shared_with); 1067 } 1068 }); 1069 1070 // Require L1 and L2 cache. 
1071 if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) { 1072 HWY_WARN("Windows detected L1=%u, L2=%u, err %lx\n", caches[1].size_kib, 1073 caches[2].size_kib, GetLastError()); 1074 return false; 1075 } 1076 1077 // L3 is optional; if not found, its size is already zero from static init. 1078 return true; 1079 } 1080 1081 #elif HWY_OS_APPLE 1082 1083 bool InitCachesApple(Caches& caches) { 1084 int err = 0; 1085 Cache& L1 = caches[1]; 1086 Cache& L2 = caches[2]; 1087 Cache& L3 = caches[3]; 1088 1089 // Total L1 and L2 size can be reliably queried, but prefer perflevel0 1090 // (P-cores) because hw.l1dcachesize etc. are documented to describe the 1091 // "least performant core". 1092 bool ok = Sysctl("hw.perflevel0.l1dcachesize", 1024, err, &L1.size_kib) || 1093 Sysctl("hw.l1dcachesize", 1024, err, &L1.size_kib); 1094 ok &= Sysctl("hw.perflevel0.l2cachesize", 1024, err, &L2.size_kib) || 1095 Sysctl("hw.l2cachesize", 1024, err, &L2.size_kib); 1096 if (HWY_UNLIKELY(!ok)) { 1097 HWY_WARN("Apple cache detection failed, error %d\n", err); 1098 return false; 1099 } 1100 L1.cores_sharing = 1; 1101 if (Sysctl("hw.perflevel0.cpusperl2", 1, err, &L2.cores_sharing)) { 1102 // There exist CPUs for which L2 is not evenly divisible by `cores_sharing`, 1103 // hence do not use `DivByFactor`. It is safer to round down. 1104 L2.size_kib /= L2.cores_sharing; 1105 } else { 1106 L2.cores_sharing = 1; 1107 } 1108 1109 // Other properties are not always reported. Set `associativity` and 1110 // `bytes_per_line` based on known models. 1111 char brand[128] = {0}; 1112 size_t size = sizeof(brand); 1113 if (!sysctlbyname("machdep.cpu.brand_string", brand, &size, nullptr, 0)) { 1114 if (strncmp(brand, "Apple ", 6) != 0) { 1115 // Unexpected, but we will continue check the string suffixes. 
1116 HWY_WARN("unexpected Apple brand %s\n", brand); 1117 } 1118 1119 if (brand[6] == 'M') { 1120 // https://dougallj.github.io/applecpu/firestorm.html, 1121 // https://www.7-cpu.com/cpu/Apple_M1.html: 1122 L1.bytes_per_line = 64; 1123 L1.associativity = 8; 1124 L2.bytes_per_line = 128; 1125 if (brand[7] == '1') { // M1 1126 L2.associativity = 12; 1127 } else if ('2' <= brand[7] && brand[7] <= '4') { // M2/M3, maybe also M4 1128 L2.associativity = 16; 1129 } else { 1130 L2.associativity = 0; // Unknown, set below via sysctl. 1131 } 1132 1133 // Although Wikipedia lists SLC sizes per model, we do not know how it is 1134 // partitioned/allocated, so do not treat it as a reliable L3. 1135 } // M* 1136 } // brand string 1137 1138 // This sysctl does not distinguish between L1 and L2 line sizes, so only use 1139 // it if we have not already set `bytes_per_line` above. 1140 uint16_t bytes_per_line; 1141 if (!Sysctl("hw.cachelinesize", 1, err, &bytes_per_line)) { 1142 bytes_per_line = static_cast<uint16_t>(HWY_ALIGNMENT); // guess 1143 } 1144 for (size_t level = 1; level <= 3; ++level) { 1145 if (caches[level].bytes_per_line == 0) { 1146 caches[level].bytes_per_line = bytes_per_line; 1147 } 1148 } 1149 1150 // Fill in associativity if not already set. Unfortunately this is only 1151 // reported on x86, not on M*. 1152 if (L1.associativity == 0 && !Sysctl("machdep.cpu.cache.L1_associativity", 1, 1153 err, &L1.associativity)) { 1154 L1.associativity = 8; // guess 1155 } 1156 if (L2.associativity == 0 && !Sysctl("machdep.cpu.cache.L2_associativity", 1, 1157 err, &L2.associativity)) { 1158 L2.associativity = 12; // guess 1159 } 1160 // There is no L3_associativity. 1161 if (L3.associativity == 0) { 1162 L3.associativity = 12; // guess 1163 } 1164 1165 // Now attempt to query L3. Although this sysctl is documented, M3 does not 1166 // report an L3 cache. 
1167 if (L3.size_kib == 0 && 1168 (Sysctl("hw.perflevel0.l3cachesize", 1024, err, &L3.size_kib) || 1169 Sysctl("hw.l3cachesize", 1024, err, &L3.size_kib))) { 1170 // There exist CPUs for which L3 is not evenly divisible by `cores_sharing`, 1171 // hence do not use `DivByFactor`. It is safer to round down. 1172 if (Sysctl("hw.perflevel0.cpusperl3", 1, err, &L3.cores_sharing)) { 1173 L3.size_kib /= L3.cores_sharing; 1174 } else { 1175 L3.cores_sharing = 1; 1176 } 1177 } 1178 // If no L3 cache, reset all fields for consistency. 1179 if (L3.size_kib == 0) { 1180 L3 = Cache(); 1181 } 1182 1183 // Are there other useful sysctls? hw.cacheconfig appears to be how many 1184 // cores share the memory and caches, though this is not documented, and 1185 // duplicates information in hw.perflevel0.cpusperl*. 1186 1187 return true; 1188 } 1189 1190 #endif // HWY_OS_* 1191 1192 // Most APIs do not set the `sets` field, so compute it from the size and 1193 // associativity, and if a value is already set, ensure it matches. 1194 HWY_MAYBE_UNUSED void ComputeSets(Cache& c) { 1195 // If there is no such cache, avoid division by zero. 1196 if (HWY_UNLIKELY(c.size_kib == 0)) { 1197 c.sets = 0; 1198 return; 1199 } 1200 const size_t bytes = static_cast<size_t>(c.size_kib) * 1024; 1201 // `size_kib` may have been rounded down, hence `lines` and `sets` are not 1202 // necessarily evenly divisible, so round down instead of `DivByFactor`. 1203 const size_t lines = bytes / c.bytes_per_line; 1204 const size_t sets = lines / c.associativity; 1205 1206 if (c.sets == 0) { 1207 c.sets = static_cast<uint32_t>(sets); 1208 } else { 1209 const size_t diff = c.sets - sets; 1210 if (diff > 1) { 1211 HWY_ABORT("Inconsistent cache sets %u != %zu\n", c.sets, sets); 1212 } 1213 } 1214 } 1215 1216 const Cache* InitDataCaches() { 1217 alignas(64) static Caches caches; 1218 1219 // On failure, return immediately because InitCaches*() already warn. 
1220 #if HWY_OS_LINUX 1221 if (HWY_UNLIKELY(!InitCachesSysfs(caches))) return nullptr; 1222 #elif HWY_OS_WIN 1223 if (HWY_UNLIKELY(!InitCachesWin(caches))) return nullptr; 1224 #elif HWY_OS_APPLE 1225 if (HWY_UNLIKELY(!InitCachesApple(caches))) return nullptr; 1226 #else 1227 HWY_WARN("Cache detection not implemented for this platform.\n"); 1228 (void)caches; 1229 return nullptr; 1230 #define HWY_NO_CACHE_DETECTION 1231 #endif 1232 1233 // Prevents "code not reached" warnings on WASM. 1234 #ifndef HWY_NO_CACHE_DETECTION 1235 for (size_t level = 1; level <= 3; ++level) { 1236 ComputeSets(caches[level]); 1237 } 1238 1239 // Heuristic to ignore SLCs such as on Ampere Altra, which should not be 1240 // treated as a reliable L3 because of their cache inclusion policy. 1241 // On Apple M*, these are not even reported as an L3. 1242 if (caches[3].cores_sharing >= 16 && caches[3].size_kib <= 512) { 1243 caches[3] = Cache(); 1244 } 1245 1246 return &caches[0]; 1247 #endif // HWY_NO_CACHE_DETECTION 1248 } 1249 1250 } // namespace 1251 1252 HWY_CONTRIB_DLLEXPORT const Cache* DataCaches() { 1253 static const Cache* caches = InitDataCaches(); 1254 return caches; 1255 } 1256 1257 } // namespace hwy