tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

topology.cc (42877B)


      1 // Copyright 2024 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #include "hwy/contrib/thread_pool/topology.h"
     17 
     18 #include <ctype.h>  // isspace
     19 #include <stddef.h>
     20 #include <stdint.h>
     21 #include <stdio.h>
     22 #include <string.h>  // strchr
     23 
     24 #include <array>
     25 #include <map>
     26 #include <string>
     27 #include <vector>
     28 
     29 #include "hwy/base.h"  // HWY_OS_WIN, HWY_WARN
     30 
     31 #if HWY_OS_APPLE
     32 #include <sys/sysctl.h>
     33 
     34 #include "hwy/aligned_allocator.h"  // HWY_ALIGNMENT
     35 #endif
     36 
     37 #if HWY_OS_WIN
     38 #ifndef NOMINMAX
     39 #define NOMINMAX
     40 #endif
     41 #ifndef WIN32_LEAN_AND_MEAN
     42 #define WIN32_LEAN_AND_MEAN
     43 #endif
     44 #ifndef _WIN32_WINNT
     45 #define _WIN32_WINNT 0x0601  // Windows 7 / Server 2008
     46 #endif
     47 #include <windows.h>
     48 #endif  // HWY_OS_WIN
     49 
     50 #if HWY_OS_LINUX || HWY_OS_FREEBSD
     51 #ifndef _GNU_SOURCE
     52 #define _GNU_SOURCE
     53 #endif
     54 #include <errno.h>
     55 #include <fcntl.h>
     56 #include <pthread.h>
     57 #include <sched.h>
     58 #include <sys/stat.h>
     59 #include <sys/syscall.h>
     60 #include <sys/types.h>
     61 #include <unistd.h>  // sysconf
     62 #endif  // HWY_OS_LINUX || HWY_OS_FREEBSD
     63 
     64 #if HWY_OS_FREEBSD
     65 #include <sys/param.h>
     66 // After param.h / types.h.
     67 #include <sys/cpuset.h>
     68 #endif  // HWY_OS_FREEBSD
     69 
     70 #if HWY_ARCH_WASM
     71 #include <emscripten/threading.h>
     72 #endif
     73 
     74 namespace hwy {
     75 
// Returns whether this build/runtime is able to run multiple threads. Only
// WASM can lack support; all other platforms unconditionally report true.
HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport() {
#if HWY_ARCH_WASM
  // Emscripten: depends on the host (e.g. SharedArrayBuffer availability).
  return emscripten_has_threading_support() != 0;
#else
  return true;
#endif
}
     83 
     84 namespace {
     85 
     86 // Returns `whole / part`, with a check that `part` evenly divides `whole`,
     87 // which implies the result is exact.
     88 HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) {
     89  HWY_ASSERT(part != 0);
     90  const size_t div = whole / part;
     91  const size_t mul = div * part;
     92  if (mul != whole) {
     93    HWY_ABORT("%zu / %zu = %zu; *%zu = %zu\n", whole, part, div, part, mul);
     94  }
     95  return div;
     96 }
     97 
     98 #if HWY_OS_WIN
     99 
// Shorthand for the variable-length structures returned by
// `GetLogicalProcessorInformationEx`.
using SLPI = SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX;

// Calls `func(slpi)` for each structure matching relationship `rel`.
// Returns false if the relationship is unsupported or the query failed.
template <class Func>
bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) {
  // Get required buffer size. This first call is expected to "fail" with
  // ERROR_INSUFFICIENT_BUFFER while reporting the size via `buf_bytes`.
  DWORD buf_bytes = 0;
  HWY_ASSERT(!GetLogicalProcessorInformationEx(rel, nullptr, &buf_bytes));
  // Observed when `rel` is not supported:
  if (HWY_UNLIKELY(buf_bytes == 0 && GetLastError() == ERROR_GEN_FAILURE)) {
    // NumaNodeEx/ProcessorDie are known to be unsupported on some systems;
    // only warn about other relationships.
    if (rel != RelationNumaNodeEx && rel != RelationProcessorDie) {
      HWY_WARN("Unexpected err %lx for GLPI relationship %d\n", GetLastError(),
               static_cast<int>(rel));
    }
    return false;
  }
  HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER);
  // Note: `buf_bytes` may be less than `sizeof(SLPI)`, which has padding.
  // `calloc` zero-initializes the `Reserved` field, part of which has been
  // repurposed into `GroupCount` in SDKs, 10.0.22000.0 or possibly earlier.
  uint8_t* buf = static_cast<uint8_t*>(calloc(1, buf_bytes));
  HWY_ASSERT(buf);

  // Fill the buffer.
  SLPI* info = reinterpret_cast<SLPI*>(buf);
  if (HWY_UNLIKELY(!GetLogicalProcessorInformationEx(rel, info, &buf_bytes))) {
    free(buf);
    return false;
  }

  // Iterate over each SLPI. `sizeof(SLPI)` is unreliable, see above; each
  // structure instead reports its own actual size via `info->Size`.
  uint8_t* pos = buf;
  while (pos < buf + buf_bytes) {
    info = reinterpret_cast<SLPI*>(pos);
    HWY_ASSERT(rel == RelationAll || info->Relationship == rel);
    func(*info);
    pos += info->Size;
  }
  // `pos` beyond the end means the last `Size` was inconsistent with the
  // total; warn but still return the (possibly partial) results.
  if (pos != buf + buf_bytes) {
    HWY_WARN("unexpected pos %p, end %p, buf_bytes %lu, sizeof(SLPI) %zu\n",
             pos, buf + buf_bytes, buf_bytes, sizeof(SLPI));
  }

  free(buf);
  return true;
}
    145 
    146 size_t NumBits(size_t num_groups, const GROUP_AFFINITY* affinity) {
    147  size_t total_bits = 0;
    148  for (size_t i = 0; i < num_groups; ++i) {
    149    size_t bits = 0;
    150    hwy::CopyBytes<sizeof(bits)>(&affinity[i].Mask, &bits);
    151    total_bits += hwy::PopCount(bits);
    152  }
    153  return total_bits;
    154 }
    155 
// Calls `func(lp, lps)` for each index `lp` in the set, after ensuring that
// `lp < lps.size()`. `line` is for debugging via Warn().
// NOTE(review): `lp` is the bit index within a single group's 64-bit mask;
// for systems with multiple processor groups this does not add a per-group
// offset — confirm whether callers only run on single-group systems.
template <class Func>
void ForeachBit(size_t num_groups, const GROUP_AFFINITY* affinity,
                std::vector<Topology::LP>& lps, int line, const Func& func) {
  for (size_t group = 0; group < num_groups; ++group) {
    size_t bits = 0;
    hwy::CopyBytes<sizeof(bits)>(&affinity[group].Mask, &bits);
    // Iterate over set bits from least to most significant.
    while (bits != 0) {
      size_t lp = Num0BitsBelowLS1Bit_Nonzero64(bits);
      bits &= bits - 1;  // clear LSB
      // Defensive: OS may report more bits than we allocated `lps` for.
      if (HWY_UNLIKELY(lp >= lps.size())) {
        Warn(__FILE__, line, "Clamping lp %zu to lps.size() %zu, groups %zu\n",
             lp, lps.size(), num_groups);
        lp = lps.size() - 1;
      }
      func(lp, lps);
    }
  }
}
    176 
    177 #elif HWY_OS_APPLE
    178 
    179 // Returns whether sysctlbyname() succeeded; if so, writes `val / div` to
    180 // `out`, otherwise sets `err`.
    181 template <typename T>
    182 bool Sysctl(const char* name, size_t div, int& err, T* out) {
    183  size_t val = 0;
    184  size_t size = sizeof(val);
    185  // Last two arguments are for updating the value, which we do not want.
    186  const int ret = sysctlbyname(name, &val, &size, nullptr, 0);
    187  if (HWY_UNLIKELY(ret != 0)) {
    188    // Do not print warnings because some `name` are expected to fail.
    189    err = ret;
    190    return false;
    191  }
    192  *out = static_cast<T>(DivByFactor(val, div));
    193  return true;
    194 }
    195 
    196 #endif  // HWY_OS_*
    197 
    198 }  // namespace
    199 
    200 HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() {
    201  size_t total_lps = 0;
    202 #if HWY_ARCH_WASM
    203  const int num_cores = emscripten_num_logical_cores();
    204  if (num_cores > 0) total_lps = static_cast<size_t>(num_cores);
    205 #elif HWY_OS_WIN
    206  // If there are multiple groups, this should return them all, rather than
    207  // just the first 64, but VMs report less.
    208  (void)ForEachSLPI(RelationProcessorCore, [&total_lps](const SLPI& info) {
    209    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    210    total_lps += NumBits(p.GroupCount, p.GroupMask);
    211  });
    212 #elif HWY_OS_LINUX
    213  // Only check "online" because sysfs entries such as topology are missing for
    214  // offline CPUs, which will cause `DetectPackages` to fail.
    215  const long ret = sysconf(_SC_NPROCESSORS_ONLN);  // NOLINT(runtime/int)
    216  if (ret < 0) {
    217    HWY_WARN("Unexpected _SC_NPROCESSORS_CONF = %d\n", static_cast<int>(ret));
    218  } else {
    219    total_lps = static_cast<size_t>(ret);
    220  }
    221 #elif HWY_OS_APPLE
    222  int err;
    223  // Only report P processors.
    224  if (!Sysctl("hw.perflevel0.logicalcpu", 1, err, &total_lps)) {
    225    total_lps = 0;
    226  }
    227 #endif
    228 
    229  if (HWY_UNLIKELY(total_lps == 0)) {  // Failed to detect.
    230    HWY_WARN(
    231        "Unknown TotalLogicalProcessors, assuming 1. "
    232        "HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n"
    233        "HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n",
    234        HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, HWY_ARCH_X86,
    235        HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, HWY_ARCH_S390X);
    236    return 1;
    237  }
    238 
    239  // Warn that we are clamping.
    240  if (HWY_UNLIKELY(total_lps > kMaxLogicalProcessors)) {
    241    HWY_WARN("OS reports %zu processors but clamping to %zu\n", total_lps,
    242             kMaxLogicalProcessors);
    243    total_lps = kMaxLogicalProcessors;
    244  }
    245 
    246  return total_lps;
    247 }
    248 
    249 // ------------------------------ Affinity
    250 
    251 #if HWY_OS_LINUX || HWY_OS_FREEBSD
    252 namespace {
    253 
// Linux and FreeBSD spell the CPU-set type differently; unify the name.
#if HWY_OS_LINUX
using CpuSet = cpu_set_t;
#else
using CpuSet = cpuset_t;
#endif

// Helper functions reduce the number of #if in GetThreadAffinity.
// Returns 0 on success, as with the underlying syscalls.
int GetAffinity(CpuSet* set) {
  // To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD.
#if defined(__ANDROID__) && __ANDROID_API__ < 12
  // Old Android lacks the libc wrapper, hence the raw syscall.
  return syscall(__NR_sched_getaffinity, 0, sizeof(CpuSet), set);
#elif HWY_OS_FREEBSD
  return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet),
                            set);
#else  // normal Linux
  return sched_getaffinity(0, sizeof(CpuSet), set);
#endif
}
    272 
// Counterpart of `GetAffinity`; returns 0 on success.
int SetAffinity(CpuSet* set) {
  // To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD.
#if defined(__ANDROID__) && __ANDROID_API__ < 12
  // Old Android lacks the libc wrapper, hence the raw syscall.
  return syscall(__NR_sched_setaffinity, 0, sizeof(CpuSet), set);
#elif HWY_OS_FREEBSD
  return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet),
                            set);
#else  // normal Linux
  return sched_setaffinity(0, sizeof(CpuSet), set);
#endif
}
    284 
// Returns whether logical processor `lp` is a member of `set`.
bool IsSet(size_t lp, const CpuSet* set) {
#if HWY_COMPILER_GCC_ACTUAL
  // Workaround for GCC compiler warning with CPU_ISSET macro
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
#endif
  const int is_set = CPU_ISSET(static_cast<int>(lp), set);
#if HWY_COMPILER_GCC_ACTUAL
  HWY_DIAGNOSTICS(pop)
#endif
  return is_set != 0;
}
    297 
// Adds logical processor `lp` to `set`.
void Set(size_t lp, CpuSet* set) {
#if HWY_COMPILER_GCC_ACTUAL
  // Workaround for GCC compiler warning with CPU_SET macro
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
#endif
  CPU_SET(static_cast<int>(lp), set);
#if HWY_COMPILER_GCC_ACTUAL
  HWY_DIAGNOSTICS(pop)
#endif
}
    309 
    310 }  // namespace
    311 #endif  // HWY_OS_LINUX || HWY_OS_FREEBSD
    312 
// Writes the calling thread's current affinity into `lps`. Returns false if
// affinity is unsupported on this OS or the query failed.
HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps) {
#if HWY_OS_WIN
  // Only support the first 64 because WINE does not support processor groups.
  const HANDLE hThread = GetCurrentThread();
  // There is no Get-counterpart; read the mask by setting all bits (which
  // returns the previous mask) and then restoring it.
  const DWORD_PTR prev = SetThreadAffinityMask(hThread, ~DWORD_PTR(0));
  if (!prev) return false;
  (void)SetThreadAffinityMask(hThread, prev);
  lps = LogicalProcessorSet();  // clear all
  lps.SetNonzeroBitsFrom64(prev);
  return true;
#elif HWY_OS_LINUX || HWY_OS_FREEBSD
  CpuSet set;
  CPU_ZERO(&set);
  const int err = GetAffinity(&set);
  if (err != 0) return false;
  // Scatter set membership into our own bitset representation.
  for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) {
    if (IsSet(lp, &set)) lps.Set(lp);
  }
  return true;
#else
  // For HWY_OS_APPLE, affinity is not supported. Do not even set lp=0 to force
  // callers to handle this case.
  (void)lps;
  return false;
#endif
}
    339 
// Pins the calling thread to the LPs in `lps`. Returns false if unsupported
// (e.g. Apple) or the OS call failed.
HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps) {
#if HWY_OS_WIN
  // Only the first 64 LPs are representable in a DWORD_PTR mask.
  const HANDLE hThread = GetCurrentThread();
  const DWORD_PTR prev = SetThreadAffinityMask(hThread, lps.Get64());
  return prev != 0;
#elif HWY_OS_LINUX || HWY_OS_FREEBSD
  CpuSet set;
  CPU_ZERO(&set);
  lps.Foreach([&set](size_t lp) { Set(lp, &set); });
  const int err = SetAffinity(&set);
  if (err != 0) return false;
  return true;
#else
  // Apple THREAD_AFFINITY_POLICY is only an (often ignored) hint.
  (void)lps;
  return false;
#endif
}
    358 
    359 namespace {
    360 
// Per-package counts returned by `DetectPackages`.
struct PackageSizes {
  size_t num_clusters;  // clusters (groups sharing an L3) in this package
  size_t num_cores;     // physical cores in this package
};
    365 
    366 #if HWY_OS_LINUX
    367 
    368 class File {
    369 public:
    370  explicit File(const char* path) {
    371    for (;;) {
    372      fd_ = open(path, O_RDONLY);
    373      if (fd_ > 0) return;           // success
    374      if (errno == EINTR) continue;  // signal: retry
    375      if (errno == ENOENT) return;   // not found, give up
    376      HWY_WARN("Unexpected error opening %s: %d\n", path, errno);
    377      return;  // unknown error, give up
    378    }
    379  }
    380 
    381  ~File() {
    382    if (fd_ > 0) {
    383      for (;;) {
    384        const int ret = close(fd_);
    385        if (ret == 0) break;           // success
    386        if (errno == EINTR) continue;  // signal: retry
    387        HWY_WARN("Unexpected error closing file: %d\n", errno);
    388        return;  // unknown error, ignore
    389      }
    390    }
    391  }
    392 
    393  // Returns number of bytes read or 0 on failure.
    394  size_t Read(char* buf200) const {
    395    if (fd_ < 0) return 0;
    396    size_t pos = 0;
    397    for (;;) {
    398      // read instead of `pread`, which might not work for sysfs.
    399      const auto bytes_read = read(fd_, buf200 + pos, 200 - pos);
    400      if (bytes_read == 0) {  // EOF: done
    401        buf200[pos++] = '\0';
    402        return pos;
    403      }
    404      if (bytes_read == -1) {
    405        if (errno == EINTR) continue;  // signal: retry
    406        HWY_WARN("Unexpected error reading file: %d\n", errno);
    407        return 0;
    408      }
    409      pos += static_cast<size_t>(bytes_read);
    410      HWY_ASSERT(pos <= 200);
    411    }
    412  }
    413 
    414 private:
    415  int fd_;
    416 };
    417 
    418 // Returns bytes read, or 0 on failure.
    419 size_t ReadSysfs(const char* format, size_t lp, char* buf200) {
    420  char path[200];
    421  const int bytes_written = snprintf(path, sizeof(path), format, lp);
    422  HWY_ASSERT(0 < bytes_written &&
    423             bytes_written < static_cast<int>(sizeof(path) - 1));
    424 
    425  const File file(path);
    426  return file.Read(buf200);
    427 }
    428 
    429 // Interprets [str + pos, str + end) as base-10 ASCII. Stops when any non-digit
    430 // is found, or at end. Returns false if no digits found.
    431 bool ParseDigits(const char* str, const size_t end, size_t& pos, size_t* out) {
    432  HWY_ASSERT(pos <= end);
    433  // 9 digits cannot overflow even 32-bit size_t.
    434  const size_t stop = pos + 9;
    435  *out = 0;
    436  for (; pos < HWY_MIN(end, stop); ++pos) {
    437    const int c = str[pos];
    438    if (c < '0' || c > '9') break;
    439    *out *= 10;
    440    *out += static_cast<size_t>(c - '0');
    441  }
    442  if (pos == 0) {  // No digits found
    443    *out = 0;
    444    return false;
    445  }
    446  return true;
    447 }
    448 
    449 // Number, plus optional K or M suffix, plus terminator.
    450 bool ParseNumberWithOptionalSuffix(const char* str, size_t len, size_t* out) {
    451  size_t pos = 0;
    452  if (!ParseDigits(str, len, pos, out)) return false;
    453  if (str[pos] == 'K') {
    454    *out <<= 10;
    455    ++pos;
    456  }
    457  if (str[pos] == 'M') {
    458    *out <<= 20;
    459    ++pos;
    460  }
    461  if (str[pos] != '\0' && str[pos] != '\n') {
    462    HWY_ABORT("Expected [suffix] terminator at %zu %s\n", pos, str);
    463  }
    464  return true;
    465 }
    466 
    467 bool ReadNumberWithOptionalSuffix(const char* format, size_t lp, size_t* out) {
    468  char buf200[200];
    469  const size_t pos = ReadSysfs(format, lp, buf200);
    470  if (pos == 0) return false;
    471  return ParseNumberWithOptionalSuffix(buf200, pos, out);
    472 }
    473 
// sysfs path templates; %zu is the LP index (or node index for kNode).
const char* kPackage =
    "/sys/devices/system/cpu/cpu%zu/topology/physical_package_id";
// The L3 cache id serves as the cluster id.
const char* kCluster = "/sys/devices/system/cpu/cpu%zu/cache/index3/id";
const char* kCore = "/sys/devices/system/cpu/cpu%zu/topology/core_id";
const char* kL2Size = "/sys/devices/system/cpu/cpu%zu/cache/index2/size";
const char* kL3Size = "/sys/devices/system/cpu/cpu%zu/cache/index3/size";
// List of LPs belonging to a NUMA node, e.g. "0-3,8".
const char* kNode = "/sys/devices/system/node/node%zu/cpulist";
    481 
    482 // sysfs values can be arbitrarily large, so store in a map and replace with
    483 // indices in order of appearance.
    484 class Remapper {
    485 public:
    486  // Returns false on error, or sets `out_index` to the index of the sysfs
    487  // value selected by `format` and `lp`.
    488  template <typename T>
    489  bool operator()(const char* format, size_t lp, T* HWY_RESTRICT out_index) {
    490    size_t opaque;
    491    if (!ReadNumberWithOptionalSuffix(format, lp, &opaque)) return false;
    492 
    493    const auto ib = indices_.insert({opaque, num_});
    494    num_ += ib.second;                      // increment if inserted
    495    const size_t index = ib.first->second;  // new or existing
    496    HWY_ASSERT(index < num_);
    497    HWY_ASSERT(index < hwy::LimitsMax<T>());
    498    *out_index = static_cast<T>(index);
    499    return true;
    500  }
    501 
    502  size_t Num() const { return num_; }
    503 
    504 private:
    505  std::map<size_t, size_t> indices_;
    506  size_t num_ = 0;
    507 };
    508 
// For internal use by `DetectPackages`.
struct PerPackage {
  Remapper clusters;  // dense cluster ids within this package
  Remapper cores;     // dense core ids within this package
  // We rely on this zero-init and increment it below.
  uint8_t smt_per_core[kMaxLogicalProcessors] = {0};
};
    516 
// Initializes `lps` and returns a PackageSizes vector (empty on failure)
// indicating the number of clusters and cores per package.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
  std::vector<PackageSizes> empty;

  // First pass: package id for every LP. Fatal on failure because all
  // subsequent per-package indexing depends on it.
  Remapper packages;
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    if (!packages(kPackage, lp, &lps[lp].package)) {
      HWY_WARN("Failed to read sysfs package for LP %zu\n", lp);
      return empty;
    }
  }
  std::vector<PerPackage> per_package(packages.Num());
  HWY_ASSERT(!per_package.empty());

  // Second pass: cluster/core/SMT ids, remapped to dense per-package indices.
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    PerPackage& pp = per_package[lps[lp].package];
    // Not a failure: some CPUs lack a (shared) L3 cache.
    if (!pp.clusters(kCluster, lp, &lps[lp].cluster)) {
      lps[lp].cluster = 0;
    }

    if (!pp.cores(kCore, lp, &lps[lp].core)) {
      HWY_WARN("Failed to read sysfs core for LP %zu\n", lp);
      return empty;
    }

    // SMT ID is how many LP we have already seen assigned to the same core.
    HWY_ASSERT(lps[lp].core < kMaxLogicalProcessors);
    lps[lp].smt = pp.smt_per_core[lps[lp].core]++;
    HWY_ASSERT(lps[lp].smt < 16);
  }

  // Summarize per-package counts for the caller.
  std::vector<PackageSizes> package_sizes(per_package.size());
  for (size_t p = 0; p < package_sizes.size(); ++p) {
    // Was zero if the package has no shared L3, see above.
    package_sizes[p].num_clusters = HWY_MAX(1, per_package[p].clusters.Num());
    package_sizes[p].num_cores = per_package[p].cores.Num();
    HWY_ASSERT(package_sizes[p].num_cores != 0);
  }
  return package_sizes;
}
    559 
// Expands a kernel cpulist such as "0-3,8,10-11" into individual LP indices.
// `list_end` is the string length including the '\0'; `max_lp` is the largest
// valid LP index (inclusive).
std::vector<size_t> ExpandList(const char* list, size_t list_end,
                               size_t max_lp) {
  std::vector<size_t> expanded;
  constexpr size_t kNotFound = ~size_t{0};
  size_t pos = 0;

  // Gracefully handle empty lists, happens on GH200 systems (#2668).
  if (isspace(list[0]) && list_end <= 2) return expanded;

  // Returns first `found_pos >= pos` where `list[found_pos] == c`, or
  // `kNotFound`.
  const auto find = [list, list_end, &pos](char c) -> size_t {
    const char* found_ptr = strchr(list + pos, c);
    if (found_ptr == nullptr) return kNotFound;
    const size_t found_pos = static_cast<size_t>(found_ptr - list);
    HWY_ASSERT(found_pos < list_end && list[found_pos] == c);
    return found_pos;
  };

  // Reads LP number and advances `pos`. `end` is for verifying we did not
  // read past a known terminator, or the end of string.
  const auto parse_lp = [list, list_end, &pos, max_lp](size_t end) -> size_t {
    end = HWY_MIN(end, list_end);  // `end` may be kNotFound, hence clamp.
    size_t lp;
    HWY_ASSERT(ParseDigits(list, end, pos, &lp));
    HWY_IF_CONSTEXPR(HWY_ARCH_RISCV) {
      // On RISC-V, both TotalLogicalProcessors and GetThreadAffinity may
      // under-report the count, hence clamp.
      lp = HWY_MIN(lp, max_lp);
    }
    HWY_ASSERT(lp <= max_lp);
    HWY_ASSERT(pos <= end);
    return lp;
  };

  // Parse all [first-]last separated by commas.
  for (;;) {
    // Single number or first of range: ends with dash, comma, or end.
    const size_t lp_range_first = parse_lp(HWY_MIN(find('-'), find(',')));

    if (list[pos] == '-') {  // range
      ++pos;                 // skip dash
      // Last of range ends with comma or end.
      const size_t lp_range_last = parse_lp(find(','));

      expanded.reserve(expanded.size() + lp_range_last - lp_range_first + 1);
      for (size_t lp = lp_range_first; lp <= lp_range_last; ++lp) {
        expanded.push_back(lp);
      }
    } else {  // single number
      expanded.push_back(lp_range_first);
    }

    // Done if reached end of string.
    if (pos == list_end || list[pos] == '\0' || list[pos] == '\n') {
      break;
    }
    // Comma means at least one more term is coming.
    if (list[pos] == ',') {
      ++pos;
      continue;
    }
    HWY_ABORT("Unexpected character at %zu in %s\n", pos, list);
  }  // for pos

  return expanded;
}
    627 
    628 // Sets LP.node for all `lps`.
    629 void SetNodes(std::vector<Topology::LP>& lps) {
    630  // For each NUMA node found via sysfs:
    631  for (size_t node = 0;; node++) {
    632    // Read its cpulist so we can scatter `node` to all its `lps`.
    633    char buf200[200];
    634    const size_t bytes_read = ReadSysfs(kNode, node, buf200);
    635    if (bytes_read == 0) break;
    636    const std::vector<size_t> list =
    637        ExpandList(buf200, bytes_read, lps.size() - 1);
    638    for (size_t lp : list) {
    639      lps[lp].node = static_cast<uint8_t>(node);
    640    }
    641  }
    642 }
    643 
    644 void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
    645  for (size_t ip = 0; ip < packages.size(); ++ip) {
    646    Topology::Package& p = packages[ip];
    647    for (size_t ic = 0; ic < p.clusters.size(); ++ic) {
    648      Topology::Cluster& c = p.clusters[ic];
    649      const size_t lp = c.lps.First();
    650      size_t bytes;
    651      if (ReadNumberWithOptionalSuffix(kL2Size, lp, &bytes)) {
    652        c.private_kib = bytes >> 10;
    653      }
    654      if (ReadNumberWithOptionalSuffix(kL3Size, lp, &bytes)) {
    655        c.shared_kib = bytes >> 10;
    656      }
    657    }
    658  }
    659 }
    660 
    661 #elif HWY_OS_WIN
    662 
// See #2734. GroupCount was added around Windows 10, but SDK docs do not
// mention the actual version required. It is known to be absent in 8.1 and
// MinGW 5.0.1, and present in the 10.0.22000.0 SDK. However, the OS must also
// know about the field. Thus we zero-initialize the reserved field, assume it
// remains zero, and return 1 if zero (old style single GroupMask), otherwise
// the number of groups. There are two such structures, but note that
// `PROCESSOR_RELATIONSHIP` already had this field.
static size_t GroupCount(const CACHE_RELATIONSHIP& cr) {
  // Added as the last u16 in the reserved area before GroupMask. We only read
  // one byte because 256*64 processor bits are plenty.
  // Index selects the low-order byte of the u16 regardless of endianness.
  const uint8_t* pcount =
      reinterpret_cast<const uint8_t*>(&cr.GroupMask) - sizeof(uint16_t);
  return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1);
}

// Same layout trick for NUMA_NODE_RELATIONSHIP; see the comment above.
static size_t GroupCount(const NUMA_NODE_RELATIONSHIP& nn) {
  const uint8_t* pcount =
      reinterpret_cast<const uint8_t*>(&nn.GroupMask) - sizeof(uint16_t);
  return HWY_MAX(pcount[HWY_IS_BIG_ENDIAN], 1);
}
    683 
// Returns the maximum number of LPs sharing one physical core (the SMT
// width). Also sets LP.core and LP.smt.
size_t MaxLpsPerCore(std::vector<Topology::LP>& lps) {
  size_t max_lps_per_core = 0;
  size_t core_idx = 0;
  (void)ForEachSLPI(RelationProcessorCore, [&max_lps_per_core, &core_idx,
                                            &lps](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    const size_t lps_per_core = NumBits(p.GroupCount, p.GroupMask);
    max_lps_per_core = HWY_MAX(max_lps_per_core, lps_per_core);

    // Assign consecutive SMT ids to the LPs of this core.
    size_t smt = 0;
    ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__,
               [core_idx, &smt](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].core = static_cast<uint16_t>(core_idx);
                 lps[lp].smt = static_cast<uint8_t>(smt++);
               });
    ++core_idx;
  });
  // At least one core must have been reported.
  HWY_ASSERT(max_lps_per_core != 0);
  return max_lps_per_core;
}
    705 
// Interprets cluster (typically a shared L3 cache) as a "processor die". Also
// sets LP.cluster. Returns the maximum number of cores per cluster (at least
// 1, even when no cluster information is available).
size_t MaxCoresPerCluster(const size_t max_lps_per_core,
                          std::vector<Topology::LP>& lps) {
  size_t max_cores_per_cluster = 0;
  size_t cluster_idx = 0;
  // Shared between `foreach_die` and `foreach_l3`.
  const auto foreach_cluster = [&](size_t num_groups,
                                   const GROUP_AFFINITY* groups) {
    const size_t lps_per_cluster = NumBits(num_groups, groups);
    // `max_lps_per_core` is an upper bound, hence round up. It is not an error
    // if there is only one core per cluster - can happen for L3.
    const size_t cores_per_cluster = DivCeil(lps_per_cluster, max_lps_per_core);
    max_cores_per_cluster = HWY_MAX(max_cores_per_cluster, cores_per_cluster);

    ForeachBit(num_groups, groups, lps, __LINE__,
               [cluster_idx](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].cluster = static_cast<uint16_t>(cluster_idx);
               });
    ++cluster_idx;
  };

  // Passes group bits to `foreach_cluster`, depending on relationship type.
  const auto foreach_die = [&foreach_cluster](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    foreach_cluster(p.GroupCount, p.GroupMask);
  };
  const auto foreach_l3 = [&foreach_cluster](const SLPI& info) {
    const CACHE_RELATIONSHIP& cr = info.Cache;
    // Only unified or data caches at level 3 define a cluster.
    if (cr.Type != CacheUnified && cr.Type != CacheData) return;
    if (cr.Level != 3) return;
    foreach_cluster(GroupCount(cr), cr.GroupMasks);
  };

  if (!ForEachSLPI(RelationProcessorDie, foreach_die)) {
    // Has been observed to fail; also check for shared L3 caches.
    (void)ForEachSLPI(RelationCache, foreach_l3);
  }
  if (max_cores_per_cluster == 0) {
    HWY_WARN("All clusters empty, assuming 1 core each\n");
    max_cores_per_cluster = 1;
  }
  return max_cores_per_cluster;
}
    750 
// Initializes `lps` and returns a `PackageSizes` vector (empty on failure)
// indicating the number of clusters and cores per package.
std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
  // These also set the per-LP core/smt/cluster fields.
  const size_t max_lps_per_core = MaxLpsPerCore(lps);
  const size_t max_cores_per_cluster =
      MaxCoresPerCluster(max_lps_per_core, lps);

  std::vector<PackageSizes> packages;
  size_t package_idx = 0;
  (void)ForEachSLPI(RelationProcessorPackage, [&](const SLPI& info) {
    const PROCESSOR_RELATIONSHIP& p = info.Processor;
    const size_t lps_per_package = NumBits(p.GroupCount, p.GroupMask);
    PackageSizes ps;  // avoid designated initializers for MSVC
    ps.num_clusters = max_cores_per_cluster;
    // `max_lps_per_core` is an upper bound, hence round up.
    ps.num_cores = DivCeil(lps_per_package, max_lps_per_core);
    packages.push_back(ps);

    // Tag every LP of this package with its package index.
    ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__,
               [package_idx](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].package = static_cast<uint8_t>(package_idx);
               });
    ++package_idx;
  });

  return packages;
}
    778 
// Sets LP.node for all `lps`.
void SetNodes(std::vector<Topology::LP>& lps) {
  // Zero-initialize all nodes in case the below fails.
  for (size_t lp = 0; lp < lps.size(); ++lp) {
    lps[lp].node = 0;
  }

  // We want the full NUMA nodes, but Windows Server 2022 truncates the results
  // of `RelationNumaNode` to a single 64-LP group. To get the old, unlimited
  // behavior without using the new `RelationNumaNodeEx` symbol, use the old
  // `RelationAll` and filter the SLPI we want.
  (void)ForEachSLPI(RelationAll, [&](const SLPI& info) {
    if (info.Relationship != RelationNumaNode) return;
    const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode;
    // This field was previously reserved/zero. There is at least one group.
    const size_t num_groups = HWY_MAX(1, GroupCount(nn));
    const uint8_t node = static_cast<uint8_t>(nn.NodeNumber);
    // Scatter the node id to every LP in the node's groups.
    ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__,
               [node](size_t lp, std::vector<Topology::LP>& lps) {
                 lps[lp].node = node;
               });
  });
}
    802 
    803 #elif HWY_OS_APPLE
    804 
    805 // Initializes `lps` and returns a `PackageSizes` vector (empty on failure)
    806 // indicating the number of clusters and cores per package.
    807 std::vector<PackageSizes> DetectPackages(std::vector<Topology::LP>& lps) {
    808  int err;
    809 
    810  size_t total_cores = 0;
    811  if (!Sysctl("hw.perflevel0.physicalcpu", 1, err, &total_cores)) {
    812    HWY_WARN("Error %d detecting total_cores, assuming one per LP\n", err);
    813    total_cores = lps.size();
    814  }
    815 
    816  if (lps.size() % total_cores != 0) {
    817    HWY_WARN("LPs %zu not a multiple of total_cores %zu\n", lps.size(),
    818             total_cores);
    819  }
    820  const size_t lp_per_core = DivCeil(lps.size(), total_cores);
    821 
    822  size_t cores_per_cluster = 0;
    823  if (!Sysctl("hw.perflevel0.cpusperl2", 1, err, &cores_per_cluster)) {
    824    HWY_WARN("Error %d detecting cores_per_cluster\n", err);
    825    cores_per_cluster = HWY_MIN(4, total_cores);
    826  }
    827 
    828  if (total_cores % cores_per_cluster != 0) {
    829    HWY_WARN("total_cores %zu not a multiple of cores_per_cluster %zu\n",
    830             total_cores, cores_per_cluster);
    831  }
    832 
    833  for (size_t lp = 0; lp < lps.size(); ++lp) {
    834    lps[lp].package = 0;  // single package
    835    lps[lp].core = static_cast<uint16_t>(lp / lp_per_core);
    836    lps[lp].smt = static_cast<uint8_t>(lp % lp_per_core);
    837    lps[lp].cluster = static_cast<uint16_t>(lps[lp].core / cores_per_cluster);
    838  }
    839 
    840  PackageSizes ps;
    841  ps.num_clusters = DivCeil(total_cores, cores_per_cluster);
    842  ps.num_cores = total_cores;
    843  return std::vector<PackageSizes>{ps};
    844 }
    845 
    846 // Sets LP.node for all `lps`.
    847 void SetNodes(std::vector<Topology::LP>& lps) {
    848  for (size_t lp = 0; lp < lps.size(); ++lp) {
    849    lps[lp].node = 0;  // no NUMA
    850  }
    851 }
    852 
    853 #endif  // HWY_OS_*
    854 
    855 #if HWY_OS_WIN || HWY_OS_APPLE
    856 
    857 void SetClusterCacheSizes(std::vector<Topology::Package>& packages) {
    858  // Assumes clusters are homogeneous. Otherwise, we would have to scan
    859  // `RelationCache` again and find the corresponding package_idx.
    860  const Cache* caches = DataCaches();
    861  const size_t private_kib = caches ? caches[2].size_kib : 0;
    862  const size_t shared_kib = caches ? caches[3].size_kib : 0;
    863 
    864  for (size_t ip = 0; ip < packages.size(); ++ip) {
    865    Topology::Package& p = packages[ip];
    866    for (size_t ic = 0; ic < p.clusters.size(); ++ic) {
    867      Topology::Cluster& c = p.clusters[ic];
    868      c.private_kib = private_kib;
    869      c.shared_kib = shared_kib;
    870    }
    871  }
    872 }
    873 
    874 #endif  // HWY_OS_WIN || HWY_OS_APPLE
    875 
    876 }  // namespace
    877 
    878 HWY_CONTRIB_DLLEXPORT Topology::Topology() {
    879 #if HWY_OS_LINUX || HWY_OS_WIN || HWY_OS_APPLE
    880  lps.resize(TotalLogicalProcessors());
    881  const std::vector<PackageSizes>& package_sizes = DetectPackages(lps);
    882  if (package_sizes.empty()) return;
    883  SetNodes(lps);
    884 
    885  // Allocate per-package/cluster/core vectors. This indicates to callers that
    886  // detection succeeded.
    887  packages.resize(package_sizes.size());
    888  for (size_t p = 0; p < packages.size(); ++p) {
    889    packages[p].clusters.resize(package_sizes[p].num_clusters);
    890    packages[p].cores.resize(package_sizes[p].num_cores);
    891  }
    892 
    893  // Populate the per-cluster/core sets of LP.
    894  for (size_t lp = 0; lp < lps.size(); ++lp) {
    895    Package& p = packages[lps[lp].package];
    896    p.clusters[lps[lp].cluster].lps.Set(lp);
    897    p.cores[lps[lp].core].lps.Set(lp);
    898  }
    899 
    900  SetClusterCacheSizes(packages);
    901 #endif  // HWY_OS_*
    902 }
    903 
    904 // ------------------------------ Cache detection
    905 
namespace {

// Index = cache level: [1]..[3] hold L1/L2/L3 data caches; [0] is not
// populated by the detection code below.
using Caches = std::array<Cache, 4>;

// We assume homogeneous caches across all clusters because some OS APIs return
// a single value for a class of CPUs.
    912 
    913 #if HWY_OS_LINUX
    914 std::string ReadString(const char* name, size_t index) {
    915  // First CPU is usually a P core.
    916  const std::string path("/sys/devices/system/cpu/cpu0/cache/index%zu/");
    917  char buf200[200];
    918  size_t end = ReadSysfs((path + name).c_str(), index, buf200);
    919  // Remove trailing newline/null to simplify string comparison.
    920  for (; end != 0; --end) {
    921    if (buf200[end - 1] != '\0' && buf200[end - 1] != '\n') break;
    922  }
    923  return std::string(buf200, buf200 + end);
    924 }
    925 
    926 template <typename T>
    927 bool WriteSysfs(const char* name, size_t index, T* out) {
    928  const std::string str = ReadString(name, index);
    929  // Do not call `ParseNumberWithOptionalSuffix` because it acts on the
    930  // K suffix in "size", but we actually want KiB.
    931  size_t pos = 0;
    932  size_t val;
    933  if (!ParseDigits(str.c_str(), str.length(), pos, &val)) return false;
    934  HWY_ASSERT(pos <= str.length());
    935  *out = static_cast<T>(val);
    936  return true;
    937 }
    938 
// Reading from sysfs is preferred because sysconf returns L3 associativity = 0
// on some CPUs, and does not indicate sharing across cores.
// https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu
// Fills `caches[1..3]` with *per-core* sizes; returns true iff at least L1 and
// L2 were detected.
bool InitCachesSysfs(Caches& caches) {
  // For computing shared cache sizes.
  std::vector<hwy::Topology::LP> lps(TotalLogicalProcessors());
  const std::vector<PackageSizes> package_sizes = DetectPackages(lps);
  // `package_sizes` is only used to check that `lps` were filled.
  if (package_sizes.empty()) {
    HWY_WARN("no packages, shared cache sizes may be incorrect\n");
    return false;
  }

  // Iterate over cache/index0, index1, ... until "type" is missing.
  for (size_t i = 0;; ++i) {
    const std::string type = ReadString("type", i);
    if (type.empty()) break;  // done, no more entries
    if (type != "Data" && type != "Unified") continue;
    uint32_t level;
    if (!WriteSysfs("level", i, &level)) continue;
    if (level != 1 && level != 2 && level != 3) continue;
    Cache& c = caches[level];

    // Check before overwriting any fields.
    if (c.size_kib != 0) {
      HWY_WARN("ignoring another L%u, first size %u\n", level, c.size_kib);
      continue;
    }

    const bool ok = WriteSysfs("size", i, &c.size_kib) &&
                    WriteSysfs("ways_of_associativity", i, &c.associativity) &&
                    WriteSysfs("number_of_sets", i, &c.sets);
    if (HWY_UNLIKELY(!ok)) {
      // Reset to all-zero so the final L1/L2 presence check below fails.
      HWY_WARN("skipping partially-detected L%u, error %d\n", level, errno);
      c = Cache();
      continue;
    }

    // Compute line size *before* adjusting the size for sharing. Note that
    // `coherency_line_size` exists, but we are not sure that is the line size.
    const size_t bytes = static_cast<size_t>(c.size_kib) * 1024;
    const size_t lines = c.associativity * c.sets;
    c.bytes_per_line = static_cast<uint16_t>(DivByFactor(bytes, lines));

    // Divide by number of *cores* sharing the cache.
    const std::string shared_str = ReadString("shared_cpu_list", i);
    if (HWY_UNLIKELY(shared_str.empty())) {
      HWY_WARN("no shared_cpu_list for L%u %s\n", level, type.c_str());
      c.cores_sharing = 1;
    } else {
      const std::vector<size_t> shared_lps =
          ExpandList(shared_str.c_str(), shared_str.length(), lps.size() - 1);
      // Count only one LP per core: those with smt == 0.
      size_t num_cores = 0;
      for (size_t lp : shared_lps) {
        if (HWY_LIKELY(lp < lps.size())) {
          num_cores += lps[lp].smt == 0;
        } else {
          HWY_WARN("out of bounds lp %zu of %zu from %s\n", lp, lps.size(),
                   shared_str.c_str());
        }
      }
      if (num_cores == 0) {
        HWY_WARN("no cores sharing L%u %s, setting to 1\n", level,
                 type.c_str());
        num_cores = 1;
      }
      c.cores_sharing = static_cast<uint16_t>(num_cores);
      // There exist CPUs for which L3 is not evenly divisible by `num_cores`,
      // hence do not use `DivByFactor`. It is safer to round down.
      c.size_kib = static_cast<uint32_t>(c.size_kib / num_cores);
      c.sets = static_cast<uint32_t>(c.sets / num_cores);
    }
  }

  // Require L1 and L2 cache.
  if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) {
// Don't complain on Android because this is known to happen there. We are
// unaware of good alternatives: `getauxval(AT_L1D_CACHEGEOMETRY)` and
// `sysconf(_SC_LEVEL1_DCACHE_SIZE)` are unreliable, detecting via timing seems
// difficult to do reliably, and we do not want to maintain lists of known CPUs
// and their properties. It's OK to return false; callers are responsible for
// assuming reasonable defaults.
#ifndef __ANDROID__
    HWY_WARN("sysfs detected L1=%u L2=%u, err %d\n", caches[1].size_kib,
             caches[2].size_kib, errno);
#endif
    return false;
  }

  // L3 is optional; if not found, its size is already zero from static init.
  return true;
}
   1030 
   1031 #elif HWY_OS_WIN
   1032 
// Fills `caches[1..3]` with *per-core* sizes via the Windows
// `RelationCache` SLPI enumeration; returns true iff L1 and L2 were found.
bool InitCachesWin(Caches& caches) {
  std::vector<hwy::Topology::LP> lps(TotalLogicalProcessors());
  const size_t max_lps_per_core = MaxLpsPerCore(lps);

  (void)ForEachSLPI(RelationCache, [max_lps_per_core,
                                    &caches](const SLPI& info) {
    const CACHE_RELATIONSHIP& cr = info.Cache;
    // Only data/unified caches; skip instruction and trace caches.
    if (cr.Type != CacheUnified && cr.Type != CacheData) return;
    if (1 <= cr.Level && cr.Level <= 3) {
      Cache& c = caches[cr.Level];
      // If the size is non-zero then we (probably) have already detected this
      // cache and can skip the CR.
      if (c.size_kib > 0) return;
      c.size_kib = static_cast<uint32_t>(DivByFactor(cr.CacheSize, 1024));
      c.bytes_per_line = static_cast<uint16_t>(cr.LineSize);
      // Windows reports fully-associative as a sentinel; map it to our cap.
      c.associativity = (cr.Associativity == CACHE_FULLY_ASSOCIATIVE)
                            ? Cache::kMaxAssociativity
                            : cr.Associativity;

      // How many cores share this cache?
      size_t shared_with = NumBits(GroupCount(cr), cr.GroupMasks);
      // Divide out hyperthreads. This core may have fewer than
      // `max_lps_per_core`, hence round up.
      shared_with = DivCeil(shared_with, max_lps_per_core);
      if (shared_with == 0) {
        HWY_WARN("no cores sharing L%u, setting to 1\n", cr.Level);
        shared_with = 1;
      }

      // Update `size_kib` to *per-core* portion.
      // There exist CPUs for which L3 is not evenly divisible by `shared_with`,
      // hence do not use `DivByFactor`. It is safer to round down.
      c.size_kib = static_cast<uint32_t>(c.size_kib / shared_with);
      c.cores_sharing = static_cast<uint16_t>(shared_with);
    }
  });

  // Require L1 and L2 cache.
  if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) {
    HWY_WARN("Windows detected L1=%u, L2=%u, err %lx\n", caches[1].size_kib,
             caches[2].size_kib, GetLastError());
    return false;
  }

  // L3 is optional; if not found, its size is already zero from static init.
  return true;
}
   1080 
   1081 #elif HWY_OS_APPLE
   1082 
// Fills `caches[1..3]` via sysctl, with model-based fallbacks for fields that
// Apple Silicon does not report; returns true iff L1 and L2 were detected.
bool InitCachesApple(Caches& caches) {
  int err = 0;
  Cache& L1 = caches[1];
  Cache& L2 = caches[2];
  Cache& L3 = caches[3];

  // Total L1 and L2 size can be reliably queried, but prefer perflevel0
  // (P-cores) because hw.l1dcachesize etc. are documented to describe the
  // "least performant core".
  bool ok = Sysctl("hw.perflevel0.l1dcachesize", 1024, err, &L1.size_kib) ||
            Sysctl("hw.l1dcachesize", 1024, err, &L1.size_kib);
  ok &= Sysctl("hw.perflevel0.l2cachesize", 1024, err, &L2.size_kib) ||
        Sysctl("hw.l2cachesize", 1024, err, &L2.size_kib);
  if (HWY_UNLIKELY(!ok)) {
    HWY_WARN("Apple cache detection failed, error %d\n", err);
    return false;
  }
  L1.cores_sharing = 1;
  if (Sysctl("hw.perflevel0.cpusperl2", 1, err, &L2.cores_sharing)) {
    // There exist CPUs for which L2 is not evenly divisible by `cores_sharing`,
    // hence do not use `DivByFactor`. It is safer to round down.
    L2.size_kib /= L2.cores_sharing;
  } else {
    L2.cores_sharing = 1;
  }

  // Other properties are not always reported. Set `associativity` and
  // `bytes_per_line` based on known models.
  char brand[128] = {0};
  size_t size = sizeof(brand);
  // sysctlbyname returns 0 on success. `brand` is zero-initialized, so the
  // brand[6]/brand[7] reads below are safe even for short strings.
  if (!sysctlbyname("machdep.cpu.brand_string", brand, &size, nullptr, 0)) {
    if (strncmp(brand, "Apple ", 6) != 0) {
      // Unexpected, but we will continue to check the string suffixes.
      HWY_WARN("unexpected Apple brand %s\n", brand);
    }

    if (brand[6] == 'M') {
      // https://dougallj.github.io/applecpu/firestorm.html,
      // https://www.7-cpu.com/cpu/Apple_M1.html:
      L1.bytes_per_line = 64;
      L1.associativity = 8;
      L2.bytes_per_line = 128;
      if (brand[7] == '1') {  // M1
        L2.associativity = 12;
      } else if ('2' <= brand[7] && brand[7] <= '4') {  // M2/M3, maybe also M4
        L2.associativity = 16;
      } else {
        L2.associativity = 0;  // Unknown, set below via sysctl.
      }

      // Although Wikipedia lists SLC sizes per model, we do not know how it is
      // partitioned/allocated, so do not treat it as a reliable L3.
    }  // M*
  }  // brand string

  // This sysctl does not distinguish between L1 and L2 line sizes, so only use
  // it if we have not already set `bytes_per_line` above.
  uint16_t bytes_per_line;
  if (!Sysctl("hw.cachelinesize", 1, err, &bytes_per_line)) {
    bytes_per_line = static_cast<uint16_t>(HWY_ALIGNMENT);  // guess
  }
  for (size_t level = 1; level <= 3; ++level) {
    if (caches[level].bytes_per_line == 0) {
      caches[level].bytes_per_line = bytes_per_line;
    }
  }

  // Fill in associativity if not already set. Unfortunately this is only
  // reported on x86, not on M*.
  if (L1.associativity == 0 && !Sysctl("machdep.cpu.cache.L1_associativity", 1,
                                       err, &L1.associativity)) {
    L1.associativity = 8;  // guess
  }
  if (L2.associativity == 0 && !Sysctl("machdep.cpu.cache.L2_associativity", 1,
                                       err, &L2.associativity)) {
    L2.associativity = 12;  // guess
  }
  // There is no L3_associativity.
  if (L3.associativity == 0) {
    L3.associativity = 12;  // guess
  }

  // Now attempt to query L3. Although this sysctl is documented, M3 does not
  // report an L3 cache.
  if (L3.size_kib == 0 &&
      (Sysctl("hw.perflevel0.l3cachesize", 1024, err, &L3.size_kib) ||
       Sysctl("hw.l3cachesize", 1024, err, &L3.size_kib))) {
    // There exist CPUs for which L3 is not evenly divisible by `cores_sharing`,
    // hence do not use `DivByFactor`. It is safer to round down.
    if (Sysctl("hw.perflevel0.cpusperl3", 1, err, &L3.cores_sharing)) {
      L3.size_kib /= L3.cores_sharing;
    } else {
      L3.cores_sharing = 1;
    }
  }
  // If no L3 cache, reset all fields for consistency.
  if (L3.size_kib == 0) {
    L3 = Cache();
  }

  // Are there other useful sysctls? hw.cacheconfig appears to be how many
  // cores share the memory and caches, though this is not documented, and
  // duplicates information in hw.perflevel0.cpusperl*.

  return true;
}
   1189 
   1190 #endif  // HWY_OS_*
   1191 
   1192 // Most APIs do not set the `sets` field, so compute it from the size and
   1193 // associativity, and if a value is already set, ensure it matches.
   1194 HWY_MAYBE_UNUSED void ComputeSets(Cache& c) {
   1195  // If there is no such cache, avoid division by zero.
   1196  if (HWY_UNLIKELY(c.size_kib == 0)) {
   1197    c.sets = 0;
   1198    return;
   1199  }
   1200  const size_t bytes = static_cast<size_t>(c.size_kib) * 1024;
   1201  // `size_kib` may have been rounded down, hence `lines` and `sets` are not
   1202  // necessarily evenly divisible, so round down instead of `DivByFactor`.
   1203  const size_t lines = bytes / c.bytes_per_line;
   1204  const size_t sets = lines / c.associativity;
   1205 
   1206  if (c.sets == 0) {
   1207    c.sets = static_cast<uint32_t>(sets);
   1208  } else {
   1209    const size_t diff = c.sets - sets;
   1210    if (diff > 1) {
   1211      HWY_ABORT("Inconsistent cache sets %u != %zu\n", c.sets, sets);
   1212    }
   1213  }
   1214 }
   1215 
// One-time detection of the data-cache hierarchy. Returns a pointer to four
// `Cache` entries (indexed by level, [0] unused) or nullptr on failure.
const Cache* InitDataCaches() {
  alignas(64) static Caches caches;

  // On failure, return immediately because InitCaches*() already warn.
#if HWY_OS_LINUX
  if (HWY_UNLIKELY(!InitCachesSysfs(caches))) return nullptr;
#elif HWY_OS_WIN
  if (HWY_UNLIKELY(!InitCachesWin(caches))) return nullptr;
#elif HWY_OS_APPLE
  if (HWY_UNLIKELY(!InitCachesApple(caches))) return nullptr;
#else
  HWY_WARN("Cache detection not implemented for this platform.\n");
  (void)caches;
  return nullptr;
// Placed after the return, but the preprocessor defines this unconditionally
// for unsupported platforms; it only gates the code below out of compilation.
#define HWY_NO_CACHE_DETECTION
#endif

  // Prevents "code not reached" warnings on WASM.
#ifndef HWY_NO_CACHE_DETECTION
  for (size_t level = 1; level <= 3; ++level) {
    ComputeSets(caches[level]);
  }

  // Heuristic to ignore SLCs such as on Ampere Altra, which should not be
  // treated as a reliable L3 because of their cache inclusion policy.
  // On Apple M*, these are not even reported as an L3.
  if (caches[3].cores_sharing >= 16 && caches[3].size_kib <= 512) {
    caches[3] = Cache();
  }

  return &caches[0];
#endif  // HWY_NO_CACHE_DETECTION
}
   1249 
   1250 }  // namespace
   1251 
   1252 HWY_CONTRIB_DLLEXPORT const Cache* DataCaches() {
   1253  static const Cache* caches = InitDataCaches();
   1254  return caches;
   1255 }
   1256 
   1257 }  // namespace hwy