tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

topology.h (5186B)


      1 // Copyright 2024 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
     17 #define HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
     18 
     19 // OS-specific functions for processor topology and thread affinity.
     20 
     21 #include <stddef.h>
     22 
     23 #include <vector>
     24 
     25 #include "hwy/base.h"
     26 #include "hwy/bit_set.h"
     27 
     28 namespace hwy {
     29 
     30 // Returns false if std::thread should not be used.
     31 HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport();
     32 
     33 // Upper bound on logical processors, including hyperthreads.
     34 static constexpr size_t kMaxLogicalProcessors = 1024;  // matches glibc
     35 
     36 // Set used by Get/SetThreadAffinity.
     37 using LogicalProcessorSet = BitSet4096<kMaxLogicalProcessors>;
     38 
     39 // Returns false, or sets `lps` to all logical processors which are online and
     40 // available to the current thread.
     41 HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps);
     42 
     43 // Ensures the current thread can only run on the logical processors in `lps`.
     44 // Returns false if not supported (in particular on Apple), or if the
     45 // intersection between `lps` and `GetThreadAffinity` is the empty set.
     46 HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps);
     47 
     48 // Returns false, or ensures the current thread will only run on `lp`, which
     49 // must not exceed `TotalLogicalProcessors`. Note that this merely calls
     50 // `SetThreadAffinity`, see the comment there.
     51 static inline bool PinThreadToLogicalProcessor(size_t lp) {
     52  LogicalProcessorSet lps;
     53  lps.Set(lp);
     54  return SetThreadAffinity(lps);
     55 }
     56 
     57 // Returns 1 if unknown, otherwise the total number of logical processors
     58 // provided by the hardware clamped to `kMaxLogicalProcessors`.
     59 // These processors are not necessarily all usable; you can determine which are
     60 // via GetThreadAffinity().
     61 HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors();
     62 
     63 struct Topology {
     64  // Caller must check packages.empty(); if so, do not use any fields.
     65  HWY_CONTRIB_DLLEXPORT Topology();
     66 
     67  // Clique of cores with lower latency to each other. On Apple M1 these are
     68  // four cores sharing an L2. On Zen4 these 'CCX' are up to eight cores sharing
     69  // an L3 and a memory controller, or for Zen4c up to 16 and half the L3 size.
     70  struct Cluster {
     71    LogicalProcessorSet lps;
     72    uint64_t private_kib = 0;  // 0 if unknown
     73    uint64_t shared_kib = 0;   // 0 if unknown
     74    uint64_t reserved1 = 0;
     75    uint64_t reserved2 = 0;
     76    uint64_t reserved3 = 0;
     77  };
     78 
     79  struct Core {
     80    LogicalProcessorSet lps;
     81    uint64_t reserved = 0;
     82  };
     83 
     84  struct Package {
     85    std::vector<Cluster> clusters;
     86    std::vector<Core> cores;
     87  };
     88 
     89  std::vector<Package> packages;
     90 
     91  // Several hundred instances, so prefer a compact representation.
     92 #pragma pack(push, 1)
     93  struct LP {
     94    uint16_t cluster = 0;  // < packages[package].clusters.size()
     95    uint16_t core = 0;     // < packages[package].cores.size()
     96    uint8_t package = 0;   // < packages.size()
     97    uint8_t smt = 0;       // < packages[package].cores[core].lps.Count()
     98    uint8_t node = 0;
     99 
    100    uint8_t reserved = 0;
    101  };
    102 #pragma pack(pop)
    103  std::vector<LP> lps;  // size() == TotalLogicalProcessors().
    104 };
    105 
    106 #pragma pack(push, 1)
    107 // Cache parameters. Note the overlap with `HWY_ALIGNMENT`, which is intended
    108 // but not guaranteed to be an upper bound for L1/L2 line sizes, and
    109 // `Topology::Cluster::private_kib/shared_kib`, which are intended but not
    110 // guaranteed to be the L2/L3 sizes. Getting the exact parameters, including the
    111 // ways of associativity, can be useful for modeling cache conflicts.
    112 //
    113 // Uses packed fields so the array of `Cache` fits in a typical cache line.
    114 struct Cache {
    115  // Arbitrary upper bound for sanity checking.
    116  static constexpr uint16_t kMaxAssociativity = 128;
    117 
    118  // Zero if the level does not exist; *per-core* portion for shared caches.
    119  uint32_t size_kib = 0;
    120  // Also per-core portion, computed as number of lines / associativity.
    121  uint32_t sets = 0;
    122  uint16_t bytes_per_line = 0;
    123  uint16_t associativity = 0;  // number of ways
    124  uint16_t cores_sharing = 0;  // usually 1 for L1
    125  uint16_t reserved = 0;
    126 };
    127 static_assert(sizeof(Cache) == 16, "Unexpected size");
    128 #pragma pack(pop)
    129 
    130 // Returns null if unknown, otherwise pointer to an array of `Cache` instances,
    131 // where entry 0 is reserved, entry 1 describes the L1 data cache, entry 2
    132 // describes the (possibly unified or shared) L2, and entry 3 describes the L3
    133 // if its `size_kib != 0`.
    134 //
    135 // Initializes on-demand, which has some overhead for thread safety, hence
    136 // callers should cache the result.
    137 HWY_CONTRIB_DLLEXPORT const Cache* DataCaches();
    138 
    139 }  // namespace hwy
    140 
    141 #endif  // HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_