topology.h (5186B)
// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
#define HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_

// OS-specific functions for processor topology and thread affinity.

#include <stddef.h>

#include <vector>

#include "hwy/base.h"
#include "hwy/bit_set.h"

namespace hwy {

// Returns false if std::thread should not be used (e.g. no OS threading
// support); callers should then run single-threaded.
HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport();

// Upper bound on logical processors, including hyperthreads.
// 1024 matches glibc (presumably its cpu_set_t capacity — confirm there).
static constexpr size_t kMaxLogicalProcessors = 1024;  // matches glibc

// Set used by Get/SetThreadAffinity; one bit per logical processor index in
// [0, kMaxLogicalProcessors).
using LogicalProcessorSet = BitSet4096<kMaxLogicalProcessors>;

// Returns false, or sets `lps` to all logical processors which are online and
// available to the current thread.
HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps);

// Ensures the current thread can only run on the logical processors in `lps`.
// Returns false if not supported (in particular on Apple), or if the
// intersection between `lps` and `GetThreadAffinity` is the empty set.
HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps);

// Returns false, or ensures the current thread will only run on `lp`, which
// must not exceed `TotalLogicalProcessors`.
// Note that this merely calls
// `SetThreadAffinity`, see the comment there.
static inline bool PinThreadToLogicalProcessor(size_t lp) {
  // Single-member set: the thread may run only on `lp`.
  LogicalProcessorSet lps;
  lps.Set(lp);
  return SetThreadAffinity(lps);
}

// Returns 1 if unknown, otherwise the total number of logical processors
// provided by the hardware clamped to `kMaxLogicalProcessors`.
// These processors are not necessarily all usable; you can determine which are
// via GetThreadAffinity().
HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors();

// Detected processor hierarchy: packages contain clusters and cores; `lps`
// maps each logical processor back into that hierarchy.
struct Topology {
  // Caller must check packages.empty(); if so, do not use any fields.
  HWY_CONTRIB_DLLEXPORT Topology();

  // Clique of cores with lower latency to each other. On Apple M1 these are
  // four cores sharing an L2. On Zen4 these 'CCX' are up to eight cores sharing
  // an L3 and a memory controller, or for Zen4c up to 16 and half the L3 size.
  struct Cluster {
    LogicalProcessorSet lps;   // logical processors belonging to this cluster
    uint64_t private_kib = 0;  // 0 if unknown
    uint64_t shared_kib = 0;   // 0 if unknown
    // Reserved for future fields without breaking layout/ABI.
    uint64_t reserved1 = 0;
    uint64_t reserved2 = 0;
    uint64_t reserved3 = 0;
  };

  // One physical core; `lps` holds its logical processors (its SMT siblings,
  // judging by `LP::smt` below — confirm in the implementation).
  struct Core {
    LogicalProcessorSet lps;
    uint64_t reserved = 0;  // reserved for future fields
  };

  // One CPU package (socket).
  struct Package {
    std::vector<Cluster> clusters;
    std::vector<Core> cores;
  };

  std::vector<Package> packages;

  // Per-logical-processor indices into the hierarchy above.
  // Several hundred instances, so prefer a compact representation.
#pragma pack(push, 1)
  struct LP {
    uint16_t cluster = 0;  // < packages[package].clusters.size()
    uint16_t core = 0;     // < packages[package].cores.size()
    uint8_t package = 0;   // < packages.size()
    uint8_t smt = 0;       // < packages[package].cores[core].lps.Count()
    uint8_t node = 0;      // presumably the NUMA node index — confirm

    uint8_t reserved = 0;  // padding/reserved; keeps sizeof(LP) even
  };
#pragma pack(pop)
  std::vector<LP> lps;  // size() == TotalLogicalProcessors().
};

#pragma pack(push, 1)
// Cache parameters.
// Note the overlap with `HWY_ALIGNMENT`, which is intended
// but not guaranteed to be an upper bound for L1/L2 line sizes, and
// `Topology::Cluster::private_kib/shared_kib`, which are intended but not
// guaranteed to be the L2/L3 sizes. Getting the exact parameters, including the
// ways of associativity, can be useful for modeling cache conflicts.
//
// Uses packed fields so the array of `Cache` fits in a typical cache line.
struct Cache {
  // Arbitrary upper bound for sanity checking.
  static constexpr uint16_t kMaxAssociativity = 128;

  // Zero if the level does not exist; *per-core* portion for shared caches.
  uint32_t size_kib = 0;
  // Also per-core portion, computed as number of lines / associativity.
  uint32_t sets = 0;
  uint16_t bytes_per_line = 0;
  uint16_t associativity = 0;   // number of ways
  uint16_t cores_sharing = 0;   // usually 1 for L1
  uint16_t reserved = 0;        // reserved; also pads sizeof(Cache) to 16
};
// Locks in the packed layout; update the pack pragma and fields together.
static_assert(sizeof(Cache) == 16, "Unexpected size");
#pragma pack(pop)

// Returns null if unknown, otherwise pointer to an array of `Cache` instances,
// where entry 0 is reserved, entry 1 describes the L1 data cache, entry 2
// describes the (possibly unified or shared) L2, and entry 3 describes the L3
// if its `size_kib != 0`.
//
// Initializes on-demand, which has some overhead for thread safety, hence
// callers should cache the result.
HWY_CONTRIB_DLLEXPORT const Cache* DataCaches();

}  // namespace hwy

#endif  // HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_