targets.h (15830B)
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_TARGETS_H_
#define HIGHWAY_HWY_TARGETS_H_

// Allows opting out of C++ standard library usage, which is not available in
// some Compiler Explorer environments.
#ifndef HWY_NO_LIBCXX
#include <vector>
#endif

// For SIMD module implementations and their callers. Defines which targets to
// generate and call.

#include "hwy/base.h"
#include "hwy/detect_targets.h"
#include "hwy/highway_export.h"

// std::atomic is the storage type of ChosenTarget::mask_ whenever the standard
// library is available.
#if !defined(HWY_NO_LIBCXX)
#include <atomic>
#endif

namespace hwy {

// Returns bitfield of enabled targets that are supported on this CPU; there is
// always at least one such target, hence the return value is never 0. The
// targets returned may change after calling DisableTargets. This function is
// always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding
// calls to it if there is only a single target enabled.
HWY_DLLEXPORT int64_t SupportedTargets();

// Evaluates to a function call, or literal if there is a single target.
// `x & (x - 1)` clears the lowest set bit of x, so the test below is true iff
// HWY_TARGETS has at most one bit set.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
#define HWY_SUPPORTED_TARGETS HWY_TARGETS
#else
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#endif

// Subsequent SupportedTargets will not return targets whose bit(s) are set in
// `disabled_targets`.
Exception: if SupportedTargets would return 0, it will 54 // instead return HWY_STATIC_TARGET (there must always be one target to call). 55 // 56 // This function is useful for disabling targets known to be buggy, or if the 57 // best available target is undesirable (perhaps due to throttling or memory 58 // bandwidth limitations). Use SetSupportedTargetsForTest instead of this 59 // function for iteratively enabling specific targets for testing. 60 HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets); 61 62 // Subsequent SupportedTargets will return the given set of targets, except 63 // those disabled via DisableTargets. Call with a mask of 0 to disable the mock 64 // and return to the normal SupportedTargets behavior. Used to run tests for 65 // all targets. 66 HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets); 67 68 #ifndef HWY_NO_LIBCXX 69 70 // Return the list of targets in HWY_TARGETS supported by the CPU as a list of 71 // individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list 72 // is affected by the current SetSupportedTargetsForTest() mock if any. 73 HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() { 74 std::vector<int64_t> ret; 75 for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0; 76 targets = targets & (targets - 1)) { 77 int64_t current_target = targets & ~(targets - 1); 78 ret.push_back(current_target); 79 } 80 return ret; 81 } 82 83 #endif // HWY_NO_LIBCXX 84 85 // Returns a string that satisfies gtest IsValidParamName(). No longer report 86 // targets as "Unknown" if they are for a different architecture, because some 87 // users unconditionally disable targets and we want to see which. 
88 static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) { 89 switch (target) { 90 case HWY_EMU128: 91 return "EMU128"; 92 case HWY_SCALAR: 93 return "SCALAR"; 94 95 // X86 96 case HWY_SSE2: 97 return "SSE2"; 98 case HWY_SSSE3: 99 return "SSSE3"; 100 case HWY_SSE4: 101 return "SSE4"; 102 case HWY_AVX2: 103 return "AVX2"; 104 case HWY_AVX3: 105 return "AVX3"; 106 case HWY_AVX3_DL: 107 return "AVX3_DL"; 108 case HWY_AVX3_ZEN4: 109 return "AVX3_ZEN4"; 110 case HWY_AVX3_SPR: 111 return "AVX3_SPR"; 112 case HWY_AVX10_2: 113 return "AVX10_2"; 114 115 // ARM 116 case HWY_SVE2_128: 117 return "SVE2_128"; 118 case HWY_SVE_256: 119 return "SVE_256"; 120 case HWY_SVE2: 121 return "SVE2"; 122 case HWY_SVE: 123 return "SVE"; 124 case HWY_NEON_BF16: 125 return "NEON_BF16"; 126 case HWY_NEON: 127 return "NEON"; 128 case HWY_NEON_WITHOUT_AES: 129 return "NEON_WITHOUT_AES"; 130 131 // PPC 132 case HWY_PPC8: 133 return "PPC8"; 134 case HWY_PPC9: 135 return "PPC9"; 136 case HWY_PPC10: 137 return "PPC10"; 138 139 // S390X 140 case HWY_Z14: 141 return "Z14"; 142 case HWY_Z15: 143 return "Z15"; 144 145 // WASM 146 case HWY_WASM: 147 return "WASM"; 148 case HWY_WASM_EMU256: 149 return "WASM_EMU256"; 150 151 // RISCV 152 case HWY_RVV: 153 return "RVV"; 154 155 // LOONGARCH 156 case HWY_LSX: 157 return "LSX"; 158 case HWY_LASX: 159 return "LASX"; 160 } 161 162 return "Unknown"; 163 } 164 165 // Invokes VISITOR(TARGET, NAMESPACE) for all enabled targets. Alphabetic order. 
// The per-target HWY_VISIT_* macros used below are defined outside this
// section. NOTE(review): presumably each one expands to nothing when its
// target is not enabled - confirm against their definitions.
#define HWY_VISIT_TARGETS(VISITOR)    \
  HWY_VISIT_AVX10_2(VISITOR)          \
  HWY_VISIT_AVX2(VISITOR)             \
  HWY_VISIT_AVX3(VISITOR)             \
  HWY_VISIT_AVX3_DL(VISITOR)          \
  HWY_VISIT_AVX3_SPR(VISITOR)         \
  HWY_VISIT_AVX3_ZEN4(VISITOR)        \
  HWY_VISIT_FALLBACK(VISITOR)         \
  HWY_VISIT_LASX(VISITOR)             \
  HWY_VISIT_LSX(VISITOR)              \
  HWY_VISIT_NEON(VISITOR)             \
  HWY_VISIT_NEON_BF16(VISITOR)        \
  HWY_VISIT_NEON_WITHOUT_AES(VISITOR) \
  HWY_VISIT_PPC10(VISITOR)            \
  HWY_VISIT_PPC8(VISITOR)             \
  HWY_VISIT_PPC9(VISITOR)             \
  HWY_VISIT_RVV(VISITOR)              \
  HWY_VISIT_SSE2(VISITOR)             \
  HWY_VISIT_SSE4(VISITOR)             \
  HWY_VISIT_SSSE3(VISITOR)            \
  HWY_VISIT_SVE(VISITOR)              \
  HWY_VISIT_SVE2(VISITOR)             \
  HWY_VISIT_SVE2_128(VISITOR)         \
  HWY_VISIT_SVE_256(VISITOR)          \
  HWY_VISIT_WASM(VISITOR)             \
  HWY_VISIT_WASM_EMU256(VISITOR)      \
  HWY_VISIT_Z14(VISITOR)              \
  HWY_VISIT_Z15(VISITOR)

// The maximum number of dynamic targets on any architecture is defined by
// HWY_MAX_DYNAMIC_TARGETS and depends on the arch.

// For the ChosenTarget mask and index we use a different bit arrangement than
// in the HWY_TARGETS mask. Only the targets involved in the current
// architecture are used in this mask, and therefore only the least significant
// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least
// significant bit is set when the mask is not initialized, the next
// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
// that position and the next more significant bit is used for HWY_SCALAR (if
// HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to
// define equivalent values for HWY_TARGETS in this representation.
// This mask representation allows using ctz() on the mask to obtain a small
// number that is used as an index into the table for dynamic dispatch.
// In this way the first entry is used when the mask is uninitialized, the
// following HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one
// is for scalar.

// The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format.
// Bit 0 is the "uninitialized" flag and bits 1..HWY_MAX_DYNAMIC_TARGETS hold
// the dynamic targets, hence this is bit (HWY_MAX_DYNAMIC_TARGETS + 1).
#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))

// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
// current architecture. Steps, innermost first:
// 1. Shift right so that bit (HWY_HIGHEST_TARGET_BIT + 1 -
//    HWY_MAX_DYNAMIC_TARGETS) of X lands on bit 0.
// 2. Keep only the lowest HWY_MAX_DYNAMIC_TARGETS bits.
// 3. Shift left by one, leaving bit 0 free for the "uninitialized" flag.
#define HWY_CHOSEN_TARGET_SHIFT(X)                                    \
  ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
    ((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1))                           \
   << 1)

// The HWY_TARGETS mask in the ChosenTarget mask format.
// Also includes the scalar bit and bit 0, so that ANDing a ChosenTarget mask
// with this value (as ChosenTarget::GetIndex does) preserves both of them.
#define HWY_CHOSEN_TARGET_MASK_TARGETS \
  (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)

#if HWY_ARCH_X86
// Maximum number of dynamic targets, changing this value is an ABI incompatible
// change
#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
// These must match the order in which the HWY_TARGETS are defined
// starting by the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
// corresponds to the best target. Don't include a "," at the end of the list.
// x86 table: 15 entries. nullptr is used for bits that have no HWY_CHOOSE_*
// entry here (reserved bits, AVX and the SSE3 slot).
#define HWY_CHOOSE_TARGET_LIST(func_name)                 \
  nullptr,                         /* reserved */         \
  nullptr,                         /* reserved */         \
  nullptr,                         /* reserved */         \
  HWY_CHOOSE_AVX10_2(func_name),   /* AVX10_2 */          \
  HWY_CHOOSE_AVX3_SPR(func_name),  /* AVX3_SPR */         \
  nullptr,                         /* reserved */         \
  HWY_CHOOSE_AVX3_ZEN4(func_name), /* AVX3_ZEN4 */        \
  HWY_CHOOSE_AVX3_DL(func_name),   /* AVX3_DL */          \
  HWY_CHOOSE_AVX3(func_name),      /* AVX3 */             \
  HWY_CHOOSE_AVX2(func_name),      /* AVX2 */             \
  nullptr,                         /* AVX */              \
  HWY_CHOOSE_SSE4(func_name),      /* SSE4 */             \
  HWY_CHOOSE_SSSE3(func_name),     /* SSSE3 */            \
  nullptr,                         /* reserved - SSE3? */ \
  HWY_CHOOSE_SSE2(func_name)       /* SSE2 */

#elif HWY_ARCH_ARM
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
// ARM table: 15 entries, nullptr for reserved bits.
#define HWY_CHOOSE_TARGET_LIST(func_name)                          \
  nullptr,                                /* reserved */           \
  nullptr,                                /* reserved */           \
  nullptr,                                /* reserved */           \
  HWY_CHOOSE_SVE2_128(func_name),         /* SVE2 128-bit */       \
  HWY_CHOOSE_SVE_256(func_name),          /* SVE 256-bit */        \
  nullptr,                                /* reserved */           \
  nullptr,                                /* reserved */           \
  nullptr,                                /* reserved */           \
  HWY_CHOOSE_SVE2(func_name),             /* SVE2 */               \
  HWY_CHOOSE_SVE(func_name),              /* SVE */                \
  nullptr,                                /* reserved */           \
  HWY_CHOOSE_NEON_BF16(func_name),        /* NEON + f16/dot/bf16 */ \
  nullptr,                                /* reserved */           \
  HWY_CHOOSE_NEON(func_name),             /* NEON */               \
  HWY_CHOOSE_NEON_WITHOUT_AES(func_name)  /* NEON without AES */

#elif HWY_ARCH_RISCV
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
// RISC-V table: 9 entries, of which only RVV is populated.
#define HWY_CHOOSE_TARGET_LIST(func_name)       \
  nullptr,                   /* reserved */     \
  nullptr,                   /* reserved */     \
  nullptr,                   /* reserved */     \
  nullptr,                   /* reserved */     \
  nullptr,                   /* reserved */     \
  nullptr,                   /* reserved */     \
  nullptr,                   /* reserved */     \
  HWY_CHOOSE_RVV(func_name), /* RVV */          \
  nullptr                    /* reserved */

#elif HWY_ARCH_PPC || HWY_ARCH_S390X
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
// PPC and S390X share this 9-entry table (and HWY_HIGHEST_TARGET_BIT_PPC); the
// Z14/Z15 entries follow the PPC ones.
#define HWY_CHOOSE_TARGET_LIST(func_name)         \
  nullptr,                     /* reserved */     \
  nullptr,                     /* reserved */     \
  nullptr,                     /* reserved */     \
  nullptr,                     /* reserved */     \
  HWY_CHOOSE_PPC10(func_name), /* PPC10 */        \
  HWY_CHOOSE_PPC9(func_name),  /* PPC9 */         \
  HWY_CHOOSE_PPC8(func_name),  /* PPC8 */         \
  HWY_CHOOSE_Z15(func_name),   /* Z15 */          \
  HWY_CHOOSE_Z14(func_name)    /* Z14 */

#elif HWY_ARCH_WASM
// See HWY_ARCH_X86 above for details.
308 #define HWY_MAX_DYNAMIC_TARGETS 9 309 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM 310 #define HWY_CHOOSE_TARGET_LIST(func_name) \ 311 nullptr, /* reserved */ \ 312 nullptr, /* reserved */ \ 313 nullptr, /* reserved */ \ 314 nullptr, /* reserved */ \ 315 nullptr, /* reserved */ \ 316 nullptr, /* reserved */ \ 317 HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \ 318 HWY_CHOOSE_WASM(func_name), /* WASM */ \ 319 nullptr /* reserved */ 320 321 #elif HWY_ARCH_LOONGARCH 322 #define HWY_MAX_DYNAMIC_TARGETS 3 323 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_LOONGARCH 324 #define HWY_CHOOSE_TARGET_LIST(func_name) \ 325 nullptr, /* reserved */ \ 326 HWY_CHOOSE_LASX(func_name), /* LASX */ \ 327 HWY_CHOOSE_LSX(func_name) /* LSX */ 328 329 #else 330 // Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though 331 // still creating single-entry tables in HWY_EXPORT to ensure portability. 332 #define HWY_MAX_DYNAMIC_TARGETS 1 333 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR 334 #endif 335 336 // Bitfield of supported and enabled targets. The format differs from that of 337 // HWY_TARGETS; the lowest bit governs the first function pointer (which is 338 // special in that it calls FunctionCache, then Update, then dispatches to the 339 // actual implementation) in the tables created by HWY_EXPORT. Monostate (see 340 // GetChosenTarget), thread-safe except on RVV. 341 struct ChosenTarget { 342 public: 343 // Reset bits according to `targets` (typically the return value of 344 // SupportedTargets()). Postcondition: IsInitialized() == true. 345 void Update(int64_t targets) { 346 // These are `targets` shifted downwards, see above. Also include SCALAR 347 // (corresponds to the last entry in the function table) as fallback. 
348 StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR); 349 } 350 351 // Reset to the uninitialized state, so that FunctionCache will call Update 352 // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false. 353 void DeInit() { StoreMask(1); } 354 355 // Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH 356 // function was called, which we check in tests. 357 bool IsInitialized() const { return LoadMask() != 1; } 358 359 // Return the index in the dynamic dispatch table to be used by the current 360 // CPU. Note that this method must be in the header file so it uses the value 361 // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that 362 // calls it, which may be different from others. This means we only enable 363 // those targets that were actually compiled in this module. 364 size_t HWY_INLINE GetIndex() const { 365 return hwy::Num0BitsBelowLS1Bit_Nonzero64( 366 static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS)); 367 } 368 369 private: 370 #if defined(HWY_NO_LIBCXX) 371 int64_t LoadMask() const { return mask_; } 372 void StoreMask(int64_t mask) { mask_ = mask; } 373 374 int64_t mask_{1}; // Initialized to 1 so GetIndex() returns 0. 375 #else 376 int64_t LoadMask() const { return mask_.load(); } 377 void StoreMask(int64_t mask) { mask_.store(mask); } 378 379 std::atomic<int64_t> mask_{1}; // Initialized to 1 so GetIndex() returns 0. 380 #endif // HWY_ARCH_RISCV 381 }; 382 383 // For internal use (e.g. by FunctionCache and DisableTargets). 384 HWY_DLLEXPORT ChosenTarget& GetChosenTarget(); 385 386 } // namespace hwy 387 388 #endif // HIGHWAY_HWY_TARGETS_H_