highway.h (28605B)
1 // Copyright 2020 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // Main header required before using vector types. 17 18 // IWYU pragma: begin_exports 19 #include "hwy/base.h" 20 #include "hwy/detect_compiler_arch.h" 21 #include "hwy/detect_targets.h" 22 #include "hwy/highway_export.h" 23 #include "hwy/targets.h" 24 // IWYU pragma: end_exports 25 26 #if HWY_CXX_LANG < 201703L 27 #define HWY_DISPATCH_MAP 1 28 #else 29 #define HWY_DISPATCH_MAP 0 30 #endif 31 32 // This include guard is checked by foreach_target, so avoid the usual _H_ 33 // suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included 34 // after/outside this include guard. 35 #ifndef HWY_HIGHWAY_INCLUDED 36 #define HWY_HIGHWAY_INCLUDED 37 38 namespace hwy { 39 40 //------------------------------------------------------------------------------ 41 // Shorthand for tags (defined in shared-inl.h) used to select overloads. 42 // Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over 43 // HWY_CAPPED(T, N). 44 45 // HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of 46 // registers in the group, and is ignored on targets that do not support groups. 47 #define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T> 48 #define HWY_FULL2(T, LMUL) \ 49 hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))> 50 #define HWY_3TH_ARG(arg1, arg2, arg3, ...) 
arg3 51 // Workaround for MSVC grouping __VA_ARGS__ into a single argument 52 #define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren 53 // Trailing comma avoids -pedantic false alarm 54 #define HWY_CHOOSE_FULL(...) \ 55 HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, )) 56 #define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__) 57 58 // Vector of up to MAX_N lanes. It's better to use full vectors where possible. 59 #define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N> 60 61 //------------------------------------------------------------------------------ 62 // Export user functions for static/dynamic dispatch 63 64 // The static target is the best baseline. When using foreach_target.h, this is 65 // the last target compiled. Otherwise, it is the only target. 66 67 // Evaluates to 0 inside a translation unit if it is generating anything but the 68 // static target. Used to prevent redefinitions of HWY_EXPORT. Unless 69 // foreach_target.h is included, we only compile once anyway, so this is 1 70 // unless it is or has been included. 71 #ifndef HWY_ONCE 72 #define HWY_ONCE 1 73 #endif 74 75 // `HWY_STATIC_NAMESPACE` expands to its namespace name, e.g. `N_AVX2`. 
// Maps HWY_STATIC_TARGET to the namespace holding that target's code. There is
// deliberately no #else: if HWY_STATIC_TARGET matches none of the targets
// below (it is expected to be 0 when HWY_ENABLED_BASELINE == 0, see the end of
// this file), HWY_STATIC_NAMESPACE remains undefined.
#if HWY_STATIC_TARGET == HWY_SCALAR
#define HWY_STATIC_NAMESPACE N_SCALAR
#elif HWY_STATIC_TARGET == HWY_EMU128
#define HWY_STATIC_NAMESPACE N_EMU128
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_NAMESPACE N_WASM
#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
#define HWY_STATIC_NAMESPACE N_WASM_EMU256
#elif HWY_STATIC_TARGET == HWY_Z14
#define HWY_STATIC_NAMESPACE N_Z14
#elif HWY_STATIC_TARGET == HWY_Z15
#define HWY_STATIC_NAMESPACE N_Z15
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_NAMESPACE N_PPC8
#elif HWY_STATIC_TARGET == HWY_PPC9
#define HWY_STATIC_NAMESPACE N_PPC9
#elif HWY_STATIC_TARGET == HWY_PPC10
#define HWY_STATIC_NAMESPACE N_PPC10
#elif HWY_STATIC_TARGET == HWY_LSX
#define HWY_STATIC_NAMESPACE N_LSX
#elif HWY_STATIC_TARGET == HWY_LASX
#define HWY_STATIC_NAMESPACE N_LASX
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_NAMESPACE N_RVV
#elif HWY_STATIC_TARGET == HWY_NEON_WITHOUT_AES
#define HWY_STATIC_NAMESPACE N_NEON_WITHOUT_AES
#elif HWY_STATIC_TARGET == HWY_NEON
#define HWY_STATIC_NAMESPACE N_NEON
#elif HWY_STATIC_TARGET == HWY_NEON_BF16
#define HWY_STATIC_NAMESPACE N_NEON_BF16
#elif HWY_STATIC_TARGET == HWY_SVE
#define HWY_STATIC_NAMESPACE N_SVE
#elif HWY_STATIC_TARGET == HWY_SVE2
#define HWY_STATIC_NAMESPACE N_SVE2
#elif HWY_STATIC_TARGET == HWY_SVE_256
#define HWY_STATIC_NAMESPACE N_SVE_256
#elif HWY_STATIC_TARGET == HWY_SVE2_128
#define HWY_STATIC_NAMESPACE N_SVE2_128
#elif HWY_STATIC_TARGET == HWY_SSE2
#define HWY_STATIC_NAMESPACE N_SSE2
#elif HWY_STATIC_TARGET == HWY_SSSE3
#define HWY_STATIC_NAMESPACE N_SSSE3
#elif HWY_STATIC_TARGET == HWY_SSE4
#define HWY_STATIC_NAMESPACE N_SSE4
#elif HWY_STATIC_TARGET == HWY_AVX2
#define HWY_STATIC_NAMESPACE N_AVX2
#elif HWY_STATIC_TARGET == HWY_AVX3
#define HWY_STATIC_NAMESPACE N_AVX3
#elif HWY_STATIC_TARGET == HWY_AVX3_DL
#define HWY_STATIC_NAMESPACE N_AVX3_DL
#elif HWY_STATIC_TARGET == HWY_AVX3_ZEN4
#define HWY_STATIC_NAMESPACE N_AVX3_ZEN4
#elif HWY_STATIC_TARGET == HWY_AVX3_SPR
#define HWY_STATIC_NAMESPACE N_AVX3_SPR
#elif HWY_STATIC_TARGET == HWY_AVX10_2
#define HWY_STATIC_NAMESPACE N_AVX10_2
#endif

// `HWY_STATIC_DISPATCH(FUNC_NAME)` is the namespace-qualified FUNC_NAME for
// `HWY_STATIC_TARGET`, and can be used to deduce the return type of Choose*.
#define HWY_STATIC_DISPATCH(FUNC_NAME) HWY_STATIC_NAMESPACE::FUNC_NAME

// `HWY_CHOOSE_*(FUNC_NAME)` expands to the function pointer for that target or
// nullptr if that target was not compiled.
// `HWY_VISIT_*(VISITOR)` expands to `VISITOR(TARGET, NAMESPACE)` or nothing if
// that target was not compiled.
// Each target below is handled by its own independent #if block, keyed on
// whether the target's bit is set in HWY_TARGETS.

// ---- Fallback (always yields a valid function pointer)
#if HWY_TARGETS & HWY_EMU128
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
#define HWY_VISIT_FALLBACK(VISITOR) VISITOR(HWY_EMU128, N_EMU128)
#elif HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
#define HWY_VISIT_FALLBACK(VISITOR) VISITOR(HWY_SCALAR, N_SCALAR)
#else
// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#define HWY_VISIT_FALLBACK(VISITOR) \
  VISITOR(HWY_STATIC_TARGET, HWY_STATIC_NAMESPACE)
#endif

// ---- WebAssembly
#if HWY_TARGETS & HWY_WASM
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
#define HWY_VISIT_WASM(VISITOR) VISITOR(HWY_WASM, N_WASM)
#else
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
#define HWY_VISIT_WASM(VISITOR)
#endif

#if HWY_TARGETS & HWY_WASM_EMU256
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
#define HWY_VISIT_WASM_EMU256(VISITOR) VISITOR(HWY_WASM_EMU256, N_WASM_EMU256)
#else
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
#define HWY_VISIT_WASM_EMU256(VISITOR)
#endif

// ---- Z14/Z15
#if HWY_TARGETS & HWY_Z14
#define HWY_CHOOSE_Z14(FUNC_NAME) &N_Z14::FUNC_NAME
#define HWY_VISIT_Z14(VISITOR) VISITOR(HWY_Z14, N_Z14)
#else
#define HWY_CHOOSE_Z14(FUNC_NAME) nullptr
#define HWY_VISIT_Z14(VISITOR)
#endif

#if HWY_TARGETS & HWY_Z15
#define HWY_CHOOSE_Z15(FUNC_NAME) &N_Z15::FUNC_NAME
#define HWY_VISIT_Z15(VISITOR) VISITOR(HWY_Z15, N_Z15)
#else
#define HWY_CHOOSE_Z15(FUNC_NAME) nullptr
#define HWY_VISIT_Z15(VISITOR)
#endif

// ---- PPC
#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#define HWY_VISIT_PPC8(VISITOR) VISITOR(HWY_PPC8, N_PPC8)
#else
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#define HWY_VISIT_PPC8(VISITOR)
#endif

#if HWY_TARGETS & HWY_PPC9
#define HWY_CHOOSE_PPC9(FUNC_NAME) &N_PPC9::FUNC_NAME
#define HWY_VISIT_PPC9(VISITOR) VISITOR(HWY_PPC9, N_PPC9)
#else
#define HWY_CHOOSE_PPC9(FUNC_NAME) nullptr
#define HWY_VISIT_PPC9(VISITOR)
#endif

#if HWY_TARGETS & HWY_PPC10
#define HWY_CHOOSE_PPC10(FUNC_NAME) &N_PPC10::FUNC_NAME
#define HWY_VISIT_PPC10(VISITOR) VISITOR(HWY_PPC10, N_PPC10)
#else
#define HWY_CHOOSE_PPC10(FUNC_NAME) nullptr
#define HWY_VISIT_PPC10(VISITOR)
#endif

// ---- LSX/LASX
#if HWY_TARGETS & HWY_LSX
#define HWY_CHOOSE_LSX(FUNC_NAME) &N_LSX::FUNC_NAME
#define HWY_VISIT_LSX(VISITOR) VISITOR(HWY_LSX, N_LSX)
#else
#define HWY_CHOOSE_LSX(FUNC_NAME) nullptr
#define HWY_VISIT_LSX(VISITOR)
#endif

#if HWY_TARGETS & HWY_LASX
#define HWY_CHOOSE_LASX(FUNC_NAME) &N_LASX::FUNC_NAME
#define HWY_VISIT_LASX(VISITOR) VISITOR(HWY_LASX, N_LASX)
#else
#define HWY_CHOOSE_LASX(FUNC_NAME) nullptr
#define HWY_VISIT_LASX(VISITOR)
#endif

// ---- RVV
#if HWY_TARGETS & HWY_RVV
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
#define HWY_VISIT_RVV(VISITOR) VISITOR(HWY_RVV, N_RVV)
#else
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
#define HWY_VISIT_RVV(VISITOR)
#endif

// ---- NEON
#if HWY_TARGETS & HWY_NEON_WITHOUT_AES
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) &N_NEON_WITHOUT_AES::FUNC_NAME
#define HWY_VISIT_NEON_WITHOUT_AES(VISITOR) \
  VISITOR(HWY_NEON_WITHOUT_AES, N_NEON_WITHOUT_AES)
#else
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) nullptr
#define HWY_VISIT_NEON_WITHOUT_AES(VISITOR)
#endif

#if HWY_TARGETS & HWY_NEON
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
#define HWY_VISIT_NEON(VISITOR) VISITOR(HWY_NEON, N_NEON)
#else
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
#define HWY_VISIT_NEON(VISITOR)
#endif

#if HWY_TARGETS & HWY_NEON_BF16
#define HWY_CHOOSE_NEON_BF16(FUNC_NAME) &N_NEON_BF16::FUNC_NAME
#define HWY_VISIT_NEON_BF16(VISITOR) VISITOR(HWY_NEON_BF16, N_NEON_BF16)
#else
#define HWY_CHOOSE_NEON_BF16(FUNC_NAME) nullptr
#define HWY_VISIT_NEON_BF16(VISITOR)
#endif

// ---- SVE
#if HWY_TARGETS & HWY_SVE
#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
#define HWY_VISIT_SVE(VISITOR) VISITOR(HWY_SVE, N_SVE)
#else
#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
#define HWY_VISIT_SVE(VISITOR)
#endif

#if HWY_TARGETS & HWY_SVE2
#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
#define HWY_VISIT_SVE2(VISITOR) VISITOR(HWY_SVE2, N_SVE2)
#else
#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
#define HWY_VISIT_SVE2(VISITOR)
#endif

#if HWY_TARGETS & HWY_SVE_256
#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
#define HWY_VISIT_SVE_256(VISITOR) VISITOR(HWY_SVE_256, N_SVE_256)
#else
#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
#define HWY_VISIT_SVE_256(VISITOR)
#endif

#if HWY_TARGETS & HWY_SVE2_128
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
#define HWY_VISIT_SVE2_128(VISITOR) VISITOR(HWY_SVE2_128, N_SVE2_128)
#else
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
#define HWY_VISIT_SVE2_128(VISITOR)
#endif

// ---- SSE/AVX
#if HWY_TARGETS & HWY_SSE2
#define HWY_CHOOSE_SSE2(FUNC_NAME) &N_SSE2::FUNC_NAME
#define HWY_VISIT_SSE2(VISITOR) VISITOR(HWY_SSE2, N_SSE2)
#else
#define HWY_CHOOSE_SSE2(FUNC_NAME) nullptr
#define HWY_VISIT_SSE2(VISITOR)
#endif

#if HWY_TARGETS & HWY_SSSE3
#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
#define HWY_VISIT_SSSE3(VISITOR) VISITOR(HWY_SSSE3, N_SSSE3)
#else
#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
#define HWY_VISIT_SSSE3(VISITOR)
#endif

#if HWY_TARGETS & HWY_SSE4
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
#define HWY_VISIT_SSE4(VISITOR) VISITOR(HWY_SSE4, N_SSE4)
#else
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
#define HWY_VISIT_SSE4(VISITOR)
#endif

#if HWY_TARGETS & HWY_AVX2
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
#define HWY_VISIT_AVX2(VISITOR) VISITOR(HWY_AVX2, N_AVX2)
#else
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
#define HWY_VISIT_AVX2(VISITOR)
#endif

#if HWY_TARGETS & HWY_AVX3
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
#define HWY_VISIT_AVX3(VISITOR) VISITOR(HWY_AVX3, N_AVX3)
#else
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
#define HWY_VISIT_AVX3(VISITOR)
#endif

#if HWY_TARGETS & HWY_AVX3_DL
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
#define HWY_VISIT_AVX3_DL(VISITOR) VISITOR(HWY_AVX3_DL, N_AVX3_DL)
#else
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
#define HWY_VISIT_AVX3_DL(VISITOR)
#endif

#if HWY_TARGETS & HWY_AVX3_ZEN4
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) &N_AVX3_ZEN4::FUNC_NAME
#define HWY_VISIT_AVX3_ZEN4(VISITOR) VISITOR(HWY_AVX3_ZEN4, N_AVX3_ZEN4)
#else
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) nullptr
#define HWY_VISIT_AVX3_ZEN4(VISITOR)
#endif

#if HWY_TARGETS & HWY_AVX3_SPR
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) &N_AVX3_SPR::FUNC_NAME
#define HWY_VISIT_AVX3_SPR(VISITOR) VISITOR(HWY_AVX3_SPR, N_AVX3_SPR)
#else
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) nullptr
#define HWY_VISIT_AVX3_SPR(VISITOR)
#endif

#if HWY_TARGETS & HWY_AVX10_2
#define HWY_CHOOSE_AVX10_2(FUNC_NAME) &N_AVX10_2::FUNC_NAME
#define HWY_VISIT_AVX10_2(VISITOR) VISITOR(HWY_AVX10_2, N_AVX10_2)
#else
#define HWY_CHOOSE_AVX10_2(FUNC_NAME) nullptr
#define HWY_VISIT_AVX10_2(VISITOR)
#endif

// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
// apparently cannot be an array. Use a function pointer instead, which has the
// disadvantage that we call the static (not best) target on the first call to
// any HWY_DYNAMIC_DISPATCH.
369 #if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915) || \ 370 (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) 371 #define HWY_DISPATCH_WORKAROUND 1 372 #else 373 #define HWY_DISPATCH_WORKAROUND 0 374 #endif 375 376 #if HWY_DISPATCH_MAP 377 struct AllExports { 378 template <class FuncPtr, class ExportsKey, uint64_t kHash> 379 static const FuncPtr*& GetRefToExportsPtr() { 380 static const FuncPtr* s_exports = nullptr; 381 return s_exports; 382 } 383 }; 384 #endif 385 386 // Provides a static member function which is what is called during the first 387 // HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of 388 // this function are the first entry in the tables created by HWY_EXPORT[_T]. 389 template <typename RetType, typename... Args> 390 struct FunctionCache { 391 public: 392 typedef RetType(FuncType)(Args...); 393 using FuncPtr = FuncType*; 394 395 // A template function that when instantiated has the same signature as the 396 // function being called. This function initializes the bit array of targets 397 // supported by the current CPU and then calls the appropriate entry within 398 // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any 399 // exported functions, even those defined by different translation units, 400 // will dispatch directly to the best available target. 401 #if HWY_DISPATCH_MAP 402 template <class ExportsKey, uint64_t kHash> 403 static RetType ChooseAndCall(Args... args) { 404 ChosenTarget& chosen_target = GetChosenTarget(); 405 chosen_target.Update(SupportedTargets()); 406 407 const FuncPtr* table = AllExports::template GetRefToExportsPtr< 408 FuncPtr, RemoveCvRef<ExportsKey>, kHash>(); 409 HWY_ASSERT(table); 410 411 return (table[chosen_target.GetIndex()])(args...); 412 } 413 414 #if !HWY_DISPATCH_WORKAROUND 415 template <const FuncPtr* table> 416 static RetType TableChooseAndCall(Args... 
args) { 417 ChosenTarget& chosen_target = GetChosenTarget(); 418 chosen_target.Update(SupportedTargets()); 419 return (table[chosen_target.GetIndex()])(args...); 420 } 421 #endif // !HWY_DISPATCH_WORKAROUND 422 423 #else // !HWY_DISPATCH_MAP: zero-overhead, but requires C++17 424 template <const FuncPtr* table> 425 static RetType ChooseAndCall(Args... args) { 426 ChosenTarget& chosen_target = GetChosenTarget(); 427 chosen_target.Update(SupportedTargets()); 428 return (table[chosen_target.GetIndex()])(args...); 429 } 430 #endif // HWY_DISPATCH_MAP 431 }; 432 433 // Used to deduce the template parameters RetType and Args from a function. 434 template <typename RetType, typename... Args> 435 FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) { 436 return FunctionCache<RetType, Args...>(); 437 } 438 439 #define HWY_DISPATCH_TABLE(FUNC_NAME) \ 440 HWY_CONCAT(FUNC_NAME, HighwayDispatchTable) 441 442 // HWY_EXPORT(FUNC_NAME); expands to a static array that is used by 443 // HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. 444 // After being exported, it can be called from other parts of the same source 445 // file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper 446 // like in the following example: 447 // 448 // #include "hwy/highway.h" 449 // HWY_BEFORE_NAMESPACE(); 450 // namespace skeleton { 451 // namespace HWY_NAMESPACE { 452 // 453 // void MyFunction(int a, char b, const char* c) { ... } 454 // 455 // // NOLINTNEXTLINE(google-readability-namespace-comments) 456 // } // namespace HWY_NAMESPACE 457 // } // namespace skeleton 458 // HWY_AFTER_NAMESPACE(); 459 // 460 // namespace skeleton { 461 // HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope. 
//
//   void MyFunction(int a, char b, const char* c) {
//     return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
//   }
//   }  // namespace skeleton
//
// For templated code with a single type parameter, instead use HWY_EXPORT_T and
// its HWY_DYNAMIC_DISPATCH_T counterpart:
//
//   template <typename T>
//   void MyFunctionCaller(T ...) {
//     // First argument to both HWY_EXPORT_T and HWY_DYNAMIC_DISPATCH_T is an
//     // arbitrary table name; you must provide the same name for each call.
//     // It is fine to have multiple HWY_EXPORT_T in a function, but a 64-bit
//     // FNV hash collision among *any* table names will trigger HWY_ABORT.
//     HWY_EXPORT_T(Table1, MyFunction<T>);
//     HWY_DYNAMIC_DISPATCH_T(Table1)(a, b, c);
//   }
//
// Note that HWY_EXPORT_T must be invoked inside a template (in the above
// example: `MyFunctionCaller`), so that a separate table will be created for
// each template instantiation. For convenience, we also provide a macro that
// combines both steps and avoids the need to pick a table name:
//
//   template <typename T>
//   void MyFunctionCaller(T ...) {
//     // Table name is automatically chosen. Note that this variant must be
//     // called in statement context; it is not a valid expression.
//     HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(MyFunction<T>)(a, b, c);
//   }

// Simplified version for IDE or the dynamic dispatch case with only one target.
// (HWY_TARGETS & (HWY_TARGETS - 1)) == 0 holds when at most one bit is set.
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)

// We use a table to provide the same compile error conditions as with the
// non-simplified case, but the table only has a single entry.
#define HWY_EXPORT_T(TABLE_NAME, FUNC_NAME) \
  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
      HWY_DISPATCH_TABLE(TABLE_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}

// Use the table, not just STATIC_DISPATCH as in DYNAMIC_DISPATCH, because
// TABLE_NAME might not match the function name.
#define HWY_DYNAMIC_POINTER_T(TABLE_NAME) (HWY_DISPATCH_TABLE(TABLE_NAME)[0])
#define HWY_DYNAMIC_DISPATCH_T(TABLE_NAME) \
  (*(HWY_DYNAMIC_POINTER_T(TABLE_NAME)))

#define HWY_EXPORT(FUNC_NAME) HWY_EXPORT_T(FUNC_NAME, FUNC_NAME)
// Parenthesized like the full-table variant below. Without the parentheses,
// `HWY_DYNAMIC_POINTER(F)(args)` would parse as `&(NS::F(args))`, i.e. take the
// address of the call result instead of calling through the pointer.
#define HWY_DYNAMIC_POINTER(FUNC_NAME) (&HWY_STATIC_DISPATCH(FUNC_NAME))
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)

#else  // not simplified: full table

// Pre-C++17 workaround: non-type template arguments must have linkage, which
// means we cannot pass &table as a template argument to ChooseAndCall.
// ChooseAndCall must find a way to access the table in order to dispatch to the
// chosen target:
// 0) Skipping this by dispatching to the static target would be surprising to
//    users and may have serious performance implications.
// 1) An extra function parameter would be unacceptable because it changes the
//    user-visible function signature.
// 2) Declaring a table, then defining a pointer to it would work, but requires
//    an additional DECLARE step outside the function so that the pointer has
//    linkage, which breaks existing code.
// 3) We instead associate the function with the table using an instance of an
//    unnamed struct and the hash of the table name as the key. Because
//    ChooseAndCall has the type information, it can then cast to the function
//    pointer type. However, we cannot simply pass the name as a template
//    argument to ChooseAndCall because this requires char*, which hits the same
//    linkage problem. We instead hash the table name, which assumes the
//    function names do not have collisions.
#if HWY_DISPATCH_MAP

// Compile-time 64-bit hash of the NUL-terminated string `name`, built from the
// FNV offset basis and prime. Written as a single recursive expression; the
// recursion first reaches the terminator (yielding the offset basis) and then
// folds in the characters from last to first.
static constexpr uint64_t FNV(const char* name) {
  return *name ? static_cast<uint64_t>(static_cast<uint8_t>(*name)) ^
                     (0x100000001b3ULL * FNV(name + 1))
               : 0xcbf29ce484222325ULL;
}

// Constructing an AddExport<kHash> registers `table` in AllExports under the
// key <FuncPtr, ExportsKey, kHash>, so that FunctionCache::ChooseAndCall can
// later retrieve it.
template <uint64_t kHash>
struct AddExport {
  // exports_key: only its type is used, as part of the key.
  // table_name: only used for the error message.
  // table: the dispatch table to register.
  template <class ExportsKey, class FuncPtr>
  AddExport(ExportsKey /*exports_key*/, const char* table_name,
            const FuncPtr* table) {
    using FuncCache = decltype(DeduceFunctionCache(hwy::DeclVal<FuncPtr>()));
    static_assert(
        hwy::IsSame<RemoveCvRef<FuncPtr>, typename FuncCache::FuncPtr>(),
        "FuncPtr should be same type as FuncCache::FuncPtr");

    const FuncPtr*& exports_ptr = AllExports::template GetRefToExportsPtr<
        RemoveCvRef<FuncPtr>, RemoveCvRef<ExportsKey>, kHash>();
    // A different table already occupying this slot means two tables map to
    // the same key; registering the same table again is harmless.
    if (exports_ptr && exports_ptr != table) {
      HWY_ABORT("Hash collision for %s, rename the function\n", table_name);
    } else {
      exports_ptr = table;
    }
  }
};

// Dynamic dispatch: defines table of function pointers. This must be invoked
// from inside the function template that calls the template we are exporting.
// TABLE_NAME must match the one passed to HWY_DYNAMIC_DISPATCH_T. This
// argument allows multiple exports within one function.
#define HWY_EXPORT_T(TABLE_NAME, FUNC_NAME) \
  static const struct { \
  } HWY_CONCAT(TABLE_NAME, HighwayDispatchExportsKey) = {}; \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
      TABLE_NAME)[static_cast<size_t>(HWY_MAX_DYNAMIC_TARGETS + 2)] = { \
      /* The first entry in the table initializes the global cache and \
       * calls the appropriate function. */ \
      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(FUNC_NAME))):: \
          template ChooseAndCall<decltype(HWY_CONCAT( \
                                     TABLE_NAME, HighwayDispatchExportsKey)), \
                                 hwy::FNV(#TABLE_NAME)>, \
      HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
      HWY_CHOOSE_FALLBACK(FUNC_NAME), \
  }; \
  HWY_MAYBE_UNUSED static hwy::AddExport<hwy::FNV(#TABLE_NAME)> HWY_CONCAT( \
      HighwayAddTable, __LINE__)( \
      HWY_CONCAT(TABLE_NAME, HighwayDispatchExportsKey), #TABLE_NAME, \
      HWY_DISPATCH_TABLE(TABLE_NAME))

// For non-template functions. Not necessarily invoked within a function, hence
// we derive the string and variable names from FUNC_NAME, not HWY_FUNCTION.
#if HWY_DISPATCH_WORKAROUND
#define HWY_EXPORT(FUNC_NAME) HWY_EXPORT_T(FUNC_NAME, FUNC_NAME)
#else
#define HWY_EXPORT(FUNC_NAME) \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
      FUNC_NAME)[static_cast<size_t>(HWY_MAX_DYNAMIC_TARGETS + 2)] = { \
      /* The first entry in the table initializes the global cache and \
       * calls the appropriate function. */ \
      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(FUNC_NAME))):: \
          template TableChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
      HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
      HWY_CHOOSE_FALLBACK(FUNC_NAME), \
  }
#endif  // HWY_DISPATCH_WORKAROUND

#else  // !HWY_DISPATCH_MAP

// Zero-overhead, but requires C++17 for non-type template arguments without
// linkage, because HWY_EXPORT_T tables are local static variables.
#define HWY_EXPORT_T(TABLE_NAME, FUNC_NAME) \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
      TABLE_NAME)[static_cast<size_t>(HWY_MAX_DYNAMIC_TARGETS + 2)] = { \
      /* The first entry in the table initializes the global cache and \
       * calls the appropriate function. */ \
      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(FUNC_NAME))):: \
          template ChooseAndCall<HWY_DISPATCH_TABLE(TABLE_NAME)>, \
      HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
      HWY_CHOOSE_FALLBACK(FUNC_NAME), \
  }

#define HWY_EXPORT(FUNC_NAME) HWY_EXPORT_T(FUNC_NAME, FUNC_NAME)

#endif  // HWY_DISPATCH_MAP

// HWY_DISPATCH_MAP only affects how tables are created, not their usage.

// Evaluates to the function pointer for the chosen target.
#define HWY_DYNAMIC_POINTER(FUNC_NAME) \
  (HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()])

// Calls the function pointer for the chosen target.
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG

// On GCC or Clang, we call hwy::PreventElision(...) to work around a compiler
// crash where the LLVM inliner crashes due to inlining incompatible intrinsics.

#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
  __extension__({ \
    auto HWY_CONCAT(hwy_tmp_, __LINE__) = *(HWY_DYNAMIC_POINTER(FUNC_NAME)); \
    hwy::PreventElision(HWY_CONCAT(hwy_tmp_, __LINE__)); \
    HWY_CONCAT(hwy_tmp_, __LINE__); \
  })

#else  // !(HWY_COMPILER_GCC || HWY_COMPILER_CLANG)

#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) (*(HWY_DYNAMIC_POINTER(FUNC_NAME)))

#endif  // HWY_COMPILER_GCC || HWY_COMPILER_CLANG

// Same as DISPATCH, but provide a different arg name to clarify usage.
#define HWY_DYNAMIC_DISPATCH_T(TABLE_NAME) HWY_DYNAMIC_DISPATCH(TABLE_NAME)
#define HWY_DYNAMIC_POINTER_T(TABLE_NAME) HWY_DYNAMIC_POINTER(TABLE_NAME)

#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)

// Returns the name of an anonymous dispatch table that is only shared with
// macro invocations coming from the same source line.
#define HWY_DISPATCH_TABLE_T() HWY_CONCAT(HighwayDispatchTableT, __LINE__)

// For templated code, combines export and dispatch using an anonymous table.
655 #define HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC_NAME) \ 656 HWY_EXPORT_T(HWY_DISPATCH_TABLE_T(), FUNC_NAME); \ 657 HWY_DYNAMIC_DISPATCH_T(HWY_DISPATCH_TABLE_T()) 658 659 // DEPRECATED names; please use HWY_HAVE_* instead. 660 #define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64 661 #define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16 662 #define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64 663 664 } // namespace hwy 665 666 #endif // HWY_HIGHWAY_INCLUDED 667 668 //------------------------------------------------------------------------------ 669 670 // NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want 671 // to include them once per target, which is ensured by the toggle check. 672 // Because ops/*.h are included under it, they do not need their own guard. 673 #if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE) 674 #ifdef HWY_HIGHWAY_PER_TARGET 675 #undef HWY_HIGHWAY_PER_TARGET 676 #else 677 #define HWY_HIGHWAY_PER_TARGET 678 #endif 679 680 // No SIMD target enabled, skip header inclusion. 681 #if HWY_ENABLED_BASELINE == 0 682 683 // We would expect that HWY_TARGET and HWY_STATIC_TARGET are now both 0. 684 #if HWY_TARGET != 0 685 #error "Why is HWY_TARGET not 0 when HWY_ENABLED_BASELINE == 0?" 686 #endif 687 #if HWY_STATIC_TARGET != 0 688 #error "Why is HWY_STATIC_TARGET not 0 when HWY_ENABLED_BASELINE == 0?" 689 #endif 690 691 #else 692 693 // These define ops inside namespace hwy::HWY_NAMESPACE. 
694 #if HWY_TARGET == HWY_SSE2 || HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 695 #include "hwy/ops/x86_128-inl.h" 696 #elif HWY_TARGET == HWY_AVX2 697 #include "hwy/ops/x86_256-inl.h" 698 #elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \ 699 HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR || \ 700 HWY_TARGET == HWY_AVX10_2 701 #include "hwy/ops/x86_avx3-inl.h" 702 #elif HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15 || \ 703 (HWY_TARGET & HWY_ALL_PPC) 704 #include "hwy/ops/ppc_vsx-inl.h" 705 #elif HWY_TARGET & HWY_ALL_NEON 706 #include "hwy/ops/arm_neon-inl.h" 707 #elif HWY_TARGET & HWY_ALL_SVE 708 #include "hwy/ops/arm_sve-inl.h" 709 #elif HWY_TARGET == HWY_WASM_EMU256 710 #include "hwy/ops/wasm_256-inl.h" 711 #elif HWY_TARGET == HWY_WASM 712 #include "hwy/ops/wasm_128-inl.h" 713 #elif HWY_TARGET == HWY_RVV 714 #include "hwy/ops/rvv-inl.h" 715 #elif HWY_TARGET == HWY_LSX 716 #include "hwy/ops/loongarch_lsx-inl.h" 717 #elif HWY_TARGET == HWY_LASX 718 #include "hwy/ops/loongarch_lasx-inl.h" 719 #elif HWY_TARGET == HWY_EMU128 720 #include "hwy/ops/emu128-inl.h" 721 #elif HWY_TARGET == HWY_SCALAR 722 #include "hwy/ops/scalar-inl.h" 723 #else 724 #pragma message("HWY_TARGET does not match any known target") 725 #endif // HWY_TARGET 726 727 #include "hwy/ops/generic_ops-inl.h" 728 729 #endif // HWY_ENABLED_BASELINE 730 731 #endif // HWY_HIGHWAY_PER_TARGET