shared-inl.h (29433B)
1 // Copyright 2020 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // Per-target definitions shared by ops/*.h and user code. 17 18 // IWYU pragma: begin_exports 19 // Export does not seem to be recursive, so re-export these (also in base.h) 20 #include <stddef.h> 21 22 #include "hwy/base.h" 23 // "IWYU pragma: keep" does not work for this include, so hide it from the IDE. 24 #if !HWY_IDE 25 #include <stdint.h> 26 #endif 27 28 #include "hwy/detect_compiler_arch.h" 29 #include "hwy/detect_targets.h" 30 31 // Separate header because foreach_target.h re-enables its include guard. 32 #include "hwy/ops/set_macros-inl.h" 33 34 // IWYU pragma: end_exports 35 36 #if HWY_IS_MSAN 37 #include <sanitizer/msan_interface.h> 38 #endif 39 40 // We are covered by the highway.h include guard, but generic_ops-inl.h 41 // includes this again #if HWY_IDE. 
// Per-target include guard. The body below is compiled only when this header's
// toggle macro is in the same defined/undefined state as HWY_TARGET_TOGGLE.
// The toggle is then flipped, so including this header again without a change
// to HWY_TARGET_TOGGLE skips the body.
// clang-format off
#if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == defined(HWY_TARGET_TOGGLE)  // NOLINT
// clang-format on
#ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#undef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#else
#define HIGHWAY_HWY_OPS_SHARED_TOGGLE
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on aarch64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
//   all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or aarch64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
//
// Even better is to avoid passing vector arguments to non-inlined functions,
// because the SVE and RISC-V ABIs are still works in progress and may lead to
// incorrect codegen.
#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
template <class V>
using VecArg = const V&;
#else
template <class V>
using VecArg = V;
#endif

namespace detail {

// Maps a lane type T to the member typedef `type`. The primary template maps T
// to itself.
template <typename T>
struct NativeLaneTypeT {
  using type = T;
};
// float16_t maps to its `Native` member type if HWY_HAVE_SCALAR_F16_TYPE,
// otherwise to uint16_t.
template <>
struct NativeLaneTypeT<hwy::float16_t> {
#if HWY_HAVE_SCALAR_F16_TYPE
  using type = hwy::float16_t::Native;
#else
  using type = uint16_t;
#endif
};
// bfloat16_t maps to its `Native` member type if HWY_HAVE_SCALAR_BF16_TYPE,
// otherwise to uint16_t.
template <>
struct NativeLaneTypeT<hwy::bfloat16_t> {
#if HWY_HAVE_SCALAR_BF16_TYPE
  using type = hwy::bfloat16_t::Native;
#else
  using type = uint16_t;
#endif
};

// The type expected by intrinsics for the given Highway lane type T. This
// usually matches T, but differs for our wrapper types [b]float16_t.
Use this 102 // only when defining intrinsic wrappers, and NOT for casting, which is UB. 103 template <typename T> 104 using NativeLaneType = typename NativeLaneTypeT<T>::type; 105 106 // Returns the same pointer after changing type to NativeLaneType. Use this only 107 // for wrapper functions that call intrinsics (e.g. load/store) where some of 108 // the overloads expect _Float16* or __bf16* arguments. For non-special floats, 109 // this returns the same pointer and type. 110 // 111 // This makes use of the fact that a wrapper struct is pointer-interconvertible 112 // with its first member (a union), thus also with the union members. Do NOT 113 // call both this and U16LanePointer on the same object - they access different 114 // union members, and this is not guaranteed to be safe. 115 template <typename T, HWY_IF_NOT_SPECIAL_FLOAT(T)> 116 HWY_INLINE T* NativeLanePointer(T* p) { 117 return p; 118 } 119 template <typename T, typename NT = NativeLaneType<RemoveConst<T>>, 120 HWY_IF_F16(T)> 121 HWY_INLINE constexpr If<IsConst<T>(), const NT*, NT*> NativeLanePointer(T* p) { 122 #if HWY_HAVE_SCALAR_F16_TYPE 123 return &p->native; 124 #else 125 return &p->bits; 126 #endif 127 } 128 template <typename T, typename NT = NativeLaneType<RemoveConst<T>>, 129 HWY_IF_BF16(T)> 130 HWY_INLINE constexpr If<IsConst<T>(), const NT*, NT*> NativeLanePointer(T* p) { 131 #if HWY_HAVE_SCALAR_BF16_TYPE 132 return &p->native; 133 #else 134 return &p->bits; 135 #endif 136 } 137 138 // Returns a pointer to the u16 member of our [b]float16_t wrapper structs. 139 // Use this in Highway targets that lack __bf16 intrinsics; for storing to 140 // memory, we BitCast vectors to u16 and write to the pointer returned here. 141 // Do NOT call both this and U16LanePointer on the same object - they access 142 // different union members, and this is not guaranteed to be safe. 
143 template <typename T, HWY_IF_SPECIAL_FLOAT(T)> 144 HWY_INLINE If<IsConst<T>(), const uint16_t*, uint16_t*> U16LanePointer(T* p) { 145 return &p->bits; 146 } 147 148 // Returns N * 2^pow2. N is the number of lanes in a full vector and pow2 the 149 // desired fraction or multiple of it, see Simd<>. `pow2` is most often in 150 // [-3, 3] but can also be lower for user-specified fractions. 151 constexpr size_t ScaleByPower(size_t N, int pow2) { 152 return pow2 >= 0 ? (N << pow2) : (N >> (-pow2)); 153 } 154 155 template <typename T> 156 HWY_INLINE void MaybePoison(T* HWY_RESTRICT unaligned, size_t count) { 157 #if HWY_IS_MSAN 158 __msan_poison(unaligned, count * sizeof(T)); 159 #else 160 (void)unaligned; 161 (void)count; 162 #endif 163 } 164 165 // This can be useful for working around MSAN limitations. For example, prior 166 // to Clang 16, it did not understand AVX-512 CompressStore. 167 template <typename T> 168 HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) { 169 #if HWY_IS_MSAN 170 __msan_unpoison(unaligned, count * sizeof(T)); 171 #else 172 (void)unaligned; 173 (void)count; 174 #endif 175 } 176 177 } // namespace detail 178 179 // Highway operations are implemented as overloaded functions selected using a 180 // zero-sized tag type D := Simd<T, N, kPow2>. T denotes the lane type. 181 // 182 // N defines how many lanes are in a 'full' vector, typically equal to 183 // HWY_LANES(T) (which is the actual count on targets with vectors of known 184 // size, and an upper bound in case of scalable vectors), otherwise a 185 // user-specified limit at most that large. 186 // 187 // 2^kPow2 is a _subsequently_ applied scaling factor that indicates the 188 // desired fraction of a 'full' vector: 0 means full, -1 means half; 1,2,3 189 // means two/four/eight full vectors ganged together. The largest supported 190 // kPow2 is `HWY_MAX_POW2` and the aliases below take care of clamping 191 // user-specified values to that. 
// Note that `Simd<T, 1, 0>` and `Simd<T, 2, -1>` have the same `MaxLanes` and
// `Lanes`.
//
// We can theoretically keep halving Lanes(), but recursive instantiations of
// kPow2 - 1 will eventually fail e.g. because -64 is not a valid shift count.
// Users must terminate such compile-time recursions at or above HWY_MIN_POW2.
//
// WARNING: do not use N directly because it may be a special representation of
// a fractional MaxLanes. This arises when we Rebind Simd<uint8_t, 1, 0> to
// Simd<uint32_t, ??, 2>. RVV requires that the last argument (kPow2) be two,
// but we want MaxLanes to be the same in both cases. Hence ?? is a
// fixed-point encoding of 1/4.
//
// Instead of referring to Simd<> directly, users create D via aliases:
// - ScalableTag<T> for a full vector;
// - ScalableTag<T, kPow2> for a fraction/group, where `kPow2` is
//   interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`;
// - CappedTag<T, kLimit> for a vector with up to kLimit lanes; or
// - FixedTag<T, kNumLanes> for a vector with exactly kNumLanes lanes.
//
// Instead of N, use Lanes(D()) for the actual number of lanes at runtime and
// D().MaxLanes() for a constexpr upper bound. Both are powers of two.
template <typename Lane, size_t N, int kPow2>
struct Simd {
  constexpr Simd() = default;
  using T = Lane;

 private:
  static_assert(sizeof(Lane) <= 8, "Lanes are up to 64-bit");
  static_assert(IsSame<Lane, RemoveCvRef<Lane>>(),
                "Lane must not be a reference type, const-qualified type, or "
                "volatile-qualified type");
  static_assert(IsIntegerLaneType<Lane>() || IsFloat<Lane>() ||
                    IsSpecialFloat<Lane>(),
                "IsIntegerLaneType<T>(), IsFloat<T>(), or IsSpecialFloat<T>() "
                "must be true");
  // 20 bits are sufficient for any HWY_MAX_BYTES. This is the 'normal' value of
  // N when kFrac == 0, otherwise it is one (see FracN).
  static constexpr size_t kWhole = N & 0xFFFFF;
  // Fractional part is in the bits above kWhole.
  static constexpr int kFrac = static_cast<int>(N >> 20);
  // Can be 8x larger because kPow2 may be as low as -3 (Rebind of a larger
  // type to u8 results in fractions).
  static_assert(kWhole <= 8 * HWY_MAX_N && kFrac <= 3, "Out of range");
  static_assert(kFrac == 0 || kWhole == 1, "If frac, whole must be 1");
  static_assert((kWhole & (kWhole - 1)) == 0 && kWhole != 0, "Not 2^x");
  // Important to check this here because kPow2 <= -64 causes confusing
  // compile errors (invalid shift count).
  static_assert(kPow2 >= HWY_MIN_POW2, "Forgot kPow2 recursion terminator?");
  // However, do NOT verify kPow2 <= HWY_MAX_POW2 - users should be able to
  // Rebind<uint64_t, ScalableTag<uint8_t, 3>> in order to discover that its
  // kPow2 is out of bounds.

 public:
  // Upper bound on the number of lanes (tight if !HWY_HAVE_SCALABLE). In the
  // common case, N == kWhole, but if kFrac is nonzero, we deduct it from kPow2.
  // E.g. Rebind<uint32_t, Simd<uint8_t, 1, 0>> is Simd<uint32_t, 0x200001, 2>.
  // The resulting number of lanes is still 1 because this N represents 1/4
  // (the ratio of the sizes). Note that RVV requires kPow2 to be the ratio of
  // the sizes so that the correct LMUL overloads are chosen, even if N is
  // small enough that it would fit in an LMUL=1 vector.
  //
  // Cannot be an enum because GCC warns when using enums and non-enums in the
  // same expression. Cannot be a static constexpr function (MSVC limitation).
  // Rounded up to one so this is a valid array length.
  //
  // Do not use this directly - only 'public' so it is visible from the accessor
  // macro required by MSVC.
  static constexpr size_t kPrivateLanes =
      HWY_MAX(size_t{1}, detail::ScaleByPower(kWhole, kPow2 - kFrac));
  // Do not use this directly - only 'public' so it is visible from the accessor
  // macro required by MSVC.
  static constexpr int kPrivatePow2 = kPow2;

  // Constexpr accessors derived from kPrivateLanes: the lane count, the same
  // in bytes, and the number of 16-byte blocks (rounded up).
  constexpr size_t MaxLanes() const { return kPrivateLanes; }
  constexpr size_t MaxBytes() const { return kPrivateLanes * sizeof(Lane); }
  constexpr size_t MaxBlocks() const { return (MaxBytes() + 15) / 16; }
  // For SFINAE (HWY_IF_POW2_GT_D).
  constexpr int Pow2() const { return kPow2; }

  // ------------------------------ Changing lane type or count
  // Do not use any of these directly. Anything used from member typedefs cannot
  // be made private, but functions only used within other functions can.

  // Returns number of NewT lanes that fit within MaxBytes().
  template <typename NewT>
  static constexpr size_t RepartitionLanes() {
    // Round up to correctly handle larger NewT.
    return (kPrivateLanes * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
  }

  // Returns the new kPow2 required for lanes of type NewT: kPow2 plus the
  // log2 of the size ratio if NewT is at least as large as T, otherwise kPow2
  // minus the log2 of the inverse ratio.
  template <typename NewT>
  static constexpr int RebindPow2() {
    return kPow2 +
           ((sizeof(NewT) >= sizeof(T))
                ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
                : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT))));
  }

 private:
  // Returns 0 or whole NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
  template <int kNewPow2, size_t kNewMaxLanes>
  static constexpr size_t WholeN() {
    return detail::ScaleByPower(kNewMaxLanes, -kNewPow2);
  }

  // Returns fractional NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
  template <int kNewPow2, size_t kNewMaxLanes>
  static constexpr size_t FracN() {
    // Only reached if kNewPow2 > CeilLog2(kNewMaxLanes) >= 0 (else WholeN
    // would not have been zero), but clamp to zero to avoid warnings. kFrac is
    // the difference, stored in the upper bits of N, and we also set kWhole =
    // 1 so that the new kPrivateLanes = kNewMaxLanes.
    static_assert(HWY_MAX_N <= (size_t{1} << 20), "Change bit shift");
    return static_cast<size_t>(
        1 + (HWY_MAX(0, kNewPow2 - static_cast<int>(CeilLog2(kNewMaxLanes)))
             << 20));
  }

 public:
  // Returns (whole or fractional) NewN, see above.
  template <int kNewPow2, size_t kNewMaxLanes>
  static constexpr size_t NewN() {
    // We require a fraction if inverting kNewPow2 results in 0.
    return WholeN<kNewPow2, kNewMaxLanes>() == 0
               ? FracN<kNewPow2, kNewMaxLanes>()
               : WholeN<kNewPow2, kNewMaxLanes>();
  }

  // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
  template <typename NewT>
  using Rebind =
      Simd<NewT, NewN<RebindPow2<NewT>(), kPrivateLanes>(), RebindPow2<NewT>()>;

  // Change lane type while keeping the same vector size, e.g. for MulEven.
  template <typename NewT>
  using Repartition =
      Simd<NewT, NewN<kPow2, RepartitionLanes<NewT>()>(), kPow2>;

  // Half the lanes while keeping the same lane type, e.g. for LowerHalf.
  using Half = Simd<T, N, kPow2 - 1>;

  // Twice the lanes while keeping the same lane type, e.g. for Combine.
  using Twice = Simd<T, N, kPow2 + 1>;
};

namespace detail {

// Returns whether the tag has N == HWY_LANES(T) and kPow2 == 0. The argument
// is only used to deduce the template parameters.
template <typename T, size_t N, int kPow2>
constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
  return N == HWY_LANES(T) && kPow2 == 0;
}

// Struct wrappers enable validation of arguments via static_assert.
// Yields Simd<> with N clamped to at most HWY_MAX_N and kPow2 clamped to at
// most HWY_MAX_POW2.
template <typename T, size_t N, int kPow2>
struct ClampNAndPow2 {
  using type = Simd<T, HWY_MIN(N, HWY_MAX_N), HWY_MIN(kPow2, HWY_MAX_POW2)>;
};

// Implementation of ScalableTag: starts from HWY_LANES(T) lanes and applies
// the clamping above.
template <typename T, int kPow2>
struct ScalableTagChecker {
  using type = typename ClampNAndPow2<T, HWY_LANES(T), kPow2>::type;
};

// Implementation of CappedTag: rejects a zero limit, rounds the limit down to
// a power of two, and caps it at HWY_LANES(T) before clamping.
template <typename T, size_t kLimit, int kPow2>
struct CappedTagChecker {
  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
  // Safely handle non-power-of-two inputs by rounding down, which is allowed by
  // CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
  static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
  static constexpr size_t N = HWY_MIN(kLimitPow2, HWY_LANES(T));
  using type = typename ClampNAndPow2<T, N, kPow2>::type;
};

// Implementation of FixedTag: rejects zero lanes and more than HWY_LANES(T)
// lanes. kNumLanes is passed to Simd<> unchanged, with kPow2 = 0.
template <typename T, size_t kNumLanes>
struct FixedTagChecker {
  static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
  static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
  using type = Simd<T, kNumLanes, 0>;
};

}  // namespace detail

// ------------------------------ Aliases for Simd<>

// Tag describing a full vector (kPow2 == 0: the most common usage, e.g. 1D
// loops where the application does not care about the vector size) or a
// fraction/multiple of one. Fractions (kPow2 < 0) are useful for arguments or
// return values of type promotion and demotion. User-specified kPow2 is
// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
template <typename T, int kPow2 = 0>
using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;

// Tag describing a vector with *up to* kLimit active lanes, even on targets
// with scalable vectors and HWY_SCALAR. The runtime lane count `Lanes(tag)` may
// be less than kLimit, and is 1 on HWY_SCALAR.
// This alias is typically used for
// 1D loops with a relatively low application-defined upper bound, e.g. for 8x8
// DCTs. However, it is better if data structures are designed to be
// vector-length-agnostic (e.g. a hybrid SoA where there are chunks of `M >=
// MaxLanes(d)` DC components followed by M AC1, .., and M AC63; this would
// enable vector-length-agnostic loops using ScalableTag). User-specified kPow2
// is interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTag = typename detail::CappedTagChecker<T, kLimit, kPow2>::type;

#if !HWY_HAVE_SCALABLE
// If the vector size is known, and the app knows it does not want more than
// kLimit lanes, then capping can be beneficial. For example, AVX-512 has lower
// IPC and potentially higher costs for unaligned load/store vs. 256-bit AVX2.
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTagIfFixed = CappedTag<T, kLimit, kPow2>;
#else  // HWY_HAVE_SCALABLE
// .. whereas on RVV/SVE, the cost of clamping Lanes() may exceed the benefit.
// kLimit is ignored here.
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTagIfFixed = ScalableTag<T, kPow2>;
#endif

// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
// two not exceeding `HWY_LANES(T)`.
//
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
// This is useful for data structures that rely on exactly 128-bit SIMD, but
// these are discouraged because they cannot benefit from wider vectors.
// Instead, applications would ideally define a larger problem size and loop
// over it with the (unknown size) vectors from ScalableTag.
//
// + e.g.
// if the baseline is known to support SIMD, or the application requires
// ops such as TableLookupBytes not supported by HWY_SCALAR.
template <typename T, size_t kNumLanes>
using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;

// Convenience form for fixed sizes. The lane count is the vector size in bytes
// (2, 4, 8 or 16) divided by sizeof(T).
template <typename T>
using Full16 = Simd<T, 2 / sizeof(T), 0>;

template <typename T>
using Full32 = Simd<T, 4 / sizeof(T), 0>;

template <typename T>
using Full64 = Simd<T, 8 / sizeof(T), 0>;

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

// ------------------------------ Accessors for Simd<>

// Lane type.
template <class D>
using TFromD = typename D::T;

// Upper bound on the number of lanes, typically used for SFINAE conditions and
// to allocate storage for targets with known vector sizes. Note: this may be a
// loose bound, instead use Lanes() as the actual size for AllocateAligned.
// MSVC workaround: use static constant directly instead of a function.
#define HWY_MAX_LANES_D(D) D::kPrivateLanes

// Same as D().Pow2(), but this is too complex for SFINAE with MSVC, so we use a
// static constant directly.
#define HWY_POW2_D(D) D::kPrivatePow2

// Non-macro form of HWY_MAX_LANES_D in case that is preferable. WARNING: the
// macro form may be required for MSVC, which has limitations on deducing
// arguments.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
  return HWY_MAX_LANES_D(D);
}

// Both macros are (re)defined below. HWY_HAVE_CONSTEXPR_LANES is 1 only when
// HWY_LANES_CONSTEXPR expands to `constexpr`.
#undef HWY_HAVE_CONSTEXPR_LANES
#undef HWY_LANES_CONSTEXPR

#if HWY_HAVE_SCALABLE
#define HWY_HAVE_CONSTEXPR_LANES 0
#define HWY_LANES_CONSTEXPR
#else

// We want Lanes() to be constexpr where possible, so that compilers are able to
// precompute offsets. However, user code must not depend on the constexpr,
// because that will fail for RISC-V V and Arm SVE. To achieve both, we mark it
// as non-constexpr in debug builds, but not sanitizers, because we typically
// want them to see the same code.
#if HWY_IS_DEBUG_BUILD && !HWY_IS_SANITIZER
#define HWY_HAVE_CONSTEXPR_LANES 0
#define HWY_LANES_CONSTEXPR
#else
#define HWY_HAVE_CONSTEXPR_LANES 1
#define HWY_LANES_CONSTEXPR constexpr
#endif

// Returns actual vector length, used when advancing loop counters. The
// non-constexpr implementations are defined in their target's header. For a
// guaranteed-constexpr upper bound, use `MaxLanes(d)`.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED HWY_LANES_CONSTEXPR size_t Lanes(D) {
  return HWY_MAX_LANES_D(D);
}

#endif  // !HWY_HAVE_SCALABLE

// Tag for the same number of lanes as D, but with the LaneType T.
template <class T, class D>
using Rebind = typename D::template Rebind<T>;

// Rebind to the signed, unsigned or floating-point type that MakeSigned,
// MakeUnsigned or MakeFloat yields for D's lane type.
template <class D>
using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
template <class D>
using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
template <class D>
using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;

// Tag for the same total size as D, but with the LaneType T.
template <class T, class D>
using Repartition = typename D::template Repartition<T>;

// Repartition to the type that MakeWide or MakeNarrow yields for D's lane
// type.
template <class D>
using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
template <class D>
using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;

// Shorthand for applying RepartitionToWide twice (for 8/16-bit types).
template <class D>
using RepartitionToWideX2 = RepartitionToWide<RepartitionToWide<D>>;
// Shorthand for applying RepartitionToWide three times (for 8-bit types).
template <class D>
using RepartitionToWideX3 = RepartitionToWide<RepartitionToWideX2<D>>;

// Tag for the same lane type as D, but half the lanes.
521 template <class D> 522 using Half = typename D::Half; 523 524 // Tag for the same lane type as D, but twice the lanes. 525 template <class D> 526 using Twice = typename D::Twice; 527 528 // Tag for a 16-byte block with the same lane type as D 529 #if HWY_HAVE_SCALABLE 530 namespace detail { 531 532 template <class D> 533 class BlockDFromD_t {}; 534 535 template <typename T, size_t N, int kPow2> 536 class BlockDFromD_t<Simd<T, N, kPow2>> { 537 using D = Simd<T, N, kPow2>; 538 static constexpr int kNewPow2 = HWY_MIN(kPow2, 0); 539 static constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), HWY_MAX_LANES_D(D)); 540 static constexpr size_t kNewN = D::template NewN<kNewPow2, kMaxLpb>(); 541 542 public: 543 using type = Simd<T, kNewN, kNewPow2>; 544 }; 545 546 } // namespace detail 547 548 template <class D> 549 using BlockDFromD = typename detail::BlockDFromD_t<RemoveConst<D>>::type; 550 #else 551 template <class D> 552 using BlockDFromD = 553 Simd<TFromD<D>, HWY_MIN(16 / sizeof(TFromD<D>), HWY_MAX_LANES_D(D)), 0>; 554 #endif 555 556 // Returns whether `ptr` is a multiple of `Lanes(d)` elements. 557 template <class D, typename T> 558 HWY_API bool IsAligned(D d, T* ptr) { 559 const size_t N = Lanes(d); 560 return reinterpret_cast<uintptr_t>(ptr) % (N * sizeof(T)) == 0; 561 } 562 563 // ------------------------------ Choosing overloads (SFINAE) 564 565 // Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T. 
// Each macro forwards the lane type of D, fully qualified so that it also
// resolves outside this namespace, to the base.h macro of the same name without
// the _D suffix.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_UNSIGNED_D(D) \
  HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_FLOAT3264_D(D) HWY_IF_FLOAT3264(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_FLOAT3264_D(D) \
  HWY_IF_NOT_FLOAT3264(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_SPECIAL_FLOAT_D(D) \
  HWY_IF_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_SPECIAL_FLOAT_D(D) \
  HWY_IF_NOT_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_FLOAT_OR_SPECIAL_D(D) \
  HWY_IF_FLOAT_OR_SPECIAL(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D) \
  HWY_IF_NOT_FLOAT_NOR_SPECIAL(hwy::HWY_NAMESPACE::TFromD<D>)

// Conditions on the size of D's lane type.
#define HWY_IF_T_SIZE_D(D, bytes) \
  HWY_IF_T_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
#define HWY_IF_NOT_T_SIZE_D(D, bytes) \
  HWY_IF_NOT_T_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array) \
  HWY_IF_T_SIZE_ONE_OF(hwy::HWY_NAMESPACE::TFromD<D>, bit_array)
#define HWY_IF_T_SIZE_LE_D(D, bytes) \
  HWY_IF_T_SIZE_LE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
#define HWY_IF_T_SIZE_GT_D(D, bytes) \
  HWY_IF_T_SIZE_GT(hwy::HWY_NAMESPACE::TFromD<D>, bytes)

// Conditions on HWY_MAX_LANES_D(D), the upper bound on D's lane count.
#define HWY_IF_LANES_D(D, lanes) HWY_IF_LANES(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_LE_D(D, lanes) HWY_IF_LANES_LE(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_GT_D(D, lanes) HWY_IF_LANES_GT(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_PER_BLOCK_D(D, lanes)                                  \
  HWY_IF_LANES_PER_BLOCK(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), \
                         lanes)

// Conditions on D's kPow2. MSVC reads the static constant via HWY_POW2_D;
// other compilers call D().Pow2().
#if HWY_COMPILER_MSVC
#define HWY_IF_POW2_LE_D(D, pow2) \
  hwy::EnableIf<HWY_POW2_D(D) <= pow2>* = nullptr
#define HWY_IF_POW2_GT_D(D, pow2) \
  hwy::EnableIf<(HWY_POW2_D(D) > pow2)>* = nullptr
#else
#define HWY_IF_POW2_LE_D(D, pow2) hwy::EnableIf<D().Pow2() <= pow2>* = nullptr
#define HWY_IF_POW2_GT_D(D, pow2) hwy::EnableIf<(D().Pow2() > pow2)>* = nullptr
#endif  // HWY_COMPILER_MSVC

// Conditions on D's lane type being one specific unsigned or signed integer
// type.
#define HWY_IF_U8_D(D) HWY_IF_U8(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_U16_D(D) HWY_IF_U16(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_U32_D(D) HWY_IF_U32(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_U64_D(D) HWY_IF_U64(hwy::HWY_NAMESPACE::TFromD<D>)

#define HWY_IF_I8_D(D) HWY_IF_I8(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_I16_D(D) HWY_IF_I16(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_I32_D(D) HWY_IF_I32(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_I64_D(D) HWY_IF_I64(hwy::HWY_NAMESPACE::TFromD<D>)

// Use instead of HWY_IF_T_SIZE_D to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI8_D(D) HWY_IF_UI8(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_UI16_D(D) HWY_IF_UI16(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_UI32_D(D) HWY_IF_UI32(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_UI64_D(D) HWY_IF_UI64(hwy::HWY_NAMESPACE::TFromD<D>)

// Conditions on D's lane type being (or not being) a specific floating-point
// type.
#define HWY_IF_BF16_D(D) HWY_IF_BF16(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_BF16_D(D) HWY_IF_NOT_BF16(hwy::HWY_NAMESPACE::TFromD<D>)

#define HWY_IF_F16_D(D) HWY_IF_F16(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_F16_D(D) HWY_IF_NOT_F16(hwy::HWY_NAMESPACE::TFromD<D>)

#define HWY_IF_F32_D(D) HWY_IF_F32(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_F64_D(D) HWY_IF_F64(hwy::HWY_NAMESPACE::TFromD<D>)

// HWY_V_SIZE_D is the upper bound on the vector size in bytes:
// HWY_MAX_LANES_D(D) times the lane size. The HWY_IF_V_SIZE*_D macros pass the
// lane type and HWY_MAX_LANES_D(D) to the corresponding base macro.
#define HWY_V_SIZE_D(D) \
  (HWY_MAX_LANES_D(D) * sizeof(hwy::HWY_NAMESPACE::TFromD<D>))
#define HWY_IF_V_SIZE_D(D, bytes) \
  HWY_IF_V_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)
#define HWY_IF_V_SIZE_LE_D(D, bytes) \
  HWY_IF_V_SIZE_LE(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)
#define HWY_IF_V_SIZE_GT_D(D, bytes) \
  HWY_IF_V_SIZE_GT(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)

// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_NOT_UNSIGNED_V(V) \
  HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_FLOAT3264_V(V) HWY_IF_FLOAT3264(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_SPECIAL_FLOAT_V(V) \
  HWY_IF_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_FLOAT_OR_SPECIAL_V(V) \
  HWY_IF_FLOAT_OR_SPECIAL(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V) \
  HWY_IF_NOT_FLOAT_NOR_SPECIAL(hwy::HWY_NAMESPACE::TFromV<V>)

// Conditions on the size of V's lane type.
#define HWY_IF_T_SIZE_V(V, bytes) \
  HWY_IF_T_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, bytes)
#define HWY_IF_NOT_T_SIZE_V(V, bytes) \
  HWY_IF_NOT_T_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, bytes)
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array) \
  HWY_IF_T_SIZE_ONE_OF(hwy::HWY_NAMESPACE::TFromV<V>, bit_array)

// HWY_MAX_LANES_V is HWY_MAX_LANES_D applied to DFromV<V>. DFromV is not
// defined in this header.
#define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(hwy::HWY_NAMESPACE::DFromV<V>)
#define HWY_IF_V_SIZE_V(V, bytes) \
  HWY_IF_V_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
#define HWY_IF_V_SIZE_LE_V(V, bytes) \
  HWY_IF_V_SIZE_LE(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
#define HWY_IF_V_SIZE_GT_V(V, bytes) \
  HWY_IF_V_SIZE_GT(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)

// Use in implementations of ReduceSum etc. to avoid conflicts with the N=1 and
// N=4 8-bit specializations in generic_ops-inl.
// Enabled unless D has a single lane, or has four lanes of one byte each.
// This and the macros below are #undef'd first so that re-including this
// header can redefine them.
#undef HWY_IF_REDUCE_D
#define HWY_IF_REDUCE_D(D)                  \
  hwy::EnableIf<HWY_MAX_LANES_D(D) != 1 &&  \
                (HWY_MAX_LANES_D(D) != 4 || \
                 sizeof(hwy::HWY_NAMESPACE::TFromD<D>) != 1)>* = nullptr

// The next four conditions all require more than one lane.
#undef HWY_IF_SUM_OF_LANES_D
#define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)

#undef HWY_IF_MINMAX_OF_LANES_D
#define HWY_IF_MINMAX_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)

#undef HWY_IF_ADDSUB_V
#define HWY_IF_ADDSUB_V(V) HWY_IF_LANES_GT_D(hwy::HWY_NAMESPACE::DFromV<V>, 1)

#undef HWY_IF_MULADDSUB_V
#define HWY_IF_MULADDSUB_V(V) \
  HWY_IF_LANES_GT_D(hwy::HWY_NAMESPACE::DFromV<V>, 1)

// The pairwise conditions require a vector larger than 8 bytes.
#undef HWY_IF_PAIRWISE_ADD_128_D
#define HWY_IF_PAIRWISE_ADD_128_D(D) HWY_IF_V_SIZE_GT_D(D, 8)

#undef HWY_IF_PAIRWISE_SUB_128_D
#define HWY_IF_PAIRWISE_SUB_128_D(D) HWY_IF_V_SIZE_GT_D(D, 8)

// HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V is used to disable the default
// implementation of unsigned to signed DemoteTo/ReorderDemote2To in
// generic_ops-inl.h for at least some of the unsigned to signed demotions on
// SCALAR/EMU128/SSE2/SSSE3/SSE4/AVX2/SVE/SVE2/LSX/LASX
//
// The definition here is an always-valid defaulted template parameter, so by
// itself it disables nothing. NOTE(review): presumably the targets listed
// above redefine it in their own headers - confirm there.

#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) void* = nullptr

// Old names (deprecated)
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_T_SIZE_D(D, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE_D(D, bytes)

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_OPS_SHARED_TOGGLE