tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

shared-inl.h (29433B)


      1 // Copyright 2020 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 // Per-target definitions shared by ops/*.h and user code.
     17 
     18 // IWYU pragma: begin_exports
     19 // Export does not seem to be recursive, so re-export these (also in base.h)
     20 #include <stddef.h>
     21 
     22 #include "hwy/base.h"
     23 // "IWYU pragma: keep" does not work for this include, so hide it from the IDE.
     24 #if !HWY_IDE
     25 #include <stdint.h>
     26 #endif
     27 
     28 #include "hwy/detect_compiler_arch.h"
     29 #include "hwy/detect_targets.h"
     30 
     31 // Separate header because foreach_target.h re-enables its include guard.
     32 #include "hwy/ops/set_macros-inl.h"
     33 
     34 // IWYU pragma: end_exports
     35 
     36 #if HWY_IS_MSAN
     37 #include <sanitizer/msan_interface.h>
     38 #endif
     39 
     40 // We are covered by the highway.h include guard, but generic_ops-inl.h
     41 // includes this again #if HWY_IDE.
     42 // clang-format off
     43 #if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == defined(HWY_TARGET_TOGGLE)  // NOLINT
     44 // clang-format on
     45 #ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE
     46 #undef HIGHWAY_HWY_OPS_SHARED_TOGGLE
     47 #else
     48 #define HIGHWAY_HWY_OPS_SHARED_TOGGLE
     49 #endif
     50 
     51 HWY_BEFORE_NAMESPACE();
     52 namespace hwy {
     53 namespace HWY_NAMESPACE {
     54 
// VecArg<V>: the parameter type to use when passing a vector or mask of type V
// to a function that may not be inlined.
//
// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on aarch64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
//   all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or aarch64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
//
// Even better is to avoid passing vector arguments to non-inlined functions,
// because the SVE and RISC-V ABIs are still works in progress and may lead to
// incorrect codegen.
#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
// Affected toolchains: pass by reference-to-const.
template <class V>
using VecArg = const V&;
#else
// Everything else: pass by value.
template <class V>
using VecArg = V;
#endif
     76 
     77 namespace detail {
     78 
// Maps a Highway lane type T to the type expected by intrinsics. The primary
// template is the identity mapping.
template <typename T>
struct NativeLaneTypeT {
  using type = T;
};
// float16_t: the wrapper's Native type if the compiler provides a scalar f16
// type (HWY_HAVE_SCALAR_F16_TYPE), otherwise uint16_t.
template <>
struct NativeLaneTypeT<hwy::float16_t> {
#if HWY_HAVE_SCALAR_F16_TYPE
  using type = hwy::float16_t::Native;
#else
  using type = uint16_t;
#endif
};
// bfloat16_t: the wrapper's Native type if the compiler provides a scalar bf16
// type (HWY_HAVE_SCALAR_BF16_TYPE), otherwise uint16_t.
template <>
struct NativeLaneTypeT<hwy::bfloat16_t> {
#if HWY_HAVE_SCALAR_BF16_TYPE
  using type = hwy::bfloat16_t::Native;
#else
  using type = uint16_t;
#endif
};

// The type expected by intrinsics for the given Highway lane type T. This
// usually matches T, but differs for our wrapper types [b]float16_t. Use this
// only when defining intrinsic wrappers, and NOT for casting, which is UB.
template <typename T>
using NativeLaneType = typename NativeLaneTypeT<T>::type;
    105 
// Returns the same pointer after changing type to NativeLaneType. Use this only
// for wrapper functions that call intrinsics (e.g. load/store) where some of
// the overloads expect _Float16* or __bf16* arguments. For non-special floats,
// this returns the same pointer and type.
//
// This makes use of the fact that a wrapper struct is pointer-interconvertible
// with its first member (a union), thus also with the union members. Do NOT
// call both this and U16LanePointer on the same object - they access different
// union members, and this is not guaranteed to be safe.
//
// Overload for all lane types other than [b]float16_t: identity.
template <typename T, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE T* NativeLanePointer(T* p) {
  return p;
}
// Overload for float16_t: returns the address of the `native` member if
// HWY_HAVE_SCALAR_F16_TYPE, otherwise of the `bits` member. The returned
// pointer is const-qualified iff T is.
template <typename T, typename NT = NativeLaneType<RemoveConst<T>>,
          HWY_IF_F16(T)>
HWY_INLINE constexpr If<IsConst<T>(), const NT*, NT*> NativeLanePointer(T* p) {
#if HWY_HAVE_SCALAR_F16_TYPE
  return &p->native;
#else
  return &p->bits;
#endif
}
// Overload for bfloat16_t: returns the address of the `native` member if
// HWY_HAVE_SCALAR_BF16_TYPE, otherwise of the `bits` member. The returned
// pointer is const-qualified iff T is.
template <typename T, typename NT = NativeLaneType<RemoveConst<T>>,
          HWY_IF_BF16(T)>
HWY_INLINE constexpr If<IsConst<T>(), const NT*, NT*> NativeLanePointer(T* p) {
#if HWY_HAVE_SCALAR_BF16_TYPE
  return &p->native;
#else
  return &p->bits;
#endif
}
    137 
// Returns a pointer to the u16 member (`bits`) of our [b]float16_t wrapper
// structs; the result is const-qualified iff T is.
// Use this in Highway targets that lack __bf16 intrinsics; for storing to
// memory, we BitCast vectors to u16 and write to the pointer returned here.
// Do NOT call both this and NativeLanePointer on the same object - they access
// different union members, and this is not guaranteed to be safe.
template <typename T, HWY_IF_SPECIAL_FLOAT(T)>
HWY_INLINE If<IsConst<T>(), const uint16_t*, uint16_t*> U16LanePointer(T* p) {
  return &p->bits;
}
    147 
    148 // Returns N * 2^pow2. N is the number of lanes in a full vector and pow2 the
    149 // desired fraction or multiple of it, see Simd<>. `pow2` is most often in
    150 // [-3, 3] but can also be lower for user-specified fractions.
    151 constexpr size_t ScaleByPower(size_t N, int pow2) {
    152  return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
    153 }
    154 
    155 template <typename T>
    156 HWY_INLINE void MaybePoison(T* HWY_RESTRICT unaligned, size_t count) {
    157 #if HWY_IS_MSAN
    158  __msan_poison(unaligned, count * sizeof(T));
    159 #else
    160  (void)unaligned;
    161  (void)count;
    162 #endif
    163 }
    164 
    165 // This can be useful for working around MSAN limitations. For example, prior
    166 // to Clang 16, it did not understand AVX-512 CompressStore.
    167 template <typename T>
    168 HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) {
    169 #if HWY_IS_MSAN
    170  __msan_unpoison(unaligned, count * sizeof(T));
    171 #else
    172  (void)unaligned;
    173  (void)count;
    174 #endif
    175 }
    176 
    177 }  // namespace detail
    178 
// Highway operations are implemented as overloaded functions selected using a
// zero-sized tag type D := Simd<T, N, kPow2>. T denotes the lane type.
//
// N defines how many lanes are in a 'full' vector, typically equal to
// HWY_LANES(T) (which is the actual count on targets with vectors of known
// size, and an upper bound in case of scalable vectors), otherwise a
// user-specified limit at most that large.
//
// 2^kPow2 is a _subsequently_ applied scaling factor that indicates the
// desired fraction of a 'full' vector: 0 means full, -1 means half; 1,2,3
// means two/four/eight full vectors ganged together. The largest supported
// kPow2 is `HWY_MAX_POW2` and the aliases below take care of clamping
// user-specified values to that. Note that `Simd<T, 1, 0>` and `Simd<T, 2, -1>`
// have the same `MaxLanes` and `Lanes`.
//
// We can theoretically keep halving Lanes(), but recursive instantiations of
// kPow2 - 1 will eventually fail e.g. because -64 is not a valid shift count.
// Users must terminate such compile-time recursions at or above HWY_MIN_POW2.
//
// WARNING: do not use N directly because it may be a special representation of
// a fractional MaxLanes. This arises when we Rebind Simd<uint8_t, 1, 0> to
// Simd<uint32_t, ??, 2>. RVV requires that the last argument (kPow2) be two,
// but we want MaxLanes to be the same in both cases. Hence ?? is a
// fixed-point encoding of 1/4.
//
// Instead of referring to Simd<> directly, users create D via aliases:
// - ScalableTag<T> for a full vector;
// - ScalableTag<T, kPow2> for a fraction/group, where `kPow2` is
//   interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`;
// - CappedTag<T, kLimit> for a vector with up to kLimit lanes; or
// - FixedTag<T, kNumLanes> for a vector with exactly kNumLanes lanes.
//
// Instead of N, use Lanes(D()) for the actual number of lanes at runtime and
// D().MaxLanes() for a constexpr upper bound. Both are powers of two.
template <typename Lane, size_t N, int kPow2>
struct Simd {
  constexpr Simd() = default;
  // Lane type; this is what TFromD<D> retrieves.
  using T = Lane;

 private:
  static_assert(sizeof(Lane) <= 8, "Lanes are up to 64-bit");
  static_assert(IsSame<Lane, RemoveCvRef<Lane>>(),
                "Lane must not be a reference type, const-qualified type, or "
                "volatile-qualified type");
  static_assert(IsIntegerLaneType<Lane>() || IsFloat<Lane>() ||
                    IsSpecialFloat<Lane>(),
                "IsIntegerLaneType<T>(), IsFloat<T>(), or IsSpecialFloat<T>() "
                "must be true");
  // 20 bits are sufficient for any HWY_MAX_BYTES. This is the 'normal' value of
  // N when kFrac == 0, otherwise it is one (see FracN).
  static constexpr size_t kWhole = N & 0xFFFFF;
  // Fractional part is in the bits above kWhole. It is the (non-negative)
  // power of two by which kWhole is divided, see kPrivateLanes.
  static constexpr int kFrac = static_cast<int>(N >> 20);
  // Can be 8x larger because kPow2 may be as low as -3 (Rebind of a larger
  // type to u8 results in fractions).
  static_assert(kWhole <= 8 * HWY_MAX_N && kFrac <= 3, "Out of range");
  static_assert(kFrac == 0 || kWhole == 1, "If frac, whole must be 1");
  static_assert((kWhole & (kWhole - 1)) == 0 && kWhole != 0, "Not 2^x");
  // Important to check this here because kPow2 <= -64 causes confusing
  // compile errors (invalid shift count).
  static_assert(kPow2 >= HWY_MIN_POW2, "Forgot kPow2 recursion terminator?");
  // However, do NOT verify kPow2 <= HWY_MAX_POW2 - users should be able to
  // Rebind<uint64_t, ScalableTag<uint8_t, 3>> in order to discover that its
  // kPow2 is out of bounds.

 public:
  // Upper bound on the number of lanes (tight if !HWY_HAVE_SCALABLE). In the
  // common case, N == kWhole, but if kFrac is nonzero, we deduct it from kPow2.
  // E.g. Rebind<uint32_t, Simd<uint8_t, 1, 0>> is Simd<uint32_t, 0x200001, 2>.
  // The resulting number of lanes is still 1 because this N represents 1/4
  // (the ratio of the sizes). Note that RVV requires kPow2 to be the ratio of
  // the sizes so that the correct LMUL overloads are chosen, even if N is
  // small enough that it would fit in an LMUL=1 vector.
  //
  // Cannot be an enum because GCC warns when using enums and non-enums in the
  // same expression. Cannot be a static constexpr function (MSVC limitation).
  // Rounded up to one so this is a valid array length.
  //
  // Do not use this directly - only 'public' so it is visible from the accessor
  // macro required by MSVC.
  static constexpr size_t kPrivateLanes =
      HWY_MAX(size_t{1}, detail::ScaleByPower(kWhole, kPow2 - kFrac));
  // Do not use this directly - only 'public' so it is visible from the accessor
  // macro required by MSVC.
  static constexpr int kPrivatePow2 = kPow2;

  // Upper bound on the number of lanes.
  constexpr size_t MaxLanes() const { return kPrivateLanes; }
  // Upper bound on the vector size in bytes.
  constexpr size_t MaxBytes() const { return kPrivateLanes * sizeof(Lane); }
  // Upper bound on the number of 16-byte blocks, rounded up (hence at least 1).
  constexpr size_t MaxBlocks() const { return (MaxBytes() + 15) / 16; }
  // For SFINAE (HWY_IF_POW2_GT_D).
  constexpr int Pow2() const { return kPow2; }

  // ------------------------------ Changing lane type or count
  // Do not use any of these directly. Anything used from member typedefs cannot
  // be made private, but functions only used within other functions can.

  // Returns number of NewT lanes that fit within MaxBytes().
  template <typename NewT>
  static constexpr size_t RepartitionLanes() {
    // Round up to correctly handle larger NewT.
    return (kPrivateLanes * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
  }

  // Returns the new kPow2 required for lanes of type NewT: kPow2 plus (for a
  // larger NewT) or minus (for a smaller NewT) the log2 of the size ratio.
  template <typename NewT>
  static constexpr int RebindPow2() {
    return kPow2 +
           ((sizeof(NewT) >= sizeof(T))
                ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
                : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT))));
  }

 private:
  // Returns 0 or whole NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
  template <int kNewPow2, size_t kNewMaxLanes>
  static constexpr size_t WholeN() {
    return detail::ScaleByPower(kNewMaxLanes, -kNewPow2);
  }

  // Returns fractional NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
  template <int kNewPow2, size_t kNewMaxLanes>
  static constexpr size_t FracN() {
    // Only reached if kNewPow2 > CeilLog2(kNewMaxLanes) >= 0 (else WholeN
    // would not have been zero), but clamp to zero to avoid warnings. kFrac is
    // the difference, stored in the upper bits of N, and we also set kWhole =
    // 1 so that the new kPrivateLanes = kNewMaxLanes.
    static_assert(HWY_MAX_N <= (size_t{1} << 20), "Change bit shift");
    return static_cast<size_t>(
        1 + (HWY_MAX(0, kNewPow2 - static_cast<int>(CeilLog2(kNewMaxLanes)))
             << 20));
  }

 public:
  // Returns (whole or fractional) NewN, see above.
  template <int kNewPow2, size_t kNewMaxLanes>
  static constexpr size_t NewN() {
    // We require a fraction if inverting kNewPow2 results in 0.
    return WholeN<kNewPow2, kNewMaxLanes>() == 0
               ? FracN<kNewPow2, kNewMaxLanes>()
               : WholeN<kNewPow2, kNewMaxLanes>();
  }

  // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
  template <typename NewT>
  using Rebind =
      Simd<NewT, NewN<RebindPow2<NewT>(), kPrivateLanes>(), RebindPow2<NewT>()>;

  // Change lane type while keeping the same vector size, e.g. for MulEven.
  template <typename NewT>
  using Repartition =
      Simd<NewT, NewN<kPow2, RepartitionLanes<NewT>()>(), kPow2>;

  // Half the lanes while keeping the same lane type, e.g. for LowerHalf.
  using Half = Simd<T, N, kPow2 - 1>;

  // Twice the lanes while keeping the same lane type, e.g. for Combine.
  using Twice = Simd<T, N, kPow2 + 1>;
};
    337 
    338 namespace detail {
    339 
    340 template <typename T, size_t N, int kPow2>
    341 constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
    342  return N == HWY_LANES(T) && kPow2 == 0;
    343 }
    344 
// Struct wrappers enable validation of arguments via static_assert.

// Yields Simd<> with N limited to at most HWY_MAX_N and kPow2 limited to at
// most HWY_MAX_POW2.
template <typename T, size_t N, int kPow2>
struct ClampNAndPow2 {
  using type = Simd<T, HWY_MIN(N, HWY_MAX_N), HWY_MIN(kPow2, HWY_MAX_POW2)>;
};

// Implementation of ScalableTag: N is HWY_LANES(T), kPow2 is clamped.
template <typename T, int kPow2>
struct ScalableTagChecker {
  using type = typename ClampNAndPow2<T, HWY_LANES(T), kPow2>::type;
};
    355 
// Implementation of CappedTag: rejects kLimit == 0, rounds kLimit down to a
// power of two and limits it to HWY_LANES(T); kPow2 is clamped.
template <typename T, size_t kLimit, int kPow2>
struct CappedTagChecker {
  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
  // Safely handle non-power-of-two inputs by rounding down, which is allowed by
  // CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
  static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
  // Never more lanes than a full vector.
  static constexpr size_t N = HWY_MIN(kLimitPow2, HWY_LANES(T));
  using type = typename ClampNAndPow2<T, N, kPow2>::type;
};
    365 
    366 template <typename T, size_t kNumLanes>
    367 struct FixedTagChecker {
    368  static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
    369  static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
    370  using type = Simd<T, kNumLanes, 0>;
    371 };
    372 
    373 }  // namespace detail
    374 
    375 // ------------------------------ Aliases for Simd<>
    376 
// Tag describing a full vector (kPow2 == 0: the most common usage, e.g. 1D
// loops where the application does not care about the vector size) or a
// fraction/multiple of one. Fractions (kPow2 < 0) are useful for arguments or
// return values of type promotion and demotion. User-specified kPow2 is
// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
template <typename T, int kPow2 = 0>
using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;

// Tag describing a vector with *up to* kLimit active lanes, even on targets
// with scalable vectors and HWY_SCALAR. The runtime lane count `Lanes(tag)` may
// be less than kLimit, and is 1 on HWY_SCALAR. This alias is typically used for
// 1D loops with a relatively low application-defined upper bound, e.g. for 8x8
// DCTs. However, it is better if data structures are designed to be
// vector-length-agnostic (e.g. a hybrid SoA where there are chunks of `M >=
// MaxLanes(d)` DC components followed by M AC1, .., and M AC63; this would
// enable vector-length-agnostic loops using ScalableTag). User-specified kPow2
// is interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`. kLimit must be nonzero; if
// it is not a power of two, it is rounded down to one.
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTag = typename detail::CappedTagChecker<T, kLimit, kPow2>::type;

// CappedTagIfFixed is CappedTag on targets with vectors of known size, and
// ScalableTag (ignoring kLimit) on targets with scalable vectors.
#if !HWY_HAVE_SCALABLE
// If the vector size is known, and the app knows it does not want more than
// kLimit lanes, then capping can be beneficial. For example, AVX-512 has lower
// IPC and potentially higher costs for unaligned load/store vs. 256-bit AVX2.
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTagIfFixed = CappedTag<T, kLimit, kPow2>;
#else  // HWY_HAVE_SCALABLE
// .. whereas on RVV/SVE, the cost of clamping Lanes() may exceed the benefit.
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTagIfFixed = ScalableTag<T, kPow2>;
#endif

// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
// two not exceeding `HWY_LANES(T)`.
//
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
// This is useful for data structures that rely on exactly 128-bit SIMD, but
// these are discouraged because they cannot benefit from wider vectors.
// Instead, applications would ideally define a larger problem size and loop
// over it with the (unknown size) vectors from ScalableTag.
//
// + e.g. if the baseline is known to support SIMD, or the application requires
//   ops such as TableLookupBytes not supported by HWY_SCALAR.
template <typename T, size_t kNumLanes>
using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;
    424 
// Convenience aliases for vectors of exactly 16, 32, 64 or 128 bits, i.e. with
// (size in bytes) / sizeof(T) lanes. These refer to Simd<> directly rather
// than going through the checked tag aliases. Note that the lane count is zero
// if T is larger than the vector, which Simd<> rejects via static_assert.
template <typename T>
using Full16 = Simd<T, 2 / sizeof(T), 0>;

template <typename T>
using Full32 = Simd<T, 4 / sizeof(T), 0>;

template <typename T>
using Full64 = Simd<T, 8 / sizeof(T), 0>;

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;
    437 
    438 // ------------------------------ Accessors for Simd<>
    439 
// Lane type of the tag D, i.e. its member typedef T.
template <class D>
using TFromD = typename D::T;

// Upper bound on the number of lanes, typically used for SFINAE conditions and
// to allocate storage for targets with known vector sizes. Note: this may be a
// loose bound, instead use Lanes() as the actual size for AllocateAligned.
// MSVC workaround: use static constant directly instead of a function.
#define HWY_MAX_LANES_D(D) D::kPrivateLanes

// Same as D().Pow2(), but this is too complex for SFINAE with MSVC, so we use a
// static constant directly.
#define HWY_POW2_D(D) D::kPrivatePow2

// Non-macro form of HWY_MAX_LANES_D in case that is preferable. WARNING: the
// macro form may be required for MSVC, which has limitations on deducing
// arguments. The argument is only used for its type.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
  return HWY_MAX_LANES_D(D);
}
    461 
// HWY_HAVE_CONSTEXPR_LANES is 1 if Lanes() is constexpr, otherwise 0.
// HWY_LANES_CONSTEXPR expands to `constexpr` in the former case and to nothing
// in the latter. Both are (re)defined below, hence the #undef.
#undef HWY_HAVE_CONSTEXPR_LANES
#undef HWY_LANES_CONSTEXPR

#if HWY_HAVE_SCALABLE
// Scalable vectors: Lanes() is not constexpr and is not defined here.
#define HWY_HAVE_CONSTEXPR_LANES 0
#define HWY_LANES_CONSTEXPR
#else

// We want Lanes() to be constexpr where possible, so that compilers are able to
// precompute offsets. However, user code must not depend on the constexpr,
// because that will fail for RISC-V V and Arm SVE. To achieve both, we mark it
// as non-constexpr in debug builds, but not sanitizers, because we typically
// want them to see the same code.
#if HWY_IS_DEBUG_BUILD && !HWY_IS_SANITIZER
#define HWY_HAVE_CONSTEXPR_LANES 0
#define HWY_LANES_CONSTEXPR
#else
#define HWY_HAVE_CONSTEXPR_LANES 1
#define HWY_LANES_CONSTEXPR constexpr
#endif

// Returns actual vector length, used when advancing loop counters. The
// non-constexpr implementations are defined in their target's header. For a
// guaranteed-constexpr upper bound, use `MaxLanes(d)`.
// This definition, for vectors of known size, returns the same value as
// `MaxLanes(d)`.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED HWY_LANES_CONSTEXPR size_t Lanes(D) {
  return HWY_MAX_LANES_D(D);
}

#endif  // !HWY_HAVE_SCALABLE
    492 
// Tag for the same number of lanes as D, but with the LaneType T.
template <class T, class D>
using Rebind = typename D::template Rebind<T>;

// Rebind to the MakeSigned / MakeUnsigned / MakeFloat counterpart of D's lane
// type.
template <class D>
using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
template <class D>
using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
template <class D>
using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;

// Tag for the same total size as D, but with the LaneType T.
template <class T, class D>
using Repartition = typename D::template Repartition<T>;

// Repartition to the MakeWide / MakeNarrow counterpart of D's lane type.
template <class D>
using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
template <class D>
using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;

// Shorthand for applying RepartitionToWide twice (for 8/16-bit types).
template <class D>
using RepartitionToWideX2 = RepartitionToWide<RepartitionToWide<D>>;
// Shorthand for applying RepartitionToWide three times (for 8-bit types).
template <class D>
using RepartitionToWideX3 = RepartitionToWide<RepartitionToWideX2<D>>;

// Tag for the same lane type as D, but half the lanes.
template <class D>
using Half = typename D::Half;

// Tag for the same lane type as D, but twice the lanes.
template <class D>
using Twice = typename D::Twice;
    527 
// Tag for a 16-byte block with the same lane type as D. If D has fewer lanes
// than fit in 16 bytes, the result is limited to D's MaxLanes.
#if HWY_HAVE_SCALABLE
namespace detail {

// Primary template has no `type`; only Simd<> tags are supported.
template <class D>
class BlockDFromD_t {};

template <typename T, size_t N, int kPow2>
class BlockDFromD_t<Simd<T, N, kPow2>> {
  using D = Simd<T, N, kPow2>;
  // Retain fractions (negative kPow2), but never more than one full vector.
  static constexpr int kNewPow2 = HWY_MIN(kPow2, 0);
  // Lanes per 16-byte block, but not more than D's upper bound.
  static constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), HWY_MAX_LANES_D(D));
  // (Whole or fractional) N such that MaxLanes is kMaxLpb, given kNewPow2.
  static constexpr size_t kNewN = D::template NewN<kNewPow2, kMaxLpb>();

 public:
  using type = Simd<T, kNewN, kNewPow2>;
};

}  // namespace detail

// RemoveConst so that const-qualified tag types match the specialization.
template <class D>
using BlockDFromD = typename detail::BlockDFromD_t<RemoveConst<D>>::type;
#else
template <class D>
using BlockDFromD =
    Simd<TFromD<D>, HWY_MIN(16 / sizeof(TFromD<D>), HWY_MAX_LANES_D(D)), 0>;
#endif
    555 
    556 // Returns whether `ptr` is a multiple of `Lanes(d)` elements.
    557 template <class D, typename T>
    558 HWY_API bool IsAligned(D d, T* ptr) {
    559  const size_t N = Lanes(d);
    560  return reinterpret_cast<uintptr_t>(ptr) % (N * sizeof(T)) == 0;
    561 }
    562 
    563 // ------------------------------ Choosing overloads (SFINAE)
    564 
// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
// Each forwards the lane type TFromD<D> to the base.h macro of the same name
// without the _D suffix. TFromD is fully qualified, presumably so that the
// macros can also be used outside of hwy::HWY_NAMESPACE.

// Conditions on the category of the lane type.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_UNSIGNED_D(D) \
  HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_FLOAT3264_D(D) HWY_IF_FLOAT3264(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_FLOAT3264_D(D) \
  HWY_IF_NOT_FLOAT3264(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_SPECIAL_FLOAT_D(D) \
  HWY_IF_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_SPECIAL_FLOAT_D(D) \
  HWY_IF_NOT_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_FLOAT_OR_SPECIAL_D(D) \
  HWY_IF_FLOAT_OR_SPECIAL(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D) \
  HWY_IF_NOT_FLOAT_NOR_SPECIAL(hwy::HWY_NAMESPACE::TFromD<D>)

// Conditions on the size of the lane type.
#define HWY_IF_T_SIZE_D(D, bytes) \
  HWY_IF_T_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
#define HWY_IF_NOT_T_SIZE_D(D, bytes) \
  HWY_IF_NOT_T_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array) \
  HWY_IF_T_SIZE_ONE_OF(hwy::HWY_NAMESPACE::TFromD<D>, bit_array)
#define HWY_IF_T_SIZE_LE_D(D, bytes) \
  HWY_IF_T_SIZE_LE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
#define HWY_IF_T_SIZE_GT_D(D, bytes) \
  HWY_IF_T_SIZE_GT(hwy::HWY_NAMESPACE::TFromD<D>, bytes)

// Conditions on the upper bound of the lane count (HWY_MAX_LANES_D), not the
// runtime Lanes().
#define HWY_IF_LANES_D(D, lanes) HWY_IF_LANES(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_LE_D(D, lanes) HWY_IF_LANES_LE(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_GT_D(D, lanes) HWY_IF_LANES_GT(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_PER_BLOCK_D(D, lanes)                                   \
  HWY_IF_LANES_PER_BLOCK(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), \
                         lanes)
    601 
    602 #if HWY_COMPILER_MSVC
    603 #define HWY_IF_POW2_LE_D(D, pow2) \
    604  hwy::EnableIf<HWY_POW2_D(D) <= pow2>* = nullptr
    605 #define HWY_IF_POW2_GT_D(D, pow2) \
    606  hwy::EnableIf<(HWY_POW2_D(D) > pow2)>* = nullptr
    607 #else
    608 #define HWY_IF_POW2_LE_D(D, pow2) hwy::EnableIf<D().Pow2() <= pow2>* = nullptr
    609 #define HWY_IF_POW2_GT_D(D, pow2) hwy::EnableIf<(D().Pow2() > pow2)>* = nullptr
    610 #endif  // HWY_COMPILER_MSVC
    611 
// Conditions on a specific lane type; these forward TFromD<D> to the macro of
// the same name without the _D suffix.
#define HWY_IF_U8_D(D) HWY_IF_U8(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_U16_D(D) HWY_IF_U16(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_U32_D(D) HWY_IF_U32(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_U64_D(D) HWY_IF_U64(hwy::HWY_NAMESPACE::TFromD<D>)

#define HWY_IF_I8_D(D) HWY_IF_I8(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_I16_D(D) HWY_IF_I16(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_I32_D(D) HWY_IF_I32(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_I64_D(D) HWY_IF_I64(hwy::HWY_NAMESPACE::TFromD<D>)

// Use instead of HWY_IF_T_SIZE_D to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI8_D(D) HWY_IF_UI8(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_UI16_D(D) HWY_IF_UI16(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_UI32_D(D) HWY_IF_UI32(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_UI64_D(D) HWY_IF_UI64(hwy::HWY_NAMESPACE::TFromD<D>)

#define HWY_IF_BF16_D(D) HWY_IF_BF16(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_BF16_D(D) HWY_IF_NOT_BF16(hwy::HWY_NAMESPACE::TFromD<D>)

#define HWY_IF_F16_D(D) HWY_IF_F16(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_NOT_F16_D(D) HWY_IF_NOT_F16(hwy::HWY_NAMESPACE::TFromD<D>)

#define HWY_IF_F32_D(D) HWY_IF_F32(hwy::HWY_NAMESPACE::TFromD<D>)
#define HWY_IF_F64_D(D) HWY_IF_F64(hwy::HWY_NAMESPACE::TFromD<D>)

// Upper bound on the vector size in bytes: HWY_MAX_LANES_D(D) * sizeof(lane).
#define HWY_V_SIZE_D(D) \
  (HWY_MAX_LANES_D(D) * sizeof(hwy::HWY_NAMESPACE::TFromD<D>))
// Conditions on the vector size; these pass the lane type and the upper bound
// on the lane count to the macro of the same name without the _D suffix.
#define HWY_IF_V_SIZE_D(D, bytes) \
  HWY_IF_V_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)
#define HWY_IF_V_SIZE_LE_D(D, bytes) \
  HWY_IF_V_SIZE_LE(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)
#define HWY_IF_V_SIZE_GT_D(D, bytes) \
  HWY_IF_V_SIZE_GT(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)
    646 
// Same, but with a vector argument. ops/*-inl.h define their own TFromV (and
// DFromV, which HWY_MAX_LANES_V relies on); neither is defined in this header.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_NOT_UNSIGNED_V(V) \
  HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_FLOAT3264_V(V) HWY_IF_FLOAT3264(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_SPECIAL_FLOAT_V(V) \
  HWY_IF_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_FLOAT_OR_SPECIAL_V(V) \
  HWY_IF_FLOAT_OR_SPECIAL(hwy::HWY_NAMESPACE::TFromV<V>)
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V) \
  HWY_IF_NOT_FLOAT_NOR_SPECIAL(hwy::HWY_NAMESPACE::TFromV<V>)

// Conditions on the size of the vector's lane type.
#define HWY_IF_T_SIZE_V(V, bytes) \
  HWY_IF_T_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, bytes)
#define HWY_IF_NOT_T_SIZE_V(V, bytes) \
  HWY_IF_NOT_T_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, bytes)
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array) \
  HWY_IF_T_SIZE_ONE_OF(hwy::HWY_NAMESPACE::TFromV<V>, bit_array)

// Upper bound on the lane count of V, obtained from its tag type DFromV<V>.
#define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(hwy::HWY_NAMESPACE::DFromV<V>)
// Conditions on the vector size.
#define HWY_IF_V_SIZE_V(V, bytes) \
  HWY_IF_V_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
#define HWY_IF_V_SIZE_LE_V(V, bytes) \
  HWY_IF_V_SIZE_LE(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
#define HWY_IF_V_SIZE_GT_V(V, bytes) \
  HWY_IF_V_SIZE_GT(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
    676 
// The macros below are #undef'd before being defined; presumably a target's
// header may replace these defaults with its own condition - TODO confirm.

// Use in implementations of ReduceSum etc. to avoid conflicts with the N=1
// specializations (any lane type) and the N=4 specializations for 8-bit lanes
// in generic_ops-inl. N refers to the upper bound HWY_MAX_LANES_D.
#undef HWY_IF_REDUCE_D
#define HWY_IF_REDUCE_D(D)                  \
  hwy::EnableIf<HWY_MAX_LANES_D(D) != 1 &&  \
                (HWY_MAX_LANES_D(D) != 4 || \
                 sizeof(hwy::HWY_NAMESPACE::TFromD<D>) != 1)>* = nullptr

// Enabled if the tag has more than one lane.
#undef HWY_IF_SUM_OF_LANES_D
#define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)

// Enabled if the tag has more than one lane.
#undef HWY_IF_MINMAX_OF_LANES_D
#define HWY_IF_MINMAX_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)

// Enabled if the vector has more than one lane.
#undef HWY_IF_ADDSUB_V
#define HWY_IF_ADDSUB_V(V) HWY_IF_LANES_GT_D(hwy::HWY_NAMESPACE::DFromV<V>, 1)

// Enabled if the vector has more than one lane.
#undef HWY_IF_MULADDSUB_V
#define HWY_IF_MULADDSUB_V(V) \
  HWY_IF_LANES_GT_D(hwy::HWY_NAMESPACE::DFromV<V>, 1)

// Enabled if the vector is larger than 8 bytes.
#undef HWY_IF_PAIRWISE_ADD_128_D
#define HWY_IF_PAIRWISE_ADD_128_D(D) HWY_IF_V_SIZE_GT_D(D, 8)

// Enabled if the vector is larger than 8 bytes.
#undef HWY_IF_PAIRWISE_SUB_128_D
#define HWY_IF_PAIRWISE_SUB_128_D(D) HWY_IF_V_SIZE_GT_D(D, 8)

// HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V is used to disable the default
// implementation of unsigned to signed DemoteTo/ReorderDemote2To in
// generic_ops-inl.h for at least some of the unsigned to signed demotions on
// SCALAR/EMU128/SSE2/SSSE3/SSE4/AVX2/SVE/SVE2/LSX/LASX.
// The definition here is an unconditionally valid template parameter, i.e. it
// does not disable anything.

#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) void* = nullptr

// Old names (deprecated); use HWY_IF_T_SIZE_D and HWY_IF_NOT_T_SIZE_D instead.
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_T_SIZE_D(D, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE_D(D, bytes)
    715 
    716 // NOLINTNEXTLINE(google-readability-namespace-comments)
    717 }  // namespace HWY_NAMESPACE
    718 }  // namespace hwy
    719 HWY_AFTER_NAMESPACE();
    720 
    721 #endif  // HIGHWAY_HWY_OPS_SHARED_TOGGLE