tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

inside-inl.h (23665B)


      1 // Copyright 2023 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 // Must be included inside an existing include guard, with the following ops
     17 // already defined: BitCast, And, Set, ShiftLeft, ShiftRight, PromoteLowerTo,
     18 // ConcatEven, ConcatOdd, plus the optional detail::PromoteEvenTo and
     19 // detail::PromoteOddTo (if implemented in the target-specific header).
     20 
     21 // This is normally set by set_macros-inl.h before this header is included;
     22 // if not, we are viewing this header standalone. Reduce IDE errors by:
#if !defined(HWY_NAMESPACE)
// 1) Defining HWY_IDE so we get syntax highlighting rather than all-gray text.
#include "hwy/ops/shared-inl.h"
// 2) Entering the HWY_NAMESPACE to make definitions from shared-inl.h visible.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Records that the namespaces were opened here, so that the matching
// `#ifdef HWY_INSIDE_END_NAMESPACE` block at the end of this header closes
// them again.
#define HWY_INSIDE_END_NAMESPACE
// 3) Providing a dummy VFromD (usually done by the target-specific header).
// TFromV and DFromV are stubbed in the same way; each placeholder ignores its
// template argument.
template <class D>
using VFromD = int;
template <class D>
using TFromV = int;
template <class D>
struct DFromV {};
#endif
     39 
     40 // ------------------------------ Vec/Create/Get/Set2..4
     41 
     42 // On SVE and RVV, Vec2..4 are aliases to built-in types. Also exclude the
     43 // fixed-size SVE targets.
     44 #if HWY_IDE || (!HWY_HAVE_SCALABLE && !HWY_TARGET_IS_SVE)
     45 
     46 // NOTE: these are used inside arm_neon-inl.h, hence they cannot be defined in
     47 // generic_ops-inl.h, which is included after that.
// Plain aggregate of two vectors of type VFromD<D>. Members are public and
// are brace-initialized in declaration order (v0, v1).
template <class D>
struct Vec2 {
  VFromD<D> v0;
  VFromD<D> v1;
};
     53 
// Plain aggregate of three vectors of type VFromD<D>. Members are public and
// are brace-initialized in declaration order (v0, v1, v2).
template <class D>
struct Vec3 {
  VFromD<D> v0;
  VFromD<D> v1;
  VFromD<D> v2;
};
     60 
// Plain aggregate of four vectors of type VFromD<D>. Members are public and
// are brace-initialized in declaration order (v0, v1, v2, v3).
template <class D>
struct Vec4 {
  VFromD<D> v0;
  VFromD<D> v1;
  VFromD<D> v2;
  VFromD<D> v3;
};
     68 
     69 // D arg is unused but allows deducing D.
     70 template <class D>
     71 HWY_API Vec2<D> Create2(D /* tag */, VFromD<D> v0, VFromD<D> v1) {
     72  return Vec2<D>{v0, v1};
     73 }
     74 
     75 template <class D>
     76 HWY_API Vec3<D> Create3(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2) {
     77  return Vec3<D>{v0, v1, v2};
     78 }
     79 
     80 template <class D>
     81 HWY_API Vec4<D> Create4(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
     82                        VFromD<D> v3) {
     83  return Vec4<D>{v0, v1, v2, v3};
     84 }
     85 
     86 template <size_t kIndex, class D>
     87 HWY_API VFromD<D> Get2(Vec2<D> tuple) {
     88  static_assert(kIndex < 2, "Tuple index out of bounds");
     89  return kIndex == 0 ? tuple.v0 : tuple.v1;
     90 }
     91 
     92 template <size_t kIndex, class D>
     93 HWY_API VFromD<D> Get3(Vec3<D> tuple) {
     94  static_assert(kIndex < 3, "Tuple index out of bounds");
     95  return kIndex == 0 ? tuple.v0 : kIndex == 1 ? tuple.v1 : tuple.v2;
     96 }
     97 
     98 template <size_t kIndex, class D>
     99 HWY_API VFromD<D> Get4(Vec4<D> tuple) {
    100  static_assert(kIndex < 4, "Tuple index out of bounds");
    101  return kIndex == 0   ? tuple.v0
    102         : kIndex == 1 ? tuple.v1
    103         : kIndex == 2 ? tuple.v2
    104                       : tuple.v3;
    105 }
    106 
    107 template <size_t kIndex, class D>
    108 HWY_API Vec2<D> Set2(Vec2<D> tuple, VFromD<D> val) {
    109  static_assert(kIndex < 2, "Tuple index out of bounds");
    110  if (kIndex == 0) {
    111    tuple.v0 = val;
    112  } else {
    113    tuple.v1 = val;
    114  }
    115  return tuple;
    116 }
    117 
    118 template <size_t kIndex, class D>
    119 HWY_API Vec3<D> Set3(Vec3<D> tuple, VFromD<D> val) {
    120  static_assert(kIndex < 3, "Tuple index out of bounds");
    121  if (kIndex == 0) {
    122    tuple.v0 = val;
    123  } else if (kIndex == 1) {
    124    tuple.v1 = val;
    125  } else {
    126    tuple.v2 = val;
    127  }
    128  return tuple;
    129 }
    130 
    131 template <size_t kIndex, class D>
    132 HWY_API Vec4<D> Set4(Vec4<D> tuple, VFromD<D> val) {
    133  static_assert(kIndex < 4, "Tuple index out of bounds");
    134  if (kIndex == 0) {
    135    tuple.v0 = val;
    136  } else if (kIndex == 1) {
    137    tuple.v1 = val;
    138  } else if (kIndex == 2) {
    139    tuple.v2 = val;
    140  } else {
    141    tuple.v3 = val;
    142  }
    143  return tuple;
    144 }
    145 
    146 #endif  // HWY_IDE || (!HWY_HAVE_SCALABLE && !HWY_TARGET_IS_SVE)
    147 
    148 // ------------------------------ Rol/Ror (And, Or, Neg, Shl, Shr)
    149 #if (defined(HWY_NATIVE_ROL_ROR_8) == defined(HWY_TARGET_TOGGLE))
    150 #ifdef HWY_NATIVE_ROL_ROR_8
    151 #undef HWY_NATIVE_ROL_ROR_8
    152 #else
    153 #define HWY_NATIVE_ROL_ROR_8
    154 #endif
    155 
    156 template <class V, HWY_IF_UI8(TFromV<V>)>
    157 HWY_API V Rol(V a, V b) {
    158  const DFromV<decltype(a)> d;
    159  const RebindToSigned<decltype(d)> di;
    160  const RebindToUnsigned<decltype(d)> du;
    161 
    162  const auto shift_amt_mask = Set(du, uint8_t{7});
    163  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
    164  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
    165 
    166  const auto vu = BitCast(du, a);
    167  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
    168 }
    169 
    170 template <class V, HWY_IF_UI8(TFromV<V>)>
    171 HWY_API V Ror(V a, V b) {
    172  const DFromV<decltype(a)> d;
    173  const RebindToSigned<decltype(d)> di;
    174  const RebindToUnsigned<decltype(d)> du;
    175 
    176  const auto shift_amt_mask = Set(du, uint8_t{7});
    177  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
    178  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
    179 
    180  const auto vu = BitCast(du, a);
    181  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
    182 }
    183 
    184 #endif  // HWY_NATIVE_ROL_ROR_8
    185 
    186 #if (defined(HWY_NATIVE_ROL_ROR_16) == defined(HWY_TARGET_TOGGLE))
    187 #ifdef HWY_NATIVE_ROL_ROR_16
    188 #undef HWY_NATIVE_ROL_ROR_16
    189 #else
    190 #define HWY_NATIVE_ROL_ROR_16
    191 #endif
    192 
    193 template <class V, HWY_IF_UI16(TFromV<V>)>
    194 HWY_API V Rol(V a, V b) {
    195  const DFromV<decltype(a)> d;
    196  const RebindToSigned<decltype(d)> di;
    197  const RebindToUnsigned<decltype(d)> du;
    198 
    199  const auto shift_amt_mask = Set(du, uint16_t{15});
    200  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
    201  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
    202 
    203  const auto vu = BitCast(du, a);
    204  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
    205 }
    206 
    207 template <class V, HWY_IF_UI16(TFromV<V>)>
    208 HWY_API V Ror(V a, V b) {
    209  const DFromV<decltype(a)> d;
    210  const RebindToSigned<decltype(d)> di;
    211  const RebindToUnsigned<decltype(d)> du;
    212 
    213  const auto shift_amt_mask = Set(du, uint16_t{15});
    214  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
    215  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
    216 
    217  const auto vu = BitCast(du, a);
    218  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
    219 }
    220 
    221 #endif  // HWY_NATIVE_ROL_ROR_16
    222 
    223 #if (defined(HWY_NATIVE_ROL_ROR_32_64) == defined(HWY_TARGET_TOGGLE))
    224 #ifdef HWY_NATIVE_ROL_ROR_32_64
    225 #undef HWY_NATIVE_ROL_ROR_32_64
    226 #else
    227 #define HWY_NATIVE_ROL_ROR_32_64
    228 #endif
    229 
    230 template <class V, HWY_IF_UI32(TFromV<V>)>
    231 HWY_API V Rol(V a, V b) {
    232  const DFromV<decltype(a)> d;
    233  const RebindToSigned<decltype(d)> di;
    234  const RebindToUnsigned<decltype(d)> du;
    235 
    236  const auto shift_amt_mask = Set(du, uint32_t{31});
    237  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
    238  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
    239 
    240  const auto vu = BitCast(du, a);
    241  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
    242 }
    243 
    244 template <class V, HWY_IF_UI32(TFromV<V>)>
    245 HWY_API V Ror(V a, V b) {
    246  const DFromV<decltype(a)> d;
    247  const RebindToSigned<decltype(d)> di;
    248  const RebindToUnsigned<decltype(d)> du;
    249 
    250  const auto shift_amt_mask = Set(du, uint32_t{31});
    251  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
    252  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
    253 
    254  const auto vu = BitCast(du, a);
    255  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
    256 }
    257 
    258 #if HWY_HAVE_INTEGER64
    259 template <class V, HWY_IF_UI64(TFromV<V>)>
    260 HWY_API V Rol(V a, V b) {
    261  const DFromV<decltype(a)> d;
    262  const RebindToSigned<decltype(d)> di;
    263  const RebindToUnsigned<decltype(d)> du;
    264 
    265  const auto shift_amt_mask = Set(du, uint64_t{63});
    266  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
    267  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
    268 
    269  const auto vu = BitCast(du, a);
    270  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
    271 }
    272 
    273 template <class V, HWY_IF_UI64(TFromV<V>)>
    274 HWY_API V Ror(V a, V b) {
    275  const DFromV<decltype(a)> d;
    276  const RebindToSigned<decltype(d)> di;
    277  const RebindToUnsigned<decltype(d)> du;
    278 
    279  const auto shift_amt_mask = Set(du, uint64_t{63});
    280  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
    281  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
    282 
    283  const auto vu = BitCast(du, a);
    284  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
    285 }
    286 #endif  // HWY_HAVE_INTEGER64
    287 
    288 #endif  // HWY_NATIVE_ROL_ROR_32_64
    289 
    290 // ------------------------------ RotateLeftSame/RotateRightSame
    291 
    292 #if (defined(HWY_NATIVE_ROL_ROR_SAME_8) == defined(HWY_TARGET_TOGGLE))
    293 #ifdef HWY_NATIVE_ROL_ROR_SAME_8
    294 #undef HWY_NATIVE_ROL_ROR_SAME_8
    295 #else
    296 #define HWY_NATIVE_ROL_ROR_SAME_8
    297 #endif
    298 
    299 template <class V, HWY_IF_UI8(TFromV<V>)>
    300 HWY_API V RotateLeftSame(V v, int bits) {
    301  const DFromV<decltype(v)> d;
    302  const RebindToUnsigned<decltype(d)> du;
    303 
    304  const int shl_amt = bits & 7;
    305  const int shr_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u);
    306 
    307  const auto vu = BitCast(du, v);
    308  return BitCast(d,
    309                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
    310 }
    311 
    312 template <class V, HWY_IF_UI8(TFromV<V>)>
    313 HWY_API V RotateRightSame(V v, int bits) {
    314  const DFromV<decltype(v)> d;
    315  const RebindToUnsigned<decltype(d)> du;
    316 
    317  const int shr_amt = bits & 7;
    318  const int shl_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u);
    319 
    320  const auto vu = BitCast(du, v);
    321  return BitCast(d,
    322                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
    323 }
    324 
    325 #endif  // HWY_NATIVE_ROL_ROR_SAME_8
    326 
    327 #if (defined(HWY_NATIVE_ROL_ROR_SAME_16) == defined(HWY_TARGET_TOGGLE))
    328 #ifdef HWY_NATIVE_ROL_ROR_SAME_16
    329 #undef HWY_NATIVE_ROL_ROR_SAME_16
    330 #else
    331 #define HWY_NATIVE_ROL_ROR_SAME_16
    332 #endif
    333 
    334 template <class V, HWY_IF_UI16(TFromV<V>)>
    335 HWY_API V RotateLeftSame(V v, int bits) {
    336  const DFromV<decltype(v)> d;
    337  const RebindToUnsigned<decltype(d)> du;
    338 
    339  const int shl_amt = bits & 15;
    340  const int shr_amt =
    341      static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u);
    342 
    343  const auto vu = BitCast(du, v);
    344  return BitCast(d,
    345                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
    346 }
    347 
    348 template <class V, HWY_IF_UI16(TFromV<V>)>
    349 HWY_API V RotateRightSame(V v, int bits) {
    350  const DFromV<decltype(v)> d;
    351  const RebindToUnsigned<decltype(d)> du;
    352 
    353  const int shr_amt = bits & 15;
    354  const int shl_amt =
    355      static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u);
    356 
    357  const auto vu = BitCast(du, v);
    358  return BitCast(d,
    359                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
    360 }
    361 #endif  // HWY_NATIVE_ROL_ROR_SAME_16
    362 
    363 #if (defined(HWY_NATIVE_ROL_ROR_SAME_32_64) == defined(HWY_TARGET_TOGGLE))
    364 #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
    365 #undef HWY_NATIVE_ROL_ROR_SAME_32_64
    366 #else
    367 #define HWY_NATIVE_ROL_ROR_SAME_32_64
    368 #endif
    369 
    370 template <class V, HWY_IF_UI32(TFromV<V>)>
    371 HWY_API V RotateLeftSame(V v, int bits) {
    372  const DFromV<decltype(v)> d;
    373  const RebindToUnsigned<decltype(d)> du;
    374 
    375  const int shl_amt = bits & 31;
    376  const int shr_amt =
    377      static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u);
    378 
    379  const auto vu = BitCast(du, v);
    380  return BitCast(d,
    381                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
    382 }
    383 
    384 template <class V, HWY_IF_UI32(TFromV<V>)>
    385 HWY_API V RotateRightSame(V v, int bits) {
    386  const DFromV<decltype(v)> d;
    387  const RebindToUnsigned<decltype(d)> du;
    388 
    389  const int shr_amt = bits & 31;
    390  const int shl_amt =
    391      static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u);
    392 
    393  const auto vu = BitCast(du, v);
    394  return BitCast(d,
    395                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
    396 }
    397 
    398 #if HWY_HAVE_INTEGER64
    399 template <class V, HWY_IF_UI64(TFromV<V>)>
    400 HWY_API V RotateLeftSame(V v, int bits) {
    401  const DFromV<decltype(v)> d;
    402  const RebindToUnsigned<decltype(d)> du;
    403 
    404  const int shl_amt = bits & 63;
    405  const int shr_amt =
    406      static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u);
    407 
    408  const auto vu = BitCast(du, v);
    409  return BitCast(d,
    410                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
    411 }
    412 
    413 template <class V, HWY_IF_UI64(TFromV<V>)>
    414 HWY_API V RotateRightSame(V v, int bits) {
    415  const DFromV<decltype(v)> d;
    416  const RebindToUnsigned<decltype(d)> du;
    417 
    418  const int shr_amt = bits & 63;
    419  const int shl_amt =
    420      static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u);
    421 
    422  const auto vu = BitCast(du, v);
    423  return BitCast(d,
    424                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
    425 }
    426 #endif  // HWY_HAVE_INTEGER64
    427 
    428 #endif  // HWY_NATIVE_ROL_ROR_SAME_32_64
    429 
    430 // ------------------------------ PromoteEvenTo/PromoteOddTo
    431 
    432 // These are used by target-specific headers for ReorderWidenMulAccumulate etc.
    433 
    434 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
    435 namespace detail {
    436 
    437 // Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as
    438 // there are target-specific specializations for some of the
    439 // detail::PromoteEvenTo and detail::PromoteOddTo cases on
    440 // SVE/PPC/SSE2/SSSE3/SSE4/AVX2.
    441 
    442 // All targets except HWY_SCALAR use the implementations of
    443 // detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at
    444 // least some of the PromoteEvenTo and PromoteOddTo cases.
    445 
    446 // Signed to signed PromoteEvenTo/PromoteOddTo
    447 template <size_t kToLaneSize, class D, class V>
    448 HWY_INLINE VFromD<D> PromoteEvenTo(
    449    hwy::SignedTag /*to_type_tag*/,
    450    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    451    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
    452 #if HWY_TARGET_IS_SVE
    453  // The intrinsic expects the wide lane type.
    454  return NativePromoteEvenTo(BitCast(d_to, v));
    455 #else
    456 #if HWY_IS_LITTLE_ENDIAN
    457  // On little-endian targets, need to shift each lane of the bitcasted
    458  // vector left by kToLaneSize * 4 bits to get the bits of the even
    459  // source lanes into the upper kToLaneSize * 4 bits of even_in_hi.
    460  const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
    461 #else
    462  // On big-endian targets, the bits of the even source lanes are already
    463  // in the upper kToLaneSize * 4 bits of the lanes of the bitcasted
    464  // vector.
    465  const auto even_in_hi = BitCast(d_to, v);
    466 #endif
    467 
    468  // Right-shift even_in_hi by kToLaneSize * 4 bits
    469  return ShiftRight<kToLaneSize * 4>(even_in_hi);
    470 #endif  // HWY_TARGET_IS_SVE
    471 }
    472 
    473 // Unsigned to unsigned PromoteEvenTo/PromoteOddTo
    474 template <size_t kToLaneSize, class D, class V>
    475 HWY_INLINE VFromD<D> PromoteEvenTo(
    476    hwy::UnsignedTag /*to_type_tag*/,
    477    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    478    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
    479 #if HWY_TARGET_IS_SVE
    480  // The intrinsic expects the wide lane type.
    481  return NativePromoteEvenTo(BitCast(d_to, v));
    482 #else
    483 #if HWY_IS_LITTLE_ENDIAN
    484  // On little-endian targets, the bits of the even source lanes are already
    485  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
    486 
    487  // Simply need to zero out the upper bits of each lane of the bitcasted
    488  // vector.
    489  return And(BitCast(d_to, v),
    490             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
    491 #else
    492  // On big-endian targets, need to shift each lane of the bitcasted vector
    493  // right by kToLaneSize * 4 bits to get the bits of the even source lanes into
    494  // the lower kToLaneSize * 4 bits of the result.
    495 
    496  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
    497  // result.
    498  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
    499 #endif
    500 #endif  // HWY_TARGET_IS_SVE
    501 }
    502 
    503 template <size_t kToLaneSize, class D, class V>
    504 HWY_INLINE VFromD<D> PromoteOddTo(
    505    hwy::SignedTag /*to_type_tag*/,
    506    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    507    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
    508 #if HWY_IS_LITTLE_ENDIAN
    509  // On little-endian targets, the bits of the odd source lanes are already in
    510  // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
    511  const auto odd_in_hi = BitCast(d_to, v);
    512 #else
    513  // On big-endian targets, need to shift each lane of the bitcasted vector
    514  // left by kToLaneSize * 4 bits to get the bits of the odd source lanes into
    515  // the upper kToLaneSize * 4 bits of odd_in_hi.
    516  const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
    517 #endif
    518 
    519  // Right-shift odd_in_hi by kToLaneSize * 4 bits
    520  return ShiftRight<kToLaneSize * 4>(odd_in_hi);
    521 }
    522 
    523 template <size_t kToLaneSize, class D, class V>
    524 HWY_INLINE VFromD<D> PromoteOddTo(
    525    hwy::UnsignedTag /*to_type_tag*/,
    526    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    527    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
    528 #if HWY_IS_LITTLE_ENDIAN
    529  // On little-endian targets, need to shift each lane of the bitcasted vector
    530  // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into
    531  // the lower kToLaneSize * 4 bits of the result.
    532 
    533  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
    534  // result.
    535  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
    536 #else
    537  // On big-endian targets, the bits of the even source lanes are already
    538  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
    539 
    540  // Simply need to zero out the upper bits of each lane of the bitcasted
    541  // vector.
    542  return And(BitCast(d_to, v),
    543             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
    544 #endif
    545 }
    546 
    547 // Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo
    548 // followed by BitCast to signed
    549 template <size_t kToLaneSize, class D, class V>
    550 HWY_INLINE VFromD<D> PromoteEvenTo(
    551    hwy::SignedTag /*to_type_tag*/,
    552    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    553    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
    554  const RebindToUnsigned<decltype(d_to)> du_to;
    555  return BitCast(d_to,
    556                 PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
    557                               hwy::UnsignedTag(), du_to, v));
    558 }
    559 
    560 template <size_t kToLaneSize, class D, class V>
    561 HWY_INLINE VFromD<D> PromoteOddTo(
    562    hwy::SignedTag /*to_type_tag*/,
    563    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    564    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
    565  const RebindToUnsigned<decltype(d_to)> du_to;
    566  return BitCast(d_to,
    567                 PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
    568                              hwy::UnsignedTag(), du_to, v));
    569 }
    570 
    571 // BF16->F32 PromoteEvenTo
    572 
    573 // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
    574 // instead of hwy::FloatTag on targets that use scalable vectors.
    575 
    576 // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
    577 // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
    578 
    579 // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
    580 // to be a bfloat16_t vector.
    581 template <class FromTypeTag, class DF32, class VBF16,
    582          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
    583          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
    584 HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
    585                                      hwy::SizeTag<4> /*to_lane_size_tag*/,
    586                                      FromTypeTag /*from_type_tag*/, DF32 d_to,
    587                                      VBF16 v) {
    588  const RebindToUnsigned<decltype(d_to)> du_to;
    589 #if HWY_IS_LITTLE_ENDIAN
    590  // On little-endian platforms, need to shift left each lane of the bitcasted
    591  // vector by 16 bits.
    592  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
    593 #else
    594  // On big-endian platforms, the even lanes of the source vector are already
    595  // in the upper 16 bits of the lanes of the bitcasted vector.
    596 
    597  // Need to simply zero out the lower 16 bits of each lane of the bitcasted
    598  // vector.
    599  return BitCast(d_to,
    600                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
    601 #endif
    602 }
    603 
    604 // BF16->F32 PromoteOddTo
    605 
    606 // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
    607 // instead of hwy::FloatTag on targets that use scalable vectors.
    608 
    609 // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
    610 // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>
    611 
    612 // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
    613 // to be a bfloat16_t vector.
    614 template <class FromTypeTag, class DF32, class VBF16,
    615          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
    616          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
    617 HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
    618                                     hwy::SizeTag<4> /*to_lane_size_tag*/,
    619                                     FromTypeTag /*from_type_tag*/, DF32 d_to,
    620                                     VBF16 v) {
    621  const RebindToUnsigned<decltype(d_to)> du_to;
    622 #if HWY_IS_LITTLE_ENDIAN
    623  // On little-endian platforms, the odd lanes of the source vector are already
    624  // in the upper 16 bits of the lanes of the bitcasted vector.
    625 
    626  // Need to simply zero out the lower 16 bits of each lane of the bitcasted
    627  // vector.
    628  return BitCast(d_to,
    629                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
    630 #else
    631  // On big-endian platforms, need to shift left each lane of the bitcasted
    632  // vector by 16 bits.
    633  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
    634 #endif
    635 }
    636 
    637 // Default PromoteEvenTo/PromoteOddTo implementations
    638 template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
    639          class V, HWY_IF_LANES_D(D, 1)>
    640 HWY_INLINE VFromD<D> PromoteEvenTo(
    641    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    642    FromTypeTag /*from_type_tag*/, D d_to, V v) {
    643  return PromoteLowerTo(d_to, v);
    644 }
    645 
    646 template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
    647          class V, HWY_IF_LANES_GT_D(D, 1)>
    648 HWY_INLINE VFromD<D> PromoteEvenTo(
    649    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    650    FromTypeTag /*from_type_tag*/, D d_to, V v) {
    651  const DFromV<decltype(v)> d;
    652  return PromoteLowerTo(d_to, ConcatEven(d, v, v));
    653 }
    654 
    655 template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
    656          class V>
    657 HWY_INLINE VFromD<D> PromoteOddTo(
    658    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    659    FromTypeTag /*from_type_tag*/, D d_to, V v) {
    660  const DFromV<decltype(v)> d;
    661  return PromoteLowerTo(d_to, ConcatOdd(d, v, v));
    662 }
    663 
    664 }  // namespace detail
    665 
    666 template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
    667          class V2 = VFromD<Repartition<TFromV<V>, D>>,
    668          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
    669 HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
    670  return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(),
    671                               hwy::SizeTag<sizeof(TFromD<D>)>(),
    672                               hwy::TypeTag<TFromV<V>>(), d, v);
    673 }
    674 
    675 template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
    676          class V2 = VFromD<Repartition<TFromV<V>, D>>,
    677          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
    678 HWY_API VFromD<D> PromoteOddTo(D d, V v) {
    679  return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(),
    680                              hwy::SizeTag<sizeof(TFromD<D>)>(),
    681                              hwy::TypeTag<TFromV<V>>(), d, v);
    682 }
    683 #endif  // HWY_TARGET != HWY_SCALAR || HWY_IDE
    684 
// Closes the namespaces only when this header opened them itself, i.e. when
// HWY_INSIDE_END_NAMESPACE was defined by the standalone-viewing fallback near
// the top of the file. The marker is undefined again before closing.
#ifdef HWY_INSIDE_END_NAMESPACE
#undef HWY_INSIDE_END_NAMESPACE
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
#endif
    691 #endif