// Copyright 2021 Google LLC
// Copyright 2023,2024 Arm Limited and/or
// its affiliates <open-source-office@arm.com>
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: BSD-3-Clause
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Target-independent types/functions defined after target-specific ops.

// The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip
// the generic implementation here if native ops are already defined.

#include "hwy/base.h"

// Define detail::Shuffle1230 etc, but only when viewing the current header;
// normally this is included via highway.h, which includes ops/*.h.
#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
#include "hwy/detect_targets.h"
#include "hwy/ops/emu128-inl.h"
#endif  // HWY_IDE

// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
template <class V>
using LaneType = decltype(GetLane(V()));

// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the
// return type of functions that do not take a vector argument, or as an
// argument type if the function only has a template argument for D, or for
// explicit type names instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));

// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template
// argument for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));

// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  return Min(Max(lo, v), hi);
}

// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
// and RVV has its own implementation of -Lanes.
#if (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV) || HWY_IDE

template <size_t kLanes, class D>
HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  static_assert(kBytes < 16, "Shift count is per-block");
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
}

#endif

// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, SignMask<TFromD<D>>()));
}

// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
  const RebindToSigned<D> di;
  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
  // mantissa MSB (to indicate quiet) would be sufficient.
  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}

// Returns positive infinity.
template <class D>
HWY_API Vec<D> Inf(D d) {
  const RebindToUnsigned<D> du;
  using T = TFromD<D>;
  using TU = TFromD<decltype(du)>;
  const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
  return BitCast(d, Set(du, max_x2 >> 1));
}
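
// Example (illustrative sketch, not part of this header's API): materializing
// the special values defined above and clamping an input vector `v` to
// [-1, 1]. The tag and input are caller-side assumptions.
//   const ScalableTag<float> d;
//   const Vec<decltype(d)> one = Set(d, 1.0f);
//   const Vec<decltype(d)> clamped = Clamp(v, Neg(one), one);
//   const Vec<decltype(d)> qnan = NaN(d);  // quiet NaN in every lane
//   const Vec<decltype(d)> pinf = Inf(d);  // +infinity in every lane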

// ------------------------------ MaskedSetOr/MaskedSet

template <class V, typename T = TFromV<V>, typename D = DFromV<V>,
          typename M = MFromD<D>>
HWY_API V MaskedSetOr(V no, M m, T a) {
  D d;
  return IfThenElse(m, Set(d, a), no);
}

template <class D, typename V = VFromD<D>, typename M = MFromD<D>,
          typename T = TFromD<D>>
HWY_API V MaskedSet(D d, M m, T a) {
  return IfThenElseZero(m, Set(d, a));
}

// ------------------------------ ZeroExtendResizeBitCast

// The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128
// target is in emu128-inl.h, and the implementation of
// detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in
// scalar-inl.h.
#if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR
namespace detail {

#if HWY_HAVE_SCALABLE
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
    VFromD<DFrom> v) {
  const Repartition<uint8_t, DTo> d_to_u8;
  const auto resized = ResizeBitCast(d_to_u8, v);
  // Zero the upper bytes which were not present/valid in d_from.
  const size_t num_bytes = Lanes(Repartition<uint8_t, decltype(d_from)>());
  return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized));
}
#else   // target that uses fixed-size vectors
// Truncating or same-size resizing cast: same as ResizeBitCast
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES_LE(kToVectSize, kFromVectSize)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
    VFromD<DFrom> v) {
  return ResizeBitCast(d_to, v);
}

// Resizing cast to vector that has twice the number of lanes of the source
// vector
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
    VFromD<DFrom> v) {
  const Twice<decltype(d_from)> dt_from;
  return BitCast(d_to, ZeroExtendVector(dt_from, v));
}

// Resizing cast to vector that has more than twice the number of lanes of the
// source vector
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES_GT(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
    VFromD<DFrom> v) {
  using TFrom = TFromD<DFrom>;
  constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom);
  const Repartition<TFrom, decltype(d_to)> d_resize_to;
  return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes),
                                      ResizeBitCast(d_resize_to, v)));
}
#endif  // HWY_HAVE_SCALABLE

}  // namespace detail
#endif  // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR

template <class DTo, class DFrom>
HWY_API VFromD<DTo> ZeroExtendResizeBitCast(DTo d_to, DFrom d_from,
                                            VFromD<DFrom> v) {
  return detail::ZeroExtendResizeBitCast(hwy::SizeTag<d_from.MaxBytes()>(),
                                         hwy::SizeTag<d_to.MaxBytes()>(), d_to,
                                         d_from, v);
}
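
// Example (illustrative): resizing a 64-bit u8 vector to a 128-bit one with
// the upper eight bytes guaranteed zero. The fixed-size tags here are sketch
// assumptions; any valid (d_to, d_from) pair works.
//   const Full64<uint8_t> d_from;
//   const Full128<uint8_t> d_to;
//   const Vec<decltype(d_to)> widened =
//       ZeroExtendResizeBitCast(d_to, d_from, v);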

// ------------------------------ SafeFillN

template <class D, typename T = TFromD<D>>
HWY_API void SafeFillN(const size_t num, const T value, D d,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  for (size_t i = 0; i < num; ++i) {
    to[i] = value;
  }
#else
  BlendedStore(Set(d, value), FirstN(d, num), d, to);
#endif
}

// ------------------------------ SafeCopyN

template <class D, typename T = TFromD<D>>
HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  for (size_t i = 0; i < num; ++i) {
    to[i] = from[i];
  }
#else
  const Mask<D> mask = FirstN(d, num);
  BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
#endif
}
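
// Example (illustrative): handling the remainder of an array whose length is
// not a multiple of Lanes(d). `from`, `to` and `total` are assumptions of the
// sketch. On targets where HWY_MEM_OPS_MIGHT_FAULT is nonzero this runs a
// scalar loop; otherwise it uses FirstN with masked loads/stores.
//   const size_t remainder = total % Lanes(d);
//   SafeCopyN(remainder, d, from + total - remainder, to + total - remainder);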

// ------------------------------ IsNegative
#if (defined(HWY_NATIVE_IS_NEGATIVE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IS_NEGATIVE
#undef HWY_NATIVE_IS_NEGATIVE
#else
#define HWY_NATIVE_IS_NEGATIVE
#endif

template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API Mask<DFromV<V>> IsNegative(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MaskFromVec(BroadcastSignBit(BitCast(di, v))));
}

#endif  // HWY_NATIVE_IS_NEGATIVE

// ------------------------------ MaskFalse
#if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASK_FALSE
#undef HWY_NATIVE_MASK_FALSE
#else
#define HWY_NATIVE_MASK_FALSE
#endif

template <class D>
HWY_API Mask<D> MaskFalse(D d) {
  return MaskFromVec(Zero(d));
}

#endif  // HWY_NATIVE_MASK_FALSE

// ------------------------------ SetMask
#if (defined(HWY_NATIVE_SET_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SET_MASK
#undef HWY_NATIVE_SET_MASK
#else
#define HWY_NATIVE_SET_MASK
#endif

template <class D>
HWY_API Mask<D> SetMask(D d, bool val) {
  const Repartition<int32_t, decltype(d)> di32;
  return MaskFromVec(ResizeBitCast(d, Set(di32, -static_cast<int32_t>(val))));
}

#endif  // HWY_NATIVE_SET_MASK

// ------------------------------ IfNegativeThenElseZero
#if (defined(HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#else
#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#endif

template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V IfNegativeThenElseZero(V v, V yes) {
  return IfThenElseZero(IsNegative(v), yes);
}

#endif  // HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO

// ------------------------------ IfNegativeThenZeroElse
#if (defined(HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#else
#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#endif

template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V IfNegativeThenZeroElse(V v, V no) {
  return IfThenZeroElse(IsNegative(v), no);
}

#endif  // HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE

// ------------------------------ ZeroIfNegative (IfNegativeThenZeroElse)

// ZeroIfNegative is generic for all vector lengths.
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V ZeroIfNegative(V v) {
  return IfNegativeThenZeroElse(v, v);
}
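
// Example (worked): ZeroIfNegative implements a branch-free ReLU for signed
// or floating-point lanes: inputs with the sign bit set become zero and all
// others pass through, so {-2.0f, -0.0f, 0.0f, 3.0f} maps to {0, 0, 0, 3}.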

// ------------------------------ BitwiseIfThenElse
#if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

#endif  // HWY_NATIVE_BITWISE_IF_THEN_ELSE

// ------------------------------ PromoteMaskTo

#if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_MASK_TO
#undef HWY_NATIVE_PROMOTE_MASK_TO
#else
#define HWY_NATIVE_PROMOTE_MASK_TO
#endif

template <class DTo, class DFrom>
HWY_API Mask<DTo> PromoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
  static_assert(
      sizeof(TFromD<DTo>) > sizeof(TFromD<DFrom>),
      "sizeof(TFromD<DTo>) must be greater than sizeof(TFromD<DFrom>)");
  static_assert(
      IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
      "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");

  const RebindToSigned<decltype(d_to)> di_to;
  const RebindToSigned<decltype(d_from)> di_from;

  return MaskFromVec(BitCast(
      d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
}

#endif  // HWY_NATIVE_PROMOTE_MASK_TO

// ------------------------------ DemoteMaskTo

#if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_DEMOTE_MASK_TO
#undef HWY_NATIVE_DEMOTE_MASK_TO
#else
#define HWY_NATIVE_DEMOTE_MASK_TO
#endif

template <class DTo, class DFrom>
HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
  static_assert(sizeof(TFromD<DTo>) < sizeof(TFromD<DFrom>),
                "sizeof(TFromD<DTo>) must be less than sizeof(TFromD<DFrom>)");
  static_assert(
      IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
      "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");

  const RebindToSigned<decltype(d_to)> di_to;
  const RebindToSigned<decltype(d_from)> di_from;

  return MaskFromVec(BitCast(
      d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
}

#endif  // HWY_NATIVE_DEMOTE_MASK_TO

// ------------------------------ InsertIntoUpper
#if (defined(HWY_NATIVE_LOAD_HIGHER) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOAD_HIGHER
#undef HWY_NATIVE_LOAD_HIGHER
#else
#define HWY_NATIVE_LOAD_HIGHER
#endif
template <class D, typename T, class V = VFromD<D>, HWY_IF_LANES_GT_D(D, 1),
          HWY_IF_POW2_GT_D(D, -3)>
HWY_API V InsertIntoUpper(D d, T* p, V a) {
  Half<D> dh;
  const VFromD<decltype(dh)> b = LoadU(dh, p);
  return Combine(d, b, LowerHalf(a));
}
#endif  // HWY_NATIVE_LOAD_HIGHER
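
// Example (illustrative): promoting a mask alongside its data, e.g. when
// widening u8 lanes to u16. The tags are sketch assumptions; d8 must have the
// same lane count as d16 (hence Rebind), as the static_asserts above require.
//   const Rebind<uint16_t, decltype(d8)> d16;
//   const Mask<decltype(d16)> m16 = PromoteMaskTo(d16, d8, m8);
// DemoteMaskTo is the inverse, narrowing each mask lane.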

// ------------------------------ CombineMasks

#if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_COMBINE_MASKS
#undef HWY_NATIVE_COMBINE_MASKS
#else
#define HWY_NATIVE_COMBINE_MASKS
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D>
HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) {
  const Half<decltype(d)> dh;
  return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo)));
}
#endif

#endif  // HWY_NATIVE_COMBINE_MASKS

// ------------------------------ LowerHalfOfMask

#if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
#else
#define HWY_NATIVE_LOWER_HALF_OF_MASK
#endif

template <class D>
HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) {
  const Twice<decltype(d)> dt;
  return MaskFromVec(LowerHalf(d, VecFromMask(dt, m)));
}

#endif  // HWY_NATIVE_LOWER_HALF_OF_MASK

// ------------------------------ UpperHalfOfMask

#if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
#undef HWY_NATIVE_UPPER_HALF_OF_MASK
#else
#define HWY_NATIVE_UPPER_HALF_OF_MASK
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D>
HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
  const Twice<decltype(d)> dt;
  return MaskFromVec(UpperHalf(d, VecFromMask(dt, m)));
}
#endif

#endif  // HWY_NATIVE_UPPER_HALF_OF_MASK

// ------------------------------ OrderedDemote2MasksTo

#if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#else
#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class DTo, class DFrom>
HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
                                        Mask<DFrom> b) {
  static_assert(
      sizeof(TFromD<DTo>) == sizeof(TFromD<DFrom>) / 2,
      "sizeof(TFromD<DTo>) must be equal to sizeof(TFromD<DFrom>) / 2");
  static_assert(IsSame<Mask<DTo>, Mask<Repartition<TFromD<DTo>, DFrom>>>(),
                "Mask<DTo> must be the same type as "
                "Mask<Repartition<TFromD<DTo>, DFrom>>");

  const RebindToSigned<decltype(d_from)> di_from;
  const RebindToSigned<decltype(d_to)> di_to;

  const auto va = BitCast(di_from, VecFromMask(d_from, a));
  const auto vb = BitCast(di_from, VecFromMask(d_from, b));
  return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)));
}
#endif

#endif  // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO

// ------------------------------ RotateLeft
template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V RotateLeft(V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < static_cast<int>(kSizeInBits),
                "Invalid shift count");

  constexpr int kRotateRightAmt =
      (kBits == 0) ? 0 : static_cast<int>(kSizeInBits) - kBits;
  return RotateRight<kRotateRightAmt>(v);
}
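
// Example (worked, u8 lanes): RotateLeft<3>(v) lowers to RotateRight<5>(v)
// because kSizeInBits == 8; a lane holding 0x81 (1000'0001) becomes
// 0x0C (0000'1100).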

// ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
#if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
#undef HWY_NATIVE_INTERLEAVE_WHOLE
#else
#define HWY_NATIVE_INTERLEAVE_WHOLE
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
  // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if
  // D().MaxBytes() <= 16 is true.
  return InterleaveLower(d, a, b);
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
  // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if
  // D().MaxBytes() <= 16 is true.
  return InterleaveUpper(d, a, b);
}

// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3
// is implemented in x86_256-inl.h.

// InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is
// implemented in x86_512-inl.h.

// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256
// is implemented in wasm_256-inl.h.
#endif  // HWY_TARGET != HWY_SCALAR

#endif  // HWY_NATIVE_INTERLEAVE_WHOLE

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
// The InterleaveWholeLower without the optional D parameter is generic for all
// vector lengths.
template <class V>
HWY_API V InterleaveWholeLower(V a, V b) {
  return InterleaveWholeLower(DFromV<V>(), a, b);
}
#endif  // HWY_TARGET != HWY_SCALAR

// ------------------------------ InterleaveEven

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
// InterleaveEven without the optional D parameter is generic for all vector
// lengths.
template <class V>
HWY_API V InterleaveEven(V a, V b) {
  return InterleaveEven(DFromV<V>(), a, b);
}
#endif
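
// Example (worked, four u32 lanes, lane 0 first): given a = {a0, a1, a2, a3}
// and b = {b0, b1, b2, b3}, InterleaveWholeLower returns {a0, b0, a1, b1} and
// InterleaveWholeUpper returns {a2, b2, a3, b3}, regardless of how many
// 16-byte blocks the vector spans.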

// ------------------------------ MinNumber/MaxNumber

#if (defined(HWY_NATIVE_FLOAT_MIN_MAX_NUMBER) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#else
#define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#endif

template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MinNumber(V a, V b) {
  return Min(a, b);
}

template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MaxNumber(V a, V b) {
  return Max(a, b);
}

#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MinNumber(V a, V b) {
  return Min(a, b);
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V MaxNumber(V a, V b) {
  return Max(a, b);
}

// ------------------------------ MinMagnitude/MaxMagnitude

#if (defined(HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
#undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
#else
#define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE
#endif

template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MinMagnitude(V a, V b) {
  const V abs_a = Abs(a);
  const V abs_b = Abs(b);
  const V min = Min(IfThenElse(Eq(abs_a, abs_b), a, b), b);
  return IfThenElse(Lt(abs_a, abs_b), a, min);
}

template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MaxMagnitude(V a, V b) {
  const V abs_a = Abs(a);
  const V abs_b = Abs(b);
  // This lvalue appears to be necessary to avoid a clang bug on SVE.
  const V max = Max(IfThenElse(Eq(abs_a, abs_b), b, a), a);
  return IfThenElse(Lt(abs_a, abs_b), b, max);
}

#endif  // HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE

template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V MinMagnitude(V a, V b) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto abs_a = BitCast(du, Abs(a));
  const auto abs_b = BitCast(du, Abs(b));
  return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), a,
                    Min(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), a, b), b));
}

template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V MaxMagnitude(V a, V b) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto abs_a = BitCast(du, Abs(a));
  const auto abs_b = BitCast(du, Abs(b));
  return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), b,
                    Max(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), b, a), a));
}

template <class V, HWY_IF_UNSIGNED_V(V)>
HWY_API V MinMagnitude(V a, V b) {
  return Min(a, b);
}

template <class V, HWY_IF_UNSIGNED_V(V)>
HWY_API V MaxMagnitude(V a, V b) {
  return Max(a, b);
}

// ------------------------------ AddSub

template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V AddSub(V a, V b) {
  // AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b).
  return Sub(a, b);
}

// AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h
// on SSSE3/SSE4/AVX2/AVX3.

// AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on
// AVX2/AVX3.

// AddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h.

// AddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h.
template <class V, HWY_IF_ADDSUB_V(V)>
HWY_API V AddSub(V a, V b) {
  using D = DFromV<decltype(a)>;
  using T = TFromD<D>;
  using TNegate = If<!hwy::IsSigned<T>(), MakeSigned<T>, T>;

  const D d;
  const Rebind<TNegate, D> d_negate;

  // Negate the even lanes of b.
  const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b))));

  return Add(a, negated_even_b);
}
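
// Example (worked, four lanes, lane 0 first): AddSub(a, b) returns
// {a0 - b0, a1 + b1, a2 - b2, a3 + b3}: even-indexed lanes subtract because
// the even lanes of b are negated before the Add, odd-indexed lanes add.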

// ------------------------------ MaskedAddOr etc.
#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASKED_ARITH
#undef HWY_NATIVE_MASKED_ARITH
#else
#define HWY_NATIVE_MASKED_ARITH
#endif

template <class V, class M>
HWY_API V MaskedMinOr(V no, M m, V a, V b) {
  return IfThenElse(m, Min(a, b), no);
}

template <class V, class M>
HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
  return IfThenElse(m, Max(a, b), no);
}

template <class V, class M>
HWY_API V MaskedAddOr(V no, M m, V a, V b) {
  return IfThenElse(m, Add(a, b), no);
}

template <class V, class M>
HWY_API V MaskedSubOr(V no, M m, V a, V b) {
  return IfThenElse(m, Sub(a, b), no);
}

template <class V, class M>
HWY_API V MaskedMulOr(V no, M m, V a, V b) {
  return IfThenElse(m, Mul(a, b), no);
}

template <class V, class M>
HWY_API V MaskedDivOr(V no, M m, V a, V b) {
  const DFromV<V> d;
  // Avoid division by zero for masked-out lanes.
  const V nonzero = Set(d, TFromD<decltype(d)>{1});
  return IfThenElse(m, Div(a, IfThenElse(m, b, nonzero)), no);
}

template <class V, class M>
HWY_API V MaskedModOr(V no, M m, V a, V b) {
  const DFromV<V> d;
  // Avoid division by zero for masked-out lanes.
  const V nonzero = Set(d, TFromD<decltype(d)>{1});
  return IfThenElse(m, Mod(a, IfThenElse(m, b, nonzero)), no);
}

template <class V, class M>
HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
  return IfThenElse(m, SaturatedAdd(a, b), no);
}

template <class V, class M>
HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
  return IfThenElse(m, SaturatedSub(a, b), no);
}
#endif  // HWY_NATIVE_MASKED_ARITH
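
// Example (illustrative): division that is only valid where the divisor is
// nonzero; `fallback` is a sketch assumption. Masked-off lanes of b are
// internally replaced by 1 before Div, so they never divide by zero.
//   const Mask<decltype(d)> m = Ne(b, Zero(d));
//   const Vec<decltype(d)> q = MaskedDivOr(fallback, m, a, b);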

#if (defined(HWY_NATIVE_ZERO_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ZERO_MASKED_ARITH
#undef HWY_NATIVE_ZERO_MASKED_ARITH
#else
#define HWY_NATIVE_ZERO_MASKED_ARITH
#endif

template <class V, class M>
HWY_API V MaskedMax(M m, V a, V b) {
  return IfThenElseZero(m, Max(a, b));
}

template <class V, class M>
HWY_API V MaskedAdd(M m, V a, V b) {
  return IfThenElseZero(m, Add(a, b));
}

template <class V, class M>
HWY_API V MaskedSub(M m, V a, V b) {
  return IfThenElseZero(m, Sub(a, b));
}

template <class V, class M>
HWY_API V MaskedMul(M m, V a, V b) {
  return IfThenElseZero(m, Mul(a, b));
}

template <class V, class M>
HWY_API V MaskedDiv(M m, V a, V b) {
  return IfThenElseZero(m, Div(a, b));
}

template <class V, class M>
HWY_API V MaskedSaturatedAdd(M m, V a, V b) {
  return IfThenElseZero(m, SaturatedAdd(a, b));
}

template <class V, class M>
HWY_API V MaskedSaturatedSub(M m, V a, V b) {
  return IfThenElseZero(m, SaturatedSub(a, b));
}

template <class V, class M, typename D = DFromV<V>, HWY_IF_I16_D(D)>
HWY_API V MaskedMulFixedPoint15(M m, V a, V b) {
  return IfThenElseZero(m, MulFixedPoint15(a, b));
}

template <class V, class M>
HWY_API V MaskedMulAdd(M m, V mul, V x, V add) {
  return IfThenElseZero(m, MulAdd(mul, x, add));
}

template <class V, class M>
HWY_API V MaskedNegMulAdd(M m, V mul, V x, V add) {
  return IfThenElseZero(m, NegMulAdd(mul, x, add));
}

template <class D, class M, HWY_IF_UI32_D(D),
          class V16 = VFromD<RepartitionToNarrow<D>>>
HWY_API VFromD<D> MaskedWidenMulPairwiseAdd(D d32, M m, V16 a, V16 b) {
  return IfThenElseZero(m, WidenMulPairwiseAdd(d32, a, b));
}

template <class DF, class M, HWY_IF_F32_D(DF), class VBF>
HWY_API VFromD<DF> MaskedWidenMulPairwiseAdd(DF df, M m, VBF a, VBF b) {
  return IfThenElseZero(m, WidenMulPairwiseAdd(df, a, b));
}
#endif  // HWY_NATIVE_ZERO_MASKED_ARITH

// ------------------------------ MaskedShift
template <int kShift, class V, class M>
HWY_API V MaskedShiftLeft(M m, V a) {
  return IfThenElseZero(m, ShiftLeft<kShift>(a));
}

template <int kShift, class V, class M>
HWY_API V MaskedShiftRight(M m, V a) {
  return IfThenElseZero(m, ShiftRight<kShift>(a));
}

template <int kShift, class V, class M>
HWY_API V MaskedShiftRightOr(V no, M m, V a) {
  return IfThenElse(m, ShiftRight<kShift>(a), no);
}

template <class V, class M>
HWY_API V MaskedShrOr(V no, M m, V a, V shifts) {
  return IfThenElse(m, Shr(a, shifts), no);
}

// ------------------------------ MaskedEq etc.
#if (defined(HWY_NATIVE_MASKED_COMP) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASKED_COMP
#undef HWY_NATIVE_MASKED_COMP
#else
#define HWY_NATIVE_MASKED_COMP
#endif

template <class V, class M>
HWY_API auto MaskedEq(M m, V a, V b) -> decltype(a == b) {
  return And(m, Eq(a, b));
}

template <class V, class M>
HWY_API auto MaskedNe(M m, V a, V b) -> decltype(a == b) {
  return And(m, Ne(a, b));
}

template <class V, class M>
HWY_API auto MaskedLt(M m, V a, V b) -> decltype(a == b) {
  return And(m, Lt(a, b));
}

template <class V, class M>
HWY_API auto MaskedGt(M m, V a, V b) -> decltype(a == b) {
  return And(m, Gt(a, b));
}

template <class V, class M>
HWY_API auto MaskedLe(M m, V a, V b) -> decltype(a == b) {
  return And(m, Le(a, b));
}

template <class V, class M>
HWY_API auto MaskedGe(M m, V a, V b) -> decltype(a == b) {
  return And(m, Ge(a, b));
}

template <class V, class M, class D = DFromV<V>>
HWY_API MFromD<D> MaskedIsNaN(const M m, const V v) {
  return And(m, IsNaN(v));
}
#endif  // HWY_NATIVE_MASKED_COMP

// ------------------------------ IfNegativeThenNegOrUndefIfZero

#if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#else
#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
  // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE.
  const auto zero = Zero(DFromV<V>());
  return MaskedSubOr(v, Lt(mask, zero), zero, v);
#else
  return IfNegativeThenElse(mask, Neg(v), v);
#endif
}

#endif  // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
  return CopySign(v, Xor(mask, v));
}

// ------------------------------ SaturatedNeg

#if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
#undef HWY_NATIVE_SATURATED_NEG_8_16_32
#else
#define HWY_NATIVE_SATURATED_NEG_8_16_32
#endif

template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
          HWY_IF_SIGNED_V(V)>
HWY_API V SaturatedNeg(V v) {
  const DFromV<decltype(v)> d;
  return SaturatedSub(Zero(d), v);
}

template <class V, HWY_IF_I32(TFromV<V>)>
HWY_API V SaturatedNeg(V v) {
  const DFromV<decltype(v)> d;

#if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_PPC || HWY_TARGET_IS_SVE || \
    HWY_TARGET_IS_NEON
  // RVV/PPC/SVE/NEON have native I32 SaturatedSub instructions.
  return SaturatedSub(Zero(d), v);
#else
  // ~v[i] - ((v[i] > LimitsMin<int32_t>()) ? -1 : 0) is equivalent to
  // (v[i] > LimitsMin<int32_t>()) ? (-v[i]) : LimitsMax<int32_t>() since
  // -v[i] == ~v[i] + 1 == ~v[i] - (-1) and
  // ~LimitsMin<int32_t>() == LimitsMax<int32_t>().
  return Sub(Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin<int32_t>()))));
#endif
}
#endif  // HWY_NATIVE_SATURATED_NEG_8_16_32

#if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_NEG_64
#undef HWY_NATIVE_SATURATED_NEG_64
#else
#define HWY_NATIVE_SATURATED_NEG_64
#endif

template <class V, HWY_IF_I64(TFromV<V>)>
HWY_API V SaturatedNeg(V v) {
#if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_SVE || HWY_TARGET_IS_NEON
  // RVV/SVE/NEON have native I64 SaturatedSub instructions.
  const DFromV<decltype(v)> d;
  return SaturatedSub(Zero(d), v);
#else
  const auto neg_v = Neg(v);
  return Add(neg_v, BroadcastSignBit(And(v, neg_v)));
#endif
}
#endif  // HWY_NATIVE_SATURATED_NEG_64
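
// Example (worked, int8_t lanes): SaturatedNeg(Set(d, -128)) returns +127
// instead of the wrapped -128 that plain Neg would produce; every other
// input is negated exactly.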

// ------------------------------ SaturatedAbs

#if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_ABS
#undef HWY_NATIVE_SATURATED_ABS
#else
#define HWY_NATIVE_SATURATED_ABS
#endif

template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V SaturatedAbs(V v) {
  return Max(v, SaturatedNeg(v));
}

#endif

// ------------------------------ MaskedAbsOr
template <class V, HWY_IF_SIGNED_V(V), class M>
HWY_API V MaskedAbsOr(V no, M m, V v) {
  return IfThenElse(m, Abs(v), no);
}

// ------------------------------ MaskedAbs
template <class V, HWY_IF_SIGNED_V(V), class M>
HWY_API V MaskedAbs(M m, V v) {
  return IfThenElseZero(m, Abs(v));
}

// ------------------------------ Reductions

// Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is
// toggled, they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via
// Set. Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero to most of the
// SumOfLanes overloads. For the latter group, we here define the remaining
// overloads, plus ReduceSum which uses them plus GetLane.
#if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_SCALAR
#undef HWY_NATIVE_REDUCE_SCALAR
#else
#define HWY_NATIVE_REDUCE_SCALAR
#endif

namespace detail {

// Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes.
struct AddFunc {
  template <class V>
  V operator()(V a, V b) const {
    return Add(a, b);
  }
};

struct MinFunc {
  template <class V>
  V operator()(V a, V b) const {
    return Min(a, b);
  }
};

struct MaxFunc {
  template <class V>
  V operator()(V a, V b) const {
    return Max(a, b);
  }
};

// No-op for vectors of at most one block.
template <class D, class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func, VFromD<D> v) {
  return v;
}

// Reduces a lane with its counterpart in other block(s). Shared by AVX2 and
// WASM_EMU256. AVX3 has its own overload.
template <class D, class Func, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D /*d*/, Func f, VFromD<D> v) {
  return f(v, SwapAdjacentBlocks(v));
}

// These return the reduction result broadcasted across all lanes. They assume
// the caller has already reduced across blocks.

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v10) {
  return f(v10, Reverse2(d, v10));
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v3210) {
  const VFromD<D> v0123 = Reverse4(d, v3210);
  const VFromD<D> v03_12_12_03 = f(v3210, v0123);
  const VFromD<D> v12_03_03_12 = Reverse2(d, v03_12_12_03);
  return f(v03_12_12_03, v12_03_03_12);
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v76543210) {
  // The upper half is reversed from the lower half; omit for brevity.
  const VFromD<D> v34_25_16_07 = f(v76543210, Reverse8(d, v76543210));
  const VFromD<D> v0347_1625_1625_0347 =
      f(v34_25_16_07, Reverse4(d, v34_25_16_07));
  return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_U8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
  const RepartitionToWide<decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  const VW vw = BitCast(dw, v);
  // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
  const VW even = And(vw, Set(dw, 0xFF));
  const VW odd = ShiftRight<8>(vw);
  const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
#if HWY_IS_LITTLE_ENDIAN
  return DupEven(BitCast(d, reduced));
#else
  return DupOdd(BitCast(d, reduced));
#endif
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_I8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
  const RepartitionToWide<decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  const VW vw = BitCast(dw, v);
  // Sign-extend.
  // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
  const VW even = ShiftRight<8>(ShiftLeft<8>(vw));
  const VW odd = ShiftRight<8>(vw);
  const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
#if HWY_IS_LITTLE_ENDIAN
  return DupEven(BitCast(d, reduced));
#else
  return DupOdd(BitCast(d, reduced));
#endif
}

}  // namespace detail

template <class D, HWY_IF_SUM_OF_LANES_D(D)>
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  const detail::AddFunc f;
  v = detail::ReduceAcrossBlocks(d, f, v);
  return detail::ReduceWithinBlocks(d, f, v);
}
template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
  const detail::MinFunc f;
  v = detail::ReduceAcrossBlocks(d, f, v);
  return detail::ReduceWithinBlocks(d, f, v);
}
template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
  const detail::MaxFunc f;
  v = detail::ReduceAcrossBlocks(d, f, v);
  return detail::ReduceWithinBlocks(d, f, v);
}

template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
  return GetLane(SumOfLanes(d, v));
}
template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
  return GetLane(MinOfLanes(d, v));
}
template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
  return GetLane(MaxOfLanes(d, v));
}

#endif  // HWY_NATIVE_REDUCE_SCALAR
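
// Example (worked, four int32_t lanes {1, 2, 3, 4}): SumOfLanes broadcasts 10
// to every lane, while ReduceSum returns the scalar 10; ReduceMin and
// ReduceMax return 1 and 4.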

// Corner cases for both generic and native implementations:
// N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm)
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
  return GetLane(v);
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceMin(D /*d*/, VFromD<D> v) {
  return GetLane(v);
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API TFromD<D> ReduceMax(D /*d*/, VFromD<D> v) {
  return GetLane(v);
}

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
  return v;
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
  return v;
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
  return v;
}

// N=4 for 8-bit is still less than the minimum native size.

// Armv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4
// I8/U8 ReduceSum operations.
#if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
#else
#define HWY_NATIVE_REDUCE_SUM_4_UI8
#endif

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
  const Twice<RepartitionToWide<decltype(d)>> dw;
  return static_cast<TFromD<D>>(ReduceSum(dw, PromoteTo(dw, v)));
}
#endif  // HWY_NATIVE_REDUCE_SUM_4_UI8

// RVV/SVE have target-specific implementations of the N=4 I8/U8
// ReduceMin/ReduceMax operations.
#if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
#else
#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
#endif
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
  const Twice<RepartitionToWide<decltype(d)>> dw;
  return static_cast<TFromD<D>>(ReduceMin(dw, PromoteTo(dw, v)));
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
  const Twice<RepartitionToWide<decltype(d)>> dw;
  return static_cast<TFromD<D>>(ReduceMax(dw, PromoteTo(dw, v)));
}
#endif  // HWY_NATIVE_REDUCE_MINMAX_4_UI8

#if (defined(HWY_NATIVE_MASKED_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASKED_REDUCE_SCALAR
#undef HWY_NATIVE_MASKED_REDUCE_SCALAR
#else
#define HWY_NATIVE_MASKED_REDUCE_SCALAR
#endif

template <class D, class M>
HWY_API TFromD<D> MaskedReduceSum(D d, M m, VFromD<D> v) {
  return ReduceSum(d, IfThenElseZero(m, v));
}
template <class D, class M>
HWY_API TFromD<D> MaskedReduceMin(D d, M m, VFromD<D> v) {
  return ReduceMin(
      d, IfThenElse(m, v, Set(d, hwy::PositiveInfOrHighestValue<TFromD<D>>())));
}
template <class D, class M>
HWY_API TFromD<D> MaskedReduceMax(D d, M m, VFromD<D> v) {
  return ReduceMax(
      d, IfThenElse(m, v, Set(d, hwy::NegativeInfOrLowestValue<TFromD<D>>())));
}

#endif  // HWY_NATIVE_MASKED_REDUCE_SCALAR

// ------------------------------ IsEitherNaN
#if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IS_EITHER_NAN
#undef HWY_NATIVE_IS_EITHER_NAN
#else
#define HWY_NATIVE_IS_EITHER_NAN
#endif

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API MFromD<DFromV<V>> IsEitherNaN(V a, V b) {
  return Or(IsNaN(a), IsNaN(b));
}

#endif  // HWY_NATIVE_IS_EITHER_NAN

// ------------------------------ IsInf, IsFinite

// AVX3 has target-specific implementations of these.
#if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ISINF
#undef HWY_NATIVE_ISINF
#else
#define HWY_NATIVE_ISINF
#endif

template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsInf(const V v) {
  using T = TFromD<D>;
  const D d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(
      d,
      Eq(Add(vu, vu),
         Set(du, static_cast<MakeUnsigned<T>>(hwy::MaxExponentTimes2<T>()))));
}

// Returns whether normal/subnormal/zero.
template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsFinite(const V v) {
  using T = TFromD<D>;
  const D d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code
  // for AVX2 if we instead add vu + vu.
#if HWY_COMPILER_MSVC
  const VFromD<decltype(du)> shl = ShiftLeft<1>(vu);
#else
  const VFromD<decltype(du)> shl = Add(vu, vu);
#endif

  // Then shift right so we can compare with the max exponent (cannot compare
  // with MaxExponentTimes2 directly because it is negative and non-negative
  // floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(shl));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

#endif  // HWY_NATIVE_ISINF
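
// Example (worked, float): 1.0f is 0x3F800000, so Add(vu, vu) yields
// 0x7F000000 and shifting right by MantissaBits+1 = 24 leaves exponent 0x7F,
// which is less than MaxExponentField (0xFF), hence IsFinite is true. For
// +inf (0x7F800000), the doubled bits are 0xFF000000 == MaxExponentTimes2,
// hence IsInf is true.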

// ------------------------------ CeilInt/FloorInt
#if (defined(HWY_NATIVE_CEIL_FLOOR_INT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_CEIL_FLOOR_INT
#undef HWY_NATIVE_CEIL_FLOOR_INT
#else
#define HWY_NATIVE_CEIL_FLOOR_INT
#endif

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return ConvertTo(di, Ceil(v));
}

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return ConvertTo(di, Floor(v));
}

#endif  // HWY_NATIVE_CEIL_FLOOR_INT

// ------------------------------ MulByPow2/MulByFloorPow2

#if (defined(HWY_NATIVE_MUL_BY_POW2) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MUL_BY_POW2
#undef HWY_NATIVE_MUL_BY_POW2
#else
#define HWY_NATIVE_MUL_BY_POW2
#endif

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V MulByPow2(V v, VFromD<RebindToSigned<DFromV<V>>> exp) {
  const DFromV<decltype(v)> df;
  const RebindToUnsigned<decltype(df)> du;
  const RebindToSigned<decltype(df)> di;

  using TF = TFromD<decltype(df)>;
  using TI = TFromD<decltype(di)>;
  using TU = TFromD<decltype(du)>;

  using VF = VFromD<decltype(df)>;
  using VI = VFromD<decltype(di)>;

  constexpr TI kMaxBiasedExp = MaxExponentField<TF>();
  static_assert(kMaxBiasedExp > 0, "kMaxBiasedExp > 0 must be true");

  constexpr TI kExpBias = static_cast<TI>(kMaxBiasedExp >> 1);
  static_assert(kExpBias > 0, "kExpBias > 0 must be true");
  static_assert(kExpBias <= LimitsMax<TI>() / 3,
                "kExpBias <= LimitsMax<TI>() / 3 must be true");

#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4
  using TExpMinMax = If<(sizeof(TI) <= 4), TI, int32_t>;
#elif (HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2) || \
    HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256
  using TExpMinMax = int16_t;
#else
  using TExpMinMax = TI;
#endif

#if HWY_TARGET == HWY_EMU128 || HWY_TARGET == HWY_SCALAR
  using TExpSatSub = TU;
#elif HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
    HWY_TARGET == HWY_WASM_EMU256
  using TExpSatSub = If<(sizeof(TF) == 4), uint8_t, uint16_t>;
#elif HWY_TARGET_IS_PPC
  using TExpSatSub = If<(sizeof(TF) >= 4), uint32_t, TU>;
#else
  using TExpSatSub = If<(sizeof(TF) == 4), uint8_t, TU>;
#endif

  static_assert(kExpBias <= static_cast<TI>(LimitsMax<TExpMinMax>() / 3),
                "kExpBias <= LimitsMax<TExpMinMax>() / 3 must be true");

  const Repartition<TExpMinMax, decltype(df)> d_exp_min_max;
  const Repartition<TExpSatSub, decltype(df)> d_sat_exp_sub;

  constexpr int kNumOfExpBits = ExponentBits<TF>();
  constexpr int kNumOfMantBits = MantissaBits<TF>();

  // The sign bit of BitCastScalar<TU>(a[i]) >> kNumOfMantBits can be zeroed
  // out using SaturatedSub if kZeroOutSignUsingSatSub is true.

  // If kZeroOutSignUsingSatSub is true, then val_for_exp_sub will be bitcast
  // to a vector that has a smaller lane size than TU for the SaturatedSub
  // operation below.
  constexpr bool kZeroOutSignUsingSatSub =
      ((sizeof(TExpSatSub) * 8) == static_cast<size_t>(kNumOfExpBits));

  // If kZeroOutSignUsingSatSub is true, then the upper
  // (sizeof(TU) - sizeof(TExpSatSub)) * 8 bits of kExpDecrBy1Bits will be all
  // ones and the lower sizeof(TExpSatSub) * 8 bits of kExpDecrBy1Bits will be
  // equal to 1.

  // Otherwise, if kZeroOutSignUsingSatSub is false, kExpDecrBy1Bits will be
  // equal to 1.
  constexpr TU kExpDecrBy1Bits = static_cast<TU>(
      TU{1} - (static_cast<TU>(kZeroOutSignUsingSatSub) << kNumOfExpBits));

  VF val_for_exp_sub = v;
  HWY_IF_CONSTEXPR(!kZeroOutSignUsingSatSub) {
    // If kZeroOutSignUsingSatSub is not true, zero out the sign bit of
    // val_for_exp_sub[i] using Abs.
    val_for_exp_sub = Abs(val_for_exp_sub);
  }

  // min_exp1_plus_min_exp2[i] is the smallest exponent such that
  // min_exp1_plus_min_exp2[i] >= 2 - kExpBias * 2 and
  // std::ldexp(v[i], min_exp1_plus_min_exp2[i]) is a normal floating-point
  // number if v[i] is a normal number.
  const VI min_exp1_plus_min_exp2 = BitCast(
      di,
      Max(BitCast(
              d_exp_min_max,
              Neg(BitCast(
                  di,
                  SaturatedSub(
                      BitCast(d_sat_exp_sub, ShiftRight<kNumOfMantBits>(
                                                 BitCast(du, val_for_exp_sub))),
                      BitCast(d_sat_exp_sub, Set(du, kExpDecrBy1Bits)))))),
          BitCast(d_exp_min_max,
                  Set(di, static_cast<TI>(2 - kExpBias - kExpBias)))));

  const VI clamped_exp =
      Max(Min(exp, Set(di, static_cast<TI>(kExpBias * 3))),
          Add(min_exp1_plus_min_exp2, Set(di, static_cast<TI>(1 - kExpBias))));

  const VI exp1_plus_exp2 = BitCast(
      di, Max(Min(BitCast(d_exp_min_max,
                          Sub(clamped_exp, ShiftRight<2>(clamped_exp))),
                  BitCast(d_exp_min_max,
                          Set(di, static_cast<TI>(kExpBias + kExpBias)))),
              BitCast(d_exp_min_max, min_exp1_plus_min_exp2)));

  const VI exp1 = ShiftRight<1>(exp1_plus_exp2);
  const VI exp2 = Sub(exp1_plus_exp2, exp1);
  const VI exp3 = Sub(clamped_exp, exp1_plus_exp2);

  const VI exp_bias = Set(di, kExpBias);

  const VF factor1 =
      BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp1, exp_bias)));
  const VF factor2 =
      BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp2, exp_bias)));
  const VF factor3 =
      BitCast(df, ShiftLeft<kNumOfMantBits>(Add(exp3, exp_bias)));

  return Mul(Mul(Mul(v, factor1), factor2), factor3);
}
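
// Design note: MulByPow2 splits the scale 2^exp into three factors
// 2^exp1 * 2^exp2 * 2^exp3, each with a representable exponent field. This is
// the usual ldexp strategy: applying the scale in steps keeps intermediate
// products from overflowing or flushing to zero while scaling into or out of
// the subnormal range, at the cost of three multiplies.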

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V MulByFloorPow2(V v, V exp) {
  const DFromV<decltype(v)> df;

  // MulByFloorPow2 special cases:
  // MulByFloorPow2(v, NaN) => NaN
  // MulByFloorPow2(0, inf) => NaN
  // MulByFloorPow2(inf, -inf) => NaN
  // MulByFloorPow2(-inf, -inf) => NaN
  const auto is_special_case_with_nan_result =
      Or(IsNaN(exp),
         And(Eq(Abs(v), IfNegativeThenElseZero(exp, Inf(df))), IsInf(exp)));

  return IfThenElse(is_special_case_with_nan_result, NaN(df),
                    MulByPow2(v, FloorInt(exp)));
}

#endif  // HWY_NATIVE_MUL_BY_POW2

// ------------------------------ GetBiasedExponent
#if (defined(HWY_NATIVE_GET_BIASED_EXPONENT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_GET_BIASED_EXPONENT
#undef HWY_NATIVE_GET_BIASED_EXPONENT
#else
#define HWY_NATIVE_GET_BIASED_EXPONENT
#endif

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API VFromD<RebindToUnsigned<DFromV<V>>> GetBiasedExponent(V v) {
  using T = TFromV<V>;

  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr int kNumOfMantBits = MantissaBits<T>();
  return ShiftRight<kNumOfMantBits>(BitCast(du, Abs(v)));
}

#endif

// ------------------------------ GetExponent

#if (defined(HWY_NATIVE_GET_EXPONENT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_GET_EXPONENT
#undef HWY_NATIVE_GET_EXPONENT
#else
#define HWY_NATIVE_GET_EXPONENT
#endif

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V GetExponent(V v) {
  const DFromV<V> d;
  using T = TFromV<V>;
  const RebindToSigned<decltype(d)> di;

  const auto exponent_offset = Set(di, MaxExponentField<T>() >> 1);

  // Extract the exponent bits as an integer.
  const auto encoded_exponent = GetBiasedExponent(v);
  const auto exponent_int = Sub(BitCast(di, encoded_exponent), exponent_offset);

  // Convert the integer back to the original type.
  return ConvertTo(d, exponent_int);
}

#endif  // HWY_NATIVE_GET_EXPONENT
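
// Example (worked, float): the biased exponent field of 8.0f is 130 and the
// bias is 127, so GetBiasedExponent(Set(d, 8.0f)) yields 130 per lane and
// GetExponent(Set(d, 8.0f)) returns 3.0f; for 0.5f, GetExponent returns
// -1.0f.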

// ------------------------------ LoadInterleaved2

#if HWY_IDE || \
    (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

template <class D, HWY_IF_LANES_GT_D(D, 1)>
HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1) {
  const VFromD<D> A = LoadU(d, unaligned);  // v1[1] v0[1] v1[0] v0[0]
  const VFromD<D> B = LoadU(d, unaligned + Lanes(d));
  v0 = ConcatEven(d, B, A);
  v1 = ConcatOdd(d, B, A);
}

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
}
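
// Example (illustrative): de-interleaving pixel pairs stored as
// R0 G0 R1 G1 ...; `interleaved` is a sketch assumption.
//   Vec<decltype(d)> r, g;
//   LoadInterleaved2(d, interleaved, r, g);
// Afterwards r holds R0 R1 ... and g holds G0 G1 ...; LoadInterleaved3/4
// below extend this to RGB and RGBA layouts.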

// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)

namespace detail {

#if HWY_IDE
template <class V>
HWY_INLINE V ShuffleTwo1230(V a, V /* b */) {
  return a;
}
template <class V>
HWY_INLINE V ShuffleTwo2301(V a, V /* b */) {
  return a;
}
template <class V>
HWY_INLINE V ShuffleTwo3012(V a, V /* b */) {
  return a;
}
#endif  // HWY_IDE

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks3(D d,
                                      const TFromD<D>* HWY_RESTRICT unaligned,
                                      VFromD<D>& A, VFromD<D>& B,
                                      VFromD<D>& C) {
  constexpr size_t kN = MaxLanes(d);
  A = LoadU(d, unaligned + 0 * kN);
  B = LoadU(d, unaligned + 1 * kN);
  C = LoadU(d, unaligned + 2 * kN);
}

}  // namespace detail

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  using V = VFromD<D>;
  using VU = VFromD<decltype(du)>;
  // Compact notation so these fit on one line: 12 := v1[2].
  V A;  // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
  V B;  // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
  V C;  // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  constexpr uint8_t Z = 0x80;
  const VU idx_v0A =
      Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v0B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z);
  const VU idx_v0C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13);
  const VU idx_v1A =
      Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v1B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z);
  const VU idx_v1C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14);
  const VU idx_v2A =
      Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v2B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z);
  const VU idx_v2C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15);
  const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}
// 16-bit lanes x8
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(du)> du8;
  using V = VFromD<D>;
  using VU8 = VFromD<decltype(du8)>;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
  // but each element of the array contains a byte index for a byte of a lane.
  constexpr uint8_t Z = 0x80;
  const VU8 idx_v0A = Dup128VecFromValues(du8, 0x00, 0x01, 0x06, 0x07, 0x0C,
                                          0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v0B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x02, 0x03,
                                          0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z);
  const VU8 idx_v0C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          Z, 0x04, 0x05, 0x0A, 0x0B);
  const VU8 idx_v1A = Dup128VecFromValues(du8, 0x02, 0x03, 0x08, 0x09, 0x0E,
                                          0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v1B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x04, 0x05,
                                          0x0A, 0x0B, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v1C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D);
  const VU8 idx_v2A = Dup128VecFromValues(du8, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z,
                                          Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v2B = Dup128VecFromValues(du8, Z, Z, Z, Z, 0x00, 0x01, 0x06,
                                          0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v2C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F);
  const V v0L = TableLookupBytesOr0(A, BitCast(d, idx_v0A));
  const V v0M = TableLookupBytesOr0(B, BitCast(d, idx_v0B));
  const V v0U = TableLookupBytesOr0(C, BitCast(d, idx_v0C));
  const V v1L = TableLookupBytesOr0(A, BitCast(d, idx_v1A));
  const V v1M = TableLookupBytesOr0(B, BitCast(d, idx_v1B));
  const V v1U = TableLookupBytesOr0(C, BitCast(d, idx_v1C));
  const V v2L = TableLookupBytesOr0(A, BitCast(d, idx_v2A));
  const V v2M = TableLookupBytesOr0(B, BitCast(d, idx_v2B));
  const V v2U = TableLookupBytesOr0(C, BitCast(d, idx_v2C));
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}
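// Note on the 16-bit overload above: TableLookupBytesOr0 indexes bytes, so
// each 16-bit lane is selected by two consecutive byte indices; source lane k
// contributes bytes 2*k and 2*k+1. For example, lane 3 is selected by the
// index pair 0x06, 0x07.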
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  using V = VFromD<D>;
  V A;  // v0[1] v2[0] v1[0] v0[0]
  V B;  // v1[2] v0[2] v2[1] v1[1]
  V C;  // v2[3] v1[3] v0[3] v2[2]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);

  const V vxx_02_03_xx = OddEven(C, B);
  v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx);

  // Shuffle2301 takes the upper/lower halves of the output from one input, so
  // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
  // OddEven because it may have higher throughput than Shuffle.
  const V vxx_xx_10_11 = OddEven(A, B);
  const V v12_13_xx_xx = OddEven(B, C);
  v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx);

  const V vxx_20_21_xx = OddEven(B, A);
  v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C);
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  VFromD<D> A;  // v1[0] v0[0]
  VFromD<D> B;  // v0[1] v2[0]
  VFromD<D> C;  // v2[1] v1[1]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  v0 = OddEven(B, A);
  v1 = CombineShiftRightBytes<sizeof(TFromD<D>)>(d, C, A);
  v2 = OddEven(C, B);
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
}

// ------------------------------ LoadInterleaved4

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks4(D d,
                                      const TFromD<D>* HWY_RESTRICT unaligned,
                                      VFromD<D>& vA, VFromD<D>& vB,
                                      VFromD<D>& vC, VFromD<D>& vD) {
  constexpr size_t kN = MaxLanes(d);
  vA = LoadU(d, unaligned + 0 * kN);
  vB = LoadU(d, unaligned + 1 * kN);
  vC = LoadU(d, unaligned + 2 * kN);
  vD = LoadU(d, unaligned + 3 * kN);
}

}  // namespace detail
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  const Repartition<uint64_t, decltype(d)> d64;
  using V64 = VFromD<decltype(d64)>;
  using V = VFromD<D>;
  // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  // Here int[i] means the four interleaved values of the i-th 4-tuple and
  // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
  V vA;  // int[13..10] int[3..0]
  V vB;  // int[17..14] int[7..4]
  V vC;  // int[1b..18] int[b..8]
  V vD;  // int[1f..1c] int[f..c]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);

  // For brevity, the comments only list the lower block (upper = lower + 0x10)
  const V v5140 = InterleaveLower(d, vA, vB);  // int[5,1,4,0]
  const V vd9c8 = InterleaveLower(d, vC, vD);  // int[d,9,c,8]
  const V v7362 = InterleaveUpper(d, vA, vB);  // int[7,3,6,2]
  const V vfbea = InterleaveUpper(d, vC, vD);  // int[f,b,e,a]

  const V v6420 = InterleaveLower(d, v5140, v7362);  // int[6,4,2,0]
  const V veca8 = InterleaveLower(d, vd9c8, vfbea);  // int[e,c,a,8]
  const V v7531 = InterleaveUpper(d, v5140, v7362);  // int[7,5,3,1]
  const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea);  // int[f,d,b,9]

  const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531));  // v10[7..0]
  const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9));  // v10[f..8]
  const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531));  // v32[7..0]
  const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9));  // v32[f..8]

  v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
  v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
  v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
  v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
}
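// Note on the overload above: the rounds of InterleaveLower/Upper followed by
// the 64-bit interleaves implement an in-register transpose of the four
// loaded blocks; each round brings lanes belonging to the same output vector
// closer together until they end up contiguous.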
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  // In the last step, we interleave by half of the block size, which is
  // usually 8 bytes but half that for 8-bit x8 vectors.
  using TW = hwy::UnsignedFromSize<d.MaxBytes() == 8 ? 4 : 8>;
  const Repartition<TW, decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;

  // (Comments are for 256-bit vectors.)
  // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  VFromD<D> vA;  // v3210[9]v3210[8] v3210[1]v3210[0]
  VFromD<D> vB;  // v3210[b]v3210[a] v3210[3]v3210[2]
  VFromD<D> vC;  // v3210[d]v3210[c] v3210[5]v3210[4]
  VFromD<D> vD;  // v3210[f]v3210[e] v3210[7]v3210[6]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);

  const VFromD<D> va820 = InterleaveLower(d, vA, vB);  // v3210[a,8] v3210[2,0]
  const VFromD<D> vec64 = InterleaveLower(d, vC, vD);  // v3210[e,c] v3210[6,4]
  const VFromD<D> vb931 = InterleaveUpper(d, vA, vB);  // v3210[b,9] v3210[3,1]
  const VFromD<D> vfd75 = InterleaveUpper(d, vC, vD);  // v3210[f,d] v3210[7,5]

  const VW v10_b830 =  // v10[b..8] v10[3..0]
      BitCast(dw, InterleaveLower(d, va820, vb931));
  const VW v10_fc74 =  // v10[f..c] v10[7..4]
      BitCast(dw, InterleaveLower(d, vec64, vfd75));
  const VW v32_b830 =  // v32[b..8] v32[3..0]
      BitCast(dw, InterleaveUpper(d, va820, vb931));
  const VW v32_fc74 =  // v32[f..c] v32[7..4]
      BitCast(dw, InterleaveUpper(d, vec64, vfd75));

  v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
  v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
  v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
  v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  using V = VFromD<D>;
  V vA;  // v3210[4] v3210[0]
  V vB;  // v3210[5] v3210[1]
  V vC;  // v3210[6] v3210[2]
  V vD;  // v3210[7] v3210[3]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  const V v10e = InterleaveLower(d, vA, vC);  // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
  const V v10o = InterleaveLower(d, vB, vD);  // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
  const V v32e = InterleaveUpper(d, vA, vC);  // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
  const V v32o = InterleaveUpper(d, vB, vD);  // v3[7,5] v2[7,5] v3[3,1] v2[3,1]

  v0 = InterleaveLower(d, v10e, v10o);
  v1 = InterleaveUpper(d, v10e, v10o);
  v2 = InterleaveLower(d, v32e, v32o);
  v3 = InterleaveUpper(d, v32e, v32o);
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  VFromD<D> vA, vB, vC, vD;
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  v0 = InterleaveLower(d, vA, vC);
  v1 = InterleaveUpper(d, vA, vC);
  v2 = InterleaveLower(d, vB, vD);
  v3 = InterleaveUpper(d, vB, vD);
}

// Any T x1
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
}
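// Example (illustrative, hypothetical names): deinterleaving packed RGBA
// bytes into planar channels with the overloads above.
//
//   const ScalableTag<uint8_t> d;
//   Vec<decltype(d)> r, g, b, a;
//   LoadInterleaved4(d, rgba, r, g, b, a);  // reads 4 * Lanes(d) bytes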
// ------------------------------ StoreInterleaved2

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks2(VFromD<D> A, VFromD<D> B, D d,
                                       TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kN = MaxLanes(d);
  StoreU(A, d, unaligned + 0 * kN);
  StoreU(B, d, unaligned + 1 * kN);
}

}  // namespace detail

// >= 128-bit vector
template <class D, HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const auto v10L = InterleaveLower(d, v0, v1);  // .. v1[0] v0[0]
  const auto v10U = InterleaveUpper(d, v0, v1);  // .. v1[kN/2] v0[kN/2]
  detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
}

// <= 64 bits
template <class V, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API void StoreInterleaved2(V part0, V part1, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Twice<decltype(d)> d2;
  const auto v0 = ZeroExtendVector(d2, part0);
  const auto v1 = ZeroExtendVector(d2, part1);
  const auto v10 = InterleaveLower(d2, v0, v1);
  StoreU(v10, d2, unaligned);
}
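// Example (illustrative, hypothetical names): interleaving separate real and
// imaginary arrays into a packed complex buffer.
//
//   const ScalableTag<float> d;
//   const size_t N = Lanes(d);
//   for (size_t i = 0; i + N <= num; i += N) {
//     StoreInterleaved2(LoadU(d, re + i), LoadU(d, im + i), d, out + 2 * i);
//   }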
// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks3(VFromD<D> A, VFromD<D> B, VFromD<D> C,
                                       D d,
                                       TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kN = MaxLanes(d);
  StoreU(A, d, unaligned + 0 * kN);
  StoreU(B, d, unaligned + 1 * kN);
  StoreU(C, d, unaligned + 2 * kN);
}

}  // namespace detail

// >= 128-bit vector, 8-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  using VU = VFromD<decltype(du)>;
  const VU k5 = Set(du, TU{5});
  const VU k6 = Set(du, TU{6});

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
  // to enable blending by ORing together.
  const VFromD<decltype(du)> shuf_A0 =
      Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3,
                          0x80, 0x80, 4, 0x80, 0x80, 5);
  // Cannot reuse shuf_A0 because it contains 5.
  const VFromD<decltype(du)> shuf_A1 =
      Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
                          0x80, 3, 0x80, 0x80, 4, 0x80, 0x80);
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const VU vA0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const VU vA1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const VU vA2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const VFromD<D> A = BitCast(d, vA0 | vA1 | vA2);

  // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. v2[6],v1[6],v0[6], v2[5],v1[5]
  const VU shuf_B0 = shuf_A2 + k6;  // .A..9..8..7..6..
  const VU shuf_B1 = shuf_A0 + k5;  // A..9..8..7..6..5
  const VU shuf_B2 = shuf_A1 + k5;  // ..9..8..7..6..5.
  const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);

  // C: v2[15],v1[15],v0[15] .. v2[11],v1[11],v0[11], v2[10]
  const VU shuf_C0 = shuf_B2 + k6;  // ..F..E..D..C..B.
  const VU shuf_C1 = shuf_B0 + k5;  // .F..E..D..C..B..
  const VU shuf_C2 = shuf_B1 + k5;  // F..E..D..C..B..A
  const VU vC0 = TableLookupBytesOr0(v0, shuf_C0);
  const VU vC1 = TableLookupBytesOr0(v1, shuf_C1);
  const VU vC2 = TableLookupBytesOr0(v2, shuf_C2);
  const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
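// Note on the shuf_B*/shuf_C* derivation above: adding k5 or k6 advances
// every valid byte index by 5 or 6 positions in the source, yielding the
// shuffles for the next output block without additional constants. The 0x80
// "zero" markers remain invalid because e.g. 0x80 + 6 = 0x86 still has the
// MSB set, so TableLookupBytesOr0 keeps returning zero for those positions.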
// >= 128-bit vector, 16-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
  const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending. Note that these are byte
  // indices for 16-bit lanes.
  const VFromD<decltype(du8)> shuf_A1 =
      Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3,
                          0x80, 0x80, 0x80, 0x80, 4, 5);
  const VFromD<decltype(du8)> shuf_A2 =
      Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80,
                          0x80, 2, 3, 0x80, 0x80, 0x80, 0x80);

  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);

  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
  const VFromD<D> A = BitCast(d, A0 | A1 | A2);

  // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
  const VU8 shuf_B0 = shuf_A1 + k3;  // 5..4..3.
  const VU8 shuf_B1 = shuf_A2 + k3;  // ..4..3..
  const VU8 shuf_B2 = shuf_A0 + k2;  // .4..3..2
  const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);

  // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const VU8 shuf_C0 = shuf_B1 + k3;  // ..7..6..
  const VU8 shuf_C1 = shuf_B2 + k3;  // .7..6..5
  const VU8 shuf_C2 = shuf_B0 + k2;  // 7..6..5.
  const VU8 vC0 = TableLookupBytesOr0(v0, shuf_C0);
  const VU8 vC1 = TableLookupBytesOr0(v1, shuf_C1);
  const VU8 vC2 = TableLookupBytesOr0(v2, shuf_C2);
  const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// >= 128-bit vector, 32-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;

  const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
  const VFromD<D> v01_v20 = OddEven(v0, v2);
  // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
  const VFromD<D> A = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));

  const VFromD<D> v1_321 = ShiftRightLanes<1>(d, v1);
  const VFromD<D> v0_32 = ShiftRightLanes<2>(d, v0);
  const VFromD<D> v21_v11 = OddEven(v2, v1_321);
  const VFromD<D> v12_v02 = OddEven(v1_321, v0_32);
  // B: v1[2],v0[2], v2[1],v1[1]
  const VFromD<D> B = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));

  // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
  const VFromD<D> v23_v13 = OddEven(v2, v1_321);
  const VFromD<D> v03_v22 = OddEven(v0, v2);
  // C: v2[3],v1[3],v0[3], v2[2]
  const VFromD<D> C = BitCast(
      d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// >= 128-bit vector, 64-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const VFromD<D> A = InterleaveLower(d, v0, v1);
  const VFromD<D> B = OddEven(v0, v2);
  const VFromD<D> C = InterleaveUpper(d, v1, v2);
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
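// Example (illustrative, hypothetical names): planar-to-packed RGB using the
// overloads above.
//
//   const ScalableTag<uint8_t> d;
//   const size_t N = Lanes(d);
//   for (size_t i = 0; i + N <= num_pixels; i += N) {
//     StoreInterleaved3(LoadU(d, r + i), LoadU(d, g + i), LoadU(d, b + i), d,
//                       rgb + 3 * i);
//   }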
// 64-bit vector, 8-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and first result.
  constexpr size_t kFullN = 16 / sizeof(TFromD<D>);
  const Full128<uint8_t> du;
  using VU = VFromD<decltype(du)>;
  const Full128<TFromD<D>> d_full;
  const VU k5 = Set(du, uint8_t{5});
  const VU k6 = Set(du, uint8_t{6});

  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,  //
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU shuf_A0 = Load(du, tbl_v0);
  const VU shuf_A1 = Load(du, tbl_v1);  // cannot reuse shuf_A0 (5 in MSB)
  const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const VU A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const VU A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const VU A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned + 0 * kFullN);

  // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const VU shuf_B0 = shuf_A2 + k6;  // ..7..6..
  const VU shuf_B1 = shuf_A0 + k5;  // .7..6..5
  const VU shuf_B2 = shuf_A1 + k5;  // 7..6..5.
  const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B{BitCast(d_full, vB0 | vB1 | vB2).raw};
  StoreU(B, d, unaligned + 1 * kFullN);
}
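// Note on the half-vector overloads above and below: the 64-bit parts are
// viewed as full 128-bit vectors (via the raw member) so that the 128-bit
// table lookups can be reused. The upper 8 bytes of v0..v2 are undefined, but
// any shuffle index that refers to them only affects output bytes beyond the
// 24 that are actually stored.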
// 64-bit vector, 16-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 4)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D dh,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Twice<D> d_full;
  const Full128<uint8_t> du8;
  using VU8 = VFromD<decltype(du8)>;
  const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
  const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});

  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0
  // lanes to their place, with 0x80 so lanes to be filled from other vectors
  // are 0 to enable blending by ORing together.
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
      2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};

  // The interleaved vectors will be named A, B; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU8 shuf_A1 = Load(du8, tbl_v1);  // 2..1..0.
  // .2..1..0
  const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const VU8 shuf_A2 = Load(du8, tbl_v2);  // ..1..0..

  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned);

  // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
  const VU8 shuf_B0 = shuf_A1 + k3;  // ..3.
  const VU8 shuf_B1 = shuf_A2 + k3;  // .3..
  const VU8 shuf_B2 = shuf_A0 + k2;  // 3..2
  const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<decltype(d_full)> B = BitCast(d_full, vB0 | vB1 | vB2);
  StoreU(VFromD<D>{B.raw}, dh, unaligned + MaxLanes(d_full));
}

// 64-bit vector, 32-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_D(D, 2)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // (same code as 128-bit vector, 64-bit lanes)
  const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
  const VFromD<D> v01_v20 = OddEven(v0, v2);
  const VFromD<D> v21_v11 = InterleaveUpper(d, v1, v2);
  constexpr size_t kN = MaxLanes(d);
  StoreU(v10_v00, d, unaligned + 0 * kN);
  StoreU(v01_v20, d, unaligned + 1 * kN);
  StoreU(v21_v11, d, unaligned + 2 * kN);
}

// 64-bit lanes are handled by the N=1 case below.

// <= 32-bit vector, 8-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4),
          HWY_IF_LANES_GT_D(D, 1)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du;
  using VU = VFromD<decltype(du)>;
  const Full128<TFromD<D>> d_full;

  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
  // so lanes to be filled from other vectors are 0 to enable blending by ORing
  // together.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
      0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU shuf_A0 = Load(du, tbl_v0);
  const VU shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
  const VU shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
  const VU A0 = TableLookupBytesOr0(v0, shuf_A0);  // ......3..2..1..0
  const VU A1 = TableLookupBytesOr0(v1, shuf_A1);  // .....3..2..1..0.
  const VU A2 = TableLookupBytesOr0(v2, shuf_A2);  // ....3..2..1..0..
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(A, d_full, buf);
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
}
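// Note on the <= 32-bit overloads above and below: the interleaved result is
// built in a full 128-bit vector, stored to a stack buffer, and then exactly
// 3 * d.MaxBytes() bytes are copied out. This avoids writing past the end of
// the caller's destination, which only has room for three partial vectors.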
// 32-bit vector, 16-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 2)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du8;
  using VU8 = VFromD<decltype(du8)>;
  const Full128<TFromD<D>> d_full;

  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};

  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
  // so lanes to be filled from other vectors are 0 to enable blending by ORing
  // together.
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU8 shuf_A2 = Load(du8, tbl_v2);  // ..1..0..
  const VU8 shuf_A1 =
      CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);  // ...1..0.
  const VU8 shuf_A0 =
      CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);  // ....1..0
  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);  // ..1..0
  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);  // .1..0.
  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);  // 1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(A, d_full, buf);
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
}

// Single-element vector, any lane size: just store directly
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);
}

// ------------------------------ StoreInterleaved4

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks4(VFromD<D> vA, VFromD<D> vB,
                                       VFromD<D> vC, VFromD<D> vD, D d,
                                       TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kN = MaxLanes(d);
  StoreU(vA, d, unaligned + 0 * kN);
  StoreU(vB, d, unaligned + 1 * kN);
  StoreU(vC, d, unaligned + 2 * kN);
  StoreU(vD, d, unaligned + 3 * kN);
}

}  // namespace detail

// >= 128-bit vector, 8..32-bit lanes
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;
  const auto v10L = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
  const auto v32L = ZipLower(dw, v2, v3);
  const auto v10U = ZipUpper(dw, v0, v1);
  const auto v32U = ZipUpper(dw, v2, v3);
  // The interleaved vectors are vA, vB, vC, vD.
  const VFromD<D> vA = BitCast(d, InterleaveLower(dw, v10L, v32L));  // 3210
  const VFromD<D> vB = BitCast(d, InterleaveUpper(dw, v10L, v32L));
  const VFromD<D> vC = BitCast(d, InterleaveLower(dw, v10U, v32U));
  const VFromD<D> vD = BitCast(d, InterleaveUpper(dw, v10U, v32U));
  detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
}
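// Worked trace of the overload above for one 128-bit block of 32-bit lanes
// (lane 0 rightmost): ZipLower/Upper pair lanes of v0/v1 and v2/v3 into wide
// lanes, e.g. v10L = v1[1]:v0[1], v1[0]:v0[0]. Interleaving v10L with v32L
// then yields vA = v3[0],v2[0],v1[0],v0[0], the first output 4-tuple, and
// vB/vC/vD hold the remaining tuples.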
// >= 128-bit vector, 64-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // The interleaved vectors are vA, vB, vC, vD.
  const VFromD<D> vA = InterleaveLower(d, v0, v1);  // v1[0] v0[0]
  const VFromD<D> vB = InterleaveLower(d, v2, v3);
  const VFromD<D> vC = InterleaveUpper(d, v0, v1);
  const VFromD<D> vD = InterleaveUpper(d, v2, v3);
  detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
}

// 64-bit vector, 8..32-bit lanes
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, VFromD<D> part3, D /* tag */,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<TFromD<D>> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  const VFromD<decltype(d_full)> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
  const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
  StoreU(A, d_full, unaligned);
  StoreU(B, d_full, unaligned + MaxLanes(d_full));
}

// 64-bit vector, 64-bit lane
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, VFromD<D> part3, D /* tag */,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<TFromD<D>> d_full;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  const VFromD<decltype(d_full)> v3{part3.raw};
  const auto A = InterleaveLower(d_full, v0, v1);  // v1[0] v0[0]
  const auto B = InterleaveLower(d_full, v2, v3);
  StoreU(A, d_full, unaligned);
  StoreU(B, d_full, unaligned + MaxLanes(d_full));
}
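// Example (illustrative, hypothetical names): planar-to-packed RGBA.
//
//   const ScalableTag<uint8_t> d;
//   StoreInterleaved4(LoadU(d, r), LoadU(d, g), LoadU(d, b), LoadU(d, a), d,
//                     rgba);  // writes 4 * Lanes(d) bytes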
// <= 32-bit vectors
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, VFromD<D> part3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<TFromD<D>> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  const VFromD<decltype(d_full)> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(v3210, d_full, buf);
  CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
}

#endif  // HWY_NATIVE_LOAD_STORE_INTERLEAVED

// ------------------------------ PairwiseAdd/PairwiseSub
#if (defined(HWY_NATIVE_PAIRWISE_ADD) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PAIRWISE_ADD
#undef HWY_NATIVE_PAIRWISE_ADD
#else
#define HWY_NATIVE_PAIRWISE_ADD
#endif

template <class D, class V = VFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
HWY_API V PairwiseAdd(D d, V a, V b) {
  return Add(InterleaveEven(d, a, b), InterleaveOdd(d, a, b));
}

#endif

#if (defined(HWY_NATIVE_PAIRWISE_SUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PAIRWISE_SUB
#undef HWY_NATIVE_PAIRWISE_SUB
#else
#define HWY_NATIVE_PAIRWISE_SUB
#endif

template <class D, class V = VFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
HWY_API V PairwiseSub(D d, V a, V b) {
  return Sub(InterleaveOdd(d, a, b), InterleaveEven(d, a, b));
}

#endif
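// Worked example for PairwiseAdd with four lanes (lane 0 listed first):
//   a = [a0, a1, a2, a3], b = [b0, b1, b2, b3]
//   InterleaveEven(d, a, b) = [a0, b0, a2, b2]
//   InterleaveOdd(d, a, b)  = [a1, b1, a3, b3]
//   PairwiseAdd(d, a, b)    = [a0+a1, b0+b1, a2+a3, b2+b3]
// PairwiseSub analogously yields [a1-a0, b1-b0, a3-a2, b3-b2].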
// Load/StoreInterleaved for special floats. Requires HWY_GENERIC_IF_EMULATED_D
// is defined such that it is true only for types that actually require these
// generic implementations.
#if HWY_IDE || (defined(HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED) == \
                defined(HWY_TARGET_TOGGLE) &&                               \
                defined(HWY_GENERIC_IF_EMULATED_D))
#ifdef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
#endif
#if HWY_IDE
#define HWY_GENERIC_IF_EMULATED_D(D) int
#endif

template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1) {
  const RebindToUnsigned<decltype(d)> du;
  VFromD<decltype(du)> vu0, vu1;
  LoadInterleaved2(du, detail::U16LanePointer(unaligned), vu0, vu1);
  v0 = BitCast(d, vu0);
  v1 = BitCast(d, vu1);
}

template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  VFromD<decltype(du)> vu0, vu1, vu2;
  LoadInterleaved3(du, detail::U16LanePointer(unaligned), vu0, vu1, vu2);
  v0 = BitCast(d, vu0);
  v1 = BitCast(d, vu1);
  v2 = BitCast(d, vu2);
}

template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  const RebindToUnsigned<decltype(d)> du;
  VFromD<decltype(du)> vu0, vu1, vu2, vu3;
  LoadInterleaved4(du, detail::U16LanePointer(unaligned), vu0, vu1, vu2, vu3);
  v0 = BitCast(d, vu0);
  v1 = BitCast(d, vu1);
  v2 = BitCast(d, vu2);
  v3 = BitCast(d, vu3);
}

template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                               T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  StoreInterleaved2(BitCast(du, v0), BitCast(du, v1), du,
                    detail::U16LanePointer(unaligned));
}

template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  StoreInterleaved3(BitCast(du, v0), BitCast(du, v1), BitCast(du, v2), du,
                    detail::U16LanePointer(unaligned));
}

template <class D, HWY_GENERIC_IF_EMULATED_D(D), typename T = TFromD<D>>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  StoreInterleaved4(BitCast(du, v0), BitCast(du, v1), BitCast(du, v2),
                    BitCast(du, v3), du, detail::U16LanePointer(unaligned));
}

#endif  // HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED

// ------------------------------ LoadN

#if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE))

#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif

#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
namespace detail {

template <class DTo, class DFrom>
HWY_INLINE VFromD<DTo> LoadNResizeBitCast(DTo d_to, DFrom d_from,
                                          VFromD<DFrom> v) {
#if HWY_TARGET <= HWY_SSE2
  // On SSE2/SSSE3/SSE4, the LoadU operation will zero out any lanes of v.raw
  // past the first (lowest-index) Lanes(d_from) lanes of v.raw if
  // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true.
  (void)d_from;
  return ResizeBitCast(d_to, v);
#else
  // On other targets such as PPC/NEON, the contents of any lanes past the
  // first (lowest-index) Lanes(d_from) lanes of v.raw might be non-zero if
  // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true.
  return ZeroExtendResizeBitCast(d_to, d_from, v);
#endif
}

}  // namespace detail

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  return (num_lanes > 0) ? LoadU(d, p) : no;
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const FixedTag<TFromD<D>, 1> d1;

  if (num_lanes >= 2) return LoadU(d, p);
  if (num_lanes == 0) return Zero(d);
  return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const FixedTag<TFromD<D>, 1> d1;

  if (num_lanes >= 2) return LoadU(d, p);
  if (num_lanes == 0) return no;
  return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const FixedTag<TFromD<D>, 2> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 4) return LoadU(d, p);
  if (num_lanes == 0) return Zero(d);
  if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));

  // Two or three lanes.
  const VFromD<D> v_lo = detail::LoadNResizeBitCast(d, d2, LoadU(d2, p));
  return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const FixedTag<TFromD<D>, 2> d2;

  if (num_lanes >= 4) return LoadU(d, p);
  if (num_lanes == 0) return no;
  if (num_lanes == 1) return InsertLane(no, 0, p[0]);

  // Two or three lanes.
  const VFromD<D> v_lo =
      ConcatUpperLower(d, no, ResizeBitCast(d, LoadU(d2, p)));
  return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
}
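// Semantics of LoadN/LoadNOr (all overloads in this section): at most
// min(num_lanes, Lanes(d)) lanes are read from p, so reading past
// p + num_lanes cannot fault. Lanes that are not loaded are zero for LoadN
// and taken from the corresponding lane of `no` for LoadNOr. For example,
// with 4 lanes and num_lanes = 3: LoadN yields [p0, p1, p2, 0] and LoadNOr
// yields [p0, p1, p2, no3].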
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const FixedTag<TFromD<D>, 4> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 8) return LoadU(d, p);
  if (num_lanes == 0) return Zero(d);
  if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));

  const size_t leading_len = num_lanes & 4;
  VFromD<decltype(d4)> v_trailing = Zero(d4);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      v_trailing = Combine(
          d4,
          detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
          v_trailing_lo2);
    } else {
      v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
  }

  if (leading_len != 0) {
    return Combine(d, v_trailing, LoadU(d4, p));
  } else {
    return detail::LoadNResizeBitCast(d, d4, v_trailing);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const FixedTag<TFromD<D>, 4> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 8) return LoadU(d, p);
  if (num_lanes == 0) return no;
  if (num_lanes == 1) return InsertLane(no, 0, p[0]);

  const size_t leading_len = num_lanes & 4;
  VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      v_trailing = Combine(
          d4,
          InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
                          ResizeBitCast(d2, no)),
          v_trailing_lo2);
    } else {
      v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
                                    ResizeBitCast(d4, v_trailing_lo2));
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
  }

  if (leading_len != 0) {
    return Combine(d, v_trailing, LoadU(d4, p));
  } else {
    return ConcatUpperLower(d, no, ResizeBitCast(d, v_trailing));
  }
}
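// Worked example for the 8-lane overloads above, num_lanes = 7:
// leading_len = 7 & 4 = 4, so lanes 0..3 come from one 4-lane LoadU. 7 & 2
// loads lanes 4..5 as a 2-lane vector, and 7 & 1 appends lane 6, giving
// v_trailing = [p4, p5, p6, 0] for LoadN. Combine then joins the leading four
// lanes with v_trailing.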
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const FixedTag<TFromD<D>, 8> d8;
  const Half<decltype(d8)> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 16) return LoadU(d, p);
  if (num_lanes == 0) return Zero(d);
  if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));

  const size_t leading_len = num_lanes & 12;
  VFromD<decltype(d4)> v_trailing = Zero(d4);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      v_trailing = Combine(
          d4,
          detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
          v_trailing_lo2);
    } else {
      v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
  }

  if (leading_len != 0) {
    if (leading_len >= 8) {
      const VFromD<decltype(d8)> v_hi7 =
          ((leading_len & 4) != 0)
              ? Combine(d8, v_trailing, LoadU(d4, p + 8))
              : detail::LoadNResizeBitCast(d8, d4, v_trailing);
      return Combine(d, v_hi7, LoadU(d8, p));
    } else {
      return detail::LoadNResizeBitCast(d, d8,
                                        Combine(d8, v_trailing, LoadU(d4, p)));
    }
  } else {
    return detail::LoadNResizeBitCast(d, d4, v_trailing);
  }
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const FixedTag<TFromD<D>, 8> d8;
  const Half<decltype(d8)> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 16) return LoadU(d, p);
  if (num_lanes == 0) return no;
  if (num_lanes == 1) return InsertLane(no, 0, p[0]);

  const size_t leading_len = num_lanes & 12;
  VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      v_trailing = Combine(
          d4,
          InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
                          ResizeBitCast(d2, no)),
          v_trailing_lo2);
    } else {
      v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
                                    ResizeBitCast(d4, v_trailing_lo2));
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
  }

  if (leading_len != 0) {
    if (leading_len >= 8) {
      const VFromD<decltype(d8)> v_hi7 =
          ((leading_len & 4) != 0)
              ? Combine(d8, v_trailing, LoadU(d4, p + 8))
              : ConcatUpperLower(d8, ResizeBitCast(d8, no),
                                 ResizeBitCast(d8, v_trailing));
      return Combine(d, v_hi7, LoadU(d8, p));
    } else {
      return ConcatUpperLower(
          d, ResizeBitCast(d, no),
          ResizeBitCast(d, Combine(d8, v_trailing, LoadU(d4, p))));
    }
  } else {
    const Repartition<uint32_t, D> du32;
    // lowest 4 bytes from v_trailing, next 4 from no.
    const VFromD<decltype(du32)> lo8 =
        InterleaveLower(ResizeBitCast(du32, v_trailing), BitCast(du32, no));
    return ConcatUpperLower(d, ResizeBitCast(d, no), ResizeBitCast(d, lo8));
  }
}

#if HWY_MAX_BYTES >= 32

template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  if (num_lanes >= Lanes(d)) return LoadU(d, p);

  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  if (num_lanes <= half_N) {
    return ZeroExtendVector(d, LoadN(dh, p, num_lanes));
  } else {
    const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
    const VFromD<decltype(dh)> v_hi = LoadN(dh, p + half_N, num_lanes - half_N);
    return Combine(d, v_hi, v_lo);
  }
}

template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  if (num_lanes >= Lanes(d)) return LoadU(d, p);

  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  const VFromD<decltype(dh)> no_h = LowerHalf(no);
  if (num_lanes <= half_N) {
    return ConcatUpperLower(d, no,
                            ResizeBitCast(d, LoadNOr(no_h, dh, p, num_lanes)));
  } else {
    const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
    const VFromD<decltype(dh)> v_hi =
        LoadNOr(no_h, dh, p + half_N, num_lanes - half_N);
    return Combine(d, v_hi, v_lo);
  }
}

#endif  // HWY_MAX_BYTES >= 32

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const RebindToUnsigned<D> du;
  return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
}

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const RebindToUnsigned<D> du;
  return BitCast(
      d, LoadNOr(BitCast(du, no), du, detail::U16LanePointer(p), num_lanes));
}

#else  // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE

// For SVE and non-sanitizer AVX-512; RVV has its own specialization.
template <class D>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
#if HWY_MEM_OPS_MIGHT_FAULT
  if (num_lanes <= 0) return Zero(d);
#endif

  return MaskedLoad(FirstN(d, num_lanes), d, p);
}

template <class D>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
#if HWY_MEM_OPS_MIGHT_FAULT
  if (num_lanes <= 0) return no;
#endif

  return MaskedLoadOr(no, FirstN(d, num_lanes), d, p);
}

#endif  // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
#endif  // HWY_NATIVE_LOAD_N
// ------------------------------ StoreN
#if (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif

#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
namespace detail {

template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
  constexpr size_t kMinShrVectBytes = HWY_TARGET_IS_NEON ? 8 : 16;
  const FixedTag<uint8_t, kMinShrVectBytes> d_shift;
  return ResizeBitCast(
      dh, ShiftRightBytes<dh.MaxBytes()>(d_shift, ResizeBitCast(d_shift, v)));
}

template <class DH, HWY_IF_V_SIZE_GT_D(DH, 4)>
HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
  return UpperHalf(dh, v);
}

}  // namespace detail

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  if (max_lanes_to_store > 0) {
    StoreU(v, d, p);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  if (max_lanes_to_store > 1) {
    StoreU(v, d, p);
  } else if (max_lanes_to_store == 1) {
    const FixedTag<TFromD<D>, 1> d1;
    StoreU(LowerHalf(d1, v), d1, p);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const FixedTag<TFromD<D>, 2> d2;
  const Half<decltype(d2)> d1;

  if (max_lanes_to_store > 1) {
    if (max_lanes_to_store >= 4) {
      StoreU(v, d, p);
    } else {
      StoreU(ResizeBitCast(d2, v), d2, p);
      if (max_lanes_to_store == 3) {
        StoreU(ResizeBitCast(d1, detail::StoreNGetUpperHalf(d2, v)), d1, p + 2);
      }
    }
  } else if (max_lanes_to_store == 1) {
    StoreU(ResizeBitCast(d1, v), d1, p);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const FixedTag<TFromD<D>, 4> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (max_lanes_to_store <= 1) {
    if (max_lanes_to_store == 1) {
      StoreU(ResizeBitCast(d1, v), d1, p);
    }
  } else if (max_lanes_to_store >= 8) {
    StoreU(v, d, p);
  } else if (max_lanes_to_store >= 4) {
    StoreU(LowerHalf(d4, v), d4, p);
    StoreN(detail::StoreNGetUpperHalf(d4, v), d4, p + 4,
           max_lanes_to_store - 4);
  } else {
    StoreN(LowerHalf(d4, v), d4, p, max_lanes_to_store);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const FixedTag<TFromD<D>, 8> d8;
  const Half<decltype(d8)> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (max_lanes_to_store <= 1) {
    if (max_lanes_to_store == 1) {
      StoreU(ResizeBitCast(d1, v), d1, p);
    }
  } else if (max_lanes_to_store >= 16) {
    StoreU(v, d, p);
  } else if (max_lanes_to_store >= 8) {
    StoreU(LowerHalf(d8, v), d8, p);
    StoreN(detail::StoreNGetUpperHalf(d8, v), d8, p + 8,
           max_lanes_to_store - 8);
  } else {
    StoreN(LowerHalf(d8, v), d8, p, max_lanes_to_store);
  }
}
#if HWY_MAX_BYTES >= 32
template <class D, HWY_IF_V_SIZE_GT_D(D, 16), typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const size_t N = Lanes(d);
  if (max_lanes_to_store >= N) {
    StoreU(v, d, p);
    return;
  }

  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  if (max_lanes_to_store <= half_N) {
    StoreN(LowerHalf(dh, v), dh, p, max_lanes_to_store);
  } else {
    StoreU(LowerHalf(dh, v), dh, p);
    StoreN(UpperHalf(dh, v), dh, p + half_N, max_lanes_to_store - half_N);
  }
}
#endif  // HWY_MAX_BYTES >= 32

#else  // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE
template <class D, typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const size_t N = Lanes(d);
  const size_t clamped_max_lanes_to_store = HWY_MIN(max_lanes_to_store, N);
#if HWY_MEM_OPS_MIGHT_FAULT
  if (clamped_max_lanes_to_store == 0) return;
#endif

  BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p);

  detail::MaybeUnpoison(p, clamped_max_lanes_to_store);
}
#endif  // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE

#endif  // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
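// Example (illustrative sketch, hypothetical names): LoadN and StoreN are
// typically paired to process a loop tail without reading or writing past the
// end of the arrays.
//
//   void Double(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
//               size_t count) {
//     const ScalableTag<float> d;
//     const size_t N = Lanes(d);
//     for (size_t i = 0; i < count; i += N) {
//       const size_t remaining = count - i;  // may be < N on the last pass
//       const Vec<decltype(d)> v = LoadN(d, in + i, remaining);
//       StoreN(Add(v, v), d, out + i, remaining);
//     }
//   }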
// ------------------------------ TruncateStore
#if (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_STORE_TRUNCATED
#undef HWY_NATIVE_STORE_TRUNCATED
#else
#define HWY_NATIVE_STORE_TRUNCATED
#endif

template <class D, class T, HWY_IF_T_SIZE_GT_D(D, sizeof(T)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D)>
HWY_API void TruncateStore(VFromD<D> v, const D /*d*/, T* HWY_RESTRICT p) {
  using DTo = Rebind<T, D>;
  DTo dsmall;
  StoreU(TruncateTo(dsmall, v), dsmall, p);
}

#endif  // (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE))

// ------------------------------ Scatter

#if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SCATTER
#undef HWY_NATIVE_SCATTER
#else
#define HWY_NATIVE_SCATTER
#endif

template <class D, typename T = TFromD<D>>
HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base,
                           VFromD<RebindToSigned<D>> offset) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI offset_lanes[MaxLanes(d)];
  Store(offset, di, offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base,
                          VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI index_lanes[MaxLanes(d)];
  Store(index, di, index_lanes);

  for (size_t i = 0; i < MaxLanes(d); ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

template <class D, typename T = TFromD<D>>
HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
                                T* HWY_RESTRICT base,
                                VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI index_lanes[MaxLanes(d)];
  Store(index, di, index_lanes);

  HWY_ALIGN TI mask_lanes[MaxLanes(di)];
  Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);

  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (mask_lanes[i]) base[index_lanes[i]] = lanes[i];
  }
}

template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
                           VFromD<RebindToSigned<D>> index,
                           const size_t max_lanes_to_store) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (i < max_lanes_to_store) base[ExtractLane(index, i)] = ExtractLane(v, i);
  }
}
#else
template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
                           VFromD<RebindToSigned<D>> index,
                           const size_t max_lanes_to_store) {
  MaskedScatterIndex(v, FirstN(d, max_lanes_to_store), d, base, index);
}
#endif  // (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
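// Example for ScatterIndex (illustrative): lane i of v is written to
// base[index[i]]. With 4 lanes, v = [10, 20, 30, 40] and index = [3, 0, 2, 1]
// results in base[0] = 20, base[1] = 40, base[2] = 30, base[3] = 10. If
// indices collide, the generic loops above write lanes in ascending order, so
// the highest colliding lane wins; native scatters may differ.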
base[index_lanes[i]] : T{0}; 3137 } 3138 return Load(d, lanes); 3139 } 3140 3141 template <class D, typename T = TFromD<D>> 3142 HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d, 3143 const T* HWY_RESTRICT base, 3144 VFromD<RebindToSigned<D>> index) { 3145 const RebindToSigned<D> di; 3146 using TI = TFromD<decltype(di)>; 3147 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); 3148 3149 HWY_ALIGN TI index_lanes[MaxLanes(di)]; 3150 Store(index, di, index_lanes); 3151 3152 HWY_ALIGN TI mask_lanes[MaxLanes(di)]; 3153 Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes); 3154 3155 HWY_ALIGN T no_lanes[MaxLanes(d)]; 3156 Store(no, d, no_lanes); 3157 3158 HWY_ALIGN T lanes[MaxLanes(d)]; 3159 for (size_t i = 0; i < MaxLanes(d); ++i) { 3160 HWY_DASSERT(index_lanes[i] >= 0); 3161 lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : no_lanes[i]; 3162 } 3163 return Load(d, lanes); 3164 } 3165 3166 template <class D, typename T = TFromD<D>> 3167 HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base, 3168 VFromD<RebindToSigned<D>> index, 3169 const size_t max_lanes_to_load) { 3170 return GatherIndexNOr(Zero(d), d, base, index, max_lanes_to_load); 3171 } 3172 3173 template <class D, typename T = TFromD<D>> 3174 HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base, 3175 VFromD<RebindToSigned<D>> index, 3176 const size_t max_lanes_to_load) { 3177 const RebindToSigned<D> di; 3178 using TI = TFromD<decltype(di)>; 3179 static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); 3180 3181 VFromD<D> v = no; 3182 for (size_t i = 0; i < MaxLanes(d); ++i) { 3183 if (i < max_lanes_to_load) 3184 v = InsertLane(v, i, base[ExtractLane(index, i)]); 3185 } 3186 return v; 3187 } 3188 #else 3189 template <class D, typename T = TFromD<D>> 3190 HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base, 3191 VFromD<RebindToSigned<D>> index, 3192 const size_t max_lanes_to_load) { 3193 return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index); 3194 } 3195 template <class D, typename T = TFromD<D>> 3196 HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base, 3197 VFromD<RebindToSigned<D>> index, 3198 const size_t max_lanes_to_load) { 3199 return MaskedGatherIndexOr(no, FirstN(d, max_lanes_to_load), d, base, index); 3200 } 3201 #endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE)) 3202 3203 // ------------------------------ Integer AbsDiff and SumsOf8AbsDiff 3204 3205 #if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) 3206 #ifdef HWY_NATIVE_INTEGER_ABS_DIFF 3207 #undef HWY_NATIVE_INTEGER_ABS_DIFF 3208 #else 3209 #define HWY_NATIVE_INTEGER_ABS_DIFF 3210 #endif 3211 3212 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 3213 HWY_API V AbsDiff(V a, V b) { 3214 return Sub(Max(a, b), Min(a, b)); 3215 } 3216 3217 #endif // HWY_NATIVE_INTEGER_ABS_DIFF 3218 3219 #if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) 3220 #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF 3221 #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF 3222 #else 3223 #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF 3224 #endif 3225 3226 template <class V, HWY_IF_UI8_D(DFromV<V>), 3227 HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 
0 : 4))> 3228 HWY_API Vec<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) { 3229 const DFromV<decltype(a)> d; 3230 const RebindToUnsigned<decltype(d)> du; 3231 const RepartitionToWideX3<decltype(d)> dw; 3232 3233 return BitCast(dw, SumsOf8(BitCast(du, AbsDiff(a, b)))); 3234 } 3235 3236 #endif // HWY_NATIVE_SUMS_OF_8_ABS_DIFF 3237 3238 // ------------------------------ SaturatedAdd/SaturatedSub for UI32/UI64 3239 3240 #if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) 3241 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB 3242 #undef HWY_NATIVE_I32_SATURATED_ADDSUB 3243 #else 3244 #define HWY_NATIVE_I32_SATURATED_ADDSUB 3245 #endif 3246 3247 template <class V, HWY_IF_I32_D(DFromV<V>)> 3248 HWY_API V SaturatedAdd(V a, V b) { 3249 const DFromV<decltype(a)> d; 3250 const auto sum = Add(a, b); 3251 const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum)); 3252 const auto overflow_result = 3253 Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>())); 3254 return IfNegativeThenElse(overflow_mask, overflow_result, sum); 3255 } 3256 3257 template <class V, HWY_IF_I32_D(DFromV<V>)> 3258 HWY_API V SaturatedSub(V a, V b) { 3259 const DFromV<decltype(a)> d; 3260 const auto diff = Sub(a, b); 3261 const auto overflow_mask = And(Xor(a, b), Xor(a, diff)); 3262 const auto overflow_result = 3263 Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>())); 3264 return IfNegativeThenElse(overflow_mask, overflow_result, diff); 3265 } 3266 3267 #endif // HWY_NATIVE_I32_SATURATED_ADDSUB 3268 3269 #if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) 3270 #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB 3271 #undef HWY_NATIVE_I64_SATURATED_ADDSUB 3272 #else 3273 #define HWY_NATIVE_I64_SATURATED_ADDSUB 3274 #endif 3275 3276 template <class V, HWY_IF_I64_D(DFromV<V>)> 3277 HWY_API V SaturatedAdd(V a, V b) { 3278 const DFromV<decltype(a)> d; 3279 const auto sum = Add(a, b); 3280 const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum)); 3281 const auto overflow_result = 3282 Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>())); 3283 return IfNegativeThenElse(overflow_mask, overflow_result, sum); 3284 } 3285 3286 template <class V, HWY_IF_I64_D(DFromV<V>)> 3287 HWY_API V SaturatedSub(V a, V b) { 3288 const DFromV<decltype(a)> d; 3289 const auto diff = Sub(a, b); 3290 const auto overflow_mask = And(Xor(a, b), Xor(a, diff)); 3291 const auto overflow_result = 3292 Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>())); 3293 return IfNegativeThenElse(overflow_mask, overflow_result, diff); 3294 } 3295 3296 #endif // HWY_NATIVE_I64_SATURATED_ADDSUB 3297 3298 #if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) 3299 #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB 3300 #undef HWY_NATIVE_U32_SATURATED_ADDSUB 3301 #else 3302 #define HWY_NATIVE_U32_SATURATED_ADDSUB 3303 #endif 3304 3305 template <class V, HWY_IF_U32_D(DFromV<V>)> 3306 HWY_API V SaturatedAdd(V a, V b) { 3307 return Add(a, Min(b, Not(a))); 3308 } 3309 3310 template <class V, HWY_IF_U32_D(DFromV<V>)> 3311 HWY_API V SaturatedSub(V a, V b) { 3312 return Sub(a, Min(a, b)); 3313 } 3314 3315 #endif // HWY_NATIVE_U32_SATURATED_ADDSUB 3316 3317 #if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) 3318 #ifdef HWY_NATIVE_U64_SATURATED_ADDSUB 3319 #undef HWY_NATIVE_U64_SATURATED_ADDSUB 3320 #else 3321 #define HWY_NATIVE_U64_SATURATED_ADDSUB 3322 #endif 3323 3324 template <class V, HWY_IF_U64_D(DFromV<V>)> 3325 HWY_API V SaturatedAdd(V a, V b) { 3326 return Add(a, Min(b, Not(a))); 3327 } 3328 3329 
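// The unsigned SaturatedAdd/SaturatedSub above and below rely on the identity
// Not(a) == LimitsMax<T>() - a, i.e. Not(a) is the headroom remaining in a, so
// Min(b, Not(a)) clamps the addend and Add can never wrap. For example, for
// U64 lanes with a = 0xFFFFFFFFFFFFFFFE and b = 5, Not(a) == 1 and the sum
// saturates to 0xFFFFFFFFFFFFFFFF. Likewise, Min(a, b) clamps the subtrahend
// so that Sub stops at zero.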
template <class V, HWY_IF_U64_D(DFromV<V>)> 3330 HWY_API V SaturatedSub(V a, V b) { 3331 return Sub(a, Min(a, b)); 3332 } 3333 3334 #endif // HWY_NATIVE_U64_SATURATED_ADDSUB 3335 3336 // ------------------------------ Unsigned to signed demotions 3337 3338 template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V), 3339 HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V), 3340 class V2 = VFromD<Rebind<TFromV<V>, DN>>, 3341 hwy::EnableIf<(sizeof(TFromD<DN>) < sizeof(TFromV<V>))>* = nullptr, 3342 HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))> 3343 HWY_API VFromD<DN> DemoteTo(DN dn, V v) { 3344 const DFromV<decltype(v)> d; 3345 const RebindToSigned<decltype(d)> di; 3346 const RebindToUnsigned<decltype(dn)> dn_u; 3347 3348 // First, do a signed to signed demotion. This will convert any values 3349 // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a 3350 // negative value. 3351 const auto i2i_demote_result = DemoteTo(dn, BitCast(di, v)); 3352 3353 // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>() 3354 // using an unsigned Min operation. 3355 const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>()); 3356 3357 return BitCast( 3358 dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); 3359 } 3360 3361 #if HWY_TARGET != HWY_SCALAR || HWY_IDE 3362 template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V), 3363 HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V), 3364 class V2 = VFromD<Repartition<TFromV<V>, DN>>, 3365 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), 3366 HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))> 3367 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) { 3368 const DFromV<decltype(a)> d; 3369 const RebindToSigned<decltype(d)> di; 3370 const RebindToUnsigned<decltype(dn)> dn_u; 3371 3372 // First, do a signed to signed demotion. This will convert any values 3373 // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a 3374 // negative value. 3375 const auto i2i_demote_result = 3376 ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b)); 3377 3378 // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>() 3379 // using an unsigned Min operation. 3380 const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>()); 3381 3382 return BitCast( 3383 dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); 3384 } 3385 #endif 3386 3387 // ------------------------------ PromoteLowerTo 3388 3389 // There is no codegen advantage for a native version of this. It is provided 3390 // only for convenience. 3391 template <class D, class V> 3392 HWY_API VFromD<D> PromoteLowerTo(D d, V v) { 3393 // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V 3394 // because it cannot be deduced from D (could be either bf16 or f16). 3395 const Rebind<TFromV<V>, decltype(d)> dh; 3396 return PromoteTo(d, LowerHalf(dh, v)); 3397 } 3398 3399 // ------------------------------ PromoteUpperTo 3400 3401 #if (defined(HWY_NATIVE_PROMOTE_UPPER_TO) == defined(HWY_TARGET_TOGGLE)) 3402 #ifdef HWY_NATIVE_PROMOTE_UPPER_TO 3403 #undef HWY_NATIVE_PROMOTE_UPPER_TO 3404 #else 3405 #define HWY_NATIVE_PROMOTE_UPPER_TO 3406 #endif 3407 3408 // This requires UpperHalf. 3409 #if HWY_TARGET != HWY_SCALAR || HWY_IDE 3410 3411 template <class D, class V> 3412 HWY_API VFromD<D> PromoteUpperTo(D d, V v) { 3413 // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V 3414 // because it cannot be deduced from D (could be either bf16 or f16). 
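  // For example, given D = ScalableTag<float> and v with lane type float16_t
  // (twice as many lanes as D), PromoteUpperTo(d, v) widens the upper
  // Lanes(d) lanes of v to f32.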
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteTo(d, UpperHalf(dh, v));
}

#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_PROMOTE_UPPER_TO

// ------------------------------ float16_t <-> float

#if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif

template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) {
  const RebindToSigned<decltype(df32)> di32;
  const RebindToUnsigned<decltype(df32)> du32;
  const Rebind<uint16_t, decltype(df32)> du16;
  using VU32 = VFromD<decltype(du32)>;

  const VU32 bits16 = PromoteTo(du32, BitCast(du16, v));
  const VU32 sign = ShiftRight<15>(bits16);
  const VU32 biased_exp = And(ShiftRight<10>(bits16), Set(du32, 0x1F));
  const VU32 mantissa = And(bits16, Set(du32, 0x3FF));
  const VU32 subnormal =
      BitCast(du32, Mul(ConvertTo(df32, BitCast(di32, mantissa)),
                        Set(df32, 1.0f / 16384 / 1024)));

  const VU32 biased_exp32 = Add(biased_exp, Set(du32, 127 - 15));
  const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const VU32 normal = Or(ShiftLeft<23>(biased_exp32), mantissa32);
  const VU32 bits32 = IfThenElse(Eq(biased_exp, Zero(du32)), subnormal, normal);
  return BitCast(df32, Or(ShiftLeft<31>(sign), bits32));
}

template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
  const RebindToSigned<decltype(df16)> di16;
  const Rebind<int32_t, decltype(df16)> di32;
  const RebindToFloat<decltype(di32)> df32;
  const RebindToUnsigned<decltype(df32)> du32;

  // There are 23 fractional bits (plus the implied 1 bit) in the mantissa of
  // a F32, and there are 10 fractional bits (plus the implied 1 bit) in the
  // mantissa of a F16.

  // We want the unbiased exponent of round_incr[i] to be at least (-14) + 13:
  // 2^(-14) is the smallest positive normal F16 value, and we want 13
  // mantissa bits (including the implicit 1 bit) to the left of the
  // F32 mantissa bits in rounded_val[i] since 23 - 10 is equal to 13.

  // The biased exponent of round_incr[i] needs to be at least 126 as
  // (-14) + 13 + 127 is equal to 126.

  // We also want the biased exponent of round_incr[i] to be less than or
  // equal to 255 (which is equal to MaxExponentField<float>()).

  // The biased F32 exponent of round_incr is thus equal to
  // HWY_MAX(HWY_MIN(((bits_of_v[i] >> 23) & 255) + 13, 255), 126), where
  // bits_of_v[i] is the U32 bit representation of v[i].

  // hi9_bits[i] is equal to the upper 9 bits of v[i]
  const auto hi9_bits = ShiftRight<23>(BitCast(du32, v));

  const auto k13 = Set(du32, uint32_t{13u});

  // Minimum biased F32 exponent of round_incr
  const auto k126 = Set(du32, uint32_t{126u});

  // round_incr_hi9_bits[i] is equivalent to
  // (hi9_bits[i] & 0x100) |
  // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126)

#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
  const auto k255 = Set(du32, uint32_t{255u});
  const auto round_incr_hi9_bits = BitwiseIfThenElse(
      k255, Max(Min(Add(And(hi9_bits, k255), k13), k255), k126), hi9_bits);
#else
  // On targets other than SCALAR and EMU128, the exponent bits of hi9_bits
  // can be incremented by 13 and clamped to the [13, 255] range without
  // overflowing into the sign bit of hi9_bits by using U8 SaturatedAdd, as
  // there are 8 exponent bits in a F32.

  // U8 Max can then be used on these targets to clamp
  // ((hi9_bits & 0xFF) + 13) to the [126, 255] range without affecting the
  // sign bit.

  const Repartition<uint8_t, decltype(du32)> du32_as_u8;
  const auto round_incr_hi9_bits = BitCast(
      du32,
      Max(SaturatedAdd(BitCast(du32_as_u8, hi9_bits), BitCast(du32_as_u8, k13)),
          BitCast(du32_as_u8, k126)));
#endif

  // (round_incr_hi9_bits >> 8) is equal to (hi9_bits >> 8), and
  // (round_incr_hi9_bits & 0xFF) is equal to
  // HWY_MAX(HWY_MIN((hi9_bits & 0xFF) + 13, 255), 126)

  const auto round_incr = BitCast(df32, ShiftLeft<23>(round_incr_hi9_bits));

  // Add round_incr[i] to v[i] to round the mantissa to the nearest F16
  // mantissa and to move the fractional bits of the resulting non-NaN
  // mantissa down to the lower 10 bits of rounded_val if
  // (v[i] + round_incr[i]) is a non-NaN value.
  const auto rounded_val = Add(v, round_incr);

  // rounded_val_bits is the bits of rounded_val as a U32
  const auto rounded_val_bits = BitCast(du32, rounded_val);

  // rounded_val[i] is known to have the same biased exponent as round_incr[i]
  // as |round_incr[i]| > 2^12*|v[i]| is true if round_incr[i] is a finite
  // value, round_incr[i] and v[i] both have the same sign, and |round_incr[i]|
  // is either a power of 2 that is greater than or equal to 2^-1 or infinity.

  // If rounded_val[i] is a finite F32 value, then
  // (rounded_val_bits[i] & 0x00000FFF) is the bit representation of the
  // rounded mantissa of rounded_val[i] as a UQ2.10 fixed point number that is
  // in the range [0, 2].

  // In other words, (rounded_val_bits[i] & 0x00000FFF) is between 0 and
  // 0x0800, with (rounded_val_bits[i] & 0x000003FF) being the fractional bits
  // of the resulting F16 mantissa, if rounded_val[i] is a finite F32 value.

  // (rounded_val_bits[i] & 0x007FF000) == 0 is guaranteed to be true if
  // rounded_val[i] is a non-NaN value.

  // The biased exponent of rounded_val[i] is guaranteed to be at least 126 as
  // the biased exponent of round_incr[i] is at least 126 and as both v[i] and
  // round_incr[i] have the same sign bit.

  // The ULP of a F32 value with a biased exponent of 126 is equal to
  // 2^(126 - 127 - 23), which is equal to 2^(-24) (which is also the ULP of a
  // F16 value with a biased exponent of 0 or 1 as (1 - 15 - 10) is equal to
  // -24).

  // The biased exponent (before subtracting 126) needs to be clamped to the
  // [126, 157] range as 126 + 31 is equal to 157 and as 31 is the largest
  // biased exponent of a F16.
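  // Worked example: for v[i] = 1.0f (biased F32 exponent 127), the biased
  // exponent of round_incr[i] is HWY_MAX(HWY_MIN(127 + 13, 255), 126) = 140,
  // so round_incr[i] = 2^13 and rounded_val[i] = 8193.0f, whose bit pattern
  // has biased exponent 140 and mantissa bits 0x400 (UQ2.10 value 1.0). The
  // formula below then yields HWY_MIN(140 + 1, 157) - 126 = 15, which is the
  // biased F16 exponent of 1.0.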
3555 3556 // The biased exponent of the resulting F16 value is equal to 3557 // HWY_MIN((round_incr_hi9_bits[i] & 0xFF) + 3558 // ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126 3559 3560 #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 3561 const auto k157Shl10 = Set(du32, static_cast<uint32_t>(uint32_t{157u} << 10)); 3562 auto f16_exp_bits = 3563 Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)), 3564 And(rounded_val_bits, 3565 Set(du32, static_cast<uint32_t>(uint32_t{0xFFu} << 10)))), 3566 k157Shl10); 3567 const auto f16_result_is_inf_mask = 3568 RebindMask(df32, Eq(f16_exp_bits, k157Shl10)); 3569 #else 3570 const auto k157 = Set(du32, uint32_t{157}); 3571 auto f16_exp_bits = BitCast( 3572 du32, 3573 Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits), 3574 BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))), 3575 BitCast(du32_as_u8, k157))); 3576 const auto f16_result_is_inf_mask = RebindMask(df32, Eq(f16_exp_bits, k157)); 3577 f16_exp_bits = ShiftLeft<10>(f16_exp_bits); 3578 #endif 3579 3580 f16_exp_bits = 3581 Sub(f16_exp_bits, Set(du32, static_cast<uint32_t>(uint32_t{126u} << 10))); 3582 3583 const auto f16_unmasked_mant_bits = 3584 BitCast(di32, Or(IfThenZeroElse(f16_result_is_inf_mask, rounded_val), 3585 VecFromMask(df32, IsNaN(rounded_val)))); 3586 3587 const auto f16_exp_mant_bits = 3588 OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits, 3589 Set(di32, int32_t{0x03FF})); 3590 3591 // f16_bits_as_i32 is the F16 bits sign-extended to an I32 (with the upper 17 3592 // bits of f16_bits_as_i32[i] set to the sign bit of rounded_val[i]) to allow 3593 // efficient truncation of the F16 bits to an I16 using an I32->I16 DemoteTo 3594 // operation 3595 const auto f16_bits_as_i32 = 3596 OrAnd(f16_exp_mant_bits, ShiftRight<16>(BitCast(di32, rounded_val_bits)), 3597 Set(di32, static_cast<int32_t>(0xFFFF8000u))); 3598 return BitCast(df16, DemoteTo(di16, f16_bits_as_i32)); 3599 } 3600 3601 #endif // HWY_NATIVE_F16C 3602 3603 // ------------------------------ F64->F16 DemoteTo 3604 #if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE)) 3605 #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16 3606 #undef HWY_NATIVE_DEMOTE_F64_TO_F16 3607 #else 3608 #define HWY_NATIVE_DEMOTE_F64_TO_F16 3609 #endif 3610 3611 #if HWY_HAVE_FLOAT64 3612 template <class D, HWY_IF_F16_D(D)> 3613 HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) { 3614 const Rebind<double, D> df64; 3615 const Rebind<uint64_t, D> du64; 3616 const Rebind<float, D> df32; 3617 3618 // The mantissa bits of v[i] are first rounded using round-to-odd rounding to 3619 // the nearest F64 value that has the lower 29 bits zeroed out to ensure that 3620 // the result is correctly rounded to a F16. 
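  // Concretely, the OrAnd below keeps the upper 35 bits of v[i] (bits 29..63)
  // and ORs bit 29 with a sticky bit: adding 0x1FFFFFFF carries into bit 29
  // exactly when any of the discarded lower 29 bits of v[i] are nonzero. The
  // kept mantissa is therefore odd whenever low bits were discarded, which
  // prevents the subsequent F64->F32->F16 double rounding from rounding twice
  // in the same direction.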
  const auto vf64_rounded = OrAnd(
      And(v,
          BitCast(df64, Set(du64, static_cast<uint64_t>(0xFFFFFFFFE0000000u)))),
      BitCast(df64, Add(BitCast(du64, v),
                        Set(du64, static_cast<uint64_t>(0x000000001FFFFFFFu)))),
      BitCast(df64, Set(du64, static_cast<uint64_t>(0x0000000020000000ULL))));

  return DemoteTo(df16, DemoteTo(df32, vf64_rounded));
}
#endif  // HWY_HAVE_FLOAT64

#endif  // HWY_NATIVE_DEMOTE_F64_TO_F16

// ------------------------------ F16->F64 PromoteTo
#if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
#undef HWY_NATIVE_PROMOTE_F16_TO_F64
#else
#define HWY_NATIVE_PROMOTE_F16_TO_F64
#endif

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) {
  return PromoteTo(df64, PromoteTo(Rebind<float, D>(), v));
}
#endif  // HWY_HAVE_FLOAT64

#endif  // HWY_NATIVE_PROMOTE_F16_TO_F64

// ------------------------------ F32 to BF16 DemoteTo
#if (defined(HWY_NATIVE_DEMOTE_F32_TO_BF16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif

namespace detail {

// Round a F32 value to the nearest BF16 value, with the result returned as
// the rounded F32 value bitcast to a U32.

// RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
// NaN F32 values from being converted to infinity.
template <class V, HWY_IF_F32(TFromV<V>)>
HWY_INLINE VFromD<RebindToUnsigned<DFromV<V>>> RoundF32ForDemoteToBF16(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du32;

  const auto is_non_nan = Not(IsNaN(v));
  const auto bits32 = BitCast(du32, v);

  const auto round_incr =
      Add(And(ShiftRight<16>(bits32), Set(du32, uint32_t{1})),
          Set(du32, uint32_t{0x7FFFu}));
  return MaskedAddOr(Or(bits32, Set(du32, uint32_t{0x00400000u})),
                     RebindMask(du32, is_non_nan), bits32, round_incr);
}

}  // namespace detail

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  const Twice<decltype(du16)> dt_u16;

  const auto rounded_bits = BitCast(dt_u16, detail::RoundF32ForDemoteToBF16(v));
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(
      dbf16, LowerHalf(du16, ConcatOdd(dt_u16, rounded_bits, rounded_bits)));
#else
  return BitCast(
      dbf16, LowerHalf(du16, ConcatEven(dt_u16, rounded_bits, rounded_bits)));
#endif
}

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
                                   VFromD<Repartition<float, D>> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;

  const auto rounded_a_bits32 =
      BitCast(du16, detail::RoundF32ForDemoteToBF16(a));
  const auto rounded_b_bits32 =
      BitCast(du16, detail::RoundF32ForDemoteToBF16(b));
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, rounded_b_bits32),
                                  BitCast(du16, rounded_a_bits32)));
#else
  return BitCast(dbf16, ConcatEven(du16, BitCast(du16, rounded_b_bits32),
                                   BitCast(du16, rounded_a_bits32)));
#endif
}

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
                                   VFromD<Repartition<float, D>> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;

#if HWY_IS_LITTLE_ENDIAN
  const auto a_in_odd = detail::RoundF32ForDemoteToBF16(a);
  const auto b_in_even = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(b));
#else
  const auto a_in_odd = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(a));
  const auto b_in_even = detail::RoundF32ForDemoteToBF16(b);
#endif

  return BitCast(dbf16,
                 OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even)));
}

#endif  // HWY_NATIVE_DEMOTE_F32_TO_BF16

// ------------------------------ PromoteInRangeTo
#if (defined(HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#endif

#if HWY_HAVE_INTEGER64
template <class D64, HWY_IF_UI64_D(D64)>
HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
  return PromoteTo(d64, v);
}
#endif

#endif  // HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO

// ------------------------------ ConvertInRangeTo
#if (defined(HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#else
#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#endif

template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
          HWY_IF_T_SIZE_ONE_OF_D(DI, (HWY_HAVE_FLOAT16 ? (1 << 2) : 0) |
                                         (1 << 4) |
                                         (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
  return ConvertTo(di, v);
}

#endif  // HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO

// ------------------------------ DemoteInRangeTo
#if (defined(HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#endif

#if HWY_HAVE_FLOAT64
template <class D32, HWY_IF_UI32_D(D32)>
HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
  return DemoteTo(d32, v);
}
#endif

#endif  // HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO

// ------------------------------ PromoteInRangeLowerTo/PromoteInRangeUpperTo

template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeLowerTo(D d, V v) {
  // Lanes(d) may differ from Lanes(DFromV<V>()), so derive the half-width tag
  // from d while keeping the f32 lane type of V.
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteInRangeTo(d, LowerHalf(dh, v));
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeUpperTo(D d, V v) {
#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
     (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
  // On targets that provide target-specific implementations of F32->UI64
  // PromoteInRangeTo, promote the upper half of v using PromoteInRangeTo

  // Lanes(d) may differ from Lanes(DFromV<V>()), so derive the half-width tag
  // from d while keeping the f32 lane type of V.
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteInRangeTo(d, UpperHalf(dh, v));
#else
  // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
  // around F32->UI64 PromoteTo, promote the upper half of v to TFromD<D> using
  // PromoteUpperTo
  return PromoteUpperTo(d, v);
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR

// ------------------------------ PromoteInRangeEvenTo/PromoteInRangeOddTo

template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeEvenTo(D d, V v) {
#if HWY_TARGET == HWY_SCALAR
  return PromoteInRangeTo(d, v);
#elif (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
       (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
  // On targets that provide target-specific implementations of F32->UI64
  // PromoteInRangeTo, promote the even lanes of v using PromoteInRangeTo

  // Lanes(d) may differ from Lanes(DFromV<V>()), so derive the half-width tag
  // from d while keeping the f32 lane type of V.
  const DFromV<decltype(v)> d_from;
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteInRangeTo(d, LowerHalf(dh, ConcatEven(d_from, v, v)));
#else
  // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
  // around F32->UI64 PromoteTo, promote the even lanes of v to TFromD<D> using
  // PromoteEvenTo
  return PromoteEvenTo(d, v);
#endif  // HWY_TARGET == HWY_SCALAR
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeOddTo(D d, V v) {
#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
     (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
  // On targets that provide target-specific implementations of F32->UI64
  // PromoteInRangeTo, promote the odd lanes of v using PromoteInRangeTo

  // Lanes(d) may differ from Lanes(DFromV<V>()), so derive the half-width tag
  // from d while keeping the f32 lane type of V.
  const DFromV<decltype(v)> d_from;
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteInRangeTo(d, LowerHalf(dh, ConcatOdd(d_from, v, v)));
#else
  // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
  // around F32->UI64 PromoteTo, promote the odd lanes of v to TFromD<D> using
  // PromoteOddTo
  return PromoteOddTo(d, v);
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR

// ------------------------------ SumsOf2

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
namespace detail {

template <class TypeTag, size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> dw;
  return Add(PromoteEvenTo(dw, v), PromoteOddTo(dw, v));
}

}  // namespace detail

template <class V>
HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(V v) {
  return detail::SumsOf2(hwy::TypeTag<TFromV<V>>(),
                         hwy::SizeTag<sizeof(TFromV<V>)>(), v);
}
#endif  // HWY_TARGET != HWY_SCALAR

// ------------------------------ SumsOf4

namespace detail {

template <class TypeTag, size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
  using hwy::HWY_NAMESPACE::SumsOf2;
  return SumsOf2(SumsOf2(v));
}

}  // namespace detail

template <class V>
HWY_API VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(V v) {
  return detail::SumsOf4(hwy::TypeTag<TFromV<V>>(),
                         hwy::SizeTag<sizeof(TFromV<V>)>(), v);
}

// ------------------------------ OrderedTruncate2To

#if HWY_IDE || \
    (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE))

#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#else
#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#endif

// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DFromV<VFromD<DN>>, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
  return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a));
}
#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_ORDERED_TRUNCATE_2_TO

// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

#if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

namespace detail {

template <class D, HWY_IF_U32_D(D)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const RebindToFloat<decltype(d)> df;
#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(d)> di;
  const Repartition<int16_t, decltype(d)> di16;

  // On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed
  // by an unsigned right shift of the uint32_t bit representation of the
  // floating point values by 23, followed by an int16_t Min
  // operation, as we are only interested in the biased exponent that would
  // result from a uint32_t to float conversion.

  // An int32_t to float vector conversion is also much more efficient on
  // SSE2/SSSE3/SSE4/AVX2: a uint32_t to float vector conversion there
  // requires multiple instructions, whereas an int32_t to float vector
  // conversion is a single instruction.

  const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(di, v)));
  return BitCast(d, Min(BitCast(di16, ShiftRight<23>(f32_bits)),
                        BitCast(di16, Set(d, 158))));
#else
  const auto f32_bits = BitCast(d, ConvertTo(df, v));
  return BitCast(d, ShiftRight<23>(f32_bits));
#endif
}

template <class V, HWY_IF_U32_D(DFromV<V>)>
HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) {
  // I32RangeU32ToF32BiasedExp is similar to UIntToF32BiasedExp, but
  // I32RangeU32ToF32BiasedExp assumes that v[i] is between 0 and 2147483647.
  const DFromV<decltype(v)> d;
  const RebindToFloat<decltype(d)> df;
#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(d)> d_src;
#else
  const RebindToUnsigned<decltype(d)> d_src;
#endif
  const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(d_src, v)));
  return ShiftRight<23>(f32_bits);
}

template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Rebind<uint32_t, decltype(d)> du32;
  const auto f32_biased_exp_as_u32 =
      I32RangeU32ToF32BiasedExp(PromoteTo(du32, v));
  return TruncateTo(d, f32_biased_exp_as_u32);
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const Rebind<uint32_t, decltype(dh)> du32;

  const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v));
  const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v));

  const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32);
  const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32);
#if HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(du32)> di32;
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 OrderedDemote2To(di, BitCast(di32, lo_f32_biased_exp_as_u32),
                                  BitCast(di32, hi_f32_biased_exp_as_u32)));
#else
  return OrderedTruncate2To(d, lo_f32_biased_exp_as_u32,
                            hi_f32_biased_exp_as_u32);
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR

template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Rebind<uint32_t, decltype(d)> du32;
  const auto f32_biased_exp_as_u32 =
      I32RangeU32ToF32BiasedExp(PromoteTo(du32, v));
  return U8FromU32(f32_biased_exp_as_u32);
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4),
          HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 2)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const Rebind<uint32_t, decltype(dh)> du32;
  const Repartition<uint16_t, decltype(du32)> du16;

  const
auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v)); 4034 const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v)); 4035 4036 const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32); 4037 const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32); 4038 4039 #if HWY_TARGET <= HWY_SSE2 4040 const RebindToSigned<decltype(du32)> di32; 4041 const RebindToSigned<decltype(du16)> di16; 4042 const auto f32_biased_exp_as_i16 = 4043 OrderedDemote2To(di16, BitCast(di32, lo_f32_biased_exp_as_u32), 4044 BitCast(di32, hi_f32_biased_exp_as_u32)); 4045 return DemoteTo(d, f32_biased_exp_as_i16); 4046 #else 4047 const auto f32_biased_exp_as_u16 = OrderedTruncate2To( 4048 du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32); 4049 return TruncateTo(d, f32_biased_exp_as_u16); 4050 #endif 4051 } 4052 4053 template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 2)> 4054 HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { 4055 const Half<decltype(d)> dh; 4056 const Half<decltype(dh)> dq; 4057 const Rebind<uint32_t, decltype(dq)> du32; 4058 const Repartition<uint16_t, decltype(du32)> du16; 4059 4060 const auto lo_half = LowerHalf(dh, v); 4061 const auto hi_half = UpperHalf(dh, v); 4062 4063 const auto u32_q0 = PromoteTo(du32, LowerHalf(dq, lo_half)); 4064 const auto u32_q1 = PromoteTo(du32, UpperHalf(dq, lo_half)); 4065 const auto u32_q2 = PromoteTo(du32, LowerHalf(dq, hi_half)); 4066 const auto u32_q3 = PromoteTo(du32, UpperHalf(dq, hi_half)); 4067 4068 const auto f32_biased_exp_as_u32_q0 = I32RangeU32ToF32BiasedExp(u32_q0); 4069 const auto f32_biased_exp_as_u32_q1 = I32RangeU32ToF32BiasedExp(u32_q1); 4070 const auto f32_biased_exp_as_u32_q2 = I32RangeU32ToF32BiasedExp(u32_q2); 4071 const auto f32_biased_exp_as_u32_q3 = I32RangeU32ToF32BiasedExp(u32_q3); 4072 4073 #if HWY_TARGET <= HWY_SSE2 4074 const RebindToSigned<decltype(du32)> di32; 4075 const RebindToSigned<decltype(du16)> di16; 4076 4077 const auto lo_f32_biased_exp_as_i16 = 4078 OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q0), 4079 BitCast(di32, f32_biased_exp_as_u32_q1)); 4080 const auto hi_f32_biased_exp_as_i16 = 4081 OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q2), 4082 BitCast(di32, f32_biased_exp_as_u32_q3)); 4083 return OrderedDemote2To(d, lo_f32_biased_exp_as_i16, 4084 hi_f32_biased_exp_as_i16); 4085 #else 4086 const auto lo_f32_biased_exp_as_u16 = OrderedTruncate2To( 4087 du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1); 4088 const auto hi_f32_biased_exp_as_u16 = OrderedTruncate2To( 4089 du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3); 4090 return OrderedTruncate2To(d, lo_f32_biased_exp_as_u16, 4091 hi_f32_biased_exp_as_u16); 4092 #endif 4093 } 4094 #endif // HWY_TARGET != HWY_SCALAR 4095 4096 #if HWY_TARGET == HWY_SCALAR 4097 template <class D> 4098 using F32ExpLzcntMinMaxRepartition = RebindToUnsigned<D>; 4099 #elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2 4100 template <class D> 4101 using F32ExpLzcntMinMaxRepartition = Repartition<uint8_t, D>; 4102 #else 4103 template <class D> 4104 using F32ExpLzcntMinMaxRepartition = 4105 Repartition<UnsignedFromSize<HWY_MIN(sizeof(TFromD<D>), 4)>, D>; 4106 #endif 4107 4108 template <class V> 4109 using F32ExpLzcntMinMaxCmpV = VFromD<F32ExpLzcntMinMaxRepartition<DFromV<V>>>; 4110 4111 template <class V> 4112 HWY_INLINE F32ExpLzcntMinMaxCmpV<V> F32ExpLzcntMinMaxBitCast(V v) { 4113 const DFromV<decltype(v)> d; 4114 const F32ExpLzcntMinMaxRepartition<decltype(d)> d2; 4115 return BitCast(d2, v); 4116 } 
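// For U64 lanes (below), the biased F32 exponent is computed from the two
// 32-bit halves: when the upper half is nonzero, its exponent plus 32 (the
// 0x20 adjustment) is at least 159 and thus exceeds the lower half's exponent
// (at most 158), so a per-lane Max with the Reverse2-swapped pair selects the
// exponent of the full 64-bit value.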
4117 4118 template <class D, HWY_IF_U64_D(D)> 4119 HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) { 4120 #if HWY_TARGET == HWY_SCALAR 4121 const uint64_t u64_val = GetLane(v); 4122 const float f32_val = static_cast<float>(u64_val); 4123 const uint32_t f32_bits = BitCastScalar<uint32_t>(f32_val); 4124 return Set(d, static_cast<uint64_t>(f32_bits >> 23)); 4125 #else 4126 const Repartition<uint32_t, decltype(d)> du32; 4127 const auto f32_biased_exp = UIntToF32BiasedExp(du32, BitCast(du32, v)); 4128 const auto f32_biased_exp_adj = 4129 IfThenZeroElse(Eq(f32_biased_exp, Zero(du32)), 4130 BitCast(du32, Set(d, 0x0000002000000000u))); 4131 const auto adj_f32_biased_exp = Add(f32_biased_exp, f32_biased_exp_adj); 4132 4133 return ShiftRight<32>(BitCast( 4134 d, Max(F32ExpLzcntMinMaxBitCast(adj_f32_biased_exp), 4135 F32ExpLzcntMinMaxBitCast(Reverse2(du32, adj_f32_biased_exp))))); 4136 #endif 4137 } 4138 4139 template <class V, HWY_IF_UNSIGNED_V(V)> 4140 HWY_INLINE V UIntToF32BiasedExp(V v) { 4141 const DFromV<decltype(v)> d; 4142 return UIntToF32BiasedExp(d, v); 4143 } 4144 4145 template <class V, HWY_IF_UNSIGNED_V(V), 4146 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> 4147 HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { 4148 return v; 4149 } 4150 4151 template <class V, HWY_IF_UNSIGNED_V(V), 4152 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))> 4153 HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { 4154 // If v[i] >= 16777216 is true, make sure that the bit at 4155 // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact 4156 // conversion to single-precision floating point is rounded down. 4157 4158 // This zeroing-out can be accomplished through the AndNot operation below. 4159 return AndNot(ShiftRight<24>(v), v); 4160 } 4161 4162 } // namespace detail 4163 4164 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 4165 HWY_API V HighestSetBitIndex(V v) { 4166 const DFromV<decltype(v)> d; 4167 const RebindToUnsigned<decltype(d)> du; 4168 using TU = TFromD<decltype(du)>; 4169 4170 const auto f32_biased_exp = detail::UIntToF32BiasedExp( 4171 detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); 4172 return BitCast(d, Sub(f32_biased_exp, Set(du, TU{127}))); 4173 } 4174 4175 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 4176 HWY_API V LeadingZeroCount(V v) { 4177 const DFromV<decltype(v)> d; 4178 const RebindToUnsigned<decltype(d)> du; 4179 using TU = TFromD<decltype(du)>; 4180 4181 constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; 4182 const auto f32_biased_exp = detail::UIntToF32BiasedExp( 4183 detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); 4184 const auto lz_count = Sub(Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp); 4185 4186 return BitCast(d, 4187 Min(detail::F32ExpLzcntMinMaxBitCast(lz_count), 4188 detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT)))); 4189 } 4190 4191 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 4192 HWY_API V TrailingZeroCount(V v) { 4193 const DFromV<decltype(v)> d; 4194 const RebindToUnsigned<decltype(d)> du; 4195 const RebindToSigned<decltype(d)> di; 4196 using TU = TFromD<decltype(du)>; 4197 4198 const auto vi = BitCast(di, v); 4199 const auto lowest_bit = BitCast(du, And(vi, Neg(vi))); 4200 4201 constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; 4202 const auto f32_biased_exp = detail::UIntToF32BiasedExp(lowest_bit); 4203 const auto tz_count = Sub(f32_biased_exp, Set(du, TU{127})); 4204 4205 return BitCast(d, 4206 Min(detail::F32ExpLzcntMinMaxBitCast(tz_count), 4207 detail::F32ExpLzcntMinMaxBitCast(Set(du, 
kNumOfBitsInT)))); 4208 } 4209 #endif // HWY_NATIVE_LEADING_ZERO_COUNT 4210 4211 // ------------------------------ MaskedLeadingZeroCount 4212 #if (defined(HWY_NATIVE_MASKED_LEADING_ZERO_COUNT) == \ 4213 defined(HWY_TARGET_TOGGLE)) 4214 #ifdef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT 4215 #undef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT 4216 #else 4217 #define HWY_NATIVE_MASKED_LEADING_ZERO_COUNT 4218 #endif 4219 4220 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), class M> 4221 HWY_API V MaskedLeadingZeroCount(M m, V v) { 4222 return IfThenElseZero(m, LeadingZeroCount(v)); 4223 } 4224 #endif // HWY_NATIVE_MASKED_LEADING_ZERO_COUNT 4225 4226 // ------------------------------ AESRound 4227 4228 // Cannot implement on scalar: need at least 16 bytes for TableLookupBytes. 4229 #if HWY_TARGET != HWY_SCALAR || HWY_IDE 4230 4231 // Define for white-box testing, even if native instructions are available. 4232 namespace detail { 4233 4234 // Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with 4235 // Vector Permute Instructions" and the accompanying assembly language 4236 // implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan: 4237 // https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html . 4238 // 4239 // A brute-force 256 byte table lookup can also be made constant-time, and 4240 // possibly competitive on NEON, but this is more performance-portable 4241 // especially for x86 and large vectors. 4242 4243 template <class V> // u8 4244 HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL, 4245 V affine_tblU) { 4246 const DFromV<V> du; 4247 const auto mask = Set(du, uint8_t{0xF}); 4248 4249 // Change polynomial basis to GF(2^4) 4250 { 4251 const VFromD<decltype(du)> basisL = 4252 Dup128VecFromValues(du, 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2, 4253 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA); 4254 const VFromD<decltype(du)> basisU = 4255 Dup128VecFromValues(du, 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C, 4256 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD); 4257 const auto sL = And(state, mask); 4258 const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero 4259 const auto gf4L = TableLookupBytes(basisL, sL); 4260 const auto gf4U = TableLookupBytes(basisU, sU); 4261 state = Xor(gf4L, gf4U); 4262 } 4263 4264 // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and 4265 // cause TableLookupBytesOr0 to return 0. 
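  // The 0x80 entries at index 0 propagate the flag: an all-zero input byte
  // yields outL/outU with the MSB set, so the affine TableLookupBytesOr0
  // below returns 0 and the "inverse" of 0 comes out as 0 (before the 0x63
  // bias is added in SubBytes).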
  const VFromD<decltype(du)> zetaInv = Dup128VecFromValues(
      du, 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3);
  const VFromD<decltype(du)> tbl = Dup128VecFromValues(
      du, 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4);
  const auto sL = And(state, mask);      // L=low nibble, U=upper
  const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
  const auto sX = Xor(sU, sL);
  const auto invL = TableLookupBytes(zetaInv, sL);
  const auto invU = TableLookupBytes(tbl, sU);
  const auto invX = TableLookupBytes(tbl, sX);
  const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
  const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));

  const auto affL = TableLookupBytesOr0(affine_tblL, outL);
  const auto affU = TableLookupBytesOr0(affine_tblU, outU);
  return Xor(affL, affU);
}

template <class V>  // u8
HWY_INLINE V SubBytes(V state) {
  const DFromV<V> du;
  // Linear skew (cannot bake 0x63 bias into the table because out* indices
  // may have the infinity flag set).
  const VFromD<decltype(du)> affineL =
      Dup128VecFromValues(du, 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
                          0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15);
  const VFromD<decltype(du)> affineU =
      Dup128VecFromValues(du, 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
                          0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E);
  return Xor(SubBytesMulInverseAndAffineLookup(state, affineL, affineU),
             Set(du, uint8_t{0x63}));
}

template <class V>  // u8
HWY_INLINE V InvSubBytes(V state) {
  const DFromV<V> du;
  const VFromD<decltype(du)> gF2P4InvToGF2P8InvL =
      Dup128VecFromValues(du, 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
                          0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7);
  const VFromD<decltype(du)> gF2P4InvToGF2P8InvU =
      Dup128VecFromValues(du, 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
                          0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA);

  // Apply the inverse affine transformation
  const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)),
                          Or(ShiftLeft<3>(state), ShiftRight<5>(state)),
                          Or(ShiftLeft<6>(state), ShiftRight<2>(state))),
                     Set(du, uint8_t{0x05}));

  // The GF(2^8) multiplicative inverse is computed as follows:
  // - Changing the polynomial basis to GF(2^4)
  // - Computing the GF(2^4) multiplicative inverse
  // - Converting the GF(2^4) multiplicative inverse to the GF(2^8)
  //   multiplicative inverse through table lookups using the
  //   gF2P4InvToGF2P8InvL and gF2P4InvToGF2P8InvU tables
  return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL,
                                           gF2P4InvToGF2P8InvU);
}

}  // namespace detail

#endif  // HWY_TARGET != HWY_SCALAR

#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
#if HWY_TARGET != HWY_SCALAR || HWY_IDE

namespace detail {

template <class V>  // u8
HWY_INLINE V ShiftRows(const V state) {
  const DFromV<V> du;
  // transposed: state is column major
  const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
      du, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11);
  return TableLookupBytes(state, shift_row);
}

template
<class V> // u8 4351 HWY_INLINE V InvShiftRows(const V state) { 4352 const DFromV<V> du; 4353 // transposed: state is column major 4354 const VFromD<decltype(du)> shift_row = Dup128VecFromValues( 4355 du, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3); 4356 return TableLookupBytes(state, shift_row); 4357 } 4358 4359 template <class V> // u8 4360 HWY_INLINE V GF2P8Mod11BMulBy2(V v) { 4361 const DFromV<V> du; 4362 const RebindToSigned<decltype(du)> di; // can only do signed comparisons 4363 const auto msb = Lt(BitCast(di, v), Zero(di)); 4364 const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B}))); 4365 return Xor(Add(v, v), overflow); // = v*2 in GF(2^8). 4366 } 4367 4368 template <class V> // u8 4369 HWY_INLINE V MixColumns(const V state) { 4370 const DFromV<V> du; 4371 // For each column, the rows are the sum of GF(2^8) matrix multiplication by: 4372 // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3. 4373 // 1 2 3 1 // d are on diagonal, no permutation needed. 4374 // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows. 4375 // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301). 4376 const VFromD<decltype(du)> v2301 = Dup128VecFromValues( 4377 du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); 4378 const VFromD<decltype(du)> v1230 = Dup128VecFromValues( 4379 du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12); 4380 const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8). 4381 const auto s2301 = TableLookupBytes(state, v2301); 4382 const auto d_s2301 = Xor(d, s2301); 4383 const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)} 4384 const auto t1230_s3012 = TableLookupBytes(t_s2301, v1230); 4385 return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms 4386 } 4387 4388 template <class V> // u8 4389 HWY_INLINE V InvMixColumns(const V state) { 4390 const DFromV<V> du; 4391 // For each column, the rows are the sum of GF(2^8) matrix multiplication by: 4392 // 14 11 13 9 4393 // 9 14 11 13 4394 // 13 9 14 11 4395 // 11 13 9 14 4396 const VFromD<decltype(du)> v2301 = Dup128VecFromValues( 4397 du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); 4398 const VFromD<decltype(du)> v1230 = Dup128VecFromValues( 4399 du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12); 4400 4401 const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */ 4402 const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */ 4403 const auto sx8 = GF2P8Mod11BMulBy2(sx4); /* = state*8 in GF(2^8) */ 4404 const auto sx9 = Xor(sx8, state); /* = state*9 in GF(2^8) */ 4405 const auto sx11 = Xor(sx9, sx2); /* = state*11 in GF(2^8) */ 4406 const auto sx13 = Xor(sx9, sx4); /* = state*13 in GF(2^8) */ 4407 const auto sx14 = Xor3(sx8, sx4, sx2); /* = state*14 in GF(2^8) */ 4408 4409 const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230)); 4410 const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230)); 4411 const auto sx13_2301_sx9_3012 = TableLookupBytes(sx13_0123_sx9_1230, v2301); 4412 return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012); 4413 } 4414 4415 } // namespace detail 4416 4417 template <class V> // u8 4418 HWY_API V AESRound(V state, const V round_key) { 4419 // Intel docs swap the first two steps, but it does not matter because 4420 // ShiftRows is a permutation and SubBytes is independent of lane index. 
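  // A full AES encryption round is the composition below: SubBytes,
  // ShiftRows, MixColumns, then AddRoundKey. E.g. AES-128 applies an initial
  // AddRoundKey (a plain Xor), nine AESRound calls, and one AESLastRound.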
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = detail::MixColumns(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

template <class V>  // u8
HWY_API V AESLastRound(V state, const V round_key) {
  // Like AESRound, but without MixColumns.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

template <class V>
HWY_API V AESInvMixColumns(V state) {
  return detail::InvMixColumns(state);
}

template <class V>  // u8
HWY_API V AESRoundInv(V state, const V round_key) {
  state = detail::InvSubBytes(state);
  state = detail::InvShiftRows(state);
  state = detail::InvMixColumns(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

template <class V>  // u8
HWY_API V AESLastRoundInv(V state, const V round_key) {
  // Like AESRoundInv, but without InvMixColumns.
  state = detail::InvSubBytes(state);
  state = detail::InvShiftRows(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

template <uint8_t kRcon, class V, HWY_IF_U8_D(DFromV<V>)>
HWY_API V AESKeyGenAssist(V v) {
  const DFromV<decltype(v)> d;
  const V rconXorMask = Dup128VecFromValues(d, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0,
                                            0, 0, kRcon, 0, 0, 0);
  const V rotWordShuffle = Dup128VecFromValues(d, 4, 5, 6, 7, 5, 6, 7, 4, 12,
                                               13, 14, 15, 13, 14, 15, 12);
  const auto sub_word_result = detail::SubBytes(v);
  const auto rot_word_result =
      TableLookupBytes(sub_word_result, rotWordShuffle);
  return Xor(rot_word_result, rconXorMask);
}

// Constant-time implementation inspired by
// https://www.bearssl.org/constanttime.html, but about half the cost because
// we use 64x64 multiplies and 128-bit XORs.
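// Each operand is split into four groups holding every 4th bit, so the 64x64
// integer multiplies accumulate few enough one-bit partial products per
// column that the bits retained by the final And/Or equal the carryless (XOR)
// sums. A group with bits at positions i (mod 4) times a group with bits at
// positions j (mod 4) contributes only to bit positions congruent to
// (i + j) mod 4, hence the pairing of m0..m3 with k1/k2/k4/k8 below.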
4476 template <class V> 4477 HWY_API V CLMulLower(V a, V b) { 4478 const DFromV<V> d; 4479 static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64"); 4480 const auto k1 = Set(d, 0x1111111111111111ULL); 4481 const auto k2 = Set(d, 0x2222222222222222ULL); 4482 const auto k4 = Set(d, 0x4444444444444444ULL); 4483 const auto k8 = Set(d, 0x8888888888888888ULL); 4484 const auto a0 = And(a, k1); 4485 const auto a1 = And(a, k2); 4486 const auto a2 = And(a, k4); 4487 const auto a3 = And(a, k8); 4488 const auto b0 = And(b, k1); 4489 const auto b1 = And(b, k2); 4490 const auto b2 = And(b, k4); 4491 const auto b3 = And(b, k8); 4492 4493 auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3)); 4494 auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0)); 4495 auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1)); 4496 auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2)); 4497 m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1))); 4498 m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2))); 4499 m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3))); 4500 m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0))); 4501 return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); 4502 } 4503 4504 template <class V> 4505 HWY_API V CLMulUpper(V a, V b) { 4506 const DFromV<V> d; 4507 static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64"); 4508 const auto k1 = Set(d, 0x1111111111111111ULL); 4509 const auto k2 = Set(d, 0x2222222222222222ULL); 4510 const auto k4 = Set(d, 0x4444444444444444ULL); 4511 const auto k8 = Set(d, 0x8888888888888888ULL); 4512 const auto a0 = And(a, k1); 4513 const auto a1 = And(a, k2); 4514 const auto a2 = And(a, k4); 4515 const auto a3 = And(a, k8); 4516 const auto b0 = And(b, k1); 4517 const auto b1 = And(b, k2); 4518 const auto b2 = And(b, k4); 4519 const auto b3 = And(b, k8); 4520 4521 auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3)); 4522 auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0)); 4523 auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1)); 4524 auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2)); 4525 m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1))); 4526 m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2))); 4527 m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3))); 4528 m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0))); 4529 return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); 4530 } 4531 4532 #endif // HWY_NATIVE_AES 4533 #endif // HWY_TARGET != HWY_SCALAR 4534 4535 // ------------------------------ PopulationCount 4536 4537 #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE)) 4538 #ifdef HWY_NATIVE_POPCNT 4539 #undef HWY_NATIVE_POPCNT 4540 #else 4541 #define HWY_NATIVE_POPCNT 4542 #endif 4543 4544 template <class V, class D = DFromV<V>, HWY_IF_U8_D(D)> 4545 HWY_API V PopulationCount(V v) { 4546 const D d; 4547 4548 #if HWY_TARGET == HWY_SSE2 4549 // TableLookupBytes is slow on SSE2 4550 4551 // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3 4552 const V k33 = Set(d, uint8_t{0x33}); 4553 v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55}))); 4554 v = Add(And(ShiftRight<2>(v), k33), And(v, k33)); 4555 return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F})); 4556 #else // HWY_TARGET != HWY_SSE2 4557 4558 #if HWY_TARGET == HWY_RVV 4559 // Need at least LMUL=1 on RVV to ensure that Lanes(d_tbl) is at least 16 4560 const ScalableTag<uint8_t, HWY_MAX(HWY_POW2_D(D), 0)> d_tbl; 4561 #else 4562 const FixedTag<uint8_t, HWY_MAX(HWY_MAX_LANES_D(D), 16)> d_tbl; 4563 #endif 4564 4565 const auto lookup = Dup128VecFromValues(d_tbl, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 4566 2, 3, 2, 
3, 3, 4); 4567 const auto lo = And(v, Set(d, uint8_t{0xF})); 4568 const auto hi = ShiftRight<4>(v); 4569 4570 #if HWY_TARGET == HWY_RVV 4571 // On RVV, use TableLookupLanes to avoid unnecessary overhead 4572 const auto hi_popcnt = 4573 ResizeBitCast(d, TableLookupLanes(lookup, ResizeBitCast(d_tbl, hi))); 4574 const auto lo_popcnt = 4575 ResizeBitCast(d, TableLookupLanes(lookup, ResizeBitCast(d_tbl, lo))); 4576 #else // HWY_TARGET != HWY_RVV 4577 const auto hi_popcnt = TableLookupBytes(lookup, hi); 4578 const auto lo_popcnt = TableLookupBytes(lookup, lo); 4579 #endif // HWY_TARGET == HWY_RVV 4580 4581 return Add(hi_popcnt, lo_popcnt); 4582 #endif // HWY_TARGET == HWY_SSE2 4583 } 4584 4585 template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)> 4586 HWY_API V PopulationCount(V v) { 4587 const D d; 4588 const Repartition<uint8_t, decltype(d)> d8; 4589 const auto vals = BitCast(d, PopulationCount(BitCast(d8, v))); 4590 return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF}))); 4591 } 4592 4593 template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)> 4594 HWY_API V PopulationCount(V v) { 4595 const D d; 4596 Repartition<uint16_t, decltype(d)> d16; 4597 auto vals = BitCast(d, PopulationCount(BitCast(d16, v))); 4598 return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF}))); 4599 } 4600 4601 #if HWY_HAVE_INTEGER64 4602 template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)> 4603 HWY_API V PopulationCount(V v) { 4604 const D d; 4605 Repartition<uint32_t, decltype(d)> d32; 4606 auto vals = BitCast(d, PopulationCount(BitCast(d32, v))); 4607 return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL))); 4608 } 4609 #endif 4610 4611 #endif // HWY_NATIVE_POPCNT 4612 4613 // ------------------------------ 8-bit multiplication 4614 4615 #if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE 4616 #ifdef HWY_NATIVE_MUL_8 4617 #undef HWY_NATIVE_MUL_8 4618 #else 4619 #define HWY_NATIVE_MUL_8 4620 #endif 4621 4622 // 8 bit and fits in wider reg: promote 4623 template <class V, HWY_IF_T_SIZE_V(V, 1), 4624 HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)> 4625 HWY_API V operator*(const V a, const V b) { 4626 const DFromV<decltype(a)> d; 4627 const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw; 4628 const RebindToUnsigned<decltype(d)> du; // TruncateTo result 4629 const RebindToUnsigned<decltype(dw)> dwu; // TruncateTo input 4630 const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b); 4631 // TruncateTo is cheaper than ConcatEven. 
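// For example (illustrative): u8 lanes {200, 3} promote to u16 {200, 3} and
// multiply to {40000, 9}; truncating back to u8 yields {64, 9}, matching the
// modulo-256 wraparound of 8-bit multiplication (40000 % 256 == 64).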
4632 return BitCast(d, TruncateTo(du, BitCast(dwu, mul))); 4633 } 4634 4635 // 8 bit full reg: promote halves 4636 template <class V, HWY_IF_T_SIZE_V(V, 1), 4637 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)> 4638 HWY_API V operator*(const V a, const V b) { 4639 const DFromV<decltype(a)> d; 4640 const Half<decltype(d)> dh; 4641 const Twice<RepartitionToWide<decltype(dh)>> dw; 4642 const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a)); 4643 const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a)); 4644 const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b)); 4645 const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b)); 4646 const VFromD<decltype(dw)> m0 = a0 * b0; 4647 const VFromD<decltype(dw)> m1 = a1 * b1; 4648 return ConcatEven(d, BitCast(d, m1), BitCast(d, m0)); 4649 } 4650 4651 #endif // HWY_NATIVE_MUL_8 4652 4653 // ------------------------------ 64-bit multiplication 4654 4655 #if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE 4656 #ifdef HWY_NATIVE_MUL_64 4657 #undef HWY_NATIVE_MUL_64 4658 #else 4659 #define HWY_NATIVE_MUL_64 4660 #endif 4661 4662 // Single-lane i64 or u64 4663 template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8), 4664 HWY_IF_NOT_FLOAT_V(V)> 4665 HWY_API V operator*(V x, V y) { 4666 const DFromV<V> d; 4667 using T = TFromD<decltype(d)>; 4668 using TU = MakeUnsigned<T>; 4669 const TU xu = static_cast<TU>(GetLane(x)); 4670 const TU yu = static_cast<TU>(GetLane(y)); 4671 return Set(d, static_cast<T>(xu * yu)); 4672 } 4673 4674 template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64), 4675 HWY_IF_V_SIZE_GT_D(D64, 8)> 4676 HWY_API V operator*(V x, V y) { 4677 RepartitionToNarrow<D64> d32; 4678 auto x32 = BitCast(d32, x); 4679 auto y32 = BitCast(d32, y); 4680 auto lolo = BitCast(d32, MulEven(x32, y32)); 4681 auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y)))); 4682 auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32)); 4683 auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo))); 4684 return BitCast(D64{}, lolo + hi); 4685 } 4686 template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64), 4687 HWY_IF_V_SIZE_GT_D(DI64, 8)> 4688 HWY_API V operator*(V x, V y) { 4689 RebindToUnsigned<DI64> du64; 4690 return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y)); 4691 } 4692 4693 #endif // HWY_NATIVE_MUL_64 4694 4695 // ------------------------------ MulRound 4696 template <class V, HWY_IF_FLOAT_V(V)> 4697 HWY_API V MulRound(V a, V b) { 4698 return Round(Mul(a, b)); 4699 } 4700 4701 // ------------------------------ MulAdd / NegMulAdd 4702 4703 #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE)) 4704 #ifdef HWY_NATIVE_INT_FMA 4705 #undef HWY_NATIVE_INT_FMA 4706 #else 4707 #define HWY_NATIVE_INT_FMA 4708 #endif 4709 4710 #ifdef HWY_NATIVE_INT_FMSUB 4711 #undef HWY_NATIVE_INT_FMSUB 4712 #else 4713 #define HWY_NATIVE_INT_FMSUB 4714 #endif 4715 4716 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 4717 HWY_API V MulAdd(V mul, V x, V add) { 4718 return Add(Mul(mul, x), add); 4719 } 4720 4721 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 4722 HWY_API V NegMulAdd(V mul, V x, V add) { 4723 return Sub(add, Mul(mul, x)); 4724 } 4725 4726 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 4727 HWY_API V MulSub(V mul, V x, V sub) { 4728 return Sub(Mul(mul, x), sub); 4729 } 4730 #endif // HWY_NATIVE_INT_FMA 4731 // ------------------------------ MulComplex* / MaskedMulComplex* 4732 4733 #if (defined(HWY_NATIVE_CPLX) == defined(HWY_TARGET_TOGGLE)) 
4734 #ifdef HWY_NATIVE_CPLX 4735 #undef HWY_NATIVE_CPLX 4736 #else 4737 #define HWY_NATIVE_CPLX 4738 #endif 4739 4740 #if HWY_TARGET != HWY_SCALAR || HWY_IDE 4741 4742 template <class V, HWY_IF_NOT_UNSIGNED(TFromV<V>)> 4743 HWY_API V ComplexConj(V a) { 4744 return OddEven(Neg(a), a); 4745 } 4746 4747 template <class V> 4748 HWY_API V MulComplex(V a, V b) { 4749 // a = u + iv, b = x + iy 4750 const auto u = DupEven(a); 4751 const auto v = DupOdd(a); 4752 const auto x = DupEven(b); 4753 const auto y = DupOdd(b); 4754 4755 return OddEven(MulAdd(u, y, Mul(v, x)), Sub(Mul(u, x), Mul(v, y))); 4756 } 4757 4758 template <class V> 4759 HWY_API V MulComplexConj(V a, V b) { 4760 // a = u + iv, b = x + iy 4761 const auto u = DupEven(a); 4762 const auto v = DupOdd(a); 4763 const auto x = DupEven(b); 4764 const auto y = DupOdd(b); 4765 4766 return OddEven(Sub(Mul(v, x), Mul(u, y)), MulAdd(u, x, Mul(v, y))); 4767 } 4768 4769 template <class V> 4770 HWY_API V MulComplexAdd(V a, V b, V c) { 4771 return Add(MulComplex(a, b), c); 4772 } 4773 4774 template <class V> 4775 HWY_API V MulComplexConjAdd(V a, V b, V c) { 4776 return Add(MulComplexConj(a, b), c); 4777 } 4778 4779 template <class V, class M> 4780 HWY_API V MaskedMulComplexConjAdd(M mask, V a, V b, V c) { 4781 return IfThenElseZero(mask, MulComplexConjAdd(a, b, c)); 4782 } 4783 4784 template <class V, class M> 4785 HWY_API V MaskedMulComplexConj(M mask, V a, V b) { 4786 return IfThenElseZero(mask, MulComplexConj(a, b)); 4787 } 4788 4789 template <class V, class M> 4790 HWY_API V MaskedMulComplexOr(V no, M mask, V a, V b) { 4791 return IfThenElse(mask, MulComplex(a, b), no); 4792 } 4793 #endif // HWY_TARGET != HWY_SCALAR 4794 4795 #endif // HWY_NATIVE_CPLX 4796 4797 // ------------------------------ MaskedMulAddOr 4798 #if (defined(HWY_NATIVE_MASKED_INT_FMA) == defined(HWY_TARGET_TOGGLE)) 4799 #ifdef HWY_NATIVE_MASKED_INT_FMA 4800 #undef HWY_NATIVE_MASKED_INT_FMA 4801 #else 4802 #define HWY_NATIVE_MASKED_INT_FMA 4803 #endif 4804 4805 template <class V, class M> 4806 HWY_API V MaskedMulAddOr(V no, M m, V mul, V x, V add) { 4807 return IfThenElse(m, MulAdd(mul, x, add), no); 4808 } 4809 4810 #endif // HWY_NATIVE_MASKED_INT_FMA 4811 4812 // ------------------------------ Integer MulSub / NegMulSub 4813 #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE)) 4814 #ifdef HWY_NATIVE_INT_FMSUB 4815 #undef HWY_NATIVE_INT_FMSUB 4816 #else 4817 #define HWY_NATIVE_INT_FMSUB 4818 #endif 4819 4820 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 4821 HWY_API V MulSub(V mul, V x, V sub) { 4822 const DFromV<decltype(mul)> d; 4823 const RebindToSigned<decltype(d)> di; 4824 return MulAdd(mul, x, BitCast(d, Neg(BitCast(di, sub)))); 4825 } 4826 4827 #endif // HWY_NATIVE_INT_FMSUB 4828 4829 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 4830 HWY_API V NegMulSub(V mul, V x, V sub) { 4831 const DFromV<decltype(mul)> d; 4832 const RebindToSigned<decltype(d)> di; 4833 4834 return BitCast(d, Neg(BitCast(di, MulAdd(mul, x, sub)))); 4835 } 4836 4837 // ------------------------------ MulAddSub 4838 4839 // MulAddSub(mul, x, sub_or_add) for a 1-lane vector is equivalent to 4840 // MulSub(mul, x, sub_or_add) 4841 template <class V, HWY_IF_LANES_D(DFromV<V>, 1)> 4842 HWY_API V MulAddSub(V mul, V x, V sub_or_add) { 4843 return MulSub(mul, x, sub_or_add); 4844 } 4845 4846 // MulAddSub for F16/F32/F64 vectors with 2 or more lanes on 4847 // SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and 4848 // x86_512-inl.h 4849 4850 // MulAddSub for 
F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h 4851 4852 // MulAddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h 4853 template <class V, HWY_IF_MULADDSUB_V(V)> 4854 HWY_API V MulAddSub(V mul, V x, V sub_or_add) { 4855 using D = DFromV<V>; 4856 using T = TFromD<D>; 4857 using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>; 4858 4859 const D d; 4860 const Rebind<TNegate, D> d_negate; 4861 4862 const auto add = 4863 OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add)))); 4864 return MulAdd(mul, x, add); 4865 } 4866 // ------------------------------ MulSubAdd 4867 4868 template <class V> 4869 HWY_API V MulSubAdd(V mul, V x, V sub_or_add) { 4870 using D = DFromV<V>; 4871 using T = TFromD<D>; 4872 using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>; 4873 4874 const D d; 4875 const Rebind<TNegate, D> d_negate; 4876 4877 return MulAddSub(mul, x, BitCast(d, Neg(BitCast(d_negate, sub_or_add)))); 4878 } 4879 4880 // ------------------------------ MaskedConvertTo 4881 template <class D, class V, class M> 4882 HWY_API VFromD<D> MaskedConvertTo(M m, D d, V v) { 4883 return IfThenElseZero(m, ConvertTo(d, v)); 4884 } 4885 4886 // ------------------------------ Integer division 4887 #if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE)) 4888 #ifdef HWY_NATIVE_INT_DIV 4889 #undef HWY_NATIVE_INT_DIV 4890 #else 4891 #define HWY_NATIVE_INT_DIV 4892 #endif 4893 4894 namespace detail { 4895 4896 // DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo are okay to use in 4897 // the implementation of detail::IntDiv in generic_ops-inl.h as the current 4898 // implementations of DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo 4899 // will convert values that are outside of the range of TFromD<DI> by either 4900 // saturation, truncation, or converting values that are outside of the 4901 // destination range to LimitsMin<TFromD<DI>>() (which is equal to 4902 // static_cast<TFromD<DI>>(LimitsMax<TFromD<DI>>() + 1)) 4903 4904 template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))> 4905 HWY_INLINE Vec<D> IntDivConvFloatToInt(D di, V vf) { 4906 return ConvertInRangeTo(di, vf); 4907 } 4908 4909 template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))> 4910 HWY_INLINE Vec<D> IntDivConvIntToFloat(D df, V vi) { 4911 return ConvertTo(df, vi); 4912 } 4913 4914 #if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 4915 template <class D, class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)> 4916 HWY_INLINE Vec<D> IntDivConvFloatToInt(D df, V vi) { 4917 return PromoteInRangeTo(df, vi); 4918 } 4919 4920 // If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32 4921 // IntDivConvIntToFloat(df, vi) returns an approximation of 4922 // static_cast<float>(v[i]) that is within 4 ULP of static_cast<float>(v[i]) 4923 template <class D, class V, HWY_IF_F32_D(D), HWY_IF_I64(TFromV<V>)> 4924 HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vi) { 4925 const Twice<decltype(df32)> dt_f32; 4926 4927 auto vf32 = 4928 ConvertTo(dt_f32, BitCast(RebindToSigned<decltype(dt_f32)>(), vi)); 4929 4930 #if HWY_IS_LITTLE_ENDIAN 4931 const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); 4932 auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); 4933 #else 4934 const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); 4935 auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); 4936 #endif 4937 4938 const RebindToSigned<decltype(df32)> di32; 4939 4940 hi_f32 = 4941 Add(hi_f32, And(BitCast(df32, BroadcastSignBit(BitCast(di32, lo_f32))), 
4942 Set(df32, 1.0f))); 4943 return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32); 4944 } 4945 4946 template <class D, class V, HWY_IF_F32_D(D), HWY_IF_U64(TFromV<V>)> 4947 HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vu) { 4948 const Twice<decltype(df32)> dt_f32; 4949 4950 auto vf32 = 4951 ConvertTo(dt_f32, BitCast(RebindToUnsigned<decltype(dt_f32)>(), vu)); 4952 4953 #if HWY_IS_LITTLE_ENDIAN 4954 const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); 4955 const auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); 4956 #else 4957 const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); 4958 const auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); 4959 #endif 4960 4961 return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32); 4962 } 4963 #endif // !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 4964 4965 template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), 4966 HWY_IF_T_SIZE_GT(TFromV<V>, kOrigLaneSize)> 4967 HWY_INLINE V IntDivUsingFloatDiv(V a, V b) { 4968 const DFromV<decltype(a)> d; 4969 const RebindToFloat<decltype(d)> df; 4970 4971 // If kOrigLaneSize < sizeof(T) is true, then a[i] and b[i] are both in the 4972 // [LimitsMin<SignedFromSize<kOrigLaneSize>>(), 4973 // LimitsMax<UnsignedFromSize<kOrigLaneSize>>()] range. 4974 4975 // floor(|a[i] / b[i]|) <= |flt_q| < floor(|a[i] / b[i]|) + 1 is also 4976 // guaranteed to be true if MakeFloat<T> has at least kOrigLaneSize*8 + 1 4977 // mantissa bits (including the implied one bit), where flt_q is equal to 4978 // static_cast<MakeFloat<T>>(a[i]) / static_cast<MakeFloat<T>>(b[i]), 4979 // even in the case where the magnitude of an inexact floating point division 4980 // result is rounded up. 4981 4982 // In other words, floor(flt_q) < flt_q < ceil(flt_q) is guaranteed to be true 4983 // if (a[i] % b[i]) != 0 is true and MakeFloat<T> has at least 4984 // kOrigLaneSize*8 + 1 mantissa bits (including the implied one bit), even in 4985 // the case where the magnitude of an inexact floating point division result 4986 // is rounded up. 4987 4988 // It is okay to do conversions from MakeFloat<TFromV<V>> to TFromV<V> using 4989 // ConvertInRangeTo if sizeof(TFromV<V>) > kOrigLaneSize as the result of the 4990 // floating point division is always greater than LimitsMin<TFromV<V>>() and 4991 // less than LimitsMax<TFromV<V>>() if sizeof(TFromV<V>) > kOrigLaneSize and 4992 // b[i] != 0. 4993 4994 #if HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64 4995 // On Armv7, do division by multiplying by the ApproximateReciprocal 4996 // to avoid unnecessary overhead as F32 Div refines the approximate 4997 // reciprocal using 4 Newton-Raphson iterations 4998 4999 const RebindToSigned<decltype(d)> di; 5000 const RebindToUnsigned<decltype(d)> du; 5001 5002 const auto flt_b = ConvertTo(df, b); 5003 auto flt_recip_b = ApproximateReciprocal(flt_b); 5004 if (kOrigLaneSize > 1) { 5005 flt_recip_b = 5006 Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b)); 5007 } 5008 5009 auto q0 = ConvertInRangeTo(d, Mul(ConvertTo(df, a), flt_recip_b)); 5010 const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a)); 5011 5012 auto r1 = r0; 5013 5014 // Need to negate r1[i] if a[i] < 0 is true 5015 if (IsSigned<TFromV<V>>()) { 5016 r1 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r1); 5017 } 5018 5019 // r1[i] is now equal to (a[i] < 0) ? 
(-r0[i]) : r0[i] 5020 5021 auto abs_b = BitCast(du, b); 5022 if (IsSigned<TFromV<V>>()) { 5023 abs_b = BitCast(du, Abs(BitCast(di, abs_b))); 5024 } 5025 5026 // If (r1[i] < 0 || r1[i] >= abs_b[i]) is true, then set q1[i] to -1. 5027 // Otherwise, set q1[i] to 0. 5028 5029 // (r1[i] < 0 || r1[i] >= abs_b[i]) can be carried out using a single unsigned 5030 // comparison as static_cast<TU>(r1[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i] 5031 // will be true if r1[i] < 0 is true. 5032 auto q1 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r1), abs_b))); 5033 5034 // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? -1 : 0 5035 5036 // Need to negate q1[i] if r0[i] and b[i] do not have the same sign 5037 auto q1_negate_mask = r0; 5038 if (IsSigned<TFromV<V>>()) { 5039 q1_negate_mask = Xor(q1_negate_mask, BitCast(di, b)); 5040 } 5041 q1 = IfNegativeThenElse(q1_negate_mask, Neg(q1), q1); 5042 5043 // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? 5044 // (((r0[i] ^ b[i]) < 0) ? 1 : -1) 5045 5046 // Need to subtract q1[i] from q0[i] to get the final result 5047 return Sub(q0, BitCast(d, q1)); 5048 #else 5049 // On targets other than Armv7 NEON, use F16 or F32 division as most targets 5050 // other than Armv7 NEON have native F32 divide instructions 5051 return ConvertInRangeTo(d, Div(ConvertTo(df, a), ConvertTo(df, b))); 5052 #endif 5053 } 5054 5055 template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), 5056 HWY_IF_T_SIZE(TFromV<V>, kOrigLaneSize), 5057 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))> 5058 HWY_INLINE V IntDivUsingFloatDiv(V a, V b) { 5059 // If kOrigLaneSize == sizeof(T) is true, at least two reciprocal 5060 // multiplication steps are needed as the mantissa of MakeFloat<T> has fewer 5061 // than kOrigLaneSize*8 + 1 bits 5062 5063 using T = TFromV<V>; 5064 5065 #if HWY_HAVE_FLOAT64 5066 using TF = MakeFloat<T>; 5067 #else 5068 using TF = float; 5069 #endif 5070 5071 const DFromV<decltype(a)> d; 5072 const RebindToSigned<decltype(d)> di; 5073 const RebindToUnsigned<decltype(d)> du; 5074 const Rebind<TF, decltype(d)> df; 5075 5076 if (!IsSigned<T>()) { 5077 // If T is unsigned, set a[i] to (a[i] >= b[i] ? 1 : 0) and set b[i] to 1 if 5078 // b[i] > LimitsMax<MakeSigned<T>>() is true 5079 5080 const auto one = Set(di, MakeSigned<T>{1}); 5081 a = BitCast( 5082 d, IfNegativeThenElse(BitCast(di, b), 5083 IfThenElseZero(RebindMask(di, Ge(a, b)), one), 5084 BitCast(di, a))); 5085 b = BitCast(d, IfNegativeThenElse(BitCast(di, b), one, BitCast(di, b))); 5086 } 5087 5088 // LimitsMin<T>() <= b[i] <= LimitsMax<MakeSigned<T>>() is now true 5089 5090 const auto flt_b = IntDivConvIntToFloat(df, b); 5091 5092 #if HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64 5093 auto flt_recip_b = ApproximateReciprocal(flt_b); 5094 flt_recip_b = 5095 Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b)); 5096 #else 5097 const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b); 5098 #endif 5099 5100 // It is okay if the conversion of a[i] * flt_recip_b[i] to T using 5101 // IntDivConvFloatToInt returns incorrect results in any lanes where b[i] == 0 5102 // as the result of IntDivUsingFloatDiv(a, b) is implementation-defined in any 5103 // lanes where b[i] == 0. 5104 5105 // If ScalarAbs(b[i]) == 1 is true, then it is possible for 5106 // a[i] * flt_recip_b[i] to be rounded up to a value that is outside of the 5107 // range of T. 
If a[i] * flt_recip_b[i] is outside of the range of T,
5108 // IntDivConvFloatToInt will convert any values that are out of the range of T
5109 // by either saturation, truncation, or wrapping around to LimitsMin<T>().
5110
5111 // It is okay if the conversion of a[i] * flt_recip_b[i] to T using
5112 // IntDivConvFloatToInt wraps around if ScalarAbs(b[i]) == 1 as r0 will have
5113 // the correct sign if ScalarAbs(b[i]) == 1, even in the cases where the
5114 // conversion of a[i] * flt_recip_b[i] to T using IntDivConvFloatToInt is
5115 // truncated or wraps around.
5116
5117 // If ScalarAbs(b[i]) >= 2 is true, a[i] * flt_recip_b[i] will be within the
5118 // range of T, even in the cases where the conversion of a[i] to TF is
5119 // rounded up or the result of multiplying a[i] by flt_recip_b[i] is rounded
5120 // up.
5121
5122 // ScalarAbs(r0[i]) will also always be less than (LimitsMax<T>() / 2) if
5123 // b[i] != 0, even in the cases where the conversion of a[i] * flt_recip_b[i]
5124 // to T using IntDivConvFloatToInt is truncated or is wrapped around.
5125
5126 auto q0 =
5127 IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b));
5128 const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
5129
5130 // If b[i] != 0 is true, r0[i] * flt_recip_b[i] is always within the range of
5131 // T, even in the cases where the conversion of r0[i] to TF is rounded up or
5132 // the multiplication of r0[i] by flt_recip_b[i] is rounded up.
5133
5134 auto q1 =
5135 IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b));
5136 const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0);
5137
5138 auto r3 = r1;
5139
5140 #if !HWY_HAVE_FLOAT64
5141 // Need two additional reciprocal multiplication steps for I64/U64 vectors if
5142 // HWY_HAVE_FLOAT64 is 0
5143 if (sizeof(T) == 8) {
5144 const auto q2 = IntDivConvFloatToInt(
5145 di, Mul(IntDivConvIntToFloat(df, r1), flt_recip_b));
5146 const auto r2 = hwy::HWY_NAMESPACE::NegMulAdd(q2, BitCast(di, b), r1);
5147
5148 const auto q3 = IntDivConvFloatToInt(
5149 di, Mul(IntDivConvIntToFloat(df, r2), flt_recip_b));
5150 r3 = hwy::HWY_NAMESPACE::NegMulAdd(q3, BitCast(di, b), r2);
5151
5152 q0 = Add(q0, BitCast(d, q2));
5153 q1 = Add(q1, q3);
5154 }
5155 #endif // !HWY_HAVE_FLOAT64
5156
5157 auto r4 = r3;
5158
5159 // Need to negate r4[i] if a[i] < 0 is true
5160 if (IsSigned<TFromV<V>>()) {
5161 r4 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r4);
5162 }
5163
5164 // r4[i] is now equal to (a[i] < 0) ? (-r3[i]) : r3[i]
5165
5166 auto abs_b = BitCast(du, b);
5167 if (IsSigned<TFromV<V>>()) {
5168 abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
5169 }
5170
5171 // If (r4[i] < 0 || r4[i] >= abs_b[i]) is true, then set q4[i] to -1.
5172 // Otherwise, set q4[i] to 0.
5173
5174 // (r4[i] < 0 || r4[i] >= abs_b[i]) can be carried out using a single unsigned
5175 // comparison as static_cast<TU>(r4[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
5176 // will be true if r4[i] < 0 is true.
5177 auto q4 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r4), abs_b)));
5178
5179 // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? -1 : 0
5180
5181 // Need to negate q4[i] if r3[i] and b[i] do not have the same sign
5182 auto q4_negate_mask = r3;
5183 if (IsSigned<TFromV<V>>()) {
5184 q4_negate_mask = Xor(q4_negate_mask, BitCast(di, b));
5185 }
5186 q4 = IfNegativeThenElse(q4_negate_mask, Neg(q4), q4);
5187
5188 // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ?
5189 // (((r3[i] ^ b[i]) < 0) ?
1 : -1) 5190 5191 // The final result is equal to q0[i] + q1[i] - q4[i] 5192 return Sub(Add(q0, BitCast(d, q1)), BitCast(d, q4)); 5193 } 5194 5195 template <size_t kOrigLaneSize, class V, 5196 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)), 5197 HWY_IF_V_SIZE_LE_V( 5198 V, HWY_MAX_BYTES / 5199 ((!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1) ? 4 : 2))> 5200 HWY_INLINE V IntDiv(V a, V b) { 5201 using T = TFromV<V>; 5202 5203 // If HWY_HAVE_FLOAT16 is 0, need to promote I8 to I32 and U8 to U32 5204 using TW = MakeWide< 5205 If<(!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1), MakeWide<T>, T>>; 5206 5207 const DFromV<decltype(a)> d; 5208 const Rebind<TW, decltype(d)> dw; 5209 5210 #if HWY_TARGET <= HWY_SSE2 5211 // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid 5212 // unnecessary overhead 5213 const RebindToSigned<decltype(dw)> dw_i; 5214 5215 // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<T> if 5216 // kOrigLaneSize < sizeof(T) to avoid unnecessary overhead 5217 const If<(kOrigLaneSize < sizeof(T)), RebindToSigned<decltype(d)>, 5218 decltype(d)> 5219 d_demote_to; 5220 #else 5221 // On other targets, promote to TW and demote to T 5222 const decltype(dw) dw_i; 5223 const decltype(d) d_demote_to; 5224 #endif 5225 5226 return BitCast( 5227 d, DemoteTo(d_demote_to, IntDivUsingFloatDiv<kOrigLaneSize>( 5228 PromoteTo(dw_i, a), PromoteTo(dw_i, b)))); 5229 } 5230 5231 template <size_t kOrigLaneSize, class V, 5232 HWY_IF_T_SIZE_ONE_OF_V(V, 5233 (HWY_HAVE_FLOAT16 ? (1 << 1) : 0) | (1 << 2)), 5234 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)> 5235 HWY_INLINE V IntDiv(V a, V b) { 5236 const DFromV<decltype(a)> d; 5237 const RepartitionToWide<decltype(d)> dw; 5238 5239 #if HWY_TARGET <= HWY_SSE2 5240 // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid 5241 // unnecessary overhead 5242 const RebindToSigned<decltype(dw)> dw_i; 5243 5244 // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<TFromV<V>> if 5245 // kOrigLaneSize < sizeof(TFromV<V>) to avoid unnecessary overhead 5246 const If<(kOrigLaneSize < sizeof(TFromV<V>)), RebindToSigned<decltype(d)>, 5247 decltype(d)> 5248 d_demote_to; 5249 #else 5250 // On other targets, promote to MakeWide<TFromV<V>> and demote to TFromV<V> 5251 const decltype(dw) dw_i; 5252 const decltype(d) d_demote_to; 5253 #endif 5254 5255 return BitCast(d, OrderedDemote2To( 5256 d_demote_to, 5257 IntDivUsingFloatDiv<kOrigLaneSize>( 5258 PromoteLowerTo(dw_i, a), PromoteLowerTo(dw_i, b)), 5259 IntDivUsingFloatDiv<kOrigLaneSize>( 5260 PromoteUpperTo(dw_i, a), PromoteUpperTo(dw_i, b)))); 5261 } 5262 5263 #if !HWY_HAVE_FLOAT16 5264 template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>), 5265 HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)> 5266 HWY_INLINE V IntDiv(V a, V b) { 5267 const DFromV<decltype(a)> d; 5268 const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw; 5269 5270 #if HWY_TARGET <= HWY_SSE2 5271 // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary 5272 // overhead 5273 const RebindToSigned<decltype(dw)> dw_i; 5274 #else 5275 // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V> 5276 const decltype(dw) dw_i; 5277 #endif 5278 5279 return DemoteTo(d, 5280 BitCast(dw_i, IntDiv<1>(PromoteTo(dw, a), PromoteTo(dw, b)))); 5281 } 5282 template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>), 5283 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)> 5284 HWY_INLINE V IntDiv(V a, V b) { 5285 const DFromV<decltype(a)> d; 5286 const RepartitionToWide<decltype(d)> dw; 5287 5288 #if HWY_TARGET <= HWY_SSE2 5289 // On 
SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary 5290 // overhead 5291 const RebindToSigned<decltype(dw)> dw_i; 5292 #else 5293 // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V> 5294 const decltype(dw) dw_i; 5295 #endif 5296 5297 return OrderedDemote2To( 5298 d, BitCast(dw_i, IntDiv<1>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b))), 5299 BitCast(dw_i, IntDiv<1>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)))); 5300 } 5301 #endif // !HWY_HAVE_FLOAT16 5302 5303 template <size_t kOrigLaneSize, class V, 5304 HWY_IF_T_SIZE_ONE_OF_V(V, 5305 (HWY_HAVE_FLOAT64 ? 0 : (1 << 4)) | (1 << 8))> 5306 HWY_INLINE V IntDiv(V a, V b) { 5307 return IntDivUsingFloatDiv<kOrigLaneSize>(a, b); 5308 } 5309 5310 #if HWY_HAVE_FLOAT64 5311 template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>), 5312 HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)> 5313 HWY_INLINE V IntDiv(V a, V b) { 5314 const DFromV<decltype(a)> d; 5315 const Rebind<double, decltype(d)> df64; 5316 5317 // It is okay to demote the F64 Div result to int32_t or uint32_t using 5318 // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i]) 5319 // will always be within the range of TFromV<V> if b[i] != 0 and 5320 // sizeof(TFromV<V>) <= 4. 5321 5322 return DemoteInRangeTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b))); 5323 } 5324 template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>), 5325 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)> 5326 HWY_INLINE V IntDiv(V a, V b) { 5327 const DFromV<decltype(a)> d; 5328 const Half<decltype(d)> dh; 5329 const Repartition<double, decltype(d)> df64; 5330 5331 // It is okay to demote the F64 Div result to int32_t or uint32_t using 5332 // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i]) 5333 // will always be within the range of TFromV<V> if b[i] != 0 and 5334 // sizeof(TFromV<V>) <= 4. 5335 5336 const VFromD<decltype(df64)> div1 = 5337 Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b)); 5338 const VFromD<decltype(df64)> div0 = 5339 Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b)); 5340 return Combine(d, DemoteInRangeTo(dh, div1), DemoteInRangeTo(dh, div0)); 5341 } 5342 #endif // HWY_HAVE_FLOAT64 5343 5344 template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), 5345 HWY_IF_T_SIZE_ONE_OF_V(V, ((HWY_TARGET <= HWY_SSE2 || 5346 HWY_TARGET == HWY_WASM || 5347 HWY_TARGET == HWY_WASM_EMU256 || 5348 HWY_TARGET == HWY_LSX || 5349 HWY_TARGET == HWY_LASX) 5350 ? 
0
5351 : (1 << 1)) |
5352 (1 << 2) | (1 << 4) | (1 << 8))>
5353 HWY_INLINE V IntMod(V a, V b) {
5354 return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv<kOrigLaneSize>(a, b), b, a);
5355 }
5356
5357 #if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
5358 HWY_TARGET == HWY_WASM_EMU256 || HWY_TARGET == HWY_LSX || \
5359 HWY_TARGET == HWY_LASX
5360 template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
5361 HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
5362 HWY_INLINE V IntMod(V a, V b) {
5363 const DFromV<decltype(a)> d;
5364 const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
5365 return DemoteTo(d, IntMod<kOrigLaneSize>(PromoteTo(dw, a), PromoteTo(dw, b)));
5366 }
5367
5368 template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
5369 HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
5370 HWY_INLINE V IntMod(V a, V b) {
5371 const DFromV<decltype(a)> d;
5372 const RepartitionToWide<decltype(d)> dw;
5373 return OrderedDemote2To(
5374 d, IntMod<kOrigLaneSize>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b)),
5375 IntMod<kOrigLaneSize>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)));
5376 }
5377 #endif // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET ==
5378 // HWY_WASM_EMU256 || HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX
5379
5380 } // namespace detail
5381
5382 #if HWY_TARGET == HWY_SCALAR
5383
5384 template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5385 HWY_API Vec1<T> operator/(Vec1<T> a, Vec1<T> b) {
5386 return detail::IntDiv<sizeof(T)>(a, b);
5387 }
5388 template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5389 HWY_API Vec1<T> operator%(Vec1<T> a, Vec1<T> b) {
5390 return detail::IntMod<sizeof(T)>(a, b);
5391 }
5392
5393 #else // HWY_TARGET != HWY_SCALAR
5394
5395 template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5396 HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
5397 return detail::IntDiv<sizeof(T)>(a, b);
5398 }
5399
5400 template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5401 HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
5402 return detail::IntMod<sizeof(T)>(a, b);
5403 }
5404
5405 #if HWY_CAP_GE256
5406 template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5407 HWY_API Vec256<T> operator/(Vec256<T> a, Vec256<T> b) {
5408 return detail::IntDiv<sizeof(T)>(a, b);
5409 }
5410 template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5411 HWY_API Vec256<T> operator%(Vec256<T> a, Vec256<T> b) {
5412 return detail::IntMod<sizeof(T)>(a, b);
5413 }
5414 #endif
5415
5416 #if HWY_CAP_GE512
5417 template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5418 HWY_API Vec512<T> operator/(Vec512<T> a, Vec512<T> b) {
5419 return detail::IntDiv<sizeof(T)>(a, b);
5420 }
5421 template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
5422 HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
5423 return detail::IntMod<sizeof(T)>(a, b);
5424 }
5425 #endif
5426
5427 #endif // HWY_TARGET == HWY_SCALAR
5428
5429 #endif // HWY_NATIVE_INT_DIV
5430
5431 // ------------------------------ AverageRound
5432
5433 #if (defined(HWY_NATIVE_AVERAGE_ROUND_UI32) == defined(HWY_TARGET_TOGGLE))
5434 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
5435 #undef HWY_NATIVE_AVERAGE_ROUND_UI32
5436 #else
5437 #define HWY_NATIVE_AVERAGE_ROUND_UI32
5438 #endif
5439
// Overflow-free rounded average: a + b == (a | b) + (a & b) and
// a | b == (a & b) + (a ^ b), hence ceil((a + b) / 2) == (a | b) - ((a ^ b) >> 1).
5440 template <class V, HWY_IF_UI32(TFromV<V>)>
5441 HWY_API V AverageRound(V a, V b) {
5442 return Sub(Or(a, b), ShiftRight<1>(Xor(a, b)));
5443 }
5444
5445 #endif // HWY_NATIVE_AVERAGE_ROUND_UI32
5446
5447 #if (defined(HWY_NATIVE_AVERAGE_ROUND_UI64) == defined(HWY_TARGET_TOGGLE))
5448 #ifdef
HWY_NATIVE_AVERAGE_ROUND_UI64 5449 #undef HWY_NATIVE_AVERAGE_ROUND_UI64 5450 #else 5451 #define HWY_NATIVE_AVERAGE_ROUND_UI64 5452 #endif 5453 5454 #if HWY_HAVE_INTEGER64 5455 template <class V, HWY_IF_UI64(TFromV<V>)> 5456 HWY_API V AverageRound(V a, V b) { 5457 return Sub(Or(a, b), ShiftRight<1>(Xor(a, b))); 5458 } 5459 #endif 5460 5461 #endif // HWY_NATIVE_AVERAGE_ROUND_UI64 5462 5463 // ------------------------------ RoundingShiftRight (AverageRound) 5464 5465 #if (defined(HWY_NATIVE_ROUNDING_SHR) == defined(HWY_TARGET_TOGGLE)) 5466 #ifdef HWY_NATIVE_ROUNDING_SHR 5467 #undef HWY_NATIVE_ROUNDING_SHR 5468 #else 5469 #define HWY_NATIVE_ROUNDING_SHR 5470 #endif 5471 5472 template <int kShiftAmt, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 5473 HWY_API V RoundingShiftRight(V v) { 5474 const DFromV<V> d; 5475 using T = TFromD<decltype(d)>; 5476 5477 static_assert( 5478 0 <= kShiftAmt && kShiftAmt <= static_cast<int>(sizeof(T) * 8 - 1), 5479 "kShiftAmt is out of range"); 5480 5481 constexpr int kScaleDownShrAmt = HWY_MAX(kShiftAmt - 1, 0); 5482 5483 auto scaled_down_v = v; 5484 HWY_IF_CONSTEXPR(kScaleDownShrAmt > 0) { 5485 scaled_down_v = ShiftRight<kScaleDownShrAmt>(v); 5486 } 5487 5488 HWY_IF_CONSTEXPR(kShiftAmt == 0) { return scaled_down_v; } 5489 5490 return AverageRound(scaled_down_v, Zero(d)); 5491 } 5492 5493 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 5494 HWY_API V RoundingShiftRightSame(V v, int shift_amt) { 5495 const DFromV<V> d; 5496 5497 const bool shift_amt_is_zero = (shift_amt == 0); 5498 const auto scaled_down_v = ShiftRightSame( 5499 v, static_cast<int>(static_cast<unsigned>(shift_amt) + 5500 static_cast<unsigned>(shift_amt_is_zero) - 1u)); 5501 5502 return AverageRound( 5503 scaled_down_v, 5504 IfThenElseZero(SetMask(d, shift_amt_is_zero), scaled_down_v)); 5505 } 5506 5507 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 5508 HWY_API V RoundingShr(V v, V amt) { 5509 const DFromV<V> d; 5510 const RebindToUnsigned<decltype(d)> du; 5511 using T = TFromD<decltype(d)>; 5512 using TU = MakeUnsigned<T>; 5513 5514 const auto unsigned_amt = BitCast(du, amt); 5515 const auto scale_down_shr_amt = 5516 BitCast(d, SaturatedSub(unsigned_amt, Set(du, TU{1}))); 5517 5518 const auto scaled_down_v = Shr(v, scale_down_shr_amt); 5519 return AverageRound(scaled_down_v, 5520 IfThenElseZero(Eq(amt, Zero(d)), scaled_down_v)); 5521 } 5522 5523 #endif // HWY_NATIVE_ROUNDING_SHR 5524 5525 // ------------------------------ MulEvenAdd (PromoteEvenTo) 5526 5527 // SVE with bf16 and NEON with bf16 override this. 5528 #if (defined(HWY_NATIVE_MUL_EVEN_BF16) == defined(HWY_TARGET_TOGGLE)) 5529 #ifdef HWY_NATIVE_MUL_EVEN_BF16 5530 #undef HWY_NATIVE_MUL_EVEN_BF16 5531 #else 5532 #define HWY_NATIVE_MUL_EVEN_BF16 5533 #endif 5534 5535 template <class DF, HWY_IF_F32_D(DF), 5536 class VBF = VFromD<Repartition<bfloat16_t, DF>>> 5537 HWY_API VFromD<DF> MulEvenAdd(DF df, VBF a, VBF b, VFromD<DF> c) { 5538 return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), c); 5539 } 5540 5541 template <class DF, HWY_IF_F32_D(DF), 5542 class VBF = VFromD<Repartition<bfloat16_t, DF>>> 5543 HWY_API VFromD<DF> MulOddAdd(DF df, VBF a, VBF b, VFromD<DF> c) { 5544 return MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), c); 5545 } 5546 5547 #endif // HWY_NATIVE_MUL_EVEN_BF16 5548 5549 // ------------------------------ ReorderWidenMulAccumulate (MulEvenAdd) 5550 5551 // AVX3_SPR/ZEN4, and NEON with bf16 but not(!) SVE override this. 
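//
// Example usage (a minimal sketch; the bf16 input vectors a and b, of type
// Vec<Repartition<bfloat16_t, decltype(df)>>, are assumed to be loaded by
// the caller):
//   const ScalableTag<float> df;
//   auto sum0 = Zero(df);
//   auto sum1 = Zero(df);
//   for (/* each pair of bf16 input vectors a, b */) {
//     sum0 = ReorderWidenMulAccumulate(df, a, b, sum0, sum1);
//   }
//   // Lane order within sum0/sum1 is undefined, so add them before reducing.
//   const float dot = ReduceSum(df, Add(sum0, sum1));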
5552 #if (defined(HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16) == \ 5553 defined(HWY_TARGET_TOGGLE)) 5554 #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 5555 #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 5556 #else 5557 #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 5558 #endif 5559 5560 template <class DF, HWY_IF_F32_D(DF), 5561 class VBF = VFromD<Repartition<bfloat16_t, DF>>> 5562 HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b, 5563 VFromD<DF> sum0, 5564 VFromD<DF>& sum1) { 5565 // Lane order within sum0/1 is undefined, hence we can avoid the 5566 // longer-latency lane-crossing PromoteTo by using PromoteEvenTo. 5567 sum1 = MulOddAdd(df, a, b, sum1); 5568 return MulEvenAdd(df, a, b, sum0); 5569 } 5570 5571 #endif // HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16 5572 5573 // ------------------------------ WidenMulAccumulate 5574 5575 #if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE) == defined(HWY_TARGET_TOGGLE)) 5576 #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE 5577 #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE 5578 #else 5579 #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE 5580 #endif 5581 5582 template<class D, HWY_IF_INTEGER(TFromD<D>), 5583 class DN = RepartitionToNarrow<D>> 5584 HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x, 5585 VFromD<D> low, VFromD<D>& high) { 5586 high = MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), high); 5587 return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low); 5588 } 5589 5590 #endif // HWY_NATIVE_WIDEN_MUL_ACCUMULATE 5591 5592 #if 0 5593 #if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16) == defined(HWY_TARGET_TOGGLE)) 5594 5595 #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16 5596 #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16 5597 #else 5598 #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16 5599 #endif 5600 5601 #if HWY_HAVE_FLOAT16 5602 5603 template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>> 5604 HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x, 5605 VFromD<D> low, VFromD<D>& high) { 5606 high = MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), high); 5607 return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low); 5608 } 5609 5610 #endif // HWY_HAVE_FLOAT16 5611 5612 #endif // HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16 5613 #endif // #if 0 5614 5615 // ------------------------------ SatWidenMulPairwiseAdd 5616 5617 #if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \ 5618 defined(HWY_TARGET_TOGGLE)) 5619 5620 #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD 5621 #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD 5622 #else 5623 #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD 5624 #endif 5625 5626 template <class DI16, class VU8, class VI8, 5627 class VU8_2 = Vec<Repartition<uint8_t, DI16>>, HWY_IF_I16_D(DI16), 5628 HWY_IF_U8_D(DFromV<VU8>), HWY_IF_I8_D(DFromV<VI8>), 5629 HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VI8)), 5630 HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VU8_2))> 5631 HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) { 5632 const RebindToUnsigned<decltype(di16)> du16; 5633 5634 const auto a0 = BitCast(di16, PromoteEvenTo(du16, a)); 5635 const auto b0 = PromoteEvenTo(di16, b); 5636 5637 const auto a1 = BitCast(di16, PromoteOddTo(du16, a)); 5638 const auto b1 = PromoteOddTo(di16, b); 5639 5640 return SaturatedAdd(Mul(a0, b0), Mul(a1, b1)); 5641 } 5642 5643 #endif 5644 5645 // ------------------------------ SatWidenMulPairwiseAccumulate 5646 5647 #if (defined(HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM) == \ 5648 defined(HWY_TARGET_TOGGLE)) 5649 5650 #ifdef 
HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM 5651 #undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM 5652 #else 5653 #define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM 5654 #endif 5655 5656 template <class DI32, HWY_IF_I32_D(DI32)> 5657 HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate( 5658 DI32 di32, VFromD<Repartition<int16_t, DI32>> a, 5659 VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) { 5660 // WidenMulPairwiseAdd(di32, a, b) is okay here as 5661 // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as 5662 // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if 5663 // a[0], b[0], a[1], and b[1] are all equal to -32768. 5664 5665 const auto product = WidenMulPairwiseAdd(di32, a, b); 5666 5667 const auto mul_overflow = 5668 VecFromMask(di32, Eq(product, Set(di32, LimitsMin<int32_t>()))); 5669 5670 return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)), 5671 Add(product, mul_overflow)); 5672 } 5673 5674 #endif // HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM 5675 5676 // ------------------------------ SatWidenMulAccumFixedPoint 5677 5678 #if (defined(HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT) == \ 5679 defined(HWY_TARGET_TOGGLE)) 5680 5681 #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT 5682 #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT 5683 #else 5684 #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT 5685 #endif 5686 5687 template <class DI32, HWY_IF_I32_D(DI32)> 5688 HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32, 5689 VFromD<Rebind<int16_t, DI32>> a, 5690 VFromD<Rebind<int16_t, DI32>> b, 5691 VFromD<DI32> sum) { 5692 const Repartition<int16_t, DI32> dt_i16; 5693 5694 const auto vt_a = ResizeBitCast(dt_i16, a); 5695 const auto vt_b = ResizeBitCast(dt_i16, b); 5696 5697 const auto dup_a = InterleaveWholeLower(dt_i16, vt_a, vt_a); 5698 const auto dup_b = InterleaveWholeLower(dt_i16, vt_b, vt_b); 5699 5700 return SatWidenMulPairwiseAccumulate(di32, dup_a, dup_b, sum); 5701 } 5702 5703 #endif // HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT 5704 5705 // ------------------------------ MaskedSqrt 5706 5707 #if (defined(HWY_NATIVE_MASKED_SQRT) == defined(HWY_TARGET_TOGGLE)) 5708 5709 #ifdef HWY_NATIVE_MASKED_SQRT 5710 #undef HWY_NATIVE_MASKED_SQRT 5711 #else 5712 #define HWY_NATIVE_MASKED_SQRT 5713 #endif 5714 template <class V, HWY_IF_FLOAT_V(V), class M> 5715 HWY_API V MaskedSqrt(M m, V v) { 5716 return IfThenElseZero(m, Sqrt(v)); 5717 } 5718 5719 template <class V, HWY_IF_FLOAT_V(V), class M> 5720 HWY_API V MaskedSqrtOr(V no, M m, V v) { 5721 return IfThenElse(m, Sqrt(v), no); 5722 } 5723 #endif 5724 5725 // ------------------------------ SumOfMulQuadAccumulate 5726 5727 #if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \ 5728 defined(HWY_TARGET_TOGGLE)) 5729 5730 #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE 5731 #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE 5732 #else 5733 #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE 5734 #endif 5735 5736 template <class DI32, HWY_IF_I32_D(DI32)> 5737 HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32, 5738 VFromD<Repartition<int8_t, DI32>> a, 5739 VFromD<Repartition<int8_t, DI32>> b, 5740 VFromD<DI32> sum) { 5741 const Repartition<int16_t, decltype(di32)> di16; 5742 5743 const auto a0 = PromoteEvenTo(di16, a); 5744 const auto b0 = PromoteEvenTo(di16, b); 5745 5746 const auto a1 = PromoteOddTo(di16, a); 5747 const auto b1 = PromoteOddTo(di16, b); 5748 5749 return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0), 5750 WidenMulPairwiseAdd(di32, a1, b1))); 5751 } 5752 5753 #endif 5754 5755 #if 
(defined(HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE) == \ 5756 defined(HWY_TARGET_TOGGLE)) 5757 5758 #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE 5759 #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE 5760 #else 5761 #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE 5762 #endif 5763 5764 template <class DU32, HWY_IF_U32_D(DU32)> 5765 HWY_API VFromD<DU32> SumOfMulQuadAccumulate( 5766 DU32 du32, VFromD<Repartition<uint8_t, DU32>> a, 5767 VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) { 5768 const Repartition<uint16_t, decltype(du32)> du16; 5769 const RebindToSigned<decltype(du16)> di16; 5770 const RebindToSigned<decltype(du32)> di32; 5771 5772 const auto lo8_mask = Set(di16, int16_t{0x00FF}); 5773 const auto a0 = And(BitCast(di16, a), lo8_mask); 5774 const auto b0 = And(BitCast(di16, b), lo8_mask); 5775 5776 const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a))); 5777 const auto b1 = BitCast(di16, ShiftRight<8>(BitCast(du16, b))); 5778 5779 return Add(sum, Add(BitCast(du32, WidenMulPairwiseAdd(di32, a0, b0)), 5780 BitCast(du32, WidenMulPairwiseAdd(di32, a1, b1)))); 5781 } 5782 5783 #endif 5784 5785 #if (defined(HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE) == \ 5786 defined(HWY_TARGET_TOGGLE)) 5787 5788 #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE 5789 #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE 5790 #else 5791 #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE 5792 #endif 5793 5794 template <class DI32, HWY_IF_I32_D(DI32)> 5795 HWY_API VFromD<DI32> SumOfMulQuadAccumulate( 5796 DI32 di32, VFromD<Repartition<uint8_t, DI32>> a_u, 5797 VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) { 5798 const Repartition<int16_t, decltype(di32)> di16; 5799 const RebindToUnsigned<decltype(di16)> du16; 5800 5801 const auto a0 = And(BitCast(di16, a_u), Set(di16, int16_t{0x00FF})); 5802 const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b_i))); 5803 5804 const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a_u))); 5805 const auto b1 = ShiftRight<8>(BitCast(di16, b_i)); 5806 5807 // NOTE: SatWidenMulPairwiseAdd(di16, a_u, b_i) cannot be used in 5808 // SumOfMulQuadAccumulate as it is possible for 5809 // a_u[0]*b_i[0]+a_u[1]*b_i[1] to overflow an int16_t if a_u[0], b_i[0], 5810 // a_u[1], and b_i[1] are all non-zero and b_i[0] and b_i[1] have the same 5811 // sign. 5812 5813 return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0), 5814 WidenMulPairwiseAdd(di32, a1, b1))); 5815 } 5816 5817 #endif 5818 5819 #if (defined(HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE) == \ 5820 defined(HWY_TARGET_TOGGLE)) 5821 5822 #ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE 5823 #undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE 5824 #else 5825 #define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE 5826 #endif 5827 5828 #if HWY_HAVE_INTEGER64 5829 template <class DI64, HWY_IF_I64_D(DI64)> 5830 HWY_API VFromD<DI64> SumOfMulQuadAccumulate( 5831 DI64 di64, VFromD<Repartition<int16_t, DI64>> a, 5832 VFromD<Repartition<int16_t, DI64>> b, VFromD<DI64> sum) { 5833 const Repartition<int32_t, decltype(di64)> di32; 5834 5835 // WidenMulPairwiseAdd(di32, a, b) is okay here as 5836 // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as 5837 // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if 5838 // a[0], b[0], a[1], and b[1] are all equal to -32768. 
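// For example (illustrative), a[0] = b[0] = a[1] = b[1] = -32768 yields
// 2 * 32768 * 32768 = 2^31, which wraps to LimitsMin<int32_t>(); this single
// overflow case is detected and corrected below.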
5839 5840 const auto i32_pairwise_sum = WidenMulPairwiseAdd(di32, a, b); 5841 const auto i32_pairwise_sum_overflow = 5842 VecFromMask(di32, Eq(i32_pairwise_sum, Set(di32, LimitsMin<int32_t>()))); 5843 5844 // The upper 32 bits of sum0 and sum1 need to be zeroed out in the case of 5845 // overflow. 5846 const auto hi32_mask = Set(di64, static_cast<int64_t>(~int64_t{0xFFFFFFFF})); 5847 const auto p0_zero_out_mask = 5848 ShiftLeft<32>(BitCast(di64, i32_pairwise_sum_overflow)); 5849 const auto p1_zero_out_mask = 5850 And(BitCast(di64, i32_pairwise_sum_overflow), hi32_mask); 5851 5852 const auto p0 = 5853 AndNot(p0_zero_out_mask, 5854 ShiftRight<32>(ShiftLeft<32>(BitCast(di64, i32_pairwise_sum)))); 5855 const auto p1 = 5856 AndNot(p1_zero_out_mask, ShiftRight<32>(BitCast(di64, i32_pairwise_sum))); 5857 5858 return Add(sum, Add(p0, p1)); 5859 } 5860 #endif // HWY_HAVE_INTEGER64 5861 #endif // HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE 5862 5863 #if (defined(HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE) == \ 5864 defined(HWY_TARGET_TOGGLE)) 5865 5866 #ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE 5867 #undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE 5868 #else 5869 #define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE 5870 #endif 5871 5872 #if HWY_HAVE_INTEGER64 5873 template <class DU64, HWY_IF_U64_D(DU64)> 5874 HWY_API VFromD<DU64> SumOfMulQuadAccumulate( 5875 DU64 du64, VFromD<Repartition<uint16_t, DU64>> a, 5876 VFromD<Repartition<uint16_t, DU64>> b, VFromD<DU64> sum) { 5877 const auto u32_even_prod = MulEven(a, b); 5878 const auto u32_odd_prod = MulOdd(a, b); 5879 5880 const auto p0 = Add(PromoteEvenTo(du64, u32_even_prod), 5881 PromoteEvenTo(du64, u32_odd_prod)); 5882 const auto p1 = 5883 Add(PromoteOddTo(du64, u32_even_prod), PromoteOddTo(du64, u32_odd_prod)); 5884 5885 return Add(sum, Add(p0, p1)); 5886 } 5887 #endif // HWY_HAVE_INTEGER64 5888 #endif // HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE 5889 5890 // ------------------------------ F64 ApproximateReciprocal 5891 5892 #if (defined(HWY_NATIVE_F64_APPROX_RECIP) == defined(HWY_TARGET_TOGGLE)) 5893 #ifdef HWY_NATIVE_F64_APPROX_RECIP 5894 #undef HWY_NATIVE_F64_APPROX_RECIP 5895 #else 5896 #define HWY_NATIVE_F64_APPROX_RECIP 5897 #endif 5898 5899 #if HWY_HAVE_FLOAT64 5900 template <class V, HWY_IF_F64_D(DFromV<V>)> 5901 HWY_API V ApproximateReciprocal(V v) { 5902 const DFromV<decltype(v)> d; 5903 return Div(Set(d, 1.0), v); 5904 } 5905 #endif // HWY_HAVE_FLOAT64 5906 5907 #endif // HWY_NATIVE_F64_APPROX_RECIP 5908 5909 // ------------------------------ MaskedApproximateReciprocal 5910 template <class V, HWY_IF_FLOAT_V(V), class M> 5911 HWY_API V MaskedApproximateReciprocal(M m, V v) { 5912 return IfThenElseZero(m, ApproximateReciprocal(v)); 5913 } 5914 5915 // ------------------------------ F64 ApproximateReciprocalSqrt 5916 5917 #if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE)) 5918 #ifdef HWY_NATIVE_F64_APPROX_RSQRT 5919 #undef HWY_NATIVE_F64_APPROX_RSQRT 5920 #else 5921 #define HWY_NATIVE_F64_APPROX_RSQRT 5922 #endif 5923 5924 #if HWY_HAVE_FLOAT64 5925 template <class V, HWY_IF_F64_D(DFromV<V>)> 5926 HWY_API V ApproximateReciprocalSqrt(V v) { 5927 const DFromV<decltype(v)> d; 5928 const RebindToUnsigned<decltype(d)> du; 5929 const auto half = Mul(v, Set(d, 0.5)); 5930 // Initial guess based on log2(f) 5931 const auto guess = BitCast(d, Sub(Set(du, uint64_t{0x5FE6EB50C7B537A9u}), 5932 ShiftRight<1>(BitCast(du, v)))); 5933 // One Newton-Raphson iteration 5934 return Mul(guess, NegMulAdd(Mul(half, guess), guess, Set(d, 
1.5))); 5935 } 5936 #endif // HWY_HAVE_FLOAT64 5937 5938 #endif // HWY_NATIVE_F64_APPROX_RSQRT 5939 5940 // ------------------------------ MaskedApproximateReciprocalSqrt 5941 template <class V, HWY_IF_FLOAT_V(V), class M> 5942 HWY_API V MaskedApproximateReciprocalSqrt(M m, V v) { 5943 return IfThenElseZero(m, ApproximateReciprocalSqrt(v)); 5944 } 5945 5946 // ------------------------------ Compress* 5947 5948 #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE)) 5949 #ifdef HWY_NATIVE_COMPRESS8 5950 #undef HWY_NATIVE_COMPRESS8 5951 #else 5952 #define HWY_NATIVE_COMPRESS8 5953 #endif 5954 5955 template <class V, class D, typename T, HWY_IF_T_SIZE(T, 1)> 5956 HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d, 5957 T* unaligned) { 5958 HWY_ALIGN T lanes[MaxLanes(d)]; 5959 Store(v, d, lanes); 5960 5961 const Simd<T, HWY_MIN(MaxLanes(d), 8), 0> d8; 5962 T* HWY_RESTRICT pos = unaligned; 5963 5964 HWY_ALIGN constexpr T table[2048] = { 5965 0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 5966 1, 0, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 5967 2, 0, 1, 3, 4, 5, 6, 7, /**/ 0, 2, 1, 3, 4, 5, 6, 7, // 5968 1, 2, 0, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 5969 3, 0, 1, 2, 4, 5, 6, 7, /**/ 0, 3, 1, 2, 4, 5, 6, 7, // 5970 1, 3, 0, 2, 4, 5, 6, 7, /**/ 0, 1, 3, 2, 4, 5, 6, 7, // 5971 2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 2, 3, 1, 4, 5, 6, 7, // 5972 1, 2, 3, 0, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 5973 4, 0, 1, 2, 3, 5, 6, 7, /**/ 0, 4, 1, 2, 3, 5, 6, 7, // 5974 1, 4, 0, 2, 3, 5, 6, 7, /**/ 0, 1, 4, 2, 3, 5, 6, 7, // 5975 2, 4, 0, 1, 3, 5, 6, 7, /**/ 0, 2, 4, 1, 3, 5, 6, 7, // 5976 1, 2, 4, 0, 3, 5, 6, 7, /**/ 0, 1, 2, 4, 3, 5, 6, 7, // 5977 3, 4, 0, 1, 2, 5, 6, 7, /**/ 0, 3, 4, 1, 2, 5, 6, 7, // 5978 1, 3, 4, 0, 2, 5, 6, 7, /**/ 0, 1, 3, 4, 2, 5, 6, 7, // 5979 2, 3, 4, 0, 1, 5, 6, 7, /**/ 0, 2, 3, 4, 1, 5, 6, 7, // 5980 1, 2, 3, 4, 0, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 5981 5, 0, 1, 2, 3, 4, 6, 7, /**/ 0, 5, 1, 2, 3, 4, 6, 7, // 5982 1, 5, 0, 2, 3, 4, 6, 7, /**/ 0, 1, 5, 2, 3, 4, 6, 7, // 5983 2, 5, 0, 1, 3, 4, 6, 7, /**/ 0, 2, 5, 1, 3, 4, 6, 7, // 5984 1, 2, 5, 0, 3, 4, 6, 7, /**/ 0, 1, 2, 5, 3, 4, 6, 7, // 5985 3, 5, 0, 1, 2, 4, 6, 7, /**/ 0, 3, 5, 1, 2, 4, 6, 7, // 5986 1, 3, 5, 0, 2, 4, 6, 7, /**/ 0, 1, 3, 5, 2, 4, 6, 7, // 5987 2, 3, 5, 0, 1, 4, 6, 7, /**/ 0, 2, 3, 5, 1, 4, 6, 7, // 5988 1, 2, 3, 5, 0, 4, 6, 7, /**/ 0, 1, 2, 3, 5, 4, 6, 7, // 5989 4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 4, 5, 1, 2, 3, 6, 7, // 5990 1, 4, 5, 0, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7, // 5991 2, 4, 5, 0, 1, 3, 6, 7, /**/ 0, 2, 4, 5, 1, 3, 6, 7, // 5992 1, 2, 4, 5, 0, 3, 6, 7, /**/ 0, 1, 2, 4, 5, 3, 6, 7, // 5993 3, 4, 5, 0, 1, 2, 6, 7, /**/ 0, 3, 4, 5, 1, 2, 6, 7, // 5994 1, 3, 4, 5, 0, 2, 6, 7, /**/ 0, 1, 3, 4, 5, 2, 6, 7, // 5995 2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 2, 3, 4, 5, 1, 6, 7, // 5996 1, 2, 3, 4, 5, 0, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 5997 6, 0, 1, 2, 3, 4, 5, 7, /**/ 0, 6, 1, 2, 3, 4, 5, 7, // 5998 1, 6, 0, 2, 3, 4, 5, 7, /**/ 0, 1, 6, 2, 3, 4, 5, 7, // 5999 2, 6, 0, 1, 3, 4, 5, 7, /**/ 0, 2, 6, 1, 3, 4, 5, 7, // 6000 1, 2, 6, 0, 3, 4, 5, 7, /**/ 0, 1, 2, 6, 3, 4, 5, 7, // 6001 3, 6, 0, 1, 2, 4, 5, 7, /**/ 0, 3, 6, 1, 2, 4, 5, 7, // 6002 1, 3, 6, 0, 2, 4, 5, 7, /**/ 0, 1, 3, 6, 2, 4, 5, 7, // 6003 2, 3, 6, 0, 1, 4, 5, 7, /**/ 0, 2, 3, 6, 1, 4, 5, 7, // 6004 1, 2, 3, 6, 0, 4, 5, 7, /**/ 0, 1, 2, 3, 6, 4, 5, 7, // 6005 4, 6, 0, 1, 2, 3, 5, 7, /**/ 0, 4, 6, 1, 2, 3, 5, 7, // 6006 1, 4, 6, 0, 2, 3, 5, 7, /**/ 0, 1, 4, 6, 2, 3, 5, 7, // 6007 2, 4, 
6, 0, 1, 3, 5, 7, /**/ 0, 2, 4, 6, 1, 3, 5, 7, // 6008 1, 2, 4, 6, 0, 3, 5, 7, /**/ 0, 1, 2, 4, 6, 3, 5, 7, // 6009 3, 4, 6, 0, 1, 2, 5, 7, /**/ 0, 3, 4, 6, 1, 2, 5, 7, // 6010 1, 3, 4, 6, 0, 2, 5, 7, /**/ 0, 1, 3, 4, 6, 2, 5, 7, // 6011 2, 3, 4, 6, 0, 1, 5, 7, /**/ 0, 2, 3, 4, 6, 1, 5, 7, // 6012 1, 2, 3, 4, 6, 0, 5, 7, /**/ 0, 1, 2, 3, 4, 6, 5, 7, // 6013 5, 6, 0, 1, 2, 3, 4, 7, /**/ 0, 5, 6, 1, 2, 3, 4, 7, // 6014 1, 5, 6, 0, 2, 3, 4, 7, /**/ 0, 1, 5, 6, 2, 3, 4, 7, // 6015 2, 5, 6, 0, 1, 3, 4, 7, /**/ 0, 2, 5, 6, 1, 3, 4, 7, // 6016 1, 2, 5, 6, 0, 3, 4, 7, /**/ 0, 1, 2, 5, 6, 3, 4, 7, // 6017 3, 5, 6, 0, 1, 2, 4, 7, /**/ 0, 3, 5, 6, 1, 2, 4, 7, // 6018 1, 3, 5, 6, 0, 2, 4, 7, /**/ 0, 1, 3, 5, 6, 2, 4, 7, // 6019 2, 3, 5, 6, 0, 1, 4, 7, /**/ 0, 2, 3, 5, 6, 1, 4, 7, // 6020 1, 2, 3, 5, 6, 0, 4, 7, /**/ 0, 1, 2, 3, 5, 6, 4, 7, // 6021 4, 5, 6, 0, 1, 2, 3, 7, /**/ 0, 4, 5, 6, 1, 2, 3, 7, // 6022 1, 4, 5, 6, 0, 2, 3, 7, /**/ 0, 1, 4, 5, 6, 2, 3, 7, // 6023 2, 4, 5, 6, 0, 1, 3, 7, /**/ 0, 2, 4, 5, 6, 1, 3, 7, // 6024 1, 2, 4, 5, 6, 0, 3, 7, /**/ 0, 1, 2, 4, 5, 6, 3, 7, // 6025 3, 4, 5, 6, 0, 1, 2, 7, /**/ 0, 3, 4, 5, 6, 1, 2, 7, // 6026 1, 3, 4, 5, 6, 0, 2, 7, /**/ 0, 1, 3, 4, 5, 6, 2, 7, // 6027 2, 3, 4, 5, 6, 0, 1, 7, /**/ 0, 2, 3, 4, 5, 6, 1, 7, // 6028 1, 2, 3, 4, 5, 6, 0, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 6029 7, 0, 1, 2, 3, 4, 5, 6, /**/ 0, 7, 1, 2, 3, 4, 5, 6, // 6030 1, 7, 0, 2, 3, 4, 5, 6, /**/ 0, 1, 7, 2, 3, 4, 5, 6, // 6031 2, 7, 0, 1, 3, 4, 5, 6, /**/ 0, 2, 7, 1, 3, 4, 5, 6, // 6032 1, 2, 7, 0, 3, 4, 5, 6, /**/ 0, 1, 2, 7, 3, 4, 5, 6, // 6033 3, 7, 0, 1, 2, 4, 5, 6, /**/ 0, 3, 7, 1, 2, 4, 5, 6, // 6034 1, 3, 7, 0, 2, 4, 5, 6, /**/ 0, 1, 3, 7, 2, 4, 5, 6, // 6035 2, 3, 7, 0, 1, 4, 5, 6, /**/ 0, 2, 3, 7, 1, 4, 5, 6, // 6036 1, 2, 3, 7, 0, 4, 5, 6, /**/ 0, 1, 2, 3, 7, 4, 5, 6, // 6037 4, 7, 0, 1, 2, 3, 5, 6, /**/ 0, 4, 7, 1, 2, 3, 5, 6, // 6038 1, 4, 7, 0, 2, 3, 5, 6, /**/ 0, 1, 4, 7, 2, 3, 5, 6, // 6039 2, 4, 7, 0, 1, 3, 5, 6, /**/ 0, 2, 4, 7, 1, 3, 5, 6, // 6040 1, 2, 4, 7, 0, 3, 5, 6, /**/ 0, 1, 2, 4, 7, 3, 5, 6, // 6041 3, 4, 7, 0, 1, 2, 5, 6, /**/ 0, 3, 4, 7, 1, 2, 5, 6, // 6042 1, 3, 4, 7, 0, 2, 5, 6, /**/ 0, 1, 3, 4, 7, 2, 5, 6, // 6043 2, 3, 4, 7, 0, 1, 5, 6, /**/ 0, 2, 3, 4, 7, 1, 5, 6, // 6044 1, 2, 3, 4, 7, 0, 5, 6, /**/ 0, 1, 2, 3, 4, 7, 5, 6, // 6045 5, 7, 0, 1, 2, 3, 4, 6, /**/ 0, 5, 7, 1, 2, 3, 4, 6, // 6046 1, 5, 7, 0, 2, 3, 4, 6, /**/ 0, 1, 5, 7, 2, 3, 4, 6, // 6047 2, 5, 7, 0, 1, 3, 4, 6, /**/ 0, 2, 5, 7, 1, 3, 4, 6, // 6048 1, 2, 5, 7, 0, 3, 4, 6, /**/ 0, 1, 2, 5, 7, 3, 4, 6, // 6049 3, 5, 7, 0, 1, 2, 4, 6, /**/ 0, 3, 5, 7, 1, 2, 4, 6, // 6050 1, 3, 5, 7, 0, 2, 4, 6, /**/ 0, 1, 3, 5, 7, 2, 4, 6, // 6051 2, 3, 5, 7, 0, 1, 4, 6, /**/ 0, 2, 3, 5, 7, 1, 4, 6, // 6052 1, 2, 3, 5, 7, 0, 4, 6, /**/ 0, 1, 2, 3, 5, 7, 4, 6, // 6053 4, 5, 7, 0, 1, 2, 3, 6, /**/ 0, 4, 5, 7, 1, 2, 3, 6, // 6054 1, 4, 5, 7, 0, 2, 3, 6, /**/ 0, 1, 4, 5, 7, 2, 3, 6, // 6055 2, 4, 5, 7, 0, 1, 3, 6, /**/ 0, 2, 4, 5, 7, 1, 3, 6, // 6056 1, 2, 4, 5, 7, 0, 3, 6, /**/ 0, 1, 2, 4, 5, 7, 3, 6, // 6057 3, 4, 5, 7, 0, 1, 2, 6, /**/ 0, 3, 4, 5, 7, 1, 2, 6, // 6058 1, 3, 4, 5, 7, 0, 2, 6, /**/ 0, 1, 3, 4, 5, 7, 2, 6, // 6059 2, 3, 4, 5, 7, 0, 1, 6, /**/ 0, 2, 3, 4, 5, 7, 1, 6, // 6060 1, 2, 3, 4, 5, 7, 0, 6, /**/ 0, 1, 2, 3, 4, 5, 7, 6, // 6061 6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 6, 7, 1, 2, 3, 4, 5, // 6062 1, 6, 7, 0, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5, // 6063 2, 6, 7, 0, 1, 3, 4, 5, /**/ 0, 2, 6, 7, 1, 3, 4, 5, // 6064 1, 2, 6, 7, 0, 3, 4, 5, /**/ 0, 1, 2, 6, 7, 3, 4, 5, // 6065 3, 6, 7, 0, 1, 2, 4, 
5, /**/ 0, 3, 6, 7, 1, 2, 4, 5, // 6066 1, 3, 6, 7, 0, 2, 4, 5, /**/ 0, 1, 3, 6, 7, 2, 4, 5, // 6067 2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 2, 3, 6, 7, 1, 4, 5, // 6068 1, 2, 3, 6, 7, 0, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5, // 6069 4, 6, 7, 0, 1, 2, 3, 5, /**/ 0, 4, 6, 7, 1, 2, 3, 5, // 6070 1, 4, 6, 7, 0, 2, 3, 5, /**/ 0, 1, 4, 6, 7, 2, 3, 5, // 6071 2, 4, 6, 7, 0, 1, 3, 5, /**/ 0, 2, 4, 6, 7, 1, 3, 5, // 6072 1, 2, 4, 6, 7, 0, 3, 5, /**/ 0, 1, 2, 4, 6, 7, 3, 5, // 6073 3, 4, 6, 7, 0, 1, 2, 5, /**/ 0, 3, 4, 6, 7, 1, 2, 5, // 6074 1, 3, 4, 6, 7, 0, 2, 5, /**/ 0, 1, 3, 4, 6, 7, 2, 5, // 6075 2, 3, 4, 6, 7, 0, 1, 5, /**/ 0, 2, 3, 4, 6, 7, 1, 5, // 6076 1, 2, 3, 4, 6, 7, 0, 5, /**/ 0, 1, 2, 3, 4, 6, 7, 5, // 6077 5, 6, 7, 0, 1, 2, 3, 4, /**/ 0, 5, 6, 7, 1, 2, 3, 4, // 6078 1, 5, 6, 7, 0, 2, 3, 4, /**/ 0, 1, 5, 6, 7, 2, 3, 4, // 6079 2, 5, 6, 7, 0, 1, 3, 4, /**/ 0, 2, 5, 6, 7, 1, 3, 4, // 6080 1, 2, 5, 6, 7, 0, 3, 4, /**/ 0, 1, 2, 5, 6, 7, 3, 4, // 6081 3, 5, 6, 7, 0, 1, 2, 4, /**/ 0, 3, 5, 6, 7, 1, 2, 4, // 6082 1, 3, 5, 6, 7, 0, 2, 4, /**/ 0, 1, 3, 5, 6, 7, 2, 4, // 6083 2, 3, 5, 6, 7, 0, 1, 4, /**/ 0, 2, 3, 5, 6, 7, 1, 4, // 6084 1, 2, 3, 5, 6, 7, 0, 4, /**/ 0, 1, 2, 3, 5, 6, 7, 4, // 6085 4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 4, 5, 6, 7, 1, 2, 3, // 6086 1, 4, 5, 6, 7, 0, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3, // 6087 2, 4, 5, 6, 7, 0, 1, 3, /**/ 0, 2, 4, 5, 6, 7, 1, 3, // 6088 1, 2, 4, 5, 6, 7, 0, 3, /**/ 0, 1, 2, 4, 5, 6, 7, 3, // 6089 3, 4, 5, 6, 7, 0, 1, 2, /**/ 0, 3, 4, 5, 6, 7, 1, 2, // 6090 1, 3, 4, 5, 6, 7, 0, 2, /**/ 0, 1, 3, 4, 5, 6, 7, 2, // 6091 2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 2, 3, 4, 5, 6, 7, 1, // 6092 1, 2, 3, 4, 5, 6, 7, 0, /**/ 0, 1, 2, 3, 4, 5, 6, 7}; 6093 6094 for (size_t i = 0; i < Lanes(d); i += 8) { 6095 // Each byte worth of bits is the index of one of 256 8-byte ranges, and its 6096 // population count determines how far to advance the write position. 6097 const size_t bits8 = bits[i / 8]; 6098 const auto indices = Load(d8, table + bits8 * 8); 6099 const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices); 6100 StoreU(compressed, d8, pos); 6101 pos += PopCount(bits8); 6102 } 6103 return static_cast<size_t>(pos - unaligned); 6104 } 6105 6106 template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)> 6107 HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) { 6108 uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)]; 6109 (void)StoreMaskBits(d, mask, bits); 6110 return CompressBitsStore(v, bits, d, unaligned); 6111 } 6112 6113 template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)> 6114 HWY_API size_t CompressBlendedStore(V v, M mask, D d, 6115 T* HWY_RESTRICT unaligned) { 6116 HWY_ALIGN T buf[MaxLanes(d)]; 6117 const size_t bytes = CompressStore(v, mask, d, buf); 6118 BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned); 6119 return bytes; 6120 } 6121 6122 // For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE. 
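// (The byte-size overloads below therefore constrain via HWY_IF_T_SIZE on a
// defaulted typename T = TFromV<V> instead.)
//
// Illustrative sketch of the store variants above (not library code; `in`,
// `bits` and `out` are hypothetical caller-provided buffers):
//   const ScalableTag<uint8_t> d;
//   const auto v = LoadU(d, in);
//   // bits[i / 8] holds the mask bit for lane i, LSB first, in the same
//   // format that StoreMaskBits writes.
//   const size_t n = CompressBitsStore(v, bits, d, out);
//   // out[0, n) now holds the lanes whose bits were set, in lane order.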
6123 template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)> 6124 HWY_API V Compress(V v, const M mask) { 6125 const DFromV<V> d; 6126 HWY_ALIGN T lanes[MaxLanes(d)]; 6127 (void)CompressStore(v, mask, d, lanes); 6128 return Load(d, lanes); 6129 } 6130 6131 template <class V, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)> 6132 HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { 6133 const DFromV<V> d; 6134 HWY_ALIGN T lanes[MaxLanes(d)]; 6135 (void)CompressBitsStore(v, bits, d, lanes); 6136 return Load(d, lanes); 6137 } 6138 6139 template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)> 6140 HWY_API V CompressNot(V v, M mask) { 6141 return Compress(v, Not(mask)); 6142 } 6143 6144 #endif // HWY_NATIVE_COMPRESS8 6145 6146 // ------------------------------ Expand 6147 6148 // Note that this generic implementation assumes <= 128 bit fixed vectors; 6149 // the SVE and RVV targets provide their own native implementations. 6150 #if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE 6151 #ifdef HWY_NATIVE_EXPAND 6152 #undef HWY_NATIVE_EXPAND 6153 #else 6154 #define HWY_NATIVE_EXPAND 6155 #endif 6156 6157 namespace detail { 6158 6159 template <size_t N> 6160 HWY_INLINE Vec128<uint8_t, N> IndicesForExpandFromBits(uint64_t mask_bits) { 6161 static_assert(N <= 8, "Should only be called for half-vectors"); 6162 const Simd<uint8_t, N, 0> du8; 6163 HWY_DASSERT(mask_bits < 0x100); 6164 alignas(16) static constexpr uint8_t table[2048] = { 6165 // PrintExpand8x8Tables 6166 128, 128, 128, 128, 128, 128, 128, 128, // 6167 0, 128, 128, 128, 128, 128, 128, 128, // 6168 128, 0, 128, 128, 128, 128, 128, 128, // 6169 0, 1, 128, 128, 128, 128, 128, 128, // 6170 128, 128, 0, 128, 128, 128, 128, 128, // 6171 0, 128, 1, 128, 128, 128, 128, 128, // 6172 128, 0, 1, 128, 128, 128, 128, 128, // 6173 0, 1, 2, 128, 128, 128, 128, 128, // 6174 128, 128, 128, 0, 128, 128, 128, 128, // 6175 0, 128, 128, 1, 128, 128, 128, 128, // 6176 128, 0, 128, 1, 128, 128, 128, 128, // 6177 0, 1, 128, 2, 128, 128, 128, 128, // 6178 128, 128, 0, 1, 128, 128, 128, 128, // 6179 0, 128, 1, 2, 128, 128, 128, 128, // 6180 128, 0, 1, 2, 128, 128, 128, 128, // 6181 0, 1, 2, 3, 128, 128, 128, 128, // 6182 128, 128, 128, 128, 0, 128, 128, 128, // 6183 0, 128, 128, 128, 1, 128, 128, 128, // 6184 128, 0, 128, 128, 1, 128, 128, 128, // 6185 0, 1, 128, 128, 2, 128, 128, 128, // 6186 128, 128, 0, 128, 1, 128, 128, 128, // 6187 0, 128, 1, 128, 2, 128, 128, 128, // 6188 128, 0, 1, 128, 2, 128, 128, 128, // 6189 0, 1, 2, 128, 3, 128, 128, 128, // 6190 128, 128, 128, 0, 1, 128, 128, 128, // 6191 0, 128, 128, 1, 2, 128, 128, 128, // 6192 128, 0, 128, 1, 2, 128, 128, 128, // 6193 0, 1, 128, 2, 3, 128, 128, 128, // 6194 128, 128, 0, 1, 2, 128, 128, 128, // 6195 0, 128, 1, 2, 3, 128, 128, 128, // 6196 128, 0, 1, 2, 3, 128, 128, 128, // 6197 0, 1, 2, 3, 4, 128, 128, 128, // 6198 128, 128, 128, 128, 128, 0, 128, 128, // 6199 0, 128, 128, 128, 128, 1, 128, 128, // 6200 128, 0, 128, 128, 128, 1, 128, 128, // 6201 0, 1, 128, 128, 128, 2, 128, 128, // 6202 128, 128, 0, 128, 128, 1, 128, 128, // 6203 0, 128, 1, 128, 128, 2, 128, 128, // 6204 128, 0, 1, 128, 128, 2, 128, 128, // 6205 0, 1, 2, 128, 128, 3, 128, 128, // 6206 128, 128, 128, 0, 128, 1, 128, 128, // 6207 0, 128, 128, 1, 128, 2, 128, 128, // 6208 128, 0, 128, 1, 128, 2, 128, 128, // 6209 0, 1, 128, 2, 128, 3, 128, 128, // 6210 128, 128, 0, 1, 128, 2, 128, 128, // 6211 0, 128, 1, 2, 128, 3, 128, 128, // 6212 128, 0, 1, 2, 128, 3, 128, 128, // 6213 0, 
1, 2, 3, 128, 4, 128, 128, // 6214 128, 128, 128, 128, 0, 1, 128, 128, // 6215 0, 128, 128, 128, 1, 2, 128, 128, // 6216 128, 0, 128, 128, 1, 2, 128, 128, // 6217 0, 1, 128, 128, 2, 3, 128, 128, // 6218 128, 128, 0, 128, 1, 2, 128, 128, // 6219 0, 128, 1, 128, 2, 3, 128, 128, // 6220 128, 0, 1, 128, 2, 3, 128, 128, // 6221 0, 1, 2, 128, 3, 4, 128, 128, // 6222 128, 128, 128, 0, 1, 2, 128, 128, // 6223 0, 128, 128, 1, 2, 3, 128, 128, // 6224 128, 0, 128, 1, 2, 3, 128, 128, // 6225 0, 1, 128, 2, 3, 4, 128, 128, // 6226 128, 128, 0, 1, 2, 3, 128, 128, // 6227 0, 128, 1, 2, 3, 4, 128, 128, // 6228 128, 0, 1, 2, 3, 4, 128, 128, // 6229 0, 1, 2, 3, 4, 5, 128, 128, // 6230 128, 128, 128, 128, 128, 128, 0, 128, // 6231 0, 128, 128, 128, 128, 128, 1, 128, // 6232 128, 0, 128, 128, 128, 128, 1, 128, // 6233 0, 1, 128, 128, 128, 128, 2, 128, // 6234 128, 128, 0, 128, 128, 128, 1, 128, // 6235 0, 128, 1, 128, 128, 128, 2, 128, // 6236 128, 0, 1, 128, 128, 128, 2, 128, // 6237 0, 1, 2, 128, 128, 128, 3, 128, // 6238 128, 128, 128, 0, 128, 128, 1, 128, // 6239 0, 128, 128, 1, 128, 128, 2, 128, // 6240 128, 0, 128, 1, 128, 128, 2, 128, // 6241 0, 1, 128, 2, 128, 128, 3, 128, // 6242 128, 128, 0, 1, 128, 128, 2, 128, // 6243 0, 128, 1, 2, 128, 128, 3, 128, // 6244 128, 0, 1, 2, 128, 128, 3, 128, // 6245 0, 1, 2, 3, 128, 128, 4, 128, // 6246 128, 128, 128, 128, 0, 128, 1, 128, // 6247 0, 128, 128, 128, 1, 128, 2, 128, // 6248 128, 0, 128, 128, 1, 128, 2, 128, // 6249 0, 1, 128, 128, 2, 128, 3, 128, // 6250 128, 128, 0, 128, 1, 128, 2, 128, // 6251 0, 128, 1, 128, 2, 128, 3, 128, // 6252 128, 0, 1, 128, 2, 128, 3, 128, // 6253 0, 1, 2, 128, 3, 128, 4, 128, // 6254 128, 128, 128, 0, 1, 128, 2, 128, // 6255 0, 128, 128, 1, 2, 128, 3, 128, // 6256 128, 0, 128, 1, 2, 128, 3, 128, // 6257 0, 1, 128, 2, 3, 128, 4, 128, // 6258 128, 128, 0, 1, 2, 128, 3, 128, // 6259 0, 128, 1, 2, 3, 128, 4, 128, // 6260 128, 0, 1, 2, 3, 128, 4, 128, // 6261 0, 1, 2, 3, 4, 128, 5, 128, // 6262 128, 128, 128, 128, 128, 0, 1, 128, // 6263 0, 128, 128, 128, 128, 1, 2, 128, // 6264 128, 0, 128, 128, 128, 1, 2, 128, // 6265 0, 1, 128, 128, 128, 2, 3, 128, // 6266 128, 128, 0, 128, 128, 1, 2, 128, // 6267 0, 128, 1, 128, 128, 2, 3, 128, // 6268 128, 0, 1, 128, 128, 2, 3, 128, // 6269 0, 1, 2, 128, 128, 3, 4, 128, // 6270 128, 128, 128, 0, 128, 1, 2, 128, // 6271 0, 128, 128, 1, 128, 2, 3, 128, // 6272 128, 0, 128, 1, 128, 2, 3, 128, // 6273 0, 1, 128, 2, 128, 3, 4, 128, // 6274 128, 128, 0, 1, 128, 2, 3, 128, // 6275 0, 128, 1, 2, 128, 3, 4, 128, // 6276 128, 0, 1, 2, 128, 3, 4, 128, // 6277 0, 1, 2, 3, 128, 4, 5, 128, // 6278 128, 128, 128, 128, 0, 1, 2, 128, // 6279 0, 128, 128, 128, 1, 2, 3, 128, // 6280 128, 0, 128, 128, 1, 2, 3, 128, // 6281 0, 1, 128, 128, 2, 3, 4, 128, // 6282 128, 128, 0, 128, 1, 2, 3, 128, // 6283 0, 128, 1, 128, 2, 3, 4, 128, // 6284 128, 0, 1, 128, 2, 3, 4, 128, // 6285 0, 1, 2, 128, 3, 4, 5, 128, // 6286 128, 128, 128, 0, 1, 2, 3, 128, // 6287 0, 128, 128, 1, 2, 3, 4, 128, // 6288 128, 0, 128, 1, 2, 3, 4, 128, // 6289 0, 1, 128, 2, 3, 4, 5, 128, // 6290 128, 128, 0, 1, 2, 3, 4, 128, // 6291 0, 128, 1, 2, 3, 4, 5, 128, // 6292 128, 0, 1, 2, 3, 4, 5, 128, // 6293 0, 1, 2, 3, 4, 5, 6, 128, // 6294 128, 128, 128, 128, 128, 128, 128, 0, // 6295 0, 128, 128, 128, 128, 128, 128, 1, // 6296 128, 0, 128, 128, 128, 128, 128, 1, // 6297 0, 1, 128, 128, 128, 128, 128, 2, // 6298 128, 128, 0, 128, 128, 128, 128, 1, // 6299 0, 128, 1, 128, 128, 128, 128, 2, // 6300 128, 0, 1, 128, 128, 128, 128, 2, // 6301 0, 1, 2, 128, 
128, 128, 128, 3, // 6302 128, 128, 128, 0, 128, 128, 128, 1, // 6303 0, 128, 128, 1, 128, 128, 128, 2, // 6304 128, 0, 128, 1, 128, 128, 128, 2, // 6305 0, 1, 128, 2, 128, 128, 128, 3, // 6306 128, 128, 0, 1, 128, 128, 128, 2, // 6307 0, 128, 1, 2, 128, 128, 128, 3, // 6308 128, 0, 1, 2, 128, 128, 128, 3, // 6309 0, 1, 2, 3, 128, 128, 128, 4, // 6310 128, 128, 128, 128, 0, 128, 128, 1, // 6311 0, 128, 128, 128, 1, 128, 128, 2, // 6312 128, 0, 128, 128, 1, 128, 128, 2, // 6313 0, 1, 128, 128, 2, 128, 128, 3, // 6314 128, 128, 0, 128, 1, 128, 128, 2, // 6315 0, 128, 1, 128, 2, 128, 128, 3, // 6316 128, 0, 1, 128, 2, 128, 128, 3, // 6317 0, 1, 2, 128, 3, 128, 128, 4, // 6318 128, 128, 128, 0, 1, 128, 128, 2, // 6319 0, 128, 128, 1, 2, 128, 128, 3, // 6320 128, 0, 128, 1, 2, 128, 128, 3, // 6321 0, 1, 128, 2, 3, 128, 128, 4, // 6322 128, 128, 0, 1, 2, 128, 128, 3, // 6323 0, 128, 1, 2, 3, 128, 128, 4, // 6324 128, 0, 1, 2, 3, 128, 128, 4, // 6325 0, 1, 2, 3, 4, 128, 128, 5, // 6326 128, 128, 128, 128, 128, 0, 128, 1, // 6327 0, 128, 128, 128, 128, 1, 128, 2, // 6328 128, 0, 128, 128, 128, 1, 128, 2, // 6329 0, 1, 128, 128, 128, 2, 128, 3, // 6330 128, 128, 0, 128, 128, 1, 128, 2, // 6331 0, 128, 1, 128, 128, 2, 128, 3, // 6332 128, 0, 1, 128, 128, 2, 128, 3, // 6333 0, 1, 2, 128, 128, 3, 128, 4, // 6334 128, 128, 128, 0, 128, 1, 128, 2, // 6335 0, 128, 128, 1, 128, 2, 128, 3, // 6336 128, 0, 128, 1, 128, 2, 128, 3, // 6337 0, 1, 128, 2, 128, 3, 128, 4, // 6338 128, 128, 0, 1, 128, 2, 128, 3, // 6339 0, 128, 1, 2, 128, 3, 128, 4, // 6340 128, 0, 1, 2, 128, 3, 128, 4, // 6341 0, 1, 2, 3, 128, 4, 128, 5, // 6342 128, 128, 128, 128, 0, 1, 128, 2, // 6343 0, 128, 128, 128, 1, 2, 128, 3, // 6344 128, 0, 128, 128, 1, 2, 128, 3, // 6345 0, 1, 128, 128, 2, 3, 128, 4, // 6346 128, 128, 0, 128, 1, 2, 128, 3, // 6347 0, 128, 1, 128, 2, 3, 128, 4, // 6348 128, 0, 1, 128, 2, 3, 128, 4, // 6349 0, 1, 2, 128, 3, 4, 128, 5, // 6350 128, 128, 128, 0, 1, 2, 128, 3, // 6351 0, 128, 128, 1, 2, 3, 128, 4, // 6352 128, 0, 128, 1, 2, 3, 128, 4, // 6353 0, 1, 128, 2, 3, 4, 128, 5, // 6354 128, 128, 0, 1, 2, 3, 128, 4, // 6355 0, 128, 1, 2, 3, 4, 128, 5, // 6356 128, 0, 1, 2, 3, 4, 128, 5, // 6357 0, 1, 2, 3, 4, 5, 128, 6, // 6358 128, 128, 128, 128, 128, 128, 0, 1, // 6359 0, 128, 128, 128, 128, 128, 1, 2, // 6360 128, 0, 128, 128, 128, 128, 1, 2, // 6361 0, 1, 128, 128, 128, 128, 2, 3, // 6362 128, 128, 0, 128, 128, 128, 1, 2, // 6363 0, 128, 1, 128, 128, 128, 2, 3, // 6364 128, 0, 1, 128, 128, 128, 2, 3, // 6365 0, 1, 2, 128, 128, 128, 3, 4, // 6366 128, 128, 128, 0, 128, 128, 1, 2, // 6367 0, 128, 128, 1, 128, 128, 2, 3, // 6368 128, 0, 128, 1, 128, 128, 2, 3, // 6369 0, 1, 128, 2, 128, 128, 3, 4, // 6370 128, 128, 0, 1, 128, 128, 2, 3, // 6371 0, 128, 1, 2, 128, 128, 3, 4, // 6372 128, 0, 1, 2, 128, 128, 3, 4, // 6373 0, 1, 2, 3, 128, 128, 4, 5, // 6374 128, 128, 128, 128, 0, 128, 1, 2, // 6375 0, 128, 128, 128, 1, 128, 2, 3, // 6376 128, 0, 128, 128, 1, 128, 2, 3, // 6377 0, 1, 128, 128, 2, 128, 3, 4, // 6378 128, 128, 0, 128, 1, 128, 2, 3, // 6379 0, 128, 1, 128, 2, 128, 3, 4, // 6380 128, 0, 1, 128, 2, 128, 3, 4, // 6381 0, 1, 2, 128, 3, 128, 4, 5, // 6382 128, 128, 128, 0, 1, 128, 2, 3, // 6383 0, 128, 128, 1, 2, 128, 3, 4, // 6384 128, 0, 128, 1, 2, 128, 3, 4, // 6385 0, 1, 128, 2, 3, 128, 4, 5, // 6386 128, 128, 0, 1, 2, 128, 3, 4, // 6387 0, 128, 1, 2, 3, 128, 4, 5, // 6388 128, 0, 1, 2, 3, 128, 4, 5, // 6389 0, 1, 2, 3, 4, 128, 5, 6, // 6390 128, 128, 128, 128, 128, 0, 1, 2, // 6391 0, 128, 128, 128, 128, 
1, 2, 3, // 6392 128, 0, 128, 128, 128, 1, 2, 3, // 6393 0, 1, 128, 128, 128, 2, 3, 4, // 6394 128, 128, 0, 128, 128, 1, 2, 3, // 6395 0, 128, 1, 128, 128, 2, 3, 4, // 6396 128, 0, 1, 128, 128, 2, 3, 4, // 6397 0, 1, 2, 128, 128, 3, 4, 5, // 6398 128, 128, 128, 0, 128, 1, 2, 3, // 6399 0, 128, 128, 1, 128, 2, 3, 4, // 6400 128, 0, 128, 1, 128, 2, 3, 4, // 6401 0, 1, 128, 2, 128, 3, 4, 5, // 6402 128, 128, 0, 1, 128, 2, 3, 4, // 6403 0, 128, 1, 2, 128, 3, 4, 5, // 6404 128, 0, 1, 2, 128, 3, 4, 5, // 6405 0, 1, 2, 3, 128, 4, 5, 6, // 6406 128, 128, 128, 128, 0, 1, 2, 3, // 6407 0, 128, 128, 128, 1, 2, 3, 4, // 6408 128, 0, 128, 128, 1, 2, 3, 4, // 6409 0, 1, 128, 128, 2, 3, 4, 5, // 6410 128, 128, 0, 128, 1, 2, 3, 4, // 6411 0, 128, 1, 128, 2, 3, 4, 5, // 6412 128, 0, 1, 128, 2, 3, 4, 5, // 6413 0, 1, 2, 128, 3, 4, 5, 6, // 6414 128, 128, 128, 0, 1, 2, 3, 4, // 6415 0, 128, 128, 1, 2, 3, 4, 5, // 6416 128, 0, 128, 1, 2, 3, 4, 5, // 6417 0, 1, 128, 2, 3, 4, 5, 6, // 6418 128, 128, 0, 1, 2, 3, 4, 5, // 6419 0, 128, 1, 2, 3, 4, 5, 6, // 6420 128, 0, 1, 2, 3, 4, 5, 6, // 6421 0, 1, 2, 3, 4, 5, 6, 7}; 6422 return LoadU(du8, table + mask_bits * 8); 6423 } 6424 6425 } // namespace detail 6426 6427 // Half vector of bytes: one table lookup 6428 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1), HWY_IF_V_SIZE_LE(T, N, 8)> 6429 HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { 6430 const DFromV<decltype(v)> d; 6431 6432 const uint64_t mask_bits = BitsFromMask(d, mask); 6433 const Vec128<uint8_t, N> indices = 6434 detail::IndicesForExpandFromBits<N>(mask_bits); 6435 return BitCast(d, TableLookupBytesOr0(v, indices)); 6436 } 6437 6438 // Full vector of bytes: two table lookups 6439 template <typename T, HWY_IF_T_SIZE(T, 1)> 6440 HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) { 6441 const Full128<T> d; 6442 const RebindToUnsigned<decltype(d)> du; 6443 const Half<decltype(du)> duh; 6444 const Vec128<uint8_t> vu = BitCast(du, v); 6445 6446 const uint64_t mask_bits = BitsFromMask(d, mask); 6447 const uint64_t maskL = mask_bits & 0xFF; 6448 const uint64_t maskH = mask_bits >> 8; 6449 6450 // We want to skip past the v bytes already consumed by idxL. There is no 6451 // instruction for shift-reg by variable bytes. Storing v itself would work 6452 // but would involve a store-load forwarding stall. We instead shuffle using 6453 // loaded indices. 6454 // TODO: MultiRotateRight would also help, but if we have that, we probably 6455 // also have native 8-bit Expand? 
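// Worked example (illustrative): if maskL = 0x02, the lower-half expansion
// consumes PopCount(maskL) = 1 source byte, so `shift` below is loaded from
// iota + 1 and TableLookupBytesOr0 moves every byte of v down by one
// position; vH thus begins at the first byte not consumed by idxL.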
6456 alignas(16) static constexpr uint8_t iota[32] = { 6457 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 6458 11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128, 6459 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; 6460 const VFromD<decltype(du)> shift = LoadU(du, iota + PopCount(maskL)); 6461 const VFromD<decltype(duh)> vL = LowerHalf(duh, vu); 6462 const VFromD<decltype(duh)> vH = 6463 LowerHalf(duh, TableLookupBytesOr0(vu, shift)); 6464 6465 const VFromD<decltype(duh)> idxL = detail::IndicesForExpandFromBits<8>(maskL); 6466 const VFromD<decltype(duh)> idxH = detail::IndicesForExpandFromBits<8>(maskH); 6467 6468 const VFromD<decltype(duh)> expandL = TableLookupBytesOr0(vL, idxL); 6469 const VFromD<decltype(duh)> expandH = TableLookupBytesOr0(vH, idxH); 6470 return BitCast(d, Combine(du, expandH, expandL)); 6471 } 6472 6473 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 6474 HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { 6475 const DFromV<decltype(v)> d; 6476 const RebindToUnsigned<decltype(d)> du; 6477 6478 const Rebind<uint8_t, decltype(d)> du8; 6479 const uint64_t mask_bits = BitsFromMask(d, mask); 6480 6481 // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply 6482 // the nibble trick used below because not all indices fit within one lane. 6483 alignas(16) static constexpr uint8_t table[2048] = { 6484 // PrintExpand16x8ByteTables 6485 128, 128, 128, 128, 128, 128, 128, 128, // 6486 0, 128, 128, 128, 128, 128, 128, 128, // 6487 128, 0, 128, 128, 128, 128, 128, 128, // 6488 0, 2, 128, 128, 128, 128, 128, 128, // 6489 128, 128, 0, 128, 128, 128, 128, 128, // 6490 0, 128, 2, 128, 128, 128, 128, 128, // 6491 128, 0, 2, 128, 128, 128, 128, 128, // 6492 0, 2, 4, 128, 128, 128, 128, 128, // 6493 128, 128, 128, 0, 128, 128, 128, 128, // 6494 0, 128, 128, 2, 128, 128, 128, 128, // 6495 128, 0, 128, 2, 128, 128, 128, 128, // 6496 0, 2, 128, 4, 128, 128, 128, 128, // 6497 128, 128, 0, 2, 128, 128, 128, 128, // 6498 0, 128, 2, 4, 128, 128, 128, 128, // 6499 128, 0, 2, 4, 128, 128, 128, 128, // 6500 0, 2, 4, 6, 128, 128, 128, 128, // 6501 128, 128, 128, 128, 0, 128, 128, 128, // 6502 0, 128, 128, 128, 2, 128, 128, 128, // 6503 128, 0, 128, 128, 2, 128, 128, 128, // 6504 0, 2, 128, 128, 4, 128, 128, 128, // 6505 128, 128, 0, 128, 2, 128, 128, 128, // 6506 0, 128, 2, 128, 4, 128, 128, 128, // 6507 128, 0, 2, 128, 4, 128, 128, 128, // 6508 0, 2, 4, 128, 6, 128, 128, 128, // 6509 128, 128, 128, 0, 2, 128, 128, 128, // 6510 0, 128, 128, 2, 4, 128, 128, 128, // 6511 128, 0, 128, 2, 4, 128, 128, 128, // 6512 0, 2, 128, 4, 6, 128, 128, 128, // 6513 128, 128, 0, 2, 4, 128, 128, 128, // 6514 0, 128, 2, 4, 6, 128, 128, 128, // 6515 128, 0, 2, 4, 6, 128, 128, 128, // 6516 0, 2, 4, 6, 8, 128, 128, 128, // 6517 128, 128, 128, 128, 128, 0, 128, 128, // 6518 0, 128, 128, 128, 128, 2, 128, 128, // 6519 128, 0, 128, 128, 128, 2, 128, 128, // 6520 0, 2, 128, 128, 128, 4, 128, 128, // 6521 128, 128, 0, 128, 128, 2, 128, 128, // 6522 0, 128, 2, 128, 128, 4, 128, 128, // 6523 128, 0, 2, 128, 128, 4, 128, 128, // 6524 0, 2, 4, 128, 128, 6, 128, 128, // 6525 128, 128, 128, 0, 128, 2, 128, 128, // 6526 0, 128, 128, 2, 128, 4, 128, 128, // 6527 128, 0, 128, 2, 128, 4, 128, 128, // 6528 0, 2, 128, 4, 128, 6, 128, 128, // 6529 128, 128, 0, 2, 128, 4, 128, 128, // 6530 0, 128, 2, 4, 128, 6, 128, 128, // 6531 128, 0, 2, 4, 128, 6, 128, 128, // 6532 0, 2, 4, 6, 128, 8, 128, 128, // 6533 128, 128, 128, 128, 0, 2, 128, 128, // 6534 0, 128, 128, 128, 2, 4, 128, 128, // 6535 128, 0, 128, 128, 2, 
4, 128, 128, // 6536 0, 2, 128, 128, 4, 6, 128, 128, // 6537 128, 128, 0, 128, 2, 4, 128, 128, // 6538 0, 128, 2, 128, 4, 6, 128, 128, // 6539 128, 0, 2, 128, 4, 6, 128, 128, // 6540 0, 2, 4, 128, 6, 8, 128, 128, // 6541 128, 128, 128, 0, 2, 4, 128, 128, // 6542 0, 128, 128, 2, 4, 6, 128, 128, // 6543 128, 0, 128, 2, 4, 6, 128, 128, // 6544 0, 2, 128, 4, 6, 8, 128, 128, // 6545 128, 128, 0, 2, 4, 6, 128, 128, // 6546 0, 128, 2, 4, 6, 8, 128, 128, // 6547 128, 0, 2, 4, 6, 8, 128, 128, // 6548 0, 2, 4, 6, 8, 10, 128, 128, // 6549 128, 128, 128, 128, 128, 128, 0, 128, // 6550 0, 128, 128, 128, 128, 128, 2, 128, // 6551 128, 0, 128, 128, 128, 128, 2, 128, // 6552 0, 2, 128, 128, 128, 128, 4, 128, // 6553 128, 128, 0, 128, 128, 128, 2, 128, // 6554 0, 128, 2, 128, 128, 128, 4, 128, // 6555 128, 0, 2, 128, 128, 128, 4, 128, // 6556 0, 2, 4, 128, 128, 128, 6, 128, // 6557 128, 128, 128, 0, 128, 128, 2, 128, // 6558 0, 128, 128, 2, 128, 128, 4, 128, // 6559 128, 0, 128, 2, 128, 128, 4, 128, // 6560 0, 2, 128, 4, 128, 128, 6, 128, // 6561 128, 128, 0, 2, 128, 128, 4, 128, // 6562 0, 128, 2, 4, 128, 128, 6, 128, // 6563 128, 0, 2, 4, 128, 128, 6, 128, // 6564 0, 2, 4, 6, 128, 128, 8, 128, // 6565 128, 128, 128, 128, 0, 128, 2, 128, // 6566 0, 128, 128, 128, 2, 128, 4, 128, // 6567 128, 0, 128, 128, 2, 128, 4, 128, // 6568 0, 2, 128, 128, 4, 128, 6, 128, // 6569 128, 128, 0, 128, 2, 128, 4, 128, // 6570 0, 128, 2, 128, 4, 128, 6, 128, // 6571 128, 0, 2, 128, 4, 128, 6, 128, // 6572 0, 2, 4, 128, 6, 128, 8, 128, // 6573 128, 128, 128, 0, 2, 128, 4, 128, // 6574 0, 128, 128, 2, 4, 128, 6, 128, // 6575 128, 0, 128, 2, 4, 128, 6, 128, // 6576 0, 2, 128, 4, 6, 128, 8, 128, // 6577 128, 128, 0, 2, 4, 128, 6, 128, // 6578 0, 128, 2, 4, 6, 128, 8, 128, // 6579 128, 0, 2, 4, 6, 128, 8, 128, // 6580 0, 2, 4, 6, 8, 128, 10, 128, // 6581 128, 128, 128, 128, 128, 0, 2, 128, // 6582 0, 128, 128, 128, 128, 2, 4, 128, // 6583 128, 0, 128, 128, 128, 2, 4, 128, // 6584 0, 2, 128, 128, 128, 4, 6, 128, // 6585 128, 128, 0, 128, 128, 2, 4, 128, // 6586 0, 128, 2, 128, 128, 4, 6, 128, // 6587 128, 0, 2, 128, 128, 4, 6, 128, // 6588 0, 2, 4, 128, 128, 6, 8, 128, // 6589 128, 128, 128, 0, 128, 2, 4, 128, // 6590 0, 128, 128, 2, 128, 4, 6, 128, // 6591 128, 0, 128, 2, 128, 4, 6, 128, // 6592 0, 2, 128, 4, 128, 6, 8, 128, // 6593 128, 128, 0, 2, 128, 4, 6, 128, // 6594 0, 128, 2, 4, 128, 6, 8, 128, // 6595 128, 0, 2, 4, 128, 6, 8, 128, // 6596 0, 2, 4, 6, 128, 8, 10, 128, // 6597 128, 128, 128, 128, 0, 2, 4, 128, // 6598 0, 128, 128, 128, 2, 4, 6, 128, // 6599 128, 0, 128, 128, 2, 4, 6, 128, // 6600 0, 2, 128, 128, 4, 6, 8, 128, // 6601 128, 128, 0, 128, 2, 4, 6, 128, // 6602 0, 128, 2, 128, 4, 6, 8, 128, // 6603 128, 0, 2, 128, 4, 6, 8, 128, // 6604 0, 2, 4, 128, 6, 8, 10, 128, // 6605 128, 128, 128, 0, 2, 4, 6, 128, // 6606 0, 128, 128, 2, 4, 6, 8, 128, // 6607 128, 0, 128, 2, 4, 6, 8, 128, // 6608 0, 2, 128, 4, 6, 8, 10, 128, // 6609 128, 128, 0, 2, 4, 6, 8, 128, // 6610 0, 128, 2, 4, 6, 8, 10, 128, // 6611 128, 0, 2, 4, 6, 8, 10, 128, // 6612 0, 2, 4, 6, 8, 10, 12, 128, // 6613 128, 128, 128, 128, 128, 128, 128, 0, // 6614 0, 128, 128, 128, 128, 128, 128, 2, // 6615 128, 0, 128, 128, 128, 128, 128, 2, // 6616 0, 2, 128, 128, 128, 128, 128, 4, // 6617 128, 128, 0, 128, 128, 128, 128, 2, // 6618 0, 128, 2, 128, 128, 128, 128, 4, // 6619 128, 0, 2, 128, 128, 128, 128, 4, // 6620 0, 2, 4, 128, 128, 128, 128, 6, // 6621 128, 128, 128, 0, 128, 128, 128, 2, // 6622 0, 128, 128, 2, 128, 128, 128, 4, // 6623 128, 0, 128, 2, 128, 
128, 128, 4, // 6624 0, 2, 128, 4, 128, 128, 128, 6, // 6625 128, 128, 0, 2, 128, 128, 128, 4, // 6626 0, 128, 2, 4, 128, 128, 128, 6, // 6627 128, 0, 2, 4, 128, 128, 128, 6, // 6628 0, 2, 4, 6, 128, 128, 128, 8, // 6629 128, 128, 128, 128, 0, 128, 128, 2, // 6630 0, 128, 128, 128, 2, 128, 128, 4, // 6631 128, 0, 128, 128, 2, 128, 128, 4, // 6632 0, 2, 128, 128, 4, 128, 128, 6, // 6633 128, 128, 0, 128, 2, 128, 128, 4, // 6634 0, 128, 2, 128, 4, 128, 128, 6, // 6635 128, 0, 2, 128, 4, 128, 128, 6, // 6636 0, 2, 4, 128, 6, 128, 128, 8, // 6637 128, 128, 128, 0, 2, 128, 128, 4, // 6638 0, 128, 128, 2, 4, 128, 128, 6, // 6639 128, 0, 128, 2, 4, 128, 128, 6, // 6640 0, 2, 128, 4, 6, 128, 128, 8, // 6641 128, 128, 0, 2, 4, 128, 128, 6, // 6642 0, 128, 2, 4, 6, 128, 128, 8, // 6643 128, 0, 2, 4, 6, 128, 128, 8, // 6644 0, 2, 4, 6, 8, 128, 128, 10, // 6645 128, 128, 128, 128, 128, 0, 128, 2, // 6646 0, 128, 128, 128, 128, 2, 128, 4, // 6647 128, 0, 128, 128, 128, 2, 128, 4, // 6648 0, 2, 128, 128, 128, 4, 128, 6, // 6649 128, 128, 0, 128, 128, 2, 128, 4, // 6650 0, 128, 2, 128, 128, 4, 128, 6, // 6651 128, 0, 2, 128, 128, 4, 128, 6, // 6652 0, 2, 4, 128, 128, 6, 128, 8, // 6653 128, 128, 128, 0, 128, 2, 128, 4, // 6654 0, 128, 128, 2, 128, 4, 128, 6, // 6655 128, 0, 128, 2, 128, 4, 128, 6, // 6656 0, 2, 128, 4, 128, 6, 128, 8, // 6657 128, 128, 0, 2, 128, 4, 128, 6, // 6658 0, 128, 2, 4, 128, 6, 128, 8, // 6659 128, 0, 2, 4, 128, 6, 128, 8, // 6660 0, 2, 4, 6, 128, 8, 128, 10, // 6661 128, 128, 128, 128, 0, 2, 128, 4, // 6662 0, 128, 128, 128, 2, 4, 128, 6, // 6663 128, 0, 128, 128, 2, 4, 128, 6, // 6664 0, 2, 128, 128, 4, 6, 128, 8, // 6665 128, 128, 0, 128, 2, 4, 128, 6, // 6666 0, 128, 2, 128, 4, 6, 128, 8, // 6667 128, 0, 2, 128, 4, 6, 128, 8, // 6668 0, 2, 4, 128, 6, 8, 128, 10, // 6669 128, 128, 128, 0, 2, 4, 128, 6, // 6670 0, 128, 128, 2, 4, 6, 128, 8, // 6671 128, 0, 128, 2, 4, 6, 128, 8, // 6672 0, 2, 128, 4, 6, 8, 128, 10, // 6673 128, 128, 0, 2, 4, 6, 128, 8, // 6674 0, 128, 2, 4, 6, 8, 128, 10, // 6675 128, 0, 2, 4, 6, 8, 128, 10, // 6676 0, 2, 4, 6, 8, 10, 128, 12, // 6677 128, 128, 128, 128, 128, 128, 0, 2, // 6678 0, 128, 128, 128, 128, 128, 2, 4, // 6679 128, 0, 128, 128, 128, 128, 2, 4, // 6680 0, 2, 128, 128, 128, 128, 4, 6, // 6681 128, 128, 0, 128, 128, 128, 2, 4, // 6682 0, 128, 2, 128, 128, 128, 4, 6, // 6683 128, 0, 2, 128, 128, 128, 4, 6, // 6684 0, 2, 4, 128, 128, 128, 6, 8, // 6685 128, 128, 128, 0, 128, 128, 2, 4, // 6686 0, 128, 128, 2, 128, 128, 4, 6, // 6687 128, 0, 128, 2, 128, 128, 4, 6, // 6688 0, 2, 128, 4, 128, 128, 6, 8, // 6689 128, 128, 0, 2, 128, 128, 4, 6, // 6690 0, 128, 2, 4, 128, 128, 6, 8, // 6691 128, 0, 2, 4, 128, 128, 6, 8, // 6692 0, 2, 4, 6, 128, 128, 8, 10, // 6693 128, 128, 128, 128, 0, 128, 2, 4, // 6694 0, 128, 128, 128, 2, 128, 4, 6, // 6695 128, 0, 128, 128, 2, 128, 4, 6, // 6696 0, 2, 128, 128, 4, 128, 6, 8, // 6697 128, 128, 0, 128, 2, 128, 4, 6, // 6698 0, 128, 2, 128, 4, 128, 6, 8, // 6699 128, 0, 2, 128, 4, 128, 6, 8, // 6700 0, 2, 4, 128, 6, 128, 8, 10, // 6701 128, 128, 128, 0, 2, 128, 4, 6, // 6702 0, 128, 128, 2, 4, 128, 6, 8, // 6703 128, 0, 128, 2, 4, 128, 6, 8, // 6704 0, 2, 128, 4, 6, 128, 8, 10, // 6705 128, 128, 0, 2, 4, 128, 6, 8, // 6706 0, 128, 2, 4, 6, 128, 8, 10, // 6707 128, 0, 2, 4, 6, 128, 8, 10, // 6708 0, 2, 4, 6, 8, 128, 10, 12, // 6709 128, 128, 128, 128, 128, 0, 2, 4, // 6710 0, 128, 128, 128, 128, 2, 4, 6, // 6711 128, 0, 128, 128, 128, 2, 4, 6, // 6712 0, 2, 128, 128, 128, 4, 6, 8, // 6713 128, 128, 0, 128, 128, 
2, 4, 6, // 6714 0, 128, 2, 128, 128, 4, 6, 8, // 6715 128, 0, 2, 128, 128, 4, 6, 8, // 6716 0, 2, 4, 128, 128, 6, 8, 10, // 6717 128, 128, 128, 0, 128, 2, 4, 6, // 6718 0, 128, 128, 2, 128, 4, 6, 8, // 6719 128, 0, 128, 2, 128, 4, 6, 8, // 6720 0, 2, 128, 4, 128, 6, 8, 10, // 6721 128, 128, 0, 2, 128, 4, 6, 8, // 6722 0, 128, 2, 4, 128, 6, 8, 10, // 6723 128, 0, 2, 4, 128, 6, 8, 10, // 6724 0, 2, 4, 6, 128, 8, 10, 12, // 6725 128, 128, 128, 128, 0, 2, 4, 6, // 6726 0, 128, 128, 128, 2, 4, 6, 8, // 6727 128, 0, 128, 128, 2, 4, 6, 8, // 6728 0, 2, 128, 128, 4, 6, 8, 10, // 6729 128, 128, 0, 128, 2, 4, 6, 8, // 6730 0, 128, 2, 128, 4, 6, 8, 10, // 6731 128, 0, 2, 128, 4, 6, 8, 10, // 6732 0, 2, 4, 128, 6, 8, 10, 12, // 6733 128, 128, 128, 0, 2, 4, 6, 8, // 6734 0, 128, 128, 2, 4, 6, 8, 10, // 6735 128, 0, 128, 2, 4, 6, 8, 10, // 6736 0, 2, 128, 4, 6, 8, 10, 12, // 6737 128, 128, 0, 2, 4, 6, 8, 10, // 6738 0, 128, 2, 4, 6, 8, 10, 12, // 6739 128, 0, 2, 4, 6, 8, 10, 12, // 6740 0, 2, 4, 6, 8, 10, 12, 14}; 6741 // Extend to double length because InterleaveLower will only use the (valid) 6742 // lower half, and we want N u16. 6743 const Twice<decltype(du8)> du8x2; 6744 const Vec128<uint8_t, 2 * N> indices8 = 6745 ZeroExtendVector(du8x2, Load(du8, table + mask_bits * 8)); 6746 const Vec128<uint16_t, N> indices16 = 6747 BitCast(du, InterleaveLower(du8x2, indices8, indices8)); 6748 // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte 6749 // indices, add 0 to even and 1 to odd byte lanes. 6750 const Vec128<uint16_t, N> byte_indices = Add( 6751 indices16, 6752 Set(du, static_cast<uint16_t>(HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001))); 6753 return BitCast(d, TableLookupBytesOr0(v, byte_indices)); 6754 } 6755 6756 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 6757 HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) { 6758 const DFromV<decltype(v)> d; 6759 const RebindToUnsigned<decltype(d)> du; 6760 6761 const uint64_t mask_bits = BitsFromMask(d, mask); 6762 6763 alignas(16) static constexpr uint32_t packed_array[16] = { 6764 // PrintExpand64x4Nibble - same for 32x4. 6765 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0, 6766 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10, 6767 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210}; 6768 6769 // For lane i, shift the i-th 4-bit index down to bits [0, 2). 6770 const Vec128<uint32_t, N> packed = Set(du, packed_array[mask_bits]); 6771 alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12}; 6772 Vec128<uint32_t, N> indices = packed >> Load(du, shifts); 6773 // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec 6774 // checks bounds, so clear the upper bits. 6775 indices = And(indices, Set(du, N - 1)); 6776 const Vec128<uint32_t, N> expand = 6777 TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices)); 6778 // TableLookupLanes cannot also zero masked-off lanes, so do that now. 6779 return IfThenElseZero(mask, BitCast(d, expand)); 6780 } 6781 6782 template <typename T, HWY_IF_T_SIZE(T, 8)> 6783 HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) { 6784 // Same as Compress, just zero out the mask=false lanes. 6785 return IfThenElseZero(mask, Compress(v, mask)); 6786 } 6787 6788 // For single-element vectors, this is at least as fast as native. 
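// To recap the contract implemented by the size-dispatched overloads above:
// the i-th true lane of the mask receives input lane i, and lanes whose mask
// bit is false are zeroed. Illustrative example with 4 lanes:
//   v = {a, b, c, d}, mask = {0, 1, 0, 1}  =>  Expand = {0, a, 0, b}.
// With a single lane, Expand reduces to IfThenElseZero: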
template <typename T>
HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
  return IfThenElseZero(mask, v);
}

// ------------------------------ LoadExpand
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  return Expand(LoadU(d, unaligned), mask);
}

#endif  // HWY_NATIVE_EXPAND

// ------------------------------ TwoTablesLookupLanes

template <class D>
using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned<D>())));

// RVV/SVE have their own implementations of
// TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, IndicesFromD<D> idx)
#if HWY_TARGET != HWY_RVV && !HWY_TARGET_IS_SVE
template <class D>
HWY_API VFromD<D> TwoTablesLookupLanes(D /*d*/, VFromD<D> a, VFromD<D> b,
                                       IndicesFromD<D> idx) {
  return TwoTablesLookupLanes(a, b, idx);
}
#endif

// ------------------------------ Reverse2, Reverse4, Reverse8 (8-bit)

#if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

#undef HWY_PREFER_ROTATE
// Platforms on which RotateRight is likely faster than TableLookupBytes.
// RVV and SVE have their own implementations of this in any case.
#if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \
    HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8
#define HWY_PREFER_ROTATE 1
#else
#define HWY_PREFER_ROTATE 0
#endif

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  // Exclude AVX3 because its 16-bit RotateRight is actually 3 instructions.
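  // Swapping each byte pair is exactly a 16-bit rotate by 8: e.g. the bytes
  // {0x11, 0x22} form the u16 0x2211 on little-endian, and RotateRight<8>
  // yields 0x1122, i.e. the bytes {0x22, 0x11}. Rotating by 8 swaps the two
  // bytes on big-endian as well.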
#if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3
  const Repartition<uint16_t, decltype(d)> du16;
  return BitCast(d, RotateRight<8>(BitCast(du16, v)));
#else
  const VFromD<D> shuffle = Dup128VecFromValues(d, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
                                                11, 10, 13, 12, 15, 14);
  return TableLookupBytes(v, shuffle);
#endif
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
#if HWY_PREFER_ROTATE
  const Repartition<uint16_t, decltype(d)> du16;
  return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v))));
#else
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
      du8, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
  return TableLookupBytes(v, BitCast(d, shuffle));
#endif
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
#if HWY_PREFER_ROTATE
  const Repartition<uint32_t, D> du32;
  return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v))));
#else
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
      du8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
  return TableLookupBytes(v, BitCast(d, shuffle));
#endif
}

#endif  // HWY_NATIVE_REVERSE2_8

// ------------------------------ ReverseLaneBytes

#if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
#undef HWY_NATIVE_REVERSE_LANE_BYTES
#else
#define HWY_NATIVE_REVERSE_LANE_BYTES
#endif

template <class V, HWY_IF_T_SIZE_V(V, 2)>
HWY_API V ReverseLaneBytes(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, Reverse2(du8, BitCast(du8, v)));
}

template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_API V ReverseLaneBytes(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, Reverse4(du8, BitCast(du8, v)));
}

template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_API V ReverseLaneBytes(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, Reverse8(du8, BitCast(du8, v)));
}

#endif  // HWY_NATIVE_REVERSE_LANE_BYTES

// ------------------------------ ReverseBits

// On these targets, we emulate 8-bit shifts using 16-bit shifts and therefore
// require at least two lanes to BitCast to 16-bit. We avoid Highway's 8-bit
// shifts because those would add extra masking already taken care of by
// UI8ReverseBitsStep. Note that AVX3_DL/AVX3_ZEN4 support GFNI and use it to
// implement ReverseBits, so this code is not used there.
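// The generic implementation below is the classic three-step reversal: swap
// adjacent bits (shift by 1, mask 0x55), then adjacent bit pairs (shift by 2,
// mask 0x33), then nibbles (shift by 4, mask 0x0F). Worked example
// (illustrative): 0b10110001 -> 0b01110010 -> 0b11011000 -> 0b10001101.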
6917 #undef HWY_REVERSE_BITS_MIN_BYTES 6918 #if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \ 6919 HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256) 6920 #define HWY_REVERSE_BITS_MIN_BYTES 2 6921 #else 6922 #define HWY_REVERSE_BITS_MIN_BYTES 1 6923 #endif 6924 6925 #if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE)) 6926 #ifdef HWY_NATIVE_REVERSE_BITS_UI8 6927 #undef HWY_NATIVE_REVERSE_BITS_UI8 6928 #else 6929 #define HWY_NATIVE_REVERSE_BITS_UI8 6930 #endif 6931 6932 namespace detail { 6933 6934 template <int kShiftAmt, int kShrResultMask, class V, 6935 HWY_IF_V_SIZE_GT_D(DFromV<V>, HWY_REVERSE_BITS_MIN_BYTES - 1)> 6936 HWY_INLINE V UI8ReverseBitsStep(V v) { 6937 const DFromV<decltype(v)> d; 6938 const RebindToUnsigned<decltype(d)> du; 6939 #if HWY_REVERSE_BITS_MIN_BYTES == 2 6940 const Repartition<uint16_t, decltype(d)> d_shift; 6941 #else 6942 const RebindToUnsigned<decltype(d)> d_shift; 6943 #endif 6944 6945 const auto v_to_shift = BitCast(d_shift, v); 6946 const auto shl_result = BitCast(d, ShiftLeft<kShiftAmt>(v_to_shift)); 6947 const auto shr_result = BitCast(d, ShiftRight<kShiftAmt>(v_to_shift)); 6948 const auto shr_result_mask = 6949 BitCast(d, Set(du, static_cast<uint8_t>(kShrResultMask))); 6950 return Or(And(shr_result, shr_result_mask), 6951 AndNot(shr_result_mask, shl_result)); 6952 } 6953 6954 #if HWY_REVERSE_BITS_MIN_BYTES == 2 6955 template <int kShiftAmt, int kShrResultMask, class V, 6956 HWY_IF_V_SIZE_D(DFromV<V>, 1)> 6957 HWY_INLINE V UI8ReverseBitsStep(V v) { 6958 return V{UI8ReverseBitsStep<kShiftAmt, kShrResultMask>(Vec128<uint8_t>{v.raw}) 6959 .raw}; 6960 } 6961 #endif 6962 6963 } // namespace detail 6964 6965 template <class V, HWY_IF_T_SIZE_V(V, 1)> 6966 HWY_API V ReverseBits(V v) { 6967 auto result = detail::UI8ReverseBitsStep<1, 0x55>(v); 6968 result = detail::UI8ReverseBitsStep<2, 0x33>(result); 6969 result = detail::UI8ReverseBitsStep<4, 0x0F>(result); 6970 return result; 6971 } 6972 6973 #endif // HWY_NATIVE_REVERSE_BITS_UI8 6974 6975 #if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE)) 6976 #ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64 6977 #undef HWY_NATIVE_REVERSE_BITS_UI16_32_64 6978 #else 6979 #define HWY_NATIVE_REVERSE_BITS_UI16_32_64 6980 #endif 6981 6982 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8)), 6983 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 6984 HWY_API V ReverseBits(V v) { 6985 const DFromV<decltype(v)> d; 6986 const Repartition<uint8_t, decltype(d)> du8; 6987 return ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v)))); 6988 } 6989 #endif // HWY_NATIVE_REVERSE_BITS_UI16_32_64 6990 6991 // ------------------------------ Per4LaneBlockShuffle 6992 6993 #if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE)) 6994 #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 6995 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 6996 #else 6997 #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 6998 #endif 6999 7000 #if HWY_TARGET != HWY_SCALAR || HWY_IDE 7001 namespace detail { 7002 7003 template <class D> 7004 HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, 7005 const uint32_t x2, 7006 const uint32_t x1, 7007 const uint32_t x0) { 7008 #if HWY_TARGET == HWY_RVV 7009 constexpr int kPow2 = d.Pow2(); 7010 constexpr int kLoadPow2 = HWY_MAX(kPow2, -1); 7011 const ScalableTag<uint32_t, kLoadPow2> d_load; 7012 #else 7013 constexpr size_t kMaxBytes = d.MaxBytes(); 7014 #if HWY_TARGET_IS_NEON 7015 constexpr size_t kMinLanesToLoad = 2; 7016 #else 7017 constexpr size_t 
kMinLanesToLoad = 4; 7018 #endif 7019 constexpr size_t kNumToLoad = 7020 HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad); 7021 const CappedTag<uint32_t, kNumToLoad> d_load; 7022 #endif 7023 return ResizeBitCast(d, Dup128VecFromValues(d_load, x0, x1, x2, x3)); 7024 } 7025 7026 } // namespace detail 7027 #endif 7028 7029 #endif // HWY_NATIVE_PER4LANEBLKSHUF_DUP32 7030 7031 #if HWY_TARGET != HWY_SCALAR || HWY_IDE 7032 namespace detail { 7033 7034 template <class V> 7035 HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<0> /*idx_10_tag*/, V v) { 7036 return DupEven(v); 7037 } 7038 7039 template <class V> 7040 HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<1> /*idx_10_tag*/, V v) { 7041 const DFromV<decltype(v)> d; 7042 return Reverse2(d, v); 7043 } 7044 7045 template <class V> 7046 HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<2> /*idx_10_tag*/, V v) { 7047 return v; 7048 } 7049 7050 template <class V> 7051 HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<3> /*idx_10_tag*/, V v) { 7052 return DupOdd(v); 7053 } 7054 7055 HWY_INLINE uint32_t U8x4Per4LaneBlkIndices(const uint32_t idx3, 7056 const uint32_t idx2, 7057 const uint32_t idx1, 7058 const uint32_t idx0) { 7059 #if HWY_IS_LITTLE_ENDIAN 7060 return static_cast<uint32_t>((idx3 << 24) | (idx2 << 16) | (idx1 << 8) | 7061 idx0); 7062 #else 7063 return static_cast<uint32_t>(idx3 | (idx2 << 8) | (idx1 << 16) | 7064 (idx0 << 24)); 7065 #endif 7066 } 7067 7068 template <class D> 7069 HWY_INLINE Vec<D> TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3, 7070 const uint32_t idx2, 7071 const uint32_t idx1, 7072 const uint32_t idx0) { 7073 #if HWY_TARGET == HWY_RVV 7074 const AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32; 7075 #else 7076 const Repartition<uint32_t, D> du32; 7077 #endif 7078 7079 return ResizeBitCast( 7080 d, Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0))); 7081 } 7082 7083 #if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_EMU128 7084 #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr 7085 #else 7086 #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8) 7087 7088 template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))> 7089 HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, V idx) { 7090 const DFromV<decltype(v)> d; 7091 const Repartition<uint8_t, decltype(d)> du8; 7092 return BitCast(d, TableLookupBytes(BitCast(du8, v), BitCast(du8, idx))); 7093 } 7094 7095 template <class D, HWY_IF_T_SIZE_D(D, 1)> 7096 HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, 7097 const uint32_t idx2, 7098 const uint32_t idx1, 7099 const uint32_t idx0) { 7100 const Repartition<uint32_t, decltype(d)> du32; 7101 const uint32_t idx3210 = U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0); 7102 const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( 7103 du32, static_cast<uint32_t>(idx3210 + 0x0C0C0C0C), 7104 static_cast<uint32_t>(idx3210 + 0x08080808), 7105 static_cast<uint32_t>(idx3210 + 0x04040404), 7106 static_cast<uint32_t>(idx3210)); 7107 return ResizeBitCast(d, v_byte_idx); 7108 } 7109 7110 template <class D, HWY_IF_T_SIZE_D(D, 2)> 7111 HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, 7112 const uint32_t idx2, 7113 const uint32_t idx1, 7114 const uint32_t idx0) { 7115 const Repartition<uint32_t, decltype(d)> du32; 7116 #if HWY_IS_LITTLE_ENDIAN 7117 const uint32_t idx10 = static_cast<uint32_t>((idx1 << 16) | idx0); 7118 const uint32_t idx32 = static_cast<uint32_t>((idx3 << 16) | idx2); 7119 constexpr uint32_t 
kLaneByteOffsets{0x01000100}; 7120 #else 7121 const uint32_t idx10 = static_cast<uint32_t>(idx1 | (idx0 << 16)); 7122 const uint32_t idx32 = static_cast<uint32_t>(idx3 | (idx2 << 16)); 7123 constexpr uint32_t kLaneByteOffsets{0x00010001}; 7124 #endif 7125 constexpr uint32_t kHiLaneByteOffsets{kLaneByteOffsets + 0x08080808u}; 7126 7127 const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( 7128 du32, static_cast<uint32_t>(idx32 * 0x0202u + kHiLaneByteOffsets), 7129 static_cast<uint32_t>(idx10 * 0x0202u + kHiLaneByteOffsets), 7130 static_cast<uint32_t>(idx32 * 0x0202u + kLaneByteOffsets), 7131 static_cast<uint32_t>(idx10 * 0x0202u + kLaneByteOffsets)); 7132 return ResizeBitCast(d, v_byte_idx); 7133 } 7134 7135 template <class D, HWY_IF_T_SIZE_D(D, 4)> 7136 HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, 7137 const uint32_t idx2, 7138 const uint32_t idx1, 7139 const uint32_t idx0) { 7140 const Repartition<uint32_t, decltype(d)> du32; 7141 #if HWY_IS_LITTLE_ENDIAN 7142 constexpr uint32_t kLaneByteOffsets{0x03020100}; 7143 #else 7144 constexpr uint32_t kLaneByteOffsets{0x00010203}; 7145 #endif 7146 7147 const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( 7148 du32, static_cast<uint32_t>(idx3 * 0x04040404u + kLaneByteOffsets), 7149 static_cast<uint32_t>(idx2 * 0x04040404u + kLaneByteOffsets), 7150 static_cast<uint32_t>(idx1 * 0x04040404u + kLaneByteOffsets), 7151 static_cast<uint32_t>(idx0 * 0x04040404u + kLaneByteOffsets)); 7152 return ResizeBitCast(d, v_byte_idx); 7153 } 7154 #endif 7155 7156 template <class D, HWY_IF_T_SIZE_D(D, 1)> 7157 HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, 7158 const uint32_t idx2, 7159 const uint32_t idx1, 7160 const uint32_t idx0) { 7161 return TblLookupPer4LaneBlkU8IdxInBlk(d, idx3, idx2, idx1, idx0); 7162 } 7163 7164 #if HWY_TARGET == HWY_RVV 7165 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> 7166 HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, 7167 const uint32_t idx2, 7168 const uint32_t idx1, 7169 const uint32_t idx0) { 7170 const Rebind<uint8_t, decltype(d)> du8; 7171 return PromoteTo(d, 7172 TblLookupPer4LaneBlkU8IdxInBlk(du8, idx3, idx2, idx1, idx0)); 7173 } 7174 #else 7175 template <class D, HWY_IF_T_SIZE_D(D, 2)> 7176 HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, 7177 const uint32_t idx2, 7178 const uint32_t idx1, 7179 const uint32_t idx0) { 7180 const uint16_t u16_idx0 = static_cast<uint16_t>(idx0); 7181 const uint16_t u16_idx1 = static_cast<uint16_t>(idx1); 7182 const uint16_t u16_idx2 = static_cast<uint16_t>(idx2); 7183 const uint16_t u16_idx3 = static_cast<uint16_t>(idx3); 7184 #if HWY_TARGET_IS_NEON 7185 constexpr size_t kMinLanesToLoad = 4; 7186 #else 7187 constexpr size_t kMinLanesToLoad = 8; 7188 #endif 7189 constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad); 7190 const CappedTag<uint16_t, kNumToLoad> d_load; 7191 return ResizeBitCast( 7192 d, Dup128VecFromValues(d_load, u16_idx0, u16_idx1, u16_idx2, u16_idx3, 7193 u16_idx0, u16_idx1, u16_idx2, u16_idx3)); 7194 } 7195 7196 template <class D, HWY_IF_T_SIZE_D(D, 4)> 7197 HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, 7198 const uint32_t idx2, 7199 const uint32_t idx1, 7200 const uint32_t idx0) { 7201 return Per4LaneBlkShufDupSet4xU32(d, idx3, idx2, idx1, idx0); 7202 } 7203 7204 template <class D, HWY_IF_T_SIZE_D(D, 8)> 7205 HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, 7206 const uint32_t idx2, 7207 const 
uint32_t idx1, 7208 const uint32_t idx0) { 7209 const RebindToUnsigned<decltype(d)> du; 7210 const Rebind<uint32_t, decltype(d)> du32; 7211 return BitCast(d, PromoteTo(du, Per4LaneBlkShufDupSet4xU32(du32, idx3, idx2, 7212 idx1, idx0))); 7213 } 7214 #endif 7215 7216 template <class D, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D)> 7217 HWY_INLINE IndicesFromD<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, 7218 const uint32_t idx2, 7219 const uint32_t idx1, 7220 const uint32_t idx0) { 7221 const RebindToUnsigned<decltype(d)> du; 7222 using TU = TFromD<decltype(du)>; 7223 auto idx_in_blk = TblLookupPer4LaneBlkIdxInBlk(du, idx3, idx2, idx1, idx0); 7224 7225 constexpr size_t kN = HWY_MAX_LANES_D(D); 7226 if (kN < 4) { 7227 idx_in_blk = And(idx_in_blk, Set(du, static_cast<TU>(kN - 1))); 7228 } 7229 7230 #if HWY_TARGET == HWY_RVV 7231 const auto blk_offsets = AndS(Iota0(du), static_cast<TU>(~TU{3})); 7232 #else 7233 const auto blk_offsets = 7234 And(Iota(du, TU{0}), Set(du, static_cast<TU>(~TU{3}))); 7235 #endif 7236 return IndicesFromVec(d, Add(idx_in_blk, blk_offsets)); 7237 } 7238 7239 template <class V, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(DFromV<V>)> 7240 HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, IndicesFromD<DFromV<V>> idx) { 7241 return TableLookupLanes(v, idx); 7242 } 7243 7244 #undef HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE 7245 7246 template <class V> 7247 HWY_INLINE V TblLookupPer4LaneBlkShuf(V v, size_t idx3210) { 7248 const DFromV<decltype(v)> d; 7249 const uint32_t idx3 = static_cast<uint32_t>((idx3210 >> 6) & 3); 7250 const uint32_t idx2 = static_cast<uint32_t>((idx3210 >> 4) & 3); 7251 const uint32_t idx1 = static_cast<uint32_t>((idx3210 >> 2) & 3); 7252 const uint32_t idx0 = static_cast<uint32_t>(idx3210 & 3); 7253 const auto idx = TblLookupPer4LaneBlkShufIdx(d, idx3, idx2, idx1, idx0); 7254 return Per4LaneBlkShufDoTblLookup(v, idx); 7255 } 7256 7257 // The detail::Per4LaneBlockShuffle overloads that have the extra lane_size_tag 7258 // and vect_size_tag parameters are only called for vectors that have at 7259 // least 4 lanes (or scalable vectors that might possibly have 4 or more lanes) 7260 template <size_t kIdx3210, size_t kLaneSize, size_t kVectSize, class V> 7261 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, 7262 hwy::SizeTag<kLaneSize> /*lane_size_tag*/, 7263 hwy::SizeTag<kVectSize> /*vect_size_tag*/, 7264 V v) { 7265 return TblLookupPer4LaneBlkShuf(v, kIdx3210); 7266 } 7267 7268 #if HWY_HAVE_FLOAT64 7269 template <class V> 7270 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide( 7271 hwy::FloatTag /* type_tag */, hwy::SizeTag<4> /* lane_size_tag */, V v) { 7272 const DFromV<decltype(v)> d; 7273 const RepartitionToWide<decltype(d)> dw; 7274 return BitCast(dw, v); 7275 } 7276 #endif 7277 7278 template <size_t kLaneSize, class V> 7279 HWY_INLINE VFromD<RepartitionToWide<RebindToUnsigned<DFromV<V>>>> 7280 Per4LaneBlockShufCastToWide(hwy::FloatTag /* type_tag */, 7281 hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) { 7282 const DFromV<decltype(v)> d; 7283 const RebindToUnsigned<decltype(d)> du; 7284 const RepartitionToWide<decltype(du)> dw; 7285 return BitCast(dw, v); 7286 } 7287 7288 template <size_t kLaneSize, class V> 7289 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide( 7290 hwy::NonFloatTag /* type_tag */, 7291 hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) { 7292 const DFromV<decltype(v)> d; 7293 const RepartitionToWide<decltype(d)> dw; 7294 return BitCast(dw, v); 7295 } 7296 
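// The overloads below special-case shuffle constants that map to cheaper ops.
// The constant packs four 2-bit source lane indices as
// (kIdx3 << 6) | (kIdx2 << 4) | (kIdx1 << 2) | kIdx0, and output lane i of
// each 4-lane block reads source lane kIdx_i. For example, 0xE4
// (kIdx3..kIdx0 = 3,2,1,0) is the identity, 0x1B (0,1,2,3) reverses all four
// lanes (Reverse4), and 0xB1 (2,3,0,1) swaps within pairs (Reverse2).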
7297 template <class V> 7298 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x1B> /*idx_3210_tag*/, V v) { 7299 const DFromV<decltype(v)> d; 7300 return Reverse4(d, v); 7301 } 7302 7303 template <class V, 7304 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | 7305 (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))> 7306 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/, V v) { 7307 const DFromV<decltype(v)> d; 7308 const auto vw = Per4LaneBlockShufCastToWide( 7309 hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v); 7310 return BitCast(d, DupEven(vw)); 7311 } 7312 7313 template <class V, 7314 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | 7315 (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))> 7316 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) { 7317 const DFromV<decltype(v)> d; 7318 const auto vw = Per4LaneBlockShufCastToWide( 7319 hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v); 7320 const DFromV<decltype(vw)> dw; 7321 return BitCast(d, Reverse2(dw, vw)); 7322 } 7323 7324 #if HWY_MAX_BYTES >= 32 7325 template <class V, HWY_IF_T_SIZE_V(V, 8)> 7326 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) { 7327 return SwapAdjacentBlocks(v); 7328 } 7329 #endif 7330 7331 template <class V, HWY_IF_LANES_D(DFromV<V>, 4), 7332 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> 7333 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) { 7334 const DFromV<decltype(v)> d; 7335 return InterleaveLower(d, v, v); 7336 } 7337 7338 template <class V, HWY_IF_T_SIZE_V(V, 4)> 7339 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) { 7340 const DFromV<decltype(v)> d; 7341 return InterleaveLower(d, v, v); 7342 } 7343 7344 template <class V, HWY_IF_LANES_D(DFromV<V>, 4)> 7345 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, V v) { 7346 const DFromV<decltype(v)> d; 7347 return ConcatEven(d, v, v); 7348 } 7349 7350 template <class V> 7351 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xA0> /*idx_3210_tag*/, V v) { 7352 return DupEven(v); 7353 } 7354 7355 template <class V> 7356 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xB1> /*idx_3210_tag*/, V v) { 7357 const DFromV<decltype(v)> d; 7358 return Reverse2(d, v); 7359 } 7360 7361 template <class V, HWY_IF_LANES_D(DFromV<V>, 4)> 7362 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, V v) { 7363 const DFromV<decltype(v)> d; 7364 return ConcatOdd(d, v, v); 7365 } 7366 7367 template <class V> 7368 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xE4> /*idx_3210_tag*/, V v) { 7369 return v; 7370 } 7371 7372 template <class V, 7373 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | 7374 (HWY_HAVE_INTEGER64 ? 
(1 << 4) : 0))> 7375 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/, V v) { 7376 const DFromV<decltype(v)> d; 7377 const auto vw = Per4LaneBlockShufCastToWide( 7378 hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v); 7379 return BitCast(d, DupOdd(vw)); 7380 } 7381 7382 template <class V> 7383 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xF5> /*idx_3210_tag*/, V v) { 7384 return DupOdd(v); 7385 } 7386 7387 template <class V, HWY_IF_T_SIZE_V(V, 4)> 7388 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, V v) { 7389 const DFromV<decltype(v)> d; 7390 return InterleaveUpper(d, v, v); 7391 } 7392 7393 template <size_t kIdx3210, class V> 7394 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag, V v) { 7395 const DFromV<decltype(v)> d; 7396 return Per4LaneBlockShuffle(idx_3210_tag, hwy::SizeTag<sizeof(TFromV<V>)>(), 7397 hwy::SizeTag<d.MaxBytes()>(), v); 7398 } 7399 7400 } // namespace detail 7401 #endif // HWY_TARGET != HWY_SCALAR 7402 7403 template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V, 7404 HWY_IF_LANES_D(DFromV<V>, 1)> 7405 HWY_API V Per4LaneBlockShuffle(V v) { 7406 static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); 7407 static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); 7408 static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); 7409 static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); 7410 7411 return v; 7412 } 7413 7414 #if HWY_TARGET != HWY_SCALAR || HWY_IDE 7415 template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V, 7416 HWY_IF_LANES_D(DFromV<V>, 2)> 7417 HWY_API V Per4LaneBlockShuffle(V v) { 7418 static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); 7419 static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); 7420 static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); 7421 static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); 7422 7423 constexpr bool isReverse2 = (kIdx0 == 1 || kIdx1 == 0) && (kIdx0 != kIdx1); 7424 constexpr size_t kPer2BlkIdx0 = (kIdx0 <= 1) ? kIdx0 : (isReverse2 ? 1 : 0); 7425 constexpr size_t kPer2BlkIdx1 = (kIdx1 <= 1) ? kIdx1 : (isReverse2 ? 
0 : 1); 7426 7427 constexpr size_t kIdx10 = (kPer2BlkIdx1 << 1) | kPer2BlkIdx0; 7428 static_assert(kIdx10 <= 3, "kIdx10 <= 3 must be true"); 7429 return detail::Per2LaneBlockShuffle(hwy::SizeTag<kIdx10>(), v); 7430 } 7431 7432 template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V, 7433 HWY_IF_LANES_GT_D(DFromV<V>, 2)> 7434 HWY_API V Per4LaneBlockShuffle(V v) { 7435 static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); 7436 static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); 7437 static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); 7438 static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); 7439 7440 constexpr size_t kIdx3210 = 7441 (kIdx3 << 6) | (kIdx2 << 4) | (kIdx1 << 2) | kIdx0; 7442 return detail::Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210>(), v); 7443 } 7444 #endif 7445 7446 // ------------------------------ PairwiseAdd128/PairwiseSub128 7447 // (Per4LaneBlockShuffle) 7448 #if (defined(HWY_NATIVE_PAIRWISE_ADD_128) == defined(HWY_TARGET_TOGGLE)) 7449 #ifdef HWY_NATIVE_PAIRWISE_ADD_128 7450 #undef HWY_NATIVE_PAIRWISE_ADD_128 7451 #else 7452 #define HWY_NATIVE_PAIRWISE_ADD_128 7453 #endif 7454 7455 namespace detail { 7456 7457 // detail::BlockwiseConcatOddEven(d, v) returns the even lanes of each block of 7458 // v followed by the odd lanes of v 7459 #if HWY_TARGET_IS_NEON || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_RVV || \ 7460 HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX 7461 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)), 7462 HWY_IF_V_SIZE_GT_D(D, 8)> 7463 static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d, 7464 Vec<D> v) { 7465 #if HWY_TARGET == HWY_RVV 7466 const ScalableTag<uint64_t, HWY_MAX(HWY_POW2_D(D), 0)> du64; 7467 #else 7468 const Repartition<uint64_t, DFromV<decltype(v)>> du64; 7469 #endif 7470 7471 const Repartition<TFromD<decltype(d)>, decltype(du64)> d_concat; 7472 const auto v_to_concat = ResizeBitCast(d_concat, v); 7473 7474 const auto evens = ConcatEven(d, v_to_concat, v_to_concat); 7475 const auto odds = ConcatOdd(d, v_to_concat, v_to_concat); 7476 return ResizeBitCast( 7477 d, InterleaveWholeLower(BitCast(du64, evens), BitCast(du64, odds))); 7478 } 7479 7480 #else // !(HWY_TARGET_IS_NEON || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_RVV) 7481 7482 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)> 7483 static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d, 7484 Vec<D> v) { 7485 #if HWY_TARGET == HWY_SSE2 7486 const RebindToUnsigned<decltype(d)> du; 7487 const RebindToSigned<RepartitionToWide<decltype(du)>> dw; 7488 7489 const auto vu = BitCast(du, v); 7490 return BitCast( 7491 d, OrderedDemote2To(du, PromoteEvenTo(dw, vu), PromoteOddTo(dw, vu))); 7492 #else 7493 const Repartition<uint8_t, decltype(d)> du8; 7494 const auto idx = 7495 BitCast(d, Dup128VecFromValues(du8, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 7496 9, 11, 13, 15)); 7497 return TableLookupBytes(v, idx); 7498 #endif 7499 } 7500 7501 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)> 7502 static HWY_INLINE HWY_MAYBE_UNUSED Vec<D> BlockwiseConcatOddEven(D d, 7503 Vec<D> v) { 7504 #if HWY_TARGET == HWY_SSE2 7505 const RebindToSigned<decltype(d)> di; 7506 const RepartitionToWide<decltype(di)> dw; 7507 const auto vi = BitCast(di, v); 7508 return BitCast( 7509 d, OrderedDemote2To(di, PromoteEvenTo(dw, vi), PromoteOddTo(dw, vi))); 7510 #else 7511 const Repartition<uint8_t, decltype(d)> du8; 7512 const auto idx = BitCast(d, Dup128VecFromValues(du8, 0, 1, 4, 5, 8, 9, 12, 13, 7513 2, 3, 6, 7, 10, 

// ------------------------------ Blocks

// Returns the number of 16-byte blocks in a vector of type Vec<D>.
template <class D>
HWY_API size_t Blocks(D d) {
  return (d.MaxBytes() <= 16) ? 1 : ((Lanes(d) * sizeof(TFromD<D>) + 15) / 16);
}

// ------------------------------ Block insert/extract/broadcast ops
#if (defined(HWY_NATIVE_BLK_INSERT_EXTRACT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
#undef HWY_NATIVE_BLK_INSERT_EXTRACT
#else
#define HWY_NATIVE_BLK_INSERT_EXTRACT
#endif

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V InsertBlock(V /*v*/, V blk_to_insert) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  return blk_to_insert;
}

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V ExtractBlock(V v) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  return v;
}

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V BroadcastBlock(V v) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  return v;
}

#endif // HWY_NATIVE_BLK_INSERT_EXTRACT

// ------------------------------ BroadcastLane
#if (defined(HWY_NATIVE_BROADCASTLANE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BROADCASTLANE
#undef HWY_NATIVE_BROADCASTLANE
#else
#define HWY_NATIVE_BROADCASTLANE
#endif

template <int kLane, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V BroadcastLane(V v) {
  return Broadcast<kLane>(v);
}

#endif // HWY_NATIVE_BROADCASTLANE

// ------------------------------ Slide1Up and Slide1Down
#if (defined(HWY_NATIVE_SLIDE1_UP_DOWN) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
#undef HWY_NATIVE_SLIDE1_UP_DOWN
#else
#define HWY_NATIVE_SLIDE1_UP_DOWN
#endif

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> /*v*/) {
  return Zero(d);
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> /*v*/) {
  return Zero(d);
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
  return ShiftLeftLanes<1>(d, v);
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
  return ShiftRightLanes<1>(d, v);
}
#endif // HWY_TARGET != HWY_SCALAR

#endif // HWY_NATIVE_SLIDE1_UP_DOWN
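
// Example usage of Slide1Up/Slide1Down (an illustrative sketch, not part of
// the API): lanes move by one position across the whole vector, and a zero is
// shifted in at the vacated end:
//   const ScalableTag<int32_t> d;
//   const Vec<decltype(d)> v = Iota(d, 1);  // 1,2,3, ..., N
//   Slide1Up(d, v);                         // 0,1,2, ..., N-1
//   Slide1Down(d, v);                       // 2,3,4, ..., N, 0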

// ------------------------------ SlideUpBlocks

template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> SlideUpBlocks(D /*d*/, VFromD<D> v) {
  static_assert(kBlocks == 0, "kBlocks == 0 must be true");
  return v;
}

#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
                "kBlocks must be between 0 and d.MaxBlocks() - 1");
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  return SlideUpLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
}
#endif

// ------------------------------ SlideDownBlocks

template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> SlideDownBlocks(D /*d*/, VFromD<D> v) {
  static_assert(kBlocks == 0, "kBlocks == 0 must be true");
  return v;
}

#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
                "kBlocks must be between 0 and d.MaxBlocks() - 1");
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  return SlideDownLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
}
#endif

// ------------------------------ Slide mask up/down
#if (defined(HWY_NATIVE_SLIDE_MASK) == defined(HWY_TARGET_TOGGLE))

#ifdef HWY_NATIVE_SLIDE_MASK
#undef HWY_NATIVE_SLIDE_MASK
#else
#define HWY_NATIVE_SLIDE_MASK
#endif

template <class D>
HWY_API Mask<D> SlideMask1Up(D d, Mask<D> m) {
  return MaskFromVec(Slide1Up(d, VecFromMask(d, m)));
}

template <class D>
HWY_API Mask<D> SlideMask1Down(D d, Mask<D> m) {
  return MaskFromVec(Slide1Down(d, VecFromMask(d, m)));
}

template <class D>
HWY_API Mask<D> SlideMaskUpLanes(D d, Mask<D> m, size_t amt) {
  return MaskFromVec(SlideUpLanes(d, VecFromMask(d, m), amt));
}

template <class D>
HWY_API Mask<D> SlideMaskDownLanes(D d, Mask<D> m, size_t amt) {
  return MaskFromVec(SlideDownLanes(d, VecFromMask(d, m), amt));
}

#endif // HWY_NATIVE_SLIDE_MASK
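
// Example usage of the mask-slide ops (an illustrative sketch, not part of
// the API): the mask is materialized as a vector, slid, then converted back:
//   const ScalableTag<uint8_t> d;
//   const Mask<decltype(d)> m = FirstN(d, 3);  // 1,1,1,0,0, ...
//   SlideMask1Up(d, m);                        // 0,1,1,1,0, ...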

// ------------------------------ SumsOfAdjQuadAbsDiff

#if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
  static_assert(0 <= kAOffset && kAOffset <= 1,
                "kAOffset must be between 0 and 1");
  static_assert(0 <= kBOffset && kBOffset <= 3,
                "kBOffset must be between 0 and 3");
  using D8 = DFromV<V8>;
  const D8 d8;
  const RebindToUnsigned<decltype(d8)> du8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(du8)> du16;

  // Ensure that a is resized to a vector that has at least
  // HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the interleave and
  // CombineShiftRightBytes operations below.
#if HWY_TARGET == HWY_RVV
  // On RVV targets, need to ensure that d8_interleave.Pow2() >= 0 is true
  // to ensure that Lanes(d8_interleave) >= 16 is true.

  // Lanes(d8_interleave) >= Lanes(d8) is guaranteed to be true on RVV
  // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
  constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
  const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
#elif HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
  // On SVE targets, Lanes(d8_interleave) >= 16 and
  // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
  // tag for a full u8/i8 vector on SVE.
  const D8 d8_interleave;
#else
  // On targets that use non-scalable vector types, Lanes(d8_interleave) is
  // equal to HWY_MAX(Lanes(d8), size_t{8} << kAOffset).
  constexpr size_t kInterleaveLanes =
      HWY_MAX(HWY_MAX_LANES_D(D8), size_t{8} << kAOffset);
  const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave;
#endif

  // The ResizeBitCast operation below resizes a to a vector that has at
  // least HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the
  // InterleaveLower, InterleaveUpper, and CombineShiftRightBytes operations
  // below.
  const auto a_to_interleave = ResizeBitCast(d8_interleave, a);

  const auto a_interleaved_lo =
      InterleaveLower(d8_interleave, a_to_interleave, a_to_interleave);
  const auto a_interleaved_hi =
      InterleaveUpper(d8_interleave, a_to_interleave, a_to_interleave);

  /* a01: { a[kAOffset*4+0], a[kAOffset*4+1], a[kAOffset*4+1], a[kAOffset*4+2],
            a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8] }
   */
  /* a23: { a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8],
            a[kAOffset*4+8], a[kAOffset*4+9], a[kAOffset*4+9], a[kAOffset*4+10]
   } */

  // a01 and a23 are resized back to V8 as only the first Lanes(d8) lanes of
  // the CombineShiftRightBytes results are needed for the subsequent AbsDiff
  // operations, and as a01 and a23 need to be the same vector type as b01 and
  // b23 for the AbsDiff operations below.
  const V8 a01 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 1>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));
  const V8 a23 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 5>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));

  /* b01: { b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1] }
   */
  /* b23: { b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3] }
   */
  const V8 b01 = BitCast(d8, Broadcast<kBOffset * 2>(BitCast(d16, b)));
  const V8 b23 = BitCast(d8, Broadcast<kBOffset * 2 + 1>(BitCast(d16, b)));

  const VFromD<decltype(du16)> absdiff_sum_01 =
      SumsOf2(BitCast(du8, AbsDiff(a01, b01)));
  const VFromD<decltype(du16)> absdiff_sum_23 =
      SumsOf2(BitCast(du8, AbsDiff(a23, b23)));
  return BitCast(d16, Add(absdiff_sum_01, absdiff_sum_23));
}
#endif // HWY_TARGET != HWY_SCALAR

#endif // HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
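
// Example usage of SumsOfAdjQuadAbsDiff (an illustrative sketch, not part of
// the API): like the x86 MPSADBW instruction, 16-bit result lane i is the
// four-byte sum of absolute differences
//   |a[kAOffset*4+i+0] - b[kBOffset*4+0]| + ... +
//   |a[kAOffset*4+i+3] - b[kBOffset*4+3]|,
// e.g. for a motion-estimation style search with kAOffset = kBOffset = 0:
//   const auto sad = SumsOfAdjQuadAbsDiff<0, 0>(a, b);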

// ------------------------------ SumsOfShuffledQuadAbsDiff

#if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
          HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
                                                                     V8 b) {
  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");

#if HWY_TARGET == HWY_RVV
  // On RVV, ensure that both vA and vB have an LMUL of at least 1/2 so that
  // both vA and vB can be bitcast to a u32 vector.
  const detail::AdjustSimdTagToMinVecPow2<
      RepartitionToWideX2<DFromV<decltype(a)>>>
      d32;
  const RepartitionToNarrow<decltype(d32)> d16;
  const RepartitionToNarrow<decltype(d16)> d8;

  const auto vA = ResizeBitCast(d8, a);
  const auto vB = ResizeBitCast(d8, b);
#else
  const DFromV<decltype(a)> d8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(d16)> d32;

  const auto vA = a;
  const auto vB = b;
#endif

  const RebindToUnsigned<decltype(d8)> du8;

  const auto a_shuf =
      Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(BitCast(d32, vA));
  /* a0123_2345: { a_shuf[0], a_shuf[1], a_shuf[2], a_shuf[3],
                   a_shuf[2], a_shuf[3], a_shuf[4], a_shuf[5],
                   a_shuf[8], a_shuf[9], a_shuf[10], a_shuf[11],
                   a_shuf[10], a_shuf[11], a_shuf[12], a_shuf[13] } */
  /* a1234_3456: { a_shuf[1], a_shuf[2], a_shuf[3], a_shuf[4],
                   a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
                   a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
                   a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
  // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
  // ShiftLeftBytes/ShiftRightBytes to avoid unnecessarily zeroing out lanes
  // that cross into an adjacent 16-byte block; any lanes that Slide1Up or
  // Slide1Down moves into an adjacent block are replaced by the OddEven
  // operation.
  const auto a_0123_2345 = BitCast(
      d8, OddEven(BitCast(d32, Slide1Up(d16, BitCast(d16, a_shuf))), a_shuf));
  const auto a_1234_3456 =
      BitCast(d8, OddEven(BitCast(d32, Slide1Up(d8, BitCast(d8, a_shuf))),
                          BitCast(d32, Slide1Down(d8, BitCast(d8, a_shuf)))));
#else
  const auto a_0123_2345 =
      BitCast(d8, OddEven(ShiftLeftBytes<2>(d32, a_shuf), a_shuf));
  const auto a_1234_3456 = BitCast(
      d8,
      OddEven(ShiftLeftBytes<1>(d32, a_shuf), ShiftRightBytes<1>(d32, a_shuf)));
#endif

  auto even_sums = SumsOf4(BitCast(du8, AbsDiff(a_0123_2345, vB)));
  auto odd_sums = SumsOf4(BitCast(du8, AbsDiff(a_1234_3456, vB)));

#if HWY_IS_LITTLE_ENDIAN
  odd_sums = ShiftLeft<16>(odd_sums);
#else
  even_sums = ShiftLeft<16>(even_sums);
#endif

  const auto sums = OddEven(BitCast(d16, odd_sums), BitCast(d16, even_sums));

#if HWY_TARGET == HWY_RVV
  return ResizeBitCast(RepartitionToWide<DFromV<V8>>(), sums);
#else
  return sums;
#endif
}
#endif // HWY_TARGET != HWY_SCALAR

#endif // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
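
// Example usage of SumsOfShuffledQuadAbsDiff (an illustrative sketch, not
// part of the API; this roughly corresponds to the AVX-512 VDBPSADBW family,
// except that here the shuffle applies to the first operand): a is first
// shuffled per 32-bit lane by <kIdx3, kIdx2, kIdx1, kIdx0>, then 16-bit sums
// of absolute differences of byte quads are formed against b:
//   const auto sad = SumsOfShuffledQuadAbsDiff<3, 2, 1, 0>(a, b);  // identity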

// ------------------------------ BitShuffle (Rol)
#if (defined(HWY_NATIVE_BITSHUFFLE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BITSHUFFLE
#undef HWY_NATIVE_BITSHUFFLE
#else
#define HWY_NATIVE_BITSHUFFLE
#endif

#if HWY_HAVE_INTEGER64 && HWY_TARGET != HWY_SCALAR
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>)>
HWY_API V BitShuffle(V v, VI idx) {
  const DFromV<decltype(v)> d64;
  const RebindToUnsigned<decltype(d64)> du64;
  const Repartition<uint8_t, decltype(d64)> du8;

#if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
    HWY_TARGET == HWY_WASM_EMU256
  const Repartition<uint16_t, decltype(d64)> d_idx_shr;
#else
  const Repartition<uint8_t, decltype(d64)> d_idx_shr;
#endif

#if HWY_IS_LITTLE_ENDIAN
  constexpr uint64_t kExtractedBitsMask =
      static_cast<uint64_t>(0x8040201008040201u);
#else
  constexpr uint64_t kExtractedBitsMask =
      static_cast<uint64_t>(0x0102040810204080u);
#endif

  const auto k7 = Set(du8, uint8_t{0x07});

  auto unmasked_byte_idx = BitCast(du8, ShiftRight<3>(BitCast(d_idx_shr, idx)));
#if HWY_IS_BIG_ENDIAN
  // Need to invert the lower 3 bits of unmasked_byte_idx[i] on big-endian
  // targets.
  unmasked_byte_idx = Xor(unmasked_byte_idx, k7);
#endif // HWY_IS_BIG_ENDIAN

  const auto byte_idx = BitwiseIfThenElse(
      k7, unmasked_byte_idx,
      BitCast(du8, Dup128VecFromValues(du64, uint64_t{0},
                                       uint64_t{0x0808080808080808u})));
  // We want to shift right by idx & 7 to extract the desired bit in `bytes`,
  // and left by iota & 7 to put it in the correct output bit. To correctly
  // handle shift counts from -7 to 7, we rotate.
  const auto rotate_left_bits = Sub(Iota(du8, uint8_t{0}), BitCast(du8, idx));

  const auto extracted_bits =
      And(Rol(TableLookupBytes(v, byte_idx), rotate_left_bits),
          BitCast(du8, Set(du64, kExtractedBitsMask)));
  // Combine the bit-sliced result (one bit per byte) into one 64-bit sum.
  return BitCast(d64, SumsOf8(extracted_bits));
}
#endif // HWY_HAVE_INTEGER64 && HWY_TARGET != HWY_SCALAR

#endif // HWY_NATIVE_BITSHUFFLE
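
// Example usage of BitShuffle (an illustrative sketch, not part of the API):
// bit j of the lower 8 result bits of each u64 lane is bit idx[8*lane + j] of
// the corresponding input lane; the upper 56 result bits are zero:
//   const ScalableTag<uint64_t> d64;
//   const Repartition<uint8_t, decltype(d64)> du8;
//   const auto v = Set(d64, uint64_t{0x8000000000000001u});
//   const auto idx = Dup128VecFromValues(du8, 0, 63, 1, 2, 3, 4, 5, 6,
//                                        0, 63, 1, 2, 3, 4, 5, 6);
//   // BitShuffle(v, idx) == Set(d64, uint64_t{3}): bit 0 of v is set,
//   // bit 63 of v is set, and bits 1..6 of v are clear.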
template <class V, class M>
HWY_API V MaskedOr(M m, V a, V b) {
  return IfThenElseZero(m, Or(a, b));
}

// ------------------------------ AllBits1/AllBits0
#if (defined(HWY_NATIVE_ALLONES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ALLONES
#undef HWY_NATIVE_ALLONES
#else
#define HWY_NATIVE_ALLONES
#endif

template <class D, class V = VFromD<D>>
HWY_API bool AllBits1(D d, V v) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  return AllTrue(du, Eq(BitCast(du, v), Set(du, hwy::HighestValue<TU>())));
}
#endif // HWY_NATIVE_ALLONES

#if (defined(HWY_NATIVE_ALLZEROS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ALLZEROS
#undef HWY_NATIVE_ALLZEROS
#else
#define HWY_NATIVE_ALLZEROS
#endif

template <class D, class V = VFromD<D>>
HWY_API bool AllBits0(D d, V v) {
  return AllTrue(d, Eq(v, Zero(d)));
}
#endif // HWY_NATIVE_ALLZEROS

// ------------------------------ MultiRotateRight
#if (defined(HWY_NATIVE_MULTIROTATERIGHT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MULTIROTATERIGHT
#undef HWY_NATIVE_MULTIROTATERIGHT
#else
#define HWY_NATIVE_MULTIROTATERIGHT
#endif

template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
          class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
          HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2)),
          HWY_IF_V_SIZE_V(V, 8)>
HWY_API V MultiRotateRight(V v, VI idx) {
  const DFromV<V> d64;
  const Twice<decltype(d64)> dt64;
  const Repartition<uint8_t, decltype(d64)> du8;
  const Repartition<uint8_t, decltype(dt64)> dt_u8;
  const Repartition<uint16_t, decltype(dt64)> dt_u16;
  const auto k7 = Set(du8, uint8_t{0x07});
  const auto k63 = Set(du8, uint8_t{0x3F});

  const auto masked_idx = And(k63, BitCast(du8, idx));

  auto byte_idx = ShiftRight<3>(masked_idx);
#if HWY_IS_LITTLE_ENDIAN
  const auto hi_byte_idx = Add(byte_idx, Set(du8, uint8_t{1}));
#else
  byte_idx = Xor(byte_idx, k7);
  const auto hi_byte_idx = Add(byte_idx, k7);
#endif

  const auto idx_shift = And(k7, masked_idx);

  // Calculate even lanes.
  const auto even_src = DupEven(ResizeBitCast(dt64, v));
  // Expand indexes to pull out 16-bit segments of idx and idx + 1.
#if HWY_IS_LITTLE_ENDIAN
  const auto even_idx = InterleaveLower(ResizeBitCast(dt_u8, byte_idx),
                                        ResizeBitCast(dt_u8, hi_byte_idx));
#else
  const auto even_idx = InterleaveLower(ResizeBitCast(dt_u8, hi_byte_idx),
                                        ResizeBitCast(dt_u8, byte_idx));
#endif
  // TableLookupBytes indexes select from within a 16-byte block.
  const auto even_segments = TableLookupBytes(even_src, even_idx);
  // Extract unaligned bytes from the 16-bit segments.
  const auto even_idx_shift = PromoteTo(dt_u16, idx_shift);
  const auto extracted_even_bytes =
      Shr(BitCast(dt_u16, even_segments), even_idx_shift);

  // Extract the even bytes of each 128-bit block and pack them into the
  // lower 64 bits.
#if HWY_IS_LITTLE_ENDIAN
  const auto even_lanes = BitCast(
      dt64,
      ConcatEven(dt_u8, Zero(dt_u8), BitCast(dt_u8, extracted_even_bytes)));
#else
  const auto even_lanes = BitCast(
      dt64,
      ConcatOdd(dt_u8, Zero(dt_u8), BitCast(dt_u8, extracted_even_bytes)));
#endif

  return LowerHalf(d64, even_lanes);
}
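
// As above, but for vectors wider than 8 bytes: the even and odd u64 lanes
// are processed separately (extracting each result byte widens it to a 16-bit
// segment, doubling the data), then re-interleaved at 64-bit granularity.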
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
          class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
          HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2)),
          HWY_IF_V_SIZE_GT_V(V, 8)>
HWY_API V MultiRotateRight(V v, VI idx) {
  const DFromV<V> d64;
  const Repartition<uint8_t, decltype(d64)> du8;
  const Repartition<uint16_t, decltype(d64)> du16;
  const auto k7 = Set(du8, uint8_t{0x07});
  const auto k63 = Set(du8, uint8_t{0x3F});

  const auto masked_idx = And(k63, BitCast(du8, idx));

  auto byte_idx = ShiftRight<3>(masked_idx);
#if HWY_IS_LITTLE_ENDIAN
  const auto hi_byte_idx = Add(byte_idx, Set(du8, uint8_t{1}));
#else
  byte_idx = Xor(byte_idx, k7);
  const auto hi_byte_idx = Add(byte_idx, k7);
#endif

  const auto idx_shift = And(k7, masked_idx);

  // Calculate even lanes.
  const auto even_src = DupEven(v);
  // Expand indexes to pull out 16-bit segments of idx and idx + 1.
#if HWY_IS_LITTLE_ENDIAN
  const auto even_idx = InterleaveLower(byte_idx, hi_byte_idx);
#else
  const auto even_idx = InterleaveLower(hi_byte_idx, byte_idx);
#endif
  // TableLookupBytes indexes select from within a 16-byte block.
  const auto even_segments = TableLookupBytes(even_src, even_idx);
  // Extract unaligned bytes from the 16-bit segments.
#if HWY_IS_LITTLE_ENDIAN
  const auto even_idx_shift = ZipLower(idx_shift, Zero(du8));
#else
  const auto even_idx_shift = ZipLower(Zero(du8), idx_shift);
#endif
  const auto extracted_even_bytes =
      Shr(BitCast(du16, even_segments), even_idx_shift);

  // Calculate odd lanes.
  const auto odd_src = DupOdd(v);
  // Expand indexes to pull out 16-bit segments of idx and idx + 1.
#if HWY_IS_LITTLE_ENDIAN
  const auto odd_idx = InterleaveUpper(du8, byte_idx, hi_byte_idx);
#else
  const auto odd_idx = InterleaveUpper(du8, hi_byte_idx, byte_idx);
#endif
  // TableLookupBytes indexes select from within a 16-byte block.
  const auto odd_segments = TableLookupBytes(odd_src, odd_idx);
  // Extract unaligned bytes from the 16-bit segments.
#if HWY_IS_LITTLE_ENDIAN
  const auto odd_idx_shift = ZipUpper(du16, idx_shift, Zero(du8));
#else
  const auto odd_idx_shift = ZipUpper(du16, Zero(du8), idx_shift);
#endif
  const auto extracted_odd_bytes =
      Shr(BitCast(du16, odd_segments), odd_idx_shift);

  // Extract the even bytes of each 128-bit block and pack them into the
  // lower 64 bits.
#if HWY_IS_LITTLE_ENDIAN
  const auto even_lanes = BitCast(
      d64, ConcatEven(du8, Zero(du8), BitCast(du8, extracted_even_bytes)));
  const auto odd_lanes = BitCast(
      d64, ConcatEven(du8, Zero(du8), BitCast(du8, extracted_odd_bytes)));
#else
  const auto even_lanes = BitCast(
      d64, ConcatOdd(du8, Zero(du8), BitCast(du8, extracted_even_bytes)));
  const auto odd_lanes = BitCast(
      d64, ConcatOdd(du8, Zero(du8), BitCast(du8, extracted_odd_bytes)));
#endif
  // Interleave at the 64-bit level.
  return InterleaveWholeLower(even_lanes, odd_lanes);
}

#if HWY_TARGET == HWY_RVV

// MultiRotateRight for the LMUL=1/2 case on RVV.
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
          class VI_2 = VFromD<Repartition<TFromV<VI>, DFromV<V>>>,
          HWY_IF_POW2_LE_D(DFromV<V>, 0),
          HWY_IF_LANES_D(DFromV<VI>, HWY_MAX_LANES_V(VI_2) / 2)>
HWY_API V MultiRotateRight(V v, VI idx) {
  return MultiRotateRight(v, ResizeBitCast(Twice<DFromV<VI>>(), idx));
}

#endif // HWY_TARGET == HWY_RVV

#endif // HWY_NATIVE_MULTIROTATERIGHT
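
// Example usage of MultiRotateRight (an illustrative sketch, not part of the
// API): result byte j of each u64 lane is the 8-bit field of that lane that
// starts at bit idx[8*lane + j] (mod 64, wrapping), as in the AVX-512 VBMI
// VPMULTISHIFTQB instruction:
//   const FixedTag<uint64_t, 2> d64;
//   const Repartition<uint8_t, decltype(d64)> du8;
//   const auto v = Set(d64, uint64_t{0x0123456789ABCDEFu});
//   const auto idx = Dup128VecFromValues(du8, 0, 8, 16, 24, 32, 40, 48, 56,
//                                        0, 8, 16, 24, 32, 40, 48, 56);
//   // MultiRotateRight(v, idx) == v: each byte is extracted at its own
//   // bit offset.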

// ================================================== Operator wrapper

// SVE* and RVV currently cannot define operators and have already defined
// (only) the corresponding functions such as Add.
#if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
#else
#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#endif

template <class V>
HWY_API V Add(V a, V b) {
  return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
  return a - b;
}

template <class V>
HWY_API V Mul(V a, V b) {
  return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
  return a / b;
}
template <class V>
HWY_API V Mod(V a, V b) {
  return a % b;
}

template <class V>
V Shl(V a, V b) {
  return a << b;
}
template <class V>
V Shr(V a, V b) {
  return a >> b;
}

template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  return a < b;
}

template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  return a >= b;
}

template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  return a <= b;
}

#endif // HWY_NATIVE_OPERATOR_REPLACEMENTS

#undef HWY_GENERIC_IF_EMULATED_D

// TODO: remove once callers are updated.
// SVE and RVV do not support DFromM because their masks are loosely typed.
#if HWY_MAX_BYTES <= 64 && !HWY_TARGET_IS_SVE && HWY_TARGET != HWY_RVV
namespace detail {
template <class M>
uint64_t BitsFromMask(M m) {
  const DFromM<M> d;
  return ::hwy::HWY_NAMESPACE::BitsFromMask(d, m);
}
} // namespace detail
#endif // HWY_MAX_BYTES <= 64 && !HWY_TARGET_IS_SVE && HWY_TARGET != HWY_RVV

// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();