emu128-inl.h (90027B)
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Single-element vectors and operations.
// External include guard in highway.h - see comment there.

#include "hwy/base.h"

#ifndef HWY_NO_LIBCXX
#include <math.h>  // sqrtf
#endif

#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Descriptor tag for a full 128-bit vector of T, i.e. 16 / sizeof(T) lanes.
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

// (Wrapper class required for overloading comparison operators.)
// Emulated vector: N lanes of T, backed by a plain array. Arithmetic is
// performed lane-by-lane in scalar code.
template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  HWY_INLINE Vec128() = default;
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;

  // Compound assignment forwards to the free binary operators defined below.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h
  // relies on this for LoadInterleaved*. CAVEAT: this method of padding
  // prevents using range for, especially in SumOfLanes, where it would be
  // incorrect. Moving padding to another field would require handling the case
  // where N = 16 / sizeof(T) (i.e. there is no padding), which is also awkward.
  T raw[16 / sizeof(T)] = {};
};

// 0 or FF..FF, same size as Vec128.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
  using Raw = hwy::MakeUnsigned<T>;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM

  // All-ones for true, all-zeros for false.
  static HWY_INLINE Raw FromBool(bool b) {
    return b ? static_cast<Raw>(~Raw{0}) : 0;
  }

  // Must match the size of Vec128.
  Raw bits[16 / sizeof(T)] = {};
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ Zero

// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) {
  Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> v;  // zero-initialized
  return v;
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ BitCast

// Reinterprets the bytes of v as VFromD<D>; sizes must match.
template <class D, class VFrom>
HWY_API VFromD<D> BitCast(D /* tag */, VFrom v) {
  VFromD<D> to;
  CopySameSize(&v.raw, &to.raw);
  return to;
}

// ------------------------------ ResizeBitCast

template <class D, class VFrom>
HWY_API VFromD<D> ResizeBitCast(D d, VFrom v) {
  using DFrom = DFromV<VFrom>;
  using TFrom = TFromD<DFrom>;
  using TTo = TFromD<D>;

  constexpr size_t kFromByteLen = sizeof(TFrom) * HWY_MAX_LANES_D(DFrom);
  constexpr size_t kToByteLen = sizeof(TTo) * HWY_MAX_LANES_D(D);
  constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen);

  // Zero-fill first so that growing casts are zero-extended.
  VFromD<D> to = Zero(d);
  CopyBytes<kCopyByteLen>(&v.raw, &to.raw);
  return to;
}

namespace detail {

// ResizeBitCast on the HWY_EMU128 target has zero-extending semantics if
// VFromD<DTo> is a larger vector than FromV
template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */,
                                               ToSizeTag /* to_size_tag */,
                                               DTo d_to, DFrom /* d_from */,
                                               VFromD<DFrom> v) {
  return ResizeBitCast(d_to, v);
}

}  // namespace detail

// ------------------------------ Set

// Broadcasts t (converted to the lane type) into every lane.
template <class D, typename T2>
HWY_API VFromD<D> Set(D d, const T2 t) {
  VFromD<D> v;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    v.raw[i] = ConvertScalarTo<TFromD<D>>(t);
  }
  return v;
}

// ------------------------------ Undefined
template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}

// ------------------------------ Dup128VecFromValues

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  VFromD<D> result;
  result.raw[0] = t0;
  result.raw[1] = t1;
  result.raw[2] = t2;
  result.raw[3] = t3;
  result.raw[4] = t4;
  result.raw[5] = t5;
  result.raw[6] = t6;
  result.raw[7] = t7;
  result.raw[8] = t8;
  result.raw[9] = t9;
  result.raw[10] = t10;
  result.raw[11] = t11;
  result.raw[12] = t12;
  result.raw[13] = t13;
  result.raw[14] = t14;
  result.raw[15] = t15;
  return result;
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  VFromD<D> result;
  result.raw[0] = t0;
  result.raw[1] = t1;
  result.raw[2] = t2;
  result.raw[3] = t3;
  result.raw[4] = t4;
  result.raw[5] = t5;
  result.raw[6] = t6;
  result.raw[7] = t7;
  return result;
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  VFromD<D> result;
  result.raw[0] = t0;
  result.raw[1] = t1;
  result.raw[2] = t2;
  result.raw[3] = t3;
  return result;
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  VFromD<D> result;
  result.raw[0] = t0;
  result.raw[1] = t1;
  return result;
}

// ------------------------------ Iota

// Lane i is first + i (with wraparound for integer types).
template <class D, typename T = TFromD<D>, typename T2>
HWY_API VFromD<D> Iota(D d, T2 first) {
  VFromD<D> v;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    v.raw[i] = AddWithWraparound(static_cast<T>(first), i);
  }
  return v;
}

// ================================================== LOGICAL

// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  // Bitwise ops are performed on the unsigned representation.
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  VFromD<decltype(du)> vu = BitCast(du, v);
  for (size_t i = 0; i < N; ++i) {
    vu.raw[i] = static_cast<TU>(~vu.raw[i]);
  }
  return BitCast(d, vu);
}

// ------------------------------ And
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] &= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
  return And(a, b);
}

// ------------------------------ AndNot
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> a, Vec128<T, N> b) {
  return And(Not(a), b);
}

// ------------------------------ Or
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] |= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
  return Or(a, b);
}

// ------------------------------ Xor
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  auto au = BitCast(du, a);
  auto bu = BitCast(du, b);
  for (size_t i = 0; i < N; ++i) {
    au.raw[i] ^= bu.raw[i];
  }
  return BitCast(d, au);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ Xor3
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse
// Bit-select: for each bit, mask ? yes : no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

// ------------------------------ CopySign
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}

// ------------------------------ CopySignToAbs
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(abs)> d;
  // abs is known non-negative, so OR-ing in the sign bit suffices.
  return OrAnd(abs, SignBit(d), sign);
}

// ------------------------------ BroadcastSignBit
template <typename T, size_t N>
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
  // Arithmetic shift by (bits - 1) replicates the sign bit into every bit.
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = ScalarShr(v.raw[i], sizeof(T) * 8 - 1);
  }
  return v;
}

// ------------------------------ Mask

// v must be 0 or FF..FF.
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
  Mask128<T, N> mask;
  CopySameSize(&v.raw, &mask.bits);
  return mask;
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

template <class DTo, class MFrom>
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, MFrom mask) {
  MFromD<DTo> to;
  CopySameSize(&mask.bits, &to.bits);
  return to;
}

template <class D>
VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) {
  VFromD<D> v;
  CopySameSize(&mask.bits, &v.raw);
  return v;
}

// Packs the mask into one bit per lane (bit i = lane i).
template <class D>
uint64_t BitsFromMask(D d, MFromD<D> mask) {
  uint64_t bits = 0;
  for (size_t i = 0; i < Lanes(d); ++i) {
    bits |= mask.bits[i] ? (1ull << i) : 0;
  }
  return bits;
}

// Mask with the first n lanes true, the rest false.
template <class D>
HWY_API MFromD<D> FirstN(D d, size_t n) {
  MFromD<D> m;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    m.bits[i] = MFromD<D>::FromBool(i < n);
  }
  return m;
}

// Returns mask ? yes : no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const DFromV<decltype(yes)> d;
  return IfVecThenElse(VecFromMask(d, mask), yes, no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  const DFromV<decltype(yes)> d;
  return IfVecThenElse(VecFromMask(d, mask), yes, Zero(d));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  const DFromV<decltype(no)> d;
  return IfVecThenElse(VecFromMask(d, mask), Zero(d), no);
}

// Per lane: v < 0 ? yes : no (sign bit decides, including for float via the
// signed reinterpretation).
template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const auto vi = BitCast(di, v);

  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = vi.raw[i] < 0 ? yes.raw[i] : no.raw[i];
  }
  return v;
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// True where neither a nor b is true: !a & !b.
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ================================================== SHIFTS

// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  // Shift in the unsigned domain to avoid UB on signed overflow.
  using TU = hwy::MakeUnsigned<T>;
  for (size_t i = 0; i < N; ++i) {
    const TU raw_u = static_cast<TU>(v.raw[i]);
    const auto shifted = raw_u << kBits;  // separate line to avoid MSVC warning
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = ScalarShr(v.raw[i], kBits);
  }

  return v;
}

// ------------------------------ RotateRight (ShiftRight)
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;

  // HWY_MIN clamps the (unreachable for kBits == 0) shift count so that the
  // ShiftLeft instantiation remains valid.
  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}

// ------------------------------ ShiftLeftSame

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << bits;
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

template <typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = ScalarShr(v.raw[i], bits);
  }

  return v;
}

// ------------------------------ Shl

// Per-lane variable shift left.
template <typename T, size_t N>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
  for (size_t i = 0; i < N; ++i) {
    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i])
                         << bits.raw[i];
    v.raw[i] = static_cast<T>(shifted);
  }
  return v;
}

// Per-lane variable shift right (arithmetic for signed T).
template <typename T, size_t N>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = ScalarShr(v.raw[i], static_cast<int>(bits.raw[i]));
  }

  return v;
}

// ================================================== ARITHMETIC

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

// Integer add with two's-complement wraparound; performed in uint64_t to
// avoid signed-overflow UB, then masked back to the lane width.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Add(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Sub(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
    a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Add(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] += b.raw[i];
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Sub(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] -= b.raw[i];
  }
  return a;
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Sub(hwy::IsFloatTag<T>(), a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Add(hwy::IsFloatTag<T>(), a, b);
}

// ------------------------------ SumsOf8

// Sums groups of 8 consecutive lanes into one wide lane.
template <size_t N>
HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(Vec128<uint8_t, N> v) {
  Vec128<uint64_t, (N + 7) / 8> sums;
  for (size_t i = 0; i < N; ++i) {
    sums.raw[i / 8] += v.raw[i];
  }
  return sums;
}

template <size_t N>
HWY_API Vec128<int64_t, (N + 7) / 8> SumsOf8(Vec128<int8_t, N> v) {
  Vec128<int64_t, (N + 7) / 8> sums;
  for (size_t i = 0; i < N; ++i) {
    sums.raw[i / 8] += v.raw[i];
  }
  return sums;
}

// ------------------------------ SaturatedAdd
// Computed in a signed wider type, then clamped to [LowestValue, HighestValue].
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  using TW = MakeSigned<MakeWide<T>>;
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(HWY_MIN(
        HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) + b.raw[i]),
        hwy::HighestValue<T>()));
  }
  return a;
}

// ------------------------------ SaturatedSub
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  using TW = MakeSigned<MakeWide<T>>;
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(HWY_MIN(
        HWY_MAX(hwy::LowestValue<T>(), static_cast<TW>(a.raw[i]) - b.raw[i]),
        hwy::HighestValue<T>()));
  }
  return a;
}

// ------------------------------ AverageRound

#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
#undef HWY_NATIVE_AVERAGE_ROUND_UI32
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI32
#endif

#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
#undef HWY_NATIVE_AVERAGE_ROUND_UI64
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI64
#endif

// (a + b + 1) / 2 without overflow, via the identity
// avg = (a | b) - ((a ^ b) >> 1).
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    const T a_val = a.raw[i];
    const T b_val = b.raw[i];
    a.raw[i] = static_cast<T>((a_val | b_val) - ScalarShr(a_val ^ b_val, 1));
  }
  return a;
}

// ------------------------------ Abs

template <typename T, size_t N>
HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = ScalarAbs(a.raw[i]);
  }
  return a;
}

// ------------------------------ Min/Max

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Min(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
  }
  return a;
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Max(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
  }
  return a;
}

// Float Min/Max return the non-NaN operand when exactly one input is NaN.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (ScalarIsNaN(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (ScalarIsNaN(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
    }
  }
  return a;
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    if (ScalarIsNaN(a.raw[i])) {
      a.raw[i] = b.raw[i];
    } else if (ScalarIsNaN(b.raw[i])) {
      // no change
    } else {
      a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
    }
  }
  return a;
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Min(hwy::IsFloatTag<T>(), a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Max(hwy::IsFloatTag<T>(), a, b);
}

// ------------------------------ Neg

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return Zero(d) - v;
}

// Float negation just flips the sign bit (also correct for NaN payloads).
template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return Xor(v, SignBit(d));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(hwy::SpecialTag /*tag*/, Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return Xor(v, SignBit(d));
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
  return detail::Neg(hwy::IsFloatTag<T>(), v);
}

// ------------------------------ Mul/Div

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(hwy::FloatTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] *= b.raw[i];
  }
  return a;
}

// Integer multiply in uint64_t to avoid signed-overflow UB; truncates to T.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(SignedTag /*tag*/, Vec128<T, N> a, Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
                              static_cast<uint64_t>(b.raw[i]));
  }
  return a;
}

template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Mul(UnsignedTag /*tag*/, Vec128<T, N> a,
                            Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) *
                              static_cast<uint64_t>(b.raw[i]));
  }
  return a;
}

}  // namespace detail

// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif

template <typename T, size_t N>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
  return detail::Mul(hwy::TypeTag<T>(), a, b);
}

// Division by zero yields 0 (arbitrary, avoids ubsan errors); callers must
// not rely on a particular value.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i];
  }
  return a;
}

// Returns the upper sizeof(T)*8 bits of a * b in each lane.
template <class T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
  using TW = MakeWide<T>;
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<T>(
        (static_cast<TW>(a.raw[i]) * static_cast<TW>(b.raw[i])) >>
        (sizeof(T) * 8));
  }
  return a;
}

// 64-bit lanes have no wider type; use Mul128 (128-bit product helper).
template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
  T hi;
  Mul128(GetLane(a), GetLane(b), &hi);
  return Set(Full64<T>(), hi);
}

template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
  T hi_0;
  T hi_1;

  Mul128(GetLane(a), GetLane(b), &hi_0);
  Mul128(ExtractLane(a, 1), ExtractLane(b, 1), &hi_1);

  return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
}

// Rounding Q1.15 multiply: (a * b + 2^14) >> 15 per lane.
template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    a.raw[i] = static_cast<int16_t>((a.raw[i] * b.raw[i] + 16384) >> 15);
  }
  return a;
}

// Multiplies even lanes (0, 2, ..) and returns the double-wide result.
898 template <class T, size_t N, 899 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)), 900 HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 901 HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a, 902 Vec128<T, N> b) { 903 using TW = MakeWide<T>; 904 Vec128<TW, (N + 1) / 2> mul; 905 for (size_t i = 0; i < N; i += 2) { 906 const TW a_wide = a.raw[i]; 907 mul.raw[i / 2] = static_cast<TW>(a_wide * b.raw[i]); 908 } 909 return mul; 910 } 911 912 // Multiplies odd lanes (1, 3, ..) and returns the double-wide result. 913 template <class T, size_t N, 914 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)), 915 HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 916 HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a, 917 Vec128<T, N> b) { 918 using TW = MakeWide<T>; 919 Vec128<TW, (N + 1) / 2> mul; 920 for (size_t i = 0; i < N; i += 2) { 921 const TW a_wide = a.raw[i + 1]; 922 mul.raw[i / 2] = static_cast<TW>(a_wide * b.raw[i + 1]); 923 } 924 return mul; 925 } 926 927 template <size_t N> 928 HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) { 929 for (size_t i = 0; i < N; ++i) { 930 // Zero inputs are allowed, but callers are responsible for replacing the 931 // return value with something else (typically using IfThenElse). This check 932 // avoids a ubsan error. The result is arbitrary. 933 v.raw[i] = (ScalarAbs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i]; 934 } 935 return v; 936 } 937 938 // generic_ops takes care of integer T. 
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants
// Emulated as separate multiply and add (i.e. not fused).

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return mul * x + add;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  return add - mul * x;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> sub) {
  return mul * x - sub;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> sub) {
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

// Classic "fast inverse square root" bit trick plus one Newton-Raphson step.
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
  for (size_t i = 0; i < N; ++i) {
    const float half = v.raw[i] * 0.5f;
    // Initial guess based on log2(f)
    v.raw[i] = BitCastScalar<float>(static_cast<uint32_t>(
        0x5F3759DF - (BitCastScalar<uint32_t>(v.raw[i]) >> 1)));
    // One Newton-Raphson iteration
    v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
  }
  return v;
}

namespace detail {

static HWY_INLINE float ScalarSqrt(float v) {
#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  return __builtin_sqrt(v);
#else
  uint32_t bits = BitCastScalar<uint32_t>(v);
  // Coarse approximation, letting the exponent LSB leak into the mantissa
  bits = (1 << 29) + (bits >> 1) - (1 << 22);
  return BitCastScalar<float>(bits);
#endif  // !HWY_COMPILER_GCC_ACTUAL
#else
  return sqrtf(v);
#endif  // !HWY_NO_LIBCXX
}

static HWY_INLINE double ScalarSqrt(double v) {
#if defined(HWY_NO_LIBCXX)
#if HWY_COMPILER_GCC_ACTUAL
  return __builtin_sqrt(v);
#else
  uint64_t bits = BitCastScalar<uint64_t>(v);
  // Coarse approximation, letting the exponent LSB leak into the mantissa
  bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51);
  return BitCastScalar<double>(bits);
#endif  // !HWY_COMPILER_GCC_ACTUAL
#else
  return sqrt(v);
#endif  // HWY_NO_LIBCXX
}

}  // namespace detail

template <typename T, size_t N>
HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = detail::ScalarSqrt(v.raw[i]);
  }
  return v;
}

// ------------------------------ Floating-point rounding

// Round to nearest, ties to even. Lanes too large to hold a fraction (or NaN)
// are returned unchanged.
template <typename T, size_t N>
HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
  using TI = MakeSigned<T>;
  const T k0 = ConvertScalarTo<T>(0);
  const Vec128<T, N> a = Abs(v);
  for (size_t i = 0; i < N; ++i) {
    if (!(a.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
      continue;
    }
    const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5);
    const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias);
    if (rounded == 0) {
      // Preserve the sign of zero.
      v.raw[i] = v.raw[i] < 0 ? ConvertScalarTo<T>(-0) : k0;
      continue;
    }
    const T rounded_f = ConvertScalarTo<T>(rounded);
    // Round to even
    if ((rounded & 1) &&
        ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) {
      v.raw[i] = ConvertScalarTo<T>(rounded - (v.raw[i] < k0 ? -1 : 1));
      continue;
    }
    v.raw[i] = rounded_f;
  }
  return v;
}

// Round-to-nearest even.
1056 template <class T, size_t N, HWY_IF_FLOAT3264(T)> 1057 HWY_API Vec128<MakeSigned<T>, N> NearestInt(Vec128<T, N> v) { 1058 using TI = MakeSigned<T>; 1059 const T k0 = ConvertScalarTo<T>(0); 1060 1061 const Vec128<T, N> abs = Abs(v); 1062 Vec128<TI, N> ret; 1063 for (size_t i = 0; i < N; ++i) { 1064 const bool signbit = ScalarSignBit(v.raw[i]); 1065 1066 if (!(abs.raw[i] < MantissaEnd<T>())) { // Huge or NaN 1067 // Check if too large to cast or NaN 1068 if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) { 1069 ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>(); 1070 continue; 1071 } 1072 ret.raw[i] = static_cast<TI>(v.raw[i]); 1073 continue; 1074 } 1075 const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? -0.5 : 0.5); 1076 const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias); 1077 if (rounded == 0) { 1078 ret.raw[i] = 0; 1079 continue; 1080 } 1081 const T rounded_f = ConvertScalarTo<T>(rounded); 1082 // Round to even 1083 if ((rounded & 1) && 1084 ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) { 1085 ret.raw[i] = rounded - (signbit ? -1 : 1); 1086 continue; 1087 } 1088 ret.raw[i] = rounded; 1089 } 1090 return ret; 1091 } 1092 1093 template <class DI32, HWY_IF_I32_D(DI32)> 1094 HWY_API VFromD<DI32> DemoteToNearestInt(DI32 /*di32*/, 1095 VFromD<Rebind<double, DI32>> v) { 1096 using T = double; 1097 using TI = int32_t; 1098 const T k0 = ConvertScalarTo<T>(0); 1099 1100 constexpr size_t N = HWY_MAX_LANES_D(DI32); 1101 1102 const VFromD<Rebind<double, DI32>> abs = Abs(v); 1103 VFromD<DI32> ret; 1104 for (size_t i = 0; i < N; ++i) { 1105 const bool signbit = ScalarSignBit(v.raw[i]); 1106 1107 // Check if too large to cast or NaN 1108 if (!(abs.raw[i] <= ConvertScalarTo<T>(LimitsMax<TI>()))) { 1109 ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>(); 1110 continue; 1111 } 1112 1113 const T bias = ConvertScalarTo<T>(v.raw[i] < k0 ? 
-0.5 : 0.5); 1114 const TI rounded = ConvertScalarTo<TI>(v.raw[i] + bias); 1115 if (rounded == 0) { 1116 ret.raw[i] = 0; 1117 continue; 1118 } 1119 const T rounded_f = ConvertScalarTo<T>(rounded); 1120 // Round to even 1121 if ((rounded & 1) && 1122 ScalarAbs(rounded_f - v.raw[i]) == ConvertScalarTo<T>(0.5)) { 1123 ret.raw[i] = rounded - (signbit ? -1 : 1); 1124 continue; 1125 } 1126 ret.raw[i] = rounded; 1127 } 1128 return ret; 1129 } 1130 1131 template <typename T, size_t N> 1132 HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) { 1133 using TI = MakeSigned<T>; 1134 const Vec128<T, N> abs = Abs(v); 1135 for (size_t i = 0; i < N; ++i) { 1136 if (!(abs.raw[i] <= MantissaEnd<T>())) { // Huge or NaN 1137 continue; 1138 } 1139 const TI truncated = static_cast<TI>(v.raw[i]); 1140 if (truncated == 0) { 1141 v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0}; 1142 continue; 1143 } 1144 v.raw[i] = static_cast<T>(truncated); 1145 } 1146 return v; 1147 } 1148 1149 // Toward +infinity, aka ceiling 1150 template <typename Float, size_t N> 1151 Vec128<Float, N> Ceil(Vec128<Float, N> v) { 1152 constexpr int kMantissaBits = MantissaBits<Float>(); 1153 using Bits = MakeUnsigned<Float>; 1154 const Bits kExponentMask = MaxExponentField<Float>(); 1155 const Bits kMantissaMask = MantissaMask<Float>(); 1156 const Bits kBias = kExponentMask / 2; 1157 1158 for (size_t i = 0; i < N; ++i) { 1159 const bool positive = v.raw[i] > Float(0.0); 1160 1161 Bits bits = BitCastScalar<Bits>(v.raw[i]); 1162 1163 const int exponent = 1164 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias); 1165 // Already an integer. 1166 if (exponent >= kMantissaBits) continue; 1167 // |v| <= 1 => 0 or 1. 1168 if (exponent < 0) { 1169 v.raw[i] = positive ? 
Float{1} : Float{-0.0}; 1170 continue; 1171 } 1172 1173 const Bits mantissa_mask = kMantissaMask >> exponent; 1174 // Already an integer 1175 if ((bits & mantissa_mask) == 0) continue; 1176 1177 // Clear fractional bits and round up 1178 if (positive) bits += (kMantissaMask + 1) >> exponent; 1179 bits &= ~mantissa_mask; 1180 1181 v.raw[i] = BitCastScalar<Float>(bits); 1182 } 1183 return v; 1184 } 1185 1186 // Toward -infinity, aka floor 1187 template <typename Float, size_t N> 1188 Vec128<Float, N> Floor(Vec128<Float, N> v) { 1189 constexpr int kMantissaBits = MantissaBits<Float>(); 1190 using Bits = MakeUnsigned<Float>; 1191 const Bits kExponentMask = MaxExponentField<Float>(); 1192 const Bits kMantissaMask = MantissaMask<Float>(); 1193 const Bits kBias = kExponentMask / 2; 1194 1195 for (size_t i = 0; i < N; ++i) { 1196 const bool negative = v.raw[i] < Float(0.0); 1197 1198 Bits bits = BitCastScalar<Bits>(v.raw[i]); 1199 1200 const int exponent = 1201 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias); 1202 // Already an integer. 1203 if (exponent >= kMantissaBits) continue; 1204 // |v| <= 1 => -1 or 0. 1205 if (exponent < 0) { 1206 v.raw[i] = negative ? Float(-1.0) : Float(0.0); 1207 continue; 1208 } 1209 1210 const Bits mantissa_mask = kMantissaMask >> exponent; 1211 // Already an integer 1212 if ((bits & mantissa_mask) == 0) continue; 1213 1214 // Clear fractional bits and round down 1215 if (negative) bits += (kMantissaMask + 1) >> exponent; 1216 bits &= ~mantissa_mask; 1217 1218 v.raw[i] = BitCastScalar<Float>(bits); 1219 } 1220 return v; 1221 } 1222 1223 // ------------------------------ Floating-point classification 1224 1225 template <typename T, size_t N> 1226 HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) { 1227 Mask128<T, N> ret; 1228 for (size_t i = 0; i < N; ++i) { 1229 // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY. 
1230 ret.bits[i] = Mask128<T, N>::FromBool(ScalarIsNaN(v.raw[i])); 1231 } 1232 return ret; 1233 } 1234 1235 // ================================================== COMPARE 1236 1237 template <typename T, size_t N> 1238 HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) { 1239 Mask128<T, N> m; 1240 for (size_t i = 0; i < N; ++i) { 1241 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] == b.raw[i]); 1242 } 1243 return m; 1244 } 1245 1246 template <typename T, size_t N> 1247 HWY_API Mask128<T, N> operator!=(Vec128<T, N> a, Vec128<T, N> b) { 1248 Mask128<T, N> m; 1249 for (size_t i = 0; i < N; ++i) { 1250 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] != b.raw[i]); 1251 } 1252 return m; 1253 } 1254 1255 template <typename T, size_t N> 1256 HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) { 1257 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); 1258 return (v & bit) == bit; 1259 } 1260 1261 template <typename T, size_t N> 1262 HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) { 1263 Mask128<T, N> m; 1264 for (size_t i = 0; i < N; ++i) { 1265 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] < b.raw[i]); 1266 } 1267 return m; 1268 } 1269 template <typename T, size_t N> 1270 HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) { 1271 Mask128<T, N> m; 1272 for (size_t i = 0; i < N; ++i) { 1273 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] > b.raw[i]); 1274 } 1275 return m; 1276 } 1277 1278 template <typename T, size_t N> 1279 HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) { 1280 Mask128<T, N> m; 1281 for (size_t i = 0; i < N; ++i) { 1282 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] <= b.raw[i]); 1283 } 1284 return m; 1285 } 1286 template <typename T, size_t N> 1287 HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) { 1288 Mask128<T, N> m; 1289 for (size_t i = 0; i < N; ++i) { 1290 m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] >= b.raw[i]); 1291 } 1292 return m; 1293 } 1294 1295 // 
------------------------------ Lt128 1296 1297 // Only makes sense for full vectors of u64. 1298 template <class D> 1299 HWY_API MFromD<D> Lt128(D /* tag */, Vec128<uint64_t> a, Vec128<uint64_t> b) { 1300 const bool lt = 1301 (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]); 1302 Mask128<uint64_t> ret; 1303 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt); 1304 return ret; 1305 } 1306 1307 template <class D> 1308 HWY_API MFromD<D> Lt128Upper(D /* tag */, Vec128<uint64_t> a, 1309 Vec128<uint64_t> b) { 1310 const bool lt = a.raw[1] < b.raw[1]; 1311 Mask128<uint64_t> ret; 1312 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt); 1313 return ret; 1314 } 1315 1316 // ------------------------------ Eq128 1317 1318 // Only makes sense for full vectors of u64. 1319 template <class D> 1320 HWY_API MFromD<D> Eq128(D /* tag */, Vec128<uint64_t> a, Vec128<uint64_t> b) { 1321 const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0]; 1322 Mask128<uint64_t> ret; 1323 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq); 1324 return ret; 1325 } 1326 1327 template <class D> 1328 HWY_API Mask128<uint64_t> Ne128(D /* tag */, Vec128<uint64_t> a, 1329 Vec128<uint64_t> b) { 1330 const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0]; 1331 Mask128<uint64_t> ret; 1332 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne); 1333 return ret; 1334 } 1335 1336 template <class D> 1337 HWY_API MFromD<D> Eq128Upper(D /* tag */, Vec128<uint64_t> a, 1338 Vec128<uint64_t> b) { 1339 const bool eq = a.raw[1] == b.raw[1]; 1340 Mask128<uint64_t> ret; 1341 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq); 1342 return ret; 1343 } 1344 1345 template <class D> 1346 HWY_API MFromD<D> Ne128Upper(D /* tag */, Vec128<uint64_t> a, 1347 Vec128<uint64_t> b) { 1348 const bool ne = a.raw[1] != b.raw[1]; 1349 Mask128<uint64_t> ret; 1350 ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne); 1351 return ret; 1352 } 1353 1354 // 
------------------------------ Min128, Max128 (Lt128) 1355 1356 template <class D> 1357 HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) { 1358 return IfThenElse(Lt128(d, a, b), a, b); 1359 } 1360 1361 template <class D> 1362 HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) { 1363 return IfThenElse(Lt128(d, b, a), a, b); 1364 } 1365 1366 template <class D> 1367 HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) { 1368 return IfThenElse(Lt128Upper(d, a, b), a, b); 1369 } 1370 1371 template <class D> 1372 HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) { 1373 return IfThenElse(Lt128Upper(d, b, a), a, b); 1374 } 1375 1376 // ================================================== MEMORY 1377 1378 // ------------------------------ Load 1379 1380 template <class D> 1381 HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) { 1382 VFromD<D> v; 1383 CopyBytes<d.MaxBytes()>(aligned, v.raw); // copy from array 1384 return v; 1385 } 1386 1387 template <class D> 1388 HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, 1389 const TFromD<D>* HWY_RESTRICT p) { 1390 return IfThenElseZero(m, LoadU(d, p)); 1391 } 1392 1393 template <class D> 1394 HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d, 1395 const TFromD<D>* HWY_RESTRICT p) { 1396 return IfThenElse(m, LoadU(d, p), v); 1397 } 1398 1399 template <class D> 1400 HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) { 1401 return Load(d, p); 1402 } 1403 1404 // In some use cases, "load single lane" is sufficient; otherwise avoid this. 
1405 template <class D> 1406 HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT aligned) { 1407 return Load(d, aligned); 1408 } 1409 1410 #ifdef HWY_NATIVE_LOAD_N 1411 #undef HWY_NATIVE_LOAD_N 1412 #else 1413 #define HWY_NATIVE_LOAD_N 1414 #endif 1415 1416 template <class D> 1417 HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p, 1418 size_t max_lanes_to_load) { 1419 VFromD<D> v = Zero(d); 1420 const size_t N = Lanes(d); 1421 const size_t num_of_lanes_to_load = HWY_MIN(max_lanes_to_load, N); 1422 CopyBytes(p, v.raw, num_of_lanes_to_load * sizeof(TFromD<D>)); 1423 return v; 1424 } 1425 1426 template <class D> 1427 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p, 1428 size_t max_lanes_to_load) { 1429 VFromD<D> v = no; 1430 const size_t N = Lanes(d); 1431 const size_t num_of_lanes_to_load = HWY_MIN(max_lanes_to_load, N); 1432 CopyBytes(p, v.raw, num_of_lanes_to_load * sizeof(TFromD<D>)); 1433 return v; 1434 } 1435 1436 // ------------------------------ Store 1437 1438 template <class D> 1439 HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) { 1440 CopyBytes<d.MaxBytes()>(v.raw, aligned); // copy to array 1441 } 1442 1443 template <class D> 1444 HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) { 1445 Store(v, d, p); 1446 } 1447 1448 template <class D> 1449 HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d, 1450 TFromD<D>* HWY_RESTRICT p) { 1451 for (size_t i = 0; i < MaxLanes(d); ++i) { 1452 if (m.bits[i]) p[i] = v.raw[i]; 1453 } 1454 } 1455 1456 #ifdef HWY_NATIVE_STORE_N 1457 #undef HWY_NATIVE_STORE_N 1458 #else 1459 #define HWY_NATIVE_STORE_N 1460 #endif 1461 1462 template <class D> 1463 HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p, 1464 size_t max_lanes_to_store) { 1465 const size_t N = Lanes(d); 1466 const size_t num_of_lanes_to_store = HWY_MIN(max_lanes_to_store, N); 1467 CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD<D>)); 1468 } 1469 1470 
// ================================================== COMBINE

// Returns the lower half of v (first N/2 lanes).
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  Vec128<T, N / 2> ret;
  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
  return ret;
}

template <class D>
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
  return LowerHalf(v);
}

// Returns the upper half of v (lanes MaxLanes(d)..).
template <class D>
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
  VFromD<D> ret;
  CopyBytes<d.MaxBytes()>(&v.raw[MaxLanes(d)], ret.raw);
  return ret;
}

// Copies v into the lower half; upper half is zero (Vec128::raw is
// zero-initialized by default).
template <class D>
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> v) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;  // zero-initialized
  CopyBytes<dh.MaxBytes()>(v.raw, ret.raw);
  return ret;
}

// Concatenates lo_half (lower lanes) and hi_half (upper lanes).
template <class D, class VH = VFromD<Half<D>>>
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(lo_half.raw, &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(hi_half.raw, &ret.raw[MaxLanes(dh)]);
  return ret;
}

// Result = [lower half of lo, lower half of hi].
template <class D>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
  return ret;
}

// Result = [upper half of lo, upper half of hi].
template <class D>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
  return ret;
}

// Result = [upper half of lo, lower half of hi].
template <class D>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(&lo.raw[MaxLanes(dh)], &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(hi.raw, &ret.raw[MaxLanes(dh)]);
  return ret;
}

// Result = [lower half of lo, upper half of hi].
template <class D>
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  CopyBytes<dh.MaxBytes()>(lo.raw, &ret.raw[0]);
  CopyBytes<dh.MaxBytes()>(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]);
  return ret;
}

// Gathers even-indexed lanes: lo's evens into the lower half, hi's evens into
// the upper half.
template <class D>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(dh); ++i) {
    ret.raw[i] = lo.raw[2 * i];
  }
  for (size_t i = 0; i < MaxLanes(dh); ++i) {
    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i];
  }
  return ret;
}

// 2023-11-23: workaround for incorrect codegen (reduction_test fails for
// SumsOf2 because PromoteOddTo, which uses ConcatOdd, returns zero).
#if HWY_ARCH_RISCV && HWY_TARGET == HWY_EMU128 && HWY_COMPILER_CLANG
#define HWY_EMU128_CONCAT_INLINE HWY_NOINLINE
#else
#define HWY_EMU128_CONCAT_INLINE HWY_API
#endif

// Gathers odd-indexed lanes; see ConcatEven.
template <class D>
HWY_EMU128_CONCAT_INLINE VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(dh); ++i) {
    ret.raw[i] = lo.raw[2 * i + 1];
  }
  for (size_t i = 0; i < MaxLanes(dh); ++i) {
    ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1];
  }
  return ret;
}

// ------------------------------ CombineShiftRightBytes
// Result = bytes of [hi, lo] shifted right by kBytes (byte granularity).
template <int kBytes, class D>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  VFromD<D> ret;
  const uint8_t* HWY_RESTRICT lo8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  CopyBytes<d.MaxBytes() - kBytes>(lo8 + kBytes, ret8);
  CopyBytes<kBytes>(hi.raw, ret8 + d.MaxBytes() - kBytes);
  return ret;
}

// ------------------------------ ShiftLeftBytes

// Shifts the whole vector left by kBytes, shifting in zeros.
template <int kBytes, class D>
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  VFromD<D> ret;
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  ZeroBytes<kBytes>(ret8);
  CopyBytes<d.MaxBytes() - kBytes>(v.raw, ret8 + kBytes);
  return ret;
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftLeftLanes

// Lane-granularity left shift, implemented via byte shift.
template <int kLanes, class D, typename T = TFromD<D>>
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftRightBytes
// Shifts the whole vector right by kBytes, shifting in zeros.
template <int kBytes, class D>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  VFromD<D> ret;
  const uint8_t* HWY_RESTRICT v8 =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
  uint8_t* HWY_RESTRICT ret8 =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  CopyBytes<d.MaxBytes() - kBytes>(v8 + kBytes, ret8);
  ZeroBytes<kBytes>(ret8 + d.MaxBytes() - kBytes);
  return ret;
}

// ------------------------------ ShiftRightLanes
template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
}

// ------------------------------ Tuples, PromoteEvenTo/PromoteOddTo
#include "hwy/ops/inside-inl.h"

// ------------------------------ LoadInterleaved2/3/4

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
// We implement those here because scalar code is likely faster than emulation
// via shuffles.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

// Same for Load/StoreInterleaved of special floats.
#ifdef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_SPECIAL_FLOAT_INTERLEAVED
#endif

// De-interleaves pairs (v0[i], v1[i]) from consecutive memory.
template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1) {
  alignas(16) T buf0[MaxLanes(d)];
  alignas(16) T buf1[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
}

// De-interleaves triples from consecutive memory.
template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  alignas(16) T buf0[MaxLanes(d)];
  alignas(16) T buf1[MaxLanes(d)];
  alignas(16) T buf2[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
    buf2[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
  v2 = Load(d, buf2);
}

// De-interleaves quadruples from consecutive memory.
template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  alignas(16) T buf0[MaxLanes(d)];
  alignas(16) T buf1[MaxLanes(d)];
  alignas(16) T buf2[MaxLanes(d)];
  alignas(16) T buf3[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    buf0[i] = *unaligned++;
    buf1[i] = *unaligned++;
    buf2[i] = *unaligned++;
    buf3[i] = *unaligned++;
  }
  v0 = Load(d, buf0);
  v1 = Load(d, buf1);
  v2 = Load(d, buf2);
  v3 = Load(d, buf3);
}

// ------------------------------ StoreInterleaved2/3/4

template <class D>
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
  }
}

template <class D>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
    *unaligned++ = v2.raw[i];
  }
}

template <class D>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    *unaligned++ = v0.raw[i];
    *unaligned++ = v1.raw[i];
    *unaligned++ = v2.raw[i];
    *unaligned++ = v3.raw[i];
  }
}

// ------------------------------ Stream
template <class D>
HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  Store(v, d, aligned);
}

// ------------------------------ Scatter in generic_ops-inl.h
// ------------------------------ Gather in generic_ops-inl.h

// ================================================== CONVERT

// ConvertTo and DemoteTo with floating-point input and integer output truncate
// (rounding toward zero).

namespace detail {

// Float-to-integer conversion with saturation: returns static_cast<ToT>(val)
// when in range, else LimitsMax<ToT>() for positive overflow/NaN-with-clear-
// sign, or (via the +signbit wraparound) LimitsMin<ToT>() for negative
// overflow.
template <class ToT, class FromT>
HWY_INLINE ToT CastValueForF2IConv(FromT val) {
  // Prevent ubsan errors when converting float to narrower integer

  using FromTU = MakeUnsigned<FromT>;
  using ToTU = MakeUnsigned<ToT>;

  constexpr unsigned kMaxExpField =
      static_cast<unsigned>(MaxExponentField<FromT>());
  constexpr unsigned kExpBias = kMaxExpField >> 1;
  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
      kMaxExpField));

  // If ToT is signed, compare only the exponent bits of val against
  // kMinOutOfRangeExpField.
  //
  // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
  // val against kMinOutOfRangeExpField as a negative value is outside of the
  // range of an unsigned integer type.
  const FromT val_to_compare =
      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);

  // val is within the range of ToT if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
  // than kMinOutOfRangeExpField
  //
  // Otherwise, val is either outside of the range of ToT or equal to
  // LimitsMin<ToT>() if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
  // than or equal to kMinOutOfRangeExpField.

  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
             ? static_cast<ToT>(val)
             : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
                                static_cast<ToTU>(ScalarSignBit(val)));
}

// Generic case: plain scalar conversion (no saturation required).
template <class ToT, class ToTypeTag, class FromT>
HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
  return ConvertScalarTo<ToT>(val);
}

// float -> signed integer promotion must saturate.
template <class ToT>
HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/,
                                     float val) {
  return CastValueForF2IConv<ToT>(val);
}

// float -> unsigned integer promotion must saturate.
template <class ToT>
HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
                                     float val) {
  return CastValueForF2IConv<ToT>(val);
}

// If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
// returns static_cast<ToT>(val)
//
// Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
// implementation-defined result if val is not within the range of ToT.
template <class ToT, class FromT>
HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
  // Prevent ubsan errors when converting float to narrower integer

  using FromTU = MakeUnsigned<FromT>;

  constexpr unsigned kMaxExpField =
      static_cast<unsigned>(MaxExponentField<FromT>());
  constexpr unsigned kExpBias = kMaxExpField >> 1;
  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
      kMaxExpField));

  // If ToT is signed, compare only the exponent bits of val against
  // kMinOutOfRangeExpField.
  //
  // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
  // val against kMinOutOfRangeExpField as a negative value is outside of the
  // range of an unsigned integer type.
  const FromT val_to_compare =
      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);

  // val is within the range of ToT if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
  // than kMinOutOfRangeExpField
  //
  // Otherwise, val is either outside of the range of ToT or equal to
  // LimitsMin<ToT>() if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
  // than or equal to kMinOutOfRangeExpField.

  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
             ? static_cast<ToT>(val)
             : static_cast<ToT>(LimitsMin<ToT>());
}

}  // namespace detail

// Widens each lane; float->int promotions saturate via the tag dispatch above.
template <class DTo, typename TFrom, HWY_IF_NOT_SPECIAL_FLOAT(TFrom)>
HWY_API VFromD<DTo> PromoteTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
  static_assert(sizeof(TFromD<DTo>) > sizeof(TFrom), "Not promoting");
  VFromD<DTo> ret;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    // For bits Y > X, floatX->floatY and intX->intY are always representable.
    ret.raw[i] = detail::CastValueForPromoteTo<TFromD<DTo>>(
        hwy::TypeTag<TFromD<DTo>>(), from.raw[i]);
  }
  return ret;
}

#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#endif

// float -> {u,i}64 promotion; out-of-range lanes are implementation-defined
// (see CastValueForInRangeF2IConv).
template <class D64, HWY_IF_UI64_D(D64)>
HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
  VFromD<D64> ret;
  for (size_t i = 0; i < MaxLanes(d64); ++i) {
    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D64>>(v.raw[i]);
  }
  return ret;
}

// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
// so we overload for TFrom=double and ToT={float,int32_t}.
// double -> float: demote with saturation to +/-HighestValue<float>.
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    // Prevent ubsan errors when converting float to narrower integer/float
    if (ScalarIsInf(from.raw[i]) ||
        ScalarAbs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
      // Out-of-range (or infinite) inputs saturate toward the matching sign.
      ret.raw[i] = ScalarSignBit(from.raw[i]) ? LowestValue<float>()
                                              : HighestValue<float>();
      continue;
    }
    ret.raw[i] = static_cast<float>(from.raw[i]);
  }
  return ret;
}
// double -> i32/u32: demote via the shared float-to-int saturating helper.
template <class D, HWY_IF_UI32_D(D)>
HWY_API VFromD<D> DemoteTo(D d, VFromD<Rebind<double, D>> from) {
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    // Prevent ubsan errors when converting double to narrower integer/int32_t
    ret.raw[i] = detail::CastValueForF2IConv<TFromD<D>>(from.raw[i]);
  }
  return ret;
}

// Signed int -> narrower int: clamp to the target range before casting.
template <class DTo, typename TFrom, size_t N, HWY_IF_SIGNED(TFrom),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)>
HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
  using TTo = TFromD<DTo>;
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

  VFromD<DTo> ret;
  for (size_t i = 0; i < N; ++i) {
    // Int to int: choose closest value in ToT to `from` (avoids UB)
    from.raw[i] =
        HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw[i]), LimitsMax<TTo>());
    ret.raw[i] = static_cast<TTo>(from.raw[i]);
  }
  return ret;
}

// Disable the default unsigned to signed DemoteTo/ReorderDemote2To
// implementations in generic_ops-inl.h on EMU128 as the EMU128 target has
// target-specific implementations of the unsigned to signed DemoteTo and
// ReorderDemote2To ops

// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
// !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
// SFINAE to occur instead of a hard error due to a dependency on the V template
// argument
#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr

// Unsigned int -> narrower int: only an upper clamp is needed (input >= 0).
template <class DTo, typename TFrom, size_t N, HWY_IF_UNSIGNED(TFrom),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
  using TTo = TFromD<DTo>;
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

  const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());

  VFromD<DTo> ret;
  for (size_t i = 0; i < N; ++i) {
    // Int to int: choose closest value in ToT to `from` (avoids UB)
    ret.raw[i] = static_cast<TTo>(HWY_MIN(from.raw[i], max));
  }
  return ret;
}

// i64/u64 -> float.
template <class DTo, typename TFrom, size_t N, HWY_IF_UI64(TFrom),
          HWY_IF_F32_D(DTo)>
HWY_API VFromD<DTo> DemoteTo(DTo /* tag */, Vec128<TFrom, N> from) {
  using TTo = TFromD<DTo>;
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

  VFromD<DTo> ret;
  for (size_t i = 0; i < N; ++i) {
    // int64_t/uint64_t to float: okay to cast to float as an int64_t/uint64_t
    // value is always within the range of a float
    ret.raw[i] = static_cast<TTo>(from.raw[i]);
  }
  return ret;
}

// f32 pairs -> bf16: `a` supplies the upper 16 bits of each 32-bit lane,
// `b` (shifted right by 16) supplies the lower 16 bits.
template <class DBF16, HWY_IF_BF16_D(DBF16), class VF32>
HWY_API VFromD<DBF16> ReorderDemote2To(DBF16 dbf16, VF32 a, VF32 b) {
  const Repartition<uint32_t, decltype(dbf16)> du32;
  const VFromD<decltype(du32)> b_in_lower = ShiftRight<16>(BitCast(du32, b));
  // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
  const VFromD<decltype(du32)> a_mask = Set(du32, 0xFFFF0000);
  return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
}

// Signed wide -> narrow: a's lanes fill the lower half, b's the upper half,
// each clamped to the narrow type's range.
template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
          HWY_IF_SIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const RepartitionToWide<decltype(dn)> dw;
  const size_t NW = Lanes(dw);
  using TN = TFromD<DN>;
  const TN min = LimitsMin<TN>();
  const TN max = LimitsMax<TN>();
  VFromD<DN> ret;
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, a.raw[i]), max));
  }
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[NW + i] = static_cast<TN>(HWY_MIN(HWY_MAX(min, b.raw[i]), max));
  }
  return ret;
}

// Unsigned wide -> narrow: as above, but only an upper clamp is needed.
template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DN), class V,
          HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const RepartitionToWide<decltype(dn)> dw;
  const size_t NW = Lanes(dw);
  using TN = TFromD<DN>;
  using TN_U = MakeUnsigned<TN>;
  const TN_U max = static_cast<TN_U>(LimitsMax<TN>());
  VFromD<DN> ret;
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[i] = static_cast<TN>(HWY_MIN(a.raw[i], max));
  }
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[NW + i] = static_cast<TN>(HWY_MIN(b.raw[i], max));
  }
  return ret;
}

// On EMU128, Reorder* already produces the in-order result.
template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
  return ReorderDemote2To(dn, a, b);
}

// f32 -> special float (f16/bf16): lanes of `a` then `b`, in order.
template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), class V,
          HWY_IF_F32_D(DFromV<V>),
          HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
  const size_t NW = Lanes(dn) / 2;
  using TN = TFromD<DN>;
  VFromD<DN> ret;
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[i] = ConvertScalarTo<TN>(a.raw[i]);
  }
  for (size_t i = 0; i < NW; ++i) {
    ret.raw[NW + i] = ConvertScalarTo<TN>(b.raw[i]);
  }
  return ret;
}

namespace detail {

// Bit-copies a u16 into a float16_t (avoids type-punning UB).
HWY_INLINE void StoreU16ToF16(const uint16_t val,
                              hwy::float16_t* HWY_RESTRICT to) {
  CopySameSize(&val, to);
}

// Bit-copies a float16_t into a u16 (avoids type-punning UB).
HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) {
  uint16_t bits16;
  CopySameSize(from, &bits16);
  return bits16;
}

}  // namespace detail

// bf16 -> f32: widen each lane.
template <class D, HWY_IF_F32_D(D), size_t N>
HWY_API VFromD<D> PromoteTo(D /* tag */, Vec128<bfloat16_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = F32FromBF16(v.raw[i]);
  }
  return ret;
}

#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif

// f32 -> bf16: narrow each lane.
template <class D, HWY_IF_BF16_D(D), size_t N>
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec128<float, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = BF16FromF32(v.raw[i]);
  }
  return ret;
}

#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#endif

// double -> i32/u32, caller guarantees values are in range.
template <class D32, HWY_IF_UI32_D(D32)>
HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
  VFromD<D32> ret;
  for (size_t i = 0; i < MaxLanes(d32); ++i) {
    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<D32>>(v.raw[i]);
  }
  return ret;
}

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

template <typename TFrom, typename DTo>
HWY_API VFromD<DTo> ConvertTo(hwy::FloatTag /*tag*/, DTo /*tag*/,
                              Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
  using ToT = TFromD<DTo>;
  static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size");
  VFromD<DTo> ret;
  constexpr size_t N = HWY_MAX_LANES_D(DTo);

  for (size_t i = 0; i < N; ++i) {
    // float## -> int##: return closest representable value
    ret.raw[i] = CastValueForF2IConv<ToT>(from.raw[i]);
  }
  return ret;
}

template <typename TFrom, typename DTo>
HWY_API VFromD<DTo> ConvertTo(hwy::NonFloatTag /*tag*/, DTo /* tag */,
                              Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
  using ToT = TFromD<DTo>;
  static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size");
  VFromD<DTo> ret;
  constexpr size_t N = HWY_MAX_LANES_D(DTo);
  for (size_t i = 0; i < N; ++i) {
    // int## -> float##: no check needed
    ret.raw[i] = static_cast<ToT>(from.raw[i]);
  }
  return ret;
}

}  // namespace detail

// Same-size conversion between int and float lanes; dispatches on whether
// TFrom is a float type.
template <class DTo, typename TFrom>
HWY_API VFromD<DTo> ConvertTo(DTo d, Vec128<TFrom, HWY_MAX_LANES_D(DTo)> from) {
  return detail::ConvertTo(hwy::IsFloatTag<TFrom>(), d, from);
}

#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#else
#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#endif

// float -> int of the same size, caller guarantees values are in range.
template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
          HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
  VFromD<DI> ret;
  for (size_t i = 0; i < MaxLanes(di); i++) {
    ret.raw[i] = detail::CastValueForInRangeF2IConv<TFromD<DI>>(v.raw[i]);
  }
  return ret;
}

// u32 -> u8 with saturation (via DemoteTo).
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
  return DemoteTo(Simd<uint8_t, N, 0>(), v);
}

// ------------------------------ Truncations

// All TruncateTo overloads keep only the low bits of each lane (no clamping).
template <class D, HWY_IF_U8_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
  }
  return ret;
}

template <class D, HWY_IF_U16_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
  }
  return ret;
}

template <class D, HWY_IF_U32_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint64_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint32_t>(v.raw[i] & 0xFFFFFFFFu);
  }
  return ret;
}

template <class D, HWY_IF_U8_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint32_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
  }
  return ret;
}

template <class D, HWY_IF_U16_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint32_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
  }
  return ret;
}

template <class D, HWY_IF_U8_D(D), size_t N>
HWY_API VFromD<D> TruncateTo(D /* tag */, Vec128<uint16_t, N> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
  }
  return ret;
}

#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#else
#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#endif
2224 template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V), 2225 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2), 2226 HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV<V>) * 2)> 2227 HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) { 2228 const RepartitionToWide<decltype(dn)> dw; 2229 const size_t NW = Lanes(dw); 2230 using TW = TFromD<decltype(dw)>; 2231 using TN = TFromD<decltype(dn)>; 2232 VFromD<DN> ret; 2233 constexpr TW max_val{LimitsMax<TN>()}; 2234 2235 for (size_t i = 0; i < NW; ++i) { 2236 ret.raw[i] = static_cast<TN>(a.raw[i] & max_val); 2237 } 2238 for (size_t i = 0; i < NW; ++i) { 2239 ret.raw[NW + i] = static_cast<TN>(b.raw[i] & max_val); 2240 } 2241 return ret; 2242 } 2243 2244 // ================================================== SWIZZLE 2245 2246 template <typename T, size_t N> 2247 HWY_API T GetLane(Vec128<T, N> v) { 2248 return v.raw[0]; 2249 } 2250 2251 template <typename T, size_t N> 2252 HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) { 2253 v.raw[i] = t; 2254 return v; 2255 } 2256 2257 template <typename T, size_t N> 2258 HWY_API T ExtractLane(Vec128<T, N> v, size_t i) { 2259 return v.raw[i]; 2260 } 2261 2262 template <typename T, size_t N> 2263 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { 2264 for (size_t i = 0; i < N; i += 2) { 2265 v.raw[i + 1] = v.raw[i]; 2266 } 2267 return v; 2268 } 2269 2270 template <typename T, size_t N> 2271 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { 2272 for (size_t i = 0; i < N; i += 2) { 2273 v.raw[i] = v.raw[i + 1]; 2274 } 2275 return v; 2276 } 2277 2278 template <typename T, size_t N> 2279 HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) { 2280 for (size_t i = 0; i < N; i += 2) { 2281 odd.raw[i] = even.raw[i]; 2282 } 2283 return odd; 2284 } 2285 2286 template <class D> 2287 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 2288 constexpr size_t N = HWY_MAX_LANES_D(D); 2289 for (size_t i = 1; i < N; i += 2) { 2290 a.raw[i] = b.raw[i - 1]; 2291 
} 2292 return a; 2293 } 2294 2295 template <class D> 2296 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { 2297 constexpr size_t N = HWY_MAX_LANES_D(D); 2298 for (size_t i = 1; i < N; i += 2) { 2299 b.raw[i - 1] = a.raw[i]; 2300 } 2301 return b; 2302 } 2303 2304 template <typename T, size_t N> 2305 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { 2306 return even; 2307 } 2308 2309 // ------------------------------ SwapAdjacentBlocks 2310 template <typename T, size_t N> 2311 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { 2312 return v; 2313 } 2314 2315 // ------------------------------ InterleaveEvenBlocks 2316 template <class D, class V = VFromD<D>> 2317 HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) { 2318 return a; 2319 } 2320 // ------------------------------ InterleaveOddBlocks 2321 template <class D, class V = VFromD<D>> 2322 HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) { 2323 return a; 2324 } 2325 2326 // ------------------------------ TableLookupLanes 2327 2328 // Returned by SetTableIndices for use by TableLookupLanes. 
// Per-lane indices for TableLookupLanes; stored as signed integers of the
// same width as the lanes they index.
template <typename T, size_t N>
struct Indices128 {
  MakeSigned<T> raw[N];
};

// Copies lane indices from a same-sized integer vector.
template <class D, typename TI, size_t N>
HWY_API Indices128<TFromD<D>, N> IndicesFromVec(D d, Vec128<TI, N> vec) {
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size must match");
  Indices128<TFromD<D>, N> ret;
  CopyBytes<d.MaxBytes()>(vec.raw, ret.raw);
  return ret;
}

// Loads indices from memory (one per lane) into an Indices128.
template <class D, typename TI>
HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
    D d, const TI* idx) {
  return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
}

// ret[i] = v[idx[i]]. Indices must be < N.
template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = v.raw[idx.raw[i]];
  }
  return ret;
}

// As TableLookupLanes, but indices in [0, N) select from a and indices in
// [N, 2N) select from b (after masking off the high bit of the index).
template <typename T, size_t N>
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
                                          Indices128<T, N> idx) {
  using TI = MakeSigned<T>;
  Vec128<T, N> ret;
  constexpr TI kVecLaneIdxMask = static_cast<TI>(N - 1);
  for (size_t i = 0; i < N; ++i) {
    const auto src_idx = idx.raw[i];
    const auto masked_src_lane_idx = src_idx & kVecLaneIdxMask;
    ret.raw[i] = (src_idx < static_cast<TI>(N)) ? a.raw[masked_src_lane_idx]
                                                : b.raw[masked_src_lane_idx];
  }
  return ret;
}

// ------------------------------ ReverseBlocks
template <class D>
HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
  return v;  // Single block: no change
}

// ------------------------------ Reverse

// Reverses the order of all lanes.
template <class D>
HWY_API VFromD<D> Reverse(D d, VFromD<D> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    ret.raw[i] = v.raw[MaxLanes(d) - 1 - i];
  }
  return ret;
}

// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

// Reverses lanes within each pair.
template <class D>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(d); i += 2) {
    ret.raw[i + 0] = v.raw[i + 1];
    ret.raw[i + 1] = v.raw[i + 0];
  }
  return ret;
}

// Reverses lanes within each group of four.
template <class D>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(d); i += 4) {
    ret.raw[i + 0] = v.raw[i + 3];
    ret.raw[i + 1] = v.raw[i + 2];
    ret.raw[i + 2] = v.raw[i + 1];
    ret.raw[i + 3] = v.raw[i + 0];
  }
  return ret;
}

// Reverses lanes within each group of eight.
template <class D>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  VFromD<D> ret;
  for (size_t i = 0; i < MaxLanes(d); i += 8) {
    ret.raw[i + 0] = v.raw[i + 7];
    ret.raw[i + 1] = v.raw[i + 6];
    ret.raw[i + 2] = v.raw[i + 5];
    ret.raw[i + 3] = v.raw[i + 4];
    ret.raw[i + 4] = v.raw[i + 3];
    ret.raw[i + 5] = v.raw[i + 2];
    ret.raw[i + 6] = v.raw[i + 1];
    ret.raw[i + 7] = v.raw[i + 0];
  }
  return ret;
}

// ------------------------------ SlideUpLanes

// Shifts lanes toward higher indices by `amt`, filling vacated lanes with 0.
template <class D>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
  VFromD<D> ret = Zero(d);
  constexpr size_t N = HWY_MAX_LANES_D(D);
  // Clamp so the CopyBytes size below cannot underflow.
  const size_t clamped_amt = HWY_MIN(amt, N);
  CopyBytes(v.raw, ret.raw + clamped_amt,
            (N - clamped_amt) * sizeof(TFromD<D>));
  return ret;
}

// ------------------------------ SlideDownLanes

// Shifts lanes toward lower indices by `amt`, filling vacated lanes with 0.
template <class D>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
  VFromD<D> ret = Zero(d);
  constexpr size_t N = HWY_MAX_LANES_D(D);
  // Clamp so the CopyBytes size below cannot underflow.
  const size_t clamped_amt = HWY_MIN(amt, N);
  CopyBytes(v.raw + clamped_amt, ret.raw,
            (N - clamped_amt) * sizeof(TFromD<D>));
  return ret;
}

// ================================================== BLOCKWISE

// ------------------------------ Shuffle*

// Swap 32-bit halves in 64-bit halves.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Reverse2(DFromV<decltype(v)>(), v);
}

// Swap 64-bit halves
template <typename T>
HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit");
  Vec128<T> ret;
  ret.raw[3] = v.raw[1];
  ret.raw[2] = v.raw[0];
  ret.raw[1] = v.raw[3];
  ret.raw[0] = v.raw[2];
  return ret;
}
template <typename T>
HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
  static_assert(sizeof(T) == 8, "Only for 64-bit");
  return Reverse2(DFromV<decltype(v)>(), v);
}

// Rotate right 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
  Vec128<T> ret;
  ret.raw[3] = v.raw[0];
  ret.raw[2] = v.raw[3];
  ret.raw[1] = v.raw[2];
  ret.raw[0] = v.raw[1];
  return ret;
}

// Rotate left 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
  Vec128<T> ret;
  ret.raw[3] = v.raw[2];
  ret.raw[2] = v.raw[1];
  ret.raw[1] = v.raw[0];
  ret.raw[0] = v.raw[3];
  return ret;
}

// Reverse the four 32-bit lanes.
template <typename T>
HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
  return Reverse4(DFromV<decltype(v)>(), v);
}

// ------------------------------ Broadcast
// Fills all lanes with lane kLane.
template <int kLane, typename T, size_t N>
HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
  for (size_t i = 0; i < N; ++i) {
    v.raw[i] = v.raw[kLane];
  }
  return v;
}

// ------------------------------ TableLookupBytes, TableLookupBytesOr0

// Byte-level gather: each output byte is the input byte selected by the
// corresponding index byte, or 0 if the index is out of bounds.
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> v,
                                        Vec128<TI, NI> indices) {
  const uint8_t* HWY_RESTRICT v_bytes =
      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
  const uint8_t* HWY_RESTRICT idx_bytes =
      reinterpret_cast<const uint8_t*>(indices.raw);
  Vec128<TI, NI> ret;
  uint8_t* HWY_RESTRICT ret_bytes =
      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
  for (size_t i = 0; i < NI * sizeof(TI); ++i) {
    const size_t idx = idx_bytes[i];
    // Avoid out of bounds reads.
    ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0;
  }
  return ret;
}

template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytesOr0(Vec128<T, N> v,
                                           Vec128<TI, NI> indices) {
  // Same as TableLookupBytes, which already returns 0 if out of bounds.
  return TableLookupBytes(v, indices);
}

// ------------------------------ InterleaveLower/InterleaveUpper

// Interleaves the lower halves of a and b: a0 b0 a1 b1 ...
template <typename T, size_t N>
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
  Vec128<T, N> ret;
  for (size_t i = 0; i < N / 2; ++i) {
    ret.raw[2 * i + 0] = a.raw[i];
    ret.raw[2 * i + 1] = b.raw[i];
  }
  return ret;
}

// Additional overload for the optional tag.
2563 template <class D> 2564 HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) { 2565 return InterleaveLower(a, b); 2566 } 2567 2568 template <class D> 2569 HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) { 2570 const Half<decltype(d)> dh; 2571 VFromD<D> ret; 2572 for (size_t i = 0; i < MaxLanes(dh); ++i) { 2573 ret.raw[2 * i + 0] = a.raw[MaxLanes(dh) + i]; 2574 ret.raw[2 * i + 1] = b.raw[MaxLanes(dh) + i]; 2575 } 2576 return ret; 2577 } 2578 2579 // ------------------------------ ZipLower/ZipUpper (InterleaveLower) 2580 2581 // Same as Interleave*, except that the return lanes are double-width integers; 2582 // this is necessary because the single-lane scalar cannot return two values. 2583 template <class V, class DW = RepartitionToWide<DFromV<V>>> 2584 HWY_API VFromD<DW> ZipLower(V a, V b) { 2585 return BitCast(DW(), InterleaveLower(a, b)); 2586 } 2587 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> 2588 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { 2589 return BitCast(dw, InterleaveLower(D(), a, b)); 2590 } 2591 2592 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> 2593 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { 2594 return BitCast(dw, InterleaveUpper(D(), a, b)); 2595 } 2596 2597 // ================================================== MASK 2598 2599 template <class D> 2600 HWY_API bool AllFalse(D d, MFromD<D> mask) { 2601 typename MFromD<D>::Raw or_sum = 0; 2602 for (size_t i = 0; i < MaxLanes(d); ++i) { 2603 or_sum |= mask.bits[i]; 2604 } 2605 return or_sum == 0; 2606 } 2607 2608 template <class D> 2609 HWY_API bool AllTrue(D d, MFromD<D> mask) { 2610 constexpr uint64_t kAll = LimitsMax<typename MFromD<D>::Raw>(); 2611 uint64_t and_sum = kAll; 2612 for (size_t i = 0; i < MaxLanes(d); ++i) { 2613 and_sum &= mask.bits[i]; 2614 } 2615 return and_sum == kAll; 2616 } 2617 2618 // `p` points to at least 8 readable bytes, not all of which need be valid. 
// Expands packed mask bits (LSB-first) into per-lane masks.
template <class D>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  MFromD<D> m;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    const size_t bit = size_t{1} << (i & 7);
    const size_t idx_byte = i >> 3;
    m.bits[i] = MFromD<D>::FromBool((bits[idx_byte] & bit) != 0);
  }
  return m;
}

// Expands the low MaxLanes(d) bits of mask_bits into per-lane masks.
template <class D>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  MFromD<D> m;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    m.bits[i] = MFromD<D>::FromBool(((mask_bits >> i) & 1u) != 0);
  }
  return m;
}

// `p` points to at least 8 writable bytes.
// Packs per-lane masks into bits (LSB-first); returns bytes written (1 or 2).
template <class D>
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  bits[0] = 0;
  if (MaxLanes(d) > 8) bits[1] = 0;  // MaxLanes(d) <= 16, so max two bytes
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    const size_t bit = size_t{1} << (i & 7);
    const size_t idx_byte = i >> 3;
    if (mask.bits[i]) {
      bits[idx_byte] = static_cast<uint8_t>(bits[idx_byte] | bit);
    }
  }
  return MaxLanes(d) > 8 ? 2 : 1;
}

// Number of set mask lanes.
template <class D>
HWY_API size_t CountTrue(D d, MFromD<D> mask) {
  size_t count = 0;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    count += mask.bits[i] != 0;
  }
  return count;
}

// Index of the first set lane; caller guarantees at least one lane is set.
template <class D>
HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (mask.bits[i] != 0) return i;
  }
  HWY_DASSERT(false);
  return 0;
}

// Index of the first set lane, or -1 if none.
template <class D>
HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (mask.bits[i] != 0) return static_cast<intptr_t>(i);
  }
  return intptr_t{-1};
}

// Index of the last set lane; caller guarantees at least one lane is set.
template <class D>
HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
  for (intptr_t i = static_cast<intptr_t>(MaxLanes(d) - 1); i >= 0; i--) {
    if (mask.bits[i] != 0) return static_cast<size_t>(i);
  }
  HWY_DASSERT(false);
  return 0;
}

// Index of the last set lane, or -1 if none.
template <class D>
HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
  for (intptr_t i = static_cast<intptr_t>(MaxLanes(d) - 1); i >= 0; i--) {
    if (mask.bits[i] != 0) return i;
  }
  return intptr_t{-1};
}

// ------------------------------ Compress

template <typename T>
struct CompressIsPartition {
  enum { value = (sizeof(T) != 1) };
};

// Moves lanes whose mask is set to the front (in order), then the remaining
// lanes (also in order) - i.e. a stable partition.
template <typename T, size_t N>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  size_t count = 0;
  Vec128<T, N> ret;
  for (size_t i = 0; i < N; ++i) {
    if (mask.bits[i]) {
      ret.raw[count++] = v.raw[i];
    }
  }
  for (size_t i = 0; i < N; ++i) {
    if (!mask.bits[i]) {
      ret.raw[count++] = v.raw[i];
    }
  }
  HWY_DASSERT(count == N);
  return ret;
}

// ------------------------------ Expand

// Could also just allow generic_ops-inl.h to implement these, but use our
// simple implementation below to ensure the test is correct.
#ifdef HWY_NATIVE_EXPAND
#undef HWY_NATIVE_EXPAND
#else
#define HWY_NATIVE_EXPAND
#endif

// Inverse of Compress: consecutive input lanes are scattered to the set mask
// positions; unset positions become zero.
template <typename T, size_t N>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, const Mask128<T, N> mask) {
  size_t in_pos = 0;
  Vec128<T, N> ret;
  for (size_t i = 0; i < N; ++i) {
    if (mask.bits[i]) {
      ret.raw[i] = v.raw[in_pos++];
    } else {
      ret.raw[i] = ConvertScalarTo<T>(0);
    }
  }
  return ret;
}

// ------------------------------ LoadExpand

// As Expand, but reads consecutive source lanes directly from memory.
template <class D>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  size_t in_pos = 0;
  VFromD<D> ret;
  for (size_t i = 0; i < Lanes(d); ++i) {
    if (mask.bits[i]) {
      ret.raw[i] = unaligned[in_pos++];
    } else {
      ret.raw[i] = TFromD<D>();  // zero, also works for float16_t
    }
  }
  return ret;
}

// ------------------------------ CompressNot
// As Compress, but with the mask inverted: unset lanes come first.
template <typename T, size_t N>
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  size_t count = 0;
  Vec128<T, N> ret;
  for (size_t i = 0; i < N; ++i) {
    if (!mask.bits[i]) {
      ret.raw[count++] = v.raw[i];
    }
  }
  for (size_t i = 0; i < N; ++i) {
    if (mask.bits[i]) {
      ret.raw[count++] = v.raw[i];
    }
  }
  HWY_DASSERT(count == N);
  return ret;
}

// ------------------------------ CompressBlocksNot
// Single block: nothing to compress.
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
                                           Mask128<uint64_t> /* m */) {
  return v;
}

// ------------------------------ CompressBits
// As Compress, with the mask given as packed bits.
template <typename T, size_t N>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
                                  const uint8_t* HWY_RESTRICT bits) {
  return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
}

// ------------------------------ CompressStore

// generic_ops-inl defines the 8-bit versions.
// Stores only the lanes whose mask is set; returns how many were stored.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
  size_t count = 0;
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (mask.bits[i]) {
      unaligned[count++] = v.raw[i];
    }
  }
  return count;
}

// ------------------------------ CompressBlendedStore
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> mask, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  return CompressStore(v, mask, d, unaligned);
}

// ------------------------------ CompressBitsStore
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  const MFromD<D> mask = LoadMaskBits(d, bits);
  StoreU(Compress(v, mask), d, unaligned);
  return CountTrue(d, mask);
}

// ------------------------------ Additional mask logical operations
template <class T>
HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
  return mask;
}

// Sets all lanes at or after the first set lane.
template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
  using TU = hwy::MakeUnsigned<T>;

  Mask128<T, N> result;
  // Running OR: becomes all-ones at the first set lane and stays set.
  TU result_lane_mask{0};
  for (size_t i = 0; i < N; i++) {
    result_lane_mask = static_cast<TU>(result_lane_mask | mask.bits[i]);
    result.bits[i] = result_lane_mask;
  }
  return result;
}

// Sets all lanes strictly before the first set lane.
template <class T, size_t N>
HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
  return Not(SetAtOrAfterFirst(mask));
}

// Sets only the first set lane (all other lanes cleared).
template <class T, size_t N>
HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
  using TU = hwy::MakeUnsigned<T>;
  using TI = hwy::MakeSigned<T>;

  Mask128<T, N> result;
  // All-ones until (and including) the first set lane, then all-zero.
  TU result_lane_mask = static_cast<TU>(~TU{0});
  for (size_t i = 0; i < N; i++) {
    const auto curr_lane_mask_bits = mask.bits[i];
    result.bits[i] = static_cast<TU>(curr_lane_mask_bits & result_lane_mask);
    // -(bits==0) is all-ones while no set lane has been seen yet, else zero.
    result_lane_mask =
        static_cast<TU>(result_lane_mask &
                        static_cast<TU>(-static_cast<TI>(mask.bits[i] == 0)));
  }
  return result;
}

// Sets all lanes at or before the first set lane.
template <class T, size_t N>
HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
  using TU = hwy::MakeUnsigned<T>;
  using TI = hwy::MakeSigned<T>;

  Mask128<T, N> result;
  TU result_lane_mask = static_cast<TU>(~TU{0});
  for (size_t i = 0; i < N; i++) {
    result.bits[i] = result_lane_mask;
    // -(bits==0) is all-ones while no set lane has been seen yet, else zero.
    result_lane_mask =
        static_cast<TU>(result_lane_mask &
                        static_cast<TU>(-static_cast<TI>(mask.bits[i] == 0)));
  }
  return result;
}

// ------------------------------ WidenMulPairwiseAdd

template <class DF, HWY_IF_F32_D(DF), class VBF>
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
}

template <class D, HWY_IF_UI32_D(D), class V16>
HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, V16 a, V16 b) {
  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b),
                Mul(PromoteOddTo(d32, a), PromoteOddTo(d32, b)));
}

// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

// Accumulates even products into the returned value and odd products into
// sum1 (in/out parameter).
template <class D, HWY_IF_UI32_D(D), class V16>
HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, V16 a, V16 b,
                                            const VFromD<D> sum0,
                                            VFromD<D>& sum1) {
  sum1 = MulAdd(PromoteOddTo(d32, a), PromoteOddTo(d32, b), sum1);
  return MulAdd(PromoteEvenTo(d32, a), PromoteEvenTo(d32, b), sum0);
}

// ------------------------------ RearrangeToOddPlusEven
template <class VW>
HWY_API VW RearrangeToOddPlusEven(VW sum0, VW sum1) {
  return Add(sum0, sum1);
}

// ================================================== REDUCTIONS

#ifdef HWY_NATIVE_REDUCE_SCALAR
#undef HWY_NATIVE_REDUCE_SCALAR
#else
#define HWY_NATIVE_REDUCE_SCALAR
#endif

// Sum of all lanes.
template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
HWY_API T ReduceSum(D d, VFromD<D> v) {
  T sum = T{0};
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    sum += v.raw[i];
  }
  return sum;
}

// Minimum of all lanes.
template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
HWY_API T ReduceMin(D d, VFromD<D> v) {
  T min = PositiveInfOrHighestValue<T>();
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    min = HWY_MIN(min, v.raw[i]);
  }
  return min;
}
// Maximum of all lanes.
template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
HWY_API T ReduceMax(D d, VFromD<D> v) {
  T max = NegativeInfOrLowestValue<T>();
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    max = HWY_MAX(max, v.raw[i]);
  }
  return max;
}

// ------------------------------ SumOfLanes

// Broadcasts the reduction result to all lanes.
template <class D, HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  return Set(d, ReduceSum(d, v));
}
template <class D, HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
  return Set(d, ReduceMin(d, v));
}
template <class D, HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
  return Set(d, ReduceMax(d, v));
}

// ================================================== OPS WITH DEPENDENCIES

// ------------------------------ MulEven/Odd 64x64 (UpperHalf)

// Full 128-bit product of the even (lane 0) 64-bit lanes; low half in lane 0,
// high half in lane 1.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
  alignas(16) T mul[2];
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
  return Load(Full128<T>(), mul);
}

// Full 128-bit product of the odd (lane 1) 64-bit lanes.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
  alignas(16) T mul[2];
  const Half<Full128<T>> d2;
  mul[0] =
      Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
  return Load(Full128<T>(), mul);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();