// scalar-inl.h
1 // Copyright 2019 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // Single-element vectors and operations. 17 // External include guard in highway.h - see comment there. 18 19 #include <stdint.h> 20 #ifndef HWY_NO_LIBCXX 21 #include <math.h> // sqrtf 22 #endif 23 24 #include "hwy/ops/shared-inl.h" 25 26 HWY_BEFORE_NAMESPACE(); 27 namespace hwy { 28 namespace HWY_NAMESPACE { 29 30 // Single instruction, single data. 31 template <typename T> 32 using Sisd = Simd<T, 1, 0>; 33 34 // (Wrapper class required for overloading comparison operators.) 
35 template <typename T> 36 struct Vec1 { 37 using PrivateT = T; // only for DFromV 38 static constexpr size_t kPrivateN = 1; // only for DFromV 39 40 HWY_INLINE Vec1() = default; 41 Vec1(const Vec1&) = default; 42 Vec1& operator=(const Vec1&) = default; 43 HWY_INLINE explicit Vec1(const T t) : raw(t) {} 44 45 HWY_INLINE Vec1& operator*=(const Vec1 other) { 46 return *this = (*this * other); 47 } 48 HWY_INLINE Vec1& operator/=(const Vec1 other) { 49 return *this = (*this / other); 50 } 51 HWY_INLINE Vec1& operator+=(const Vec1 other) { 52 return *this = (*this + other); 53 } 54 HWY_INLINE Vec1& operator-=(const Vec1 other) { 55 return *this = (*this - other); 56 } 57 HWY_INLINE Vec1& operator%=(const Vec1 other) { 58 return *this = (*this % other); 59 } 60 HWY_INLINE Vec1& operator&=(const Vec1 other) { 61 return *this = (*this & other); 62 } 63 HWY_INLINE Vec1& operator|=(const Vec1 other) { 64 return *this = (*this | other); 65 } 66 HWY_INLINE Vec1& operator^=(const Vec1 other) { 67 return *this = (*this ^ other); 68 } 69 70 T raw; 71 }; 72 73 // 0 or FF..FF, same size as Vec1. 74 template <typename T> 75 struct Mask1 { 76 using Raw = hwy::MakeUnsigned<T>; 77 78 using PrivateT = T; // only for DFromM 79 static constexpr size_t kPrivateN = 1; // only for DFromM 80 81 static HWY_INLINE Mask1<T> FromBool(bool b) { 82 Mask1<T> mask; 83 mask.bits = b ? 
static_cast<Raw>(~Raw{0}) : 0; 84 return mask; 85 } 86 87 Raw bits; 88 }; 89 90 template <class V> 91 using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>; 92 93 template <class M> 94 using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>; 95 96 template <class V> 97 using TFromV = typename V::PrivateT; 98 99 // ------------------------------ BitCast 100 101 template <class DTo, typename TTo = TFromD<DTo>, typename TFrom> 102 HWY_API Vec1<TTo> BitCast(DTo /* tag */, Vec1<TFrom> v) { 103 static_assert(sizeof(TTo) <= sizeof(TFrom), "Promoting is undefined"); 104 TTo to; 105 CopyBytes<sizeof(TTo)>(&v.raw, &to); // not same size - ok to shrink 106 return Vec1<TTo>(to); 107 } 108 109 // ------------------------------ Zero 110 111 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> 112 HWY_API Vec1<T> Zero(D /* tag */) { 113 return Vec1<T>(ConvertScalarTo<T>(0)); 114 } 115 116 template <class D> 117 using VFromD = decltype(Zero(D())); 118 119 // ------------------------------ Set 120 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2> 121 HWY_API Vec1<T> Set(D /* tag */, const T2 t) { 122 return Vec1<T>(static_cast<T>(t)); 123 } 124 125 // ------------------------------ Undefined 126 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> 127 HWY_API Vec1<T> Undefined(D d) { 128 return Zero(d); 129 } 130 131 // ------------------------------ Iota 132 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename T2> 133 HWY_API Vec1<T> Iota(const D /* tag */, const T2 first) { 134 return Vec1<T>(static_cast<T>(first)); 135 } 136 137 // ------------------------------ ResizeBitCast 138 139 template <class D, typename FromV> 140 HWY_API VFromD<D> ResizeBitCast(D /* tag */, FromV v) { 141 using TFrom = TFromV<FromV>; 142 using TTo = TFromD<D>; 143 constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo)); 144 TTo to{}; 145 CopyBytes<kCopyLen>(&v.raw, &to); 146 return VFromD<D>(to); 147 } 148 149 
namespace detail { 150 151 // ResizeBitCast on the HWY_SCALAR target has zero-extending semantics if 152 // sizeof(TFromD<DTo>) is greater than sizeof(TFromV<FromV>) 153 template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom> 154 HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */, 155 ToSizeTag /* to_size_tag */, 156 DTo d_to, DFrom /*d_from*/, 157 VFromD<DFrom> v) { 158 return ResizeBitCast(d_to, v); 159 } 160 161 } // namespace detail 162 163 // ------------------------------ Dup128VecFromValues 164 165 template <class D, HWY_IF_T_SIZE_D(D, 1)> 166 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/, 167 TFromD<D> /*t2*/, TFromD<D> /*t3*/, 168 TFromD<D> /*t4*/, TFromD<D> /*t5*/, 169 TFromD<D> /*t6*/, TFromD<D> /*t7*/, 170 TFromD<D> /*t8*/, TFromD<D> /*t9*/, 171 TFromD<D> /*t10*/, TFromD<D> /*t11*/, 172 TFromD<D> /*t12*/, TFromD<D> /*t13*/, 173 TFromD<D> /*t14*/, TFromD<D> /*t15*/) { 174 return VFromD<D>(t0); 175 } 176 177 template <class D, HWY_IF_T_SIZE_D(D, 2)> 178 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/, 179 TFromD<D> /*t2*/, TFromD<D> /*t3*/, 180 TFromD<D> /*t4*/, TFromD<D> /*t5*/, 181 TFromD<D> /*t6*/, TFromD<D> /*t7*/) { 182 return VFromD<D>(t0); 183 } 184 185 template <class D, HWY_IF_T_SIZE_D(D, 4)> 186 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/, 187 TFromD<D> /*t2*/, TFromD<D> /*t3*/) { 188 return VFromD<D>(t0); 189 } 190 191 template <class D, HWY_IF_T_SIZE_D(D, 8)> 192 HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/) { 193 return VFromD<D>(t0); 194 } 195 196 // ================================================== LOGICAL 197 198 // ------------------------------ Not 199 200 template <typename T> 201 HWY_API Vec1<T> Not(const Vec1<T> v) { 202 using TU = MakeUnsigned<T>; 203 const Sisd<TU> du; 204 return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw))); 205 } 206 207 // 
------------------------------ And 208 209 template <typename T> 210 HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) { 211 using TU = MakeUnsigned<T>; 212 const Sisd<TU> du; 213 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw)); 214 } 215 template <typename T> 216 HWY_API Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) { 217 return And(a, b); 218 } 219 220 // ------------------------------ AndNot 221 222 template <typename T> 223 HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) { 224 using TU = MakeUnsigned<T>; 225 const Sisd<TU> du; 226 return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw & 227 BitCast(du, b).raw))); 228 } 229 230 // ------------------------------ Or 231 232 template <typename T> 233 HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) { 234 using TU = MakeUnsigned<T>; 235 const Sisd<TU> du; 236 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw)); 237 } 238 template <typename T> 239 HWY_API Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) { 240 return Or(a, b); 241 } 242 243 // ------------------------------ Xor 244 245 template <typename T> 246 HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) { 247 using TU = MakeUnsigned<T>; 248 const Sisd<TU> du; 249 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw)); 250 } 251 template <typename T> 252 HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) { 253 return Xor(a, b); 254 } 255 256 // ------------------------------ Xor3 257 258 template <typename T> 259 HWY_API Vec1<T> Xor3(Vec1<T> x1, Vec1<T> x2, Vec1<T> x3) { 260 return Xor(x1, Xor(x2, x3)); 261 } 262 263 // ------------------------------ Or3 264 265 template <typename T> 266 HWY_API Vec1<T> Or3(Vec1<T> o1, Vec1<T> o2, Vec1<T> o3) { 267 return Or(o1, Or(o2, o3)); 268 } 269 270 // ------------------------------ OrAnd 271 272 template <typename T> 273 HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> 
a2) { 274 return Or(o, And(a1, a2)); 275 } 276 277 // ------------------------------ Mask 278 279 template <class DTo, typename TTo = TFromD<DTo>, typename TFrom> 280 HWY_API Mask1<TTo> RebindMask(DTo /*tag*/, Mask1<TFrom> m) { 281 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); 282 return Mask1<TTo>{m.bits}; 283 } 284 285 // v must be 0 or FF..FF. 286 template <typename T> 287 HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) { 288 Mask1<T> mask; 289 CopySameSize(&v, &mask); 290 return mask; 291 } 292 293 template <class D> 294 using MFromD = decltype(MaskFromVec(VFromD<D>())); 295 296 template <class D, typename T = TFromD<D>> 297 Vec1<T> VecFromMask(D /* tag */, const Mask1<T> mask) { 298 Vec1<T> v; 299 CopySameSize(&mask, &v); 300 return v; 301 } 302 303 template <class D> 304 uint64_t BitsFromMask(D, MFromD<D> mask) { 305 return mask.bits ? 1 : 0; 306 } 307 308 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> 309 HWY_API Mask1<T> FirstN(D /*tag*/, size_t n) { 310 return Mask1<T>::FromBool(n != 0); 311 } 312 313 #ifdef HWY_NATIVE_SET_MASK 314 #undef HWY_NATIVE_SET_MASK 315 #else 316 #define HWY_NATIVE_SET_MASK 317 #endif 318 319 template <class D> 320 HWY_API MFromD<D> SetMask(D /*d*/, bool val) { 321 return MFromD<D>::FromBool(val); 322 } 323 324 // ------------------------------ IfVecThenElse 325 template <typename T> 326 HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) { 327 return IfThenElse(MaskFromVec(mask), yes, no); 328 } 329 330 // ------------------------------ CopySign 331 template <typename T> 332 HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) { 333 static_assert(IsFloat<T>(), "Only makes sense for floating-point"); 334 const DFromV<decltype(magn)> d; 335 return BitwiseIfThenElse(SignBit(d), sign, magn); 336 } 337 338 // ------------------------------ CopySignToAbs 339 template <typename T> 340 HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) { 341 
static_assert(IsFloat<T>(), "Only makes sense for floating-point"); 342 const Sisd<T> d; 343 return OrAnd(abs, SignBit(d), sign); 344 } 345 346 // ------------------------------ BroadcastSignBit 347 template <typename T> 348 HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) { 349 return Vec1<T>(ScalarShr(v.raw, sizeof(T) * 8 - 1)); 350 } 351 352 // ------------------------------ PopulationCount 353 354 #ifdef HWY_NATIVE_POPCNT 355 #undef HWY_NATIVE_POPCNT 356 #else 357 #define HWY_NATIVE_POPCNT 358 #endif 359 360 template <typename T> 361 HWY_API Vec1<T> PopulationCount(Vec1<T> v) { 362 return Vec1<T>(static_cast<T>(PopCount(v.raw))); 363 } 364 365 // ------------------------------ IfThenElse 366 367 // Returns mask ? yes : no. 368 template <typename T> 369 HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes, 370 const Vec1<T> no) { 371 return mask.bits ? yes : no; 372 } 373 374 template <typename T> 375 HWY_API Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) { 376 return mask.bits ? yes : Vec1<T>(ConvertScalarTo<T>(0)); 377 } 378 379 template <typename T> 380 HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) { 381 return mask.bits ? Vec1<T>(ConvertScalarTo<T>(0)) : no; 382 } 383 384 template <typename T> 385 HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) { 386 const DFromV<decltype(v)> d; 387 const RebindToSigned<decltype(d)> di; 388 const auto vi = BitCast(di, v); 389 390 return vi.raw < 0 ? 
yes : no; 391 } 392 393 // ------------------------------ Mask logical 394 395 template <typename T> 396 HWY_API Mask1<T> Not(const Mask1<T> m) { 397 return MaskFromVec(Not(VecFromMask(Sisd<T>(), m))); 398 } 399 400 template <typename T> 401 HWY_API Mask1<T> And(const Mask1<T> a, Mask1<T> b) { 402 const Sisd<T> d; 403 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); 404 } 405 406 template <typename T> 407 HWY_API Mask1<T> AndNot(const Mask1<T> a, Mask1<T> b) { 408 const Sisd<T> d; 409 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); 410 } 411 412 template <typename T> 413 HWY_API Mask1<T> Or(const Mask1<T> a, Mask1<T> b) { 414 const Sisd<T> d; 415 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); 416 } 417 418 template <typename T> 419 HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) { 420 const Sisd<T> d; 421 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); 422 } 423 424 template <typename T> 425 HWY_API Mask1<T> ExclusiveNeither(const Mask1<T> a, Mask1<T> b) { 426 const Sisd<T> d; 427 return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); 428 } 429 430 template <class T> 431 HWY_API Mask1<T> SetAtOrAfterFirst(Mask1<T> mask) { 432 return mask; 433 } 434 435 template <class T> 436 HWY_API Mask1<T> SetBeforeFirst(Mask1<T> mask) { 437 return Not(mask); 438 } 439 440 template <class T> 441 HWY_API Mask1<T> SetOnlyFirst(Mask1<T> mask) { 442 return mask; 443 } 444 445 template <class T> 446 HWY_API Mask1<T> SetAtOrBeforeFirst(Mask1<T> /*mask*/) { 447 return Mask1<T>::FromBool(true); 448 } 449 450 // ------------------------------ LowerHalfOfMask 451 452 #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK 453 #undef HWY_NATIVE_LOWER_HALF_OF_MASK 454 #else 455 #define HWY_NATIVE_LOWER_HALF_OF_MASK 456 #endif 457 458 template <class D> 459 HWY_API MFromD<D> LowerHalfOfMask(D /*d*/, MFromD<D> m) { 460 return m; 461 } 462 463 // ================================================== SHIFTS 464 465 // 
------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit) 466 467 template <int kBits, typename T> 468 HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) { 469 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); 470 return Vec1<T>( 471 static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits)); 472 } 473 474 template <int kBits, typename T> 475 HWY_API Vec1<T> ShiftRight(const Vec1<T> v) { 476 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); 477 return Vec1<T>(ScalarShr(v.raw, kBits)); 478 } 479 480 // ------------------------------ RotateRight (ShiftRight) 481 template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 482 HWY_API Vec1<T> RotateRight(const Vec1<T> v) { 483 const DFromV<decltype(v)> d; 484 const RebindToUnsigned<decltype(d)> du; 485 486 constexpr size_t kSizeInBits = sizeof(T) * 8; 487 static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); 488 if (kBits == 0) return v; 489 490 return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))), 491 ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v)); 492 } 493 494 // ------------------------------ ShiftLeftSame (BroadcastSignBit) 495 496 template <typename T> 497 HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) { 498 return Vec1<T>( 499 static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits)); 500 } 501 502 template <typename T> 503 HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) { 504 return Vec1<T>(ScalarShr(v.raw, bits)); 505 } 506 507 // ------------------------------ Shl 508 509 // Single-lane => same as ShiftLeftSame except for the argument type. 
510 template <typename T> 511 HWY_API Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) { 512 return ShiftLeftSame(v, static_cast<int>(bits.raw)); 513 } 514 515 template <typename T> 516 HWY_API Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) { 517 return ShiftRightSame(v, static_cast<int>(bits.raw)); 518 } 519 520 // ================================================== ARITHMETIC 521 522 template <typename T> 523 HWY_API Vec1<T> operator+(Vec1<T> a, Vec1<T> b) { 524 const uint64_t a64 = static_cast<uint64_t>(a.raw); 525 const uint64_t b64 = static_cast<uint64_t>(b.raw); 526 return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)))); 527 } 528 HWY_API Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) { 529 return Vec1<float>(a.raw + b.raw); 530 } 531 HWY_API Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) { 532 return Vec1<double>(a.raw + b.raw); 533 } 534 535 template <typename T> 536 HWY_API Vec1<T> operator-(Vec1<T> a, Vec1<T> b) { 537 const uint64_t a64 = static_cast<uint64_t>(a.raw); 538 const uint64_t b64 = static_cast<uint64_t>(b.raw); 539 return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)))); 540 } 541 HWY_API Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) { 542 return Vec1<float>(a.raw - b.raw); 543 } 544 HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) { 545 return Vec1<double>(a.raw - b.raw); 546 } 547 548 // ------------------------------ SumsOf8 549 550 HWY_API Vec1<int64_t> SumsOf8(const Vec1<int8_t> v) { 551 return Vec1<int64_t>(v.raw); 552 } 553 HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) { 554 return Vec1<uint64_t>(v.raw); 555 } 556 557 // ------------------------------ SumsOf2 558 559 template <class T> 560 HWY_API Vec1<MakeWide<T>> SumsOf2(const Vec1<T> v) { 561 const DFromV<decltype(v)> d; 562 const Rebind<MakeWide<T>, decltype(d)> dw; 563 return PromoteTo(dw, v); 564 } 565 566 // ------------------------------ 
SaturatedAdd 567 568 // Returns a + b clamped to the destination range. 569 570 // Unsigned 571 HWY_API Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a, 572 const Vec1<uint8_t> b) { 573 return Vec1<uint8_t>( 574 static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255))); 575 } 576 HWY_API Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a, 577 const Vec1<uint16_t> b) { 578 return Vec1<uint16_t>(static_cast<uint16_t>( 579 HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) + b.raw), 65535))); 580 } 581 582 // Signed 583 HWY_API Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a, const Vec1<int8_t> b) { 584 return Vec1<int8_t>( 585 static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127))); 586 } 587 HWY_API Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a, 588 const Vec1<int16_t> b) { 589 return Vec1<int16_t>(static_cast<int16_t>( 590 HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) + b.raw), 32767))); 591 } 592 593 // ------------------------------ Saturating subtraction 594 595 // Returns a - b clamped to the destination range. 
596 597 // Unsigned 598 HWY_API Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a, 599 const Vec1<uint8_t> b) { 600 return Vec1<uint8_t>( 601 static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255))); 602 } 603 HWY_API Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a, 604 const Vec1<uint16_t> b) { 605 return Vec1<uint16_t>(static_cast<uint16_t>( 606 HWY_MIN(HWY_MAX(0, static_cast<int32_t>(a.raw) - b.raw), 65535))); 607 } 608 609 // Signed 610 HWY_API Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a, const Vec1<int8_t> b) { 611 return Vec1<int8_t>( 612 static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127))); 613 } 614 HWY_API Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a, 615 const Vec1<int16_t> b) { 616 return Vec1<int16_t>(static_cast<int16_t>( 617 HWY_MIN(HWY_MAX(-32768, static_cast<int32_t>(a.raw) - b.raw), 32767))); 618 } 619 620 // ------------------------------ Average 621 622 // Returns (a + b + 1) / 2 623 624 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32 625 #undef HWY_NATIVE_AVERAGE_ROUND_UI32 626 #else 627 #define HWY_NATIVE_AVERAGE_ROUND_UI32 628 #endif 629 630 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI64 631 #undef HWY_NATIVE_AVERAGE_ROUND_UI64 632 #else 633 #define HWY_NATIVE_AVERAGE_ROUND_UI64 634 #endif 635 636 template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 637 HWY_API Vec1<T> AverageRound(const Vec1<T> a, const Vec1<T> b) { 638 const T a_val = a.raw; 639 const T b_val = b.raw; 640 return Vec1<T>(static_cast<T>((a_val | b_val) - ScalarShr(a_val ^ b_val, 1))); 641 } 642 643 // ------------------------------ Absolute value 644 645 template <typename T> 646 HWY_API Vec1<T> Abs(const Vec1<T> a) { 647 return Vec1<T>(ScalarAbs(a.raw)); 648 } 649 650 // ------------------------------ Min/Max 651 652 // <cmath> may be unavailable, so implement our own. 
653 654 template <typename T, HWY_IF_NOT_FLOAT(T)> 655 HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) { 656 return Vec1<T>(HWY_MIN(a.raw, b.raw)); 657 } 658 659 template <typename T, HWY_IF_FLOAT(T)> 660 HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) { 661 if (ScalarIsNaN(a.raw)) return b; 662 if (ScalarIsNaN(b.raw)) return a; 663 return Vec1<T>(HWY_MIN(a.raw, b.raw)); 664 } 665 666 template <typename T, HWY_IF_NOT_FLOAT(T)> 667 HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) { 668 return Vec1<T>(HWY_MAX(a.raw, b.raw)); 669 } 670 671 template <typename T, HWY_IF_FLOAT(T)> 672 HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) { 673 if (ScalarIsNaN(a.raw)) return b; 674 if (ScalarIsNaN(b.raw)) return a; 675 return Vec1<T>(HWY_MAX(a.raw, b.raw)); 676 } 677 678 // ------------------------------ Floating-point negate 679 680 template <typename T, HWY_IF_FLOAT_OR_SPECIAL(T)> 681 HWY_API Vec1<T> Neg(const Vec1<T> v) { 682 return Xor(v, SignBit(Sisd<T>())); 683 } 684 685 template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 686 HWY_API Vec1<T> Neg(const Vec1<T> v) { 687 return Zero(Sisd<T>()) - v; 688 } 689 690 // ------------------------------ mul/div 691 692 // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. 
693 #ifdef HWY_NATIVE_MUL_8 694 #undef HWY_NATIVE_MUL_8 695 #else 696 #define HWY_NATIVE_MUL_8 697 #endif 698 #ifdef HWY_NATIVE_MUL_64 699 #undef HWY_NATIVE_MUL_64 700 #else 701 #define HWY_NATIVE_MUL_64 702 #endif 703 704 template <typename T, HWY_IF_FLOAT(T)> 705 HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) { 706 return Vec1<T>(static_cast<T>(double{a.raw} * b.raw)); 707 } 708 709 template <typename T, HWY_IF_NOT_FLOAT(T)> 710 HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) { 711 return Vec1<T>(static_cast<T>(static_cast<uint64_t>(a.raw) * 712 static_cast<uint64_t>(b.raw))); 713 } 714 715 template <typename T, HWY_IF_FLOAT(T)> 716 HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) { 717 return Vec1<T>(a.raw / b.raw); 718 } 719 720 // Returns the upper sizeof(T)*8 bits of a * b in each lane. 721 template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)), 722 HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 723 HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) { 724 using TW = MakeWide<T>; 725 return Vec1<T>(static_cast<T>( 726 (static_cast<TW>(a.raw) * static_cast<TW>(b.raw)) >> (sizeof(T) * 8))); 727 } 728 template <class T, HWY_IF_UI64(T)> 729 HWY_API Vec1<T> MulHigh(const Vec1<T> a, const Vec1<T> b) { 730 T hi; 731 Mul128(a.raw, b.raw, &hi); 732 return Vec1<T>(hi); 733 } 734 735 HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) { 736 return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw + 16384) >> 15)); 737 } 738 739 // Multiplies even lanes (0, 2 ..) and returns the double-wide result. 
740 template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)), 741 HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 742 HWY_API Vec1<MakeWide<T>> MulEven(const Vec1<T> a, const Vec1<T> b) { 743 using TW = MakeWide<T>; 744 const TW a_wide = a.raw; 745 return Vec1<TW>(static_cast<TW>(a_wide * b.raw)); 746 } 747 748 template <class T> 749 HWY_API Vec1<MakeWide<T>> MulOdd(const Vec1<T>, const Vec1<T>) { 750 static_assert(sizeof(T) == 0, "There are no odd lanes"); 751 } 752 753 // Approximate reciprocal 754 HWY_API Vec1<float> ApproximateReciprocal(const Vec1<float> v) { 755 // Zero inputs are allowed, but callers are responsible for replacing the 756 // return value with something else (typically using IfThenElse). This check 757 // avoids a ubsan error. The return value is arbitrary. 758 if (v.raw == 0.0f) return Vec1<float>(0.0f); 759 return Vec1<float>(1.0f / v.raw); 760 } 761 762 // generic_ops takes care of integer T. 763 template <typename T, HWY_IF_FLOAT(T)> 764 HWY_API Vec1<T> AbsDiff(const Vec1<T> a, const Vec1<T> b) { 765 return Abs(a - b); 766 } 767 768 // ------------------------------ Floating-point multiply-add variants 769 770 template <typename T, HWY_IF_FLOAT(T)> 771 HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) { 772 return mul * x + add; 773 } 774 775 template <typename T, HWY_IF_FLOAT(T)> 776 HWY_API Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x, 777 const Vec1<T> add) { 778 return add - mul * x; 779 } 780 781 template <typename T, HWY_IF_FLOAT(T)> 782 HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) { 783 return mul * x - sub; 784 } 785 786 template <typename T, HWY_IF_FLOAT(T)> 787 HWY_API Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x, 788 const Vec1<T> sub) { 789 return Neg(mul) * x - sub; 790 } 791 792 // ------------------------------ Floating-point square root 793 794 // Approximate reciprocal square root 795 HWY_API Vec1<float> ApproximateReciprocalSqrt(const 
Vec1<float> v) { 796 float f = v.raw; 797 const float half = f * 0.5f; 798 uint32_t bits; 799 CopySameSize(&f, &bits); 800 // Initial guess based on log2(f) 801 bits = 0x5F3759DF - (bits >> 1); 802 CopySameSize(&bits, &f); 803 // One Newton-Raphson iteration 804 return Vec1<float>(f * (1.5f - (half * f * f))); 805 } 806 807 // Square root 808 HWY_API Vec1<float> Sqrt(Vec1<float> v) { 809 #if defined(HWY_NO_LIBCXX) 810 #if HWY_COMPILER_GCC_ACTUAL 811 return Vec1<float>(__builtin_sqrt(v.raw)); 812 #else 813 uint32_t bits; 814 CopyBytes<sizeof(bits)>(&v, &bits); 815 // Coarse approximation, letting the exponent LSB leak into the mantissa 816 bits = (1 << 29) + (bits >> 1) - (1 << 22); 817 CopyBytes<sizeof(bits)>(&bits, &v); 818 return v; 819 #endif // !HWY_COMPILER_GCC_ACTUAL 820 #else 821 return Vec1<float>(sqrtf(v.raw)); 822 #endif // !HWY_NO_LIBCXX 823 } 824 HWY_API Vec1<double> Sqrt(Vec1<double> v) { 825 #if defined(HWY_NO_LIBCXX) 826 #if HWY_COMPILER_GCC_ACTUAL 827 return Vec1<double>(__builtin_sqrt(v.raw)); 828 #else 829 uint64_t bits; 830 CopyBytes<sizeof(bits)>(&v, &bits); 831 // Coarse approximation, letting the exponent LSB leak into the mantissa 832 bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51); 833 CopyBytes<sizeof(bits)>(&bits, &v); 834 return v; 835 #endif // !HWY_COMPILER_GCC_ACTUAL 836 #else 837 return Vec1<double>(sqrt(v.raw)); 838 #endif // HWY_NO_LIBCXX 839 } 840 841 // ------------------------------ Floating-point rounding 842 843 template <typename T> 844 HWY_API Vec1<T> Round(const Vec1<T> v) { 845 using TI = MakeSigned<T>; 846 if (!(Abs(v).raw < MantissaEnd<T>())) { // Huge or NaN 847 return v; 848 } 849 const T k0 = ConvertScalarTo<T>(0); 850 const T bias = ConvertScalarTo<T>(v.raw < k0 ? 
-0.5 : 0.5); 851 const TI rounded = ConvertScalarTo<TI>(v.raw + bias); 852 if (rounded == 0) return CopySignToAbs(Vec1<T>(k0), v); 853 TI offset = 0; 854 // Round to even 855 if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) == 856 ConvertScalarTo<T>(0.5)) { 857 offset = v.raw < k0 ? -1 : 1; 858 } 859 return Vec1<T>(ConvertScalarTo<T>(rounded - offset)); 860 } 861 862 // Round-to-nearest even. 863 template <class T, HWY_IF_FLOAT3264(T)> 864 HWY_API Vec1<MakeSigned<T>> NearestInt(const Vec1<T> v) { 865 using TI = MakeSigned<T>; 866 867 const T abs = Abs(v).raw; 868 const bool is_sign = ScalarSignBit(v.raw); 869 870 if (!(abs < MantissaEnd<T>())) { // Huge or NaN 871 // Check if too large to cast or NaN 872 if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) { 873 return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>()); 874 } 875 return Vec1<TI>(ConvertScalarTo<TI>(v.raw)); 876 } 877 const T bias = 878 ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? -0.5 : 0.5); 879 const TI rounded = ConvertScalarTo<TI>(v.raw + bias); 880 if (rounded == 0) return Vec1<TI>(0); 881 TI offset = 0; 882 // Round to even 883 if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) == 884 ConvertScalarTo<T>(0.5)) { 885 offset = is_sign ? -1 : 1; 886 } 887 return Vec1<TI>(rounded - offset); 888 } 889 890 // Round-to-nearest even. 891 template <class DI32, HWY_IF_I32_D(DI32)> 892 HWY_API VFromD<DI32> DemoteToNearestInt(DI32 /*di32*/, const Vec1<double> v) { 893 using T = double; 894 using TI = int32_t; 895 896 const T abs = Abs(v).raw; 897 const bool is_sign = ScalarSignBit(v.raw); 898 899 // Check if too large to cast or NaN 900 if (!(abs <= ConvertScalarTo<T>(LimitsMax<TI>()))) { 901 return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>()); 902 } 903 904 const T bias = 905 ConvertScalarTo<T>(v.raw < ConvertScalarTo<T>(0.0) ? 
-0.5 : 0.5); 906 const TI rounded = ConvertScalarTo<TI>(v.raw + bias); 907 if (rounded == 0) return Vec1<TI>(0); 908 TI offset = 0; 909 // Round to even 910 if ((rounded & 1) && ScalarAbs(ConvertScalarTo<T>(rounded) - v.raw) == 911 ConvertScalarTo<T>(0.5)) { 912 offset = is_sign ? -1 : 1; 913 } 914 return Vec1<TI>(rounded - offset); 915 } 916 917 template <typename T> 918 HWY_API Vec1<T> Trunc(const Vec1<T> v) { 919 using TI = MakeSigned<T>; 920 if (!(Abs(v).raw <= MantissaEnd<T>())) { // Huge or NaN 921 return v; 922 } 923 const TI truncated = ConvertScalarTo<TI>(v.raw); 924 if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v); 925 return Vec1<T>(ConvertScalarTo<T>(truncated)); 926 } 927 928 template <typename Float, typename Bits, int kMantissaBits, int kExponentBits, 929 class V> 930 V Ceiling(const V v) { 931 const Bits kExponentMask = (1ull << kExponentBits) - 1; 932 const Bits kMantissaMask = (1ull << kMantissaBits) - 1; 933 const Bits kBias = kExponentMask / 2; 934 935 Float f = v.raw; 936 const bool positive = f > Float(0.0); 937 938 Bits bits; 939 CopySameSize(&v, &bits); 940 941 const int exponent = 942 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias); 943 // Already an integer. 944 if (exponent >= kMantissaBits) return v; 945 // |v| <= 1 => 0 or 1. 946 if (exponent < 0) return positive ? 
V(1) : V(-0.0); 947 948 const Bits mantissa_mask = kMantissaMask >> exponent; 949 // Already an integer 950 if ((bits & mantissa_mask) == 0) return v; 951 952 // Clear fractional bits and round up 953 if (positive) bits += (kMantissaMask + 1) >> exponent; 954 bits &= ~mantissa_mask; 955 956 CopySameSize(&bits, &f); 957 return V(f); 958 } 959 960 template <typename Float, typename Bits, int kMantissaBits, int kExponentBits, 961 class V> 962 V Floor(const V v) { 963 const Bits kExponentMask = (1ull << kExponentBits) - 1; 964 const Bits kMantissaMask = (1ull << kMantissaBits) - 1; 965 const Bits kBias = kExponentMask / 2; 966 967 Float f = v.raw; 968 const bool negative = f < Float(0.0); 969 970 Bits bits; 971 CopySameSize(&v, &bits); 972 973 const int exponent = 974 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias); 975 // Already an integer. 976 if (exponent >= kMantissaBits) return v; 977 // |v| <= 1 => -1 or 0. 978 if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0)); 979 980 const Bits mantissa_mask = kMantissaMask >> exponent; 981 // Already an integer 982 if ((bits & mantissa_mask) == 0) return v; 983 984 // Clear fractional bits and round down 985 if (negative) bits += (kMantissaMask + 1) >> exponent; 986 bits &= ~mantissa_mask; 987 988 CopySameSize(&bits, &f); 989 return V(f); 990 } 991 992 // Toward +infinity, aka ceiling 993 HWY_API Vec1<float> Ceil(const Vec1<float> v) { 994 return Ceiling<float, uint32_t, 23, 8>(v); 995 } 996 HWY_API Vec1<double> Ceil(const Vec1<double> v) { 997 return Ceiling<double, uint64_t, 52, 11>(v); 998 } 999 1000 // Toward -infinity, aka floor 1001 HWY_API Vec1<float> Floor(const Vec1<float> v) { 1002 return Floor<float, uint32_t, 23, 8>(v); 1003 } 1004 HWY_API Vec1<double> Floor(const Vec1<double> v) { 1005 return Floor<double, uint64_t, 52, 11>(v); 1006 } 1007 1008 // ================================================== COMPARE 1009 1010 template <typename T> 1011 HWY_API Mask1<T> operator==(const 
Vec1<T> a, const Vec1<T> b) { 1012 return Mask1<T>::FromBool(a.raw == b.raw); 1013 } 1014 1015 template <typename T> 1016 HWY_API Mask1<T> operator!=(const Vec1<T> a, const Vec1<T> b) { 1017 return Mask1<T>::FromBool(a.raw != b.raw); 1018 } 1019 1020 template <typename T> 1021 HWY_API Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) { 1022 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); 1023 return (v & bit) == bit; 1024 } 1025 1026 template <typename T> 1027 HWY_API Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) { 1028 return Mask1<T>::FromBool(a.raw < b.raw); 1029 } 1030 template <typename T> 1031 HWY_API Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) { 1032 return Mask1<T>::FromBool(a.raw > b.raw); 1033 } 1034 1035 template <typename T> 1036 HWY_API Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) { 1037 return Mask1<T>::FromBool(a.raw <= b.raw); 1038 } 1039 template <typename T> 1040 HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) { 1041 return Mask1<T>::FromBool(a.raw >= b.raw); 1042 } 1043 1044 // ------------------------------ Floating-point classification (==) 1045 1046 template <typename T> 1047 HWY_API Mask1<T> IsNaN(const Vec1<T> v) { 1048 // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY. 1049 return Mask1<T>::FromBool(ScalarIsNaN(v.raw)); 1050 } 1051 1052 // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite. 1053 #ifdef HWY_NATIVE_ISINF 1054 #undef HWY_NATIVE_ISINF 1055 #else 1056 #define HWY_NATIVE_ISINF 1057 #endif 1058 1059 HWY_API Mask1<float> IsInf(const Vec1<float> v) { 1060 const Sisd<float> d; 1061 const RebindToUnsigned<decltype(d)> du; 1062 const Vec1<uint32_t> vu = BitCast(du, v); 1063 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. 
1064 return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u)); 1065 } 1066 HWY_API Mask1<double> IsInf(const Vec1<double> v) { 1067 const Sisd<double> d; 1068 const RebindToUnsigned<decltype(d)> du; 1069 const Vec1<uint64_t> vu = BitCast(du, v); 1070 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. 1071 return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull)); 1072 } 1073 1074 HWY_API Mask1<float> IsFinite(const Vec1<float> v) { 1075 const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v); 1076 // Shift left to clear the sign bit, check whether exponent != max value. 1077 return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u); 1078 } 1079 HWY_API Mask1<double> IsFinite(const Vec1<double> v) { 1080 const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v); 1081 // Shift left to clear the sign bit, check whether exponent != max value. 1082 return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull); 1083 } 1084 1085 // ================================================== MEMORY 1086 1087 // ------------------------------ Load 1088 1089 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> 1090 HWY_API Vec1<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) { 1091 T t; 1092 CopySameSize(aligned, &t); 1093 return Vec1<T>(t); 1094 } 1095 1096 template <class D, typename T = TFromD<D>> 1097 HWY_API Vec1<T> MaskedLoad(Mask1<T> m, D d, const T* HWY_RESTRICT aligned) { 1098 return IfThenElseZero(m, Load(d, aligned)); 1099 } 1100 1101 template <class D, typename T = TFromD<D>> 1102 HWY_API Vec1<T> MaskedLoadOr(Vec1<T> v, Mask1<T> m, D d, 1103 const T* HWY_RESTRICT aligned) { 1104 return IfThenElse(m, Load(d, aligned), v); 1105 } 1106 1107 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> 1108 HWY_API Vec1<T> LoadU(D d, const T* HWY_RESTRICT p) { 1109 return Load(d, p); 1110 } 1111 1112 // In some use cases, "load single lane" is sufficient; otherwise avoid this. 
1113 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> 1114 HWY_API Vec1<T> LoadDup128(D d, const T* HWY_RESTRICT aligned) { 1115 return Load(d, aligned); 1116 } 1117 1118 #ifdef HWY_NATIVE_LOAD_N 1119 #undef HWY_NATIVE_LOAD_N 1120 #else 1121 #define HWY_NATIVE_LOAD_N 1122 #endif 1123 1124 template <class D, typename T = TFromD<D>> 1125 HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p, 1126 size_t max_lanes_to_load) { 1127 return (max_lanes_to_load > 0) ? Load(d, p) : Zero(d); 1128 } 1129 1130 template <class D, typename T = TFromD<D>> 1131 HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p, 1132 size_t max_lanes_to_load) { 1133 return (max_lanes_to_load > 0) ? Load(d, p) : no; 1134 } 1135 1136 // ------------------------------ Store 1137 1138 template <class D, typename T = TFromD<D>> 1139 HWY_API void Store(const Vec1<T> v, D /* tag */, T* HWY_RESTRICT aligned) { 1140 CopySameSize(&v.raw, aligned); 1141 } 1142 1143 template <class D, typename T = TFromD<D>> 1144 HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT p) { 1145 return Store(v, d, p); 1146 } 1147 1148 template <class D, typename T = TFromD<D>> 1149 HWY_API void BlendedStore(const Vec1<T> v, Mask1<T> m, D d, T* HWY_RESTRICT p) { 1150 if (!m.bits) return; 1151 StoreU(v, d, p); 1152 } 1153 1154 #ifdef HWY_NATIVE_STORE_N 1155 #undef HWY_NATIVE_STORE_N 1156 #else 1157 #define HWY_NATIVE_STORE_N 1158 #endif 1159 1160 template <class D, typename T = TFromD<D>> 1161 HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p, 1162 size_t max_lanes_to_store) { 1163 if (max_lanes_to_store > 0) { 1164 Store(v, d, p); 1165 } 1166 } 1167 1168 // ------------------------------ Tuples 1169 #include "hwy/ops/inside-inl.h" 1170 1171 // ------------------------------ LoadInterleaved2/3/4 1172 1173 // Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2. 
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

// With a single lane, "interleaved" load is just consecutive scalar loads.
template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
                              Vec1<T>& v1) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
}

template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
                              Vec1<T>& v1, Vec1<T>& v2) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
}

template <class D, typename T = TFromD<D>>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, Vec1<T>& v0,
                              Vec1<T>& v1, Vec1<T>& v2, Vec1<T>& v3) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
}

// ------------------------------ StoreInterleaved2/3/4

template <class D, typename T = TFromD<D>>
HWY_API void StoreInterleaved2(const Vec1<T> v0, const Vec1<T> v1, D d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
}

template <class D, typename T = TFromD<D>>
HWY_API void StoreInterleaved3(const Vec1<T> v0, const Vec1<T> v1,
                               const Vec1<T> v2, D d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);
}

template <class D, typename T = TFromD<D>>
HWY_API void StoreInterleaved4(const Vec1<T> v0, const Vec1<T> v1,
                               const Vec1<T> v2, const Vec1<T> v3, D d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);
  StoreU(v3, d, unaligned + 3);
}

// ------------------------------ Stream

// Non-temporal store hint is meaningless for one lane; plain Store.
template <class D, typename T = TFromD<D>>
HWY_API void Stream(const Vec1<T> v, D d, T* HWY_RESTRICT aligned) {
  return Store(v, d, aligned);
}

// ------------------------------ Scatter

#ifdef HWY_NATIVE_SCATTER
#undef HWY_NATIVE_SCATTER
#else
#define HWY_NATIVE_SCATTER
#endif

// Stores v at byte offset `offset` from base (offset, not index).
template <class D, typename T = TFromD<D>, typename TI>
HWY_API void ScatterOffset(Vec1<T> v, D d, T* base, Vec1<TI> offset) {
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
  const intptr_t addr =
      reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
  Store(v, d, reinterpret_cast<T*>(addr));
}

// Stores v at element index `index` from base.
template <class D, typename T = TFromD<D>, typename TI>
HWY_API void ScatterIndex(Vec1<T> v, D d, T* HWY_RESTRICT base,
                          Vec1<TI> index) {
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
  Store(v, d, base + index.raw);
}

// As ScatterIndex, but only stores where the mask is true.
template <class D, typename T = TFromD<D>, typename TI>
HWY_API void MaskedScatterIndex(Vec1<T> v, Mask1<T> m, D d,
                                T* HWY_RESTRICT base, Vec1<TI> index) {
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
  if (m.bits) Store(v, d, base + index.raw);
}

// ------------------------------ Gather

#ifdef HWY_NATIVE_GATHER
#undef HWY_NATIVE_GATHER
#else
#define HWY_NATIVE_GATHER
#endif

// Loads from byte offset `offset` (must be non-negative) from base.
template <class D, typename T = TFromD<D>>
HWY_API Vec1<T> GatherOffset(D d, const T* base, Vec1<MakeSigned<T>> offset) {
  HWY_DASSERT(offset.raw >= 0);
  const intptr_t addr =
      reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
  return Load(d, reinterpret_cast<const T*>(addr));
}

// Loads from element index `index` (must be non-negative) from base.
template <class D, typename T = TFromD<D>>
HWY_API Vec1<T> GatherIndex(D d, const T* HWY_RESTRICT base,
                            Vec1<MakeSigned<T>> index) {
HWY_DASSERT(index.raw >= 0); 1289 return Load(d, base + index.raw); 1290 } 1291 1292 template <class D, typename T = TFromD<D>> 1293 HWY_API Vec1<T> MaskedGatherIndex(Mask1<T> m, D d, const T* HWY_RESTRICT base, 1294 Vec1<MakeSigned<T>> index) { 1295 HWY_DASSERT(index.raw >= 0); 1296 return MaskedLoad(m, d, base + index.raw); 1297 } 1298 1299 template <class D, typename T = TFromD<D>> 1300 HWY_API Vec1<T> MaskedGatherIndexOr(Vec1<T> no, Mask1<T> m, D d, 1301 const T* HWY_RESTRICT base, 1302 Vec1<MakeSigned<T>> index) { 1303 HWY_DASSERT(index.raw >= 0); 1304 return MaskedLoadOr(no, m, d, base + index.raw); 1305 } 1306 1307 // ================================================== CONVERT 1308 1309 // ConvertTo and DemoteTo with floating-point input and integer output truncate 1310 // (rounding toward zero). 1311 1312 namespace detail { 1313 1314 template <class ToT, class FromT> 1315 HWY_INLINE ToT CastValueForF2IConv(FromT val) { 1316 // Prevent ubsan errors when converting float to narrower integer 1317 1318 using FromTU = MakeUnsigned<FromT>; 1319 using ToTU = MakeUnsigned<ToT>; 1320 1321 constexpr unsigned kMaxExpField = 1322 static_cast<unsigned>(MaxExponentField<FromT>()); 1323 constexpr unsigned kExpBias = kMaxExpField >> 1; 1324 constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN( 1325 kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()), 1326 kMaxExpField)); 1327 1328 // If ToT is signed, compare only the exponent bits of val against 1329 // kMinOutOfRangeExpField. 1330 // 1331 // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of 1332 // val against kMinOutOfRangeExpField as a negative value is outside of the 1333 // range of an unsigned integer type. 1334 const FromT val_to_compare = 1335 static_cast<FromT>(IsSigned<ToT>() ? 
ScalarAbs(val) : val);

  // val is within the range of ToT if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
  // than kMinOutOfRangeExpField
  //
  // Otherwise, val is either outside of the range of ToT or equal to
  // LimitsMin<ToT>() if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
  // than or equal to kMinOutOfRangeExpField.

  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
             ? static_cast<ToT>(val)
             // Saturate: LimitsMax for positive overflow; adding the sign bit
             // wraps to LimitsMin for negative overflow.
             : static_cast<ToT>(static_cast<ToTU>(LimitsMax<ToT>()) +
                                static_cast<ToTU>(ScalarSignBit(val)));
}

// Generic promotion: plain scalar conversion (int->int, float->float).
template <class ToT, class ToTypeTag, class FromT>
HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
  return ConvertScalarTo<ToT>(val);
}

// float -> signed integer promotion must saturate, see CastValueForF2IConv.
template <class ToT>
HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/,
                                     float val) {
  return CastValueForF2IConv<ToT>(val);
}

// float -> unsigned integer promotion must saturate as well.
template <class ToT>
HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/,
                                     float val) {
  return CastValueForF2IConv<ToT>(val);
}

// If val is within the range of ToT, CastValueForInRangeF2IConv<ToT>(val)
// returns static_cast<ToT>(val)
//
// Otherwise, CastValueForInRangeF2IConv<ToT>(val) returns an
// implementation-defined result if val is not within the range of ToT.
template <class ToT, class FromT>
HWY_INLINE ToT CastValueForInRangeF2IConv(FromT val) {
  // Prevent ubsan errors when converting float to narrower integer

  using FromTU = MakeUnsigned<FromT>;

  constexpr unsigned kMaxExpField =
      static_cast<unsigned>(MaxExponentField<FromT>());
  constexpr unsigned kExpBias = kMaxExpField >> 1;
  constexpr unsigned kMinOutOfRangeExpField = static_cast<unsigned>(HWY_MIN(
      kExpBias + sizeof(ToT) * 8 - static_cast<unsigned>(IsSigned<ToT>()),
      kMaxExpField));

  // If ToT is signed, compare only the exponent bits of val against
  // kMinOutOfRangeExpField.
  //
  // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of
  // val against kMinOutOfRangeExpField as a negative value is outside of the
  // range of an unsigned integer type.
  const FromT val_to_compare =
      static_cast<FromT>(IsSigned<ToT>() ? ScalarAbs(val) : val);

  // val is within the range of ToT if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is less
  // than kMinOutOfRangeExpField
  //
  // Otherwise, val is either outside of the range of ToT or equal to
  // LimitsMin<ToT>() if
  // (BitCastScalar<FromTU>(val_to_compare) >> MantissaBits<FromT>()) is greater
  // than or equal to kMinOutOfRangeExpField.

  return (static_cast<unsigned>(BitCastScalar<FromTU>(val_to_compare) >>
                                MantissaBits<FromT>()) < kMinOutOfRangeExpField)
             ? static_cast<ToT>(val)
             : static_cast<ToT>(LimitsMin<ToT>());
}

}  // namespace detail

#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
#undef HWY_NATIVE_PROMOTE_F16_TO_F64
#else
#define HWY_NATIVE_PROMOTE_F16_TO_F64
#endif

// Widening conversion; dispatches on the destination type's category tag.
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
HWY_API Vec1<TTo> PromoteTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(sizeof(TTo) > sizeof(TFrom), "Not promoting");
  // For bits Y > X, floatX->floatY and intX->intY are always representable.
  return Vec1<TTo>(
      detail::CastValueForPromoteTo<TTo>(hwy::TypeTag<TTo>(), from.raw));
}

#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#endif

// float -> 64-bit integer; result is implementation-defined if out of range.
template <class DTo, HWY_IF_UI64_D(DTo)>
HWY_API VFromD<DTo> PromoteInRangeTo(DTo /* tag */, Vec1<float> from) {
  using TTo = TFromD<DTo>;
  return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(from.raw));
}

// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here,
// so we overload for TFrom=double and TTo={float,int32_t}.
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec1<float> DemoteTo(D /* tag */, Vec1<double> from) {
  // Prevent ubsan errors when converting float to narrower integer/float
  if (IsInf(from).bits ||
      Abs(from).raw > static_cast<double>(HighestValue<float>())) {
    // Saturate to the float range, preserving the sign.
    return Vec1<float>(ScalarSignBit(from.raw) ? LowestValue<float>()
                                               : HighestValue<float>());
  }
  return Vec1<float>(static_cast<float>(from.raw));
}
template <class D, HWY_IF_UI32_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, Vec1<double> from) {
  // Prevent ubsan errors when converting int32_t to narrower integer/int32_t
  return Vec1<TFromD<D>>(detail::CastValueForF2IConv<TFromD<D>>(from.raw));
}

// Signed int -> narrower int: clamp to the destination range first.
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_SIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DTo>)>
HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

  // Int to int: choose closest value in TTo to `from` (avoids UB)
  from.raw = HWY_MIN(HWY_MAX(LimitsMin<TTo>(), from.raw), LimitsMax<TTo>());
  return Vec1<TTo>(static_cast<TTo>(from.raw));
}

// Disable the default unsigned to signed DemoteTo implementation in
// generic_ops-inl.h on SCALAR as the SCALAR target has a target-specific
// implementation of the unsigned to signed DemoteTo op and as ReorderDemote2To
// is not supported on the SCALAR target

// NOTE: hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr is used instead of
// hwy::EnableIf<false>* = nullptr to avoid compiler errors since
// !hwy::IsSame<V, V>() is always false and as !hwy::IsSame<V, V>() will cause
// SFINAE to occur instead of a hard error due to a dependency on the V template
// argument
#undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
#define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) \
  hwy::EnableIf<!hwy::IsSame<V, V>()>* = nullptr

// Unsigned int -> narrower int: clamp against the destination maximum only
// (unsigned sources cannot be below the minimum).
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_UNSIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DTo)>
HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(!IsFloat<TFrom>(), "TFrom=double are handled above");
  static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting");

  const auto max = static_cast<MakeUnsigned<TTo>>(LimitsMax<TTo>());

  // Int to int: choose closest value in TTo to `from` (avoids UB)
  return Vec1<TTo>(static_cast<TTo>(HWY_MIN(from.raw, max)));
}

template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_UI64(TFrom), HWY_IF_F32_D(DTo)>
HWY_API Vec1<TTo> DemoteTo(DTo /* tag */, Vec1<TFrom> from) {
  // int64_t/uint64_t to float: simply cast to TTo
  return Vec1<TTo>(static_cast<TTo>(from.raw));
}

#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#endif

// double -> 32-bit integer; result is implementation-defined if out of range.
template <class D32, HWY_IF_UI32_D(D32)>
HWY_API VFromD<D32> DemoteInRangeTo(D32 /*d32*/,
                                    VFromD<Rebind<double, D32>> v) {
  using TTo = TFromD<D32>;
  return Vec1<TTo>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
}

// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions;
// use this scalar version to verify the vector implementation.
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec1<float> PromoteTo(D /* tag */, const Vec1<float16_t> v) {
  return Vec1<float>(F32FromF16(v.raw));
}

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec1<float> PromoteTo(D d, const Vec1<bfloat16_t> v) {
  return Set(d, F32FromBF16(v.raw));
}

// With one lane there is no "even" distinction; same as PromoteTo.
template <class DTo, typename TFrom>
HWY_API VFromD<DTo> PromoteEvenTo(DTo d_to, Vec1<TFrom> v) {
  return PromoteTo(d_to, v);
}

template <class D, HWY_IF_F16_D(D)>
HWY_API Vec1<float16_t> DemoteTo(D /* tag */, const Vec1<float> v) {
  return Vec1<float16_t>(F16FromF32(v.raw));
}

#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif

template <class D, HWY_IF_BF16_D(D)>
HWY_API Vec1<bfloat16_t> DemoteTo(D d, const Vec1<float> v) {
  return Set(d, BF16FromF32(v.raw));
}

template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_FLOAT(TFrom)>
HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size");
  // float## -> int##: return closest representable value.
  return Vec1<TTo>(detail::CastValueForF2IConv<TTo>(from.raw));
}

template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_NOT_FLOAT(TFrom)>
HWY_API Vec1<TTo> ConvertTo(DTo /* tag */, Vec1<TFrom> from) {
  static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size");
  // int## -> float##: no check needed
  return Vec1<TTo>(static_cast<TTo>(from.raw));
}

#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#else
#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#endif

// float/double -> same-size integer; implementation-defined if out of range.
template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
          HWY_IF_T_SIZE_ONE_OF_D(DI, (1 << 4) | (1 << 8))>
HWY_API VFromD<DI> ConvertInRangeTo(DI /*di*/, VFromD<RebindToFloat<DI>> v) {
  using TTo = TFromD<DI>;
  return VFromD<DI>(detail::CastValueForInRangeF2IConv<TTo>(v.raw));
}

// Saturating u32 -> u8 demotion.
HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
  return DemoteTo(Sisd<uint8_t>(), v);
}

// ------------------------------ TruncateTo

// Truncation keeps only the low bits of the wider lane.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}

template <class D, HWY_IF_U16_D(D)>
HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
  return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
}

template <class D, HWY_IF_U32_D(D)>
HWY_API Vec1<uint32_t> TruncateTo(D /* tag */, Vec1<uint64_t> v) {
  return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)};
}

template <class D, HWY_IF_U8_D(D)>
HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) {
  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}

template <class D, HWY_IF_U16_D(D)>
HWY_API Vec1<uint16_t> TruncateTo(D /* tag */, Vec1<uint32_t> v) {
  return
Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
}

template <class D, HWY_IF_U8_D(D)>
HWY_API Vec1<uint8_t> TruncateTo(D /* tag */, Vec1<uint16_t> v) {
  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}

// ================================================== COMBINE
// UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.

// With a single lane, the "lower half" is the vector itself.
template <typename T>
HWY_API Vec1<T> LowerHalf(Vec1<T> v) {
  return v;
}

template <class D, typename T = TFromD<D>>
HWY_API Vec1<T> LowerHalf(D /* tag */, Vec1<T> v) {
  return v;
}

// ================================================== SWIZZLE

template <typename T>
HWY_API T GetLane(const Vec1<T> v) {
  return v.raw;
}

// i must be 0 (the only lane); checked in debug builds.
template <typename T>
HWY_API T ExtractLane(const Vec1<T> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return v.raw;
}

template <typename T>
HWY_API Vec1<T> InsertLane(Vec1<T> v, size_t i, T t) {
  HWY_DASSERT(i == 0);
  (void)i;
  v.raw = t;
  return v;
}

template <typename T>
HWY_API Vec1<T> DupEven(Vec1<T> v) {
  return v;
}
// DupOdd is unsupported.

// Lane 0 is even, so the result is always `even`.
template <typename T>
HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) {
  return even;
}

template <typename T>
HWY_API Vec1<T> OddEvenBlocks(Vec1<T> /* odd */, Vec1<T> even) {
  return even;
}

// ------------------------------ SwapAdjacentBlocks
template <typename T>
HWY_API Vec1<T> SwapAdjacentBlocks(Vec1<T> v) {
  return v;
}

// ------------------------------ InterleaveEvenBlocks
template <class D, class V = VFromD<D>>
HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
  return a;
}
// ------------------------------ InterleaveOddBlocks
template <class D, class V = VFromD<D>>
HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
  return a;
}

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T>
struct Indices1 {
  MakeSigned<T> raw;
};

// Valid indices are 0 (this vector) or 1 (second table, see
// TwoTablesLookupLanes).
template <class D, typename T = TFromD<D>, typename TI>
HWY_API Indices1<T> IndicesFromVec(D, Vec1<TI> vec) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
  HWY_DASSERT(vec.raw <= 1);
  return Indices1<T>{static_cast<MakeSigned<T>>(vec.raw)};
}

template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>, typename TI>
HWY_API Indices1<T> SetTableIndices(D d, const TI* idx) {
  return IndicesFromVec(d, LoadU(Sisd<TI>(), idx));
}

template <typename T>
HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
  return v;
}

// Index 0 selects from `a`, index 1 from `b`.
template <typename T>
HWY_API Vec1<T> TwoTablesLookupLanes(const Vec1<T> a, const Vec1<T> b,
                                     const Indices1<T> idx) {
  return (idx.raw == 0) ?
a : b;
}

// ------------------------------ ReverseBlocks

// Single block: no change
template <class D, typename T = TFromD<D>>
HWY_API Vec1<T> ReverseBlocks(D /* tag */, const Vec1<T> v) {
  return v;
}

// ------------------------------ Reverse

template <class D, typename T = TFromD<D>>
HWY_API Vec1<T> Reverse(D /* tag */, const Vec1<T> v) {
  return v;
}

// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

// Must not be called:
template <class D, typename T = TFromD<D>>
HWY_API Vec1<T> Reverse2(D /* tag */, const Vec1<T> v) {
  return v;
}

template <class D, typename T = TFromD<D>>
HWY_API Vec1<T> Reverse4(D /* tag */, const Vec1<T> v) {
  return v;
}

template <class D, typename T = TFromD<D>>
HWY_API Vec1<T> Reverse8(D /* tag */, const Vec1<T> v) {
  return v;
}

// ------------------------------ ReverseLaneBytes

#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
#undef HWY_NATIVE_REVERSE_LANE_BYTES
#else
#define HWY_NATIVE_REVERSE_LANE_BYTES
#endif

// Byte-swap within the lane (endianness reversal).
HWY_API Vec1<uint16_t> ReverseLaneBytes(Vec1<uint16_t> v) {
  const uint32_t val{v.raw};
  return Vec1<uint16_t>(
      static_cast<uint16_t>(((val << 8) & 0xFF00u) | ((val >> 8) & 0x00FFu)));
}

HWY_API Vec1<uint32_t> ReverseLaneBytes(Vec1<uint32_t> v) {
  const uint32_t val = v.raw;
  return Vec1<uint32_t>(static_cast<uint32_t>(
      ((val << 24) & 0xFF000000u) | ((val << 8) & 0x00FF0000u) |
      ((val >> 8) & 0x0000FF00u) | ((val >> 24) & 0x000000FFu)));
}

HWY_API Vec1<uint64_t> ReverseLaneBytes(Vec1<uint64_t> v) {
  const uint64_t val = v.raw;
  return Vec1<uint64_t>(static_cast<uint64_t>(
      ((val << 56) & 0xFF00000000000000u) |
      ((val << 40) & 0x00FF000000000000u) |
      ((val << 24) & 0x0000FF0000000000u) | ((val << 8) & 0x000000FF00000000u) |
      ((val >> 8) & 0x00000000FF000000u) | ((val >> 24) & 0x0000000000FF0000u) |
      ((val >> 40) & 0x000000000000FF00u) |
      ((val >> 56) & 0x00000000000000FFu)));
}

// Signed lanes: reinterpret as unsigned, swap, and cast back.
template <class V, HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API V ReverseLaneBytes(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, ReverseLaneBytes(BitCast(du, v)));
}

// ------------------------------ ReverseBits
#ifdef HWY_NATIVE_REVERSE_BITS_UI8
#undef HWY_NATIVE_REVERSE_BITS_UI8
#else
#define HWY_NATIVE_REVERSE_BITS_UI8
#endif

#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
#else
#define HWY_NATIVE_REVERSE_BITS_UI16_32_64
#endif

namespace detail {

// Reverses the bit order inside each byte of val via three mask-and-shift
// swaps (1-bit, 2-bit, then 4-bit groups); byte order is unchanged.
template <class T>
HWY_INLINE T ReverseBitsOfEachByte(T val) {
  using TU = MakeUnsigned<T>;
  constexpr TU kMaxUnsignedVal{LimitsMax<TU>()};
  constexpr TU kShrMask1 =
      static_cast<TU>(0x5555555555555555u & kMaxUnsignedVal);
  constexpr TU kShrMask2 =
      static_cast<TU>(0x3333333333333333u & kMaxUnsignedVal);
  constexpr TU kShrMask3 =
      static_cast<TU>(0x0F0F0F0F0F0F0F0Fu & kMaxUnsignedVal);

  constexpr TU kShlMask1 = static_cast<TU>(~kShrMask1);
  constexpr TU kShlMask2 = static_cast<TU>(~kShrMask2);
  constexpr TU kShlMask3 = static_cast<TU>(~kShrMask3);

  TU result = static_cast<TU>(val);
  result = static_cast<TU>(((result << 1) & kShlMask1) |
                           ((result >> 1) & kShrMask1));
  result = static_cast<TU>(((result << 2) & kShlMask2) |
                           ((result >> 2) & kShrMask2));
  result = static_cast<TU>(((result << 4) & kShlMask3) |
                           ((result >> 4) & kShrMask3));
  return static_cast<T>(result);
}

}  // namespace detail

template <class V, HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, 1)>
HWY_API V ReverseBits(V v) {
  return V(detail::ReverseBitsOfEachByte(v.raw));
}

// Wider lanes: reverse bits within bytes, then reverse the byte order.
template <class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API V ReverseBits(V v) {
  return ReverseLaneBytes(V(detail::ReverseBitsOfEachByte(v.raw)));
}

template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V ReverseBits(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, ReverseBits(BitCast(du, v)));
}

// ------------------------------ SlideUpLanes

template <typename D>
HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
  return v;
}

// ------------------------------ SlideDownLanes

template <typename D>
HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
  return v;
}

// ================================================== BLOCKWISE
// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
1872 1873 // ------------------------------ Broadcast/splat any lane 1874 1875 template <int kLane, typename T> 1876 HWY_API Vec1<T> Broadcast(const Vec1<T> v) { 1877 static_assert(kLane == 0, "Scalar only has one lane"); 1878 return v; 1879 } 1880 1881 // ------------------------------ TableLookupBytes, TableLookupBytesOr0 1882 1883 template <typename T, typename TI> 1884 HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) { 1885 uint8_t in_bytes[sizeof(T)]; 1886 uint8_t idx_bytes[sizeof(T)]; 1887 uint8_t out_bytes[sizeof(T)]; 1888 CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes 1889 CopyBytes<sizeof(T)>(&indices, &idx_bytes); 1890 for (size_t i = 0; i < sizeof(T); ++i) { 1891 out_bytes[i] = in_bytes[idx_bytes[i]]; 1892 } 1893 TI out; 1894 CopyBytes<sizeof(TI)>(&out_bytes, &out); 1895 return Vec1<TI>{out}; 1896 } 1897 1898 template <typename T, typename TI> 1899 HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) { 1900 uint8_t in_bytes[sizeof(T)]; 1901 uint8_t idx_bytes[sizeof(T)]; 1902 uint8_t out_bytes[sizeof(T)]; 1903 CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes 1904 CopyBytes<sizeof(T)>(&indices, &idx_bytes); 1905 for (size_t i = 0; i < sizeof(T); ++i) { 1906 out_bytes[i] = idx_bytes[i] & 0x80 ? 
0 : in_bytes[idx_bytes[i]]; 1907 } 1908 TI out; 1909 CopyBytes<sizeof(TI)>(&out_bytes, &out); 1910 return Vec1<TI>{out}; 1911 } 1912 1913 // ------------------------------ ZipLower 1914 1915 HWY_API Vec1<uint16_t> ZipLower(Vec1<uint8_t> a, Vec1<uint8_t> b) { 1916 return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t{b.raw} << 8) + a.raw)); 1917 } 1918 HWY_API Vec1<uint32_t> ZipLower(Vec1<uint16_t> a, Vec1<uint16_t> b) { 1919 return Vec1<uint32_t>((uint32_t{b.raw} << 16) + a.raw); 1920 } 1921 HWY_API Vec1<uint64_t> ZipLower(Vec1<uint32_t> a, Vec1<uint32_t> b) { 1922 return Vec1<uint64_t>((uint64_t{b.raw} << 32) + a.raw); 1923 } 1924 HWY_API Vec1<int16_t> ZipLower(Vec1<int8_t> a, Vec1<int8_t> b) { 1925 return Vec1<int16_t>(static_cast<int16_t>((int32_t{b.raw} << 8) + a.raw)); 1926 } 1927 HWY_API Vec1<int32_t> ZipLower(Vec1<int16_t> a, Vec1<int16_t> b) { 1928 return Vec1<int32_t>((int32_t{b.raw} << 16) + a.raw); 1929 } 1930 HWY_API Vec1<int64_t> ZipLower(Vec1<int32_t> a, Vec1<int32_t> b) { 1931 return Vec1<int64_t>((int64_t{b.raw} << 32) + a.raw); 1932 } 1933 1934 template <class DW, typename TW = TFromD<DW>, typename TN = MakeNarrow<TW>> 1935 HWY_API Vec1<TW> ZipLower(DW /* tag */, Vec1<TN> a, Vec1<TN> b) { 1936 return Vec1<TW>(static_cast<TW>((TW{b.raw} << (sizeof(TN) * 8)) + a.raw)); 1937 } 1938 1939 // ================================================== MASK 1940 1941 template <class D, typename T = TFromD<D>> 1942 HWY_API bool AllFalse(D /* tag */, const Mask1<T> mask) { 1943 return mask.bits == 0; 1944 } 1945 1946 template <class D, typename T = TFromD<D>> 1947 HWY_API bool AllTrue(D /* tag */, const Mask1<T> mask) { 1948 return mask.bits != 0; 1949 } 1950 1951 // `p` points to at least 8 readable bytes, not all of which need be valid. 
1952 template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>> 1953 HWY_API Mask1<T> LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) { 1954 return Mask1<T>::FromBool((bits[0] & 1) != 0); 1955 } 1956 1957 template <class D, HWY_IF_LANES_D(D, 1)> 1958 HWY_API MFromD<D> Dup128MaskFromMaskBits(D /*d*/, unsigned mask_bits) { 1959 return MFromD<D>::FromBool((mask_bits & 1) != 0); 1960 } 1961 1962 // `p` points to at least 8 writable bytes. 1963 template <class D, typename T = TFromD<D>> 1964 HWY_API size_t StoreMaskBits(D d, const Mask1<T> mask, uint8_t* bits) { 1965 *bits = AllTrue(d, mask); 1966 return 1; 1967 } 1968 1969 template <class D, typename T = TFromD<D>> 1970 HWY_API size_t CountTrue(D /* tag */, const Mask1<T> mask) { 1971 return mask.bits == 0 ? 0 : 1; 1972 } 1973 1974 template <class D, typename T = TFromD<D>> 1975 HWY_API intptr_t FindFirstTrue(D /* tag */, const Mask1<T> mask) { 1976 return mask.bits == 0 ? -1 : 0; 1977 } 1978 1979 template <class D, typename T = TFromD<D>> 1980 HWY_API size_t FindKnownFirstTrue(D /* tag */, const Mask1<T> /* m */) { 1981 return 0; // There is only one lane and we know it is true. 1982 } 1983 1984 template <class D, typename T = TFromD<D>> 1985 HWY_API intptr_t FindLastTrue(D /* tag */, const Mask1<T> mask) { 1986 return mask.bits == 0 ? -1 : 0; 1987 } 1988 1989 template <class D, typename T = TFromD<D>> 1990 HWY_API size_t FindKnownLastTrue(D /* tag */, const Mask1<T> /* m */) { 1991 return 0; // There is only one lane and we know it is true. 1992 } 1993 1994 // ------------------------------ Compress, CompressBits 1995 1996 template <typename T> 1997 struct CompressIsPartition { 1998 enum { value = 1 }; 1999 }; 2000 2001 template <typename T> 2002 HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) { 2003 // A single lane is already partitioned by definition. 
2004 return v; 2005 } 2006 2007 template <typename T> 2008 HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) { 2009 // A single lane is already partitioned by definition. 2010 return v; 2011 } 2012 2013 // ------------------------------ CompressStore 2014 template <class D, typename T = TFromD<D>> 2015 HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, D d, 2016 T* HWY_RESTRICT unaligned) { 2017 StoreU(Compress(v, mask), d, unaligned); 2018 return CountTrue(d, mask); 2019 } 2020 2021 // ------------------------------ CompressBlendedStore 2022 template <class D, typename T = TFromD<D>> 2023 HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, D d, 2024 T* HWY_RESTRICT unaligned) { 2025 if (!mask.bits) return 0; 2026 StoreU(v, d, unaligned); 2027 return 1; 2028 } 2029 2030 // ------------------------------ CompressBits 2031 template <typename T> 2032 HWY_API Vec1<T> CompressBits(Vec1<T> v, const uint8_t* HWY_RESTRICT /*bits*/) { 2033 return v; 2034 } 2035 2036 // ------------------------------ CompressBitsStore 2037 template <class D, typename T = TFromD<D>> 2038 HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits, 2039 D d, T* HWY_RESTRICT unaligned) { 2040 const Mask1<T> mask = LoadMaskBits(d, bits); 2041 StoreU(Compress(v, mask), d, unaligned); 2042 return CountTrue(d, mask); 2043 } 2044 2045 // ------------------------------ Expand 2046 2047 // generic_ops-inl.h requires Vec64/128, so implement [Load]Expand here. 
// Advertise a native Expand/LoadExpand so generic_ops-inl.h skips its
// fallback, which requires Vec64/128 that this target cannot provide.
#ifdef HWY_NATIVE_EXPAND
#undef HWY_NATIVE_EXPAND
#else
#define HWY_NATIVE_EXPAND
#endif

// Single lane: "expanding" reduces to zeroing the lane when the mask is
// false.
template <typename T>
HWY_API Vec1<T> Expand(Vec1<T> v, const Mask1<T> mask) {
  return IfThenElseZero(mask, v);
}

// ------------------------------ LoadExpand
// Loads one lane and zeroes it if the mask is false; identical to MaskedLoad
// for a single lane.
template <class D>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  return MaskedLoad(mask, d, unaligned);
}

// ------------------------------ WidenMulPairwiseAdd

// bf16 inputs widen to f32 before the multiply; with one lane there is no
// pairwise partner to add.
template <class D32, HWY_IF_F32_D(D32)>
HWY_API Vec1<float> WidenMulPairwiseAdd(D32 /* tag */, Vec1<bfloat16_t> a,
                                        Vec1<bfloat16_t> b) {
  return Vec1<float>(F32FromBF16(a.raw)) * Vec1<float>(F32FromBF16(b.raw));
}

// i16 inputs widen to int via integer promotion before the multiply.
template <class D32, HWY_IF_I32_D(D32)>
HWY_API Vec1<int32_t> WidenMulPairwiseAdd(D32 /* tag */, Vec1<int16_t> a,
                                          Vec1<int16_t> b) {
  return Vec1<int32_t>(a.raw * b.raw);
}

// ------------------------------ SatWidenMulAccumFixedPoint
#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#else
#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#endif

// Returns sum + 2*a*b with the whole expression saturated to the i32 range
// (doubling fixed-point multiply-accumulate).
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
                                                VFromD<Rebind<int16_t, DI32>> a,
                                                VFromD<Rebind<int16_t, DI32>> b,
                                                VFromD<DI32> sum) {
  // Multiplying static_cast<int32_t>(a.raw) by static_cast<int32_t>(b.raw)
  // followed by an addition of the product is okay as
  // (a.raw * b.raw * 2) is between -2147418112 and 2147483648 and as
  // a.raw * b.raw * 2 can only overflow an int32_t if both a.raw and b.raw are
  // equal to -32768.

  const VFromD<DI32> product(static_cast<int32_t>(a.raw) *
                             static_cast<int32_t>(b.raw));
  const VFromD<DI32> product2 = Add(product, product);

  // All-ones (-1) iff product2 overflowed, i.e. both inputs were -32768 and
  // the doubled product wrapped around to LimitsMin<int32_t>().
  const auto mul_overflow =
      VecFromMask(di32, Eq(product2, Set(di32, LimitsMin<int32_t>())));

  // On overflow, adding mul_overflow (-1) to the wrapped product2 maps
  // LimitsMin to LimitsMax, and subtracting sum's propagated sign bit
  // compensates so SaturatedAdd still produces the correctly saturated
  // result; without overflow, both adjustments are zero.
  return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),
                      Add(product2, mul_overflow));
}

// ------------------------------ SatWidenMulPairwiseAdd

#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#else
#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
#endif

template <class DI16, HWY_IF_I16_D(DI16)>
HWY_API Vec1<int16_t> SatWidenMulPairwiseAdd(DI16 /* tag */, Vec1<uint8_t> a,
                                             Vec1<int8_t> b) {
  // Saturation of a.raw * b.raw is not needed on the HWY_SCALAR target as the
  // input vectors only have 1 lane on the HWY_SCALAR target and as
  // a.raw * b.raw is between -32640 and 32385, which is already within the
  // range of an int16_t.

  // On other targets, a saturated addition of a[0]*b[0] + a[1]*b[1] is needed
  // as it is possible for the addition of a[0]*b[0] + a[1]*b[1] to overflow if
  // a[0], a[1], b[0], and b[1] are all non-zero and b[0] and b[1] both have the
  // same sign.

  return Vec1<int16_t>(static_cast<int16_t>(a.raw) *
                       static_cast<int16_t>(b.raw));
}

// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
#else
#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
#endif

// Widens bf16 to f32 and accumulates the product into sum0; sum1 is unused
// because a single lane has no odd/even pair to reorder.
template <class D32, HWY_IF_F32_D(D32)>
HWY_API Vec1<float> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<bfloat16_t> a,
                                              Vec1<bfloat16_t> b,
                                              const Vec1<float> sum0,
                                              Vec1<float>& /* sum1 */) {
  return MulAdd(Vec1<float>(F32FromBF16(a.raw)),
                Vec1<float>(F32FromBF16(b.raw)), sum0);
}

template <class D32, HWY_IF_I32_D(D32)>
HWY_API Vec1<int32_t> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<int16_t> a,
                                                Vec1<int16_t> b,
                                                const Vec1<int32_t> sum0,
                                                Vec1<int32_t>& /* sum1 */) {
  return Vec1<int32_t>(a.raw * b.raw + sum0.raw);
}

// Unsigned variant: cast before multiplying so the arithmetic is done in
// uint32_t rather than in (signed) int after promotion.
template <class DU32, HWY_IF_U32_D(DU32)>
HWY_API Vec1<uint32_t> ReorderWidenMulAccumulate(DU32 /* tag */,
                                                 Vec1<uint16_t> a,
                                                 Vec1<uint16_t> b,
                                                 const Vec1<uint32_t> sum0,
                                                 Vec1<uint32_t>& /* sum1 */) {
  return Vec1<uint32_t>(static_cast<uint32_t>(a.raw) * b.raw + sum0.raw);
}

// ------------------------------ RearrangeToOddPlusEven
template <typename TW>
HWY_API Vec1<TW> RearrangeToOddPlusEven(Vec1<TW> sum0, Vec1<TW> /* sum1 */) {
  return sum0;  // invariant already holds
}

// ================================================== REDUCTIONS

// Nothing native, generic_ops-inl defines SumOfLanes and ReduceSum.

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();