wasm_256-inl.h (80865B)
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 256-bit WASM vectors and operations. Experimental.
// External include guard in highway.h - see comment there.

// For half-width vectors. Already includes base.h and shared-inl.h.
#include "hwy/ops/wasm_128-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// 256-bit vector emulated as a pair of native 128-bit WASM vectors; v0 holds
// the lower lanes (lane 0), v1 the upper lanes. Most ops below simply forward
// to both halves.
template <typename T>
class Vec256 {
 public:
  using PrivateT = T;                                  // only for DFromV
  static constexpr size_t kPrivateN = 32 / sizeof(T);  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec256& operator*=(const Vec256 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec256& operator/=(const Vec256 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec256& operator+=(const Vec256 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec256& operator-=(const Vec256 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec256& operator%=(const Vec256 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec256& operator&=(const Vec256 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec256& operator|=(const Vec256 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec256& operator^=(const Vec256 other) {
    return *this = (*this ^ other);
  }

  // Lower and upper 128-bit halves.
  Vec128<T> v0;
  Vec128<T> v1;
};

// Mask for Vec256: one 128-bit mask per half, same layout as Vec256.
template <typename T>
struct Mask256 {
  using PrivateT = T;                                  // only for DFromM
  static constexpr size_t kPrivateN = 32 / sizeof(T);  // only for DFromM

  Mask128<T> m0;
  Mask128<T> m1;
};

// ------------------------------ Zero

// Avoid VFromD here because it is defined in terms of Zero.
75 template <class D, HWY_IF_V_SIZE_D(D, 32)> 76 HWY_API Vec256<TFromD<D>> Zero(D d) { 77 const Half<decltype(d)> dh; 78 Vec256<TFromD<D>> ret; 79 ret.v0 = ret.v1 = Zero(dh); 80 return ret; 81 } 82 83 // ------------------------------ BitCast 84 template <class D, typename TFrom> 85 HWY_API VFromD<D> BitCast(D d, Vec256<TFrom> v) { 86 const Half<decltype(d)> dh; 87 VFromD<D> ret; 88 ret.v0 = BitCast(dh, v.v0); 89 ret.v1 = BitCast(dh, v.v1); 90 return ret; 91 } 92 93 // ------------------------------ ResizeBitCast 94 95 // 32-byte vector to 32-byte vector: Same as BitCast 96 template <class D, typename FromV, HWY_IF_V_SIZE_V(FromV, 32), 97 HWY_IF_V_SIZE_D(D, 32)> 98 HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { 99 return BitCast(d, v); 100 } 101 102 // <= 16-byte vector to 32-byte vector 103 template <class D, typename FromV, HWY_IF_V_SIZE_LE_V(FromV, 16), 104 HWY_IF_V_SIZE_D(D, 32)> 105 HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { 106 const Half<decltype(d)> dh; 107 VFromD<D> ret; 108 ret.v0 = ResizeBitCast(dh, v); 109 ret.v1 = Zero(dh); 110 return ret; 111 } 112 113 // 32-byte vector to <= 16-byte vector 114 template <class D, typename FromV, HWY_IF_V_SIZE_V(FromV, 32), 115 HWY_IF_V_SIZE_LE_D(D, 16)> 116 HWY_API VFromD<D> ResizeBitCast(D d, FromV v) { 117 return ResizeBitCast(d, v.v0); 118 } 119 120 // ------------------------------ Set 121 template <class D, HWY_IF_V_SIZE_D(D, 32), typename T2> 122 HWY_API VFromD<D> Set(D d, const T2 t) { 123 const Half<decltype(d)> dh; 124 VFromD<D> ret; 125 ret.v0 = ret.v1 = Set(dh, static_cast<TFromD<D>>(t)); 126 return ret; 127 } 128 129 // Undefined, Iota defined in wasm_128. 
// ------------------------------ Dup128VecFromValues

// Builds one 128-bit block from the given lane values and duplicates it into
// both halves. One overload per lane size (1/2/4/8 bytes).
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t8,
                                        t9, t10, t11, t12, t13, t14, t15);
  return ret;
}

template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7);
  return ret;
}

template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1, t2, t3);
  return ret;
}

template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  ret.v0 = ret.v1 = Dup128VecFromValues(dh, t0, t1);
  return ret;
}

// ================================================== ARITHMETIC

// Lane-wise addition: each 128-bit half is added independently.
template <typename T>
HWY_API Vec256<T> operator+(Vec256<T> a, const Vec256<T> b) {
  a.v0 += b.v0;
  a.v1 += b.v1;
  return a;
}

// Lane-wise subtraction.
template <typename T>
HWY_API Vec256<T> operator-(Vec256<T> a, const Vec256<T> b) {
  a.v0 -= b.v0;
  a.v1 -= b.v1;
  return a;
}

// ------------------------------ SumsOf8

// Sums of each group of 8 consecutive bytes, widened to 64-bit lanes.
HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
  Vec256<uint64_t> ret;
  ret.v0 = SumsOf8(v.v0);
  ret.v1 = SumsOf8(v.v1);
  return ret;
}

HWY_API Vec256<int64_t> SumsOf8(const Vec256<int8_t> v) {
  Vec256<int64_t> ret;
  ret.v0 = SumsOf8(v.v0);
  ret.v1 = SumsOf8(v.v1);
  return ret;
}

// Addition that clamps to the lane type's range instead of wrapping.
template <typename T>
HWY_API Vec256<T> SaturatedAdd(Vec256<T> a, const Vec256<T> b) {
  a.v0 = SaturatedAdd(a.v0, b.v0);
  a.v1 = SaturatedAdd(a.v1, b.v1);
  return a;
}

// Subtraction that clamps to the lane type's range instead of wrapping.
template <typename T>
HWY_API Vec256<T> SaturatedSub(Vec256<T> a, const Vec256<T> b) {
  a.v0 = SaturatedSub(a.v0, b.v0);
  a.v1 = SaturatedSub(a.v1, b.v1);
  return a;
}

// Rounded average (a + b + 1) >> 1 for u8/u16 lanes.
template <typename T, HWY_IF_UNSIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec256<T> AverageRound(Vec256<T> a, const Vec256<T> b) {
  a.v0 = AverageRound(a.v0, b.v0);
  a.v1 = AverageRound(a.v1, b.v1);
  return a;
}

// Absolute value, per half.
template <typename T>
HWY_API Vec256<T> Abs(Vec256<T> v) {
  v.v0 = Abs(v.v0);
  v.v1 = Abs(v.v1);
  return v;
}

// ------------------------------ Shift lanes by constant #bits

template <int kBits, typename T>
HWY_API Vec256<T> ShiftLeft(Vec256<T> v) {
  v.v0 = ShiftLeft<kBits>(v.v0);
  v.v1 = ShiftLeft<kBits>(v.v1);
  return v;
}

template <int kBits, typename T>
HWY_API Vec256<T> ShiftRight(Vec256<T> v) {
  v.v0 = ShiftRight<kBits>(v.v0);
  v.v1 = ShiftRight<kBits>(v.v1);
  return v;
}

// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;

  // Unsigned (logical) right shift OR'd with the wrapped-around left shift.
  // HWY_MIN keeps the left-shift amount in range when kBits == 0, a case that
  // already returned above but is still instantiated.
  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}

// ------------------------------ Shift lanes by same variable #bits

template <typename T>
HWY_API Vec256<T> ShiftLeftSame(Vec256<T> v, const int bits) {
  v.v0 = ShiftLeftSame(v.v0, bits);
  v.v1 = ShiftLeftSame(v.v1, bits);
  return v;
}

template <typename T>
HWY_API Vec256<T> ShiftRightSame(Vec256<T> v, const int bits) {
  v.v0 = ShiftRightSame(v.v0, bits);
  v.v1 = ShiftRightSame(v.v1, bits);
  return v;
}

// ------------------------------ Min, Max
template <typename T>
HWY_API Vec256<T> Min(Vec256<T> a, const Vec256<T> b) {
  a.v0 = Min(a.v0, b.v0);
  a.v1 = Min(a.v1, b.v1);
  return a;
}

template <typename T>
HWY_API Vec256<T> Max(Vec256<T> a, const Vec256<T> b) {
  a.v0 = Max(a.v0, b.v0);
  a.v1 = Max(a.v1, b.v1);
  return a;
}
// ------------------------------ Integer multiplication

template <typename T>
HWY_API Vec256<T> operator*(Vec256<T> a, const Vec256<T> b) {
  a.v0 *= b.v0;
  a.v1 *= b.v1;
  return a;
}

// Upper half of the double-width product, per lane.
template <typename T>
HWY_API Vec256<T> MulHigh(Vec256<T> a, const Vec256<T> b) {
  a.v0 = MulHigh(a.v0, b.v0);
  a.v1 = MulHigh(a.v1, b.v1);
  return a;
}

// Q1.15 fixed-point multiply with rounding.
template <typename T>
HWY_API Vec256<T> MulFixedPoint15(Vec256<T> a, const Vec256<T> b) {
  a.v0 = MulFixedPoint15(a.v0, b.v0);
  a.v1 = MulFixedPoint15(a.v1, b.v1);
  return a;
}

// Cannot use MakeWide because that returns uint128_t for uint64_t, but we want
// uint64_t.
320 template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)), 321 HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 322 HWY_API Vec256<MakeWide<T>> MulEven(Vec256<T> a, const Vec256<T> b) { 323 Vec256<MakeWide<T>> ret; 324 ret.v0 = MulEven(a.v0, b.v0); 325 ret.v1 = MulEven(a.v1, b.v1); 326 return ret; 327 } 328 template <class T, HWY_IF_UI64(T)> 329 HWY_API Vec256<T> MulEven(Vec256<T> a, const Vec256<T> b) { 330 Vec256<T> ret; 331 ret.v0 = MulEven(a.v0, b.v0); 332 ret.v1 = MulEven(a.v1, b.v1); 333 return ret; 334 } 335 336 template <class T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)), 337 HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 338 HWY_API Vec256<MakeWide<T>> MulOdd(Vec256<T> a, const Vec256<T> b) { 339 Vec256<MakeWide<T>> ret; 340 ret.v0 = MulOdd(a.v0, b.v0); 341 ret.v1 = MulOdd(a.v1, b.v1); 342 return ret; 343 } 344 template <class T, HWY_IF_UI64(T)> 345 HWY_API Vec256<T> MulOdd(Vec256<T> a, const Vec256<T> b) { 346 Vec256<T> ret; 347 ret.v0 = MulOdd(a.v0, b.v0); 348 ret.v1 = MulOdd(a.v1, b.v1); 349 return ret; 350 } 351 352 // ------------------------------ Negate 353 template <typename T> 354 HWY_API Vec256<T> Neg(Vec256<T> v) { 355 v.v0 = Neg(v.v0); 356 v.v1 = Neg(v.v1); 357 return v; 358 } 359 360 // ------------------------------ AbsDiff 361 // generic_ops takes care of integer T. 362 template <typename T, HWY_IF_FLOAT(T)> 363 HWY_API Vec256<T> AbsDiff(const Vec256<T> a, const Vec256<T> b) { 364 return Abs(a - b); 365 } 366 367 // ------------------------------ Floating-point division 368 // generic_ops takes care of integer T. 
// Lane-wise floating-point division, per half.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec256<T> operator/(Vec256<T> a, const Vec256<T> b) {
  a.v0 /= b.v0;
  a.v1 /= b.v1;
  return a;
}

// ------------------------------ Floating-point multiply-add variants

// Returns mul * x + add.
template <class T, HWY_IF_FLOAT3264(T)>
HWY_API Vec256<T> MulAdd(Vec256<T> mul, Vec256<T> x, Vec256<T> add) {
  mul.v0 = MulAdd(mul.v0, x.v0, add.v0);
  mul.v1 = MulAdd(mul.v1, x.v1, add.v1);
  return mul;
}

// Returns add - mul * x.
template <class T, HWY_IF_FLOAT3264(T)>
HWY_API Vec256<T> NegMulAdd(Vec256<T> mul, Vec256<T> x, Vec256<T> add) {
  mul.v0 = NegMulAdd(mul.v0, x.v0, add.v0);
  mul.v1 = NegMulAdd(mul.v1, x.v1, add.v1);
  return mul;
}

// Returns mul * x - sub.
template <class T, HWY_IF_FLOAT3264(T)>
HWY_API Vec256<T> MulSub(Vec256<T> mul, Vec256<T> x, Vec256<T> sub) {
  mul.v0 = MulSub(mul.v0, x.v0, sub.v0);
  mul.v1 = MulSub(mul.v1, x.v1, sub.v1);
  return mul;
}

// Returns -mul * x - sub.
template <class T, HWY_IF_FLOAT3264(T)>
HWY_API Vec256<T> NegMulSub(Vec256<T> mul, Vec256<T> x, Vec256<T> sub) {
  mul.v0 = NegMulSub(mul.v0, x.v0, sub.v0);
  mul.v1 = NegMulSub(mul.v1, x.v1, sub.v1);
  return mul;
}

// ------------------------------ Floating-point square root

template <typename T>
HWY_API Vec256<T> Sqrt(Vec256<T> v) {
  v.v0 = Sqrt(v.v0);
  v.v1 = Sqrt(v.v1);
  return v;
}

// ------------------------------ Floating-point rounding

// Toward nearest integer, ties to even
template <class T, HWY_IF_FLOAT3264(T)>
HWY_API Vec256<T> Round(Vec256<T> v) {
  v.v0 = Round(v.v0);
  v.v1 = Round(v.v1);
  return v;
}

// Toward zero, aka truncate
template <class T, HWY_IF_FLOAT3264(T)>
HWY_API Vec256<T> Trunc(Vec256<T> v) {
  v.v0 = Trunc(v.v0);
  v.v1 = Trunc(v.v1);
  return v;
}

// Toward +infinity, aka ceiling
template <class T, HWY_IF_FLOAT3264(T)>
HWY_API Vec256<T> Ceil(Vec256<T> v) {
  v.v0 = Ceil(v.v0);
  v.v1 = Ceil(v.v1);
  return v;
}

// Toward -infinity, aka floor
template <class T, HWY_IF_FLOAT3264(T)>
HWY_API Vec256<T> Floor(Vec256<T> v) {
  v.v0 = Floor(v.v0);
  v.v1 = Floor(v.v1);
  return v;
}

// ------------------------------ Floating-point classification

// NaN is the only value that compares unequal to itself.
template <typename T>
HWY_API Mask256<T> IsNaN(const Vec256<T> v) {
  return v != v;
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsInf(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vu, vu), Set(du, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.
// Reinterprets a mask as one for a same-size lane type.
template <class DTo, typename TFrom, typename TTo = TFromD<DTo>>
HWY_API MFromD<DTo> RebindMask(DTo /*tag*/, Mask256<TFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return MFromD<DTo>{Mask128<TTo>{m.m0.raw}, Mask128<TTo>{m.m1.raw}};
}

// True for lanes where all bits in `bit` are set in `v`.
template <typename T>
HWY_API Mask256<T> TestBit(Vec256<T> v, Vec256<T> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

template <typename T>
HWY_API Mask256<T> operator==(Vec256<T> a, const Vec256<T> b) {
  Mask256<T> m;
  m.m0 = operator==(a.v0, b.v0);
  m.m1 = operator==(a.v1, b.v1);
  return m;
}

template <typename T>
HWY_API Mask256<T> operator!=(Vec256<T> a, const Vec256<T> b) {
  Mask256<T> m;
  m.m0 = operator!=(a.v0, b.v0);
  m.m1 = operator!=(a.v1, b.v1);
  return m;
}

template <typename T>
HWY_API Mask256<T> operator<(Vec256<T> a, const Vec256<T> b) {
  Mask256<T> m;
  m.m0 = operator<(a.v0, b.v0);
  m.m1 = operator<(a.v1, b.v1);
  return m;
}

template <typename T>
HWY_API Mask256<T> operator>(Vec256<T> a, const Vec256<T> b) {
  Mask256<T> m;
  m.m0 = operator>(a.v0, b.v0);
  m.m1 = operator>(a.v1, b.v1);
  return m;
}

template <typename T>
HWY_API Mask256<T> operator<=(Vec256<T> a, const Vec256<T> b) {
  Mask256<T> m;
  m.m0 = operator<=(a.v0, b.v0);
  m.m1 = operator<=(a.v1, b.v1);
  return m;
}

template <typename T>
HWY_API Mask256<T> operator>=(Vec256<T> a, const Vec256<T> b) {
  Mask256<T> m;
  m.m0 = operator>=(a.v0, b.v0);
  m.m1 = operator>=(a.v1, b.v1);
  return m;
}

// ------------------------------ FirstN (Iota, Lt)

// Mask with the first `num` lanes true.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API MFromD<D> FirstN(const D d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
  using TI = TFromD<decltype(di)>;
  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<TI>(num)));
}

// ================================================== LOGICAL

template <typename T>
HWY_API Vec256<T> Not(Vec256<T> v) {
  v.v0 = Not(v.v0);
  v.v1 = Not(v.v1);
  return v;
}

template <typename T>
HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
  a.v0 = And(a.v0, b.v0);
  a.v1 = And(a.v1, b.v1);
  return a;
}

// Returns mask & ~not_mask.
template <typename T>
HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
  not_mask.v0 = AndNot(not_mask.v0, mask.v0);
  not_mask.v1 = AndNot(not_mask.v1, mask.v1);
  return not_mask;
}

template <typename T>
HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
  a.v0 = Or(a.v0, b.v0);
  a.v1 = Or(a.v1, b.v1);
  return a;
}

template <typename T>
HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
  a.v0 = Xor(a.v0, b.v0);
  a.v1 = Xor(a.v1, b.v1);
  return a;
}

template <typename T>
HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
  return Xor(x1, Xor(x2, x3));
}

template <typename T>
HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
  return Or(o1, Or(o2, o3));
}

// Returns o | (a1 & a2).
template <typename T>
HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
  return Or(o, And(a1, a2));
}

// Like IfThenElse but the condition is a vector of all-0/all-1 lanes.
template <typename T>
HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T>
HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
  return And(a, b);
}

template <typename T>
HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
  return Or(a, b);
}

template <typename T>
HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
  return Xor(a, b);
}

// ------------------------------ CopySign
// Returns magn with the sign bit taken from sign.
template <typename T>
HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}

// ------------------------------ CopySignToAbs
// Like CopySign but abs is known to have a clear sign bit.
template <typename T>
HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(sign)> d;
  return OrAnd(abs, SignBit(d), sign);
}

// ------------------------------ Mask

// Mask and Vec are the same (true = FF..FF).
template <typename T>
HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
  Mask256<T> m;
  m.m0 = MaskFromVec(v.v0);
  m.m1 = MaskFromVec(v.v1);
  return m;
}

template <class D, typename T = TFromD<D>>
HWY_API Vec256<T> VecFromMask(D d, Mask256<T> m) {
  const Half<decltype(d)> dh;
  Vec256<T> v;
  v.v0 = VecFromMask(dh, m.m0);
  v.v1 = VecFromMask(dh, m.m1);
  return v;
}

// One bit per lane; lower half supplies the low bits.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> m) {
  const Half<decltype(d)> dh;
  const uint64_t lo = BitsFromMask(dh, m.m0);
  const uint64_t hi = BitsFromMask(dh, m.m1);
  return (hi << Lanes(dh)) | lo;
}

// mask ? yes : no
template <typename T>
HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
  yes.v0 = IfThenElse(mask.m0, yes.v0, no.v0);
  yes.v1 = IfThenElse(mask.m1, yes.v1, no.v1);
  return yes;
}

// mask ? yes : 0
template <typename T>
HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}
// mask ? 0 : no
template <typename T>
HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

// Selects yes where v is negative (sign bit set), else no.
template <typename T>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
  v.v0 = IfNegativeThenElse(v.v0, yes.v0, no.v0);
  v.v1 = IfNegativeThenElse(v.v1, yes.v1, no.v1);
  return v;
}

// ------------------------------ Mask logical

template <typename T>
HWY_API Mask256<T> Not(const Mask256<T> m) {
  return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
}

template <typename T>
HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T>
HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// True where neither a nor b is true (~a & ~b).
template <typename T>
HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
  const Full256<T> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
// Per-lane variable shift left.
template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec256<T> operator<<(Vec256<T> v, const Vec256<T> bits) {
  v.v0 = operator<<(v.v0, bits.v0);
  v.v1 = operator<<(v.v1, bits.v1);
  return v;
}

// ------------------------------ Shr (BroadcastSignBit, IfThenElse)
// Per-lane variable shift right (arithmetic for signed T).
template <typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec256<T> operator>>(Vec256<T> v, const Vec256<T> bits) {
  v.v0 = operator>>(v.v0, bits.v0);
  v.v1 = operator>>(v.v1, bits.v1);
  return v;
}

// ------------------------------ BroadcastSignBit (compare, VecFromMask)

// Fills each lane with copies of its sign bit (0 or all-ones).
template <typename T, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec256<T> BroadcastSignBit(const Vec256<T> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
// i8 has no per-lane arithmetic shift; use a comparison against zero instead.
HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
  const DFromV<decltype(v)> d;
  return VecFromMask(d, v < Zero(d));
}

// ================================================== MEMORY

// ------------------------------ Load

template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  ret.v0 = Load(dh, aligned);
  ret.v1 = Load(dh, aligned + Lanes(dh));
  return ret;
}

// Loads and zeroes lanes where m is false.
template <class D, typename T = TFromD<D>>
HWY_API Vec256<T> MaskedLoad(Mask256<T> m, D d, const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

// Loads; lanes where m is false come from v instead.
template <class D, typename T = TFromD<D>>
HWY_API Vec256<T> MaskedLoadOr(Vec256<T> v, Mask256<T> m, D d,
                               const T* HWY_RESTRICT aligned) {
  return IfThenElse(m, Load(d, aligned), v);
}

// LoadU == Load.
// WASM loads have no alignment requirement, so LoadU forwards to Load.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// Loads 16 bytes and duplicates them into both halves.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Half<decltype(d)> dh;
  VFromD<D> ret;
  ret.v0 = ret.v1 = Load(dh, p);
  return ret;
}

// ------------------------------ Store

template <class D, typename T = TFromD<D>>
HWY_API void Store(Vec256<T> v, D d, T* HWY_RESTRICT aligned) {
  const Half<decltype(d)> dh;
  Store(v.v0, dh, aligned);
  Store(v.v1, dh, aligned + Lanes(dh));
}

// StoreU == Store.
template <class D, typename T = TFromD<D>>
HWY_API void StoreU(Vec256<T> v, D d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

// Stores only lanes where m is true; other memory is left unchanged
// (implemented as read-modify-write).
template <class D, typename T = TFromD<D>>
HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, D d, T* HWY_RESTRICT p) {
  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
}

// ------------------------------ Stream
template <class D, typename T = TFromD<D>>
HWY_API void Stream(Vec256<T> v, D d, T* HWY_RESTRICT aligned) {
  // Same as aligned stores.
  Store(v, d, aligned);
}

// ------------------------------ Scatter, Gather defined in wasm_128

// ================================================== SWIZZLE

// ------------------------------ ExtractLane
// Returns lane i; goes through memory because i is not a compile-time const.
template <typename T>
HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

// ------------------------------ InsertLane
// Returns v with lane i replaced by t (via a memory round-trip).
template <typename T>
HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
  DFromV<decltype(v)> d;
  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

// ------------------------------ ExtractBlock
template <int kBlockIdx, class T>
HWY_API Vec128<T> ExtractBlock(Vec256<T> v) {
  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
  return (kBlockIdx == 0) ? v.v0 : v.v1;
}

// ------------------------------ InsertBlock
template <int kBlockIdx, class T>
HWY_API Vec256<T> InsertBlock(Vec256<T> v, Vec128<T> blk_to_insert) {
  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
  Vec256<T> result;
  if (kBlockIdx == 0) {
    result.v0 = blk_to_insert;
    result.v1 = v.v1;
  } else {
    result.v0 = v.v0;
    result.v1 = blk_to_insert;
  }
  return result;
}

// ------------------------------ BroadcastBlock
// Duplicates the selected 128-bit block into both halves.
template <int kBlockIdx, class T>
HWY_API Vec256<T> BroadcastBlock(Vec256<T> v) {
  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");
  Vec256<T> result;
  result.v0 = result.v1 = (kBlockIdx == 0 ? v.v0 : v.v1);
  return result;
}

// ------------------------------ LowerHalf

template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> LowerHalf(D /* tag */, Vec256<T> v) {
  return v.v0;
}

template <typename T>
HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
  return v.v0;
}

// ------------------------------ GetLane (LowerHalf)
template <typename T>
HWY_API T GetLane(const Vec256<T> v) {
  return GetLane(LowerHalf(v));
}

// ------------------------------ ShiftLeftBytes

// Byte-shift within each 128-bit block (does not cross the block boundary).
template <int kBytes, class D, typename T = TFromD<D>>
HWY_API Vec256<T> ShiftLeftBytes(D d, Vec256<T> v) {
  const Half<decltype(d)> dh;
  v.v0 = ShiftLeftBytes<kBytes>(dh, v.v0);
  v.v1 = ShiftLeftBytes<kBytes>(dh, v.v1);
  return v;
}

template <int kBytes, typename T>
HWY_API Vec256<T> ShiftLeftBytes(Vec256<T> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftLeftLanes

template <int kLanes, class D, typename T = TFromD<D>>
HWY_API Vec256<T> ShiftLeftLanes(D d, const Vec256<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

template <int kLanes, typename T>
HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftRightBytes
// Byte-shift within each 128-bit block (does not cross the block boundary).
template <int kBytes, class D, typename T = TFromD<D>>
HWY_API Vec256<T> ShiftRightBytes(D d, Vec256<T> v) {
  const Half<decltype(d)> dh;
  v.v0 = ShiftRightBytes<kBytes>(dh, v.v0);
  v.v1 = ShiftRightBytes<kBytes>(dh, v.v1);
  return v;
}

// ------------------------------ ShiftRightLanes
template <int kLanes, class D, typename T = TFromD<D>>
HWY_API Vec256<T> ShiftRightLanes(D d, const Vec256<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
}

// ------------------------------ UpperHalf (ShiftRightBytes)
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> UpperHalf(D /* tag */, const Vec256<T> v) {
  return v.v1;
}

// ------------------------------ CombineShiftRightBytes

// Per 128-bit block: bytes of hi:lo shifted right by kBytes.
template <int kBytes, class D, typename T = TFromD<D>>
HWY_API Vec256<T> CombineShiftRightBytes(D d, Vec256<T> hi, Vec256<T> lo) {
  const Half<decltype(d)> dh;
  hi.v0 = CombineShiftRightBytes<kBytes>(dh, hi.v0, lo.v0);
  hi.v1 = CombineShiftRightBytes<kBytes>(dh, hi.v1, lo.v1);
  return hi;
}

// ------------------------------ Broadcast/splat any lane

// Broadcasts lane kLane of each 128-bit block within that block.
template <int kLane, typename T>
HWY_API Vec256<T> Broadcast(const Vec256<T> v) {
  Vec256<T> ret;
  ret.v0 = Broadcast<kLane>(v.v0);
  ret.v1 = Broadcast<kLane>(v.v1);
  return ret;
}

// Broadcasts lane kLane of the full 256-bit vector to all lanes.
template <int kLane, typename T>
HWY_API Vec256<T> BroadcastLane(const Vec256<T> v) {
  constexpr int kLanesPerBlock = static_cast<int>(16 / sizeof(T));
  static_assert(0 <= kLane && kLane < kLanesPerBlock * 2, "Invalid lane");
  constexpr int kLaneInBlkIdx = kLane & (kLanesPerBlock - 1);
  Vec256<T> ret;
  ret.v0 = ret.v1 =
      Broadcast<kLaneInBlkIdx>(kLane >= kLanesPerBlock ? v.v1 : v.v0);
  return ret;
}

// ------------------------------ TableLookupBytes

// Both full
template <typename T, typename TI>
HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes, Vec256<TI> from) {
  from.v0 = TableLookupBytes(bytes.v0, from.v0);
  from.v1 = TableLookupBytes(bytes.v1, from.v1);
  return from;
}

// Partial index vector
template <typename T, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec256<T> bytes,
                                        const Vec128<TI, NI> from) {
  // First expand to full 128, then 256.
  const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw});
  const auto tbl_full = TableLookupBytes(bytes, from_256);
  // Shrink to 128, then partial.
  return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw};
}

// Partial table vector
template <typename T, size_t N, typename TI>
HWY_API Vec256<TI> TableLookupBytes(Vec128<T, N> bytes, const Vec256<TI> from) {
  // First expand to full 128, then 256.
  const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw});
  return TableLookupBytes(bytes_256, from);
}

// Partial both are handled by wasm_128.

template <class V, class VI>
HWY_API VI TableLookupBytesOr0(V bytes, VI from) {
  // wasm out-of-bounds policy already zeros, so TableLookupBytes is fine.
  return TableLookupBytes(bytes, from);
}

// ------------------------------ Hard-coded shuffles

template <typename T>
HWY_API Vec256<T> Shuffle01(Vec256<T> v) {
  v.v0 = Shuffle01(v.v0);
  v.v1 = Shuffle01(v.v1);
  return v;
}

template <typename T>
HWY_API Vec256<T> Shuffle2301(Vec256<T> v) {
  v.v0 = Shuffle2301(v.v0);
  v.v1 = Shuffle2301(v.v1);
  return v;
}

template <typename T>
HWY_API Vec256<T> Shuffle1032(Vec256<T> v) {
  v.v0 = Shuffle1032(v.v0);
  v.v1 = Shuffle1032(v.v1);
  return v;
}

template <typename T>
HWY_API Vec256<T> Shuffle0321(Vec256<T> v) {
  v.v0 = Shuffle0321(v.v0);
  v.v1 = Shuffle0321(v.v1);
  return v;
}

template <typename T>
HWY_API Vec256<T> Shuffle2103(Vec256<T> v) {
  v.v0 = Shuffle2103(v.v0);
  v.v1 = Shuffle2103(v.v1);
  return v;
}

template <typename T>
HWY_API Vec256<T> Shuffle0123(Vec256<T> v) {
  v.v0 = Shuffle0123(v.v0);
  v.v1 = Shuffle0123(v.v1);
  return v;
}

// Used by generic_ops-inl.h
namespace detail {
1063 template <typename T, HWY_IF_T_SIZE(T, 4)> 1064 HWY_API Vec256<T> ShuffleTwo2301(Vec256<T> a, const Vec256<T> b) { 1065 a.v0 = ShuffleTwo2301(a.v0, b.v0); 1066 a.v1 = ShuffleTwo2301(a.v1, b.v1); 1067 return a; 1068 } 1069 template <typename T, HWY_IF_T_SIZE(T, 4)> 1070 HWY_API Vec256<T> ShuffleTwo1230(Vec256<T> a, const Vec256<T> b) { 1071 a.v0 = ShuffleTwo1230(a.v0, b.v0); 1072 a.v1 = ShuffleTwo1230(a.v1, b.v1); 1073 return a; 1074 } 1075 template <typename T, HWY_IF_T_SIZE(T, 4)> 1076 HWY_API Vec256<T> ShuffleTwo3012(Vec256<T> a, const Vec256<T> b) { 1077 a.v0 = ShuffleTwo3012(a.v0, b.v0); 1078 a.v1 = ShuffleTwo3012(a.v1, b.v1); 1079 return a; 1080 } 1081 1082 } // namespace detail 1083 1084 // ------------------------------ TableLookupLanes 1085 1086 // Returned by SetTableIndices for use by TableLookupLanes. 1087 template <typename T> 1088 struct Indices256 { 1089 __v128_u i0; 1090 __v128_u i1; 1091 }; 1092 1093 template <class D, typename T = TFromD<D>, typename TI> 1094 HWY_API Indices256<T> IndicesFromVec(D /* tag */, Vec256<TI> vec) { 1095 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); 1096 Indices256<T> ret; 1097 ret.i0 = vec.v0.raw; 1098 ret.i1 = vec.v1.raw; 1099 return ret; 1100 } 1101 1102 template <class D, HWY_IF_V_SIZE_D(D, 32), typename TI> 1103 HWY_API Indices256<TFromD<D>> SetTableIndices(D d, const TI* idx) { 1104 const Rebind<TI, decltype(d)> di; 1105 return IndicesFromVec(d, LoadU(di, idx)); 1106 } 1107 1108 template <typename T> 1109 HWY_API Vec256<T> TableLookupLanes(const Vec256<T> v, Indices256<T> idx) { 1110 const DFromV<decltype(v)> d; 1111 const Half<decltype(d)> dh; 1112 const auto idx_i0 = IndicesFromVec(dh, Vec128<T>{idx.i0}); 1113 const auto idx_i1 = IndicesFromVec(dh, Vec128<T>{idx.i1}); 1114 1115 Vec256<T> result; 1116 result.v0 = TwoTablesLookupLanes(v.v0, v.v1, idx_i0); 1117 result.v1 = TwoTablesLookupLanes(v.v0, v.v1, idx_i1); 1118 return result; 1119 } 1120 1121 template <typename T> 1122 HWY_API 
Vec256<T> TableLookupLanesOr0(Vec256<T> v, Indices256<T> idx) { 1123 // The out of bounds behavior will already zero lanes. 1124 return TableLookupLanesOr0(v, idx); 1125 } 1126 1127 template <typename T> 1128 HWY_API Vec256<T> TwoTablesLookupLanes(const Vec256<T> a, const Vec256<T> b, 1129 Indices256<T> idx) { 1130 const DFromV<decltype(a)> d; 1131 const Half<decltype(d)> dh; 1132 const RebindToUnsigned<decltype(d)> du; 1133 using TU = MakeUnsigned<T>; 1134 constexpr size_t kLanesPerVect = 32 / sizeof(TU); 1135 1136 Vec256<TU> vi; 1137 vi.v0 = Vec128<TU>{idx.i0}; 1138 vi.v1 = Vec128<TU>{idx.i1}; 1139 const auto vmod = vi & Set(du, TU{kLanesPerVect - 1}); 1140 const auto is_lo = RebindMask(d, vi == vmod); 1141 1142 const auto idx_i0 = IndicesFromVec(dh, vmod.v0); 1143 const auto idx_i1 = IndicesFromVec(dh, vmod.v1); 1144 1145 Vec256<T> result_lo; 1146 Vec256<T> result_hi; 1147 result_lo.v0 = TwoTablesLookupLanes(a.v0, a.v1, idx_i0); 1148 result_lo.v1 = TwoTablesLookupLanes(a.v0, a.v1, idx_i1); 1149 result_hi.v0 = TwoTablesLookupLanes(b.v0, b.v1, idx_i0); 1150 result_hi.v1 = TwoTablesLookupLanes(b.v0, b.v1, idx_i1); 1151 return IfThenElse(is_lo, result_lo, result_hi); 1152 } 1153 1154 // ------------------------------ Reverse 1155 template <class D, typename T = TFromD<D>> 1156 HWY_API Vec256<T> Reverse(D d, const Vec256<T> v) { 1157 const Half<decltype(d)> dh; 1158 Vec256<T> ret; 1159 ret.v1 = Reverse(dh, v.v0); // note reversed v1 member order 1160 ret.v0 = Reverse(dh, v.v1); 1161 return ret; 1162 } 1163 1164 // ------------------------------ Reverse2 1165 template <class D, typename T = TFromD<D>> 1166 HWY_API Vec256<T> Reverse2(D d, Vec256<T> v) { 1167 const Half<decltype(d)> dh; 1168 v.v0 = Reverse2(dh, v.v0); 1169 v.v1 = Reverse2(dh, v.v1); 1170 return v; 1171 } 1172 1173 // ------------------------------ Reverse4 1174 1175 // Each block has only 2 lanes, so swap blocks and their lanes. 
1176 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> 1177 HWY_API Vec256<T> Reverse4(D d, const Vec256<T> v) { 1178 const Half<decltype(d)> dh; 1179 Vec256<T> ret; 1180 ret.v0 = Reverse2(dh, v.v1); // swapped 1181 ret.v1 = Reverse2(dh, v.v0); 1182 return ret; 1183 } 1184 1185 template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE(T, 8)> 1186 HWY_API Vec256<T> Reverse4(D d, Vec256<T> v) { 1187 const Half<decltype(d)> dh; 1188 v.v0 = Reverse4(dh, v.v0); 1189 v.v1 = Reverse4(dh, v.v1); 1190 return v; 1191 } 1192 1193 // ------------------------------ Reverse8 1194 1195 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)> 1196 HWY_API Vec256<T> Reverse8(D /* tag */, Vec256<T> /* v */) { 1197 HWY_ASSERT(0); // don't have 8 u64 lanes 1198 } 1199 1200 // Each block has only 4 lanes, so swap blocks and their lanes. 1201 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> 1202 HWY_API Vec256<T> Reverse8(D d, const Vec256<T> v) { 1203 const Half<decltype(d)> dh; 1204 Vec256<T> ret; 1205 ret.v0 = Reverse4(dh, v.v1); // swapped 1206 ret.v1 = Reverse4(dh, v.v0); 1207 return ret; 1208 } 1209 1210 template <class D, typename T = TFromD<D>, 1211 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))> 1212 HWY_API Vec256<T> Reverse8(D d, Vec256<T> v) { 1213 const Half<decltype(d)> dh; 1214 v.v0 = Reverse8(dh, v.v0); 1215 v.v1 = Reverse8(dh, v.v1); 1216 return v; 1217 } 1218 1219 // ------------------------------ InterleaveLower 1220 1221 template <typename T> 1222 HWY_API Vec256<T> InterleaveLower(Vec256<T> a, Vec256<T> b) { 1223 a.v0 = InterleaveLower(a.v0, b.v0); 1224 a.v1 = InterleaveLower(a.v1, b.v1); 1225 return a; 1226 } 1227 1228 // wasm_128 already defines a template with D, V, V args. 
1229 1230 // ------------------------------ InterleaveUpper (UpperHalf) 1231 1232 template <class D, typename T = TFromD<D>> 1233 HWY_API Vec256<T> InterleaveUpper(D d, Vec256<T> a, Vec256<T> b) { 1234 const Half<decltype(d)> dh; 1235 a.v0 = InterleaveUpper(dh, a.v0, b.v0); 1236 a.v1 = InterleaveUpper(dh, a.v1, b.v1); 1237 return a; 1238 } 1239 1240 // ------------------------------ InterleaveWholeLower 1241 template <class D, HWY_IF_V_SIZE_D(D, 32)> 1242 HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) { 1243 const Half<decltype(d)> dh; 1244 VFromD<D> ret; 1245 ret.v0 = InterleaveLower(a.v0, b.v0); 1246 ret.v1 = InterleaveUpper(dh, a.v0, b.v0); 1247 return ret; 1248 } 1249 1250 // ------------------------------ InterleaveWholeUpper 1251 template <class D, HWY_IF_V_SIZE_D(D, 32)> 1252 HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) { 1253 const Half<decltype(d)> dh; 1254 VFromD<D> ret; 1255 ret.v0 = InterleaveLower(a.v1, b.v1); 1256 ret.v1 = InterleaveUpper(dh, a.v1, b.v1); 1257 return ret; 1258 } 1259 1260 // ------------------------------ ZipLower/ZipUpper defined in wasm_128 1261 1262 // ================================================== COMBINE 1263 1264 // ------------------------------ Combine (InterleaveLower) 1265 template <class D, typename T = TFromD<D>> 1266 HWY_API Vec256<T> Combine(D /* d */, Vec128<T> hi, Vec128<T> lo) { 1267 Vec256<T> ret; 1268 ret.v1 = hi; 1269 ret.v0 = lo; 1270 return ret; 1271 } 1272 1273 // ------------------------------ ZeroExtendVector (Combine) 1274 template <class D, typename T = TFromD<D>> 1275 HWY_API Vec256<T> ZeroExtendVector(D d, Vec128<T> lo) { 1276 const Half<decltype(d)> dh; 1277 return Combine(d, Zero(dh), lo); 1278 } 1279 1280 // ------------------------------ ZeroExtendResizeBitCast 1281 1282 namespace detail { 1283 1284 template <size_t kFromVectSize, class DTo, class DFrom, 1285 HWY_IF_LANES_LE(kFromVectSize, 8)> 1286 HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast( 1287 
hwy::SizeTag<kFromVectSize> /* from_size_tag */, 1288 hwy::SizeTag<32> /* to_size_tag */, DTo d_to, DFrom d_from, 1289 VFromD<DFrom> v) { 1290 const Half<decltype(d_to)> dh_to; 1291 return ZeroExtendVector(d_to, ZeroExtendResizeBitCast(dh_to, d_from, v)); 1292 } 1293 1294 } // namespace detail 1295 1296 // ------------------------------ ConcatLowerLower 1297 template <class D, typename T = TFromD<D>> 1298 HWY_API Vec256<T> ConcatLowerLower(D /* tag */, Vec256<T> hi, Vec256<T> lo) { 1299 Vec256<T> ret; 1300 ret.v1 = hi.v0; 1301 ret.v0 = lo.v0; 1302 return ret; 1303 } 1304 1305 // ------------------------------ ConcatUpperUpper 1306 template <class D, typename T = TFromD<D>> 1307 HWY_API Vec256<T> ConcatUpperUpper(D /* tag */, Vec256<T> hi, Vec256<T> lo) { 1308 Vec256<T> ret; 1309 ret.v1 = hi.v1; 1310 ret.v0 = lo.v1; 1311 return ret; 1312 } 1313 1314 // ------------------------------ ConcatLowerUpper 1315 template <class D, typename T = TFromD<D>> 1316 HWY_API Vec256<T> ConcatLowerUpper(D /* tag */, Vec256<T> hi, Vec256<T> lo) { 1317 Vec256<T> ret; 1318 ret.v1 = hi.v0; 1319 ret.v0 = lo.v1; 1320 return ret; 1321 } 1322 1323 // ------------------------------ ConcatUpperLower 1324 template <class D, typename T = TFromD<D>> 1325 HWY_API Vec256<T> ConcatUpperLower(D /* tag */, Vec256<T> hi, Vec256<T> lo) { 1326 Vec256<T> ret; 1327 ret.v1 = hi.v1; 1328 ret.v0 = lo.v0; 1329 return ret; 1330 } 1331 1332 // ------------------------------ ConcatOdd 1333 template <class D, typename T = TFromD<D>> 1334 HWY_API Vec256<T> ConcatOdd(D d, Vec256<T> hi, Vec256<T> lo) { 1335 const Half<decltype(d)> dh; 1336 Vec256<T> ret; 1337 ret.v0 = ConcatOdd(dh, lo.v1, lo.v0); 1338 ret.v1 = ConcatOdd(dh, hi.v1, hi.v0); 1339 return ret; 1340 } 1341 1342 // ------------------------------ ConcatEven 1343 template <class D, typename T = TFromD<D>> 1344 HWY_API Vec256<T> ConcatEven(D d, Vec256<T> hi, Vec256<T> lo) { 1345 const Half<decltype(d)> dh; 1346 Vec256<T> ret; 1347 ret.v0 = ConcatEven(dh, 
lo.v1, lo.v0); 1348 ret.v1 = ConcatEven(dh, hi.v1, hi.v0); 1349 return ret; 1350 } 1351 1352 // ------------------------------ DupEven 1353 template <typename T> 1354 HWY_API Vec256<T> DupEven(Vec256<T> v) { 1355 v.v0 = DupEven(v.v0); 1356 v.v1 = DupEven(v.v1); 1357 return v; 1358 } 1359 1360 // ------------------------------ DupOdd 1361 template <typename T> 1362 HWY_API Vec256<T> DupOdd(Vec256<T> v) { 1363 v.v0 = DupOdd(v.v0); 1364 v.v1 = DupOdd(v.v1); 1365 return v; 1366 } 1367 1368 // ------------------------------ OddEven 1369 template <typename T> 1370 HWY_API Vec256<T> OddEven(Vec256<T> a, const Vec256<T> b) { 1371 a.v0 = OddEven(a.v0, b.v0); 1372 a.v1 = OddEven(a.v1, b.v1); 1373 return a; 1374 } 1375 1376 // ------------------------------ InterleaveEven 1377 template <class D, HWY_IF_V_SIZE_D(D, 32)> 1378 HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) { 1379 const Half<decltype(d)> dh; 1380 a.v0 = InterleaveEven(dh, a.v0, b.v0); 1381 a.v1 = InterleaveEven(dh, a.v1, b.v1); 1382 return a; 1383 } 1384 1385 // ------------------------------ InterleaveOdd 1386 template <class D, HWY_IF_V_SIZE_D(D, 32)> 1387 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { 1388 const Half<decltype(d)> dh; 1389 a.v0 = InterleaveOdd(dh, a.v0, b.v0); 1390 a.v1 = InterleaveOdd(dh, a.v1, b.v1); 1391 return a; 1392 } 1393 1394 // ------------------------------ OddEvenBlocks 1395 template <typename T> 1396 HWY_API Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) { 1397 odd.v0 = even.v0; 1398 return odd; 1399 } 1400 1401 // ------------------------------ SwapAdjacentBlocks 1402 template <typename T> 1403 HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) { 1404 Vec256<T> ret; 1405 ret.v0 = v.v1; // swapped order 1406 ret.v1 = v.v0; 1407 return ret; 1408 } 1409 1410 // ------------------------------ InterleaveEvenBlocks 1411 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)> 1412 HWY_API V InterleaveEvenBlocks(D, V a, V b) { 1413 V ret; 
1414 ret.v0 = a.v0; 1415 ret.v1 = b.v0; 1416 return ret; 1417 } 1418 // ------------------------------ InterleaveOddBlocks 1419 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_D(D, 32)> 1420 HWY_API V InterleaveOddBlocks(D, V a, V b) { 1421 V ret; 1422 ret.v0 = a.v1; 1423 ret.v1 = b.v1; 1424 return ret; 1425 } 1426 1427 // ------------------------------ ReverseBlocks 1428 template <class D, typename T = TFromD<D>> 1429 HWY_API Vec256<T> ReverseBlocks(D /* tag */, const Vec256<T> v) { 1430 return SwapAdjacentBlocks(v); // 2 blocks, so Swap = Reverse 1431 } 1432 1433 // ------------------------------ Per4LaneBlockShuffle 1434 namespace detail { 1435 1436 template <size_t kIdx3210, class V> 1437 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, 1438 hwy::SizeTag<1> /*lane_size_tag*/, 1439 hwy::SizeTag<32> /*vect_size_tag*/, V v) { 1440 const DFromV<decltype(v)> d; 1441 const Half<decltype(d)> dh; 1442 using VH = VFromD<decltype(dh)>; 1443 1444 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3); 1445 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3); 1446 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3); 1447 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3); 1448 1449 V ret; 1450 ret.v0 = VH{wasm_i8x16_shuffle( 1451 v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3, kIdx0 + 4, kIdx1 + 4, 1452 kIdx2 + 4, kIdx3 + 4, kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8, 1453 kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)}; 1454 ret.v1 = VH{wasm_i8x16_shuffle( 1455 v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3, kIdx0 + 4, kIdx1 + 4, 1456 kIdx2 + 4, kIdx3 + 4, kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8, 1457 kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)}; 1458 return ret; 1459 } 1460 1461 template <size_t kIdx3210, class V> 1462 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, 1463 hwy::SizeTag<2> /*lane_size_tag*/, 1464 hwy::SizeTag<32> /*vect_size_tag*/, V v) { 1465 const DFromV<decltype(v)> 
d; 1466 const Half<decltype(d)> dh; 1467 using VH = VFromD<decltype(dh)>; 1468 1469 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3); 1470 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3); 1471 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3); 1472 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3); 1473 1474 V ret; 1475 ret.v0 = VH{wasm_i16x8_shuffle(v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3, 1476 kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)}; 1477 ret.v1 = VH{wasm_i16x8_shuffle(v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3, 1478 kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)}; 1479 return ret; 1480 } 1481 1482 template <size_t kIdx3210, class V> 1483 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, 1484 hwy::SizeTag<4> /*lane_size_tag*/, 1485 hwy::SizeTag<32> /*vect_size_tag*/, V v) { 1486 const DFromV<decltype(v)> d; 1487 const Half<decltype(d)> dh; 1488 using VH = VFromD<decltype(dh)>; 1489 1490 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3); 1491 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3); 1492 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3); 1493 constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3); 1494 1495 V ret; 1496 ret.v0 = 1497 VH{wasm_i32x4_shuffle(v.v0.raw, v.v0.raw, kIdx0, kIdx1, kIdx2, kIdx3)}; 1498 ret.v1 = 1499 VH{wasm_i32x4_shuffle(v.v1.raw, v.v1.raw, kIdx0, kIdx1, kIdx2, kIdx3)}; 1500 return ret; 1501 } 1502 1503 template <size_t kIdx3210, class V> 1504 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/, 1505 hwy::SizeTag<8> /*lane_size_tag*/, 1506 hwy::SizeTag<32> /*vect_size_tag*/, V v) { 1507 const DFromV<decltype(v)> d; 1508 const Half<decltype(d)> dh; 1509 using VH = VFromD<decltype(dh)>; 1510 1511 constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3); 1512 constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3); 1513 constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3); 1514 constexpr int kIdx0 = 
static_cast<int>(kIdx3210 & 3); 1515 1516 V ret; 1517 ret.v0 = VH{wasm_i64x2_shuffle(v.v0.raw, v.v1.raw, kIdx0, kIdx1)}; 1518 ret.v1 = VH{wasm_i64x2_shuffle(v.v0.raw, v.v1.raw, kIdx2, kIdx3)}; 1519 return ret; 1520 } 1521 1522 } // namespace detail 1523 1524 // ------------------------------ SlideUpBlocks 1525 template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)> 1526 HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) { 1527 static_assert(0 <= kBlocks && kBlocks <= 1, 1528 "kBlocks must be between 0 and 1"); 1529 return (kBlocks == 1) ? ConcatLowerLower(d, v, Zero(d)) : v; 1530 } 1531 1532 // ------------------------------ SlideDownBlocks 1533 template <int kBlocks, class D, HWY_IF_V_SIZE_D(D, 32)> 1534 HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) { 1535 static_assert(0 <= kBlocks && kBlocks <= 1, 1536 "kBlocks must be between 0 and 1"); 1537 const Half<decltype(d)> dh; 1538 return (kBlocks == 1) ? ZeroExtendVector(d, UpperHalf(dh, v)) : v; 1539 } 1540 1541 // ------------------------------ SlideUpLanes 1542 1543 template <class D, HWY_IF_V_SIZE_D(D, 32)> 1544 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 1545 const Half<decltype(d)> dh; 1546 const RebindToUnsigned<decltype(d)> du; 1547 const RebindToUnsigned<decltype(dh)> dh_u; 1548 const auto vu = BitCast(du, v); 1549 VFromD<D> ret; 1550 1551 #if !HWY_IS_DEBUG_BUILD 1552 constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>); 1553 if (__builtin_constant_p(amt) && amt < kLanesPerBlock) { 1554 switch (amt * sizeof(TFromD<D>)) { 1555 case 0: 1556 return v; 1557 case 1: 1558 ret.v0 = BitCast(dh, ShiftLeftBytes<1>(dh_u, vu.v0)); 1559 ret.v1 = BitCast(dh, CombineShiftRightBytes<15>(dh_u, vu.v1, vu.v0)); 1560 return ret; 1561 case 2: 1562 ret.v0 = BitCast(dh, ShiftLeftBytes<2>(dh_u, vu.v0)); 1563 ret.v1 = BitCast(dh, CombineShiftRightBytes<14>(dh_u, vu.v1, vu.v0)); 1564 return ret; 1565 case 3: 1566 ret.v0 = BitCast(dh, ShiftLeftBytes<3>(dh_u, vu.v0)); 1567 ret.v1 = BitCast(dh, 
CombineShiftRightBytes<13>(dh_u, vu.v1, vu.v0)); 1568 return ret; 1569 case 4: 1570 ret.v0 = BitCast(dh, ShiftLeftBytes<4>(dh_u, vu.v0)); 1571 ret.v1 = BitCast(dh, CombineShiftRightBytes<12>(dh_u, vu.v1, vu.v0)); 1572 return ret; 1573 case 5: 1574 ret.v0 = BitCast(dh, ShiftLeftBytes<5>(dh_u, vu.v0)); 1575 ret.v1 = BitCast(dh, CombineShiftRightBytes<11>(dh_u, vu.v1, vu.v0)); 1576 return ret; 1577 case 6: 1578 ret.v0 = BitCast(dh, ShiftLeftBytes<6>(dh_u, vu.v0)); 1579 ret.v1 = BitCast(dh, CombineShiftRightBytes<10>(dh_u, vu.v1, vu.v0)); 1580 return ret; 1581 case 7: 1582 ret.v0 = BitCast(dh, ShiftLeftBytes<7>(dh_u, vu.v0)); 1583 ret.v1 = BitCast(dh, CombineShiftRightBytes<9>(dh_u, vu.v1, vu.v0)); 1584 return ret; 1585 case 8: 1586 ret.v0 = BitCast(dh, ShiftLeftBytes<8>(dh_u, vu.v0)); 1587 ret.v1 = BitCast(dh, CombineShiftRightBytes<8>(dh_u, vu.v1, vu.v0)); 1588 return ret; 1589 case 9: 1590 ret.v0 = BitCast(dh, ShiftLeftBytes<9>(dh_u, vu.v0)); 1591 ret.v1 = BitCast(dh, CombineShiftRightBytes<7>(dh_u, vu.v1, vu.v0)); 1592 return ret; 1593 case 10: 1594 ret.v0 = BitCast(dh, ShiftLeftBytes<10>(dh_u, vu.v0)); 1595 ret.v1 = BitCast(dh, CombineShiftRightBytes<6>(dh_u, vu.v1, vu.v0)); 1596 return ret; 1597 case 11: 1598 ret.v0 = BitCast(dh, ShiftLeftBytes<11>(dh_u, vu.v0)); 1599 ret.v1 = BitCast(dh, CombineShiftRightBytes<5>(dh_u, vu.v1, vu.v0)); 1600 return ret; 1601 case 12: 1602 ret.v0 = BitCast(dh, ShiftLeftBytes<12>(dh_u, vu.v0)); 1603 ret.v1 = BitCast(dh, CombineShiftRightBytes<4>(dh_u, vu.v1, vu.v0)); 1604 return ret; 1605 case 13: 1606 ret.v0 = BitCast(dh, ShiftLeftBytes<13>(dh_u, vu.v0)); 1607 ret.v1 = BitCast(dh, CombineShiftRightBytes<3>(dh_u, vu.v1, vu.v0)); 1608 return ret; 1609 case 14: 1610 ret.v0 = BitCast(dh, ShiftLeftBytes<14>(dh_u, vu.v0)); 1611 ret.v1 = BitCast(dh, CombineShiftRightBytes<2>(dh_u, vu.v1, vu.v0)); 1612 return ret; 1613 case 15: 1614 ret.v0 = BitCast(dh, ShiftLeftBytes<15>(dh_u, vu.v0)); 1615 ret.v1 = BitCast(dh, 
CombineShiftRightBytes<1>(dh_u, vu.v1, vu.v0)); 1616 return ret; 1617 } 1618 } 1619 1620 if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) { 1621 ret.v0 = Zero(dh); 1622 ret.v1 = SlideUpLanes(dh, LowerHalf(dh, v), amt - kLanesPerBlock); 1623 return ret; 1624 } 1625 #endif 1626 1627 const Repartition<uint8_t, decltype(d)> du8; 1628 const RebindToSigned<decltype(du8)> di8; 1629 const Half<decltype(di8)> dh_i8; 1630 1631 const auto lo_byte_idx = BitCast( 1632 di8, 1633 Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromD<D>)))); 1634 1635 const auto hi_byte_idx = 1636 UpperHalf(dh_i8, lo_byte_idx) - Set(dh_i8, int8_t{16}); 1637 const auto hi_sel_mask = 1638 UpperHalf(dh_i8, lo_byte_idx) > Set(dh_i8, int8_t{15}); 1639 1640 ret = BitCast(d, 1641 TableLookupBytesOr0(ConcatLowerLower(du, vu, vu), lo_byte_idx)); 1642 ret.v1 = 1643 BitCast(dh, IfThenElse(hi_sel_mask, 1644 TableLookupBytes(UpperHalf(dh_u, vu), hi_byte_idx), 1645 BitCast(dh_i8, ret.v1))); 1646 return ret; 1647 } 1648 1649 // ------------------------------ Slide1Up 1650 template <typename D, HWY_IF_V_SIZE_D(D, 32)> 1651 HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) { 1652 VFromD<D> ret; 1653 const Half<decltype(d)> dh; 1654 constexpr int kShrByteAmt = static_cast<int>(16 - sizeof(TFromD<D>)); 1655 ret.v0 = ShiftLeftLanes<1>(dh, v.v0); 1656 ret.v1 = CombineShiftRightBytes<kShrByteAmt>(dh, v.v1, v.v0); 1657 return ret; 1658 } 1659 1660 // ------------------------------ SlideDownLanes 1661 1662 template <class D, HWY_IF_V_SIZE_D(D, 32)> 1663 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 1664 const Half<decltype(d)> dh; 1665 const RebindToUnsigned<decltype(d)> du; 1666 const RebindToUnsigned<decltype(dh)> dh_u; 1667 VFromD<D> ret; 1668 1669 const auto vu = BitCast(du, v); 1670 1671 #if !HWY_IS_DEBUG_BUILD 1672 constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>); 1673 if (__builtin_constant_p(amt) && amt < kLanesPerBlock) { 1674 switch (amt * 
sizeof(TFromD<D>)) { 1675 case 0: 1676 return v; 1677 case 1: 1678 ret.v0 = BitCast(dh, CombineShiftRightBytes<1>(dh_u, vu.v1, vu.v0)); 1679 ret.v1 = BitCast(dh, ShiftRightBytes<1>(dh_u, vu.v1)); 1680 return ret; 1681 case 2: 1682 ret.v0 = BitCast(dh, CombineShiftRightBytes<2>(dh_u, vu.v1, vu.v0)); 1683 ret.v1 = BitCast(dh, ShiftRightBytes<2>(dh_u, vu.v1)); 1684 return ret; 1685 case 3: 1686 ret.v0 = BitCast(dh, CombineShiftRightBytes<3>(dh_u, vu.v1, vu.v0)); 1687 ret.v1 = BitCast(dh, ShiftRightBytes<3>(dh_u, vu.v1)); 1688 return ret; 1689 case 4: 1690 ret.v0 = BitCast(dh, CombineShiftRightBytes<4>(dh_u, vu.v1, vu.v0)); 1691 ret.v1 = BitCast(dh, ShiftRightBytes<4>(dh_u, vu.v1)); 1692 return ret; 1693 case 5: 1694 ret.v0 = BitCast(dh, CombineShiftRightBytes<5>(dh_u, vu.v1, vu.v0)); 1695 ret.v1 = BitCast(dh, ShiftRightBytes<5>(dh_u, vu.v1)); 1696 return ret; 1697 case 6: 1698 ret.v0 = BitCast(dh, CombineShiftRightBytes<6>(dh_u, vu.v1, vu.v0)); 1699 ret.v1 = BitCast(dh, ShiftRightBytes<6>(dh_u, vu.v1)); 1700 return ret; 1701 case 7: 1702 ret.v0 = BitCast(dh, CombineShiftRightBytes<7>(dh_u, vu.v1, vu.v0)); 1703 ret.v1 = BitCast(dh, ShiftRightBytes<7>(dh_u, vu.v1)); 1704 return ret; 1705 case 8: 1706 ret.v0 = BitCast(dh, CombineShiftRightBytes<8>(dh_u, vu.v1, vu.v0)); 1707 ret.v1 = BitCast(dh, ShiftRightBytes<8>(dh_u, vu.v1)); 1708 return ret; 1709 case 9: 1710 ret.v0 = BitCast(dh, CombineShiftRightBytes<9>(dh_u, vu.v1, vu.v0)); 1711 ret.v1 = BitCast(dh, ShiftRightBytes<9>(dh_u, vu.v1)); 1712 return ret; 1713 case 10: 1714 ret.v0 = BitCast(dh, CombineShiftRightBytes<10>(dh_u, vu.v1, vu.v0)); 1715 ret.v1 = BitCast(dh, ShiftRightBytes<10>(dh_u, vu.v1)); 1716 return ret; 1717 case 11: 1718 ret.v0 = BitCast(dh, CombineShiftRightBytes<11>(dh_u, vu.v1, vu.v0)); 1719 ret.v1 = BitCast(dh, ShiftRightBytes<11>(dh_u, vu.v1)); 1720 return ret; 1721 case 12: 1722 ret.v0 = BitCast(dh, CombineShiftRightBytes<12>(dh_u, vu.v1, vu.v0)); 1723 ret.v1 = BitCast(dh, ShiftRightBytes<12>(dh_u, 
vu.v1)); 1724 return ret; 1725 case 13: 1726 ret.v0 = BitCast(dh, CombineShiftRightBytes<13>(dh_u, vu.v1, vu.v0)); 1727 ret.v1 = BitCast(dh, ShiftRightBytes<13>(dh_u, vu.v1)); 1728 return ret; 1729 case 14: 1730 ret.v0 = BitCast(dh, CombineShiftRightBytes<14>(dh_u, vu.v1, vu.v0)); 1731 ret.v1 = BitCast(dh, ShiftRightBytes<14>(dh_u, vu.v1)); 1732 return ret; 1733 case 15: 1734 ret.v0 = BitCast(dh, CombineShiftRightBytes<15>(dh_u, vu.v1, vu.v0)); 1735 ret.v1 = BitCast(dh, ShiftRightBytes<15>(dh_u, vu.v1)); 1736 return ret; 1737 } 1738 } 1739 1740 if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) { 1741 ret.v0 = SlideDownLanes(dh, UpperHalf(dh, v), amt - kLanesPerBlock); 1742 ret.v1 = Zero(dh); 1743 return ret; 1744 } 1745 #endif 1746 1747 const Repartition<uint8_t, decltype(d)> du8; 1748 const Half<decltype(du8)> dh_u8; 1749 1750 const auto lo_byte_idx = 1751 Iota(du8, static_cast<uint8_t>(amt * sizeof(TFromD<D>))); 1752 const auto u8_16 = Set(du8, uint8_t{16}); 1753 const auto hi_byte_idx = lo_byte_idx - u8_16; 1754 1755 const auto lo_sel_mask = 1756 LowerHalf(dh_u8, lo_byte_idx) < LowerHalf(dh_u8, u8_16); 1757 ret = BitCast(d, IfThenElseZero(hi_byte_idx < u8_16, 1758 TableLookupBytes(ConcatUpperUpper(du, vu, vu), 1759 hi_byte_idx))); 1760 ret.v0 = 1761 BitCast(dh, IfThenElse(lo_sel_mask, 1762 TableLookupBytes(LowerHalf(dh_u, vu), 1763 LowerHalf(dh_u8, lo_byte_idx)), 1764 BitCast(dh_u8, LowerHalf(dh, ret)))); 1765 return ret; 1766 } 1767 1768 // ------------------------------ Slide1Down 1769 template <typename D, HWY_IF_V_SIZE_D(D, 32)> 1770 HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) { 1771 VFromD<D> ret; 1772 const Half<decltype(d)> dh; 1773 constexpr int kShrByteAmt = static_cast<int>(sizeof(TFromD<D>)); 1774 ret.v0 = CombineShiftRightBytes<kShrByteAmt>(dh, v.v1, v.v0); 1775 ret.v1 = ShiftRightBytes<kShrByteAmt>(dh, v.v1); 1776 return ret; 1777 } 1778 1779 // ================================================== CONVERT 1780 1781 // 
------------------------------ PromoteTo 1782 1783 template <class D, HWY_IF_V_SIZE_D(D, 32), typename TN, 1784 HWY_IF_T_SIZE_D(D, sizeof(TN) * 2)> 1785 HWY_API VFromD<D> PromoteTo(D d, Vec128<TN> v) { 1786 const Half<decltype(d)> dh; 1787 VFromD<D> ret; 1788 // PromoteLowerTo is defined later in generic_ops-inl.h. 1789 ret.v0 = PromoteTo(dh, LowerHalf(v)); 1790 ret.v1 = PromoteUpperTo(dh, v); 1791 return ret; 1792 } 1793 1794 // 4x promotion: 8-bit to 32-bit or 16-bit to 64-bit 1795 template <class DW, HWY_IF_V_SIZE_D(DW, 32), 1796 HWY_IF_T_SIZE_ONE_OF_D(DW, (1 << 4) | (1 << 8)), 1797 HWY_IF_NOT_FLOAT_D(DW), typename TN, 1798 HWY_IF_T_SIZE_D(DW, sizeof(TN) * 4), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TN)> 1799 HWY_API Vec256<TFromD<DW>> PromoteTo(DW d, Vec64<TN> v) { 1800 const Half<decltype(d)> dh; 1801 // 16-bit lanes for UI8->UI32, 32-bit lanes for UI16->UI64 1802 const Rebind<MakeWide<TN>, decltype(d)> d2; 1803 const auto v_2x = PromoteTo(d2, v); 1804 Vec256<TFromD<DW>> ret; 1805 // PromoteLowerTo is defined later in generic_ops-inl.h. 1806 ret.v0 = PromoteTo(dh, LowerHalf(v_2x)); 1807 ret.v1 = PromoteUpperTo(dh, v_2x); 1808 return ret; 1809 } 1810 1811 // 8x promotion: 8-bit to 64-bit 1812 template <class DW, HWY_IF_V_SIZE_D(DW, 32), HWY_IF_T_SIZE_D(DW, 8), 1813 HWY_IF_NOT_FLOAT_D(DW), typename TN, HWY_IF_T_SIZE(TN, 1)> 1814 HWY_API Vec256<TFromD<DW>> PromoteTo(DW d, Vec32<TN> v) { 1815 const Half<decltype(d)> dh; 1816 const Repartition<MakeWide<MakeWide<TN>>, decltype(dh)> d4; // 32-bit lanes 1817 const auto v32 = PromoteTo(d4, v); 1818 Vec256<TFromD<DW>> ret; 1819 // PromoteLowerTo is defined later in generic_ops-inl.h. 1820 ret.v0 = PromoteTo(dh, LowerHalf(v32)); 1821 ret.v1 = PromoteUpperTo(dh, v32); 1822 return ret; 1823 } 1824 1825 // ------------------------------ PromoteUpperTo 1826 1827 // Not native, but still define this here because wasm_128 toggles 1828 // HWY_NATIVE_PROMOTE_UPPER_TO. 
1829 template <class D, class T> 1830 HWY_API VFromD<D> PromoteUpperTo(D d, Vec256<T> v) { 1831 // Lanes(d) may differ from Lanes(DFromV<decltype(v)>()). Use the lane type 1832 // from v because it cannot be deduced from D (could be either bf16 or f16). 1833 const Rebind<T, decltype(d)> dh; 1834 return PromoteTo(d, UpperHalf(dh, v)); 1835 } 1836 1837 // ------------------------------ DemoteTo 1838 1839 template <class D, HWY_IF_U16_D(D)> 1840 HWY_API Vec128<uint16_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { 1841 return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(v.v0.raw, v.v1.raw)}; 1842 } 1843 1844 template <class D, HWY_IF_I16_D(D)> 1845 HWY_API Vec128<int16_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { 1846 return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw)}; 1847 } 1848 1849 template <class D, HWY_IF_U8_D(D)> 1850 HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { 1851 const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw); 1852 return Vec64<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; 1853 } 1854 1855 template <class D, HWY_IF_U8_D(D)> 1856 HWY_API Vec128<uint8_t> DemoteTo(D /* tag */, Vec256<int16_t> v) { 1857 return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(v.v0.raw, v.v1.raw)}; 1858 } 1859 1860 template <class D, HWY_IF_I8_D(D)> 1861 HWY_API Vec64<int8_t> DemoteTo(D /* tag */, Vec256<int32_t> v) { 1862 const auto intermediate = wasm_i16x8_narrow_i32x4(v.v0.raw, v.v1.raw); 1863 return Vec64<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; 1864 } 1865 1866 template <class D, HWY_IF_I8_D(D)> 1867 HWY_API Vec128<int8_t> DemoteTo(D /* tag */, Vec256<int16_t> v) { 1868 return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(v.v0.raw, v.v1.raw)}; 1869 } 1870 1871 template <class D, HWY_IF_I32_D(D)> 1872 HWY_API Vec128<int32_t> DemoteTo(D di, Vec256<double> v) { 1873 const Vec64<int32_t> lo{wasm_i32x4_trunc_sat_f64x2_zero(v.v0.raw)}; 1874 const Vec64<int32_t> 
hi{wasm_i32x4_trunc_sat_f64x2_zero(v.v1.raw)}; 1875 return Combine(di, hi, lo); 1876 } 1877 1878 template <class D, HWY_IF_U32_D(D)> 1879 HWY_API Vec128<uint32_t> DemoteTo(D di, Vec256<double> v) { 1880 const Vec64<uint32_t> lo{wasm_u32x4_trunc_sat_f64x2_zero(v.v0.raw)}; 1881 const Vec64<uint32_t> hi{wasm_u32x4_trunc_sat_f64x2_zero(v.v1.raw)}; 1882 return Combine(di, hi, lo); 1883 } 1884 1885 template <class D, HWY_IF_F32_D(D)> 1886 HWY_API Vec128<float> DemoteTo(D df, Vec256<int64_t> v) { 1887 const Vec64<float> lo = DemoteTo(Full64<float>(), v.v0); 1888 const Vec64<float> hi = DemoteTo(Full64<float>(), v.v1); 1889 return Combine(df, hi, lo); 1890 } 1891 1892 template <class D, HWY_IF_F32_D(D)> 1893 HWY_API Vec128<float> DemoteTo(D df, Vec256<uint64_t> v) { 1894 const Vec64<float> lo = DemoteTo(Full64<float>(), v.v0); 1895 const Vec64<float> hi = DemoteTo(Full64<float>(), v.v1); 1896 return Combine(df, hi, lo); 1897 } 1898 1899 template <class D, HWY_IF_F16_D(D)> 1900 HWY_API Vec128<float16_t> DemoteTo(D d16, Vec256<float> v) { 1901 const Half<decltype(d16)> d16h; 1902 const Vec64<float16_t> lo = DemoteTo(d16h, v.v0); 1903 const Vec64<float16_t> hi = DemoteTo(d16h, v.v1); 1904 return Combine(d16, hi, lo); 1905 } 1906 1907 template <class D, HWY_IF_F32_D(D)> 1908 HWY_API Vec128<float> DemoteTo(D df32, Vec256<double> v) { 1909 const Half<decltype(df32)> df32h; 1910 const Vec64<float> lo = DemoteTo(df32h, v.v0); 1911 const Vec64<float> hi = DemoteTo(df32h, v.v1); 1912 return Combine(df32, hi, lo); 1913 } 1914 1915 // For already range-limited input [0, 255]. 
// Narrows u32 lanes (caller guarantees values are within [0, 255]) to u8;
// eight u32 lanes yield eight u8 lanes, i.e. a 64-bit result.
HWY_API Vec64<uint8_t> U8FromU32(Vec256<uint32_t> v) {
  const Full64<uint8_t> du8;
  const Full256<int32_t> di32;  // no unsigned DemoteTo
  return DemoteTo(du8, BitCast(di32, v));
}

// ------------------------------ Truncations

// In wasm_i8x16_shuffle, byte indices 0..15 select from the first operand and
// 16..31 from the second, so each shuffle below gathers the low bytes of the
// wide lanes from both 128-bit halves in one instruction. Surplus output
// bytes (beyond the meaningful result width) repeat the pattern; callers only
// read the lower part.

template <class D, HWY_IF_U8_D(D)>
HWY_API Vec32<uint8_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  // Low byte of each of the 4 u64 lanes -> 4 meaningful u8 lanes.
  return Vec32<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24, 0,
                                           8, 16, 24, 0, 8, 16, 24, 0, 8, 16,
                                           24)};
}

template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  // Low two bytes of each of the 4 u64 lanes -> 4 meaningful u16 lanes.
  return Vec64<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9, 16,
                                            17, 24, 25, 0, 1, 8, 9, 16, 17, 24,
                                            25)};
}

template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
  // Low four bytes of each of the 4 u64 lanes -> full 128-bit result.
  return Vec128<uint32_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3, 8,
                                             9, 10, 11, 16, 17, 18, 19, 24, 25,
                                             26, 27)};
}

template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> TruncateTo(D /* tag */, Vec256<uint32_t> v) {
  // Low byte of each of the 8 u32 lanes -> 8 meaningful u8 lanes.
  return Vec64<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12, 16,
                                           20, 24, 28, 0, 4, 8, 12, 16, 20, 24,
                                           28)};
}

template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> TruncateTo(D /* tag */, Vec256<uint32_t> v) {
  // Low two bytes of each of the 8 u32 lanes -> full 128-bit result.
  return Vec128<uint16_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5, 8,
                                             9, 12, 13, 16, 17, 20, 21, 24, 25,
                                             28, 29)};
}

template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> TruncateTo(D /* tag */, Vec256<uint16_t> v) {
  // Low byte of each of the 16 u16 lanes -> full 128-bit result.
  return Vec128<uint8_t>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6, 8,
                                            10, 12, 14, 16, 18, 20, 22, 24, 26,
                                            28, 30)};
}

// ------------------------------ ReorderDemote2To

// Demotes a and b to half-width lanes; a fills the lower 128-bit half and b
// the upper. Signed inputs.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 32),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Half<decltype(dn)> dnh;
  VFromD<DN> demoted;
  demoted.v0 = DemoteTo(dnh, a);
  demoted.v1 = DemoteTo(dnh, b);
  return demoted;
}

// Same as above for unsigned inputs.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 32), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Half<decltype(dn)> dnh;
  VFromD<DN> demoted;
  demoted.v0 = DemoteTo(dnh, a);
  demoted.v1 = DemoteTo(dnh, b);
  return demoted;
}

// ------------------------------ Convert i32 <=> f32 (Round)

// Same-width conversion, performed independently on each 128-bit half.
template <class DTo, typename TFrom, typename TTo = TFromD<DTo>>
HWY_API Vec256<TTo> ConvertTo(DTo d, const Vec256<TFrom> v) {
  const Half<decltype(d)> dh;
  Vec256<TTo> ret;
  ret.v0 = ConvertTo(dh, v.v0);
  ret.v1 = ConvertTo(dh, v.v1);
  return ret;
}

// Round to nearest, then convert to the same-width signed integer type.
template <typename T, HWY_IF_FLOAT3264(T)>
HWY_API Vec256<MakeSigned<T>> NearestInt(const Vec256<T> v) {
  return ConvertTo(Full256<MakeSigned<T>>(), Round(v));
}

// ================================================== MISC

// ------------------------------ LoadMaskBits (TestBit)

// `p` points to at least 8 readable bytes, not all of which need be valid.
template <class D, HWY_IF_V_SIZE_D(D, 32),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  const Half<decltype(d)> dh;
  MFromD<D> ret;
  ret.m0 = LoadMaskBits(dh, bits);
  // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8.
  // Both halves fit in one byte's worth of mask bits, so shift the upper
  // half's bits down into a temporary byte and load from that.
  constexpr size_t kBitsPerHalf = 16 / sizeof(TFromD<D>);
  const uint8_t bits_upper[8] = {static_cast<uint8_t>(bits[0] >> kBitsPerHalf)};
  ret.m1 = LoadMaskBits(dh, bits_upper);
  return ret;
}

// Lane size 1 or 2: each half occupies at least one whole byte of mask bits,
// so the upper half can load directly at a byte offset.
template <class D, HWY_IF_V_SIZE_D(D, 32),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  const Half<decltype(d)> dh;
  MFromD<D> ret;
  ret.m0 = LoadMaskBits(dh, bits);
  constexpr size_t kLanesPerHalf = 16 / sizeof(TFromD<D>);
  constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
  static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes");
  ret.m1 = LoadMaskBits(dh, bits + kBytesPerHalf);
  return ret;
}

// Broadcasts the same 128-bit mask pattern to both halves.
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  const Half<decltype(d)> dh;
  MFromD<D> ret;
  ret.m0 = ret.m1 = Dup128MaskFromMaskBits(dh, mask_bits);
  return ret;
}

// ------------------------------ Mask

// `p` points to at least 8 writable bytes.
template <class D, typename T = TFromD<D>,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
HWY_API size_t StoreMaskBits(D d, const Mask256<T> mask, uint8_t* bits) {
  const Half<decltype(d)> dh;
  StoreMaskBits(dh, mask.m0, bits);
  const uint8_t lo = bits[0];  // save before overwriting with the upper half
  StoreMaskBits(dh, mask.m1, bits);
  // If size=4, one 128-bit vector has 4 mask bits; otherwise 2 for size=8.
  // Both halves fit in one byte's worth of mask bits.
  constexpr size_t kBitsPerHalf = 16 / sizeof(T);
  bits[0] = static_cast<uint8_t>(lo | (bits[0] << kBitsPerHalf));
  return (kBitsPerHalf * 2 + 7) / 8;  // number of bytes written (always 1)
}

// Lane size 1 or 2: each half writes at least one whole byte, so the halves
// can be stored to adjacent byte ranges without merging.
template <class D, typename T = TFromD<D>,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API size_t StoreMaskBits(D d, const Mask256<T> mask, uint8_t* bits) {
  const Half<decltype(d)> dh;
  constexpr size_t kLanesPerHalf = 16 / sizeof(T);
  constexpr size_t kBytesPerHalf = kLanesPerHalf / 8;
  static_assert(kBytesPerHalf != 0, "Lane size <= 16 bits => at least 8 lanes");
  StoreMaskBits(dh, mask.m0, bits);
  StoreMaskBits(dh, mask.m1, bits + kBytesPerHalf);
  return kBytesPerHalf * 2;
}

// Sum of set lanes in both halves.
template <class D, typename T = TFromD<D>>
HWY_API size_t CountTrue(D d, const Mask256<T> m) {
  const Half<decltype(d)> dh;
  return CountTrue(dh, m.m0) + CountTrue(dh, m.m1);
}

template <class D, typename T = TFromD<D>>
HWY_API bool AllFalse(D d, const Mask256<T> m) {
  const Half<decltype(d)> dh;
  return AllFalse(dh, m.m0) && AllFalse(dh, m.m1);
}

template <class D, typename T = TFromD<D>>
HWY_API bool AllTrue(D d, const Mask256<T> m) {
  const Half<decltype(d)> dh;
  return AllTrue(dh, m.m0) && AllTrue(dh, m.m1);
}

// Caller guarantees at least one lane is set, but it may be in either half,
// so the lower half must use the fallible FindFirstTrue.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownFirstTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  const intptr_t lo = FindFirstTrue(dh, mask.m0);  // not known
  constexpr size_t kLanesPerHalf = 16 / sizeof(T);
  return lo >= 0 ? static_cast<size_t>(lo)
                 : kLanesPerHalf + FindKnownFirstTrue(dh, mask.m1);
}

template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindFirstTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  const intptr_t lo = FindFirstTrue(dh, mask.m0);
  constexpr int kLanesPerHalf = 16 / sizeof(T);
  if (lo >= 0) return lo;

  const intptr_t hi = FindFirstTrue(dh, mask.m1);
  // Only offset if found; otherwise propagate -1 unchanged.
  return hi + (hi >= 0 ? kLanesPerHalf : 0);
}

// Mirror image of FindKnownFirstTrue: search the upper half first.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownLastTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  const intptr_t hi = FindLastTrue(dh, mask.m1);  // not known
  constexpr size_t kLanesPerHalf = 16 / sizeof(T);
  return hi >= 0 ? kLanesPerHalf + static_cast<size_t>(hi)
                 : FindKnownLastTrue(dh, mask.m0);
}

template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindLastTrue(D d, const Mask256<T> mask) {
  const Half<decltype(d)> dh;
  constexpr int kLanesPerHalf = 16 / sizeof(T);
  const intptr_t hi = FindLastTrue(dh, mask.m1);
  // If the upper half is empty, the lower half's result (or -1) is final.
  return hi >= 0 ? kLanesPerHalf + hi : FindLastTrue(dh, mask.m0);
}

// ------------------------------ CompressStore

// Stores the selected lanes of both halves contiguously; the upper half's
// output starts where the lower half's ended.
template <class D, typename T = TFromD<D>>
HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, D d,
                             T* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;
  const size_t count = CompressStore(v.v0, mask.m0, dh, unaligned);
  const size_t count2 = CompressStore(v.v1, mask.m1, dh, unaligned + count);
  return count + count2;
}

// ------------------------------ CompressBlendedStore
template <class D, typename T = TFromD<D>>
HWY_API size_t CompressBlendedStore(Vec256<T> v, const Mask256<T> m, D d,
                                    T* HWY_RESTRICT unaligned) {
  const Half<decltype(d)> dh;
  const size_t count = CompressBlendedStore(v.v0, m.m0, dh, unaligned);
  const size_t count2 = CompressBlendedStore(v.v1, m.m1, dh, unaligned + count);
  return count + count2;
}

// ------------------------------ CompressBitsStore

template <class D, typename T = TFromD<D>>
HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, T* HWY_RESTRICT unaligned) {
  const Mask256<T> m = LoadMaskBits(d, bits);
  return CompressStore(v, m, d, unaligned);
}

// ------------------------------ Compress

// Implemented via CompressStore into a zero-initialized stack buffer, which
// also zero-fills the lanes past the compressed prefix.
template <typename T>
HWY_API Vec256<T> Compress(const Vec256<T> v, const Mask256<T> mask) {
  const DFromV<decltype(v)> d;
  alignas(32) T lanes[32 / sizeof(T)] = {};
  (void)CompressStore(v, mask, d, lanes);
  return Load(d, lanes);
}

// ------------------------------ CompressNot
template <typename T>
HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {
  return Compress(v, Not(mask));
}

// ------------------------------ CompressBlocksNot
HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
                                           Mask256<uint64_t> mask) {
  const Full128<uint64_t> dh;
  // Because the non-selected (mask=1) blocks are undefined, we can return the
  // input unless mask = 01, in which case we must bring down the upper block.
  return AllTrue(dh, AndNot(mask.m1, mask.m0)) ? SwapAdjacentBlocks(v) : v;
}

// ------------------------------ CompressBits
template <typename T>
HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
  const Mask256<T> m = LoadMaskBits(DFromV<decltype(v)>(), bits);
  return Compress(v, m);
}

// ------------------------------ Expand

// Scatters the first CountTrue(mask) input lanes to the set positions of
// `mask`, in order.
template <typename T>
HWY_API Vec256<T> Expand(const Vec256<T> v, const Mask256<T> mask) {
  Vec256<T> ret;
  const Full256<T> d;
  const Half<decltype(d)> dh;
  alignas(32) T lanes[32 / sizeof(T)] = {};
  Store(v, d, lanes);
  ret.v0 = Expand(v.v0, mask.m0);
  // The upper half's sources begin after the lanes consumed by the lower
  // half, hence the CountTrue offset into the stack copy.
  ret.v1 = Expand(LoadU(dh, lanes + CountTrue(dh, mask.m0)), mask.m1);
  return ret;
}

// ------------------------------ LoadExpand
template <class D, HWY_IF_V_SIZE_D(D, 32)>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  return Expand(LoadU(d, unaligned), mask);
}

// ------------------------------ LoadInterleaved3/4

// Implemented in generic_ops, we just overload LoadTransposedBlocks3/4.
2214 2215 namespace detail { 2216 2217 // Input: 2218 // 1 0 (<- first block of unaligned) 2219 // 3 2 2220 // 5 4 2221 // Output: 2222 // 3 0 2223 // 4 1 2224 // 5 2 2225 template <class D, typename T = TFromD<D>> 2226 HWY_API void LoadTransposedBlocks3(D d, const T* HWY_RESTRICT unaligned, 2227 Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) { 2228 const Vec256<T> v10 = LoadU(d, unaligned + 0 * MaxLanes(d)); 2229 const Vec256<T> v32 = LoadU(d, unaligned + 1 * MaxLanes(d)); 2230 const Vec256<T> v54 = LoadU(d, unaligned + 2 * MaxLanes(d)); 2231 2232 A = ConcatUpperLower(d, v32, v10); 2233 B = ConcatLowerUpper(d, v54, v10); 2234 C = ConcatUpperLower(d, v54, v32); 2235 } 2236 2237 // Input (128-bit blocks): 2238 // 1 0 (first block of unaligned) 2239 // 3 2 2240 // 5 4 2241 // 7 6 2242 // Output: 2243 // 4 0 (LSB of A) 2244 // 5 1 2245 // 6 2 2246 // 7 3 2247 template <class D, typename T = TFromD<D>> 2248 HWY_API void LoadTransposedBlocks4(D d, const T* HWY_RESTRICT unaligned, 2249 Vec256<T>& vA, Vec256<T>& vB, Vec256<T>& vC, 2250 Vec256<T>& vD) { 2251 const Vec256<T> v10 = LoadU(d, unaligned + 0 * MaxLanes(d)); 2252 const Vec256<T> v32 = LoadU(d, unaligned + 1 * MaxLanes(d)); 2253 const Vec256<T> v54 = LoadU(d, unaligned + 2 * MaxLanes(d)); 2254 const Vec256<T> v76 = LoadU(d, unaligned + 3 * MaxLanes(d)); 2255 2256 vA = ConcatLowerLower(d, v54, v10); 2257 vB = ConcatUpperUpper(d, v54, v10); 2258 vC = ConcatLowerLower(d, v76, v32); 2259 vD = ConcatUpperUpper(d, v76, v32); 2260 } 2261 2262 } // namespace detail 2263 2264 // ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower) 2265 2266 // Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4. 
namespace detail {

// Input (128-bit blocks):
// 2 0 (LSB of i)
// 3 1
// Output:
// 1 0
// 3 2
template <class D, typename T = TFromD<D>>
HWY_API void StoreTransposedBlocks2(Vec256<T> i, Vec256<T> j, D d,
                                    T* HWY_RESTRICT unaligned) {
  const Vec256<T> out0 = ConcatLowerLower(d, j, i);  // 1 0
  const Vec256<T> out1 = ConcatUpperUpper(d, j, i);  // 3 2
  StoreU(out0, d, unaligned + 0 * MaxLanes(d));
  StoreU(out1, d, unaligned + 1 * MaxLanes(d));
}

// Input (128-bit blocks):
// 3 0 (LSB of i)
// 4 1
// 5 2
// Output:
// 1 0
// 3 2
// 5 4
template <class D, typename T = TFromD<D>>
HWY_API void StoreTransposedBlocks3(Vec256<T> i, Vec256<T> j, Vec256<T> k, D d,
                                    T* HWY_RESTRICT unaligned) {
  const Vec256<T> out0 = ConcatLowerLower(d, j, i);  // 1 0
  const Vec256<T> out1 = ConcatUpperLower(d, i, k);  // 3 2
  const Vec256<T> out2 = ConcatUpperUpper(d, k, j);  // 5 4
  StoreU(out0, d, unaligned + 0 * MaxLanes(d));
  StoreU(out1, d, unaligned + 1 * MaxLanes(d));
  StoreU(out2, d, unaligned + 2 * MaxLanes(d));
}

// Input (128-bit blocks):
// 4 0 (LSB of i)
// 5 1
// 6 2
// 7 3
// Output:
// 1 0
// 3 2
// 5 4
// 7 6
template <class D, typename T = TFromD<D>>
HWY_API void StoreTransposedBlocks4(Vec256<T> i, Vec256<T> j, Vec256<T> k,
                                    Vec256<T> l, D d,
                                    T* HWY_RESTRICT unaligned) {
  // Write lower halves, then upper.
  const Vec256<T> out0 = ConcatLowerLower(d, j, i);  // 1 0
  const Vec256<T> out1 = ConcatLowerLower(d, l, k);  // 3 2
  StoreU(out0, d, unaligned + 0 * MaxLanes(d));
  StoreU(out1, d, unaligned + 1 * MaxLanes(d));
  const Vec256<T> out2 = ConcatUpperUpper(d, j, i);  // 5 4
  const Vec256<T> out3 = ConcatUpperUpper(d, l, k);  // 7 6
  StoreU(out2, d, unaligned + 2 * MaxLanes(d));
  StoreU(out3, d, unaligned + 3 * MaxLanes(d));
}

}  // namespace detail

// ------------------------------ Additional mask logical operations

// Sets all lanes at and after the first set lane (across the full 256 bits).
template <class T>
HWY_API Mask256<T> SetAtOrAfterFirst(Mask256<T> mask) {
  const Full256<T> d;
  const Half<decltype(d)> dh;
  const Repartition<int64_t, decltype(dh)> dh_i64;

  // Per-half prefix fill; the lower half's result must then carry into the
  // upper half.
  Mask256<T> result;
  result.m0 = SetAtOrAfterFirst(mask.m0);
  result.m1 = SetAtOrAfterFirst(mask.m1);

  // Copy the sign bit of the lower 128-bit half to the upper 128-bit half:
  // if any lower lane was set, the topmost lower lane is set, and
  // broadcasting its sign bit forces the entire upper half on.
  const auto vmask_lo = BitCast(dh_i64, VecFromMask(dh, result.m0));
  result.m1 =
      Or(result.m1, MaskFromVec(BitCast(dh, BroadcastSignBit(InterleaveUpper(
                                                dh_i64, vmask_lo, vmask_lo)))));

  return result;
}

// Sets all lanes strictly before the first set lane.
template <class T>
HWY_API Mask256<T> SetBeforeFirst(Mask256<T> mask) {
  return Not(SetAtOrAfterFirst(mask));
}

// Sets only the first set lane (all others cleared).
template <class T>
HWY_API Mask256<T> SetOnlyFirst(Mask256<T> mask) {
  const Full256<T> d;
  const RebindToSigned<decltype(d)> di;
  const Repartition<int64_t, decltype(d)> di64;
  const Half<decltype(di64)> dh_i64;

  const auto zero = Zero(di64);
  const auto vmask = BitCast(di64, VecFromMask(d, mask));

  // vmask & -vmask isolates the lowest set bit within each i64 lane; vmask2
  // (built below) then keeps only the lowest i64 lane that has any set bit.
  // NOTE(review): the carry chain below depends on exact statement order.
  const auto vmask_eq_0 = VecFromMask(di64, vmask == zero);
  auto vmask2_lo = LowerHalf(dh_i64, vmask_eq_0);
  auto vmask2_hi = UpperHalf(dh_i64, vmask_eq_0);

  vmask2_lo = And(vmask2_lo, InterleaveLower(vmask2_lo, vmask2_lo));
  vmask2_hi = And(ConcatLowerUpper(dh_i64, vmask2_hi, vmask2_lo),
                  InterleaveUpper(dh_i64, vmask2_lo, vmask2_lo));
  vmask2_lo = InterleaveLower(Set(dh_i64, int64_t{-1}), vmask2_lo);

  const auto vmask2 = Combine(di64, vmask2_hi, vmask2_lo);
  const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask))));
  return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2))));
}

// Sets all lanes at and before the first set lane.
template <class T>
HWY_API Mask256<T> SetAtOrBeforeFirst(Mask256<T> mask) {
  const Full256<T> d;
  constexpr size_t kLanesPerBlock = MaxLanes(d) / 2;

  // Shift the mask up by one lane (zero-filling at the bottom), then apply
  // SetBeforeFirst so the original first set lane itself remains included.
  const auto vmask = VecFromMask(d, mask);
  const auto vmask_lo = ConcatLowerLower(d, vmask, Zero(d));
  return SetBeforeFirst(
      MaskFromVec(CombineShiftRightBytes<(kLanesPerBlock - 1) * sizeof(T)>(
          d, vmask, vmask_lo)));
}

// ------------------------------ WidenMulPairwiseAdd

// Per-half delegation to the 128-bit implementation.
template <class D32, typename T16, typename T32 = TFromD<D32>>
HWY_API Vec256<T32> WidenMulPairwiseAdd(D32 d32, Vec256<T16> a, Vec256<T16> b) {
  const Half<decltype(d32)> d32h;
  Vec256<T32> result;
  result.v0 = WidenMulPairwiseAdd(d32h, a.v0, b.v0);
  result.v1 = WidenMulPairwiseAdd(d32h, a.v1, b.v1);
  return result;
}

// ------------------------------ ReorderWidenMulAccumulate

// Per-half delegation; sum1 is updated in place via the output reference.
template <class D32, typename T16, typename T32 = TFromD<D32>>
HWY_API Vec256<T32> ReorderWidenMulAccumulate(D32 d32, Vec256<T16> a,
                                              Vec256<T16> b, Vec256<T32> sum0,
                                              Vec256<T32>& sum1) {
  const Half<decltype(d32)> d32h;
  sum0.v0 = ReorderWidenMulAccumulate(d32h, a.v0, b.v0, sum0.v0, sum1.v0);
  sum0.v1 = ReorderWidenMulAccumulate(d32h, a.v1, b.v1, sum0.v1, sum1.v1);
  return sum0;
}

// ------------------------------ RearrangeToOddPlusEven
template <typename TW>
HWY_API Vec256<TW> RearrangeToOddPlusEven(Vec256<TW> sum0, Vec256<TW> sum1) {
  sum0.v0 = RearrangeToOddPlusEven(sum0.v0, sum1.v0);
  sum0.v1 = RearrangeToOddPlusEven(sum0.v1, sum1.v1);
  return sum0;
}

// ------------------------------ Reductions in generic_ops

// ------------------------------ Lt128

// 128-bit-block comparisons and selects: each 128-bit half is handled by the
// corresponding 128-bit op; the halves are independent.

template <class D, typename T = TFromD<D>>
HWY_INLINE Mask256<T> Lt128(D d, Vec256<T> a, Vec256<T> b) {
  const Half<decltype(d)> dh;
  Mask256<T> ret;
  ret.m0 = Lt128(dh, a.v0, b.v0);
  ret.m1 = Lt128(dh, a.v1, b.v1);
  return ret;
}

template <class D, typename T = TFromD<D>>
HWY_INLINE Mask256<T> Lt128Upper(D d, Vec256<T> a, Vec256<T> b) {
  const Half<decltype(d)> dh;
  Mask256<T> ret;
  ret.m0 = Lt128Upper(dh, a.v0, b.v0);
  ret.m1 = Lt128Upper(dh, a.v1, b.v1);
  return ret;
}

template <class D, typename T = TFromD<D>>
HWY_INLINE Mask256<T> Eq128(D d, Vec256<T> a, Vec256<T> b) {
  const Half<decltype(d)> dh;
  Mask256<T> ret;
  ret.m0 = Eq128(dh, a.v0, b.v0);
  ret.m1 = Eq128(dh, a.v1, b.v1);
  return ret;
}

template <class D, typename T = TFromD<D>>
HWY_INLINE Mask256<T> Eq128Upper(D d, Vec256<T> a, Vec256<T> b) {
  const Half<decltype(d)> dh;
  Mask256<T> ret;
  ret.m0 = Eq128Upper(dh, a.v0, b.v0);
  ret.m1 = Eq128Upper(dh, a.v1, b.v1);
  return ret;
}

template <class D, typename T = TFromD<D>>
HWY_INLINE Mask256<T> Ne128(D d, Vec256<T> a, Vec256<T> b) {
  const Half<decltype(d)> dh;
  Mask256<T> ret;
  ret.m0 = Ne128(dh, a.v0, b.v0);
  ret.m1 = Ne128(dh, a.v1, b.v1);
  return ret;
}

template <class D, typename T = TFromD<D>>
HWY_INLINE Mask256<T> Ne128Upper(D d, Vec256<T> a, Vec256<T> b) {
  const Half<decltype(d)> dh;
  Mask256<T> ret;
  ret.m0 = Ne128Upper(dh, a.v0, b.v0);
  ret.m1 = Ne128Upper(dh, a.v1, b.v1);
  return ret;
}

template <class D, typename T = TFromD<D>>
HWY_INLINE Vec256<T> Min128(D d, Vec256<T> a, Vec256<T> b) {
  const Half<decltype(d)> dh;
  Vec256<T> ret;
  ret.v0 = Min128(dh, a.v0, b.v0);
  ret.v1 = Min128(dh, a.v1, b.v1);
  return ret;
}

template <class D, typename T = TFromD<D>>
HWY_INLINE Vec256<T> Max128(D d, Vec256<T> a, Vec256<T> b) {
  const Half<decltype(d)> dh;
  Vec256<T> ret;
  ret.v0 = Max128(dh, a.v0, b.v0);
  ret.v1 = Max128(dh, a.v1, b.v1);
  return ret;
}

template <class D, typename T = TFromD<D>>
HWY_INLINE Vec256<T> Min128Upper(D d, Vec256<T> a, Vec256<T> b) {
  const Half<decltype(d)> dh;
  Vec256<T> ret;
  ret.v0 = Min128Upper(dh, a.v0, b.v0);
  ret.v1 = Min128Upper(dh, a.v1, b.v1);
  return ret;
}

template <class D, typename T = TFromD<D>>
HWY_INLINE Vec256<T> Max128Upper(D d, Vec256<T> a, Vec256<T> b) {
  const Half<decltype(d)> dh;
  Vec256<T> ret;
  ret.v0 = Max128Upper(dh, a.v0, b.v0);
  ret.v1 = Max128Upper(dh, a.v1, b.v1);
  return ret;
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();