wasm_128-inl.h (233537B)
1 // Copyright 2019 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // 128-bit WASM vectors and operations. 17 // External include guard in highway.h - see comment there. 18 19 #include <wasm_simd128.h> 20 21 #include "hwy/base.h" 22 #include "hwy/ops/shared-inl.h" 23 24 #ifdef HWY_WASM_OLD_NAMES 25 #define wasm_i8x16_shuffle wasm_v8x16_shuffle 26 #define wasm_i16x8_shuffle wasm_v16x8_shuffle 27 #define wasm_i32x4_shuffle wasm_v32x4_shuffle 28 #define wasm_i64x2_shuffle wasm_v64x2_shuffle 29 #define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16 30 #define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8 31 #define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8 32 #define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16 33 #define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8 34 #define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8 35 #define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4 36 #define wasm_i62x2_trunc_sat_f64x2 wasm_i64x2_trunc_saturate_f64x2 37 #define wasm_u8x16_add_sat wasm_u8x16_add_saturate 38 #define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate 39 #define wasm_u16x8_add_sat wasm_u16x8_add_saturate 40 #define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate 41 #define wasm_i8x16_add_sat wasm_i8x16_add_saturate 42 #define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate 43 #define wasm_i16x8_add_sat 
wasm_i16x8_add_saturate 44 #define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate 45 #endif 46 47 HWY_BEFORE_NAMESPACE(); 48 namespace hwy { 49 namespace HWY_NAMESPACE { 50 51 #if HWY_TARGET == HWY_WASM_EMU256 52 template <typename T> 53 using Full256 = Simd<T, 32 / sizeof(T), 0>; 54 #endif 55 56 namespace detail { 57 58 template <typename T> 59 struct Raw128 { 60 using type = __v128_u; 61 }; 62 template <> 63 struct Raw128<float> { 64 using type = __f32x4; 65 }; 66 template <> 67 struct Raw128<double> { 68 using type = __f64x2; 69 }; 70 71 } // namespace detail 72 73 template <typename T, size_t N = 16 / sizeof(T)> 74 class Vec128 { 75 using Raw = typename detail::Raw128<T>::type; 76 77 public: 78 using PrivateT = T; // only for DFromV 79 static constexpr size_t kPrivateN = N; // only for DFromV 80 81 // Compound assignment. Only usable if there is a corresponding non-member 82 // binary operator overload. For example, only f32 and f64 support division. 83 HWY_INLINE Vec128& operator*=(const Vec128 other) { 84 return *this = (*this * other); 85 } 86 HWY_INLINE Vec128& operator/=(const Vec128 other) { 87 return *this = (*this / other); 88 } 89 HWY_INLINE Vec128& operator+=(const Vec128 other) { 90 return *this = (*this + other); 91 } 92 HWY_INLINE Vec128& operator-=(const Vec128 other) { 93 return *this = (*this - other); 94 } 95 HWY_INLINE Vec128& operator%=(const Vec128 other) { 96 return *this = (*this % other); 97 } 98 HWY_INLINE Vec128& operator&=(const Vec128 other) { 99 return *this = (*this & other); 100 } 101 HWY_INLINE Vec128& operator|=(const Vec128 other) { 102 return *this = (*this | other); 103 } 104 HWY_INLINE Vec128& operator^=(const Vec128 other) { 105 return *this = (*this ^ other); 106 } 107 108 Raw raw; 109 }; 110 111 template <typename T> 112 using Vec64 = Vec128<T, 8 / sizeof(T)>; 113 114 template <typename T> 115 using Vec32 = Vec128<T, 4 / sizeof(T)>; 116 117 template <typename T> 118 using Vec16 = Vec128<T, 2 / sizeof(T)>; 119 120 // FF..FF or 
0. 121 template <typename T, size_t N = 16 / sizeof(T)> 122 struct Mask128 { 123 using PrivateT = T; // only for DFromM 124 static constexpr size_t kPrivateN = N; // only for DFromM 125 126 typename detail::Raw128<T>::type raw; 127 }; 128 129 template <class V> 130 using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>; 131 132 template <class M> 133 using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>; 134 135 template <class V> 136 using TFromV = typename V::PrivateT; 137 138 // ------------------------------ Zero 139 140 // Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. 141 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_NOT_FLOAT3264_D(D)> 142 HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { 143 return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_i32x4_splat(0)}; 144 } 145 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 146 HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { 147 return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_f32x4_splat(0.0f)}; 148 } 149 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 150 HWY_API Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { 151 return Vec128<TFromD<D>, HWY_MAX_LANES_D(D)>{wasm_f64x2_splat(0.0)}; 152 } 153 154 template <class D> 155 using VFromD = decltype(Zero(D())); 156 157 // ------------------------------ BitCast 158 159 namespace detail { 160 161 HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; } 162 HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) { 163 return static_cast<__v128_u>(v); 164 } 165 HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) { 166 return static_cast<__v128_u>(v); 167 } 168 169 template <typename T, size_t N> 170 HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) { 171 return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)}; 172 } 173 174 // Cannot rely on function overloading because return types differ. 
// Functor returned type depends on T, hence a struct template rather than
// overloaded functions (see comment above).
template <typename T>
struct BitCastFromInteger128 {
  HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
  HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
};
template <>
struct BitCastFromInteger128<double> {
  HWY_INLINE __f64x2 operator()(__v128_u v) { return static_cast<__f64x2>(v); }
};

template <class D>
HWY_INLINE VFromD<D> BitCastFromByte(D d, Vec128<uint8_t, d.MaxBytes()> v) {
  return VFromD<D>{BitCastFromInteger128<TFromD<D>>()(v.raw)};
}

}  // namespace detail

// Reinterprets the bits of a same-sized vector as lanes of type TFromD<D>.
template <class D, typename FromT>
HWY_API VFromD<D> BitCast(D d,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}

// ------------------------------ ResizeBitCast

template <class D, typename FromV, HWY_IF_V_SIZE_LE_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Repartition<uint8_t, decltype(d)> du8_to;
  return BitCast(d, VFromD<decltype(du8_to)>{detail::BitCastToInteger(v.raw)});
}

// ------------------------------ Set

// Broadcasts t to all lanes. Unsigned lanes are splat via the signed
// intrinsic of the same width (bit pattern is identical).
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI16_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}

// Special floats (f16/bf16) are splat via their 16-bit representation.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_i16x8_splat(BitCastScalar<int16_t>(t))};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_f32x4_splat(t)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> Set(D /* tag */, TFromD<D> t) {
  return VFromD<D>{wasm_f64x2_splat(t)};
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// For all vector sizes.
template <class D>
HWY_API VFromD<D> Undefined(D d) {
  return Zero(d);
}

HWY_DIAGNOSTICS(pop)

// For all vector sizes.
template <class D, typename T = TFromD<D>, typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  // Fill an aligned buffer lane by lane, then load it.
  HWY_ALIGN T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    lanes[i] = AddWithWraparound(static_cast<T>(first), i);
  }
  return Load(d, lanes);
}

// ------------------------------ Dup128VecFromValues
template <class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  return VFromD<D>{wasm_i8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
                                   t11, t12, t13, t14, t15)};
}

template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
  return VFromD<D>{wasm_u8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
                                   t11, t12, t13, t14, t15)};
}

template <class D, HWY_IF_I16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  return VFromD<D>{wasm_i16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
}

template <class D, HWY_IF_U16_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  return VFromD<D>{wasm_u16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)};
}

// f16/bf16: delegate to the i16 overload via per-lane bit casts.
template <class D, HWY_IF_SPECIAL_FLOAT_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}

template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{wasm_i32x4_make(t0, t1, t2, t3)};
}

template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{wasm_u32x4_make(t0, t1, t2, t3)};
}

template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
  return VFromD<D>{wasm_f32x4_make(t0, t1, t2, t3)};
}

template <class D, HWY_IF_I64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return VFromD<D>{wasm_i64x2_make(t0, t1)};
}

template <class D, HWY_IF_U64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return VFromD<D>{wasm_u64x2_make(t0, t1)};
}

template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> t1) {
  return VFromD<D>{wasm_f64x2_make(t0, t1)};
}

// ================================================== ARITHMETIC

// ------------------------------ Addition

// Unsigned. Addition is the same for signed/unsigned, so the signed
// intrinsics are used for both.
template <size_t N>
HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_add(a.raw, b.raw)};
}

// ------------------------------ Subtraction

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
                                      Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
                                      const Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
                                     const Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_sub(a.raw, b.raw)};
}

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
}

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
}

// ------------------------------ Average

// Returns (a + b + 1) / 2

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
}

// Signed 8/16-bit: no native instruction; flip the sign bit to map onto the
// unsigned average, then flip it back.
template <class V, HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API V AverageRound(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  const V sign_bit = SignBit(d);
  return Xor(BitCast(d, AverageRound(BitCast(du, Xor(a, sign_bit)),
                                     BitCast(du, Xor(b, sign_bit)))),
             sign_bit);
}

// ------------------------------ Absolute value

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_abs(v.raw)};
}

// ------------------------------ Shift lanes by constant #bits

// Unsigned
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)};
}

// Signed
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)};
}

// 8-bit: no native instruction; shift as 16-bit lanes and mask off the bits
// shifted in from the neighboring byte.
template <int kBits, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}

template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
  const DFromV<decltype(v)> d8;
  // Use raw instead of BitCast to support N=1.
  const Vec128<uint8_t, N> shifted{
      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}

template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  // Unsigned shift, then restore the sign via (x ^ s) - s sign extension.
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");

  if (kBits == 0) return v;
  // HWY_MIN avoids an invalid shift amount when kBits == 0 (dead branch).
  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}

// ------------------------------ Shift lanes by same variable #bits

// After https://reviews.llvm.org/D108415 shift argument became unsigned.
681 HWY_DIAGNOSTICS(push) 682 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") 683 684 // Unsigned 685 template <size_t N> 686 HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v, 687 const int bits) { 688 return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)}; 689 } 690 template <size_t N> 691 HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v, 692 const int bits) { 693 return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)}; 694 } 695 template <size_t N> 696 HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v, 697 const int bits) { 698 return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)}; 699 } 700 template <size_t N> 701 HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v, 702 const int bits) { 703 return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)}; 704 } 705 template <size_t N> 706 HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v, 707 const int bits) { 708 return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)}; 709 } 710 template <size_t N> 711 HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v, 712 const int bits) { 713 return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)}; 714 } 715 716 // Signed 717 template <size_t N> 718 HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v, 719 const int bits) { 720 return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)}; 721 } 722 template <size_t N> 723 HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v, 724 const int bits) { 725 return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)}; 726 } 727 template <size_t N> 728 HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v, 729 const int bits) { 730 return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)}; 731 } 732 template <size_t N> 733 HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v, 734 const int bits) { 735 return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, 
bits)}; 736 } 737 template <size_t N> 738 HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v, 739 const int bits) { 740 return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)}; 741 } 742 template <size_t N> 743 HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v, 744 const int bits) { 745 return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)}; 746 } 747 748 // 8-bit 749 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 750 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) { 751 const DFromV<decltype(v)> d8; 752 // Use raw instead of BitCast to support N=1. 753 const Vec128<T, N> shifted{ 754 ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw}; 755 return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF)); 756 } 757 758 template <size_t N> 759 HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v, 760 const int bits) { 761 const DFromV<decltype(v)> d8; 762 // Use raw instead of BitCast to support N=1. 763 const Vec128<uint8_t, N> shifted{ 764 ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw}; 765 return shifted & Set(d8, 0xFF >> bits); 766 } 767 768 template <size_t N> 769 HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) { 770 const DFromV<decltype(v)> di; 771 const RebindToUnsigned<decltype(di)> du; 772 const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); 773 const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); 774 return (shifted ^ shifted_sign) - shifted_sign; 775 } 776 777 // ignore Wsign-conversion 778 HWY_DIAGNOSTICS(pop) 779 780 // ------------------------------ Minimum 781 782 // Unsigned 783 template <size_t N> 784 HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { 785 return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)}; 786 } 787 template <size_t N> 788 HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) { 789 return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)}; 790 
} 791 template <size_t N> 792 HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) { 793 return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)}; 794 } 795 template <size_t N> 796 HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) { 797 // Avoid wasm_u64x2_extract_lane - not all implementations have it yet. 798 const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)); 799 const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)); 800 const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)); 801 const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)); 802 alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)}; 803 return Vec128<uint64_t, N>{wasm_v128_load(min)}; 804 } 805 806 // Signed 807 template <size_t N> 808 HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { 809 return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)}; 810 } 811 template <size_t N> 812 HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { 813 return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)}; 814 } 815 template <size_t N> 816 HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) { 817 return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)}; 818 } 819 template <size_t N> 820 HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) { 821 alignas(16) int64_t min[4]; 822 min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0), 823 wasm_i64x2_extract_lane(b.raw, 0)); 824 min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1), 825 wasm_i64x2_extract_lane(b.raw, 1)); 826 return Vec128<int64_t, N>{wasm_v128_load(min)}; 827 } 828 829 // Float 830 template <size_t N> 831 HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) { 832 // Equivalent to a < b ? a : b (taking into account our swapped arg order, 833 // so that Min(NaN, x) is x to match x86). 
  return Vec128<float, N>{wasm_f32x4_pmin(b.raw, a.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Min(Vec128<double, N> a, Vec128<double, N> b) {
  // Equivalent to a < b ? a : b (taking into account our swapped arg order,
  // so that Min(NaN, x) is x to match x86).
  return Vec128<double, N>{wasm_f64x2_pmin(b.raw, a.raw)};
}

// ------------------------------ Maximum

// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
  // There is no u64x2 max instruction; compute per lane in scalar code.
  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
  // The signed extract returns the same bit pattern; the cast restores the
  // unsigned interpretation before comparing.
  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
  alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)};
  return Vec128<uint64_t, N>{wasm_v128_load(max)};
}

// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
  // There is no i64x2 max instruction; compute per lane in scalar code.
  alignas(16) int64_t max[2];
  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
                   wasm_i64x2_extract_lane(b.raw, 0));
  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
                   wasm_i64x2_extract_lane(b.raw, 1));
  return Vec128<int64_t, N>{wasm_v128_load(max)};
}

// Float
template <size_t N>
HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
  // Equivalent to b < a ? a : b (taking into account our swapped arg order,
  // so that Max(NaN, x) is x to match x86).
  return Vec128<float, N>{wasm_f32x4_pmax(b.raw, a.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Max(Vec128<double, N> a, Vec128<double, N> b) {
  // Equivalent to b < a ? a : b (taking into account our swapped arg order,
  // so that Max(NaN, x) is x to match x86).
  return Vec128<double, N>{wasm_f64x2_pmax(b.raw, a.raw)};
}

// ------------------------------ MinNumber and MaxNumber

#ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#else
#define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#endif

// Per-lane Min, except that NaN lanes of b are replaced by a first.
template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MinNumber(V a, V b) {
  return Min(a, IfThenElse(IsNaN(b), a, b));
}

// Per-lane Max, except that NaN lanes of b are replaced by a first.
template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MaxNumber(V a, V b) {
  return Max(a, IfThenElse(IsNaN(b), a, b));
}

// ------------------------------ Integer multiplication

// Unsigned. The low half of a product is identical for signed and unsigned
// inputs, so the signed multiply intrinsics are used for both.
template <size_t N>
HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}

// Returns the upper sizeof(T)*8 bits of a * b in each lane.
// Strategy for all MulHigh overloads: widening multiply of low/high halves,
// then a shuffle that gathers the odd (= upper) half of each wide product.
template <size_t N>
HWY_API Vec128<uint8_t, N> MulHigh(const Vec128<uint8_t, N> a,
                                   const Vec128<uint8_t, N> b) {
  const auto l = wasm_u16x8_extmul_low_u8x16(a.raw, b.raw);
  const auto h = wasm_u16x8_extmul_high_u8x16(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
                                               17, 19, 21, 23, 25, 27, 29, 31)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> MulHigh(const Vec128<int8_t, N> a,
                                  const Vec128<int8_t, N> b) {
  const auto l = wasm_i16x8_extmul_low_i8x16(a.raw, b.raw);
  const auto h = wasm_i16x8_extmul_high_i8x16(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<int8_t, N>{wasm_i8x16_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15,
                                              17, 19, 21, 23, 25, 27, 29, 31)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  const auto l = wasm_u32x4_extmul_low_u16x8(a.raw, b.raw);
  const auto h = wasm_u32x4_extmul_high_u16x8(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<uint16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  const auto l = wasm_i32x4_extmul_low_i16x8(a.raw, b.raw);
  const auto h = wasm_i32x4_extmul_high_i16x8(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<int16_t, N>{
      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> MulHigh(const Vec128<uint32_t, N> a,
                                    const Vec128<uint32_t, N> b) {
  const auto l = wasm_u64x2_extmul_low_u32x4(a.raw, b.raw);
  const auto h = wasm_u64x2_extmul_high_u32x4(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> MulHigh(const Vec128<int32_t, N> a,
                                   const Vec128<int32_t, N> b) {
  const auto l = wasm_i64x2_extmul_low_i32x4(a.raw, b.raw);
  const auto h = wasm_i64x2_extmul_high_i32x4(a.raw, b.raw);
  // TODO(eustas): shift-right + narrow?
  return Vec128<int32_t, N>{wasm_i32x4_shuffle(l, h, 1, 3, 5, 7)};
}

// Rounding, saturating Q15 multiply; maps directly to the native instruction.
template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{wasm_i16x8_q15mulr_sat(a.raw, b.raw)};
}

// Multiplies even lanes (0, 2 ..) and returns the double-width result.
template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_SIGNED(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(const Vec128<T, N> a,
                                                 const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  constexpr int kSrcBits = sizeof(T) * 8;

  // Sign-extend the even (lower) half of each wide lane via shift left+right.
  const auto ae =
      ShiftRight<kSrcBits>(ShiftLeft<kSrcBits>(ResizeBitCast(dw, a)));
  const auto be =
      ShiftRight<kSrcBits>(ShiftLeft<kSrcBits>(ResizeBitCast(dw, b)));
  return ae * be;
}
template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_UNSIGNED(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(const Vec128<T, N> a,
                                                 const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  // Zero the odd (upper) half of each wide lane, keeping only even lanes.
  const auto kEvenMask = Set(dw, LimitsMax<T>());

  const auto ae = And(ResizeBitCast(dw, a), kEvenMask);
  const auto be = And(ResizeBitCast(dw, b), kEvenMask);
  return ae * be;
}
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  // Sign-extend even i32 lanes into i64 lanes, then use the i64 multiply.
  const auto ae = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, a))).raw;
  const auto be = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, b))).raw;
  return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  // Keep only the even u32 lanes (low half of each u64 lane), then multiply.
  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
  const auto ae = wasm_v128_and(a.raw, kEvenMask);
  const auto be = wasm_v128_and(b.raw, kEvenMask);
  return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}

// Multiplies odd lanes (1, 3 ..) and returns the double-width result.
template <class T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(const Vec128<T, N> a,
                                                const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  constexpr int kSrcBits = sizeof(T) * 8;

  // ShiftRight extends the odd (upper) half: arithmetic for signed T,
  // logical for unsigned T.
  const auto ao = ShiftRight<kSrcBits>(BitCast(dw, a));
  const auto bo = ShiftRight<kSrcBits>(BitCast(dw, b));
  return ao * bo;
}
template <class T, size_t N, HWY_IF_UI32(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(const Vec128<T, N> a,
                                                const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;

  const auto ao = ShiftRight<32>(BitCast(dw, a));
  const auto bo = ShiftRight<32>(BitCast(dw, b));
  return Vec128<MakeWide<T>, (N + 1) / 2>{wasm_i64x2_mul(ao.raw, bo.raw)};
}

// ------------------------------ Negate

// Float: flip the sign bit.
template <typename T, size_t N, HWY_IF_FLOAT_OR_SPECIAL(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  return Xor(v, SignBit(DFromV<decltype(v)>()));
}

template <size_t N>
HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
  return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
  return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
}

// ------------------------------ Floating-point mul / div

template <size_t N>
HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator*(Vec128<double, N> a, Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_mul(a.raw, b.raw)};
}

template <size_t N>
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
                                    const Vec128<double, N> b) {
  return Vec128<double, N>{wasm_f64x2_div(a.raw, b.raw)};
}

// No approximation instruction available; compute the exact reciprocal.
template <class V, HWY_IF_F32(TFromV<V>)>
HWY_API V ApproximateReciprocal(const V v) {
  return Set(DFromV<decltype(v)>(), 1.0f) / v;
}

// Integer overload defined in generic_ops-inl.h.
// Returns |a - b| per lane.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AbsDiff(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Abs(a - b);
}

// ------------------------------ Floating-point multiply-add variants

// All four variants are emulated with separate multiply and add/sub, hence
// two roundings rather than the single rounding of a fused multiply-add.

// Returns mul * x + add.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return mul * x + add;
}

// Returns add - mul * x.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  return add - mul * x;
}

// Returns mul * x - sub.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> sub) {
  return mul * x - sub;
}

// Returns -mul * x - sub.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> sub) {
  return Neg(mul) * x - sub;
}

// ------------------------------ Floating-point square root

// Full precision square root
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_sqrt(v.raw)};
}

// Approximate reciprocal square root. No approximation instruction is
// available, so this computes the full-precision 1 / sqrt(v).
template <class V, HWY_IF_F32(TFromV<V>)>
HWY_API V ApproximateReciprocalSqrt(V v) {
  // TODO(eustas): find a cheaper way to calculate this.
  return Set(DFromV<decltype(v)>(), static_cast<TFromV<V>>(1.0)) / Sqrt(v);
}

// ------------------------------ Floating-point rounding

// Toward nearest integer, ties to even
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_nearest(v.raw)};
}

// Toward zero, aka truncate
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_trunc(v.raw)};
}

// Toward +infinity, aka ceiling
template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_ceil(v.raw)};
}

// Toward -infinity, aka floor
template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
  return Vec128<double, N>{wasm_f64x2_floor(v.raw)};
}

// ------------------------------ Floating-point classification

// NaN is the only value that compares unequal to itself.
template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  return v != v;
}

template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' (via Add(vu, vu)) to clear the sign bit, then check for
  // exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vu, vu), Set(du, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

// Reinterprets a mask as one for a same-sized lane type; free because masks
// are plain vectors here.
template <typename TFrom, size_t NFrom, class DTo>
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>{m.raw};
}

// True in each lane where all bits of `bit` are set in `v`.
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Unsigned. Equality is sign-agnostic, so the signed intrinsics are reused.
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_eq(a.raw, b.raw)};
}

// ------------------------------ Inequality

// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}

// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(const Vec128<int16_t, N> a,
                                       const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}

// Float
template <size_t N>
HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_ne(a.raw, b.raw)};
}

// ------------------------------ Strict inequality

template <size_t N>
HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a,
                                     const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a,
                                      const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a,
                                      const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
                                      const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<uint8_t, N> operator>(const Vec128<uint8_t, N> a,
                                      const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>(const Vec128<uint16_t, N> a,
                                       const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>(const Vec128<uint32_t, N> a,
                                       const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>(const Vec128<uint64_t, N> a,
                                       const Vec128<uint64_t, N> b) {
  // No u64x2 compare instruction; decompose into two u32 comparisons:
  // a > b iff hi(a) > hi(b), or hi(a) == hi(b) and lo(a) > lo(b).
  const DFromV<decltype(a)> d;
  const Repartition<uint32_t, decltype(d)> d32;
  const auto a32 = BitCast(d32, a);
  const auto b32 = BitCast(d32, b);
  // If the upper halves are not equal, this is the answer.
  const auto m_gt = a32 > b32;

  // Otherwise, the lower half decides.
  const auto m_eq = a32 == b32;
  // Broadcast each lane's lower-half result into its upper half.
  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
  const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi}));

  const auto gt = Or(lo_gt, m_gt);
  // Copy result in upper 32 bits to lower 32 bits.
  return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)};
}

template <size_t N>
HWY_API Mask128<float, N> operator>(const Vec128<float, N> a,
                                    const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>(const Vec128<double, N> a,
                                     const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_gt(a.raw, b.raw)};
}

// Derived from operator> with swapped arguments.
template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
  return operator>(b, a);
}

// ------------------------------ Weak inequality

// Float >=
template <size_t N>
HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
                                      const Vec128<double, N> b) {
  return Mask128<double, N>{wasm_f64x2_ge(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<int8_t, N> operator>=(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  return Mask128<int8_t, N>{wasm_i8x16_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>=(const Vec128<int16_t, N> a,
                                       const Vec128<int16_t, N> b) {
  return Mask128<int16_t, N>{wasm_i16x8_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>=(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  return Mask128<int32_t, N>{wasm_i32x4_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>=(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Mask128<int64_t, N>{wasm_i64x2_ge(a.raw, b.raw)};
}

template <size_t N>
HWY_API Mask128<uint8_t, N> operator>=(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  return Mask128<uint8_t, N>{wasm_u8x16_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>=(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  return Mask128<uint16_t, N>{wasm_u16x8_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>=(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  return Mask128<uint32_t, N>{wasm_u32x4_ge(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>=(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  // No u64x2 ge instruction; a >= b is the complement of b > a.
  return Not(b > a);
}

// Derived from operator>= with swapped arguments.
template <typename T, size_t N>
HWY_API Mask128<T, N> operator<=(const Vec128<T, N> a, const Vec128<T, N> b) {
  return operator>=(b, a);
}

// ------------------------------ FirstN (Iota, Lt)

// Mask with the first `num` lanes true, the rest false.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API MFromD<D> FirstN(D d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
  using TI = TFromD<decltype(di)>;
  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<TI>(num)));
}

// ================================================== LOGICAL

// ------------------------------ Not

template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
  return Vec128<T, N>{wasm_v128_not(v.raw)};
}

// ------------------------------ And

template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
}

// ------------------------------ AndNot

// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  // wasm_v128_andnot(x, y) computes x & ~y, hence the swapped arguments.
  return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}

// ------------------------------ Or

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
}

// ------------------------------ Xor

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
}

// ------------------------------ Xor3

// Returns x1 ^ x2 ^ x3.
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// ------------------------------ Or3

// Returns o1 | o2 | o3.
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

// Returns o | (a1 & a2).
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

// Like IfThenElse, but `mask` is a vector of all-ones/all-zeros lanes.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}

// ------------------------------ CopySign

// Returns a value with the magnitude of `magn` and the sign of `sign`.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}

// ------------------------------ CopySignToAbs

// Like CopySign, but `abs` is known to be non-negative, so a single OrAnd
// suffices (no need to clear abs' sign bit first).
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(abs)> d;
  return OrAnd(abs, SignBit(d), sign);
}

// ------------------------------ BroadcastSignBit (compare)

// Replicates the sign bit into every bit of each lane.
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
template <size_t N>
HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
  // 8-bit lanes: a compare against zero is used instead of an arithmetic
  // shift.
  const DFromV<decltype(v)> d;
  return VecFromMask(d, v < Zero(d));
}

// ------------------------------ Mask

// Masks are plain vectors (true = FF..FF), so this is a no-op rewrap.
template <class D>
HWY_API VFromD<D> VecFromMask(D /* tag */, MFromD<D> v) {
  return VFromD<D>{v.raw};
}

// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}

// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}

// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}

// Per-lane: v < 0 ? yes : no.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;

  // Turn the sign bit into a full-lane mask.
  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
  return IfThenElse(MaskFromVec(v), yes, no);
}

// ------------------------------ Mask logical

// Mask logic is implemented via the vector ops on the underlying all-ones /
// all-zeros lanes.

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  const DFromM<decltype(m)> d;
  return MaskFromVec(Not(VecFromMask(d, m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// Returns ~a & ~b (true where neither mask is set).
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ------------------------------ Shl (BroadcastSignBit, IfThenElse)

// The x86 multiply-by-Pow2() trick will not work because WASM saturates
// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
// scalar count operand, per-lane shift instructions would require extract_lane
// for each lane, and hoping that shuffle is correctly mapped to a native
// instruction. Using non-vector shifts would incur a store-load forwarding
// stall when loading the result vector. We instead test bits of the shift
// count to "predicate" a shift of the entire vector by a constant.

// Per-lane variable left shift, 8-bit lanes: examines the 3 valid bits of each
// lane's shift count, most-significant first, and conditionally applies
// constant whole-vector shifts of 4, 2 and 1.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<5>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

// 16-bit lanes: 4 valid count bits, hence four predicated steps (8, 4, 2, 1).
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

// 32-bit lanes: 5 valid count bits (16, 8, 4, 2, 1).
template <typename T, size_t N, HWY_IF_UI32(T)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftLeft<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftLeft<1>(v), v);
}

// 64-bit lanes: only two lanes, so a scalar round-trip through memory is
// cheaper than the predicated-vector approach. Counts are masked to [0, 63];
// shifting the unsigned representation avoids UB for signed T.
template <typename T, size_t N, HWY_IF_UI64(T)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = MakeUnsigned<T>;
  alignas(16) TU lanes[2] = {};
  alignas(16) TU bits_lanes[2] = {};
  Store(BitCast(du, v), du, lanes);
  Store(BitCast(du, bits), du, bits_lanes);
  lanes[0] <<= (bits_lanes[0] & 63);
  lanes[1] <<= (bits_lanes[1] & 63);
  return BitCast(d, Load(du, lanes));
}

// ------------------------------ Shr (BroadcastSignBit, IfThenElse)

// Per-lane variable right shift; same bit-testing scheme as operator<< above.
// ShiftRight is arithmetic for signed T and logical for unsigned T.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<5>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<12>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

template <typename T, size_t N, HWY_IF_UI32(T)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  Mask128<T, N> mask;
  // Need a signed type for BroadcastSignBit.
  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
  // Move the highest valid bit of the shift count into the sign bit.
  test = ShiftLeft<27>(test);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<16>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<8>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<4>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  test = ShiftLeft<1>(test);  // next bit (descending order)
  v = IfThenElse(mask, ShiftRight<2>(v), v);

  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
  return IfThenElse(mask, ShiftRight<1>(v), v);
}

// 64-bit lanes: scalar fallback. Shifting in T (not TU) preserves the
// arithmetic shift for signed T and the logical shift for unsigned T.
template <typename T, size_t N, HWY_IF_UI64(T)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2] = {};
  alignas(16) T bits_lanes[2] = {};
  Store(v, d, lanes);
  Store(bits, d, bits_lanes);
  lanes[0] >>= (bits_lanes[0] & 63);
  lanes[1] >>= (bits_lanes[1] & 63);
  return Load(d, lanes);
}

// ================================================== MEMORY

// ------------------------------ Load

// Full 128-bit load. WASM v128.load has no alignment requirement, so this is
// the same instruction as an unaligned load.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec128<T>{wasm_v128_load(aligned)};
}

// Partial vector: copy only the valid bytes; upper bytes of v are unspecified.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
  VFromD<D> v;
  CopyBytes<d.MaxBytes()>(p, &v);
  return v;
}

// LoadU == Load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

// Loads the full vector, then zeroes lanes where m is false. Note: reads all
// MaxBytes() regardless of the mask.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

// As MaskedLoad, but lanes where m is false come from v instead of zero.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const T* HWY_RESTRICT aligned) {
  return IfThenElse(m, Load(d, aligned), v);
}

// ------------------------------ Store

namespace detail {

// Compile-time-lane ExtractLane helpers; kLane must be a constant because the
// wasm *_extract_lane intrinsics require an immediate lane index.
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  return static_cast<T>(wasm_i8x16_extract_lane(v.raw, kLane));
}
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2),
          HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  const int16_t lane = wasm_i16x8_extract_lane(v.raw, kLane);
  return static_cast<T>(lane);
}
// float16/bfloat16: extract the 16-bit pattern as unsigned, then bit-cast to
// T - a value cast would alter the bits.
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2),
          HWY_IF_SPECIAL_FLOAT(T)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  const uint16_t bits = ExtractLane<kLane>(BitCast(du, v));
  return BitCastScalar<T>(bits);
}
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  return static_cast<T>(wasm_i32x4_extract_lane(v.raw, kLane));
}
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
  return static_cast<T>(wasm_i64x2_extract_lane(v.raw, kLane));
}

template <size_t kLane, size_t N>
HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
  return wasm_f32x4_extract_lane(v.raw, kLane);
}
template <size_t kLane, size_t N>
HWY_INLINE double ExtractLane(const Vec128<double, N> v) {
  return wasm_f64x2_extract_lane(v.raw, kLane);
}

}  // namespace detail

// Full 128-bit store; v128.store has no alignment requirement.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// Partial (2+ lanes): copy only the valid bytes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  CopyBytes<d.MaxBytes()>(&v, p);
}

// Single lane: extract_lane avoids the memcpy.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
  *p = detail::ExtractLane<0>(v);
}

// StoreU == Store.
template <class D>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}

// Read-modify-write: not atomic, and touches all bytes even where m is false.
template <class D>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
}

// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.

// "Streaming" (non-temporal) store: WASM has no cache-bypass hint, so this is
// an ordinary v128.store.
template <class D>
HWY_API void Stream(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
  wasm_v128_store(aligned, v.raw);
}

// ------------------------------ Scatter in generic_ops-inl.h
// ------------------------------ Gather in generic_ops-inl.h

// ================================================== SWIZZLE

// ------------------------------ ExtractLane

// One overload per vector length just in case *_extract_lane raise compile
// errors if their argument is out of bounds (even if that would never be
// reached at runtime).
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return detail::ExtractLane<0>(v);
}

// For each N below: if the compiler can prove i is a constant, dispatch to the
// immediate-lane intrinsic via a switch; otherwise fall back to a store and
// scalar load (store-load forwarding).
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
    }
  }
#endif
  alignas(16) T lanes[2];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
    }
  }
#endif
  alignas(16) T lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
      case 4:
        return detail::ExtractLane<4>(v);
      case 5:
        return detail::ExtractLane<5>(v);
      case 6:
        return detail::ExtractLane<6>(v);
      case 7:
        return detail::ExtractLane<7>(v);
    }
  }
#endif
  alignas(16) T lanes[8];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

template <typename T>
HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::ExtractLane<0>(v);
      case 1:
        return detail::ExtractLane<1>(v);
      case 2:
        return detail::ExtractLane<2>(v);
      case 3:
        return detail::ExtractLane<3>(v);
      case 4:
        return detail::ExtractLane<4>(v);
      case 5:
        return detail::ExtractLane<5>(v);
      case 6:
        return detail::ExtractLane<6>(v);
      case 7:
        return detail::ExtractLane<7>(v);
      case 8:
        return detail::ExtractLane<8>(v);
      case 9:
        return detail::ExtractLane<9>(v);
      case 10:
        return detail::ExtractLane<10>(v);
      case 11:
        return detail::ExtractLane<11>(v);
      case 12:
        return detail::ExtractLane<12>(v);
      case 13:
        return detail::ExtractLane<13>(v);
      case 14:
        return detail::ExtractLane<14>(v);
      case 15:
        return detail::ExtractLane<15>(v);
    }
  }
#endif
  alignas(16) T lanes[16];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

// ------------------------------ GetLane
// Returns lane 0.
template <typename T, size_t N>
HWY_API T GetLane(const Vec128<T, N> v) {
  return detail::ExtractLane<0>(v);
}

// ------------------------------ InsertLane

namespace detail {

// Compile-time-lane InsertLane helpers; kLane must be a constant because the
// wasm *_replace_lane intrinsics require an immediate lane index.
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{
      wasm_i8x16_replace_lane(v.raw, kLane, static_cast<int8_t>(t))};
}

// BitCastScalar (not static_cast) so the bit pattern of float16/bfloat16 T is
// preserved.
template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{
      wasm_i16x8_replace_lane(v.raw, kLane, BitCastScalar<int16_t>(t))};
}

template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{
      wasm_i32x4_replace_lane(v.raw, kLane, static_cast<int32_t>(t))};
}

template <size_t kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<T, N>{
      wasm_i64x2_replace_lane(v.raw, kLane, static_cast<int64_t>(t))};
}

template <size_t kLane, size_t N>
HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
  static_assert(kLane < N, "Lane index out of bounds");
  return Vec128<float, N>{wasm_f32x4_replace_lane(v.raw, kLane, t)};
}

template <size_t kLane, size_t N>
HWY_INLINE Vec128<double, N> InsertLane(const Vec128<double, N> v, double t) {
  static_assert(kLane < 2, "Lane index out of bounds");
  return Vec128<double, N>{wasm_f64x2_replace_lane(v.raw, kLane, t)};
}

}  // namespace detail

// Requires one overload per vector length because InsertLane<3> may be a
// compile error if it calls wasm_f64x2_replace_lane.

// Single lane: inserting at index 0 is just a splat.
template <typename T>
HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
  HWY_DASSERT(i == 0);
  (void)i;
  return Set(DFromV<decltype(v)>(), t);
}

// For each N below: if i is provably constant, use the immediate-lane
// intrinsic; otherwise store, overwrite one lane in memory, and reload.
template <typename T>
HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[4];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[8];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

template <typename T>
HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
      case 8:
        return detail::InsertLane<8>(v, t);
      case 9:
        return detail::InsertLane<9>(v, t);
      case 10:
        return detail::InsertLane<10>(v, t);
      case 11:
        return detail::InsertLane<11>(v, t);
      case 12:
        return detail::InsertLane<12>(v, t);
      case 13:
        return detail::InsertLane<13>(v, t);
      case 14:
        return detail::InsertLane<14>(v, t);
      case 15:
        return detail::InsertLane<15>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[16];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

// ------------------------------ LowerHalf

// The lower half shares the same raw register; only the type changes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LowerHalf(D /* tag */, VFromD<Twice<D>> v) {
  return VFromD<D>{v.raw};
}
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  return Vec128<T, N / 2>{v.raw};
}

// ------------------------------ ShiftLeftBytes

// 0x01..0F, kBytes = 1 => 0x02..0F00
// Shuffle index 16 selects a byte from `zero`, i.e. shifts in zero bytes from
// the least-significant end. kBytes must be a constant, hence the switch over
// hard-coded shuffle patterns.
template <int kBytes, class D>
HWY_API VFromD<D> ShiftLeftBytes(D /* tag */, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);
  switch (kBytes) {
    case 0:
      return v;

    case 1:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
                                          7, 8, 9, 10, 11, 12, 13, 14)};

    case 2:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
                                          6, 7, 8, 9, 10, 11, 12, 13)};

    case 3:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
                                          4, 5, 6, 7, 8, 9, 10, 11, 12)};

    case 4:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
                                          3, 4, 5, 6, 7, 8, 9, 10, 11)};

    case 5:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
                                          2, 3, 4, 5, 6, 7, 8, 9, 10)};

    case 6:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};

    case 7:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};

    case 8:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};

    case 9:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};

    case 10:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};

    case 11:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};

    case 12:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};

    case 13:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};

    case 14:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 16, 0,
                                          1)};

    case 15:
      return VFromD<D>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16, 16, 16, 16,
                                          0)};
  }
  // kBytes == 16: everything shifted out.
  return VFromD<D>{zero};
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftLeftLanes

// Shifts whole lanes toward the most-significant end by reinterpreting the
// vector as bytes.
template <int kLanes, class D>
HWY_API VFromD<D> ShiftLeftLanes(D d, const VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  return BitCast(d, ShiftLeftBytes<kBytes>(BitCast(d8, v)));
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// ------------------------------ ShiftRightBytes
namespace detail {

// Helper function allows zeroing invalid lanes in caller.
// Byte-shift toward the least-significant end; shuffle index 16 selects a
// byte from `zero`, shifting zeros in at the top. Upper (invalid) lanes of
// partial vectors are NOT cleared here - see ShiftRightBytes below.
template <int kBytes, typename T, size_t N>
HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  const __i8x16 zero = wasm_i8x16_splat(0);

  switch (kBytes) {
    case 0:
      return v.raw;

    case 1:
      return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                12, 13, 14, 15, 16);

    case 2:
      return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16);

    case 3:
      return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                13, 14, 15, 16, 16, 16);

    case 4:
      return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                14, 15, 16, 16, 16, 16);

    case 5:
      return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                15, 16, 16, 16, 16, 16);

    case 6:
      return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16);

    case 7:
      return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                16, 16, 16, 16, 16, 16, 16);

    case 8:
      return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 9:
      return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 10:
      return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 11:
      return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 12:
      return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 13:
      return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 14:
      return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);

    case 15:
      return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
                                16, 16, 16, 16, 16, 16, 16);
    case 16:
      return zero;
  }
  // Unreachable: static_assert restricts kBytes to [0, 16], all handled above.
}

}  // namespace detail

// 0x01..0F, kBytes = 1 => 0x0001..0E
template <int kBytes, class D>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  // For partial vectors, clear upper lanes so we shift in zeros.
  if (d.MaxBytes() != 16) {
    const Full128<TFromD<D>> dfull;
    const VFromD<decltype(dfull)> vfull{v.raw};
    v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
  }
  return VFromD<D>{detail::ShrBytes<kBytes>(v)};
}

// ------------------------------ ShiftRightLanes
// Shifts whole lanes toward the least-significant end via the byte shift.
template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, const VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
}

// ------------------------------ UpperHalf (ShiftRightBytes)

// Full vector: move the upper 64 bits into the lower half.
template <class D, typename T = TFromD<D>>
HWY_API Vec64<T> UpperHalf(D /* tag */, const Vec128<T> v) {
  return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}

// Partial
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
  return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
}

// ------------------------------ CombineShiftRightBytes

// Returns the 16-byte window starting kBytes into the concatenation hi:lo.
// Shuffle indices 0..15 select from lo, 16..31 from hi.
template <int kBytes, class D, typename T = TFromD<D>>
HWY_API Vec128<T> CombineShiftRightBytes(D /* tag */, Vec128<T> hi,
                                         Vec128<T> lo) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  switch (kBytes) {
    case 0:
      return lo;

    case 1:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7,
                                          8, 9, 10, 11, 12, 13, 14, 15, 16)};

    case 2:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8,
                                          9, 10, 11, 12, 13, 14, 15, 16, 17)};

    case 3:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9,
                                          10, 11, 12, 13, 14, 15, 16, 17, 18)};

    case 4:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10,
                                          11, 12, 13, 14, 15, 16, 17, 18, 19)};

    case 5:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11,
                                          12, 13, 14, 15, 16, 17, 18, 19, 20)};

    case 6:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11,
                                          12, 13, 14, 15, 16, 17, 18, 19, 20,
                                          21)};

    case 7:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12,
                                          13, 14, 15, 16, 17, 18, 19, 20, 21,
                                          22)};

    case 8:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13,
                                          14, 15, 16, 17, 18, 19, 20, 21, 22,
                                          23)};

    case 9:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14,
                                          15, 16, 17, 18, 19, 20, 21, 22, 23,
                                          24)};

    case 10:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14,
                                          15, 16, 17, 18, 19, 20, 21, 22, 23,
                                          24, 25)};

    case 11:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15,
                                          16, 17, 18, 19, 20, 21, 22, 23, 24,
                                          25, 26)};

    case 12:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16,
                                          17, 18, 19, 20, 21, 22, 23, 24, 25,
                                          26, 27)};

    case 13:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17,
                                          18, 19, 20, 21, 22, 23, 24, 25, 26,
                                          27, 28)};

    case 14:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18,
                                          19, 20, 21, 22, 23, 24, 25, 26, 27,
                                          28, 29)};

    case 15:
      return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28,
                                          29, 30)};
  }
  // kBytes == 16: the window is exactly hi.
  return hi;
}

// Partial vectors: pack lo into the most-significant bytes of a full vector so
// the full-vector implementation above can be reused.
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = Vec128<uint8_t>;
  const DFromV<V8> dfull8;
  const Repartition<TFromD<D>, decltype(dfull8)> dfull;
  const V8 hi8{BitCast(d8, hi).raw};
  // Move into most-significant bytes
  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8);
  return VFromD<D>{BitCast(dfull, r).raw};
}

// ------------------------------ Broadcast/splat any lane

// Replicates lane kLane into every lane, via a same-source shuffle.
template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{wasm_i8x16_shuffle(
      v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane,
      kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
}

template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
                                         kLane, kLane, kLane, kLane, kLane)};
}

template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{
      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}

template <int kLane, typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)};
}

// ------------------------------ TableLookupBytes

// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
// lane indices in [0, 16).
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
                                        const Vec128<TI, NI> from) {
  return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
}

// As TableLookupBytes, but indices with the MSB set yield zero. (wasm swizzle
// already zeroes out-of-range indices; this makes that contract explicit by
// masking with the sign bit of `from`.)
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
                                           const Vec128<TI, NI> from) {
  const DFromV<decltype(from)> d;
  // Mask size must match vector type, so cast everything to this type.
  Repartition<int8_t, decltype(d)> di8;
  Repartition<int8_t, DFromV<decltype(bytes)>> d_bytes8;
  const auto msb = BitCast(di8, from) < Zero(di8);
  const auto lookup =
      TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
  return BitCast(d, IfThenZeroElse(msb, lookup));
}

// ------------------------------ Hard-coded shuffles

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 32-bit halves in 64-bit halves.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}

// These are used by generic_ops-inl to implement LoadInterleaved3.
// Each ShuffleTwoXXXX picks two lanes from a and two from b (+16/+8/+4 offsets
// select from b); the remaining output lanes are don't-care (max index).
namespace detail {

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a,
                                    const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16,
                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a,
                                    const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8,
                                         0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> ShuffleTwo2301(const Vec128<T, N> a,
                                    const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a,
                                    const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16,
                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a,
                                    const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8,
                                         0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> ShuffleTwo1230(const Vec128<T, N> a,
                                    const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)};
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a,
                                    const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16,
                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a,
                                    const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8,
                                         0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> ShuffleTwo3012(const Vec128<T, N> a,
                                    const Vec128<T, N> b) {
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)};
}

}  // namespace detail

// Swap 64-bit halves
template <typename T>
HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
  static_assert(sizeof(T) == 8, "Only for 64-bit lanes");
  return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}
// Swap the two 64-bit halves of a vector of 32-bit lanes (lane order 1,0,3,2).
template <typename T>
HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}

// Rotate right 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}

// Rotate left 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}

// Reverse
template <typename T>
HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N = 16 / sizeof(T)>
struct Indices128 {
  // Raw byte-index vector consumed by TableLookupBytes (for T wider than one
  // byte, IndicesFromVec pre-expands lane indices into byte indices).
  __v128_u raw;
};

namespace detail {

// Byte-shuffle tables that replicate each lane's index byte across all bytes
// of that lane, per lane size.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Iota(d8, 0);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
  return Load(d8, kBroadcastLaneBytes);
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  return Load(d8, kBroadcastLaneBytes);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
  return Load(d8, kBroadcastLaneBytes);
}

// Per-lane byte offsets (0..sizeof(T)-1 within each lane), added to the
// scaled lane indices to form full byte indices.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Zero(d8);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
  return Load(d8, kByteOffsets);
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  return Load(d8, kByteOffsets);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
  return Load(d8, kByteOffsets);
}

}  // namespace detail

// 1-byte lanes: lane indices are already byte indices; store them directly.
template <class D, typename TI, HWY_IF_V_SIZE_LE_D(D, 16),
          HWY_IF_T_SIZE_D(D, 1)>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  // Indices up to 2*MaxLanes are allowed (supports two-table lookups).
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  (void)d;
  return Indices128<TFromD<D>, MaxLanes(D())>{vec.raw};
}

// Wider lanes: expand lane indices into per-byte indices for TableLookupBytes.
template <class D, typename TI, HWY_IF_V_SIZE_LE_D(D, 16),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = VFromD<decltype(d8)>;

  // Broadcast each lane index to all bytes of T and shift to bytes
  const V8 lane_indices = TableLookupBytes(
      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
  const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, sum).raw};
}

// Loads indices from memory and converts them via IndicesFromVec.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename TI>
HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
    D d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}

// Permutes lanes of v according to idx (pre-expanded byte indices).
template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
  using TI = MakeSigned<T>;
  const DFromV<decltype(v)> d;
  const Rebind<TI, decltype(d)> di;
  return BitCast(d, TableLookupBytes(BitCast(di,
                                             v), Vec128<TI, N>{idx.raw}));
}

// Partial vectors: widen to twice the size so table and indices match.
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
                                          Indices128<T, N> idx) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
// TableLookupLanes currently requires table and index vectors to be the same
// size, though a half-length index vector would be sufficient here.
#if HWY_IS_MSAN
  const Vec128<T, N> idx_vec{idx.raw};
  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
#else
  // We only keep LowerHalf of the result, which is valid in idx.
  const Indices128<T, N * 2> idx2{idx.raw};
#endif
  return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2));
}

// Full vectors: select from a where the index is < 16 bytes, else from b.
template <typename T>
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
                                       Indices128<T> idx) {
  const DFromV<decltype(a)> d;
  const Repartition<uint8_t, decltype(d)> du8;

  const VFromD<decltype(du8)> byte_idx{idx.raw};
  const auto byte_idx_mod = byte_idx & Set(du8, uint8_t{0x0F});
  // If ANDing did not change the index, it is for the lower half.
  const auto is_lo = (byte_idx == byte_idx_mod);

  return BitCast(d, IfThenElse(is_lo, TableLookupBytes(a, byte_idx_mod),
                               TableLookupBytes(b, byte_idx_mod)));
}

// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)

// Single lane: no change
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) {
  return v;
}

// 32-bit x2: shuffle
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec64<T> Reverse(D /* tag */, const Vec64<T> v) {
  return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw};
}

// 64-bit x2: shuffle
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Reverse(D /* tag */, const Vec128<T> v) {
  return Shuffle01(v);
}

// 32-bit x4: shuffle
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Reverse(D /* tag */, const Vec128<T> v) {
  return Shuffle0123(v);
}

// 16-bit: reverse 32-bit pairs, then swap the 16-bit halves of each pair.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
}

template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> Reverse(D d, const VFromD<D> v) {
  static constexpr int kN = 16 + Lanes(d);
  return VFromD<D>{wasm_i8x16_shuffle(
      v.raw, v.raw,
      // kN is adjusted to ensure we have valid indices for all lengths.
      kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, kN - 9,
      kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16)};
}

// ------------------------------ Reverse2

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> dw;
  return BitCast(d, RotateRight<16>(BitCast(dw, v)));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse2(D /* tag */, const VFromD<D> v) {
  return Shuffle2301(v);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse2(D /* tag */, const VFromD<D> v) {
  return Shuffle01(v);
}

// ------------------------------ Reverse4

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
  return VFromD<D>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2, 1, 0, 7, 6, 5, 4)};
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D> v) {
  return Shuffle0123(v);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse4(D /* tag */, const VFromD<D>) {
  HWY_ASSERT(0);  // don't have 8 u64 lanes
}

// ------------------------------ Reverse8

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
  return Reverse(d, v);
}

template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> Reverse8(D /* tag */, const VFromD<D>) {
  HWY_ASSERT(0);  // don't have 8 lanes for > 16-bit lanes
}

// ------------------------------ InterleaveLower

template <size_t N>
HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a,
                                           Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(
      a.raw, b.raw, 0,
      16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a,
                                            Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a,
                                            Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a,
                                            Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}

template <size_t N>
HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a,
                                          Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_shuffle(
      a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{
      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a,
                                           Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a,
                                           Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}

template <size_t N>
HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
                                         Vec128<float, N> b) {
  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}

template <size_t N>
HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
                                          Vec128<double, N> b) {
  return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}

// Special floats (f16/bf16): interleave via the same-size unsigned type.
template <class T, size_t N, HWY_IF_T_SIZE(T, 2), HWY_IF_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b)));
}

// Additional overload for the optional tag (all vector lengths).
template <class D>
HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}

// ------------------------------ InterleaveUpper (UpperHalf)

// All functions inside detail lack the required D parameter.
namespace detail {

template <size_t N>
HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a,
                                           Vec128<uint8_t, N> b) {
  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
                                               26, 11, 27, 12, 28, 13, 29, 14,
                                               30, 15, 31)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a,
                                            Vec128<uint16_t, N> b) {
  return Vec128<uint16_t, N>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a,
                                            Vec128<uint32_t, N> b) {
  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a,
                                            Vec128<uint64_t, N> b) {
  return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}

template <size_t N>
HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a,
                                          Vec128<int8_t, N> b) {
  return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
                                              26, 11, 27, 12, 28, 13, 29, 14,
                                              30, 15, 31)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a,
                                           Vec128<int32_t, N> b) {
  return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
                                           Vec128<int64_t, N> b) {
  return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}

template <size_t N>
HWY_API Vec128<float16_t, N> InterleaveUpper(Vec128<float16_t, N> a,
                                             Vec128<float16_t, N> b) {
  return Vec128<float16_t, N>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
template <size_t N>
HWY_API Vec128<bfloat16_t, N> InterleaveUpper(Vec128<bfloat16_t, N> a,
                                              Vec128<bfloat16_t, N> b) {
  return Vec128<bfloat16_t, N>{
      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}

template <size_t N>
HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
                                         Vec128<float, N> b) {
  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}

template <size_t N>
HWY_API Vec128<double, N> InterleaveUpper(Vec128<double, N> a,
                                          Vec128<double, N> b) {
  return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}

}  // namespace detail

// Full
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> InterleaveUpper(D /* tag */, Vec128<T> a, Vec128<T> b) {
  return detail::InterleaveUpper(a, b);
}

// Partial: interleave the upper halves via InterleaveLower on half vectors.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const Half<decltype(d)> d2;
  return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
                         VFromD<D>{UpperHalf(d2,
                                               b).raw});
}

// ------------------------------ ZipLower/ZipUpper (InterleaveLower)

// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> ZipLower(V a, V b) {
  return BitCast(DW(), InterleaveLower(a, b));
}
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
  return BitCast(dw, InterleaveLower(D(), a, b));
}

template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
}

// ------------------------------ Per4LaneBlockShuffle
namespace detail {

// Applies the 2-bit-per-lane permutation kIdx3210 within each group of four
// lanes; specialized per lane size below.
template <size_t kIdx3210, size_t kVectSize, class V,
          HWY_IF_LANES_LE(kVectSize, 16)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<1> /*lane_size_tag*/,
                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
                                  V v) {
  constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
  constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
  constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
  constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
  return V{wasm_i8x16_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3,
                              kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4,
                              kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8,
                              kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)};
}

template <size_t kIdx3210, size_t kVectSize, class V,
          HWY_IF_LANES_LE(kVectSize, 16)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<2> /*lane_size_tag*/,
                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
                                  V v) {
  constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
  constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
  constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
  constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
  return V{wasm_i16x8_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3,
                              kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)};
}

template <size_t kIdx3210, size_t kVectSize, class V,
          HWY_IF_LANES_LE(kVectSize, 16)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<4> /*lane_size_tag*/,
                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
                                  V v) {
  constexpr int kIdx3 = static_cast<int>((kIdx3210 >> 6) & 3);
  constexpr int kIdx2 = static_cast<int>((kIdx3210 >> 4) & 3);
  constexpr int kIdx1 = static_cast<int>((kIdx3210 >> 2) & 3);
  constexpr int kIdx0 = static_cast<int>(kIdx3210 & 3);
  return V{wasm_i32x4_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3)};
}

}  // namespace detail

// ------------------------------ SlideUpLanes

namespace detail {

// <= 8 bytes: a single 64-bit shift moves all lanes up at once.
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE V SlideUpLanes(V v, size_t amt) {
  const DFromV<decltype(v)> d;
  const Full64<uint64_t> du64;
  const auto vu64 = ResizeBitCast(du64, v);
  return ResizeBitCast(
      d, ShiftLeftSame(vu64, static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
}

// 16 bytes: byte-table lookup; indices that wrap below zero yield 0
// via TableLookupBytesOr0 (MSB set).
template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE V SlideUpLanes(V v, size_t amt) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  const auto idx =
      Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));
  return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx));
}

}  // namespace detail

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
  return v;
}

template <class
          D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
// When amt is a compile-time constant, dispatch to the cheaper
// immediate-shift path instead of the generic table lookup.
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftLeftLanes<1>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideUpLanes(v, amt);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftLeftLanes<1>(d, v);
      case 2:
        return ShiftLeftLanes<2>(d, v);
      case 3:
        return ShiftLeftLanes<3>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideUpLanes(v, amt);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftLeftLanes<1>(d, v);
      case 2:
        return ShiftLeftLanes<2>(d, v);
      case 3:
        return ShiftLeftLanes<3>(d, v);
      case 4:
        return ShiftLeftLanes<4>(d, v);
      case 5:
        return ShiftLeftLanes<5>(d, v);
      case 6:
        return ShiftLeftLanes<6>(d, v);
      case 7:
        return ShiftLeftLanes<7>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideUpLanes(v, amt);
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftLeftLanes<1>(d, v);
      case 2:
        return ShiftLeftLanes<2>(d, v);
      case 3:
        return ShiftLeftLanes<3>(d, v);
      case 4:
        return ShiftLeftLanes<4>(d, v);
      case 5:
        return ShiftLeftLanes<5>(d, v);
      case 6:
        return ShiftLeftLanes<6>(d, v);
      case 7:
        return ShiftLeftLanes<7>(d, v);
      case 8:
        return ShiftLeftLanes<8>(d, v);
      case 9:
        return ShiftLeftLanes<9>(d, v);
      case 10:
        return ShiftLeftLanes<10>(d, v);
      case 11:
        return ShiftLeftLanes<11>(d, v);
      case 12:
        return ShiftLeftLanes<12>(d, v);
      case 13:
        return ShiftLeftLanes<13>(d, v);
      case 14:
        return ShiftLeftLanes<14>(d, v);
      case 15:
        return ShiftLeftLanes<15>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideUpLanes(v, amt);
}

// ------------------------------ SlideDownLanes

namespace detail {

// <= 8 bytes: one full-width right shift moves all lanes down at once.
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE V SlideDownLanes(V v, size_t amt) {
  const DFromV<decltype(v)> d;
  const Repartition<UnsignedFromSize<d.MaxBytes()>, decltype(d)> dv;
  return BitCast(d,
                 ShiftRightSame(BitCast(dv, v),
                                static_cast<int>(amt * sizeof(TFromV<V>) * 8)));
}

// 16 bytes: byte-table lookup; indices shifted past 15 are forced negative
// (MSB set) so TableLookupBytesOr0 zeroes them.
template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE V SlideDownLanes(V v, size_t amt) {
  const DFromV<decltype(v)> d;
  const Repartition<int8_t, decltype(d)> di8;
  auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
  idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
  return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
}

}  // namespace detail

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
  return v;
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
// Constant amt: use the immediate-shift path.
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case
          0:
        return v;
      case 1:
        return ShiftRightLanes<1>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideDownLanes(v, amt);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftRightLanes<1>(d, v);
      case 2:
        return ShiftRightLanes<2>(d, v);
      case 3:
        return ShiftRightLanes<3>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideDownLanes(v, amt);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftRightLanes<1>(d, v);
      case 2:
        return ShiftRightLanes<2>(d, v);
      case 3:
        return ShiftRightLanes<3>(d, v);
      case 4:
        return ShiftRightLanes<4>(d, v);
      case 5:
        return ShiftRightLanes<5>(d, v);
      case 6:
        return ShiftRightLanes<6>(d, v);
      case 7:
        return ShiftRightLanes<7>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideDownLanes(v, amt);
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftRightLanes<1>(d, v);
      case 2:
        return ShiftRightLanes<2>(d, v);
      case 3:
        return ShiftRightLanes<3>(d, v);
      case 4:
        return ShiftRightLanes<4>(d, v);
      case 5:
        return ShiftRightLanes<5>(d, v);
      case 6:
        return ShiftRightLanes<6>(d, v);
      case 7:
        return ShiftRightLanes<7>(d, v);
      case 8:
        return ShiftRightLanes<8>(d, v);
      case 9:
        return ShiftRightLanes<9>(d, v);
      case 10:
        return ShiftRightLanes<10>(d, v);
      case 11:
        return ShiftRightLanes<11>(d, v);
      case 12:
        return ShiftRightLanes<12>(d, v);
      case 13:
        return ShiftRightLanes<13>(d, v);
      case 14:
        return ShiftRightLanes<14>(d, v);
      case 15:
        return ShiftRightLanes<15>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideDownLanes(v, amt);
}

// ================================================== COMBINE

// ------------------------------ Combine (InterleaveLower)

// N = N/2 + N/2 (upper half undefined)
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
  const Half<decltype(d)> dh;
  const RebindToUnsigned<decltype(dh)> duh;
  // Treat half-width input as one lane, and expand to two lanes.
  using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
  const VU lo{BitCast(duh, lo_half).raw};
  const VU hi{BitCast(duh, hi_half).raw};
  return BitCast(d, InterleaveLower(lo, hi));
}

// ------------------------------ ZeroExtendVector (IfThenElseZero)
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
  const Half<D> dh;
  // Keep the lower half of lo, zero the rest.
  return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
}

// ------------------------------ ConcatLowerLower
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatLowerLower(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
}

// ------------------------------ ConcatUpperUpper
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatUpperUpper(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
}

// ------------------------------ ConcatLowerUpper
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatLowerUpper(D d, Vec128<T> hi, Vec128<T> lo) {
  return CombineShiftRightBytes<8>(d, hi, lo);
}

// ------------------------------ ConcatUpperLower
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatUpperLower(D d, Vec128<T> hi, Vec128<T> lo) {
  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
}

// ------------------------------ Concat partial (Combine, LowerHalf)

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatLowerUpper(D d, const VFromD<D> hi,
                                   const VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
}

// ------------------------------ ConcatOdd

// 8-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
  return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15,
                                      17, 19, 21, 23, 25, 27, 29, 31)};
}

// 8-bit x8
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> ConcatOdd(D /* tag */, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half.
  return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21,
                                         23, 1, 3, 5, 7, 17, 19, 21, 23)};
}

// 8-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ConcatOdd(D /* tag */, Vec32<T> hi, Vec32<T> lo) {
  // Don't care about upper 3/4.
  return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17,
                                         19, 1, 3, 17, 19, 1, 3, 17, 19)};
}

// 16-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
  return Vec128<T>{
      wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)};
}

// 16-bit x4
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ConcatOdd(D /* tag */, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half.
  return Vec128<T, 4>{
      wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)};
}

// 32-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ConcatOdd(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
}

// Any T x2
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  return InterleaveUpper(d, lo, hi);
}

// ------------------------------ ConcatEven (InterleaveLower)

// 8-bit full
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) {
  return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14,
                                      16, 18, 20, 22, 24, 26, 28, 30)};
}

// 8-bit x8
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> ConcatEven(D /* tag */, Vec64<T> hi, Vec64<T> lo) {
  // Don't care about upper half.
3753 return Vec64<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20, 22, 3754 0, 2, 4, 6, 16, 18, 20, 22)}; 3755 } 3756 3757 // 8-bit x4 3758 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> 3759 HWY_API Vec32<T> ConcatEven(D /* tag */, Vec32<T> hi, Vec32<T> lo) { 3760 // Don't care about upper 3/4. 3761 return Vec32<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16, 18, 3762 0, 2, 16, 18, 0, 2, 16, 18)}; 3763 } 3764 3765 // 16-bit full 3766 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> 3767 HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) { 3768 return Vec128<T>{ 3769 wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)}; 3770 } 3771 3772 // 16-bit x4 3773 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)> 3774 HWY_API Vec64<T> ConcatEven(D /* tag */, Vec64<T> hi, Vec64<T> lo) { 3775 // Don't care about upper half. 3776 return Vec64<T>{wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)}; 3777 } 3778 3779 // 32-bit full 3780 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)> 3781 HWY_API Vec128<T> ConcatEven(D /* tag */, Vec128<T> hi, Vec128<T> lo) { 3782 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)}; 3783 } 3784 3785 // Any T x2 3786 template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)> 3787 HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { 3788 return InterleaveLower(d, lo, hi); 3789 } 3790 3791 // ------------------------------ DupEven (InterleaveLower) 3792 3793 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 3794 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { 3795 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6, 3796 8, 8, 10, 10, 12, 12, 14, 14)}; 3797 } 3798 3799 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 3800 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { 3801 return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6)}; 
3802 } 3803 3804 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 3805 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { 3806 return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)}; 3807 } 3808 3809 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 3810 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) { 3811 return InterleaveLower(DFromV<decltype(v)>(), v, v); 3812 } 3813 3814 // ------------------------------ DupOdd (InterleaveUpper) 3815 3816 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)> 3817 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { 3818 return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7, 3819 9, 9, 11, 11, 13, 13, 15, 15)}; 3820 } 3821 3822 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 3823 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { 3824 return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7)}; 3825 } 3826 3827 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 3828 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { 3829 return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)}; 3830 } 3831 3832 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 3833 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) { 3834 return InterleaveUpper(DFromV<decltype(v)>(), v, v); 3835 } 3836 3837 // ------------------------------ OddEven 3838 3839 namespace detail { 3840 3841 template <typename T, size_t N> 3842 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a, 3843 const Vec128<T, N> b) { 3844 const DFromV<decltype(a)> d; 3845 const Repartition<uint8_t, decltype(d)> d8; 3846 alignas(16) static constexpr uint8_t mask[16] = { 3847 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; 3848 return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); 3849 } 3850 template <typename T, size_t N> 3851 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a, 3852 const Vec128<T, N> b) { 3853 return Vec128<T, N>{ 3854 
wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; 3855 } 3856 template <typename T, size_t N> 3857 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a, 3858 const Vec128<T, N> b) { 3859 return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; 3860 } 3861 template <typename T, size_t N> 3862 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a, 3863 const Vec128<T, N> b) { 3864 return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)}; 3865 } 3866 3867 } // namespace detail 3868 3869 template <typename T, size_t N> 3870 HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { 3871 return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b); 3872 } 3873 template <size_t N> 3874 HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a, 3875 const Vec128<float, N> b) { 3876 return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; 3877 } 3878 3879 // ------------------------------ InterleaveEven 3880 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 3881 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 3882 return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 2, 18, 4, 20, 6, 22, 3883 8, 24, 10, 26, 12, 28, 14, 30)}; 3884 } 3885 3886 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 3887 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 3888 return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 2, 10, 4, 12, 6, 14)}; 3889 } 3890 3891 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> 3892 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 3893 return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 2, 6)}; 3894 } 3895 3896 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> 3897 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 3898 return InterleaveLower(a, b); 3899 } 3900 3901 // 
------------------------------ InterleaveOdd 3902 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)> 3903 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { 3904 return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 17, 3, 19, 5, 21, 7, 23, 3905 9, 25, 11, 27, 13, 29, 15, 31)}; 3906 } 3907 3908 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)> 3909 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { 3910 return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 9, 3, 11, 5, 13, 7, 15)}; 3911 } 3912 3913 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)> 3914 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { 3915 return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 5, 3, 7)}; 3916 } 3917 3918 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)> 3919 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { 3920 return InterleaveUpper(d, a, b); 3921 } 3922 3923 // ------------------------------ OddEvenBlocks 3924 template <typename T, size_t N> 3925 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { 3926 return even; 3927 } 3928 3929 // ------------------------------ SwapAdjacentBlocks 3930 template <typename T, size_t N> 3931 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { 3932 return v; 3933 } 3934 3935 // ------------------------------ InterleaveEvenBlocks 3936 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> 3937 HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) { 3938 return a; 3939 } 3940 // ------------------------------ InterleaveOddBlocks 3941 template <class D, class V = VFromD<D>, HWY_IF_V_SIZE_LE_D(D, 16)> 3942 HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) { 3943 return a; 3944 } 3945 3946 // ------------------------------ ReverseBlocks 3947 template <class D> 3948 HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) { 3949 return v; // Single block: no change 3950 } 3951 
3952 // ================================================== CONVERT 3953 3954 // ------------------------------ Promotions (part w/ narrow lanes -> full) 3955 3956 // Unsigned: zero-extend. 3957 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)> 3958 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { 3959 return VFromD<D>{wasm_u16x8_extend_low_u8x16(v.raw)}; 3960 } 3961 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> 3962 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { 3963 return VFromD<D>{wasm_u32x4_extend_low_u16x8(v.raw)}; 3964 } 3965 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> 3966 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { 3967 return VFromD<D>{wasm_u64x2_extend_low_u32x4(v.raw)}; 3968 } 3969 3970 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)> 3971 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { 3972 return VFromD<D>{ 3973 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; 3974 } 3975 3976 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)> 3977 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { 3978 return VFromD<D>{wasm_u16x8_extend_low_u8x16(v.raw)}; 3979 } 3980 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> 3981 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) { 3982 return VFromD<D>{wasm_u32x4_extend_low_u16x8(v.raw)}; 3983 } 3984 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> 3985 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { 3986 return VFromD<D>{wasm_u64x2_extend_low_u32x4(v.raw)}; 3987 } 3988 3989 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> 3990 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) { 3991 return VFromD<D>{ 3992 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; 3993 } 3994 3995 // U8/U16 to 
U64/I64: First, zero-extend to U32, and then zero-extend to 3996 // TFromD<D> 3997 template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_UI64_D(D), 3998 HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_UNSIGNED_V(V), 3999 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> 4000 HWY_API VFromD<D> PromoteTo(D d, V v) { 4001 const Rebind<uint32_t, decltype(d)> du32; 4002 return PromoteTo(d, PromoteTo(du32, v)); 4003 } 4004 4005 // Signed: replicate sign bit. 4006 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I16_D(D)> 4007 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { 4008 return VFromD<D>{wasm_i16x8_extend_low_i8x16(v.raw)}; 4009 } 4010 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> 4011 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { 4012 return VFromD<D>{wasm_i32x4_extend_low_i16x8(v.raw)}; 4013 } 4014 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> 4015 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 4016 return VFromD<D>{wasm_i64x2_extend_low_i32x4(v.raw)}; 4017 } 4018 4019 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)> 4020 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) { 4021 return VFromD<D>{ 4022 wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; 4023 } 4024 4025 // I8/I16 to I64: First, promote to I32, and then promote to I64 4026 template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D), 4027 HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_SIGNED_V(V), 4028 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> 4029 HWY_API VFromD<D> PromoteTo(D d, V v) { 4030 const Rebind<int32_t, decltype(d)> di32; 4031 return PromoteTo(d, PromoteTo(di32, v)); 4032 } 4033 4034 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)> 4035 HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) { 4036 const Rebind<uint16_t, decltype(df32)> du16; 4037 const 
RebindToSigned<decltype(df32)> di32; 4038 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); 4039 } 4040 4041 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 4042 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 4043 return VFromD<D>{wasm_f64x2_convert_low_i32x4(v.raw)}; 4044 } 4045 4046 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 4047 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) { 4048 return VFromD<D>{wasm_f64x2_convert_low_u32x4(v.raw)}; 4049 } 4050 4051 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)> 4052 HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float, D>> v) { 4053 return VFromD<D>{wasm_f64x2_promote_low_f32x4(v.raw)}; 4054 } 4055 4056 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I64_D(D)> 4057 HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) { 4058 const Rebind<int32_t, decltype(di64)> di32; 4059 const RebindToFloat<decltype(di32)> df32; 4060 const RebindToUnsigned<decltype(di32)> du32; 4061 const Repartition<uint8_t, decltype(du32)> du32_as_du8; 4062 4063 const auto exponent_adj = BitCast( 4064 du32, 4065 Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), 4066 BitCast(du32_as_du8, Set(du32, uint32_t{157}))), 4067 BitCast(du32_as_du8, Set(du32, uint32_t{32})))); 4068 const auto adj_v = 4069 BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); 4070 4071 const auto f32_to_i32_result = ConvertTo(di32, adj_v); 4072 const auto lo64_or_mask = PromoteTo( 4073 di64, 4074 BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result, 4075 Set(di32, LimitsMax<int32_t>()))))); 4076 4077 return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result)) 4078 << PromoteTo(di64, exponent_adj), 4079 lo64_or_mask); 4080 } 4081 4082 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> 4083 HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) { 4084 const Rebind<uint32_t, 
decltype(du64)> du32; 4085 const RebindToFloat<decltype(du32)> df32; 4086 const Repartition<uint8_t, decltype(du32)> du32_as_du8; 4087 4088 const auto exponent_adj = BitCast( 4089 du32, 4090 Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), 4091 BitCast(du32_as_du8, Set(du32, uint32_t{158}))), 4092 BitCast(du32_as_du8, Set(du32, uint32_t{32})))); 4093 4094 const auto adj_v = 4095 BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); 4096 const auto f32_to_u32_result = ConvertTo(du32, adj_v); 4097 const auto lo32_or_mask = PromoteTo( 4098 du64, 4099 VecFromMask(du32, f32_to_u32_result == Set(du32, LimitsMax<uint32_t>()))); 4100 4101 return Or(PromoteTo(du64, f32_to_u32_result) << PromoteTo(du64, exponent_adj), 4102 lo32_or_mask); 4103 } 4104 4105 // ------------------------------ PromoteUpperTo 4106 4107 // Per-target flag to prevent generic_ops-inl.h from defining PromoteUpperTo. 4108 #ifdef HWY_NATIVE_PROMOTE_UPPER_TO 4109 #undef HWY_NATIVE_PROMOTE_UPPER_TO 4110 #else 4111 #define HWY_NATIVE_PROMOTE_UPPER_TO 4112 #endif 4113 4114 // Unsigned: zero-extend. 
// Promotes the upper half of the input to the full-width unsigned lane type.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
                                 VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>{wasm_u16x8_extend_high_u8x16(v.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
                                 VFromD<Repartition<uint16_t, D>> v) {
  return VFromD<D>{wasm_u32x4_extend_high_u16x8(v.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
                                 VFromD<Repartition<uint32_t, D>> v) {
  return VFromD<D>{wasm_u64x2_extend_high_u32x4(v.raw)};
}

// Unsigned to wider signed: also zero-extend (values are non-negative).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
                                 VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>{wasm_u16x8_extend_high_u8x16(v.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
                                 VFromD<Repartition<uint16_t, D>> v) {
  return VFromD<D>{wasm_u32x4_extend_high_u16x8(v.raw)};
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D /* tag */,
                                 VFromD<Repartition<uint32_t, D>> v) {
  return VFromD<D>{wasm_u64x2_extend_high_u32x4(v.raw)};
}

// Signed: replicate sign bit.
4148 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)> 4149 HWY_API VFromD<D> PromoteUpperTo(D /* tag */, 4150 VFromD<Repartition<int8_t, D>> v) { 4151 return VFromD<D>{wasm_i16x8_extend_high_i8x16(v.raw)}; 4152 } 4153 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)> 4154 HWY_API VFromD<D> PromoteUpperTo(D /* tag */, 4155 VFromD<Repartition<int16_t, D>> v) { 4156 return VFromD<D>{wasm_i32x4_extend_high_i16x8(v.raw)}; 4157 } 4158 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)> 4159 HWY_API VFromD<D> PromoteUpperTo(D /* tag */, 4160 VFromD<Repartition<int32_t, D>> v) { 4161 return VFromD<D>{wasm_i64x2_extend_high_i32x4(v.raw)}; 4162 } 4163 4164 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> 4165 HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<float16_t, D>> v) { 4166 const Rebind<float16_t, decltype(df32)> dh; 4167 return PromoteTo(df32, UpperHalf(dh, v)); 4168 } 4169 4170 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)> 4171 HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<bfloat16_t, D>> v) { 4172 const Repartition<uint16_t, decltype(df32)> du16; 4173 const RebindToSigned<decltype(df32)> di32; 4174 return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v)))); 4175 } 4176 4177 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> 4178 HWY_API VFromD<D> PromoteUpperTo(D dd, VFromD<Repartition<int32_t, D>> v) { 4179 // There is no wasm_f64x2_convert_high_i32x4. 4180 return PromoteTo(dd, UpperHalf(Rebind<int32_t, D>(), v)); 4181 } 4182 4183 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> 4184 HWY_API VFromD<D> PromoteUpperTo(D dd, VFromD<Repartition<uint32_t, D>> v) { 4185 // There is no wasm_f64x2_convert_high_u32x4. 
4186 return PromoteTo(dd, UpperHalf(Rebind<uint32_t, D>(), v)); 4187 } 4188 4189 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)> 4190 HWY_API VFromD<D> PromoteUpperTo(D dd, VFromD<Repartition<float, D>> v) { 4191 // There is no wasm_f64x2_promote_high_f32x4. 4192 return PromoteTo(dd, UpperHalf(Rebind<float, D>(), v)); 4193 } 4194 4195 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)> 4196 HWY_API VFromD<D> PromoteUpperTo(D d64, VFromD<Repartition<float, D>> v) { 4197 return PromoteTo(d64, UpperHalf(Rebind<float, D>(), v)); 4198 } 4199 4200 // Generic version for <=64 bit input/output (_high is only for full vectors). 4201 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V> 4202 HWY_API VFromD<D> PromoteUpperTo(D d, V v) { 4203 const Rebind<TFromV<V>, decltype(d)> dh; 4204 return PromoteTo(d, UpperHalf(dh, v)); 4205 } 4206 4207 // ------------------------------ PromoteEvenTo/PromoteOddTo 4208 #include "hwy/ops/inside-inl.h" 4209 4210 // ------------------------------ Demotions (full -> part w/ narrow lanes) 4211 4212 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> 4213 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 4214 return VFromD<D>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)}; 4215 } 4216 4217 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)> 4218 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 4219 return VFromD<D>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)}; 4220 } 4221 4222 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> 4223 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 4224 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); 4225 return VFromD<D>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; 4226 } 4227 4228 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> 4229 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { 4230 return VFromD<D>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)}; 4231 
} 4232 4233 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> 4234 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) { 4235 const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); 4236 return VFromD<D>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; 4237 } 4238 4239 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)> 4240 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) { 4241 return VFromD<D>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)}; 4242 } 4243 4244 template <class D, HWY_IF_UNSIGNED_D(D), 4245 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> 4246 HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint32_t, D>> v) { 4247 const DFromV<decltype(v)> du32; 4248 const RebindToSigned<decltype(du32)> di32; 4249 return DemoteTo(dn, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF)))); 4250 } 4251 4252 template <class D, HWY_IF_U8_D(D)> 4253 HWY_API VFromD<D> DemoteTo(D du8, VFromD<Rebind<uint16_t, D>> v) { 4254 const DFromV<decltype(v)> du16; 4255 const RebindToSigned<decltype(du16)> di16; 4256 return DemoteTo(du8, BitCast(di16, Min(v, Set(du16, 0x7FFF)))); 4257 } 4258 4259 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)> 4260 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) { 4261 return VFromD<D>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)}; 4262 } 4263 4264 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)> 4265 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) { 4266 return VFromD<D>{wasm_u32x4_trunc_sat_f64x2_zero(v.raw)}; 4267 } 4268 4269 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> 4270 HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<double, D>> v) { 4271 return VFromD<D>{wasm_f32x4_demote_f64x2_zero(v.raw)}; 4272 } 4273 4274 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> 4275 HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) { 4276 const Rebind<double, decltype(df32)> df64; 4277 const 
RebindToUnsigned<decltype(df64)> du64; 4278 const RebindToSigned<decltype(df32)> di32; 4279 const RebindToUnsigned<decltype(df32)> du32; 4280 4281 const auto k2p64_63 = Set(df64, 27670116110564327424.0); 4282 const auto f64_hi52 = 4283 Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63; 4284 const auto f64_lo12 = 4285 PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), 4286 Set(du32, uint32_t{0x00000FFF})))); 4287 4288 const auto f64_sum = f64_hi52 + f64_lo12; 4289 const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; 4290 4291 const auto f64_sum_is_inexact = 4292 ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); 4293 const auto f64_bits_decrement = 4294 And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))), 4295 f64_sum_is_inexact); 4296 4297 const auto adj_f64_val = BitCast( 4298 df64, 4299 Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact)); 4300 4301 return DemoteTo(df32, adj_f64_val); 4302 } 4303 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)> 4304 HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) { 4305 const Rebind<double, decltype(df32)> df64; 4306 const RebindToUnsigned<decltype(df64)> du64; 4307 const RebindToSigned<decltype(df32)> di32; 4308 const RebindToUnsigned<decltype(df32)> du32; 4309 4310 const auto k2p64 = Set(df64, 18446744073709551616.0); 4311 const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64; 4312 const auto f64_lo12 = 4313 PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), 4314 Set(du32, uint32_t{0x00000FFF})))); 4315 4316 const auto f64_sum = f64_hi52 + f64_lo12; 4317 const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; 4318 const auto f64_sum_is_inexact = 4319 ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); 4320 4321 const auto adj_f64_val = BitCast( 4322 df64, 4323 Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)), 4324 
f64_sum_is_inexact)); 4325 4326 return DemoteTo(df32, adj_f64_val); 4327 } 4328 4329 // Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes 4330 // above 2*N. 4331 template <class D, HWY_IF_I16_D(D)> 4332 HWY_API Vec32<int16_t> ReorderDemote2To(D dn, Vec32<int32_t> a, 4333 Vec32<int32_t> b) { 4334 const DFromV<decltype(a)> d; 4335 const Twice<decltype(d)> dt; 4336 return DemoteTo(dn, Combine(dt, b, a)); 4337 } 4338 template <class D, HWY_IF_I16_D(D)> 4339 HWY_API Vec64<int16_t> ReorderDemote2To(D dn, Vec64<int32_t> a, 4340 Vec64<int32_t> b) { 4341 const Twice<decltype(dn)> dn_full; 4342 const Repartition<uint32_t, decltype(dn_full)> du32_full; 4343 4344 const Vec128<int16_t> v_full{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; 4345 const auto vu32_full = BitCast(du32_full, v_full); 4346 return LowerHalf( 4347 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); 4348 } 4349 template <class D, HWY_IF_I16_D(D)> 4350 HWY_API Vec128<int16_t> ReorderDemote2To(D /* tag */, Vec128<int32_t> a, 4351 Vec128<int32_t> b) { 4352 return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; 4353 } 4354 4355 template <class D, HWY_IF_U16_D(D)> 4356 HWY_API Vec32<uint16_t> ReorderDemote2To(D dn, Vec32<int32_t> a, 4357 Vec32<int32_t> b) { 4358 const DFromV<decltype(a)> d; 4359 const Twice<decltype(d)> dt; 4360 return DemoteTo(dn, Combine(dt, b, a)); 4361 } 4362 template <class D, HWY_IF_U16_D(D)> 4363 HWY_API Vec64<uint16_t> ReorderDemote2To(D dn, Vec64<int32_t> a, 4364 Vec64<int32_t> b) { 4365 const Twice<decltype(dn)> dn_full; 4366 const Repartition<uint32_t, decltype(dn_full)> du32_full; 4367 4368 const Vec128<int16_t> v_full{wasm_u16x8_narrow_i32x4(a.raw, b.raw)}; 4369 const auto vu32_full = BitCast(du32_full, v_full); 4370 return LowerHalf( 4371 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); 4372 } 4373 template <class D, HWY_IF_U16_D(D)> 4374 HWY_API Vec128<uint16_t> ReorderDemote2To(D /* tag */, Vec128<int32_t> a, 4375 
Vec128<int32_t> b) { 4376 return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(a.raw, b.raw)}; 4377 } 4378 4379 template <class D, HWY_IF_U16_D(D)> 4380 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint32_t> a, 4381 Vec128<uint32_t> b) { 4382 const DFromV<decltype(a)> du32; 4383 const RebindToSigned<decltype(du32)> di32; 4384 const auto max_i32 = Set(du32, 0x7FFFFFFFu); 4385 4386 const auto clamped_a = BitCast(di32, Min(a, max_i32)); 4387 const auto clamped_b = BitCast(di32, Min(b, max_i32)); 4388 return ReorderDemote2To(dn, clamped_a, clamped_b); 4389 } 4390 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)> 4391 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint32_t, D>> a, 4392 VFromD<Repartition<uint32_t, D>> b) { 4393 const DFromV<decltype(a)> d; 4394 const Twice<decltype(d)> dt; 4395 return DemoteTo(dn, Combine(dt, b, a)); 4396 } 4397 4398 // Specializations for partial vectors because i8x16_narrow_i16x8 sets lanes 4399 // above 2*N. 4400 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)> 4401 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, 4402 VFromD<Repartition<int16_t, D>> b) { 4403 const DFromV<decltype(a)> d; 4404 const Twice<decltype(d)> dt; 4405 return DemoteTo(dn, Combine(dt, b, a)); 4406 } 4407 template <class D, HWY_IF_I8_D(D)> 4408 HWY_API Vec64<int8_t> ReorderDemote2To(D dn, Vec64<int16_t> a, 4409 Vec64<int16_t> b) { 4410 const Twice<decltype(dn)> dn_full; 4411 const Repartition<uint32_t, decltype(dn_full)> du32_full; 4412 4413 const Vec128<int8_t> v_full{wasm_i8x16_narrow_i16x8(a.raw, b.raw)}; 4414 const auto vu32_full = BitCast(du32_full, v_full); 4415 return LowerHalf( 4416 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); 4417 } 4418 template <class D, HWY_IF_I8_D(D)> 4419 HWY_API Vec128<int8_t> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, 4420 Vec128<int16_t> b) { 4421 return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(a.raw, b.raw)}; 4422 } 4423 4424 template <class 
D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)> 4425 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int16_t, D>> a, 4426 VFromD<Repartition<int16_t, D>> b) { 4427 const DFromV<decltype(a)> d; 4428 const Twice<decltype(d)> dt; 4429 return DemoteTo(dn, Combine(dt, b, a)); 4430 } 4431 template <class D, HWY_IF_U8_D(D)> 4432 HWY_API Vec64<uint8_t> ReorderDemote2To(D dn, Vec64<int16_t> a, 4433 Vec64<int16_t> b) { 4434 const Twice<decltype(dn)> dn_full; 4435 const Repartition<uint32_t, decltype(dn_full)> du32_full; 4436 4437 const Vec128<uint8_t> v_full{wasm_u8x16_narrow_i16x8(a.raw, b.raw)}; 4438 const auto vu32_full = BitCast(du32_full, v_full); 4439 return LowerHalf( 4440 BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); 4441 } 4442 template <class D, HWY_IF_U8_D(D)> 4443 HWY_API Vec128<uint8_t> ReorderDemote2To(D /* tag */, Vec128<int16_t> a, 4444 Vec128<int16_t> b) { 4445 return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(a.raw, b.raw)}; 4446 } 4447 4448 template <class D, HWY_IF_U8_D(D)> 4449 HWY_API VFromD<D> ReorderDemote2To(D dn, Vec128<uint16_t> a, 4450 Vec128<uint16_t> b) { 4451 const DFromV<decltype(a)> du16; 4452 const RebindToSigned<decltype(du16)> di16; 4453 const auto max_i16 = Set(du16, 0x7FFFu); 4454 4455 const auto clamped_a = BitCast(di16, Min(a, max_i16)); 4456 const auto clamped_b = BitCast(di16, Min(b, max_i16)); 4457 return ReorderDemote2To(dn, clamped_a, clamped_b); 4458 } 4459 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U8_D(D)> 4460 HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint16_t, D>> a, 4461 VFromD<Repartition<uint16_t, D>> b) { 4462 const DFromV<decltype(a)> d; 4463 const Twice<decltype(d)> dt; 4464 return DemoteTo(dn, Combine(dt, b, a)); 4465 } 4466 4467 // For already range-limited input [0, 255]. 
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
  // Two narrowing steps: u32 -> i16 -> u8. Safe because the caller guarantees
  // all lanes are within [0, 255], so no saturation changes the values.
  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return Vec128<uint8_t, N>{
      wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}

// ------------------------------ Truncations

// Single-lane truncation: just reinterpret the low bytes.
template <typename From, class DTo, HWY_IF_LANES_D(DTo, 1)>
HWY_API VFromD<DTo> TruncateTo(DTo /* tag */, Vec128<From, 1> v) {
  // BitCast requires the same size; DTo might be u8x1 and v u16x1.
  const Repartition<TFromD<DTo>, DFromV<decltype(v)>> dto;
  return VFromD<DTo>{BitCast(dto, v).raw};
}

// u64x2 -> u8x2: repeatedly gather even-indexed bytes until the two low bytes
// of the two u64 lanes land in the lowest two byte positions.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec16<uint8_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  const Full128<uint8_t> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = ConcatEven(d, v1, v1);
  const auto v4 = ConcatEven(d, v2, v2);
  return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4))));
}

// u64x2 -> u16x2.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec32<uint16_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  const Full128<uint16_t> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = ConcatEven(d, v1, v1);
  return LowerHalf(LowerHalf(ConcatEven(d, v2, v2)));
}

// u64x2 -> u32x2.
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  const Full128<uint32_t> d;
  const auto v1 = BitCast(d, v);
  return LowerHalf(ConcatEven(d, v1, v1));
}

// u32 -> u8: two even-byte gathers drop the upper three bytes of each lane.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d;
  const auto v1 = Vec128<uint8_t>{v.raw};
  const auto v2 = ConcatEven(d, v1, v1);
  const auto v3 = ConcatEven(d, v2, v2);
  return VFromD<D>{v3.raw};
}

// u32 -> u16.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  const Repartition<uint16_t, DFromV<decltype(v)>> d;
  const auto v1 = Vec128<uint16_t>{v.raw};
  const auto v2 = ConcatEven(d, v1, v1);
  return VFromD<D>{v2.raw};
}

// u16 -> u8.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d;
  const auto v1 = Vec128<uint8_t>{v.raw};
  const auto v2 = ConcatEven(d, v1, v1);
  return VFromD<D>{v2.raw};
}

// ------------------------------ Demotions to/from i64

namespace detail {
// Unsigned destination: the saturated u64 value is already correct.
template <class D, HWY_IF_UNSIGNED_D(D)>
HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
    D /*dn*/, VFromD<Rebind<uint64_t, D>> v) {
  return v;
}

// Signed destination: clear bits above the destination's maximum so the
// saturated value fits in the signed range.
template <class D, HWY_IF_SIGNED_D(D)>
HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64MaskOutResult(
    D /*dn*/, VFromD<Rebind<uint64_t, D>> v) {
  const DFromV<decltype(v)> du64;
  return And(v,
             Set(du64, static_cast<uint64_t>(hwy::HighestValue<TFromD<D>>())));
}

// Saturates each u64 lane to the destination type's maximum: any lane with
// bits above the destination range is forced to all-ones before masking.
template <class D>
HWY_INLINE VFromD<Rebind<uint64_t, D>> DemoteFromU64Saturate(
    D dn, VFromD<Rebind<uint64_t, D>> v) {
  const Rebind<uint64_t, D> du64;
  const RebindToSigned<decltype(du64)> di64;
  // Number of value bits in the destination (one less if it is signed).
  constexpr int kShiftAmt = static_cast<int>(sizeof(TFromD<D>) * 8) -
                            static_cast<int>(hwy::IsSigned<TFromD<D>>());

  const auto too_big = BitCast(
      du64, VecFromMask(
                di64, Gt(BitCast(di64, ShiftRight<kShiftAmt>(v)), Zero(di64))));
  return DemoteFromU64MaskOutResult(dn, Or(v, too_big));
}

// Interleaves the lower 32 bits of two 64-bit-lane vectors: a's lanes in even
// positions, b's in odd.
template <class D, class V>
HWY_INLINE VFromD<D> ReorderDemote2From64To32Combine(D dn, V a, V b) {
  return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a));
}

}  // namespace detail

// i64 -> signed narrower type, with saturation.
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_SIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
  const DFromV<decltype(v)> di64;
  const RebindToUnsigned<decltype(di64)> du64;
  const RebindToUnsigned<decltype(dn)> dn_u;

  // Negative values are saturated by first saturating their bitwise inverse
  // and then inverting the saturation result
  const auto invert_mask = BitCast(du64, BroadcastSignBit(v));
  const auto saturated_vals = Xor(
      invert_mask,
      detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v))));
  return BitCast(dn, TruncateTo(dn_u, saturated_vals));
}

// i64 -> unsigned narrower type: negative inputs clamp to zero first.
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_UNSIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<int64_t, D>> v) {
  const DFromV<decltype(v)> di64;
  const RebindToUnsigned<decltype(di64)> du64;

  const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v));
  return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals));
}

// u64 -> unsigned narrower type, with saturation.
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_UNSIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D dn, VFromD<Rebind<uint64_t, D>> v) {
  return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v));
}

// Partial-vector 64-bit -> 32-bit two-input demotions: combine then demote.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 4),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)>
HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<int64_t, D>> a,
                                   VFromD<Repartition<int64_t, D>> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D dn, VFromD<Repartition<uint64_t, D>> a,
                                   VFromD<Repartition<uint64_t, D>> b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
}

// Full-vector i64x2 + i64x2 -> i32x4 with signed saturation.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
                                         Vec128<int64_t> b) {
  const DFromV<decltype(a)> di64;
  const RebindToUnsigned<decltype(di64)> du64;
  const Half<decltype(dn)> dnh;

  // Negative values are saturated by first saturating their bitwise inverse
  // and then inverting the saturation result
  const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a));
  const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b));
  const auto saturated_a = Xor(
      invert_mask_a,
      detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a))));
  const auto saturated_b = Xor(
      invert_mask_b,
      detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b))));

  return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
}

// Full-vector i64x2 + i64x2 -> u32x4: negatives clamp to zero.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<int64_t> a,
                                          Vec128<int64_t> b) {
  const DFromV<decltype(a)> di64;
  const RebindToUnsigned<decltype(di64)> du64;
  const Half<decltype(dn)> dnh;

  const auto saturated_a = detail::DemoteFromU64Saturate(
      dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a)));
  const auto saturated_b = detail::DemoteFromU64Saturate(
      dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b)));

  return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
}

// Full-vector u64x2 + u64x2 -> u32x4 with unsigned saturation.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> ReorderDemote2To(D dn, Vec128<uint64_t> a,
                                          Vec128<uint64_t> b) {
  const Half<decltype(dn)> dnh;

  const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a);
  const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b);

  return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a));
}

// On this target ReorderDemote2To already preserves order, so OrderedDemote2To
// is the same operation.
template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  return ReorderDemote2To(d, a, b);
}

// ------------------------------ ConvertTo

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>{wasm_f32x4_convert_i32x4(v.raw)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  return VFromD<D>{wasm_f32x4_convert_u32x4(v.raw)};
}

// i64 -> f64 without a native instruction: splice the two 32-bit halves into
// doubles with fixed exponents, then combine with an exact subtraction.
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<int64_t, D>> v) {
  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
  const Repartition<uint32_t, decltype(dd)> d32;
  const Repartition<uint64_t, decltype(dd)> d64;

  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);

  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
  const auto k52 = Set(d32, 0x43300000);
  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));

  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
  return (v_upper - k84_63_52) + v_lower;  // order matters!
}

namespace detail {
// Converts u64 lanes known to be < 2^52 to f64 by OR-ing in the 2^52 exponent
// bits and subtracting 2^52.
template <class VW>
HWY_INLINE VFromD<Rebind<double, DFromV<VW>>> U64ToF64VecFast(VW w) {
  const DFromV<decltype(w)> d64;
  const RebindToFloat<decltype(d64)> dd;
  const auto cnst2_52_dbl = Set(dd, 0x0010000000000000);  // 2^52
  return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl;
}
}  // namespace detail

// u64 -> f64: convert the low and high 32-bit halves separately and combine
// as hi * 2^32 + lo via MulAdd.
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> ConvertTo(D dd, VFromD<Rebind<uint64_t, D>> v) {
  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
  const RebindToUnsigned<decltype(dd)> d64;
  using VU = VFromD<decltype(d64)>;

  const VU msk_lo = Set(d64, 0xFFFFFFFF);
  const auto cnst2_32_dbl = Set(dd, 4294967296.0);  // 2^32

  // Extract the 32 lowest/highest significant bits of v
  const VU v_lo = And(v, msk_lo);
  const VU v_hi = ShiftRight<32>(v);

  const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo);
  return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl);
}

// Truncates (rounds toward zero).
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  return VFromD<D>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  return VFromD<D>{wasm_u32x4_trunc_sat_f32x4(v.raw)};
}

// f64 -> i64 truncation without a native instruction: extract the mantissa and
// shift it into integer position, saturating out-of-range inputs.
template <class DI, HWY_IF_I64_D(DI)>
HWY_API VFromD<DI> ConvertTo(DI di, VFromD<Rebind<double, DI>> v) {
  using VI = VFromD<decltype(di)>;
  using MI = MFromD<decltype(di)>;
  const RebindToUnsigned<decltype(di)> du;
  using VU = VFromD<decltype(du)>;
  const Repartition<uint16_t, decltype(di)> du16;
  const VI k1075 = Set(di, 1075);  // biased exponent of 2^52

  // Exponent indicates whether the number can be represented as int64_t.
  const VU biased_exp = ShiftRight<52>(BitCast(du, v)) & Set(du, 0x7FF);
  const MI in_range = BitCast(di, biased_exp) < Set(di, 1086);

  // If we were to cap the exponent at 51 and add 2^52, the number would be in
  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
  // round-to-0 (truncate).
  // Use 16-bit saturated unsigned subtraction to compute shift_mnt and
  // shift_int since biased_exp[i] is a non-negative integer that is less than
  // or equal to 2047.
  // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be
  // zero as the upper 48 bits of both k1075 and biased_exp are zero.

  const VU shift_mnt = BitCast(
      du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
  const VU shift_int = BitCast(
      du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
  const VU mantissa = BitCast(du, v) & Set(du, (1ULL << 52) - 1);
  // Include implicit 1-bit
  VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt;
  // WASM clamps shift count; zero if greater.
  const MI tiny = BitCast(di, shift_mnt) > Set(di, 63);
  int53 = IfThenZeroElse(RebindMask(du, tiny), int53);

  // For inputs larger than 2^53 - 1, insert zeros at the bottom.
  // For inputs less than 2^63, the implicit 1-bit is guaranteed not to be
  // shifted out of the left shift result below as shift_int[i] <= 10 is true
  // for any inputs that are less than 2^63.
  const VU shifted = int53 << shift_int;

  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
  const VI magnitude = IfThenElse(in_range, BitCast(di, shifted), limit);

  // If the input was negative, negate the integer (two's complement).
  return (magnitude ^ sign_mask) - sign_mask;
}

// f64 -> u64 truncation: same mantissa-shifting scheme; negative inputs are
// zeroed up front and out-of-range inputs saturate to all-ones.
template <class DU, HWY_IF_U64_D(DU)>
HWY_API VFromD<DU> ConvertTo(DU du, VFromD<Rebind<double, DU>> v) {
  const RebindToSigned<decltype(du)> di;
  using MI = MFromD<decltype(di)>;
  using VU = VFromD<decltype(du)>;
  const Repartition<uint16_t, decltype(di)> du16;
  const VU k1075 = Set(du, 1075); /* biased exponent of 2^52 */

  const auto non_neg_v = ZeroIfNegative(v);

  // Exponent indicates whether the number can be represented as int64_t.
  const VU biased_exp = ShiftRight<52>(BitCast(du, non_neg_v));
  const VU out_of_range =
      BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086)));

  // If we were to cap the exponent at 51 and add 2^52, the number would be in
  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
  // manually shift the mantissa into place (we already have many of the
  // inputs anyway).

  // Use 16-bit saturated unsigned subtraction to compute shift_mnt and
  // shift_int since biased_exp[i] is a non-negative integer that is less than
  // or equal to 2047.

  // 16-bit saturated unsigned subtraction is also more efficient than a
  // 64-bit subtraction followed by a 64-bit signed Max operation on
  // WASM.

  // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be
  // zero as the upper 48 bits of both k1075 and biased_exp are zero.

  const VU shift_mnt = BitCast(
      du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp)));
  const VU shift_int = BitCast(
      du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075)));
  const VU mantissa = BitCast(du, non_neg_v) & Set(du, (1ULL << 52) - 1);
  // Include implicit 1-bit.
  VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt;
  // WASM clamps shift count; zero if greater.
  const MI tiny = BitCast(di, shift_mnt) > Set(di, 63);
  int53 = IfThenZeroElse(RebindMask(du, tiny), int53);

  // For inputs larger than 2^53 - 1, insert zeros at the bottom.

  // For inputs less than 2^64, the implicit 1-bit is guaranteed not to be
  // shifted out of the left shift result below as shift_int[i] <= 11 is true
  // for any inputs that are less than 2^64.

  const VU shifted = int53 << shift_int;
  return (shifted | out_of_range);
}

// ------------------------------ NearestInt (Round)
template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<MakeSigned<T>, N> NearestInt(const Vec128<T, N> v) {
  return ConvertTo(RebindToSigned<DFromV<decltype(v)>>(), Round(v));
}

// ------------------------------ DemoteToNearestInt (Round)
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
                                        VFromD<Rebind<double, DI32>> v) {
  // No single instruction, round then demote.
  return DemoteTo(di32, Round(v));
}

// ================================================== MISC

// ------------------------------ SumsOf8 (ShiftRight, Add)
// Sums each group of 8 consecutive u8 lanes into a u64 lane, via a tree of
// pairwise adds at 16/32/64-bit granularity. Variable names encode which
// original byte indices (hex) each lane currently holds.
template <size_t N>
HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
  const DFromV<decltype(v)> du8;
  const RepartitionToWide<decltype(du8)> du16;
  const RepartitionToWide<decltype(du16)> du32;
  const RepartitionToWide<decltype(du32)> du64;
  using VU16 = VFromD<decltype(du16)>;

  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
  const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF));
  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);

  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
      BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
      Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
      BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
      Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
  return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
}

// Signed variant: shifts accumulate toward the top 16 bits and a final
// arithmetic shift sign-extends the sum into the u64 lane.
template <size_t N>
HWY_API Vec128<int64_t, N / 8> SumsOf8(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> di8;
  const RepartitionToWide<decltype(di8)> di16;
  const RepartitionToWide<decltype(di16)> di32;
  const RepartitionToWide<decltype(di32)> di64;
  const RebindToUnsigned<decltype(di32)> du32;
  const RebindToUnsigned<decltype(di64)> du64;
  using VI16 = VFromD<decltype(di16)>;

  const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
  // Sign-extend the even bytes by shifting them up and arithmetically back.
  const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
  const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);

  const VI16 sDC_zz_98_zz_54_zz_10_zz =
      BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
  const VI16 sFC_xx_B8_xx_74_xx_30_xx =
      Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
  const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
      BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
  const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
      Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
  return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
}

// ------------------------------ LoadMaskBits (TestBit)

namespace detail {

// Expands up to 16 mask bits into a byte mask: byte i is all-ones iff bit i
// of `bits` is set.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  // Easier than Set(), which would require an >8-bit type, which would not
  // compile for T=uint8_t, N=1.
  const VFromD<D> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                                    1, 1, 1, 1, 1, 1, 1, 1};
  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));

  alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                                   1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(
      d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  // NOTE(review): only the first 4 entries are used (at most 4 u32 lanes);
  // the [8] size appears carried over from the u16 case. Harmless: the
  // remaining entries are zero-initialized.
  alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
  return RebindMask(
      d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t bits) {
  const RebindToUnsigned<decltype(d)> du;
  // NOTE(review): only the first 2 entries are used (at most 2 u64 lanes);
  // see the note in the 4-byte overload above.
  alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
}

}  // namespace detail

// `p` points to at least 8 readable bytes, not all of which need be valid.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  // Copy only as many bytes as the lane count requires, then expand.
  uint64_t mask_bits = 0;
  CopyBytes<(MaxLanes(d) + 7) / 8>(bits, &mask_bits);
  return detail::LoadMaskBits(d, mask_bits);
}

// ------------------------------ Dup128MaskFromMaskBits

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  constexpr size_t kN = MaxLanes(d);
  // Discard bits beyond the lane count so stray high bits cannot set lanes.
  if (kN < 8) mask_bits &= (1u << kN) - 1;
  return detail::LoadMaskBits(d, mask_bits);
}

// ------------------------------ Mask

namespace detail {

// Returns the lowest N bits for the BitsFromMask result.
template <class D>
constexpr uint64_t OnlyActive(D d, uint64_t bits) {
  return (d.MaxBytes() == 16) ? bits : bits & ((1ull << d.MaxLanes()) - 1);
}

}  // namespace detail

// Full 16-byte mask: multiply by a magic constant that gathers the MSB of
// each of 8 bytes into the top byte of a u64, once per 64-bit half.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) {
  alignas(16) uint64_t lanes[2];
  wasm_v128_store(lanes, mask.raw);

  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
  return hi + lo;  // exactly 16 bits, no OnlyActive required
}

// 8-byte mask: a single magic multiply suffices.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) {
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  const uint64_t bytes =
      static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
  return (bytes * kMagic) >> 56;  // exactly 8 bits, no OnlyActive required
}

// 32-bit or less: need masking
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) {
  uint64_t bytes =
      static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
  // Clear potentially undefined bytes.
  bytes &= (1ULL << (Lanes(d) * 8)) - 1;
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return detail::OnlyActive(d, (bytes * kMagic) >> 56);
}

// 16-bit lanes: narrow to bytes first, then reuse the byte path.
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD<D> mask) {
  // Remove useless lower half of each u16 while preserving the sign bit.
  const Rebind<uint8_t, D> d8;
  using M8 = MFromD<decltype(d8)>;
  const __i16x8 zero = wasm_i16x8_splat(0);
  const M8 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
  return detail::OnlyActive(d8, BitsFromMask(d8, mask8));
}

// 32-bit lanes: AND each lane with its bit value (1,2,4,8) and OR them.
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) {
  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
  alignas(16) uint32_t lanes[4];
  wasm_v128_store(lanes, sliced_mask);
  return detail::OnlyActive(d, lanes[0] | lanes[1] | lanes[2] | lanes[3]);
}

// 64-bit lanes: same scheme with bit values (1,2).
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D d, const MFromD<D> mask) {
  const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
  const __i64x2 slice = wasm_i64x2_make(1, 2);
  const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
  alignas(16) uint64_t lanes[2];
  wasm_v128_store(lanes, sliced_mask);
  return detail::OnlyActive(d, lanes[0] | lanes[1]);
}

namespace detail {

// Returns 0xFF for bytes with index >= N, otherwise 0.
template <size_t N>
constexpr __i8x16 BytesAbove() {
  return /**/
      (N == 0)    ? wasm_i32x4_make(-1, -1, -1, -1)
      : (N == 4)  ? wasm_i32x4_make(0, -1, -1, -1)
      : (N == 8)  ? wasm_i32x4_make(0, 0, -1, -1)
      : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
      : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
      : (N == 2)  ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
      : (N == 6)  ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
      : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
      : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
      : (N == 1)  ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                    -1, -1, -1, -1, -1)
      : (N == 3)  ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                    -1, -1, -1, -1)
      : (N == 5)  ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
                                    -1, -1, -1, -1)
      : (N == 7)  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
                                    -1, -1, -1)
      : (N == 9)  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
                                    -1, -1, -1)
      : (N == 11)
          ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
      : (N == 13)
          ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
          : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
}

}  // namespace detail

// `p` points to at least 8 writable bytes.
template <class D>
HWY_API size_t StoreMaskBits(D d, const MFromD<D> mask, uint8_t* bits) {
  // Writes (MaxLanes+7)/8 bytes of packed mask bits and returns that count.
  const uint64_t mask_bits = BitsFromMask(d, mask);
  const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
  CopyBytes<kNumBytes>(&mask_bits, bits);
  return kNumBytes;
}

// 8/16-bit lanes: count set bits of the packed mask.
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API size_t CountTrue(D d, const MFromD<D> m) {
  return PopCount(BitsFromMask(d, m));
}
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API size_t CountTrue(D d, const MFromD<D> m) {
  return PopCount(BitsFromMask(d, m));
}
// 32-bit lanes: tag each active lane with a distinct bit, OR, then popcount.
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 16)>
HWY_API size_t CountTrue(D /*d*/, const MFromD<D> m) {
  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
  alignas(16) uint64_t lanes[2];
  wasm_v128_store(lanes, shifted_bits);
  return PopCount(lanes[0] | lanes[1]);
}
// 64-bit lanes: active lanes are all-ones (-1), so the negated sum counts.
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 16)>
HWY_API size_t CountTrue(D /*d*/, const MFromD<D> m) {
  alignas(16) int64_t lanes[2];
  wasm_v128_store(lanes, m.raw);
  return static_cast<size_t>(-(lanes[0] + lanes[1]));
}

// Partial
template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API size_t CountTrue(D d, MFromD<D> m) {
  // Ensure all undefined bytes are 0.
  const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()};
  const Full128<T> dfull;
  return CountTrue(dfull, Mask128<T>{AndNot(mask, m).raw});
}

// Full vector
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API bool AllFalse(D d, const MFromD<D> m) {
  const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m));
  return !wasm_v128_any_true(v8.raw);
}

// Full vector
namespace detail {
// Lane-size-tag dispatch onto the matching wasm all_true instruction.
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
  return wasm_i8x16_all_true(m.raw);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
  return wasm_i16x8_all_true(m.raw);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
  return wasm_i32x4_all_true(m.raw);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
  return wasm_i64x2_all_true(m.raw);
}

}  // namespace detail

template <class D, typename T = TFromD<D>>
HWY_API bool AllTrue(D /* tag */, const Mask128<T> m) {
  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
}

// Partial vectors

template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API bool AllFalse(D d, const MFromD<D> m) {
  // Ensure all undefined bytes are 0.
  const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()};
  return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw});
}

template <class D, typename T = TFromD<D>, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API bool AllTrue(D d, const MFromD<D> m) {
  // Ensure all undefined bytes are FF.
  const MFromD<D> mask{detail::BytesAbove<d.MaxBytes()>()};
  return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
}

// Precondition: at least one lane is true.
template <class D>
HWY_API size_t FindKnownFirstTrue(D d, const MFromD<D> mask) {
  const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
  return Num0BitsBelowLS1Bit_Nonzero32(bits);
}

// Returns -1 if no lane is true.
template <class D>
HWY_API intptr_t FindFirstTrue(D d, const MFromD<D> mask) {
  const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
  return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero32(bits)) : -1;
}

// Precondition: at least one lane is true.
template <class D>
HWY_API size_t FindKnownLastTrue(D d, const MFromD<D> mask) {
  const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
  return 31 - Num0BitsAboveMS1Bit_Nonzero32(bits);
}

// Returns -1 if no lane is true.
template <class D>
HWY_API intptr_t FindLastTrue(D d, const MFromD<D> mask) {
  const uint32_t bits = static_cast<uint32_t>(BitsFromMask(d, mask));
  return bits
             ? (31 - static_cast<intptr_t>(Num0BitsAboveMS1Bit_Nonzero32(bits)))
             : -1;
}

// ------------------------------ Compress

namespace detail {

// Returns byte-shuffle indices that gather the lanes selected by mask_bits to
// the front. (Table body continues below.)
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Simd<T, N, 0> d;
  const Rebind<uint8_t, decltype(d)> d8;
  const Simd<uint16_t, N, 0> du;

  // We need byte indices for TableLookupBytes (one vector's worth for each of
  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
  // with the doubling baked into the table. Unpacking nibbles is likely more
  // costly than the higher cache footprint from storing bytes.
5208 alignas(16) static constexpr uint8_t table[256 * 8] = { 5209 // PrintCompress16x8Tables 5210 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 5211 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 5212 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // 5213 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 5214 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // 5215 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // 5216 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // 5217 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 5218 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // 5219 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // 5220 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // 5221 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // 5222 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // 5223 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // 5224 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // 5225 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 5226 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // 5227 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // 5228 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // 5229 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // 5230 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // 5231 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // 5232 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // 5233 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // 5234 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // 5235 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // 5236 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // 5237 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // 5238 6, 8, 10, 0, 2, 4, 12, 
14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // 5239 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // 5240 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // 5241 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 5242 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // 5243 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // 5244 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // 5245 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // 5246 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // 5247 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // 5248 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // 5249 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // 5250 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // 5251 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // 5252 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // 5253 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // 5254 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // 5255 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // 5256 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // 5257 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // 5258 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // 5259 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // 5260 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // 5261 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // 5262 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // 5263 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // 5264 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // 5265 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // 5266 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // 5267 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // 5268 4, 8, 10, 
12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // 5269 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // 5270 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // 5271 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // 5272 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // 5273 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 5274 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // 5275 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // 5276 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // 5277 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // 5278 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // 5279 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // 5280 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // 5281 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // 5282 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // 5283 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // 5284 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // 5285 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // 5286 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // 5287 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // 5288 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // 5289 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // 5290 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // 5291 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // 5292 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // 5293 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // 5294 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // 5295 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // 5296 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // 5297 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // 5298 
8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // 5299 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // 5300 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // 5301 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // 5302 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // 5303 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // 5304 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // 5305 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // 5306 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // 5307 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // 5308 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // 5309 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // 5310 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // 5311 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // 5312 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // 5313 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // 5314 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // 5315 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // 5316 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // 5317 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // 5318 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // 5319 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // 5320 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // 5321 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // 5322 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // 5323 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // 5324 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // 5325 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // 5326 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // 5327 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 
8, // 5328 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // 5329 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // 5330 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // 5331 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // 5332 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // 5333 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // 5334 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // 5335 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // 5336 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // 5337 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; 5338 5339 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw}; 5340 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx); 5341 return BitCast(d, pairs + Set(du, 0x0100)); 5342 } 5343 5344 template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)> 5345 HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) { 5346 HWY_DASSERT(mask_bits < 256); 5347 const Simd<T, N, 0> d; 5348 const Rebind<uint8_t, decltype(d)> d8; 5349 const Simd<uint16_t, N, 0> du; 5350 5351 // We need byte indices for TableLookupBytes (one vector's worth for each of 5352 // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We 5353 // can instead store lane indices and convert to byte indices (2*lane + 0..1), 5354 // with the doubling baked into the table. Unpacking nibbles is likely more 5355 // costly than the higher cache footprint from storing bytes. 
5356 alignas(16) static constexpr uint8_t table[256 * 8] = { 5357 // PrintCompressNot16x8Tables 5358 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // 5359 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // 5360 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // 5361 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // 5362 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // 5363 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // 5364 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // 5365 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // 5366 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // 5367 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // 5368 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // 5369 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // 5370 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // 5371 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // 5372 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // 5373 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // 5374 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // 5375 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // 5376 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // 5377 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // 5378 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // 5379 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // 5380 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // 5381 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // 5382 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // 5383 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // 5384 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // 5385 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // 5386 0, 2, 4, 12, 14, 6, 8, 
10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // 5387 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // 5388 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // 5389 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // 5390 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // 5391 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // 5392 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // 5393 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // 5394 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // 5395 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // 5396 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // 5397 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // 5398 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // 5399 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // 5400 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // 5401 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // 5402 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // 5403 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // 5404 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // 5405 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // 5406 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // 5407 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // 5408 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // 5409 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // 5410 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // 5411 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // 5412 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // 5413 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // 5414 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // 5415 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // 5416 0, 2, 6, 14, 
4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // 5417 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // 5418 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // 5419 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // 5420 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // 5421 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // 5422 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // 5423 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // 5424 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // 5425 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // 5426 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // 5427 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // 5428 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // 5429 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // 5430 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // 5431 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // 5432 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // 5433 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // 5434 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // 5435 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // 5436 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // 5437 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // 5438 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // 5439 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // 5440 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // 5441 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // 5442 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // 5443 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // 5444 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // 5445 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // 5446 0, 
2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // 5447 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // 5448 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // 5449 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // 5450 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // 5451 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // 5452 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // 5453 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // 5454 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // 5455 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // 5456 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // 5457 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // 5458 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // 5459 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // 5460 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // 5461 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // 5462 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // 5463 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // 5464 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // 5465 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // 5466 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // 5467 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // 5468 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // 5469 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // 5470 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // 5471 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // 5472 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // 5473 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // 5474 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // 5475 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, 
// 5476 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // 5477 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // 5478 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // 5479 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // 5480 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // 5481 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // 5482 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // 5483 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // 5484 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // 5485 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; 5486 5487 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw}; 5488 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx); 5489 return BitCast(d, pairs + Set(du, 0x0100)); 5490 } 5491 5492 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 5493 HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) { 5494 HWY_DASSERT(mask_bits < 16); 5495 5496 // There are only 4 lanes, so we can afford to load the index vector directly. 
5497 alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { 5498 // PrintCompress32x4Tables 5499 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 5500 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 5501 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // 5502 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 5503 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // 5504 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // 5505 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // 5506 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 5507 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 5508 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // 5509 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // 5510 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // 5511 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 5512 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // 5513 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 5514 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 5515 const Simd<T, N, 0> d; 5516 const Repartition<uint8_t, decltype(d)> d8; 5517 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 5518 } 5519 5520 template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)> 5521 HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) { 5522 HWY_DASSERT(mask_bits < 16); 5523 5524 // There are only 4 lanes, so we can afford to load the index vector directly. 
5525 alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { 5526 // PrintCompressNot32x4Tables 5527 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 5528 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 5529 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 5530 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 5531 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 5532 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 5533 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 5534 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5535 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 5536 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 5537 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 5538 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, 5539 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 5540 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 5541 12, 13, 14, 15}; 5542 const Simd<T, N, 0> d; 5543 const Repartition<uint8_t, decltype(d)> d8; 5544 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 5545 } 5546 5547 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 5548 HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) { 5549 HWY_DASSERT(mask_bits < 4); 5550 5551 // There are only 2 lanes, so we can afford to load the index vector directly. 
5552 alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { 5553 // PrintCompress64x2Tables 5554 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5555 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5556 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 5557 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 5558 5559 const Simd<T, N, 0> d; 5560 const Repartition<uint8_t, decltype(d)> d8; 5561 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 5562 } 5563 5564 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 5565 HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) { 5566 HWY_DASSERT(mask_bits < 4); 5567 5568 // There are only 2 lanes, so we can afford to load the index vector directly. 5569 alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { 5570 // PrintCompressNot64x2Tables 5571 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5572 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 5573 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5574 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 5575 5576 const Simd<T, N, 0> d; 5577 const Repartition<uint8_t, decltype(d)> d8; 5578 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 5579 } 5580 5581 // Helper functions called by both Compress and CompressStore - avoids a 5582 // redundant BitsFromMask in the latter. 

// Gathers the lanes selected by mask_bits to the front; the remaining lanes
// follow in their original order (see IdxFromBits).
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
  const auto idx = detail::IdxFromBits<T, N>(mask_bits);
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}

// As Compress, but for the complement of mask_bits (see IdxFromNotBits).
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
  const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}

}  // namespace detail

// Whether Compress is a partition (unselected lanes follow the selected ones
// rather than being unspecified). 8-bit lanes are excluded, as is the
// emulated 256-bit target.
template <typename T>
struct CompressIsPartition {
#if HWY_TARGET == HWY_WASM_EMU256
  enum { value = 0 };
#else
  enum { value = (sizeof(T) != 1) };
#endif
};

// Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  return v;
}

// Two lanes: conditional swap
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
  const Full128<T> d;
  const Vec128<T> m = VecFromMask(d, mask);
  const Vec128<T> maskL = DupEven(m);
  const Vec128<T> maskH = DupOdd(m);
  const Vec128<T> swap = AndNot(maskL, maskH);
  return IfVecThenElse(swap, Shuffle01(v), v);
}

// General case, 2 or 4 byte lanes
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 2))>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  return detail::Compress(v, BitsFromMask(d, mask));
}

// Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  return v;
}

// Two lanes: conditional swap
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
  const Full128<T> d;
  const Vec128<T> m = VecFromMask(d, mask);
  const Vec128<T> maskL = DupEven(m);
  const Vec128<T> maskH = DupOdd(m);
  const Vec128<T> swap = AndNot(maskH, maskL);
  return IfVecThenElse(swap, Shuffle01(v), v);
}

// General case, 2 or 4 byte lanes
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  // For partial vectors, we cannot pull the Not() into the table because
  // BitsFromMask clears the upper bits.
  if (N < 16 / sizeof(T)) {
    return detail::Compress(v, BitsFromMask(d, Not(mask)));
  }
  return detail::CompressNot(v, BitsFromMask(d, mask));
}

// ------------------------------ CompressBlocksNot
// A 128-bit vector is a single 64x2 block, so there is nothing to move.
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
                                           Mask128<uint64_t> /* m */) {
  return v;
}

// ------------------------------ CompressBits
// As Compress, but the mask is given as a bit array (LSB = lane 0).
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
                                  const uint8_t* HWY_RESTRICT bits) {
  uint64_t mask_bits = 0;
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  if (N < 8) {
    // Callers need not clear bits beyond the last lane; ignore them here.
    mask_bits &= (1ull << N) - 1;
  }

  return detail::Compress(v, mask_bits);
}

// ------------------------------ CompressStore
// Stores the whole compressed vector (including the unselected tail) and
// returns the number of selected lanes.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = BitsFromMask(d, mask);
  const auto c = detail::Compress(v, mask_bits);
  StoreU(c, d, unaligned);
  return PopCount(mask_bits);
}

// ------------------------------ CompressBlendedStore
// As CompressStore, but only the first `count` lanes of `unaligned` are
// overwritten (via BlendedStore with a FirstN mask).
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;  // so we can support fp16/bf16
  const uint64_t mask_bits = BitsFromMask(d, m);
  const size_t count = PopCount(mask_bits);
  const VFromD<decltype(du)> compressed =
      detail::Compress(BitCast(du, v), mask_bits);
  const MFromD<D> store_mask = RebindMask(d, FirstN(du, count));
  BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
  return count;
}

// ------------------------------ CompressBitsStore

// CompressBits followed by StoreU; returns the number of selected lanes.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  uint64_t mask_bits = 0;
  constexpr size_t kN = MaxLanes(d);
  CopyBytes<(kN + 7) / 8>(bits, &mask_bits);
  if (kN < 8) {
    // Ignore any bits past the last lane.
    mask_bits &= (1ull << kN) - 1;
  }

  const auto c = detail::Compress(v, mask_bits);
  StoreU(c, d, unaligned);
  return PopCount(mask_bits);
}

// ------------------------------ StoreInterleaved2/3/4

// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
// generic_ops-inl.h.

// ------------------------------ Additional mask logical operations

// Returns a mask that is true at and after the first true lane.
template <class T>
HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
  return mask;
}
template <class T>
HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
  const FixedTag<T, 2> d;
  const auto vmask = VecFromMask(d, mask);
  // Lane 1 becomes true if either lane 0 or lane 1 was true.
  return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
}
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const auto vmask = VecFromMask(d, mask);
  // Two's complement: -x sets all bits at and above the lowest set bit, so
  // vmask | -vmask covers every lane at or after the first true lane.
  const auto neg_vmask =
      ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask)));
  return MaskFromVec(Or(vmask, neg_vmask));
}
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
  const Full128<T> d;
  const Repartition<int64_t, decltype(d)> di64;

  // Per-i64-lane vmask | -vmask as above, then propagate across the 64-bit
  // lane boundary below.
  auto vmask = BitCast(di64, VecFromMask(d, mask));
  vmask = Or(vmask, Neg(vmask));

  // Copy the sign bit of the first int64_t lane to the second int64_t lane
  const auto vmask2 = BroadcastSignBit(InterleaveLower(Zero(di64), vmask));
  return MaskFromVec(BitCast(d, Or(vmask, vmask2)));
}

// Returns a mask that is true strictly before the first true lane.
template <class T, size_t N>
HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
  return Not(SetAtOrAfterFirst(mask));
}

// Returns a mask that is true only at the first true lane, if any.
template <class T>
HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
  return mask;
}
template <class T>
HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
  const FixedTag<T, 2> d;
  const RebindToSigned<decltype(d)> di;

  const auto vmask = BitCast(di, VecFromMask(d, mask));
  const auto zero = Zero(di);
  // vmask2 is true in lane 1 only if lane 0 was false.
  const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
  return MaskFromVec(BitCast(d, And(vmask, vmask2)));
}
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const RebindToSigned<decltype(d)> di;

  // x & -x isolates the lowest set bit; negating again extends it back over
  // the entire first true lane.
  const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask));
  const auto only_first_vmask =
      BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask)))));
  return MaskFromVec(only_first_vmask);
}
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
  const Full128<T> d;
  const RebindToSigned<decltype(d)> di;
  const Repartition<int64_t, decltype(d)> di64;

  const auto zero = Zero(di64);
  const auto vmask = BitCast(di64, VecFromMask(d, mask));
  // vmask2 is true in the upper i64 lane only if the lower i64 lane was zero,
  // so the isolated bit from the upper half is suppressed when the first true
  // lane lies in the lower half.
  const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero);
  const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask))));
  return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2))));
}

// Returns a mask that is true at and before the first true lane (all-true if
// no lane is true).
template <class T>
HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
  const FixedTag<T, 1> d;
  const RebindToSigned<decltype(d)> di;
  using TI = MakeSigned<T>;

  return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
}
template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  // Equivalent to SetBeforeFirst of the mask shifted up by one lane.
  return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
}

// ------------------------------ MulEven/Odd (Load)

// Full 64x64->128-bit product of the lane-0 elements; the low half is
// returned in lane 0 and the high half in lane 1.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
  alignas(16) T mul[2];
  mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 0)),
                  static_cast<T>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
  return Load(Full128<T>(), mul);
}

// As MulEven, but multiplies the lane-1 elements.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
  alignas(16) T mul[2];
  mul[0] = Mul128(static_cast<T>(wasm_i64x2_extract_lane(a.raw, 1)),
                  static_cast<T>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
  return Load(Full128<T>(), mul);
}

// ------------------------------ I64/U64 MulHigh (GetLane)

// Upper 64 bits of the per-lane 128-bit product, computed via scalar Mul128.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
  T hi;
  Mul128(GetLane(a), GetLane(b), &hi);
  return Set(Full64<T>(), hi);
}

template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
  T hi_0;
  T hi_1;
  Mul128(GetLane(a), GetLane(b), &hi_0);
  Mul128(detail::ExtractLane<1>(a), detail::ExtractLane<1>(b), &hi_1);
  return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
}

// ------------------------------ WidenMulPairwiseAdd (MulAdd, PromoteEvenTo)

// Generic for all vector lengths.
template <class DF, HWY_IF_F32_D(DF),
          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
}

// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
// safe.
5867 template <class D32, HWY_IF_I32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), 5868 class V16 = VFromD<RepartitionToNarrow<D32>>> 5869 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { 5870 return VFromD<D32>{wasm_i32x4_dot_i16x8(a.raw, b.raw)}; 5871 } 5872 5873 template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 16), 5874 class VU16 = VFromD<RepartitionToNarrow<DU32>>> 5875 HWY_API VFromD<DU32> WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) { 5876 return MulAdd(PromoteEvenTo(du32, a), PromoteEvenTo(du32, b), 5877 Mul(PromoteOddTo(du32, a), PromoteOddTo(du32, b))); 5878 } 5879 5880 // ------------------------------ ReorderWidenMulAccumulate 5881 5882 template <class D32, HWY_IF_UI32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16), 5883 class V16 = VFromD<RepartitionToNarrow<D32>>> 5884 HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d32, V16 a, V16 b, 5885 const VFromD<D32> sum0, 5886 VFromD<D32>& /*sum1*/) { 5887 return sum0 + WidenMulPairwiseAdd(d32, a, b); 5888 } 5889 5890 // ------------------------------ RearrangeToOddPlusEven 5891 template <size_t N> 5892 HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven( 5893 const Vec128<int32_t, N> sum0, const Vec128<int32_t, N> /*sum1*/) { 5894 return sum0; // invariant already holds 5895 } 5896 5897 template <size_t N> 5898 HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven( 5899 const Vec128<uint32_t, N> sum0, const Vec128<uint32_t, N> /*sum1*/) { 5900 return sum0; // invariant already holds 5901 } 5902 5903 template <size_t N> 5904 HWY_API Vec128<float, N> RearrangeToOddPlusEven(const Vec128<float, N> sum0, 5905 const Vec128<float, N> sum1) { 5906 return Add(sum0, sum1); 5907 } 5908 5909 // ------------------------------ Reductions 5910 5911 // Nothing native, generic_ops-inl defines SumOfLanes and ReduceSum. 
5912 5913 // ------------------------------ Lt128 5914 5915 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> 5916 HWY_INLINE MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) { 5917 // Truth table of Eq and Lt for Hi and Lo u64. 5918 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) 5919 // =H =L cH cL | out = cH | (=H & cL) 5920 // 0 0 0 0 | 0 5921 // 0 0 0 1 | 0 5922 // 0 0 1 0 | 1 5923 // 0 0 1 1 | 1 5924 // 0 1 0 0 | 0 5925 // 0 1 0 1 | 0 5926 // 0 1 1 0 | 1 5927 // 1 0 0 0 | 0 5928 // 1 0 0 1 | 1 5929 // 1 1 0 0 | 0 5930 const MFromD<D> eqHL = Eq(a, b); 5931 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); 5932 // We need to bring cL to the upper lane/bit corresponding to cH. Comparing 5933 // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the 5934 // comparison result leftwards requires only 4. IfThenElse compiles to the 5935 // same code as OrAnd(). 5936 const VFromD<D> ltLx = DupEven(ltHL); 5937 const VFromD<D> outHx = IfThenElse(eqHL, ltLx, ltHL); 5938 return MaskFromVec(DupOdd(outHx)); 5939 } 5940 5941 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 5942 HWY_INLINE MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) { 5943 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); 5944 return MaskFromVec(InterleaveUpper(d, ltHL, ltHL)); 5945 } 5946 5947 // ------------------------------ Eq128 5948 5949 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> 5950 HWY_INLINE MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) { 5951 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); 5952 return MaskFromVec(And(Reverse2(d, eqHL), eqHL)); 5953 } 5954 5955 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 5956 HWY_INLINE MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) { 5957 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); 5958 return MaskFromVec(InterleaveUpper(d, eqHL, eqHL)); 5959 } 5960 5961 // ------------------------------ Ne128 5962 5963 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_U64_D(D)> 5964 
HWY_INLINE MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) { 5965 const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); 5966 return MaskFromVec(Or(Reverse2(d, neHL), neHL)); 5967 } 5968 5969 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 5970 HWY_INLINE MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) { 5971 const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); 5972 return MaskFromVec(InterleaveUpper(d, neHL, neHL)); 5973 } 5974 5975 // ------------------------------ Min128, Max128 (Lt128) 5976 5977 // Without a native OddEven, it seems infeasible to go faster than Lt128. 5978 template <class D> 5979 HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) { 5980 return IfThenElse(Lt128(d, a, b), a, b); 5981 } 5982 5983 template <class D> 5984 HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) { 5985 return IfThenElse(Lt128(d, b, a), a, b); 5986 } 5987 5988 template <class D> 5989 HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) { 5990 return IfThenElse(Lt128Upper(d, a, b), a, b); 5991 } 5992 5993 template <class D> 5994 HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) { 5995 return IfThenElse(Lt128Upper(d, b, a), a, b); 5996 } 5997 5998 // NOLINTNEXTLINE(google-readability-namespace-comments) 5999 } // namespace HWY_NAMESPACE 6000 } // namespace hwy 6001 HWY_AFTER_NAMESPACE();