// inside-inl.h
// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Must be included inside an existing include guard, with the following ops
// already defined: BitCast, And, Set, ShiftLeft, ShiftRight, PromoteLowerTo,
// ConcatEven, ConcatOdd, plus the optional detail::PromoteEvenTo and
// detail::PromoteOddTo (if implemented in the target-specific header).

// This is normally set by set_macros-inl.h before this header is included;
// if not, we are viewing this header standalone. Reduce IDE errors by:
#if !defined(HWY_NAMESPACE)
// 1) Defining HWY_IDE so we get syntax highlighting rather than all-gray text.
#include "hwy/ops/shared-inl.h"
// 2) Entering the HWY_NAMESPACE to make definitions from shared-inl.h visible.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
#define HWY_INSIDE_END_NAMESPACE
// 3) Providing a dummy VFromD (usually done by the target-specific header).
// Also stub out TFromV/DFromV, which the code below references.
template <class D>
using VFromD = int;
template <class D>
using TFromV = int;
template <class D>
struct DFromV {};
#endif

// ------------------------------ Vec/Create/Get/Set2..4

// On SVE and RVV, Vec2..4 are aliases to built-in types. Also exclude the
// fixed-size SVE targets.
44 #if HWY_IDE || (!HWY_HAVE_SCALABLE && !HWY_TARGET_IS_SVE) 45 46 // NOTE: these are used inside arm_neon-inl.h, hence they cannot be defined in 47 // generic_ops-inl.h, which is included after that. 48 template <class D> 49 struct Vec2 { 50 VFromD<D> v0; 51 VFromD<D> v1; 52 }; 53 54 template <class D> 55 struct Vec3 { 56 VFromD<D> v0; 57 VFromD<D> v1; 58 VFromD<D> v2; 59 }; 60 61 template <class D> 62 struct Vec4 { 63 VFromD<D> v0; 64 VFromD<D> v1; 65 VFromD<D> v2; 66 VFromD<D> v3; 67 }; 68 69 // D arg is unused but allows deducing D. 70 template <class D> 71 HWY_API Vec2<D> Create2(D /* tag */, VFromD<D> v0, VFromD<D> v1) { 72 return Vec2<D>{v0, v1}; 73 } 74 75 template <class D> 76 HWY_API Vec3<D> Create3(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2) { 77 return Vec3<D>{v0, v1, v2}; 78 } 79 80 template <class D> 81 HWY_API Vec4<D> Create4(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, 82 VFromD<D> v3) { 83 return Vec4<D>{v0, v1, v2, v3}; 84 } 85 86 template <size_t kIndex, class D> 87 HWY_API VFromD<D> Get2(Vec2<D> tuple) { 88 static_assert(kIndex < 2, "Tuple index out of bounds"); 89 return kIndex == 0 ? tuple.v0 : tuple.v1; 90 } 91 92 template <size_t kIndex, class D> 93 HWY_API VFromD<D> Get3(Vec3<D> tuple) { 94 static_assert(kIndex < 3, "Tuple index out of bounds"); 95 return kIndex == 0 ? tuple.v0 : kIndex == 1 ? tuple.v1 : tuple.v2; 96 } 97 98 template <size_t kIndex, class D> 99 HWY_API VFromD<D> Get4(Vec4<D> tuple) { 100 static_assert(kIndex < 4, "Tuple index out of bounds"); 101 return kIndex == 0 ? tuple.v0 102 : kIndex == 1 ? tuple.v1 103 : kIndex == 2 ? 
tuple.v2 104 : tuple.v3; 105 } 106 107 template <size_t kIndex, class D> 108 HWY_API Vec2<D> Set2(Vec2<D> tuple, VFromD<D> val) { 109 static_assert(kIndex < 2, "Tuple index out of bounds"); 110 if (kIndex == 0) { 111 tuple.v0 = val; 112 } else { 113 tuple.v1 = val; 114 } 115 return tuple; 116 } 117 118 template <size_t kIndex, class D> 119 HWY_API Vec3<D> Set3(Vec3<D> tuple, VFromD<D> val) { 120 static_assert(kIndex < 3, "Tuple index out of bounds"); 121 if (kIndex == 0) { 122 tuple.v0 = val; 123 } else if (kIndex == 1) { 124 tuple.v1 = val; 125 } else { 126 tuple.v2 = val; 127 } 128 return tuple; 129 } 130 131 template <size_t kIndex, class D> 132 HWY_API Vec4<D> Set4(Vec4<D> tuple, VFromD<D> val) { 133 static_assert(kIndex < 4, "Tuple index out of bounds"); 134 if (kIndex == 0) { 135 tuple.v0 = val; 136 } else if (kIndex == 1) { 137 tuple.v1 = val; 138 } else if (kIndex == 2) { 139 tuple.v2 = val; 140 } else { 141 tuple.v3 = val; 142 } 143 return tuple; 144 } 145 146 #endif // !HWY_HAVE_SCALABLE || HWY_IDE 147 148 // ------------------------------ Rol/Ror (And, Or, Neg, Shl, Shr) 149 #if (defined(HWY_NATIVE_ROL_ROR_8) == defined(HWY_TARGET_TOGGLE)) 150 #ifdef HWY_NATIVE_ROL_ROR_8 151 #undef HWY_NATIVE_ROL_ROR_8 152 #else 153 #define HWY_NATIVE_ROL_ROR_8 154 #endif 155 156 template <class V, HWY_IF_UI8(TFromV<V>)> 157 HWY_API V Rol(V a, V b) { 158 const DFromV<decltype(a)> d; 159 const RebindToSigned<decltype(d)> di; 160 const RebindToUnsigned<decltype(d)> du; 161 162 const auto shift_amt_mask = Set(du, uint8_t{7}); 163 const auto shl_amt = And(BitCast(du, b), shift_amt_mask); 164 const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); 165 166 const auto vu = BitCast(du, a); 167 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); 168 } 169 170 template <class V, HWY_IF_UI8(TFromV<V>)> 171 HWY_API V Ror(V a, V b) { 172 const DFromV<decltype(a)> d; 173 const RebindToSigned<decltype(d)> di; 174 const RebindToUnsigned<decltype(d)> du; 175 
176 const auto shift_amt_mask = Set(du, uint8_t{7}); 177 const auto shr_amt = And(BitCast(du, b), shift_amt_mask); 178 const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); 179 180 const auto vu = BitCast(du, a); 181 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); 182 } 183 184 #endif // HWY_NATIVE_ROL_ROR_8 185 186 #if (defined(HWY_NATIVE_ROL_ROR_16) == defined(HWY_TARGET_TOGGLE)) 187 #ifdef HWY_NATIVE_ROL_ROR_16 188 #undef HWY_NATIVE_ROL_ROR_16 189 #else 190 #define HWY_NATIVE_ROL_ROR_16 191 #endif 192 193 template <class V, HWY_IF_UI16(TFromV<V>)> 194 HWY_API V Rol(V a, V b) { 195 const DFromV<decltype(a)> d; 196 const RebindToSigned<decltype(d)> di; 197 const RebindToUnsigned<decltype(d)> du; 198 199 const auto shift_amt_mask = Set(du, uint16_t{15}); 200 const auto shl_amt = And(BitCast(du, b), shift_amt_mask); 201 const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); 202 203 const auto vu = BitCast(du, a); 204 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); 205 } 206 207 template <class V, HWY_IF_UI16(TFromV<V>)> 208 HWY_API V Ror(V a, V b) { 209 const DFromV<decltype(a)> d; 210 const RebindToSigned<decltype(d)> di; 211 const RebindToUnsigned<decltype(d)> du; 212 213 const auto shift_amt_mask = Set(du, uint16_t{15}); 214 const auto shr_amt = And(BitCast(du, b), shift_amt_mask); 215 const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); 216 217 const auto vu = BitCast(du, a); 218 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); 219 } 220 221 #endif // HWY_NATIVE_ROL_ROR_16 222 223 #if (defined(HWY_NATIVE_ROL_ROR_32_64) == defined(HWY_TARGET_TOGGLE)) 224 #ifdef HWY_NATIVE_ROL_ROR_32_64 225 #undef HWY_NATIVE_ROL_ROR_32_64 226 #else 227 #define HWY_NATIVE_ROL_ROR_32_64 228 #endif 229 230 template <class V, HWY_IF_UI32(TFromV<V>)> 231 HWY_API V Rol(V a, V b) { 232 const DFromV<decltype(a)> d; 233 const RebindToSigned<decltype(d)> di; 234 const 
RebindToUnsigned<decltype(d)> du; 235 236 const auto shift_amt_mask = Set(du, uint32_t{31}); 237 const auto shl_amt = And(BitCast(du, b), shift_amt_mask); 238 const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); 239 240 const auto vu = BitCast(du, a); 241 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); 242 } 243 244 template <class V, HWY_IF_UI32(TFromV<V>)> 245 HWY_API V Ror(V a, V b) { 246 const DFromV<decltype(a)> d; 247 const RebindToSigned<decltype(d)> di; 248 const RebindToUnsigned<decltype(d)> du; 249 250 const auto shift_amt_mask = Set(du, uint32_t{31}); 251 const auto shr_amt = And(BitCast(du, b), shift_amt_mask); 252 const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); 253 254 const auto vu = BitCast(du, a); 255 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); 256 } 257 258 #if HWY_HAVE_INTEGER64 259 template <class V, HWY_IF_UI64(TFromV<V>)> 260 HWY_API V Rol(V a, V b) { 261 const DFromV<decltype(a)> d; 262 const RebindToSigned<decltype(d)> di; 263 const RebindToUnsigned<decltype(d)> du; 264 265 const auto shift_amt_mask = Set(du, uint64_t{63}); 266 const auto shl_amt = And(BitCast(du, b), shift_amt_mask); 267 const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); 268 269 const auto vu = BitCast(du, a); 270 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); 271 } 272 273 template <class V, HWY_IF_UI64(TFromV<V>)> 274 HWY_API V Ror(V a, V b) { 275 const DFromV<decltype(a)> d; 276 const RebindToSigned<decltype(d)> di; 277 const RebindToUnsigned<decltype(d)> du; 278 279 const auto shift_amt_mask = Set(du, uint64_t{63}); 280 const auto shr_amt = And(BitCast(du, b), shift_amt_mask); 281 const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask); 282 283 const auto vu = BitCast(du, a); 284 return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt))); 285 } 286 #endif // HWY_HAVE_INTEGER64 287 288 #endif // HWY_NATIVE_ROL_ROR_32_64 289 290 // 
------------------------------ RotateLeftSame/RotateRightSame 291 292 #if (defined(HWY_NATIVE_ROL_ROR_SAME_8) == defined(HWY_TARGET_TOGGLE)) 293 #ifdef HWY_NATIVE_ROL_ROR_SAME_8 294 #undef HWY_NATIVE_ROL_ROR_SAME_8 295 #else 296 #define HWY_NATIVE_ROL_ROR_SAME_8 297 #endif 298 299 template <class V, HWY_IF_UI8(TFromV<V>)> 300 HWY_API V RotateLeftSame(V v, int bits) { 301 const DFromV<decltype(v)> d; 302 const RebindToUnsigned<decltype(d)> du; 303 304 const int shl_amt = bits & 7; 305 const int shr_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u); 306 307 const auto vu = BitCast(du, v); 308 return BitCast(d, 309 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); 310 } 311 312 template <class V, HWY_IF_UI8(TFromV<V>)> 313 HWY_API V RotateRightSame(V v, int bits) { 314 const DFromV<decltype(v)> d; 315 const RebindToUnsigned<decltype(d)> du; 316 317 const int shr_amt = bits & 7; 318 const int shl_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u); 319 320 const auto vu = BitCast(du, v); 321 return BitCast(d, 322 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); 323 } 324 325 #endif // HWY_NATIVE_ROL_ROR_SAME_8 326 327 #if (defined(HWY_NATIVE_ROL_ROR_SAME_16) == defined(HWY_TARGET_TOGGLE)) 328 #ifdef HWY_NATIVE_ROL_ROR_SAME_16 329 #undef HWY_NATIVE_ROL_ROR_SAME_16 330 #else 331 #define HWY_NATIVE_ROL_ROR_SAME_16 332 #endif 333 334 template <class V, HWY_IF_UI16(TFromV<V>)> 335 HWY_API V RotateLeftSame(V v, int bits) { 336 const DFromV<decltype(v)> d; 337 const RebindToUnsigned<decltype(d)> du; 338 339 const int shl_amt = bits & 15; 340 const int shr_amt = 341 static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u); 342 343 const auto vu = BitCast(du, v); 344 return BitCast(d, 345 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); 346 } 347 348 template <class V, HWY_IF_UI16(TFromV<V>)> 349 HWY_API V RotateRightSame(V v, int bits) { 350 const DFromV<decltype(v)> d; 351 const RebindToUnsigned<decltype(d)> 
du; 352 353 const int shr_amt = bits & 15; 354 const int shl_amt = 355 static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u); 356 357 const auto vu = BitCast(du, v); 358 return BitCast(d, 359 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); 360 } 361 #endif // HWY_NATIVE_ROL_ROR_SAME_16 362 363 #if (defined(HWY_NATIVE_ROL_ROR_SAME_32_64) == defined(HWY_TARGET_TOGGLE)) 364 #ifdef HWY_NATIVE_ROL_ROR_SAME_32_64 365 #undef HWY_NATIVE_ROL_ROR_SAME_32_64 366 #else 367 #define HWY_NATIVE_ROL_ROR_SAME_32_64 368 #endif 369 370 template <class V, HWY_IF_UI32(TFromV<V>)> 371 HWY_API V RotateLeftSame(V v, int bits) { 372 const DFromV<decltype(v)> d; 373 const RebindToUnsigned<decltype(d)> du; 374 375 const int shl_amt = bits & 31; 376 const int shr_amt = 377 static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u); 378 379 const auto vu = BitCast(du, v); 380 return BitCast(d, 381 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); 382 } 383 384 template <class V, HWY_IF_UI32(TFromV<V>)> 385 HWY_API V RotateRightSame(V v, int bits) { 386 const DFromV<decltype(v)> d; 387 const RebindToUnsigned<decltype(d)> du; 388 389 const int shr_amt = bits & 31; 390 const int shl_amt = 391 static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u); 392 393 const auto vu = BitCast(du, v); 394 return BitCast(d, 395 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); 396 } 397 398 #if HWY_HAVE_INTEGER64 399 template <class V, HWY_IF_UI64(TFromV<V>)> 400 HWY_API V RotateLeftSame(V v, int bits) { 401 const DFromV<decltype(v)> d; 402 const RebindToUnsigned<decltype(d)> du; 403 404 const int shl_amt = bits & 63; 405 const int shr_amt = 406 static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u); 407 408 const auto vu = BitCast(du, v); 409 return BitCast(d, 410 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); 411 } 412 413 template <class V, HWY_IF_UI64(TFromV<V>)> 414 HWY_API V RotateRightSame(V v, int bits) { 415 const DFromV<decltype(v)> d; 
416 const RebindToUnsigned<decltype(d)> du; 417 418 const int shr_amt = bits & 63; 419 const int shl_amt = 420 static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u); 421 422 const auto vu = BitCast(du, v); 423 return BitCast(d, 424 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt))); 425 } 426 #endif // HWY_HAVE_INTEGER64 427 428 #endif // HWY_NATIVE_ROL_ROR_SAME_32_64 429 430 // ------------------------------ PromoteEvenTo/PromoteOddTo 431 432 // These are used by target-specific headers for ReorderWidenMulAccumulate etc. 433 434 #if HWY_TARGET != HWY_SCALAR || HWY_IDE 435 namespace detail { 436 437 // Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as 438 // there are target-specific specializations for some of the 439 // detail::PromoteEvenTo and detail::PromoteOddTo cases on 440 // SVE/PPC/SSE2/SSSE3/SSE4/AVX2. 441 442 // All targets except HWY_SCALAR use the implementations of 443 // detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at 444 // least some of the PromoteEvenTo and PromoteOddTo cases. 445 446 // Signed to signed PromoteEvenTo/PromoteOddTo 447 template <size_t kToLaneSize, class D, class V> 448 HWY_INLINE VFromD<D> PromoteEvenTo( 449 hwy::SignedTag /*to_type_tag*/, 450 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, 451 hwy::SignedTag /*from_type_tag*/, D d_to, V v) { 452 #if HWY_TARGET_IS_SVE 453 // The intrinsic expects the wide lane type. 454 return NativePromoteEvenTo(BitCast(d_to, v)); 455 #else 456 #if HWY_IS_LITTLE_ENDIAN 457 // On little-endian targets, need to shift each lane of the bitcasted 458 // vector left by kToLaneSize * 4 bits to get the bits of the even 459 // source lanes into the upper kToLaneSize * 4 bits of even_in_hi. 460 const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v)); 461 #else 462 // On big-endian targets, the bits of the even source lanes are already 463 // in the upper kToLaneSize * 4 bits of the lanes of the bitcasted 464 // vector. 
465 const auto even_in_hi = BitCast(d_to, v); 466 #endif 467 468 // Right-shift even_in_hi by kToLaneSize * 4 bits 469 return ShiftRight<kToLaneSize * 4>(even_in_hi); 470 #endif // HWY_TARGET_IS_SVE 471 } 472 473 // Unsigned to unsigned PromoteEvenTo/PromoteOddTo 474 template <size_t kToLaneSize, class D, class V> 475 HWY_INLINE VFromD<D> PromoteEvenTo( 476 hwy::UnsignedTag /*to_type_tag*/, 477 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, 478 hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { 479 #if HWY_TARGET_IS_SVE 480 // The intrinsic expects the wide lane type. 481 return NativePromoteEvenTo(BitCast(d_to, v)); 482 #else 483 #if HWY_IS_LITTLE_ENDIAN 484 // On little-endian targets, the bits of the even source lanes are already 485 // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector. 486 487 // Simply need to zero out the upper bits of each lane of the bitcasted 488 // vector. 489 return And(BitCast(d_to, v), 490 Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>()))); 491 #else 492 // On big-endian targets, need to shift each lane of the bitcasted vector 493 // right by kToLaneSize * 4 bits to get the bits of the even source lanes into 494 // the lower kToLaneSize * 4 bits of the result. 495 496 // The right shift below will zero out the upper kToLaneSize * 4 bits of the 497 // result. 498 return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v)); 499 #endif 500 #endif // HWY_TARGET_IS_SVE 501 } 502 503 template <size_t kToLaneSize, class D, class V> 504 HWY_INLINE VFromD<D> PromoteOddTo( 505 hwy::SignedTag /*to_type_tag*/, 506 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, 507 hwy::SignedTag /*from_type_tag*/, D d_to, V v) { 508 #if HWY_IS_LITTLE_ENDIAN 509 // On little-endian targets, the bits of the odd source lanes are already in 510 // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector. 
511 const auto odd_in_hi = BitCast(d_to, v); 512 #else 513 // On big-endian targets, need to shift each lane of the bitcasted vector 514 // left by kToLaneSize * 4 bits to get the bits of the odd source lanes into 515 // the upper kToLaneSize * 4 bits of odd_in_hi. 516 const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v)); 517 #endif 518 519 // Right-shift odd_in_hi by kToLaneSize * 4 bits 520 return ShiftRight<kToLaneSize * 4>(odd_in_hi); 521 } 522 523 template <size_t kToLaneSize, class D, class V> 524 HWY_INLINE VFromD<D> PromoteOddTo( 525 hwy::UnsignedTag /*to_type_tag*/, 526 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, 527 hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { 528 #if HWY_IS_LITTLE_ENDIAN 529 // On little-endian targets, need to shift each lane of the bitcasted vector 530 // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into 531 // the lower kToLaneSize * 4 bits of the result. 532 533 // The right shift below will zero out the upper kToLaneSize * 4 bits of the 534 // result. 535 return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v)); 536 #else 537 // On big-endian targets, the bits of the even source lanes are already 538 // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector. 539 540 // Simply need to zero out the upper bits of each lane of the bitcasted 541 // vector. 
542 return And(BitCast(d_to, v), 543 Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>()))); 544 #endif 545 } 546 547 // Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo 548 // followed by BitCast to signed 549 template <size_t kToLaneSize, class D, class V> 550 HWY_INLINE VFromD<D> PromoteEvenTo( 551 hwy::SignedTag /*to_type_tag*/, 552 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, 553 hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { 554 const RebindToUnsigned<decltype(d_to)> du_to; 555 return BitCast(d_to, 556 PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(), 557 hwy::UnsignedTag(), du_to, v)); 558 } 559 560 template <size_t kToLaneSize, class D, class V> 561 HWY_INLINE VFromD<D> PromoteOddTo( 562 hwy::SignedTag /*to_type_tag*/, 563 hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, 564 hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { 565 const RebindToUnsigned<decltype(d_to)> du_to; 566 return BitCast(d_to, 567 PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(), 568 hwy::UnsignedTag(), du_to, v)); 569 } 570 571 // BF16->F32 PromoteEvenTo 572 573 // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag 574 // instead of hwy::FloatTag on targets that use scalable vectors. 575 576 // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same 577 // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>> 578 579 // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered 580 // to be a bfloat16_t vector. 
581 template <class FromTypeTag, class DF32, class VBF16, 582 class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>, 583 hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr> 584 HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/, 585 hwy::SizeTag<4> /*to_lane_size_tag*/, 586 FromTypeTag /*from_type_tag*/, DF32 d_to, 587 VBF16 v) { 588 const RebindToUnsigned<decltype(d_to)> du_to; 589 #if HWY_IS_LITTLE_ENDIAN 590 // On little-endian platforms, need to shift left each lane of the bitcasted 591 // vector by 16 bits. 592 return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v))); 593 #else 594 // On big-endian platforms, the even lanes of the source vector are already 595 // in the upper 16 bits of the lanes of the bitcasted vector. 596 597 // Need to simply zero out the lower 16 bits of each lane of the bitcasted 598 // vector. 599 return BitCast(d_to, 600 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u}))); 601 #endif 602 } 603 604 // BF16->F32 PromoteOddTo 605 606 // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag 607 // instead of hwy::FloatTag on targets that use scalable vectors. 608 609 // VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same 610 // type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>> 611 612 // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered 613 // to be a bfloat16_t vector. 
614 template <class FromTypeTag, class DF32, class VBF16, 615 class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>, 616 hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr> 617 HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/, 618 hwy::SizeTag<4> /*to_lane_size_tag*/, 619 FromTypeTag /*from_type_tag*/, DF32 d_to, 620 VBF16 v) { 621 const RebindToUnsigned<decltype(d_to)> du_to; 622 #if HWY_IS_LITTLE_ENDIAN 623 // On little-endian platforms, the odd lanes of the source vector are already 624 // in the upper 16 bits of the lanes of the bitcasted vector. 625 626 // Need to simply zero out the lower 16 bits of each lane of the bitcasted 627 // vector. 628 return BitCast(d_to, 629 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u}))); 630 #else 631 // On big-endian platforms, need to shift left each lane of the bitcasted 632 // vector by 16 bits. 633 return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v))); 634 #endif 635 } 636 637 // Default PromoteEvenTo/PromoteOddTo implementations 638 template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D, 639 class V, HWY_IF_LANES_D(D, 1)> 640 HWY_INLINE VFromD<D> PromoteEvenTo( 641 ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, 642 FromTypeTag /*from_type_tag*/, D d_to, V v) { 643 return PromoteLowerTo(d_to, v); 644 } 645 646 template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D, 647 class V, HWY_IF_LANES_GT_D(D, 1)> 648 HWY_INLINE VFromD<D> PromoteEvenTo( 649 ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, 650 FromTypeTag /*from_type_tag*/, D d_to, V v) { 651 const DFromV<decltype(v)> d; 652 return PromoteLowerTo(d_to, ConcatEven(d, v, v)); 653 } 654 655 template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D, 656 class V> 657 HWY_INLINE VFromD<D> PromoteOddTo( 658 ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/, 659 FromTypeTag /*from_type_tag*/, D d_to, V 
v) { 660 const DFromV<decltype(v)> d; 661 return PromoteLowerTo(d_to, ConcatOdd(d, v, v)); 662 } 663 664 } // namespace detail 665 666 template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)), 667 class V2 = VFromD<Repartition<TFromV<V>, D>>, 668 HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))> 669 HWY_API VFromD<D> PromoteEvenTo(D d, V v) { 670 return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(), 671 hwy::SizeTag<sizeof(TFromD<D>)>(), 672 hwy::TypeTag<TFromV<V>>(), d, v); 673 } 674 675 template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)), 676 class V2 = VFromD<Repartition<TFromV<V>, D>>, 677 HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))> 678 HWY_API VFromD<D> PromoteOddTo(D d, V v) { 679 return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(), 680 hwy::SizeTag<sizeof(TFromD<D>)>(), 681 hwy::TypeTag<TFromV<V>>(), d, v); 682 } 683 #endif // HWY_TARGET != HWY_SCALAR 684 685 #ifdef HWY_INSIDE_END_NAMESPACE 686 #undef HWY_INSIDE_END_NAMESPACE 687 // NOLINTNEXTLINE(google-readability-namespace-comments) 688 } // namespace HWY_NAMESPACE 689 } // namespace hwy 690 HWY_AFTER_NAMESPACE(); 691 #endif