traits128-inl.h (18345B)
1 // Copyright 2021 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // Per-target 17 #if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \ 18 defined(HWY_TARGET_TOGGLE) 19 #ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE 20 #undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE 21 #else 22 #define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE 23 #endif 24 25 #include <stddef.h> 26 #include <stdint.h> 27 28 #include "hwy/contrib/sort/order.h" // SortDescending 29 #include "hwy/contrib/sort/shared-inl.h" 30 #include "hwy/highway.h" 31 32 HWY_BEFORE_NAMESPACE(); 33 namespace hwy { 34 namespace HWY_NAMESPACE { 35 namespace detail { 36 37 // Also used by HeapSort, so do not require VQSORT_ENABLED. 38 #if HWY_TARGET != HWY_SCALAR || HWY_IDE 39 40 // Highway does not provide a lane type for 128-bit keys, so we use uint64_t 41 // along with an abstraction layer for single-lane vs. lane-pair, which is 42 // independent of the order. 43 struct KeyAny128 { 44 static constexpr bool Is128() { return true; } 45 constexpr size_t LanesPerKey() const { return 2; } 46 47 // What type bench_sort should allocate for generating inputs. 48 using LaneType = uint64_t; 49 // KeyType and KeyString are defined by derived classes. 50 51 HWY_INLINE void Swap(LaneType* a, LaneType* b) const { 52 const FixedTag<LaneType, 2> d; 53 const auto temp = LoadU(d, a); 54 StoreU(LoadU(d, b), d, a); 55 StoreU(temp, d, b); 56 } 57 58 template <class V, class M> 59 HWY_INLINE V CompressKeys(V keys, M mask) const { 60 return CompressBlocksNot(keys, mask); 61 } 62 63 template <class D, HWY_IF_U64_D(D)> 64 HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const { 65 return LoadDup128(d, key); 66 } 67 68 template <class D, HWY_IF_U64_D(D)> 69 HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const { 70 return ReverseBlocks(d, v); 71 } 72 73 template <class D, HWY_IF_U64_D(D)> 74 HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const { 75 HWY_DASSERT(Lanes(D()) >= 4); // at least 2 keys 76 return SwapAdjacentBlocks(v); 77 } 78 79 // Only called for 4 keys because we do not support >512-bit vectors. 80 template <class D, HWY_IF_U64_D(D)> 81 HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const { 82 HWY_DASSERT(Lanes(D()) == 8); // exactly 4 keys: the 512-bit limit 83 return ReverseKeys(d, v); 84 } 85 86 // Only called for 4 keys because we do not support >512-bit vectors. 87 template <class D, HWY_IF_U64_D(D)> 88 HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd, 89 const Vec<D> even) const { 90 HWY_DASSERT(Lanes(D()) == 8); // exactly 4 keys: the 512-bit limit 91 return ConcatUpperLower(d, odd, even); 92 } 93 94 template <class V> 95 HWY_INLINE V OddEvenKeys(const V odd, const V even) const { 96 return OddEvenBlocks(odd, even); 97 } 98 99 template <class D, HWY_IF_U64_D(D)> 100 HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const { 101 HWY_ASSERT(0); // not supported: would require 1024-bit vectors 102 } 103 104 template <class D, HWY_IF_U64_D(D)> 105 HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const { 106 HWY_ASSERT(0); // not supported: would require 2048-bit vectors 107 } 108 109 // This is only called for 8/16 col networks (not supported). 110 template <class D, HWY_IF_U64_D(D)> 111 HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const { 112 HWY_ASSERT(0); 113 } 114 115 // This is only called for 16 col networks (not supported). 116 template <class D, HWY_IF_U64_D(D)> 117 HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const { 118 HWY_ASSERT(0); 119 } 120 121 // This is only called for 8 col networks (not supported). 122 template <class D, HWY_IF_U64_D(D)> 123 HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const { 124 HWY_ASSERT(0); 125 } 126 }; 127 128 // Base class shared between OrderAscending128, OrderDescending128. 129 struct Key128 : public KeyAny128 { 130 // False indicates the entire key should be compared. KV means key-value. 131 static constexpr bool IsKV() { return false; } 132 133 // What type to pass to VQSort. 134 using KeyType = hwy::uint128_t; 135 136 const char* KeyString() const { return "U128"; } 137 138 template <class D, HWY_IF_U64_D(D)> 139 HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const { 140 return Eq128(d, a, b); 141 } 142 143 template <class D, HWY_IF_U64_D(D)> 144 HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const { 145 return Ne128(d, a, b); 146 } 147 148 // For keys=entire 128 bits, any difference counts. 149 template <class D, HWY_IF_U64_D(D)> 150 HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const { 151 // Must avoid floating-point comparisons (for -0) 152 const RebindToUnsigned<D> du; 153 return AllTrue(du, Eq(BitCast(du, diff), Zero(du))); 154 } 155 156 HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const { 157 return a[0] == b[0] && a[1] == b[1]; 158 } 159 160 // Returns vector with only the top half of each block valid. This allows 161 // fusing the "replicate upper to lower half" step with a subsequent permute. 162 template <class Order, class D> 163 HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const { 164 const Mask<D> eqHL = Eq(a, b); 165 const Vec<D> ltHL = VecFromMask(d, Order().CompareLanes(a, b)); 166 #if HWY_TARGET <= HWY_AVX2 // slightly faster 167 const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL); 168 return OrAnd(ltHL, VecFromMask(d, eqHL), ltLX); 169 #else 170 return IfThenElse(eqHL, DupEven(ltHL), ltHL); 171 #endif 172 } 173 }; 174 175 // Anything order-related depends on the key traits *and* the order (see 176 // FirstOfLanes). We cannot implement just one Compare function because Lt128 177 // only compiles if the lane type is u64. Thus we need either overloaded 178 // functions with a tag type, class specializations, or separate classes. 179 // We avoid overloaded functions because we want all functions to be callable 180 // from a SortTraits without per-function wrappers. Specializing would work, but 181 // we are anyway going to specialize at a higher level. 182 struct OrderAscending128 : public Key128 { 183 using Order = SortAscending; 184 using OrderForSortingNetwork = OrderAscending128; 185 186 HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const { 187 return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1]; 188 } 189 190 template <class D, HWY_IF_U64_D(D)> 191 HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { 192 return Lt128(d, a, b); 193 } 194 195 template <class D, HWY_IF_U64_D(D)> 196 HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { 197 return Min128(d, a, b); 198 } 199 200 template <class D, HWY_IF_U64_D(D)> 201 HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { 202 return Max128(d, a, b); 203 } 204 205 // FirstOfLanes/LastOfLanes are implemented in Traits128. 206 207 // Same as for regular lanes because 128-bit keys are u64. 208 template <class D, HWY_IF_U64_D(D)> 209 HWY_INLINE Vec<D> FirstValue(D d) const { 210 return Set(d, hwy::LowestValue<TFromD<D> >()); 211 } 212 213 template <class D, HWY_IF_U64_D(D)> 214 HWY_INLINE Vec<D> LastValue(D d) const { 215 return Set(d, hwy::HighestValue<TFromD<D> >()); 216 } 217 218 template <class D, HWY_IF_U64_D(D)> 219 HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { 220 const Vec<D> k0 = Zero(d); 221 const Vec<D> k1 = OddEven(k0, Set(d, uint64_t{1})); 222 const Mask<D> borrow = Eq(v, k0); // don't-care, lo == 0 223 // lo == 0? 1 : 0, 0 224 const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(borrow, k1)); 225 return Sub(Sub(v, k1), adjust); 226 } 227 228 // 'Private', used by base class Key128::CompareTop. 229 template <class V> 230 HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { 231 return Lt(a, b); 232 } 233 }; 234 235 struct OrderDescending128 : public Key128 { 236 using Order = SortDescending; 237 using OrderForSortingNetwork = OrderDescending128; 238 239 HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const { 240 return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1]; 241 } 242 243 template <class D, HWY_IF_U64_D(D)> 244 HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { 245 return Lt128(d, b, a); 246 } 247 248 template <class D, HWY_IF_U64_D(D)> 249 HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { 250 return Max128(d, a, b); 251 } 252 253 template <class D, HWY_IF_U64_D(D)> 254 HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { 255 return Min128(d, a, b); 256 } 257 258 // FirstOfLanes/LastOfLanes are implemented in Traits128. 259 260 // Same as for regular lanes because 128-bit keys are u64. 261 template <class D, HWY_IF_U64_D(D)> 262 HWY_INLINE Vec<D> FirstValue(D d) const { 263 return Set(d, hwy::HighestValue<TFromD<D> >()); 264 } 265 266 template <class D, HWY_IF_U64_D(D)> 267 HWY_INLINE Vec<D> LastValue(D d) const { 268 return Set(d, hwy::LowestValue<TFromD<D> >()); 269 } 270 271 template <class D, HWY_IF_U64_D(D)> 272 HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { 273 const Vec<D> k1 = OddEven(Zero(d), Set(d, uint64_t{1})); 274 const Vec<D> added = Add(v, k1); 275 const Mask<D> overflowed = Lt(added, v); // false, overflowed 276 // overflowed? 1 : 0, 0 277 const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(overflowed, k1)); 278 return Add(added, adjust); 279 } 280 281 // 'Private', used by base class Key128::CompareTop. 282 template <class V> 283 HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { 284 return Lt(b, a); 285 } 286 }; 287 288 // Base class shared between OrderAscendingKV128, OrderDescendingKV128. 289 struct KeyValue128 : public KeyAny128 { 290 // True indicates only part of the key (the more significant lane) should be 291 // compared. KV stands for key-value. 292 static constexpr bool IsKV() { return true; } 293 294 // What type to pass to VQSort. 295 using KeyType = K64V64; 296 297 const char* KeyString() const { return "k+v=128"; } 298 299 template <class D, HWY_IF_U64_D(D)> 300 HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const { 301 return Eq128Upper(d, a, b); 302 } 303 304 template <class D, HWY_IF_U64_D(D)> 305 HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const { 306 return Ne128Upper(d, a, b); 307 } 308 309 HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const { 310 return a[1] == b[1]; 311 } 312 313 // Only count differences in the actual key, not the value. 314 template <class D, HWY_IF_U64_D(D)> 315 HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const { 316 // Must avoid floating-point comparisons (for -0) 317 const RebindToUnsigned<D> du; 318 const Vec<decltype(du)> zero = Zero(du); 319 const Vec<decltype(du)> keys = OddEven(diff, zero); // clear values 320 return AllTrue(du, Eq(BitCast(du, keys), zero)); 321 } 322 323 // Returns vector with only the top half of each block valid. This allows 324 // fusing the "replicate upper to lower half" step with a subsequent permute. 325 template <class Order, class D> 326 HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const { 327 // Only the upper lane of each block is a key, and only that lane is 328 // required to be valid, so comparing all lanes is sufficient. 329 return VecFromMask(d, Order().CompareLanes(a, b)); 330 } 331 }; 332 333 struct OrderAscendingKV128 : public KeyValue128 { 334 using Order = SortAscending; 335 using OrderForSortingNetwork = OrderAscending128; 336 337 HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const { 338 return a[1] < b[1]; 339 } 340 341 template <class D, HWY_IF_U64_D(D)> 342 HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { 343 return Lt128Upper(d, a, b); 344 } 345 346 template <class D, HWY_IF_U64_D(D)> 347 HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { 348 return Min128Upper(d, a, b); 349 } 350 351 template <class D, HWY_IF_U64_D(D)> 352 HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { 353 return Max128Upper(d, a, b); 354 } 355 356 // FirstOfLanes/LastOfLanes are implemented in Traits128. 357 358 // Same as for regular lanes because 128-bit keys are u64. 359 template <class D, HWY_IF_U64_D(D)> 360 HWY_INLINE Vec<D> FirstValue(D d) const { 361 return Set(d, hwy::LowestValue<TFromD<D> >()); 362 } 363 364 template <class D, HWY_IF_U64_D(D)> 365 HWY_INLINE Vec<D> LastValue(D d) const { 366 return Set(d, hwy::HighestValue<TFromD<D> >()); 367 } 368 369 template <class D, HWY_IF_U64_D(D)> 370 HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { 371 const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d)); 372 return Sub(v, k1); 373 } 374 375 // 'Private', used by base class KeyValue128::CompareTop. 376 template <class V> 377 HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { 378 return Lt(a, b); 379 } 380 }; 381 382 struct OrderDescendingKV128 : public KeyValue128 { 383 using Order = SortDescending; 384 using OrderForSortingNetwork = OrderDescending128; 385 386 HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const { 387 return b[1] < a[1]; 388 } 389 390 template <class D, HWY_IF_U64_D(D)> 391 HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { 392 return Lt128Upper(d, b, a); 393 } 394 395 template <class D, HWY_IF_U64_D(D)> 396 HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { 397 return Max128Upper(d, a, b); 398 } 399 400 template <class D, HWY_IF_U64_D(D)> 401 HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { 402 return Min128Upper(d, a, b); 403 } 404 405 // FirstOfLanes/LastOfLanes are implemented in Traits128. 406 407 // Same as for regular lanes because 128-bit keys are u64. 408 template <class D, HWY_IF_U64_D(D)> 409 HWY_INLINE Vec<D> FirstValue(D d) const { 410 return Set(d, hwy::HighestValue<TFromD<D> >()); 411 } 412 413 template <class D, HWY_IF_U64_D(D)> 414 HWY_INLINE Vec<D> LastValue(D d) const { 415 return Set(d, hwy::LowestValue<TFromD<D> >()); 416 } 417 418 template <class D, HWY_IF_U64_D(D)> 419 HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const { 420 const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d)); 421 return Add(v, k1); 422 } 423 424 // 'Private', used by base class KeyValue128::CompareTop. 425 template <class V> 426 HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { 427 return Lt(b, a); 428 } 429 }; 430 431 // We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in 432 // the most-significant of those lanes (the result of CompareTop), so 433 // replicate it 4x. Only called for >= 256-bit vectors. 434 435 #if HWY_TARGET <= HWY_AVX3 436 template <class V, HWY_IF_V_SIZE_V(V, 64)> 437 HWY_INLINE V ReplicateTop4x(V v) { 438 return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))}; 439 } 440 #endif // HWY_TARGET <= HWY_AVX3 441 442 #if HWY_TARGET <= HWY_AVX2 443 444 template <class V, HWY_IF_V_SIZE_V(V, 32)> 445 HWY_INLINE V ReplicateTop4x(V v) { 446 return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))}; 447 } 448 449 #else // HWY_TARGET > HWY_AVX2 450 451 template <class V> 452 HWY_INLINE V ReplicateTop4x(V v) { 453 #if HWY_TARGET == HWY_SVE_256 454 return svdup_lane_u64(v, 3); 455 #else 456 const ScalableTag<uint64_t> d; 457 HWY_DASSERT(Lanes(d) == 4 || Lanes(d) == 8); // for table below 458 HWY_ALIGN static constexpr uint64_t kIndices[8] = {3, 3, 3, 3, 7, 7, 7, 7}; 459 return TableLookupLanes(v, SetTableIndices(d, kIndices)); 460 #endif 461 } 462 463 #endif // HWY_TARGET <= HWY_AVX2 464 465 // Shared code that depends on Order. 466 template <class Base> 467 struct Traits128 : public Base { 468 using TraitsForSortingNetwork = 469 Traits128<typename Base::OrderForSortingNetwork>; 470 471 template <class D, HWY_IF_U64_D(D)> 472 HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, 473 TFromD<D>* HWY_RESTRICT buf) const { 474 const Base* base = static_cast<const Base*>(this); 475 const size_t N = Lanes(d); 476 Store(v, d, buf); 477 v = base->SetKey(d, buf + 0); // result must be broadcasted 478 for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) { 479 v = base->First(d, v, base->SetKey(d, buf + i)); 480 } 481 return v; 482 } 483 484 template <class D, HWY_IF_U64_D(D)> 485 HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, 486 TFromD<D>* HWY_RESTRICT buf) const { 487 const Base* base = static_cast<const Base*>(this); 488 const size_t N = Lanes(d); 489 Store(v, d, buf); 490 v = base->SetKey(d, buf + 0); // result must be broadcasted 491 for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) { 492 v = base->Last(d, v, base->SetKey(d, buf + i)); 493 } 494 return v; 495 } 496 497 template <class D, HWY_IF_U64_D(D)> 498 HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const { 499 const Base* base = static_cast<const Base*>(this); 500 501 const Vec<D> a_copy = a; 502 const auto lt = base->Compare(d, a, b); 503 a = IfThenElse(lt, a, b); 504 b = IfThenElse(lt, b, a_copy); 505 } 506 507 // Conditionally swaps even-numbered keys with their odd-numbered neighbor. 508 template <class D, HWY_IF_U64_D(D)> 509 HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const { 510 HWY_DASSERT(Lanes(d) >= 4); // required by ReplicateTop4x 511 const Base* base = static_cast<const Base*>(this); 512 Vec<D> swapped = base->ReverseKeys2(d, v); 513 const Vec<D> cmpHx = base->template CompareTop<Base>(d, v, swapped); 514 return IfVecThenElse(ReplicateTop4x(cmpHx), swapped, v); 515 } 516 517 // Swaps with the vector formed by reversing contiguous groups of four 128-bit 518 // keys, which implies 512-bit vectors (we do not support more than that). 519 template <class D, HWY_IF_U64_D(D)> 520 HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const { 521 HWY_DASSERT(Lanes(d) == 8); // For TableLookupLanes below 522 const Base* base = static_cast<const Base*>(this); 523 Vec<D> swapped = base->ReverseKeys4(d, v); 524 525 const Vec<D> cmpHx = base->template CompareTop<Base>(d, v, swapped); 526 // Similar to ReplicateTop4x, we want to gang together 2 comparison results 527 // (4 lanes). They are not contiguous, so use permute to replicate 4x. 528 HWY_ALIGN uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7}; 529 const Vec<D> select = TableLookupLanes(cmpHx, SetTableIndices(d, kIndices)); 530 return IfVecThenElse(select, swapped, v); 531 } 532 533 // Conditionally swaps lane 0 with 4, 1 with 5 etc. 534 template <class D, HWY_IF_U64_D(D)> 535 HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const { 536 // Only used by Merge16, which would require 2048 bit vectors (unsupported). 537 HWY_ASSERT(0); 538 } 539 }; 540 541 #endif // HWY_TARGET != HWY_SCALAR 542 543 } // namespace detail 544 // NOLINTNEXTLINE(google-readability-namespace-comments) 545 } // namespace HWY_NAMESPACE 546 } // namespace hwy 547 HWY_AFTER_NAMESPACE(); 548 549 #endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE