shared-inl.h (6939B)
1 // Copyright 2021 Google LLC 2 // Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> 3 // SPDX-License-Identifier: Apache-2.0 4 // SPDX-License-Identifier: BSD-3-Clause 5 // 6 // Licensed under the Apache License, Version 2.0 (the "License"); 7 // you may not use this file except in compliance with the License. 8 // You may obtain a copy of the License at 9 // 10 // http://www.apache.org/licenses/LICENSE-2.0 11 // 12 // Unless required by applicable law or agreed to in writing, software 13 // distributed under the License is distributed on an "AS IS" BASIS, 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 // See the License for the specific language governing permissions and 16 // limitations under the License. 17 18 // Definitions shared between vqsort-inl and sorting_networks-inl. 19 20 // Normal include guard for target-independent parts 21 #ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_ 22 #define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_ 23 24 #include "hwy/base.h" 25 26 namespace hwy { 27 28 // Based on https://github.com/numpy/numpy/issues/16313#issuecomment-641897028 29 static HWY_INLINE uint64_t RandomBits(uint64_t* HWY_RESTRICT state) { 30 const uint64_t a = state[0]; 31 const uint64_t b = state[1]; 32 const uint64_t w = state[2] + 1; 33 const uint64_t next = a ^ w; 34 state[0] = (b + (b << 3)) ^ (b >> 11); 35 const uint64_t rot = (b << 24) | (b >> 40); 36 state[1] = rot + next; 37 state[2] = w; 38 return next; 39 } 40 41 // Internal constants - these are to avoid magic numbers/literals and cannot be 42 // changed without also changing the associated code. 43 struct SortConstants { 44 // SortingNetwork reshapes its input into a matrix. This is the maximum number 45 // of *lanes* per vector. Must be at least 8 because SortSamples assumes the 46 // sorting network can handle 128 bytes with 8 rows, so 16 bytes per vector, 47 // which means 8 lanes for 16-bit types. 48 #if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD 49 static constexpr size_t kMaxCols = 8; // avoid build timeout/stack overflow 50 #else 51 static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector 52 #endif 53 54 // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers, 55 // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase 56 // code size reasonable, and minimizing the extra logN factor for larger 57 // networks (for which only loose upper bounds on size are known). 58 static constexpr size_t kMaxRows = 16; 59 60 // Template argument ensures there is no actual division instruction. 61 template <size_t kLPK> 62 static constexpr HWY_INLINE size_t BaseCaseNumLanes(size_t N) { 63 // We use 8, 8x2, 8x4, and 16x{4..} networks, in units of keys. For N/kLPK 64 // < 4, we cannot use the 16-row networks. 65 return (((N / kLPK) >= 4) ? kMaxRows : 8) * HWY_MIN(N, kMaxCols); 66 } 67 68 // Unrolling is important (pipelining and amortizing branch mispredictions); 69 // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but 70 // somewhat slower for sorting than 4x. 71 // 72 // To change, must also update left + 3 * N etc. in the loop. 73 static constexpr size_t kPartitionUnroll = 4; 74 75 // Chunk := group of keys loaded for sampling a pivot. Matches the typical 76 // cache line size of 64 bytes to get maximum benefit per L2 miss. Sort() 77 // ensures vectors are no larger than that, so this can be independent of the 78 // vector size and thus constexpr. 79 static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t) { 80 return 64 / sizeof_t; 81 } 82 83 template <typename T> 84 static constexpr HWY_INLINE size_t SampleLanes() { 85 return 2 * LanesPerChunk(sizeof(T)); // Stored samples 86 } 87 88 static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) { 89 // The main loop reads kPartitionUnroll vectors, and first loads from 90 // both left and right beforehand, so it requires 2 * kPartitionUnroll 91 // vectors. To handle amounts between that and BaseCaseNumLanes(), we 92 // partition up 3 * kPartitionUnroll + 1 vectors into a two-part buffer. 93 return 2 * (3 * kPartitionUnroll + 1) * N; 94 } 95 96 // Max across the three buffer usages. 97 template <typename T, size_t kLPK> 98 static constexpr HWY_INLINE size_t BufNum(size_t N) { 99 // BaseCase may write one padding vector, and SortSamples uses the space 100 // after samples as the buffer. 101 return HWY_MAX(SampleLanes<T>() + BaseCaseNumLanes<kLPK>(N) + N, 102 PartitionBufNum(N)); 103 } 104 105 // Translates vector_size to lanes and returns size in bytes. 106 template <typename T, size_t kLPK> 107 static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) { 108 return BufNum<T, kLPK>(vector_size / sizeof(T)) * sizeof(T); 109 } 110 111 // Returns max for any type. 112 template <size_t kLPK> 113 static constexpr HWY_INLINE size_t MaxBufBytes(size_t vector_size) { 114 // If 2 lanes per key, it's a 128-bit key with u64 lanes. 115 return kLPK == 2 ? BufBytes<uint64_t, 2>(vector_size) 116 : HWY_MAX((BufBytes<uint16_t, 1>(vector_size)), 117 HWY_MAX((BufBytes<uint32_t, 1>(vector_size)), 118 (BufBytes<uint64_t, 1>(vector_size)))); 119 } 120 }; 121 122 static_assert(SortConstants::MaxBufBytes<1>(64) <= 1664, "Unexpectedly high"); 123 static_assert(SortConstants::MaxBufBytes<2>(64) <= 1664, "Unexpectedly high"); 124 125 } // namespace hwy 126 127 #endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_ 128 129 // Per-target 130 // clang-format off 131 #if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == defined(HWY_TARGET_TOGGLE) // NOLINT 132 // clang-format on 133 #ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE 134 #undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE 135 #else 136 #define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE 137 #endif 138 139 #include "hwy/highway.h" 140 141 // vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and 142 // Armv7 debug, and Armv8 GCC 11 asan hits an internal compiler error likely 143 // due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97696. Armv8 Clang 144 // hwasan/msan/tsan/asan also fail to build SVE (b/335157772). 145 #undef VQSORT_ENABLED 146 #undef VQSORT_COMPILER_COMPATIBLE 147 148 #if (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \ 149 (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD) || \ 150 (HWY_ARCH_ARM_A64 && HWY_IS_ASAN) || \ 151 (HWY_ARCH_RISCV && HWY_COMPILER_GCC_ACTUAL < 1400) 152 #define VQSORT_COMPILER_COMPATIBLE 0 153 #else 154 #define VQSORT_COMPILER_COMPATIBLE 1 155 #endif 156 157 #if (HWY_TARGET == HWY_SCALAR) || !VQSORT_COMPILER_COMPATIBLE || \ 158 defined(HWY_DISABLE_VQSORT) 159 #define VQSORT_ENABLED 0 160 #else 161 #define VQSORT_ENABLED 1 162 #endif 163 164 namespace hwy { 165 namespace HWY_NAMESPACE { 166 167 // Default tag / vector width selector. 168 #if HWY_TARGET == HWY_RVV 169 // Use LMUL = 1/2; for SEW=64 this ends up emulated via VSETVLI. 170 template <typename T> 171 using SortTag = ScalableTag<T, -1>; 172 #else 173 template <typename T> 174 using SortTag = ScalableTag<T>; 175 #endif 176 177 // NOLINTNEXTLINE(google-readability-namespace-comments) 178 } // namespace HWY_NAMESPACE 179 } // namespace hwy 180 181 #endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE