tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

shared-inl.h (6939B)


      1 // Copyright 2021 Google LLC
      2 // Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
      3 // SPDX-License-Identifier: Apache-2.0
      4 // SPDX-License-Identifier: BSD-3-Clause
      5 //
      6 // Licensed under the Apache License, Version 2.0 (the "License");
      7 // you may not use this file except in compliance with the License.
      8 // You may obtain a copy of the License at
      9 //
     10 //      http://www.apache.org/licenses/LICENSE-2.0
     11 //
     12 // Unless required by applicable law or agreed to in writing, software
     13 // distributed under the License is distributed on an "AS IS" BASIS,
     14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15 // See the License for the specific language governing permissions and
     16 // limitations under the License.
     17 
     18 // Definitions shared between vqsort-inl and sorting_networks-inl.
     19 
     20 // Normal include guard for target-independent parts
     21 #ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
     22 #define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
     23 
     24 #include "hwy/base.h"
     25 
     26 namespace hwy {
     27 
     28 // Based on https://github.com/numpy/numpy/issues/16313#issuecomment-641897028
     29 static HWY_INLINE uint64_t RandomBits(uint64_t* HWY_RESTRICT state) {
     30  const uint64_t a = state[0];
     31  const uint64_t b = state[1];
     32  const uint64_t w = state[2] + 1;
     33  const uint64_t next = a ^ w;
     34  state[0] = (b + (b << 3)) ^ (b >> 11);
     35  const uint64_t rot = (b << 24) | (b >> 40);
     36  state[1] = rot + next;
     37  state[2] = w;
     38  return next;
     39 }
     40 
     41 // Internal constants - these are to avoid magic numbers/literals and cannot be
     42 // changed without also changing the associated code.
     43 struct SortConstants {
     44  // SortingNetwork reshapes its input into a matrix. This is the maximum number
     45  // of *lanes* per vector. Must be at least 8 because SortSamples assumes the
     46  // sorting network can handle 128 bytes with 8 rows, so 16 bytes per vector,
     47  // which means 8 lanes for 16-bit types.
     48 #if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
     49  static constexpr size_t kMaxCols = 8;  // avoid build timeout/stack overflow
     50 #else
     51  static constexpr size_t kMaxCols = 16;  // enough for u32 in 512-bit vector
     52 #endif
     53 
     54  // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
     55  // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
     56  // code size reasonable, and minimizing the extra logN factor for larger
     57  // networks (for which only loose upper bounds on size are known).
     58  static constexpr size_t kMaxRows = 16;
     59 
     60  // Template argument ensures there is no actual division instruction.
     61  template <size_t kLPK>
     62  static constexpr HWY_INLINE size_t BaseCaseNumLanes(size_t N) {
     63    // We use 8, 8x2, 8x4, and 16x{4..} networks, in units of keys. For N/kLPK
     64    // < 4, we cannot use the 16-row networks.
     65    return (((N / kLPK) >= 4) ? kMaxRows : 8) * HWY_MIN(N, kMaxCols);
     66  }
     67 
     68  // Unrolling is important (pipelining and amortizing branch mispredictions);
     69  // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
     70  // somewhat slower for sorting than 4x.
     71  //
     72  // To change, must also update left + 3 * N etc. in the loop.
     73  static constexpr size_t kPartitionUnroll = 4;
     74 
     75  // Chunk := group of keys loaded for sampling a pivot. Matches the typical
     76  // cache line size of 64 bytes to get maximum benefit per L2 miss. Sort()
     77  // ensures vectors are no larger than that, so this can be independent of the
     78  // vector size and thus constexpr.
     79  static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t) {
     80    return 64 / sizeof_t;
     81  }
     82 
     83  template <typename T>
     84  static constexpr HWY_INLINE size_t SampleLanes() {
     85    return 2 * LanesPerChunk(sizeof(T));  // Stored samples
     86  }
     87 
     88  static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
     89    // The main loop reads kPartitionUnroll vectors, and first loads from
     90    // both left and right beforehand, so it requires 2 * kPartitionUnroll
     91    // vectors. To handle amounts between that and BaseCaseNumLanes(), we
     92    // partition up 3 * kPartitionUnroll + 1 vectors into a two-part buffer.
     93    return 2 * (3 * kPartitionUnroll + 1) * N;
     94  }
     95 
     96  // Max across the three buffer usages.
     97  template <typename T, size_t kLPK>
     98  static constexpr HWY_INLINE size_t BufNum(size_t N) {
     99    // BaseCase may write one padding vector, and SortSamples uses the space
    100    // after samples as the buffer.
    101    return HWY_MAX(SampleLanes<T>() + BaseCaseNumLanes<kLPK>(N) + N,
    102                   PartitionBufNum(N));
    103  }
    104 
    105  // Translates vector_size to lanes and returns size in bytes.
    106  template <typename T, size_t kLPK>
    107  static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
    108    return BufNum<T, kLPK>(vector_size / sizeof(T)) * sizeof(T);
    109  }
    110 
    111  // Returns max for any type.
    112  template <size_t kLPK>
    113  static constexpr HWY_INLINE size_t MaxBufBytes(size_t vector_size) {
    114    // If 2 lanes per key, it's a 128-bit key with u64 lanes.
    115    return kLPK == 2 ? BufBytes<uint64_t, 2>(vector_size)
    116                     : HWY_MAX((BufBytes<uint16_t, 1>(vector_size)),
    117                               HWY_MAX((BufBytes<uint32_t, 1>(vector_size)),
    118                                       (BufBytes<uint64_t, 1>(vector_size))));
    119  }
    120 };
    121 
    122 static_assert(SortConstants::MaxBufBytes<1>(64) <= 1664, "Unexpectedly high");
    123 static_assert(SortConstants::MaxBufBytes<2>(64) <= 1664, "Unexpectedly high");
    124 
    125 }  // namespace hwy
    126 
    127 #endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
    128 
    129 // Per-target
    130 // clang-format off
    131 #if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == defined(HWY_TARGET_TOGGLE) // NOLINT
    132 // clang-format on
    133 #ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
    134 #undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
    135 #else
    136 #define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
    137 #endif
    138 
    139 #include "hwy/highway.h"
    140 
    141 // vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
    142 // Armv7 debug, and Armv8 GCC 11 asan hits an internal compiler error likely
    143 // due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97696. Armv8 Clang
    144 // hwasan/msan/tsan/asan also fail to build SVE (b/335157772).
    145 #undef VQSORT_ENABLED
    146 #undef VQSORT_COMPILER_COMPATIBLE
    147 
    148 #if (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
    149    (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD) ||    \
    150    (HWY_ARCH_ARM_A64 && HWY_IS_ASAN) ||          \
    151    (HWY_ARCH_RISCV && HWY_COMPILER_GCC_ACTUAL < 1400)
    152 #define VQSORT_COMPILER_COMPATIBLE 0
    153 #else
    154 #define VQSORT_COMPILER_COMPATIBLE 1
    155 #endif
    156 
    157 #if (HWY_TARGET == HWY_SCALAR) || !VQSORT_COMPILER_COMPATIBLE || \
    158    defined(HWY_DISABLE_VQSORT)
    159 #define VQSORT_ENABLED 0
    160 #else
    161 #define VQSORT_ENABLED 1
    162 #endif
    163 
    164 namespace hwy {
    165 namespace HWY_NAMESPACE {
    166 
    167 // Default tag / vector width selector.
    168 #if HWY_TARGET == HWY_RVV
    169 // Use LMUL = 1/2; for SEW=64 this ends up emulated via VSETVLI.
    170 template <typename T>
    171 using SortTag = ScalableTag<T, -1>;
    172 #else
    173 template <typename T>
    174 using SortTag = ScalableTag<T>;
    175 #endif
    176 
    177 // NOLINTNEXTLINE(google-readability-namespace-comments)
    178 }  // namespace HWY_NAMESPACE
    179 }  // namespace hwy
    180 
    181 #endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE