crc_memcpy_x86_arm_combined.cc
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
// registers because XMM registers do not support the CRC instruction (yet).
// While copying, compute the running CRC of the data being copied.
//
// It is assumed that any CPU running this code has SSE4.2 instructions
// available (for CRC32C). This file will do nothing if that is not true.
//
// The CRC instruction has a 3-cycle latency, and we are stressing the ALU
// ports here (unlike a traditional memcopy, which has almost no ALU use), so
// we will need to copy in such a way that the CRC unit is used efficiently.
// We have two regimes in this code:
//  1. For operations of size < kCrcSmallSize, do the CRC then the memcpy
//  2. For operations of size > kCrcSmallSize:
//      a) compute an initial CRC + copy on a small amount of data to align the
//         destination pointer on a 16-byte boundary.
//      b) Split the data into 3 main regions and a tail (smaller than 48
//         bytes)
//      c) Do the copy and CRC of the 3 main regions, interleaving (start with
//         full cache line copies for each region, then move to single 16 byte
//         pieces per region).
//      d) Combine the CRCs with CRC32C::Concat.
//      e) Copy the tail and extend the CRC with the CRC of the tail.
// This method is not ideal for op sizes between ~1k and ~8k because CRC::Concat
// takes a significant amount of time. A medium-sized approach could be added
// using 3 CRCs over fixed-size blocks where the zero-extensions required for
// CRC32C::Concat can be precomputed.
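
// For reference, regime 1 is behaviorally identical to the naive sequence
// sketched below, and the large-size regime must produce the same CRC, just
// faster. This is a minimal sketch against the public absl/crc/crc32c.h API
// (the function name is illustrative, not part of this file):
//
//   crc32c_t ReferenceCrcMemcpy(void* dst, const void* src, size_t length,
//                               crc32c_t initial_crc) {
//     crc32c_t crc = ExtendCrc32c(
//         initial_crc,
//         absl::string_view(static_cast<const char*>(src), length));
//     memcpy(dst, src, length);
//     return crc;
//   }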

#ifdef __SSE4_2__
#include <immintrin.h>
#endif

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>

#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/optimization.h"
#include "absl/base/prefetch.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"

#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
    defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

namespace {

inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
                             crc32c_t crc) {
  // Small copy: just go 1 byte at a time: being nice to the branch predictor
  // is more important here than anything else
  uint32_t crc_uint32 = static_cast<uint32_t>(crc);
  for (std::size_t i = 0; i < length; i++) {
    uint8_t data = *reinterpret_cast<const uint8_t*>(src);
    crc_uint32 = CRC32_u8(crc_uint32, data);
    *reinterpret_cast<uint8_t*>(dst) = data;
    ++src;
    ++dst;
  }
  return crc32c_t{crc_uint32};
}

constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);

// Common function for copying the tails of multiple large regions.
// Disable ubsan for benign unaligned access. See b/254108538.
template <size_t vec_regions, size_t int_regions>
ABSL_ATTRIBUTE_NO_SANITIZE_UNDEFINED inline void LargeTailCopy(
    crc32c_t* crcs, char** dst, const char** src, size_t region_size,
    size_t copy_rounds) {
  std::array<V128, vec_regions> data;
  std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;

  while (copy_rounds > 0) {
    for (size_t i = 0; i < vec_regions; i++) {
      size_t region = i;

      auto* vsrc = reinterpret_cast<const V128*>(*src + region_size * region);
      auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);

      // Load the blocks, unaligned
      data[i] = V128_LoadU(vsrc);

      // Store the blocks, aligned
      V128_Store(vdst, data[i]);

      // Compute the running CRC
      crcs[region] = crc32c_t{static_cast<uint32_t>(
          CRC32_u64(static_cast<uint32_t>(crcs[region]),
                    static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
      crcs[region] = crc32c_t{static_cast<uint32_t>(
          CRC32_u64(static_cast<uint32_t>(crcs[region]),
                    static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
    }

    for (size_t i = 0; i < int_regions; i++) {
      size_t region = vec_regions + i;

      auto* usrc =
          reinterpret_cast<const uint64_t*>(*src + region_size * region);
      auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);

      for (size_t j = 0; j < kIntLoadsPerVec; j++) {
        size_t data_index = i * kIntLoadsPerVec + j;

        int_data[data_index] = *(usrc + j);
        crcs[region] = crc32c_t{CRC32_u64(static_cast<uint32_t>(crcs[region]),
                                          int_data[data_index])};

        *(udst + j) = int_data[data_index];
      }
    }

    // Increment pointers
    *src += sizeof(V128);
    *dst += sizeof(V128);
    --copy_rounds;
  }
}

}  // namespace
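
// For intuition, the integer-region kernel in LargeTailCopy above is
// equivalent to this minimal single-region scalar sketch (a reference
// restatement for one region, not code used by this file):
//
//   while (copy_rounds-- > 0) {
//     for (size_t j = 0; j < kIntLoadsPerVec; j++) {
//       uint64_t word = *(reinterpret_cast<const uint64_t*>(src) + j);
//       crc = crc32c_t{CRC32_u64(static_cast<uint32_t>(crc), word)};
//       *(reinterpret_cast<uint64_t*>(dst) + j) = word;
//     }
//     src += sizeof(V128);
//     dst += sizeof(V128);
//   }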

template <size_t vec_regions, size_t int_regions>
class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
 public:
  AcceleratedCrcMemcpyEngine() = default;
  AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete;
  AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) =
      delete;

  crc32c_t Compute(void* __restrict dst, const void* __restrict src,
                   std::size_t length, crc32c_t initial_crc) const override;
};

// Disable ubsan for benign unaligned access. See b/254108538.
template <size_t vec_regions, size_t int_regions>
ABSL_ATTRIBUTE_NO_SANITIZE_UNDEFINED crc32c_t
AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
    void* __restrict dst, const void* __restrict src, std::size_t length,
    crc32c_t initial_crc) const {
  constexpr std::size_t kRegions = vec_regions + int_regions;
  static_assert(kRegions > 0, "Must specify at least one region.");
  constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
  constexpr std::size_t kBlockSize = sizeof(V128);
  constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;

  // Number of blocks per cacheline.
  constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize;

  char* dst_bytes = static_cast<char*>(dst);
  const char* src_bytes = static_cast<const char*>(src);

  // Make sure that one prefetch per big block is enough to cover the whole
  // dataset, and we don't prefetch too much.
  static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0,
                "Cache lines are not divided evenly into blocks, may have "
                "unintended behavior!");

  // Experimentally-determined boundary between a small and large copy.
  // Below this number, spin-up and concatenation of CRCs takes enough time
  // that it kills the throughput gains of using 3 regions and wide vectors.
  constexpr size_t kCrcSmallSize = 256;

  // Experimentally-determined prefetch distance. Main loop copies will
  // prefetch data 2 cache lines ahead.
  constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE;

  // Small-size CRC-memcpy : just do CRC + memcpy
  if (length < kCrcSmallSize) {
    crc32c_t crc =
        ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length));
    memcpy(dst, src, length);
    return crc;
  }

  // Start work on the CRC: undo the XOR from the previous calculation or set
  // up the initial value of the CRC.
  initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};

  // Do an initial alignment copy, so we can use aligned store instructions to
  // the destination pointer. We align the destination pointer because the
  // penalty for an unaligned load is small compared to the penalty of an
  // unaligned store on modern CPUs.
  std::size_t bytes_from_last_aligned =
      reinterpret_cast<uintptr_t>(dst) & (kBlockSize - 1);
  if (bytes_from_last_aligned != 0) {
    std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned;

    // Do the short-sized copy and CRC.
    initial_crc =
        ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc);
    src_bytes += bytes_for_alignment;
    dst_bytes += bytes_for_alignment;
    length -= bytes_for_alignment;
  }

  // We are going to do the copy and CRC in kRegions regions to make sure that
  // we can saturate the CRC unit. The CRCs will be combined at the end of the
  // run. Copying will use the SSE registers, and we will extract words from
  // the SSE registers to add to the CRC. Initially, we run the loop one full
  // cache line per region at a time, in order to insert prefetches.

  // Initialize CRCs for kRegions regions.
  crc32c_t crcs[kRegions];
  crcs[0] = initial_crc;
  for (size_t i = 1; i < kRegions; i++) {
    crcs[i] = crc32c_t{kCrcDataXor};
  }

  // Find the number of rounds to copy and the region size. Also compute the
  // tail size here.
  size_t copy_rounds = length / kCopyRoundSize;

  // Find the size of each region and the size of the tail.
  const std::size_t region_size = copy_rounds * kBlockSize;
  const std::size_t tail_size = length - (kRegions * region_size);
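
  // Worked example with illustrative numbers (not from the source): if, after
  // the alignment copy above, length == 1018 and kRegions == 3 (so
  // kCopyRoundSize == 48 with 16-byte blocks), then:
  //   copy_rounds = 1018 / 48 = 21
  //   region_size = 21 * 16  = 336 bytes per region
  //   tail_size   = 1018 - 3 * 336 = 10 bytes
  // The tail is always smaller than kCopyRoundSize (48 bytes here).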

  // Holding registers for data in each region.
  std::array<V128, vec_regions> vec_data;
  std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;

  // Main loop.
  while (copy_rounds > kBlocksPerCacheLine) {
    // Prefetch kPrefetchAhead bytes ahead of each pointer.
    for (size_t i = 0; i < kRegions; i++) {
      absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
      // TODO(b/297082454): investigate dropping prefetch on x86.
      absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
#endif
    }

    // Load and store data, computing CRC on the way.
    for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
      // Copy and CRC the data for the CRC regions.
      for (size_t j = 0; j < vec_regions; j++) {
        // Cycle which regions get vector load/store and integer load/store, to
        // engage prefetching logic around vector load/stores and save issue
        // slots by using the integer registers.
        size_t region = (j + i) % kRegions;

        auto* vsrc =
            reinterpret_cast<const V128*>(src_bytes + region_size * region);
        auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);

        // Load and CRC data.
        vec_data[j] = V128_LoadU(vsrc + i);
        crcs[region] = crc32c_t{static_cast<uint32_t>(
            CRC32_u64(static_cast<uint32_t>(crcs[region]),
                      static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
        crcs[region] = crc32c_t{static_cast<uint32_t>(
            CRC32_u64(static_cast<uint32_t>(crcs[region]),
                      static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};

        // Store the data.
        V128_Store(vdst + i, vec_data[j]);
      }

      // Copy and CRC the data for the integer regions.
      for (size_t j = 0; j < int_regions; j++) {
        // Cycle which regions get vector load/store and integer load/store, to
        // engage prefetching logic around vector load/stores and save issue
        // slots by using the integer registers.
        size_t region = (j + vec_regions + i) % kRegions;

        auto* usrc =
            reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
        auto* udst =
            reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);

        for (size_t k = 0; k < kIntLoadsPerVec; k++) {
          size_t data_index = j * kIntLoadsPerVec + k;

          // Load and CRC the data.
          int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
          crcs[region] = crc32c_t{CRC32_u64(static_cast<uint32_t>(crcs[region]),
                                            int_data[data_index])};

          // Store the data.
          *(udst + i * kIntLoadsPerVec + k) = int_data[data_index];
        }
      }
    }

    // Increment pointers
    src_bytes += kBlockSize * kBlocksPerCacheLine;
    dst_bytes += kBlockSize * kBlocksPerCacheLine;
    copy_rounds -= kBlocksPerCacheLine;
  }
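
  // Continuing the worked example above (assuming ABSL_CACHELINE_SIZE == 64,
  // so kBlocksPerCacheLine == 4): the main loop runs while copy_rounds > 4,
  // taking copy_rounds from 21 down to 1, so LargeTailCopy below finishes the
  // last 16-byte block of each region. With kRegions == 3 and vec_regions ==
  // 1, the (j + i) % kRegions rotation means block i == 0 sends region 0
  // through the vector path and regions 1-2 through the integer path, block
  // i == 1 sends region 1 through the vector path, and so on.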

  // Copy and CRC the tails of each region.
  LargeTailCopy<vec_regions, int_regions>(crcs, &dst_bytes, &src_bytes,
                                          region_size, copy_rounds);

  // Move the source and destination pointers to the end of the region
  src_bytes += region_size * (kRegions - 1);
  dst_bytes += region_size * (kRegions - 1);

  // Copy and CRC the tail through the XMM registers.
  std::size_t tail_blocks = tail_size / kBlockSize;
  LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
                      tail_blocks);

  // Final tail copy for under 16 bytes.
  crcs[kRegions - 1] =
      ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
                   crcs[kRegions - 1]);

  if (kRegions == 1) {
    // If there is only one region, finalize and return its CRC.
    return crc32c_t{static_cast<uint32_t>(crcs[0]) ^ kCrcDataXor};
  }

  // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
  // XOR done before doing block copy + CRCs.
  for (size_t i = 0; i + 1 < kRegions; i++) {
    crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
  }

  // Build a CRC of the first kRegions - 1 regions.
  crc32c_t full_crc = crcs[0];
  for (size_t i = 1; i + 1 < kRegions; i++) {
    full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
  }

  // Finalize and concatenate the final CRC, then return.
  crcs[kRegions - 1] =
      crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
  return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
}
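
// The ConcatCrc32c() calls above rely on the public absl/crc/crc32c.h
// contract: given crc_a over buffer A and crc_b over buffer B of b_len bytes,
// ConcatCrc32c(crc_a, crc_b, b_len) equals the CRC of A and B concatenated.
// A minimal illustration (the b_len name is ours, not the library's):
//
//   crc32c_t hello = ConcatCrc32c(ComputeCrc32c("he"), ComputeCrc32c("llo"),
//                                 /*b_len=*/3);
//   assert(hello == ComputeCrc32c("hello"));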

CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
#ifdef UNDEFINED_BEHAVIOR_SANITIZER
  // UBSAN does not play nicely with unaligned loads (which we use a lot).
  // Get the underlying architecture.
  CpuType cpu_type = GetCpuType();
  switch (cpu_type) {
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
    case CpuType::kAmdGenoa:
    case CpuType::kAmdRyzenV3000:
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelSkylake:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelHaswell:
    case CpuType::kIntelIvybridge:
      return {
          /*.temporal=*/new FallbackCrcMemcpyEngine(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
      };
    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
    case CpuType::kIntelSandybridge:
      return {
          /*.temporal=*/new FallbackCrcMemcpyEngine(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(),
      };
    default:
      return {/*.temporal=*/new FallbackCrcMemcpyEngine(),
              /*.non_temporal=*/new FallbackCrcMemcpyEngine()};
  }
#else
  // Get the underlying architecture.
  CpuType cpu_type = GetCpuType();
  switch (cpu_type) {
    // On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store
    // port, which causes data movement from the vector registers to the
    // integer registers (where CRC32C happens) to crowd the same units as
    // vector stores. As a result, using that path exclusively causes
    // bottlenecking on this port. We can avoid this bottleneck by using the
    // integer side of the CPU for most operations rather than the vector
    // side. We keep a vector region to engage some of the prefetching logic
    // in the cache hierarchy which seems to give vector instructions special
    // treatment. These prefetch units see strided access to each region, and
    // do the right thing.
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
    case CpuType::kAmdGenoa:
    case CpuType::kAmdRyzenV3000:
      return {
          /*.temporal=*/new AcceleratedCrcMemcpyEngine<1, 2>(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
      };
    // PCLMULQDQ is slow and we don't have wide enough issue width to take
    // advantage of it. For an unknown architecture, don't risk using CLMULs.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelSkylake:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelHaswell:
    case CpuType::kIntelIvybridge:
      return {
          /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
      };
    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
    case CpuType::kIntelSandybridge:
      return {
          /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(),
      };
    default:
      return {/*.temporal=*/new FallbackCrcMemcpyEngine(),
              /*.non_temporal=*/new FallbackCrcMemcpyEngine()};
  }
#endif  // UNDEFINED_BEHAVIOR_SANITIZER
}

// For testing, allow the user to specify which engine they want.
std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
                                                          int integer) {
  if (vector == 3 && integer == 0) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
  } else if (vector == 1 && integer == 2) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
  } else if (vector == 1 && integer == 0) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 0>>();
  }
  return nullptr;
}

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
        // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
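
// Example (test-only) usage of GetTestEngine() above, sketched with
// caller-provided dst/src/length and under the assumption that callers check
// for nullptr on unsupported configurations:
//
//   std::unique_ptr<CrcMemcpyEngine> engine = CrcMemcpy::GetTestEngine(3, 0);
//   if (engine != nullptr) {
//     crc32c_t crc = engine->Compute(dst, src, length, crc32c_t{0});
//   }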