crc_memcpy_x86_arm_combined.cc
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Simultaneous memcopy and CRC-32C for x86-64 and ARM 64. Uses integer
// registers because XMM registers do not support the CRC instruction (yet).
// While copying, compute the running CRC of the data being copied.
//
// It is assumed that any CPU running this code has SSE4.2 instructions
// available (for CRC32C). This file will do nothing if that is not true.
//
// The CRC instruction has a 3-cycle latency, and we are stressing the ALU
// ports here (unlike a traditional memcopy, which has almost no ALU use), so
// we will need to copy in such a way that the CRC unit is used efficiently.
// We have two regimes in this code:
//  1. For operations of size < kCrcSmallSize, do the CRC then the memcpy
//  2. For operations of size > kCrcSmallSize:
//      a) compute an initial CRC + copy on a small amount of data to align the
//         destination pointer on a 16-byte boundary.
//      b) Split the data into 3 main regions and a tail (smaller than 48
//         bytes)
//      c) Do the copy and CRC of the 3 main regions, interleaving (start with
//         full cache line copies for each region, then move to single 16 byte
//         pieces per region).
//      d) Combine the CRCs with CRC32C::Concat.
//      e) Copy the tail and extend the CRC with the CRC of the tail.
// This method is not ideal for op sizes between ~1k and ~8k because CRC::Concat
// takes a significant amount of time. A medium-sized approach could be added
// using 3 CRCs over fixed-size blocks where the zero-extensions required for
// CRC32C::Concat can be precomputed.
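
// For reference, regime 1 is behaviorally identical to the naive sequence
// sketched below, and the large-size regime must produce the same CRC, just
// faster. This is a minimal sketch against the public absl/crc/crc32c.h API
// (the function name is illustrative, not part of this file):
//
//   crc32c_t ReferenceCrcMemcpy(void* dst, const void* src, size_t length,
//                               crc32c_t initial_crc) {
//     crc32c_t crc = ExtendCrc32c(
//         initial_crc,
//         absl::string_view(static_cast<const char*>(src), length));
//     memcpy(dst, src, length);
//     return crc;
//   }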

#ifdef __SSE4_2__
#include <immintrin.h>
#endif

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>

#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/optimization.h"
#include "absl/base/prefetch.h"
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_memcpy.h"
#include "absl/strings/string_view.h"

#if defined(ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE) || \
    defined(ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE)

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

namespace {

inline crc32c_t ShortCrcCopy(char* dst, const char* src, std::size_t length,
                             crc32c_t crc) {
  // Small copy: just go 1 byte at a time: being nice to the branch predictor
  // is more important here than anything else
  uint32_t crc_uint32 = static_cast<uint32_t>(crc);
  for (std::size_t i = 0; i < length; i++) {
    uint8_t data = *reinterpret_cast<const uint8_t*>(src);
    crc_uint32 = CRC32_u8(crc_uint32, data);
    *reinterpret_cast<uint8_t*>(dst) = data;
    ++src;
    ++dst;
  }
  return crc32c_t{crc_uint32};
}

constexpr size_t kIntLoadsPerVec = sizeof(V128) / sizeof(uint64_t);

// Common function for copying the tails of multiple large regions.
// Disable ubsan for benign unaligned access. See b/254108538.
template <size_t vec_regions, size_t int_regions>
ABSL_ATTRIBUTE_NO_SANITIZE_UNDEFINED inline void LargeTailCopy(
    crc32c_t* crcs, char** dst, const char** src, size_t region_size,
    size_t copy_rounds) {
  std::array<V128, vec_regions> data;
  std::array<uint64_t, kIntLoadsPerVec * int_regions> int_data;

  while (copy_rounds > 0) {
    for (size_t i = 0; i < vec_regions; i++) {
      size_t region = i;

      auto* vsrc = reinterpret_cast<const V128*>(*src + region_size * region);
      auto* vdst = reinterpret_cast<V128*>(*dst + region_size * region);

      // Load the blocks, unaligned
      data[i] = V128_LoadU(vsrc);

      // Store the blocks, aligned
      V128_Store(vdst, data[i]);

      // Compute the running CRC
      crcs[region] = crc32c_t{static_cast<uint32_t>(
          CRC32_u64(static_cast<uint32_t>(crcs[region]),
                    static_cast<uint64_t>(V128_Extract64<0>(data[i]))))};
      crcs[region] = crc32c_t{static_cast<uint32_t>(
          CRC32_u64(static_cast<uint32_t>(crcs[region]),
                    static_cast<uint64_t>(V128_Extract64<1>(data[i]))))};
    }

    for (size_t i = 0; i < int_regions; i++) {
      size_t region = vec_regions + i;

      auto* usrc =
          reinterpret_cast<const uint64_t*>(*src + region_size * region);
      auto* udst = reinterpret_cast<uint64_t*>(*dst + region_size * region);

      for (size_t j = 0; j < kIntLoadsPerVec; j++) {
        size_t data_index = i * kIntLoadsPerVec + j;

        int_data[data_index] = *(usrc + j);
        crcs[region] = crc32c_t{CRC32_u64(static_cast<uint32_t>(crcs[region]),
                                          int_data[data_index])};

        *(udst + j) = int_data[data_index];
      }
    }

    // Increment pointers
    *src += sizeof(V128);
    *dst += sizeof(V128);
    --copy_rounds;
  }
}

}  // namespace
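
// For intuition, the integer-region kernel in LargeTailCopy above is
// equivalent to this minimal single-region scalar sketch (a reference
// restatement for one region, not code used by this file):
//
//   while (copy_rounds-- > 0) {
//     for (size_t j = 0; j < kIntLoadsPerVec; j++) {
//       uint64_t word = *(reinterpret_cast<const uint64_t*>(src) + j);
//       crc = crc32c_t{CRC32_u64(static_cast<uint32_t>(crc), word)};
//       *(reinterpret_cast<uint64_t*>(dst) + j) = word;
//     }
//     src += sizeof(V128);
//     dst += sizeof(V128);
//   }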

template <size_t vec_regions, size_t int_regions>
class AcceleratedCrcMemcpyEngine : public CrcMemcpyEngine {
 public:
  AcceleratedCrcMemcpyEngine() = default;
  AcceleratedCrcMemcpyEngine(const AcceleratedCrcMemcpyEngine&) = delete;
  AcceleratedCrcMemcpyEngine operator=(const AcceleratedCrcMemcpyEngine&) =
      delete;

  crc32c_t Compute(void* __restrict dst, const void* __restrict src,
                   std::size_t length, crc32c_t initial_crc) const override;
};

// Disable ubsan for benign unaligned access. See b/254108538.
template <size_t vec_regions, size_t int_regions>
ABSL_ATTRIBUTE_NO_SANITIZE_UNDEFINED crc32c_t
AcceleratedCrcMemcpyEngine<vec_regions, int_regions>::Compute(
    void* __restrict dst, const void* __restrict src, std::size_t length,
    crc32c_t initial_crc) const {
  constexpr std::size_t kRegions = vec_regions + int_regions;
  static_assert(kRegions > 0, "Must specify at least one region.");
  constexpr uint32_t kCrcDataXor = uint32_t{0xffffffff};
  constexpr std::size_t kBlockSize = sizeof(V128);
  constexpr std::size_t kCopyRoundSize = kRegions * kBlockSize;

  // Number of blocks per cacheline.
  constexpr std::size_t kBlocksPerCacheLine = ABSL_CACHELINE_SIZE / kBlockSize;

  char* dst_bytes = static_cast<char*>(dst);
  const char* src_bytes = static_cast<const char*>(src);

  // Make sure that one prefetch per big block is enough to cover the whole
  // dataset, and we don't prefetch too much.
  static_assert(ABSL_CACHELINE_SIZE % kBlockSize == 0,
                "Cache lines are not divided evenly into blocks, may have "
                "unintended behavior!");

  // Experimentally-determined boundary between a small and large copy.
  // Below this number, spin-up and concatenation of CRCs takes enough time
  // that it kills the throughput gains of using 3 regions and wide vectors.
  constexpr size_t kCrcSmallSize = 256;

  // Experimentally-determined prefetch distance. Main loop copies will
  // prefetch data 2 cache lines ahead.
  constexpr std::size_t kPrefetchAhead = 2 * ABSL_CACHELINE_SIZE;

  // Small-size CRC-memcpy : just do CRC + memcpy
  if (length < kCrcSmallSize) {
    crc32c_t crc =
        ExtendCrc32c(initial_crc, absl::string_view(src_bytes, length));
    memcpy(dst, src, length);
    return crc;
  }

  // Start work on the CRC: undo the XOR from the previous calculation or set
  // up the initial value of the CRC.
  initial_crc = crc32c_t{static_cast<uint32_t>(initial_crc) ^ kCrcDataXor};

  // Do an initial alignment copy, so we can use aligned store instructions to
  // the destination pointer. We align the destination pointer because the
  // penalty for an unaligned load is small compared to the penalty of an
  // unaligned store on modern CPUs.
  std::size_t bytes_from_last_aligned =
      reinterpret_cast<uintptr_t>(dst) & (kBlockSize - 1);
  if (bytes_from_last_aligned != 0) {
    std::size_t bytes_for_alignment = kBlockSize - bytes_from_last_aligned;

    // Do the short-sized copy and CRC.
    initial_crc =
        ShortCrcCopy(dst_bytes, src_bytes, bytes_for_alignment, initial_crc);
    src_bytes += bytes_for_alignment;
    dst_bytes += bytes_for_alignment;
    length -= bytes_for_alignment;
  }

  // We are going to do the copy and CRC in kRegions regions to make sure that
  // we can saturate the CRC unit. The CRCs will be combined at the end of the
  // run. Copying will use the SSE registers, and we will extract words from
  // the SSE registers to add to the CRC. Initially, we run the loop one full
  // cache line per region at a time, in order to insert prefetches.

  // Initialize CRCs for kRegions regions.
  crc32c_t crcs[kRegions];
  crcs[0] = initial_crc;
  for (size_t i = 1; i < kRegions; i++) {
    crcs[i] = crc32c_t{kCrcDataXor};
  }

  // Find the number of rounds to copy and the region size. Also compute the
  // tail size here.
  size_t copy_rounds = length / kCopyRoundSize;

  // Find the size of each region and the size of the tail.
  const std::size_t region_size = copy_rounds * kBlockSize;
  const std::size_t tail_size = length - (kRegions * region_size);
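
  // Worked example with illustrative numbers (not from the source): if, after
  // the alignment copy above, length == 1018 and kRegions == 3 (so
  // kCopyRoundSize == 48 with 16-byte blocks), then:
  //   copy_rounds = 1018 / 48 = 21
  //   region_size = 21 * 16  = 336 bytes per region
  //   tail_size   = 1018 - 3 * 336 = 10 bytes
  // The tail is always smaller than kCopyRoundSize (48 bytes here).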

  // Holding registers for data in each region.
  std::array<V128, vec_regions> vec_data;
  std::array<uint64_t, int_regions * kIntLoadsPerVec> int_data;

  // Main loop.
  while (copy_rounds > kBlocksPerCacheLine) {
    // Prefetch kPrefetchAhead bytes ahead of each pointer.
    for (size_t i = 0; i < kRegions; i++) {
      absl::PrefetchToLocalCache(src_bytes + kPrefetchAhead + region_size * i);
#ifdef ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE
      // TODO(b/297082454): investigate dropping prefetch on x86.
      absl::PrefetchToLocalCache(dst_bytes + kPrefetchAhead + region_size * i);
#endif
    }

    // Load and store data, computing CRC on the way.
    for (size_t i = 0; i < kBlocksPerCacheLine; i++) {
      // Copy and CRC the data for the CRC regions.
      for (size_t j = 0; j < vec_regions; j++) {
        // Cycle which regions get vector load/store and integer load/store, to
        // engage prefetching logic around vector load/stores and save issue
        // slots by using the integer registers.
        size_t region = (j + i) % kRegions;

        auto* vsrc =
            reinterpret_cast<const V128*>(src_bytes + region_size * region);
        auto* vdst = reinterpret_cast<V128*>(dst_bytes + region_size * region);

        // Load and CRC data.
        vec_data[j] = V128_LoadU(vsrc + i);
        crcs[region] = crc32c_t{static_cast<uint32_t>(
            CRC32_u64(static_cast<uint32_t>(crcs[region]),
                      static_cast<uint64_t>(V128_Extract64<0>(vec_data[j]))))};
        crcs[region] = crc32c_t{static_cast<uint32_t>(
            CRC32_u64(static_cast<uint32_t>(crcs[region]),
                      static_cast<uint64_t>(V128_Extract64<1>(vec_data[j]))))};

        // Store the data.
        V128_Store(vdst + i, vec_data[j]);
      }

      // Copy and CRC the data for the integer regions.
      for (size_t j = 0; j < int_regions; j++) {
        // Cycle which regions get vector load/store and integer load/store, to
        // engage prefetching logic around vector load/stores and save issue
        // slots by using the integer registers.
        size_t region = (j + vec_regions + i) % kRegions;

        auto* usrc =
            reinterpret_cast<const uint64_t*>(src_bytes + region_size * region);
        auto* udst =
            reinterpret_cast<uint64_t*>(dst_bytes + region_size * region);

        for (size_t k = 0; k < kIntLoadsPerVec; k++) {
          size_t data_index = j * kIntLoadsPerVec + k;

          // Load and CRC the data.
          int_data[data_index] = *(usrc + i * kIntLoadsPerVec + k);
          crcs[region] = crc32c_t{CRC32_u64(static_cast<uint32_t>(crcs[region]),
                                            int_data[data_index])};

          // Store the data.
          *(udst + i * kIntLoadsPerVec + k) = int_data[data_index];
        }
      }
    }

    // Increment pointers
    src_bytes += kBlockSize * kBlocksPerCacheLine;
    dst_bytes += kBlockSize * kBlocksPerCacheLine;
    copy_rounds -= kBlocksPerCacheLine;
  }
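
  // Continuing the worked example above (assuming ABSL_CACHELINE_SIZE == 64,
  // so kBlocksPerCacheLine == 4): the main loop runs while copy_rounds > 4,
  // taking copy_rounds from 21 down to 1, so LargeTailCopy below finishes the
  // last 16-byte block of each region. With kRegions == 3 and vec_regions ==
  // 1, the (j + i) % kRegions rotation means block i == 0 sends region 0
  // through the vector path and regions 1-2 through the integer path, block
  // i == 1 sends region 1 through the vector path, and so on.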

  // Copy and CRC the tails of each region.
  LargeTailCopy<vec_regions, int_regions>(crcs, &dst_bytes, &src_bytes,
                                          region_size, copy_rounds);

  // Move the source and destination pointers to the end of the region
  src_bytes += region_size * (kRegions - 1);
  dst_bytes += region_size * (kRegions - 1);

  // Copy and CRC the tail through the XMM registers.
  std::size_t tail_blocks = tail_size / kBlockSize;
  LargeTailCopy<0, 1>(&crcs[kRegions - 1], &dst_bytes, &src_bytes, 0,
                      tail_blocks);

  // Final tail copy for under 16 bytes.
  crcs[kRegions - 1] =
      ShortCrcCopy(dst_bytes, src_bytes, tail_size - tail_blocks * kBlockSize,
                   crcs[kRegions - 1]);

  if (kRegions == 1) {
    // If there is only one region, finalize and return its CRC.
    return crc32c_t{static_cast<uint32_t>(crcs[0]) ^ kCrcDataXor};
  }

  // Finalize the first CRCs: XOR the internal CRCs by the XOR mask to undo the
  // XOR done before doing block copy + CRCs.
  for (size_t i = 0; i + 1 < kRegions; i++) {
    crcs[i] = crc32c_t{static_cast<uint32_t>(crcs[i]) ^ kCrcDataXor};
  }

  // Build a CRC of the first kRegions - 1 regions.
  crc32c_t full_crc = crcs[0];
  for (size_t i = 1; i + 1 < kRegions; i++) {
    full_crc = ConcatCrc32c(full_crc, crcs[i], region_size);
  }

  // Finalize and concatenate the final CRC, then return.
  crcs[kRegions - 1] =
      crc32c_t{static_cast<uint32_t>(crcs[kRegions - 1]) ^ kCrcDataXor};
  return ConcatCrc32c(full_crc, crcs[kRegions - 1], region_size + tail_size);
}
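
// The ConcatCrc32c() calls above rely on the public absl/crc/crc32c.h
// contract: given crc_a over buffer A and crc_b over buffer B of b_len bytes,
// ConcatCrc32c(crc_a, crc_b, b_len) equals the CRC of A and B concatenated.
// A minimal illustration (the b_len name is ours, not the library's):
//
//   crc32c_t hello = ConcatCrc32c(ComputeCrc32c("he"), ComputeCrc32c("llo"),
//                                 /*b_len=*/3);
//   assert(hello == ComputeCrc32c("hello"));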

CrcMemcpy::ArchSpecificEngines CrcMemcpy::GetArchSpecificEngines() {
#ifdef UNDEFINED_BEHAVIOR_SANITIZER
  // UBSAN does not play nicely with unaligned loads (which we use a lot).
  // Get the underlying architecture.
  CpuType cpu_type = GetCpuType();
  switch (cpu_type) {
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
    case CpuType::kAmdGenoa:
    case CpuType::kAmdRyzenV3000:
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelSkylake:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelHaswell:
    case CpuType::kIntelIvybridge:
      return {
          /*.temporal=*/new FallbackCrcMemcpyEngine(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
      };
    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
    case CpuType::kIntelSandybridge:
      return {
          /*.temporal=*/new FallbackCrcMemcpyEngine(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(),
      };
    default:
      return {/*.temporal=*/new FallbackCrcMemcpyEngine(),
              /*.non_temporal=*/new FallbackCrcMemcpyEngine()};
  }
#else
  // Get the underlying architecture.
  CpuType cpu_type = GetCpuType();
  switch (cpu_type) {
    // On Zen 2, PEXTRQ uses 2 micro-ops, including one on the vector store
    // port, which causes data movement from the vector registers to the
    // integer registers (where CRC32C happens) to crowd the same units as
    // vector stores. As a result, using that path exclusively causes
    // bottlenecking on this port. We can avoid this bottleneck by using the
    // integer side of the CPU for most operations rather than the vector
    // side. We keep a vector region to engage some of the prefetching logic
    // in the cache hierarchy which seems to give vector instructions special
    // treatment. These prefetch units see strided access to each region, and
    // do the right thing.
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
    case CpuType::kAmdGenoa:
    case CpuType::kAmdRyzenV3000:
      return {
          /*.temporal=*/new AcceleratedCrcMemcpyEngine<1, 2>(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
      };
    // PCLMULQDQ is slow and we don't have wide enough issue width to take
    // advantage of it. For an unknown architecture, don't risk using CLMULs.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelSkylake:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelHaswell:
    case CpuType::kIntelIvybridge:
      return {
          /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyAVXEngine(),
      };
    // INTEL_SANDYBRIDGE performs better with SSE than AVX.
    case CpuType::kIntelSandybridge:
      return {
          /*.temporal=*/new AcceleratedCrcMemcpyEngine<3, 0>(),
          /*.non_temporal=*/new CrcNonTemporalMemcpyEngine(),
      };
    default:
      return {/*.temporal=*/new FallbackCrcMemcpyEngine(),
              /*.non_temporal=*/new FallbackCrcMemcpyEngine()};
  }
#endif  // UNDEFINED_BEHAVIOR_SANITIZER
}

// For testing, allow the user to specify which engine they want.
std::unique_ptr<CrcMemcpyEngine> CrcMemcpy::GetTestEngine(int vector,
                                                          int integer) {
  if (vector == 3 && integer == 0) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<3, 0>>();
  } else if (vector == 1 && integer == 2) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 2>>();
  } else if (vector == 1 && integer == 0) {
    return std::make_unique<AcceleratedCrcMemcpyEngine<1, 0>>();
  }
  return nullptr;
}

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_INTERNAL_HAVE_X86_64_ACCELERATED_CRC_MEMCPY_ENGINE ||
        // ABSL_INTERNAL_HAVE_ARM_ACCELERATED_CRC_MEMCPY_ENGINE
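
// Example (test-only) usage of GetTestEngine() above, sketched with
// caller-provided dst/src/length and under the assumption that callers check
// for nullptr on unsupported configurations:
//
//   std::unique_ptr<CrcMemcpyEngine> engine = CrcMemcpy::GetTestEngine(3, 0);
//   if (engine != nullptr) {
//     crc32c_t crc = engine->Compute(dst, src, length, crc32c_t{0});
//   }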