non_temporal_memcpy.h
// Copyright 2022 The Abseil Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
#define ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_

#ifdef _MSC_VER
#include <intrin.h>
#endif

#if defined(__SSE__) || defined(__AVX__)
// Pulls in both SSE and AVX intrinsics.
#include <immintrin.h>
#endif

#ifdef __aarch64__
// Provides x86-style shims (_mm_stream_si128 etc.) on aarch64.
#include "absl/crc/internal/non_temporal_arm_intrinsics.h"
#endif

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>

#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/optimization.h"

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

// This non-temporal memcpy does regular loads and non-temporal stores. It
// works with both 16-byte aligned and unaligned addresses. If the data at the
// destination is not immediately accessed, using non-temporal memcpy saves one
// DRAM load of the destination cacheline.
constexpr size_t kCacheLineSize = ABSL_CACHELINE_SIZE;

// If the objects overlap, the behavior is undefined. Uses regular memcpy
// instead of non-temporal memcpy if the required CPU intrinsics are
// unavailable at compile time.
inline void *non_temporal_store_memcpy(void *__restrict dst,
                                       const void *__restrict src,
                                       size_t len) {
#if defined(__SSE3__) || defined(__aarch64__) || \
    (defined(_MSC_VER) && defined(__AVX__))
  // This implementation requires SSE3.
  // MSVC cannot target SSE3 directly, but when MSVC targets AVX, SSE3 support
  // is implied.
  uint8_t *d = reinterpret_cast<uint8_t *>(dst);
  const uint8_t *s = reinterpret_cast<const uint8_t *>(src);

  // memcpy() the misaligned header. At the end of this if block, <d> is
  // aligned to a 64-byte cacheline boundary or <len> == 0.
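  // Example: with kCacheLineSize == 64 and <d> ending in 0x07, the
  // computation below copies 64 - 7 = 57 header bytes (or all of <len> if it
  // is smaller), after which <d> is cacheline-aligned.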
  if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
    uintptr_t bytes_before_alignment_boundary =
        kCacheLineSize -
        (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
    // std::min is parenthesized to avoid expanding a min() macro (e.g. the
    // one in <windows.h>).
    size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
    assert(bytes_before_alignment_boundary < kCacheLineSize);
    memcpy(d, s, header_len);
    d += header_len;
    s += header_len;
    len -= header_len;
  }

  if (len >= kCacheLineSize) {
    // Order any earlier stores ahead of the non-temporal stores below.
    _mm_sfence();
    __m128i *dst_cacheline = reinterpret_cast<__m128i *>(d);
    const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s);
    constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
    size_t loops = len / kCacheLineSize;

    while (len >= kCacheLineSize) {
      // Load one full cacheline with unaligned SSE3 loads, then write it
      // back with non-temporal stores that bypass the cache hierarchy.
      __m128i temp1, temp2, temp3, temp4;
      temp1 = _mm_lddqu_si128(src_cacheline + 0);
      temp2 = _mm_lddqu_si128(src_cacheline + 1);
      temp3 = _mm_lddqu_si128(src_cacheline + 2);
      temp4 = _mm_lddqu_si128(src_cacheline + 3);
      _mm_stream_si128(dst_cacheline + 0, temp1);
      _mm_stream_si128(dst_cacheline + 1, temp2);
      _mm_stream_si128(dst_cacheline + 2, temp3);
      _mm_stream_si128(dst_cacheline + 3, temp4);
      src_cacheline += kOpsPerCacheLine;
      dst_cacheline += kOpsPerCacheLine;
      len -= kCacheLineSize;
    }
    d += loops * kCacheLineSize;
    s += loops * kCacheLineSize;
    // Make the streamed stores globally visible before any later stores.
    _mm_sfence();
  }

  // memcpy the tail.
  if (len) {
    memcpy(d, s, len);
  }
  return dst;
#else
  // Fallback to regular memcpy.
  return memcpy(dst, src, len);
#endif  // __SSE3__ || __aarch64__ || (_MSC_VER && __AVX__)
}

// We try to force non_temporal_store_memcpy_avx to use AVX instructions
// so that we can select it at runtime when AVX is available.
// Clang on Windows has gnu::target but does not make AVX types like __m256i
// available when forcing AVX codegen for specific functions.
#if ABSL_HAVE_CPP_ATTRIBUTE(gnu::target) && !defined(_MSC_VER) && \
    (defined(__x86_64__) || defined(__i386__))
#define ABSL_INTERNAL_CAN_FORCE_AVX 1
#endif

// If the objects overlap, the behavior is undefined. Uses regular memcpy
// instead of non-temporal memcpy if the required CPU intrinsics are
// unavailable at compile time.
#ifdef ABSL_INTERNAL_CAN_FORCE_AVX
[[gnu::target("avx")]]
#endif
inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
                                           const void *__restrict src,
                                           size_t len) {
  // This function requires AVX. If possible we compile it with AVX even if
  // the translation unit isn't built with AVX support. This works because we
  // only select this implementation at runtime if the CPU supports AVX.
  // MSVC AVX support implies SSE3 support.
#if ((defined(__AVX__) || defined(ABSL_INTERNAL_CAN_FORCE_AVX)) && \
     defined(__SSE3__)) ||                                          \
    (defined(_MSC_VER) && defined(__AVX__))
  uint8_t *d = reinterpret_cast<uint8_t *>(dst);
  const uint8_t *s = reinterpret_cast<const uint8_t *>(src);

  // memcpy() the misaligned header. At the end of this if block, <d> is
  // aligned to a 64-byte cacheline boundary or <len> == 0.
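  // The header and tail handling below mirror the SSE3 variant above; only
  // the bulk loop differs, moving each cacheline with two 32-byte AVX
  // operations instead of four 16-byte SSE operations.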
  if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
    uintptr_t bytes_before_alignment_boundary =
        kCacheLineSize -
        (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
    size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
    assert(bytes_before_alignment_boundary < kCacheLineSize);
    memcpy(d, s, header_len);
    d += header_len;
    s += header_len;
    len -= header_len;
  }

  if (len >= kCacheLineSize) {
    _mm_sfence();
    __m256i *dst_cacheline = reinterpret_cast<__m256i *>(d);
    const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s);
    constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i);
    size_t loops = len / kCacheLineSize;

    while (len >= kCacheLineSize) {
      __m256i temp1, temp2;
      temp1 = _mm256_lddqu_si256(src_cacheline + 0);
      temp2 = _mm256_lddqu_si256(src_cacheline + 1);
      _mm256_stream_si256(dst_cacheline + 0, temp1);
      _mm256_stream_si256(dst_cacheline + 1, temp2);
      src_cacheline += kOpsPerCacheLine;
      dst_cacheline += kOpsPerCacheLine;
      len -= kCacheLineSize;
    }
    d += loops * kCacheLineSize;
    s += loops * kCacheLineSize;
    _mm_sfence();
  }

  // memcpy the tail.
  if (len) {
    memcpy(d, s, len);
  }
  return dst;
#else
  // Fallback to regular memcpy so that this function compiles.
  return memcpy(dst, src, len);
#endif
}

#undef ABSL_INTERNAL_CAN_FORCE_AVX

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
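A minimal usage sketch, not part of the header above (the file name example.cc and the buffer size are illustrative). Note that these functions live in an internal namespace and are not part of Abseil's public API. The sketch copies a 1 MiB buffer through the portable entry point, which streams past the cache when SSE3 or aarch64 intrinsics are available and otherwise falls back to plain memcpy:

// example.cc (hypothetical)
#include <cstdint>
#include <vector>

#include "absl/crc/internal/non_temporal_memcpy.h"

int main() {
  std::vector<uint8_t> src(1 << 20, 0xAB);  // 1 MiB source buffer.
  std::vector<uint8_t> dst(src.size());
  // The bulk of the copy uses non-temporal stores, so the destination
  // cachelines are not pulled into the writing core's cache.
  absl::crc_internal::non_temporal_store_memcpy(dst.data(), src.data(),
                                                src.size());
  return 0;
}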