prefetch.h (7210B)
1 // Copyright 2023 The Abseil Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // ----------------------------------------------------------------------------- 16 // File: prefetch.h 17 // ----------------------------------------------------------------------------- 18 // 19 // This header file defines prefetch functions to prefetch memory contents 20 // into the first level cache (L1) for the current CPU. The prefetch logic 21 // offered in this header is limited to prefetching first level cachelines 22 // only, and is aimed at relatively 'simple' prefetching logic. 23 // 24 #ifndef ABSL_BASE_PREFETCH_H_ 25 #define ABSL_BASE_PREFETCH_H_ 26 27 #include "absl/base/attributes.h" 28 #include "absl/base/config.h" 29 30 #if defined(ABSL_INTERNAL_HAVE_SSE) 31 #include <xmmintrin.h> 32 #endif 33 34 #if defined(_MSC_VER) 35 #include <intrin.h> 36 #if defined(ABSL_INTERNAL_HAVE_SSE) 37 #pragma intrinsic(_mm_prefetch) 38 #endif 39 #endif 40 41 namespace absl { 42 ABSL_NAMESPACE_BEGIN 43 44 // Moves data into the L1 cache before it is read, or "prefetches" it. 45 // 46 // The value of `addr` is the address of the memory to prefetch. If 47 // the target and compiler support it, data prefetch instructions are 48 // generated. If the prefetch is done some time before the memory is 49 // read, it may be in the cache by the time the read occurs. 50 // 51 // This method prefetches data with the highest degree of temporal locality; 52 // data is prefetched where possible into all levels of the cache. 53 // 54 // Incorrect or gratuitous use of this function can degrade performance. 55 // Use this function only when representative benchmarks show an improvement. 56 // 57 // Example: 58 // 59 // // Computes incremental checksum for `data`. 60 // int ComputeChecksum(int sum, absl::string_view data); 61 // 62 // // Computes cumulative checksum for all values in `data` 63 // int ComputeChecksum(absl::Span<const std::string> data) { 64 // int sum = 0; 65 // auto it = data.begin(); 66 // auto pit = data.begin(); 67 // auto end = data.end(); 68 // for (int dist = 8; dist > 0 && pit != data.end(); --dist, ++pit) { 69 // absl::PrefetchToLocalCache(pit->data()); 70 // } 71 // for (; pit != end; ++pit, ++it) { 72 // sum = ComputeChecksum(sum, *it); 73 // absl::PrefetchToLocalCache(pit->data()); 74 // } 75 // for (; it != end; ++it) { 76 // sum = ComputeChecksum(sum, *it); 77 // } 78 // return sum; 79 // } 80 // 81 void PrefetchToLocalCache(const void* addr); 82 83 // Moves data into the L1 cache before it is read, or "prefetches" it. 84 // 85 // This function is identical to `PrefetchToLocalCache()` except that it has 86 // non-temporal locality: the fetched data should not be left in any of the 87 // cache tiers. This is useful for cases where the data is used only once / 88 // short term, for example, invoking a destructor on an object. 89 // 90 // Incorrect or gratuitous use of this function can degrade performance. 91 // Use this function only when representative benchmarks show an improvement. 92 // 93 // Example: 94 // 95 // template <typename Iterator> 96 // void DestroyPointers(Iterator begin, Iterator end) { 97 // size_t distance = std::min(8U, bars.size()); 98 // 99 // int dist = 8; 100 // auto prefetch_it = begin; 101 // while (prefetch_it != end && --dist;) { 102 // absl::PrefetchToLocalCacheNta(*prefetch_it++); 103 // } 104 // while (prefetch_it != end) { 105 // delete *begin++; 106 // absl::PrefetchToLocalCacheNta(*prefetch_it++); 107 // } 108 // while (begin != end) { 109 // delete *begin++; 110 // } 111 // } 112 // 113 void PrefetchToLocalCacheNta(const void* addr); 114 115 // Moves data into the L1 cache with the intent to modify it. 116 // 117 // This function is similar to `PrefetchToLocalCache()` except that it 118 // prefetches cachelines with an 'intent to modify' This typically includes 119 // invalidating cache entries for this address in all other cache tiers, and an 120 // exclusive access intent. 121 // 122 // Incorrect or gratuitous use of this function can degrade performance. As this 123 // function can invalidate cached cachelines on other caches and computer cores, 124 // incorrect usage of this function can have an even greater negative impact 125 // than incorrect regular prefetches. 126 // Use this function only when representative benchmarks show an improvement. 127 // 128 // Example: 129 // 130 // void* Arena::Allocate(size_t size) { 131 // void* ptr = AllocateBlock(size); 132 // absl::PrefetchToLocalCacheForWrite(ptr); 133 // return ptr; 134 // } 135 // 136 void PrefetchToLocalCacheForWrite(const void* addr); 137 138 #if ABSL_HAVE_BUILTIN(__builtin_prefetch) || defined(__GNUC__) 139 140 #define ABSL_HAVE_PREFETCH 1 141 142 // See __builtin_prefetch: 143 // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html. 144 // 145 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache( 146 const void* addr) { 147 __builtin_prefetch(addr, 0, 3); 148 } 149 150 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta( 151 const void* addr) { 152 __builtin_prefetch(addr, 0, 0); 153 } 154 155 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite( 156 const void* addr) { 157 // [x86] gcc/clang don't generate PREFETCHW for __builtin_prefetch(.., 1) 158 // unless -march=broadwell or newer; this is not generally the default, so we 159 // manually emit prefetchw. PREFETCHW is recognized as a no-op on older Intel 160 // processors and has been present on AMD processors since the K6-2. 161 #if defined(__x86_64__) && !defined(__PRFCHW__) 162 asm("prefetchw %0" : : "m"(*reinterpret_cast<const char*>(addr))); 163 #else 164 __builtin_prefetch(addr, 1, 3); 165 #endif 166 } 167 168 #elif defined(ABSL_INTERNAL_HAVE_SSE) 169 170 #define ABSL_HAVE_PREFETCH 1 171 172 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache( 173 const void* addr) { 174 _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0); 175 } 176 177 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta( 178 const void* addr) { 179 _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_NTA); 180 } 181 182 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite( 183 const void* addr) { 184 #if defined(_MM_HINT_ET0) 185 _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_ET0); 186 #elif !defined(_MSC_VER) && defined(__x86_64__) 187 // _MM_HINT_ET0 is not universally supported. As we commented further 188 // up, PREFETCHW is recognized as a no-op on older Intel processors 189 // and has been present on AMD processors since the K6-2. We have this 190 // disabled for MSVC compilers as this miscompiles on older MSVC compilers. 191 asm("prefetchw %0" : : "m"(*reinterpret_cast<const char*>(addr))); 192 #endif 193 } 194 195 #else 196 197 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache( 198 const void* addr) {} 199 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta( 200 const void* addr) {} 201 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite( 202 const void* addr) {} 203 204 #endif 205 206 ABSL_NAMESPACE_END 207 } // namespace absl 208 209 #endif // ABSL_BASE_PREFETCH_H_