// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
#define HIGHWAY_HWY_CACHE_CONTROL_H_

// Cache-control helpers (prefetch, non-temporal stores, fences, flush, pause).
// Each function compiles to the platform intrinsic where available and to a
// no-op (or plain copy) otherwise, so callers need no #ifdefs of their own.

#include "hwy/aligned_allocator.h"  // HWY_ALIGNMENT
#include "hwy/base.h"

// Requires SSE2; fails to compile on 32-bit Clang 7 (see
// https://github.com/gperftools/gperftools/issues/946).
// When either condition holds, force-define HWY_DISABLE_CACHE_CONTROL so the
// functions below fall back to their portable no-op/copy implementations.
#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
#undef HWY_DISABLE_CACHE_CONTROL
#define HWY_DISABLE_CACHE_CONTROL
#endif

#ifndef HWY_DISABLE_CACHE_CONTROL
// intrin.h is sufficient on MSVC and already included by base.h.
#if HWY_ARCH_X86 && !HWY_COMPILER_MSVC
#include <emmintrin.h>  // SSE2
#include <xmmintrin.h>  // _mm_prefetch
#elif HWY_ARCH_ARM_A64
#include <arm_acle.h>  // __yield
#endif
#endif  // HWY_DISABLE_CACHE_CONTROL

namespace hwy {

// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
#define HWY_STREAM_MULTIPLE 16

// The following functions may also require an attribute: on x86 (non-MSVC)
// without HWY_DISABLE_CACHE_CONTROL, they must be compiled with SSE2 enabled
// so the intrinsics are available even in otherwise non-SSE2 builds.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
#define HWY_ATTR_CACHE __attribute__((target("sse2")))
#else
#define HWY_ATTR_CACHE
#endif

// Windows.h #defines this, which causes infinite recursion. Temporarily
// undefine to avoid conflict with our function.
// TODO(janwas): remove when this function is removed.
#pragma push_macro("LoadFence")
#undef LoadFence

// Delays subsequent loads until prior loads are visible. Beware of potentially
// differing behavior across architectures and vendors: on Intel but not
// AMD CPUs, also serves as a full fence (waits for all prior instructions to
// complete). No-op unless x86 cache control is enabled.
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_lfence();
#endif
}

// TODO(janwas): remove when this function is removed. (See above.)
#pragma pop_macro("LoadFence")

// Overwrites "to" while attempting to bypass the cache (read-for-ownership).
// Both pointers must be aligned. Copies exactly HWY_ALIGNMENT bytes
// (presumably one cache line — confirm HWY_ALIGNMENT matches the line size on
// the target). Only the Clang path actually uses non-temporal stores; other
// compilers fall back to a normal copy that does NOT bypass the cache.
static HWY_INLINE void StreamCacheLine(const uint64_t* HWY_RESTRICT from,
                                       uint64_t* HWY_RESTRICT to) {
  HWY_DASSERT(IsAligned(from));
  HWY_DASSERT(IsAligned(to));
#if HWY_COMPILER_CLANG && !defined(HWY_DISABLE_CACHE_CONTROL)
  // One non-temporal store per uint64_t covering HWY_ALIGNMENT bytes total.
  for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); ++i) {
    __builtin_nontemporal_store(from[i], to + i);
  }
#else
  hwy::CopyBytes(from, to, HWY_ALIGNMENT);
#endif
}

// Ensures values written by previous `Stream` calls are visible on the current
// core. This is NOT sufficient for synchronizing across cores; when `Stream`
// outputs are to be consumed by other core(s), the producer must publish
// availability (e.g. via mutex or atomic_flag) after `FlushStream`.
// No-op unless x86 cache control is enabled.
HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_sfence();
#endif
}

// Optionally begins loading the cache line containing "p" to reduce latency of
// subsequent actual loads. A hint only; may be a no-op on some targets.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
  (void)p;  // unused when all branches below are disabled
#ifndef HWY_DISABLE_CACHE_CONTROL
// Use _mm_prefetch on x86/x64, except when clang-cl is compiled with -mno-mmx.
#if HWY_ARCH_X86 && !(HWY_COMPILER_CLANGCL && !defined(__MMX__))
  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL  // includes clang
  // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
  // desirable, so use the default 3 (keep in caches).
  __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
#endif
#endif  // HWY_DISABLE_CACHE_CONTROL
}

// Invalidates and flushes the cache line containing "p", if possible.
// No-op (aside from evaluating "p") on non-x86 or with cache control disabled.
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_clflush(p);
#else
  (void)p;
#endif
}

// Hints that we are inside a spin loop and potentially reduces power
// consumption and coherency traffic. For example, x86 avoids multiple
// outstanding load requests, which reduces the memory order violation penalty
// when exiting the loop. No-op on platforms without a matching instruction.
HWY_INLINE HWY_ATTR_CACHE void Pause() {
#ifndef HWY_DISABLE_CACHE_CONTROL
#if HWY_ARCH_X86
  _mm_pause();
#elif HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG
  // This is documented in ACLE and the YIELD instruction is also available in
  // Armv7, but the intrinsic is broken for Armv7 clang, hence A64 only.
  __yield();
#elif HWY_ARCH_ARM && HWY_COMPILER_GCC  // includes clang
  __asm__ volatile("yield" ::: "memory");
#elif HWY_ARCH_PPC && HWY_COMPILER_GCC  // includes clang
  // "or 27,27,27" is the PPC low-priority / yield hint encoding.
  __asm__ volatile("or 27,27,27" ::: "memory");
#endif
#endif  // HWY_DISABLE_CACHE_CONTROL
}

}  // namespace hwy

#endif  // HIGHWAY_HWY_CACHE_CONTROL_H_