tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cache_control.h (5222B)


      1 // Copyright 2020 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
     17 #define HIGHWAY_HWY_CACHE_CONTROL_H_
     18 
     19 #include "hwy/aligned_allocator.h"  // HWY_ALIGNMENT
     20 #include "hwy/base.h"
     21 
     // Decides whether the cache-control intrinsics can be used and pulls in the
     // headers that declare them.
     //
     // On x86 the intrinsics require SSE2. The SSE2 test must be limited to x86:
     // other architectures never define __SSE2__, so testing it unconditionally
     // would also disable the Arm/PPC paths of Prefetch() and Pause() below.
     // MSVC does not define __SSE2__ either: x64 always provides SSE2 and 32-bit
     // builds report the enabled level via _M_IX86_FP (2 or higher means SSE2).
     // Cache control is also disabled for 32-bit x86 Clang, where it fails to
     // compile with Clang 7 (see
     // https://github.com/gperftools/gperftools/issues/946).
     #if HWY_ARCH_X86 &&                                          \
         ((!HWY_COMPILER_MSVC && !defined(__SSE2__)) ||           \
          (HWY_COMPILER_MSVC && HWY_ARCH_X86_32 &&                \
           (!defined(_M_IX86_FP) || _M_IX86_FP < 2)) ||           \
          (HWY_COMPILER_CLANG && HWY_ARCH_X86_32))
     // The #undef avoids a redefinition warning if the user already defined it.
     #undef HWY_DISABLE_CACHE_CONTROL
     #define HWY_DISABLE_CACHE_CONTROL
     #endif

     #ifndef HWY_DISABLE_CACHE_CONTROL
     // intrin.h is sufficient on MSVC and already included by base.h.
     #if HWY_ARCH_X86 && !HWY_COMPILER_MSVC
     #include <emmintrin.h>  // SSE2
     #include <xmmintrin.h>  // _mm_prefetch
     #elif HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG
     // Only required for __yield() in Pause(), which is only used with Clang.
     #include <arm_acle.h>
     #endif
     #endif  // HWY_DISABLE_CACHE_CONTROL
     38 
     39 namespace hwy {
     40 
     // Stream may write a multiple of this size, even if N*sizeof(T) is smaller.
     #define HWY_STREAM_MULTIPLE 16

     // Function attribute for the cache-control functions below. It expands to a
     // target("sse2") attribute only when the x86 intrinsics are actually used by
     // a non-MSVC compiler; otherwise it expands to nothing.
     #if !HWY_ARCH_X86 || defined(HWY_DISABLE_CACHE_CONTROL) || HWY_COMPILER_MSVC
     #define HWY_ATTR_CACHE
     #else
     #define HWY_ATTR_CACHE __attribute__((target("sse2")))
     #endif
     50 
     51 // Windows.h #defines this, which causes infinite recursion. Temporarily
     52 // undefine to avoid conflict with our function.
     53 // TODO(janwas): remove when this function is removed.
     54 #pragma push_macro("LoadFence")
     55 #undef LoadFence
     56 
     57 // Delays subsequent loads until prior loads are visible. Beware of potentially
     58 // differing behavior across architectures and vendors: on Intel but not
     59 // AMD CPUs, also serves as a full fence (waits for all prior instructions to
     60 // complete).
     61 HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
     62 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
     63  _mm_lfence();
     64 #endif
     65 }
     66 
     67 // TODO(janwas): remove when this function is removed. (See above.)
     68 #pragma pop_macro("LoadFence")
     69 
     70 // Overwrites "to" while attempting to bypass the cache (read-for-ownership).
     71 // Both pointers must be aligned.
     72 static HWY_INLINE void StreamCacheLine(const uint64_t* HWY_RESTRICT from,
     73                                       uint64_t* HWY_RESTRICT to) {
     74  HWY_DASSERT(IsAligned(from));
     75  HWY_DASSERT(IsAligned(to));
     76 #if HWY_COMPILER_CLANG && !defined(HWY_DISABLE_CACHE_CONTROL)
     77  for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); ++i) {
     78    __builtin_nontemporal_store(from[i], to + i);
     79  }
     80 #else
     81  hwy::CopyBytes(from, to, HWY_ALIGNMENT);
     82 #endif
     83 }
     84 
     85 // Ensures values written by previous `Stream` calls are visible on the current
     86 // core. This is NOT sufficient for synchronizing across cores; when `Stream`
     87 // outputs are to be consumed by other core(s), the producer must publish
     88 // availability (e.g. via mutex or atomic_flag) after `FlushStream`.
     89 HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
     90 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
     91  _mm_sfence();
     92 #endif
     93 }
     94 
     95 // Optionally begins loading the cache line containing "p" to reduce latency of
     96 // subsequent actual loads.
     97 template <typename T>
     98 HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
     99  (void)p;
    100 #ifndef HWY_DISABLE_CACHE_CONTROL
    101 // Use _mm_prefetch on x86/x64, except when clang-cl is compiled with -mno-mmx.
    102 #if HWY_ARCH_X86 && !(HWY_COMPILER_CLANGCL && !defined(__MMX__))
    103  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
    104 #elif HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL  // includes clang
    105  // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
    106  // desirable, so use the default 3 (keep in caches).
    107  __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
    108 #endif
    109 #endif  //  HWY_DISABLE_CACHE_CONTROL
    110 }
    111 
    112 // Invalidates and flushes the cache line containing "p", if possible.
    113 HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
    114 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
    115  _mm_clflush(p);
    116 #else
    117  (void)p;
    118 #endif
    119 }
    120 
    121 // Hints that we are inside a spin loop and potentially reduces power
    122 // consumption and coherency traffic. For example, x86 avoids multiple
    123 // outstanding load requests, which reduces the memory order violation penalty
    124 // when exiting the loop.
    125 HWY_INLINE HWY_ATTR_CACHE void Pause() {
    126 #ifndef HWY_DISABLE_CACHE_CONTROL
    127 #if HWY_ARCH_X86
    128  _mm_pause();
    129 #elif HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG
    130  // This is documented in ACLE and the YIELD instruction is also available in
    131  // Armv7, but the intrinsic is broken for Armv7 clang, hence A64 only.
    132  __yield();
    133 #elif HWY_ARCH_ARM && HWY_COMPILER_GCC  // includes clang
    134  __asm__ volatile("yield" ::: "memory");
    135 #elif HWY_ARCH_PPC && HWY_COMPILER_GCC  // includes clang
    136  __asm__ volatile("or 27,27,27" ::: "memory");
    137 #endif
    138 #endif  // HWY_DISABLE_CACHE_CONTROL
    139 }
    140 
    141 }  // namespace hwy
    142 
    143 #endif  // HIGHWAY_HWY_CACHE_CONTROL_H_