tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

x86.h (13532B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_PORTS_X86_H_
     13 #define AOM_AOM_PORTS_X86_H_
     14 #include <stdlib.h>
     15 
     16 #if defined(_MSC_VER)
     17 #include <intrin.h> /* For __cpuidex, __rdtsc */
     18 #endif
     19 
     20 #include "aom/aom_integer.h"
     21 #include "config/aom_config.h"
     22 
     23 #ifdef __cplusplus
     24 extern "C" {
     25 #endif
     26 
     27 typedef enum {
     28  AOM_CPU_UNKNOWN = -1,
     29  AOM_CPU_AMD,
     30  AOM_CPU_AMD_OLD,
     31  AOM_CPU_CENTAUR,
     32  AOM_CPU_CYRIX,
     33  AOM_CPU_INTEL,
     34  AOM_CPU_NEXGEN,
     35  AOM_CPU_NSC,
     36  AOM_CPU_RISE,
     37  AOM_CPU_SIS,
     38  AOM_CPU_TRANSMETA,
     39  AOM_CPU_TRANSMETA_OLD,
     40  AOM_CPU_UMC,
     41  AOM_CPU_VIA,
     42 
     43  AOM_CPU_LAST
     44 } aom_cpu_t;
     45 
     46 #if defined(__GNUC__) || defined(__ANDROID__)
     47 #if AOM_ARCH_X86_64
     48 #define cpuid(func, func2, ax, bx, cx, dx)                      \
     49  __asm__ __volatile__("cpuid           \n\t"                   \
     50                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
     51                       : "a"(func), "c"(func2))
     52 #else
     53 #define cpuid(func, func2, ax, bx, cx, dx)     \
     54  __asm__ __volatile__(                        \
     55      "mov %%ebx, %%edi   \n\t"                \
     56      "cpuid              \n\t"                \
     57      "xchg %%edi, %%ebx  \n\t"                \
     58      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
     59      : "a"(func), "c"(func2))
     60 #endif
     61 #elif defined(__SUNPRO_C) || \
     62    defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
     63 #if AOM_ARCH_X86_64
     64 #define cpuid(func, func2, ax, bx, cx, dx)     \
     65  asm volatile(                                \
     66      "xchg %rsi, %rbx \n\t"                   \
     67      "cpuid           \n\t"                   \
     68      "movl %ebx, %edi \n\t"                   \
     69      "xchg %rsi, %rbx \n\t"                   \
     70      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
     71      : "a"(func), "c"(func2))
     72 #else
     73 #define cpuid(func, func2, ax, bx, cx, dx)     \
     74  asm volatile(                                \
     75      "pushl %ebx       \n\t"                  \
     76      "cpuid            \n\t"                  \
     77      "movl %ebx, %edi  \n\t"                  \
     78      "popl %ebx        \n\t"                  \
     79      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
     80      : "a"(func), "c"(func2))
     81 #endif
     82 #else /* end __SUNPRO__ */
     83 #if AOM_ARCH_X86_64
     84 #if defined(_MSC_VER) && _MSC_VER > 1500
     85 #define cpuid(func, func2, a, b, c, d) \
     86  do {                                 \
     87    int regs[4];                       \
     88    __cpuidex(regs, func, func2);      \
     89    a = regs[0];                       \
     90    b = regs[1];                       \
     91    c = regs[2];                       \
     92    d = regs[3];                       \
     93  } while (0)
     94 #else
     95 #define cpuid(func, func2, a, b, c, d) \
     96  do {                                 \
     97    int regs[4];                       \
     98    __cpuid(regs, func);               \
     99    a = regs[0];                       \
    100    b = regs[1];                       \
    101    c = regs[2];                       \
    102    d = regs[3];                       \
    103  } while (0)
    104 #endif
    105 #else
    106 /* clang-format off */
    107 #define cpuid(func, func2, a, b, c, d) \
    108  __asm mov eax, func                  \
    109  __asm mov ecx, func2                 \
    110  __asm cpuid                          \
    111  __asm mov a, eax                     \
    112  __asm mov b, ebx                     \
    113  __asm mov c, ecx                     \
    114  __asm mov d, edx
    115 #endif
    116 /* clang-format on */
    117 #endif /* end others */
    118 
    119 // NaCl has no support for xgetbv or the raw opcode.
    120 #if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
    121 static inline uint64_t xgetbv(void) {
    122  const uint32_t ecx = 0;
    123  uint32_t eax, edx;
    124  // Use the raw opcode for xgetbv for compatibility with older toolchains.
    125  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
    126                   : "=a"(eax), "=d"(edx)
    127                   : "c"(ecx));
    128  return ((uint64_t)edx << 32) | eax;
    129 }
    130 #elif (defined(_M_X64) || defined(_M_IX86)) && defined(_MSC_FULL_VER) && \
    131    _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
    132 #include <immintrin.h>
    133 #define xgetbv() _xgetbv(0)
    134 #elif defined(_MSC_VER) && defined(_M_IX86)
    135 static inline uint64_t xgetbv(void) {
    136  uint32_t eax_, edx_;
    137  __asm {
    138    xor ecx, ecx  // ecx = 0
    139    // Use the raw opcode for xgetbv for compatibility with older toolchains.
    140    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
    141    mov eax_, eax
    142    mov edx_, edx
    143  }
    144  return ((uint64_t)edx_ << 32) | eax_;
    145 }
    146 #else
    147 #define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
    148 #endif
    149 
    150 #if defined(_MSC_VER) && _MSC_VER >= 1700
    151 #undef NOMINMAX
    152 #define NOMINMAX
    153 #undef WIN32_LEAN_AND_MEAN
    154 #define WIN32_LEAN_AND_MEAN
    155 #include <windows.h>
    156 #if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
    157 #define getenv(x) NULL
    158 #endif
    159 #endif
    160 
    161 #define HAS_MMX 0x01
    162 #define HAS_SSE 0x02
    163 #define HAS_SSE2 0x04
    164 #define HAS_SSE3 0x08
    165 #define HAS_SSSE3 0x10
    166 #define HAS_SSE4_1 0x20
    167 #define HAS_AVX 0x40
    168 #define HAS_AVX2 0x80
    169 #define HAS_SSE4_2 0x100
    170 #define HAS_AVX512 0x200
    171 
    172 #ifndef BIT
    173 #define BIT(n) (1u << (n))
    174 #endif
    175 
    176 #define MMX_BITS BIT(23)
    177 #define SSE_BITS BIT(25)
    178 #define SSE2_BITS BIT(26)
    179 #define SSE3_BITS BIT(0)
    180 #define SSSE3_BITS BIT(9)
    181 #define SSE4_1_BITS BIT(19)
    182 // Bits 27 (OSXSAVE) & 28 (256-bit AVX)
    183 #define AVX_BITS (BIT(27) | BIT(28))
    184 #define AVX2_BITS BIT(5)
    185 // Bits 16 (AVX512-F) & 17 (AVX512-DQ) & 28 (AVX512-CD) & 30 (AVX512-BW)
    186 // & 31 (AVX512-VL)
    187 #define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
    188 // Bits 1 (AVX512-VBMI) & 6 (AVX512-VBMI2) & 8 (AVX512-GFNI) & 9 (AVX512-VAES) &
    189 // 10 (AVX512-VPCLMULQDQ) & 11 (AVX512-VNNI) & 12 (AVX512-BITALG) &
    190 // 14 (AVX512-POPCNTDQ)
    191 #define AVX512_DL_BITS \
    192  (BIT(1) | BIT(6) | BIT(8) | BIT(9) | BIT(10) | BIT(11) | BIT(12) | BIT(14))
    193 
    194 #define FEATURE_SET(reg, feature) \
    195  (((reg) & (feature##_BITS)) == (feature##_BITS))
    196 
    197 static inline int x86_simd_caps(void) {
    198  unsigned int flags = 0;
    199  unsigned int mask = ~0u;
    200  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
    201  char *env;
    202 
    203  /* See if the CPU capabilities are being overridden by the environment */
    204  env = getenv("AOM_SIMD_CAPS");
    205  if (env && *env) return (int)strtol(env, NULL, 0);
    206 
    207  env = getenv("AOM_SIMD_CAPS_MASK");
    208  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
    209 
    210  /* Ensure that the CPUID instruction supports extended features */
    211  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
    212 
    213  if (max_cpuid_val < 1) return 0;
    214 
    215  /* Get the standard feature flags */
    216  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
    217 
    218  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
    219  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
    220  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
    221  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
    222  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
    223  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
    224 
    225  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
    226  if (FEATURE_SET(reg_ecx, AVX)) {
    227    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
    228    if ((xgetbv() & 0x6) == 0x6) {
    229      flags |= HAS_AVX;
    230      if (max_cpuid_val >= 7) {
    231        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
    232        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
    233        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
    234        // Check for OS-support of ZMM and YMM state. Necessary for AVX512.
    235        // Only set HAS_AVX512 flag if AVX512_DL feature are supported.
    236        // Older AVX512 implementations (such as Skylake) have turbo curves that
    237        // are currently problematic for mixed AVX512/AVX2 code
    238        if ((xgetbv() & 0xe6) == 0xe6) {
    239          flags |=
    240              FEATURE_SET(reg_ebx, AVX512) && FEATURE_SET(reg_ecx, AVX512_DL)
    241                  ? HAS_AVX512
    242                  : 0;
    243        }
    244      }
    245    }
    246  }
    247  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
    248  return flags & mask;
    249 }
    250 
    251 // Fine-Grain Measurement Functions
    252 //
    253 // If you are timing a small region of code, access the timestamp counter
    254 // (TSC) via:
    255 //
    256 // unsigned int start = x86_tsc_start();
    257 //   ...
    258 // unsigned int end = x86_tsc_end();
    259 // unsigned int diff = end - start;
    260 //
    261 // The start/end functions introduce a few more instructions than using
    262 // x86_readtsc directly, but prevent the CPU's out-of-order execution from
    263 // affecting the measurement (by having earlier/later instructions be evaluated
    264 // in the time interval). See the white paper, "How to Benchmark Code
    265 // Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by
    266 // Gabriele Paoloni for more information.
    267 //
    268 // If you are timing a large function (CPU time > a couple of seconds), use
    269 // x86_readtsc64 to read the timestamp counter in a 64-bit integer. The
    270 // out-of-order leakage that can occur is minimal compared to total runtime.
    271 static inline unsigned int x86_readtsc(void) {
    272 #if defined(__GNUC__)
    273  unsigned int tsc;
    274  __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :);
    275  return tsc;
    276 #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
    277  unsigned int tsc;
    278  asm volatile("rdtsc\n\t" : "=a"(tsc) :);
    279  return tsc;
    280 #else
    281 #if AOM_ARCH_X86_64
    282  return (unsigned int)__rdtsc();
    283 #else
    284  __asm rdtsc;
    285 #endif
    286 #endif
    287 }
    288 // 64-bit CPU cycle counter
    289 static inline uint64_t x86_readtsc64(void) {
    290 #if defined(__GNUC__)
    291  uint32_t hi, lo;
    292  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    293  return ((uint64_t)hi << 32) | lo;
    294 #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
    295  uint_t hi, lo;
    296  asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi));
    297  return ((uint64_t)hi << 32) | lo;
    298 #else
    299 #if AOM_ARCH_X86_64
    300  return (uint64_t)__rdtsc();
    301 #else
    302  __asm rdtsc;
    303 #endif
    304 #endif
    305 }
    306 
    307 // 32-bit CPU cycle counter with a partial fence against out-of-order execution.
    308 static inline unsigned int x86_readtscp(void) {
    309 #if defined(__GNUC__)
    310  unsigned int tscp;
    311  __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :);
    312  return tscp;
    313 #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
    314  unsigned int tscp;
    315  asm volatile("rdtscp\n\t" : "=a"(tscp) :);
    316  return tscp;
    317 #elif defined(_MSC_VER)
    318  unsigned int ui;
    319  return (unsigned int)__rdtscp(&ui);
    320 #else
    321 #if AOM_ARCH_X86_64
    322  return (unsigned int)__rdtscp();
    323 #else
    324  __asm rdtscp;
    325 #endif
    326 #endif
    327 }
    328 
    329 static inline unsigned int x86_tsc_start(void) {
    330  unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
    331  // This call should not be removed. See function notes above.
    332  cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
    333  // Avoid compiler warnings on unused-but-set variables.
    334  (void)reg_eax;
    335  (void)reg_ebx;
    336  (void)reg_ecx;
    337  (void)reg_edx;
    338  return x86_readtsc();
    339 }
    340 
    341 static inline unsigned int x86_tsc_end(void) {
    342  uint32_t v = x86_readtscp();
    343  unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
    344  // This call should not be removed. See function notes above.
    345  cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
    346  // Avoid compiler warnings on unused-but-set variables.
    347  (void)reg_eax;
    348  (void)reg_ebx;
    349  (void)reg_ecx;
    350  (void)reg_edx;
    351  return v;
    352 }
    353 
    354 #if defined(__GNUC__)
    355 #define x86_pause_hint() __asm__ __volatile__("pause \n\t")
    356 #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
    357 #define x86_pause_hint() asm volatile("pause \n\t")
    358 #else
    359 #if AOM_ARCH_X86_64
    360 #define x86_pause_hint() _mm_pause();
    361 #else
    362 #define x86_pause_hint() __asm pause
    363 #endif
    364 #endif
    365 
    366 #if defined(__GNUC__)
    367 static void x87_set_control_word(unsigned short mode) {
    368  __asm__ __volatile__("fldcw %0" : : "m"(*&mode));
    369 }
    370 static unsigned short x87_get_control_word(void) {
    371  unsigned short mode;
    372  __asm__ __volatile__("fstcw %0\n\t" : "=m"(*&mode) :);
    373  return mode;
    374 }
    375 #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
    376 static void x87_set_control_word(unsigned short mode) {
    377  asm volatile("fldcw %0" : : "m"(*&mode));
    378 }
    379 static unsigned short x87_get_control_word(void) {
    380  unsigned short mode;
    381  asm volatile("fstcw %0\n\t" : "=m"(*&mode) :);
    382  return mode;
    383 }
    384 #elif AOM_ARCH_X86_64
    385 /* No fldcw intrinsics on Windows x64, punt to external asm */
    386 extern void aom_winx64_fldcw(unsigned short mode);
    387 extern unsigned short aom_winx64_fstcw(void);
    388 #define x87_set_control_word aom_winx64_fldcw
    389 #define x87_get_control_word aom_winx64_fstcw
    390 #else
    391 static void x87_set_control_word(unsigned short mode) {
    392  __asm { fldcw mode }
    393 }
    394 static unsigned short x87_get_control_word(void) {
    395  unsigned short mode;
    396  __asm { fstcw mode }
    397  return mode;
    398 }
    399 #endif
    400 
    401 static inline unsigned int x87_set_double_precision(void) {
    402  unsigned int mode = x87_get_control_word();
    403  // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1
    404  // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf
    405  // 8.1.5.2 Precision Control Field
    406  // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control")
    407  // determine the number of bits used in floating point calculations. To match
    408  // later SSE instructions restrict x87 operations to Double Precision (0x200).
    409  // Precision                     PC Field
    410  // Single Precision (24-Bits)    00B
    411  // Reserved                      01B
    412  // Double Precision (53-Bits)    10B
    413  // Extended Precision (64-Bits)  11B
    414  x87_set_control_word((mode & ~0x300u) | 0x200u);
    415  return mode;
    416 }
    417 
    418 #ifdef __cplusplus
    419 }  // extern "C"
    420 #endif
    421 
    422 #endif  // AOM_AOM_PORTS_X86_H_