[ tor-browser ].git.dasho

crc32_x86_arm_combined_simd.h (8456B)
      1 // Copyright 2022 The Abseil Authors.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //      https://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
     16 #define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
     17 
     18 #include <cstdint>
     19 
     20 #include "absl/base/config.h"
     21 
     22 // -------------------------------------------------------------------------
     23 // Many x86 and ARM machines have CRC acceleration hardware.
     24 // We can do a faster version of Extend() on such machines.
     25 // We define a translation layer for both x86 and ARM, for ease of use and
     26 // to capture most of the performance gains.
     27 
     28 // This implementation requires 64-bit CRC instructions (part of SSE 4.2) and
     29 // PCLMULQDQ instructions. 32-bit builds with SSE 4.2 do exist, so the
     30 // __x86_64__ condition is necessary.
     31 #if defined(__x86_64__) && defined(__SSE4_2__) && defined(__PCLMUL__)
     32 
     33 #include <x86intrin.h>
     34 #define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
     35 
     36 #elif defined(_MSC_VER) && !defined(__clang__) && defined(__AVX__) && \
     37    defined(_M_AMD64)
     38 
     39 // MSVC AVX (/arch:AVX) implies SSE 4.2 and PCLMULQDQ.
     40 #include <intrin.h>
     41 #define ABSL_CRC_INTERNAL_HAVE_X86_SIMD
     42 
     43 #elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) &&                 \
     44    defined(__ARM_FEATURE_CRC32) && defined(ABSL_INTERNAL_HAVE_ARM_NEON) && \
     45    defined(__ARM_FEATURE_CRYPTO)
     46 
     47 #include <arm_acle.h>
     48 #include <arm_neon.h>
     49 #define ABSL_CRC_INTERNAL_HAVE_ARM_SIMD
     50 
     51 #endif
     52 
     53 namespace absl {
     54 ABSL_NAMESPACE_BEGIN
     55 namespace crc_internal {
     56 
     57 #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
     58    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
     59 
     60 #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
     61 using V128 = uint64x2_t;
     62 #else
     63 // Note: Do not use __m128i_u, it is not portable.
     64 // Use V128_LoadU() to perform an unaligned load from __m128i*.
     65 using V128 = __m128i;
     66 #endif
     67 
     68 // Starting with the initial value in |crc|, accumulates a CRC32 value for
     69 // unsigned integers of different sizes.
     70 uint32_t CRC32_u8(uint32_t crc, uint8_t v);
     71 
     72 uint32_t CRC32_u16(uint32_t crc, uint16_t v);
     73 
     74 uint32_t CRC32_u32(uint32_t crc, uint32_t v);
     75 
     76 uint32_t CRC32_u64(uint32_t crc, uint64_t v);
     77 
     78 // Loads 128 bits of integer data. |src| must be 16-byte aligned.
     79 V128 V128_Load(const V128* src);
     80 
     81 // Loads 128 bits of integer data. |src| does not need to be aligned.
     82 V128 V128_LoadU(const V128* src);
     83 
     84 // Stores 128 bits of integer data. |dst| must be 16-byte aligned.
     85 void V128_Store(V128* dst, V128 data);
     86 
     87 // Polynomially multiplies the high 64 bits of |l| and |r|.
     88 V128 V128_PMulHi(const V128 l, const V128 r);
     89 
     90 // Polynomially multiplies the low 64 bits of |l| and |r|.
     91 V128 V128_PMulLow(const V128 l, const V128 r);
     92 
     93 // Polynomially multiplies the low 64 bits of |r| and high 64 bits of |l|.
     94 V128 V128_PMul01(const V128 l, const V128 r);
     95 
     96 // Polynomially multiplies the low 64 bits of |l| and high 64 bits of |r|.
     97 V128 V128_PMul10(const V128 l, const V128 r);
     98 
     99 // Returns the bitwise XOR of |l| and |r|.
    100 V128 V128_Xor(const V128 l, const V128 r);
    101 
    102 // Sets the lower half of a 128 bit register to the given 64-bit value and
    103 // zeroes the upper half.
    104 // dst[63:0] := |r|
    105 // dst[127:64] := |0|
    106 V128 V128_From64WithZeroFill(const uint64_t r);
    107 
    108 // Extracts a 32-bit integer from |l|, selected with |imm|.
    109 template <int imm>
    110 int V128_Extract32(const V128 l);
    111 
    112 // Extracts a 64-bit integer from |l|, selected with |imm|.
    113 template <int imm>
    114 uint64_t V128_Extract64(const V128 l);
    115 
    116 // Extracts the low 64 bits from V128.
    117 int64_t V128_Low64(const V128 l);
    118 
    119 // Adds packed 64-bit integers in |l| and |r|.
    120 V128 V128_Add64(const V128 l, const V128 r);
    121 
    122 #endif
    123 
    124 #if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
    125 
    126 inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) {
    127  return _mm_crc32_u8(crc, v);
    128 }
    129 
    130 inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
    131  return _mm_crc32_u16(crc, v);
    132 }
    133 
    134 inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
    135  return _mm_crc32_u32(crc, v);
    136 }
    137 
    138 inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
    139  return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
    140 }
    141 
    142 inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
    143 
    144 inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
    145 
    146 inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }
    147 
    148 inline V128 V128_PMulHi(const V128 l, const V128 r) {
    149  return _mm_clmulepi64_si128(l, r, 0x11);
    150 }
    151 
    152 inline V128 V128_PMulLow(const V128 l, const V128 r) {
    153  return _mm_clmulepi64_si128(l, r, 0x00);
    154 }
    155 
    156 inline V128 V128_PMul01(const V128 l, const V128 r) {
    157  return _mm_clmulepi64_si128(l, r, 0x01);
    158 }
    159 
    160 inline V128 V128_PMul10(const V128 l, const V128 r) {
    161  return _mm_clmulepi64_si128(l, r, 0x10);
    162 }
    163 
    164 inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
    165 
    166 inline V128 V128_From64WithZeroFill(const uint64_t r) {
    167  return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r));
    168 }
    169 
    170 template <int imm>
    171 inline int V128_Extract32(const V128 l) {
    172  return _mm_extract_epi32(l, imm);
    173 }
    174 
    175 template <int imm>
    176 inline uint64_t V128_Extract64(const V128 l) {
    177  return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
    178 }
    179 
    180 inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }
    181 
    182 inline V128 V128_Add64(const V128 l, const V128 r) {
    183  return _mm_add_epi64(l, r);
    184 }
    185 
    186 #elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
    187 
    188 inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) { return __crc32cb(crc, v); }
    189 
    190 inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
    191  return __crc32ch(crc, v);
    192 }
    193 
    194 inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
    195  return __crc32cw(crc, v);
    196 }
    197 
    198 inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
    199  return __crc32cd(crc, v);
    200 }
    201 
    202 inline V128 V128_Load(const V128* src) {
    203  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
    204 }
    205 
    206 inline V128 V128_LoadU(const V128* src) {
    207  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
    208 }
    209 
    210 inline void V128_Store(V128* dst, V128 data) {
    211  vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
    212 }
    213 
    214 // Using inline assembly as clang does not generate the pmull2 instruction and
    215 // performance drops by 15-20%.
    216 // TODO(b/193678732): Investigate why there is a slight performance hit when
    217 // using intrinsics instead of inline assembly.
    218 inline V128 V128_PMulHi(const V128 l, const V128 r) {
    219  uint64x2_t res;
    220  __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
    221                       : "=w"(res)
    222                       : "w"(l), "w"(r));
    223  return res;
    224 }
    225 
    226 // TODO(b/193678732): Investigate why the compiler decides to move the constant
    227 // loop multiplicands from GPR to Neon registers every loop iteration.
    228 inline V128 V128_PMulLow(const V128 l, const V128 r) {
    229  uint64x2_t res;
    230  __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t"
    231                       : "=w"(res)
    232                       : "w"(l), "w"(r));
    233  return res;
    234 }
    235 
    236 inline V128 V128_PMul01(const V128 l, const V128 r) {
    237  return reinterpret_cast<V128>(vmull_p64(
    238      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(l))),
    239      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
    240 }
    241 
    242 inline V128 V128_PMul10(const V128 l, const V128 r) {
    243  return reinterpret_cast<V128>(vmull_p64(
    244      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
    245      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(r)))));
    246 }
    247 
    248 inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }
    249 
    250 inline V128 V128_From64WithZeroFill(const uint64_t r){
    251  constexpr uint64x2_t kZero = {0, 0};
    252  return vsetq_lane_u64(r, kZero, 0);
    253 }
    254 
    255 
    256 template <int imm>
    257 inline int V128_Extract32(const V128 l) {
    258  return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
    259 }
    260 
    261 template <int imm>
    262 inline uint64_t V128_Extract64(const V128 l) {
    263  return vgetq_lane_u64(l, imm);
    264 }
    265 
    266 inline int64_t V128_Low64(const V128 l) {
    267  return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
    268 }
    269 
    270 inline V128 V128_Add64(const V128 l, const V128 r) { return vaddq_u64(l, r); }
    271 
    272 #endif
    273 
    274 }  // namespace crc_internal
    275 ABSL_NAMESPACE_END
    276 }  // namespace absl
    277 
    278 #endif  // ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE