crc32_x86_arm_combined_simd.h (8456B)
1 // Copyright 2022 The Abseil Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_ 16 #define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_ 17 18 #include <cstdint> 19 20 #include "absl/base/config.h" 21 22 // ------------------------------------------------------------------------- 23 // Many x86 and ARM machines have CRC acceleration hardware. 24 // We can do a faster version of Extend() on such machines. 25 // We define a translation layer for both x86 and ARM for the ease of use and 26 // most performance gains. 27 28 // This implementation requires 64-bit CRC instructions (part of SSE 4.2) and 29 // PCLMULQDQ instructions. 32-bit builds with SSE 4.2 do exist, so the 30 // __x86_64__ condition is necessary. 31 #if defined(__x86_64__) && defined(__SSE4_2__) && defined(__PCLMUL__) 32 33 #include <x86intrin.h> 34 #define ABSL_CRC_INTERNAL_HAVE_X86_SIMD 35 36 #elif defined(_MSC_VER) && !defined(__clang__) && defined(__AVX__) && \ 37 defined(_M_AMD64) 38 39 // MSVC AVX (/arch:AVX) implies SSE 4.2 and PCLMULQDQ. 40 #include <intrin.h> 41 #define ABSL_CRC_INTERNAL_HAVE_X86_SIMD 42 43 #elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \ 44 defined(__ARM_FEATURE_CRC32) && defined(ABSL_INTERNAL_HAVE_ARM_NEON) && \ 45 defined(__ARM_FEATURE_CRYPTO) 46 47 #include <arm_acle.h> 48 #include <arm_neon.h> 49 #define ABSL_CRC_INTERNAL_HAVE_ARM_SIMD 50 51 #endif 52 53 namespace absl { 54 ABSL_NAMESPACE_BEGIN 55 namespace crc_internal { 56 57 #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \ 58 defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) 59 60 #if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) 61 using V128 = uint64x2_t; 62 #else 63 // Note: Do not use __m128i_u, it is not portable. 64 // Use V128_LoadU() perform an unaligned load from __m128i*. 65 using V128 = __m128i; 66 #endif 67 68 // Starting with the initial value in |crc|, accumulates a CRC32 value for 69 // unsigned integers of different sizes. 70 uint32_t CRC32_u8(uint32_t crc, uint8_t v); 71 72 uint32_t CRC32_u16(uint32_t crc, uint16_t v); 73 74 uint32_t CRC32_u32(uint32_t crc, uint32_t v); 75 76 uint32_t CRC32_u64(uint32_t crc, uint64_t v); 77 78 // Loads 128 bits of integer data. |src| must be 16-byte aligned. 79 V128 V128_Load(const V128* src); 80 81 // Load 128 bits of integer data. |src| does not need to be aligned. 82 V128 V128_LoadU(const V128* src); 83 84 // Store 128 bits of integer data. |src| must be 16-byte aligned. 85 void V128_Store(V128* dst, V128 data); 86 87 // Polynomially multiplies the high 64 bits of |l| and |r|. 88 V128 V128_PMulHi(const V128 l, const V128 r); 89 90 // Polynomially multiplies the low 64 bits of |l| and |r|. 91 V128 V128_PMulLow(const V128 l, const V128 r); 92 93 // Polynomially multiplies the low 64 bits of |r| and high 64 bits of |l|. 94 V128 V128_PMul01(const V128 l, const V128 r); 95 96 // Polynomially multiplies the low 64 bits of |l| and high 64 bits of |r|. 97 V128 V128_PMul10(const V128 l, const V128 r); 98 99 // Produces a XOR operation of |l| and |r|. 100 V128 V128_Xor(const V128 l, const V128 r); 101 102 // Sets the lower half of a 128 bit register to the given 64-bit value and 103 // zeroes the upper half. 104 // dst[63:0] := |r| 105 // dst[127:64] := |0| 106 V128 V128_From64WithZeroFill(const uint64_t r); 107 108 // Extracts a 32-bit integer from |l|, selected with |imm|. 109 template <int imm> 110 int V128_Extract32(const V128 l); 111 112 // Extracts a 64-bit integer from |l|, selected with |imm|. 113 template <int imm> 114 uint64_t V128_Extract64(const V128 l); 115 116 // Extracts the low 64 bits from V128. 117 int64_t V128_Low64(const V128 l); 118 119 // Add packed 64-bit integers in |l| and |r|. 120 V128 V128_Add64(const V128 l, const V128 r); 121 122 #endif 123 124 #if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD) 125 126 inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) { 127 return _mm_crc32_u8(crc, v); 128 } 129 130 inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) { 131 return _mm_crc32_u16(crc, v); 132 } 133 134 inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) { 135 return _mm_crc32_u32(crc, v); 136 } 137 138 inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) { 139 return static_cast<uint32_t>(_mm_crc32_u64(crc, v)); 140 } 141 142 inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); } 143 144 inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); } 145 146 inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); } 147 148 inline V128 V128_PMulHi(const V128 l, const V128 r) { 149 return _mm_clmulepi64_si128(l, r, 0x11); 150 } 151 152 inline V128 V128_PMulLow(const V128 l, const V128 r) { 153 return _mm_clmulepi64_si128(l, r, 0x00); 154 } 155 156 inline V128 V128_PMul01(const V128 l, const V128 r) { 157 return _mm_clmulepi64_si128(l, r, 0x01); 158 } 159 160 inline V128 V128_PMul10(const V128 l, const V128 r) { 161 return _mm_clmulepi64_si128(l, r, 0x10); 162 } 163 164 inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); } 165 166 inline V128 V128_From64WithZeroFill(const uint64_t r) { 167 return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r)); 168 } 169 170 template <int imm> 171 inline int V128_Extract32(const V128 l) { 172 return _mm_extract_epi32(l, imm); 173 } 174 175 template <int imm> 176 inline uint64_t V128_Extract64(const V128 l) { 177 return static_cast<uint64_t>(_mm_extract_epi64(l, imm)); 178 } 179 180 inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); } 181 182 inline V128 V128_Add64(const V128 l, const V128 r) { 183 return _mm_add_epi64(l, r); 184 } 185 186 #elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) 187 188 inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) { return __crc32cb(crc, v); } 189 190 inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) { 191 return __crc32ch(crc, v); 192 } 193 194 inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) { 195 return __crc32cw(crc, v); 196 } 197 198 inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) { 199 return __crc32cd(crc, v); 200 } 201 202 inline V128 V128_Load(const V128* src) { 203 return vld1q_u64(reinterpret_cast<const uint64_t*>(src)); 204 } 205 206 inline V128 V128_LoadU(const V128* src) { 207 return vld1q_u64(reinterpret_cast<const uint64_t*>(src)); 208 } 209 210 inline void V128_Store(V128* dst, V128 data) { 211 vst1q_u64(reinterpret_cast<uint64_t*>(dst), data); 212 } 213 214 // Using inline assembly as clang does not generate the pmull2 instruction and 215 // performance drops by 15-20%. 216 // TODO(b/193678732): Investigate why there is a slight performance hit when 217 // using intrinsics instead of inline assembly. 218 inline V128 V128_PMulHi(const V128 l, const V128 r) { 219 uint64x2_t res; 220 __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t" 221 : "=w"(res) 222 : "w"(l), "w"(r)); 223 return res; 224 } 225 226 // TODO(b/193678732): Investigate why the compiler decides to move the constant 227 // loop multiplicands from GPR to Neon registers every loop iteration. 228 inline V128 V128_PMulLow(const V128 l, const V128 r) { 229 uint64x2_t res; 230 __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t" 231 : "=w"(res) 232 : "w"(l), "w"(r)); 233 return res; 234 } 235 236 inline V128 V128_PMul01(const V128 l, const V128 r) { 237 return reinterpret_cast<V128>(vmull_p64( 238 reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(l))), 239 reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r))))); 240 } 241 242 inline V128 V128_PMul10(const V128 l, const V128 r) { 243 return reinterpret_cast<V128>(vmull_p64( 244 reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))), 245 reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(r))))); 246 } 247 248 inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); } 249 250 inline V128 V128_From64WithZeroFill(const uint64_t r){ 251 constexpr uint64x2_t kZero = {0, 0}; 252 return vsetq_lane_u64(r, kZero, 0); 253 } 254 255 256 template <int imm> 257 inline int V128_Extract32(const V128 l) { 258 return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm); 259 } 260 261 template <int imm> 262 inline uint64_t V128_Extract64(const V128 l) { 263 return vgetq_lane_u64(l, imm); 264 } 265 266 inline int64_t V128_Low64(const V128 l) { 267 return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0); 268 } 269 270 inline V128 V128_Add64(const V128 l, const V128 r) { return vaddq_u64(l, r); } 271 272 #endif 273 274 } // namespace crc_internal 275 ABSL_NAMESPACE_END 276 } // namespace absl 277 278 #endif // ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_