pclmul.c (6644B)
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */

#include <wmmintrin.h>

/* Non-GCC-compatible compilers don't understand __attribute__; make it
   expand to nothing so the BR_TARGET() markers below are harmless. */
#ifndef __GNUC__
#define __attribute__(x)
#endif

/* Per-function ISA targeting (GCC/Clang "target" function attribute). */
#define BR_TARGET(x) __attribute__((target(x)))

/* GCC (but not Clang) honors a file-scope target pragma; enable the SSE
   and carryless-multiply instruction sets for this translation unit. */
#if defined(__GNUC__) && !defined(__clang__)
_Pragma("GCC target(\"sse2,ssse3,sse4.1,aes,pclmul\")")
#endif

/*
 * NOTE(review): this whole runtime-detection block is deliberately
 * disabled (#if 0); it is kept from the original BearSSL source as a
 * reference for how PCLMULQDQ support would be probed via CPUID.
 */
#if 0
/*
 * Test CPU support for PCLMULQDQ.
 */
static inline int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   PCLMULQDQ support
	 */
	return br_cpuid(0, 0, 0x00000002, 0);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}

BR_TARGETS_X86_UP
#endif

/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
 * for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 *
 * NOTE(review): the guard macro '__clang__AND_NOT_WORKING' is never
 * defined, so the asm path is intentionally dead and the intrinsic
 * macros below are always used.
 */
#ifdef __clang__AND_NOT_WORKING
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
	/* imm8 = 0x00: multiply the low 64-bit halves of x and y. */
	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
	/* imm8 = 0x11: multiply the high 64-bit halves of x and y. */
	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
#else
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
#endif

/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)

/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx).
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)

/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 *
 * The shift amounts (1, 2, 7 right; 63, 62, 57 left) fold the upper
 * 128 bits back into the lower 128 bits; each high word is folded in
 * two passes (x3 into x1/x2, then x2 into x0/x1).
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)

/*
 * Precompute successive powers of the hash key h into 'out'.
 *
 * Starting from lastw = h, each loop iteration multiplies by h (one
 * Karatsuba carryless multiply: hi*hi, lo*lo, (hi^lo)*(hi^lo), then
 * reduction), storing at decreasing indices; hence out->k[i] holds
 * h^(PV_BLOCK_STRIDE - i), with out->k[0] = h^PV_BLOCK_STRIDE.
 * These powers let pv_add_multiple_pclmul process PV_BLOCK_STRIDE
 * blocks with a single reduction.
 */
BR_TARGET("ssse3,pclmul")
static inline void
expand_key_pclmul(const polyval_t *pv, pv_expanded_key_t *out)
{
	__m128i h1w, h1x;
	__m128i lastw, lastx;
	__m128i t0, t1, t2, t3;

	h1w = PCLMUL_MEMBER(pv->key.h);
	BK(h1w, h1x);
	lastw = h1w;

	for (int i = PV_BLOCK_STRIDE - 2; i >= 0; --i) {
		BK(lastw, lastx);

		/* 256-bit carryless product of lastw and h (Karatsuba). */
		t1 = pclmulqdq11(lastw, h1w);
		t3 = pclmulqdq00(lastw, h1w);
		t2 = pclmulqdq00(lastx, h1x);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		REDUCE_F128(t0, t1, t2, t3);
		out->k[i] = lastw = _mm_unpacklo_epi64(t1, t0);
	}
}

// Add PV_BLOCK_STRIDE * 16 bytes from input into the accumulator.
180 BR_TARGET("ssse3,pclmul") 181 static inline void 182 pv_add_multiple_pclmul(polyval_t *pv, 183 const uint8_t *input, 184 const pv_expanded_key_t *expanded) 185 { 186 __m128i t0, t1, t2, t3; 187 188 t1 = _mm_setzero_si128(); 189 t2 = _mm_setzero_si128(); 190 t3 = _mm_setzero_si128(); 191 192 for (int i = 0; i < PV_BLOCK_STRIDE; ++i, input += 16) { 193 __m128i aw = _mm_loadu_si128((void *)(input)); 194 __m128i ax; 195 __m128i hx, hw; 196 if (i == 0) { 197 aw = _mm_xor_si128(aw, PCLMUL_MEMBER(pv->y)); 198 } 199 if (i == PV_BLOCK_STRIDE - 1) { 200 hw = PCLMUL_MEMBER(pv->key.h); 201 } else { 202 hw = expanded->k[i]; 203 } 204 BK(aw, ax); 205 BK(hw, hx); 206 t1 = _mm_xor_si128(t1, pclmulqdq11(aw, hw)); 207 t3 = _mm_xor_si128(t3, pclmulqdq00(aw, hw)); 208 t2 = _mm_xor_si128(t2, pclmulqdq00(ax, hx)); 209 } 210 211 t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3)); 212 t0 = _mm_shuffle_epi32(t1, 0x0E); 213 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E)); 214 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E)); 215 216 REDUCE_F128(t0, t1, t2, t3); 217 PCLMUL_MEMBER(pv->y) = _mm_unpacklo_epi64(t1, t0); 218 } 219 220 221 /* see bearssl_hash.h */ 222 BR_TARGET("ssse3,pclmul") 223 static inline void 224 pv_mul_y_h_pclmul(polyval_t *pv) 225 { 226 __m128i yw, h1w, h1x; 227 228 h1w = PCLMUL_MEMBER(pv->key.h); 229 BK(h1w, h1x); 230 231 { 232 __m128i aw, ax; 233 __m128i t0, t1, t2, t3; 234 235 aw = PCLMUL_MEMBER(pv->y); 236 BK(aw, ax); 237 238 t1 = pclmulqdq11(aw, h1w); 239 t3 = pclmulqdq00(aw, h1w); 240 t2 = pclmulqdq00(ax, h1x); 241 t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3)); 242 t0 = _mm_shuffle_epi32(t1, 0x0E); 243 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E)); 244 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E)); 245 #if 0 // This step is GHASH-only. 246 SL_256(t0, t1, t2, t3); 247 #endif 248 REDUCE_F128(t0, t1, t2, t3); 249 yw = _mm_unpacklo_epi64(t1, t0); 250 } 251 252 PCLMUL_MEMBER(pv->y) = yw; 253 }