tor

The Tor anonymity network
git clone https://git.dasho.dev/tor.git
Log | Files | Refs | README | LICENSE

pclmul.c (6644B)


      1 /*
      2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining
      5 * a copy of this software and associated documentation files (the
      6 * "Software"), to deal in the Software without restriction, including
      7 * without limitation the rights to use, copy, modify, merge, publish,
      8 * distribute, sublicense, and/or sell copies of the Software, and to
      9 * permit persons to whom the Software is furnished to do so, subject to
     10 * the following conditions:
     11 *
     12 * The above copyright notice and this permission notice shall be
     13 * included in all copies or substantial portions of the Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     22 * SOFTWARE.
     23 */
     24 
/*
 * This is the POLYVAL implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions). It is adapted from a GHASH
 * implementation, which is why a few GHASH references remain below.
 */
     29 
     30 #include <wmmintrin.h>
     31 
/*
 * Compilers that do not define __GNUC__ are assumed not to understand the
 * GNU attribute syntax: make __attribute__(...) expand to nothing there,
 * so that BR_TARGET() below turns into a no-op.
 */
#ifndef __GNUC__
#define __attribute__(x)
#endif

/*
 * BR_TARGET(x) tags the function definition that follows it with the GNU
 * "target" attribute, using x as the target string (for instance
 * "ssse3,pclmul"), so that the function may use those instruction-set
 * extensions.
 */
#define BR_TARGET(x) __attribute__((target(x)))

/*
 * On GCC proper (Clang is excluded by the test below), also issue a
 * file-level target pragma naming the SSE2, SSSE3, SSE4.1, AES and PCLMUL
 * extensions.
 * NOTE(review): presumably this is what makes the intrinsics usable in the
 * code that follows regardless of the command-line flags -- confirm
 * against the build configuration.
 */
#if defined(__GNUC__) && !defined(__clang__)
       _Pragma("GCC target(\"sse2,ssse3,sse4.1,aes,pclmul\")")
#endif
     41 
/*
 * Everything between this "#if 0" and the matching "#endif" is compiled
 * out. It refers to names that are not defined in the visible source
 * (br_cpuid, br_ghash, br_ghash_pclmul, BR_TARGETS_X86_UP).
 * NOTE(review): looks like leftover run-time feature detection from the
 * code this file was derived from -- confirm before re-enabling or
 * removing it.
 */
#if 0
/*
 * Test CPU support for PCLMULQDQ.
 *
 * Returns the result of br_cpuid() called with a mask that selects only
 * the PCLMULQDQ feature bit in ECX.
 */
static inline int
pclmul_supported(void)
{
/*
 * Bit mask for features in ECX:
 *    1   PCLMULQDQ support
 */
return br_cpuid(0, 0, 0x00000002, 0);
}

/*
 * see bearssl_hash.h
 *
 * Returns the address of br_ghash_pclmul when pclmul_supported() reports
 * the opcode as available, and 0 otherwise.
 */
br_ghash
br_ghash_pclmul_get(void)
{
return pclmul_supported() ? &br_ghash_pclmul : 0;
}

BR_TARGETS_X86_UP
#endif
/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
 * for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 *
 * NOTE(review): the guard below tests __clang__AND_NOT_WORKING, which is
 * not defined anywhere in the visible source. Unless something else
 * defines it, the inline-assembly branch is never compiled and the
 * intrinsic-based macros of the #else branch are used by every compiler,
 * Clang included.
 *
 * Both variants produce the 128-bit carry-less product of two 64-bit
 * halves, selected by the immediate operand:
 *   pclmulqdq00(x, y)   low half of x times low half of y    (0x00)
 *   pclmulqdq11(x, y)   high half of x times high half of y  (0x11)
 */
#ifdef __clang__AND_NOT_WORKING
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
/* "+x": x sits in an SSE register that is both read and overwritten. */
__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
return x;
}
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
/* Same operand constraints as pclmulqdq00(); only the immediate differs. */
__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
return x;
}
#else
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
#endif
     94 
     95 /*
     96 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
     97 * halves of kw (into the right half of kx; left half is unspecified).
     98 */
     99 #define BK(kw, kx)   do { \
    100 	kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
    101 } while (0)
    102 
    103 /*
    104 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
    105 * the XOR of the two values (kx).
    106 */
    107 #define PBK(k0, k1, kw, kx)   do { \
    108 	kw = _mm_unpacklo_epi64(k1, k0); \
    109 	kx = _mm_xor_si128(k0, k1); \
    110 } while (0)
    111 
/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 *
 * All four arguments must be assignable __m128i variables; they are
 * expanded several times, so they must not have side effects. Every
 * shift acts on each 64-bit lane independently. The callers in this file
 * only consume the low lane of x0 and x1 afterwards.
 *
 * The statement order matters, because x2 is updated and then reused:
 *   1. x1 ^= x3 ^ (x3 >> 1) ^ (x3 >> 2) ^ (x3 >> 7)
 *   2. x2 ^= (x3 << 63) ^ (x3 << 62) ^ (x3 << 57)
 *      (the bits that the right shifts of step 1 dropped)
 *   3. x0 ^= x2 ^ (x2 >> 1) ^ (x2 >> 2) ^ (x2 >> 7)   using the new x2
 *   4. x1 ^= (x2 << 63) ^ (x2 << 62) ^ (x2 << 57)
 * x3 is only read; x2 is overwritten and serves as scratch.
 *
 * NOTE(review): the shift amounts (1, 2, 7 and 63, 62, 57) presumably
 * encode the field's reduction polynomial -- confirm against the
 * POLYVAL / GHASH specification.
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
	x1 = _mm_xor_si128( \
		x1, \
		_mm_xor_si128( \
			_mm_xor_si128( \
				x3, \
				_mm_srli_epi64(x3, 1)), \
			_mm_xor_si128( \
				_mm_srli_epi64(x3, 2), \
				_mm_srli_epi64(x3, 7)))); \
	x2 = _mm_xor_si128( \
		_mm_xor_si128( \
			x2, \
			_mm_slli_epi64(x3, 63)), \
		_mm_xor_si128( \
			_mm_slli_epi64(x3, 62), \
			_mm_slli_epi64(x3, 57))); \
	x0 = _mm_xor_si128( \
		x0, \
		_mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_srli_epi64(x2, 1)), \
			_mm_xor_si128( \
				_mm_srli_epi64(x2, 2), \
				_mm_srli_epi64(x2, 7)))); \
	x1 = _mm_xor_si128( \
		_mm_xor_si128( \
			x1, \
			_mm_slli_epi64(x2, 63)), \
		_mm_xor_si128( \
			_mm_slli_epi64(x2, 62), \
			_mm_slli_epi64(x2, 57))); \
} while (0)
    150 
    151 
    152 BR_TARGET("ssse3,pclmul")
    153 static inline void
    154 expand_key_pclmul(const polyval_t *pv, pv_expanded_key_t *out)
    155 {
    156 __m128i h1w, h1x;
    157 __m128i lastw, lastx;
    158 __m128i t0, t1, t2, t3;
    159 
    160 h1w = PCLMUL_MEMBER(pv->key.h);
    161        BK(h1w, h1x);
    162        lastw = h1w;
    163 
    164 for (int i = PV_BLOCK_STRIDE - 2; i >= 0; --i) {
    165 	BK(lastw, lastx);
    166 
    167 	t1 = pclmulqdq11(lastw, h1w);
    168 	t3 = pclmulqdq00(lastw, h1w);
    169 	t2 = pclmulqdq00(lastx, h1x);
    170 	t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
    171 	t0 = _mm_shuffle_epi32(t1, 0x0E);
    172 	t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
    173 	t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
    174 	REDUCE_F128(t0, t1, t2, t3);
    175 	out->k[i] = lastw = _mm_unpacklo_epi64(t1, t0);
    176 }
    177 }
    178 
    179 // Add PCLMUL_BLOCK_STRIDE * 16 bytes from input.
    180 BR_TARGET("ssse3,pclmul")
    181 static inline void
    182 pv_add_multiple_pclmul(polyval_t *pv,
    183 	       const uint8_t *input,
    184 	       const pv_expanded_key_t *expanded)
    185 {
    186 __m128i t0, t1, t2, t3;
    187 
    188 t1 = _mm_setzero_si128();
    189 t2 = _mm_setzero_si128();
    190 t3 = _mm_setzero_si128();
    191 
    192        for (int i = 0; i < PV_BLOCK_STRIDE; ++i, input += 16) {
    193 	__m128i aw = _mm_loadu_si128((void *)(input));
    194 	__m128i ax;
    195 	__m128i hx, hw;
    196 	if (i == 0) {
    197 		aw = _mm_xor_si128(aw, PCLMUL_MEMBER(pv->y));
    198 	}
    199 	if (i == PV_BLOCK_STRIDE - 1) {
    200 		hw = PCLMUL_MEMBER(pv->key.h);
    201 	} else {
    202 		hw = expanded->k[i];
    203 	}
    204 	BK(aw, ax);
    205 	BK(hw, hx);
    206 	t1 = _mm_xor_si128(t1, pclmulqdq11(aw, hw));
    207 	t3 = _mm_xor_si128(t3, pclmulqdq00(aw, hw));
    208 	t2 = _mm_xor_si128(t2, pclmulqdq00(ax, hx));
    209 }
    210 
    211 t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
    212 t0 = _mm_shuffle_epi32(t1, 0x0E);
    213 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
    214 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
    215 
    216 REDUCE_F128(t0, t1, t2, t3);
    217 PCLMUL_MEMBER(pv->y) = _mm_unpacklo_epi64(t1, t0);
    218 }
    219 
    220 
    221 /* see bearssl_hash.h */
    222 BR_TARGET("ssse3,pclmul")
    223 static inline void
    224 pv_mul_y_h_pclmul(polyval_t *pv)
    225 {
    226 __m128i yw, h1w, h1x;
    227 
    228        h1w = PCLMUL_MEMBER(pv->key.h);
    229        BK(h1w, h1x);
    230 
    231        {
    232 	__m128i aw, ax;
    233 	__m128i t0, t1, t2, t3;
    234 
    235                aw = PCLMUL_MEMBER(pv->y);
    236 	BK(aw, ax);
    237 
    238 	t1 = pclmulqdq11(aw, h1w);
    239 	t3 = pclmulqdq00(aw, h1w);
    240 	t2 = pclmulqdq00(ax, h1x);
    241 	t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
    242 	t0 = _mm_shuffle_epi32(t1, 0x0E);
    243 	t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
    244 	t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
    245 #if 0 // This step is GHASH-only.
    246 	SL_256(t0, t1, t2, t3);
    247 #endif
    248 	REDUCE_F128(t0, t1, t2, t3);
    249 	yw = _mm_unpacklo_epi64(t1, t0);
    250 }
    251 
    252 PCLMUL_MEMBER(pv->y) = yw;
    253 }