gcm-x86.c (4535B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 #ifdef FREEBL_NO_DEPEND 6 #include "stubs.h" 7 #endif 8 #include "gcm.h" 9 #include "secerr.h" 10 11 #include <wmmintrin.h> /* clmul */ 12 13 #define WRITE64(x, bytes) \ 14 (bytes)[0] = (x) >> 56; \ 15 (bytes)[1] = (x) >> 48; \ 16 (bytes)[2] = (x) >> 40; \ 17 (bytes)[3] = (x) >> 32; \ 18 (bytes)[4] = (x) >> 24; \ 19 (bytes)[5] = (x) >> 16; \ 20 (bytes)[6] = (x) >> 8; \ 21 (bytes)[7] = (x); 22 23 SECStatus 24 gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf) 25 { 26 uint64_t tmp_out[2]; 27 _mm_storeu_si128((__m128i *)tmp_out, ghash->x); 28 /* maxout must be larger than 16 byte (checked by the caller). */ 29 WRITE64(tmp_out[0], outbuf + 8); 30 WRITE64(tmp_out[1], outbuf); 31 return SECSuccess; 32 } 33 34 SECStatus 35 gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, 36 unsigned int count) 37 { 38 size_t i; 39 pre_align __m128i z_high post_align; 40 pre_align __m128i z_low post_align; 41 pre_align __m128i C post_align; 42 pre_align __m128i D post_align; 43 pre_align __m128i E post_align; 44 pre_align __m128i F post_align; 45 pre_align __m128i bin post_align; 46 pre_align __m128i Ci post_align; 47 pre_align __m128i tmp post_align; 48 49 for (i = 0; i < count; i++, buf += 16) { 50 bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1], 51 ((uint16_t)buf[2] << 8) | buf[3], 52 ((uint16_t)buf[4] << 8) | buf[5], 53 ((uint16_t)buf[6] << 8) | buf[7], 54 ((uint16_t)buf[8] << 8) | buf[9], 55 ((uint16_t)buf[10] << 8) | buf[11], 56 ((uint16_t)buf[12] << 8) | buf[13], 57 ((uint16_t)buf[14] << 8) | buf[15]); 58 Ci = _mm_xor_si128(bin, ghash->x); 59 60 /* Do binary mult ghash->X = Ci * ghash->H. */ 61 C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00); 62 D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11); 63 E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01); 64 F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10); 65 tmp = _mm_xor_si128(E, F); 66 z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8)); 67 z_high = _mm_unpackhi_epi64(z_high, D); 68 z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C); 69 z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low); 70 71 /* Shift one to the left (multiply by x) as gcm spec is stupid. */ 72 C = _mm_slli_si128(z_low, 8); 73 E = _mm_srli_epi64(C, 63); 74 D = _mm_slli_si128(z_high, 8); 75 F = _mm_srli_epi64(D, 63); 76 /* Carry over */ 77 C = _mm_srli_si128(z_low, 8); 78 D = _mm_srli_epi64(C, 63); 79 z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E); 80 z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D); 81 82 /* Reduce */ 83 C = _mm_slli_si128(z_low, 8); 84 /* D = z_low << 127 */ 85 D = _mm_slli_epi64(C, 63); 86 /* E = z_low << 126 */ 87 E = _mm_slli_epi64(C, 62); 88 /* F = z_low << 121 */ 89 F = _mm_slli_epi64(C, 57); 90 /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */ 91 z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F); 92 C = _mm_srli_si128(z_low, 8); 93 /* D = z_low >> 1 */ 94 D = _mm_slli_epi64(C, 63); 95 D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D); 96 /* E = z_low >> 2 */ 97 E = _mm_slli_epi64(C, 62); 98 E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E); 99 /* F = z_low >> 7 */ 100 F = _mm_slli_epi64(C, 57); 101 F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F); 102 /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */ 103 ghash->x = _mm_xor_si128(_mm_xor_si128( 104 _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E), 105 F); 106 } 107 return SECSuccess; 108 } 109 110 SECStatus 111 gcm_HashInit_hw(gcmHashContext *ghash) 112 { 113 ghash->ghash_mul = gcm_HashMult_hw; 114 ghash->x = _mm_setzero_si128(); 115 /* MSVC requires __m64 to load epi64. */ 116 ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high, 117 ghash->h_low >> 32, (uint32_t)ghash->h_low); 118 ghash->hw = PR_TRUE; 119 return SECSuccess; 120 } 121 122 SECStatus 123 gcm_HashZeroX_hw(gcmHashContext *ghash) 124 { 125 ghash->x = _mm_setzero_si128(); 126 return SECSuccess; 127 }