tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

gcm-x86.c (4535B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 #ifdef FREEBL_NO_DEPEND
      6 #include "stubs.h"
      7 #endif
      8 #include "gcm.h"
      9 #include "secerr.h"
     10 
     11 #include <wmmintrin.h> /* clmul */
     12 
     13 #define WRITE64(x, bytes)   \
     14    (bytes)[0] = (x) >> 56; \
     15    (bytes)[1] = (x) >> 48; \
     16    (bytes)[2] = (x) >> 40; \
     17    (bytes)[3] = (x) >> 32; \
     18    (bytes)[4] = (x) >> 24; \
     19    (bytes)[5] = (x) >> 16; \
     20    (bytes)[6] = (x) >> 8;  \
     21    (bytes)[7] = (x);
     22 
     23 SECStatus
     24 gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf)
     25 {
     26    uint64_t tmp_out[2];
     27    _mm_storeu_si128((__m128i *)tmp_out, ghash->x);
     28    /* maxout must be larger than 16 byte (checked by the caller). */
     29    WRITE64(tmp_out[0], outbuf + 8);
     30    WRITE64(tmp_out[1], outbuf);
     31    return SECSuccess;
     32 }
     33 
     34 SECStatus
     35 gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
     36                unsigned int count)
     37 {
     38    size_t i;
     39    pre_align __m128i z_high post_align;
     40    pre_align __m128i z_low post_align;
     41    pre_align __m128i C post_align;
     42    pre_align __m128i D post_align;
     43    pre_align __m128i E post_align;
     44    pre_align __m128i F post_align;
     45    pre_align __m128i bin post_align;
     46    pre_align __m128i Ci post_align;
     47    pre_align __m128i tmp post_align;
     48 
     49    for (i = 0; i < count; i++, buf += 16) {
     50        bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1],
     51                            ((uint16_t)buf[2] << 8) | buf[3],
     52                            ((uint16_t)buf[4] << 8) | buf[5],
     53                            ((uint16_t)buf[6] << 8) | buf[7],
     54                            ((uint16_t)buf[8] << 8) | buf[9],
     55                            ((uint16_t)buf[10] << 8) | buf[11],
     56                            ((uint16_t)buf[12] << 8) | buf[13],
     57                            ((uint16_t)buf[14] << 8) | buf[15]);
     58        Ci = _mm_xor_si128(bin, ghash->x);
     59 
     60        /* Do binary mult ghash->X = Ci * ghash->H. */
     61        C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00);
     62        D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11);
     63        E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01);
     64        F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10);
     65        tmp = _mm_xor_si128(E, F);
     66        z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8));
     67        z_high = _mm_unpackhi_epi64(z_high, D);
     68        z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C);
     69        z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low);
     70 
     71        /* Shift one to the left (multiply by x) as gcm spec is stupid. */
     72        C = _mm_slli_si128(z_low, 8);
     73        E = _mm_srli_epi64(C, 63);
     74        D = _mm_slli_si128(z_high, 8);
     75        F = _mm_srli_epi64(D, 63);
     76        /* Carry over */
     77        C = _mm_srli_si128(z_low, 8);
     78        D = _mm_srli_epi64(C, 63);
     79        z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E);
     80        z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D);
     81 
     82        /* Reduce */
     83        C = _mm_slli_si128(z_low, 8);
     84        /* D = z_low << 127 */
     85        D = _mm_slli_epi64(C, 63);
     86        /* E = z_low << 126 */
     87        E = _mm_slli_epi64(C, 62);
     88        /* F = z_low << 121 */
     89        F = _mm_slli_epi64(C, 57);
     90        /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */
     91        z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F);
     92        C = _mm_srli_si128(z_low, 8);
     93        /* D = z_low >> 1 */
     94        D = _mm_slli_epi64(C, 63);
     95        D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D);
     96        /* E = z_low >> 2 */
     97        E = _mm_slli_epi64(C, 62);
     98        E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E);
     99        /* F = z_low >> 7 */
    100        F = _mm_slli_epi64(C, 57);
    101        F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F);
    102        /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */
    103        ghash->x = _mm_xor_si128(_mm_xor_si128(
    104                                     _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E),
    105                                 F);
    106    }
    107    return SECSuccess;
    108 }
    109 
    110 SECStatus
    111 gcm_HashInit_hw(gcmHashContext *ghash)
    112 {
    113    ghash->ghash_mul = gcm_HashMult_hw;
    114    ghash->x = _mm_setzero_si128();
    115    /* MSVC requires __m64 to load epi64. */
    116    ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high,
    117                             ghash->h_low >> 32, (uint32_t)ghash->h_low);
    118    ghash->hw = PR_TRUE;
    119    return SECSuccess;
    120 }
    121 
    122 SECStatus
    123 gcm_HashZeroX_hw(gcmHashContext *ghash)
    124 {
    125    ghash->x = _mm_setzero_si128();
    126    return SECSuccess;
    127 }