tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sha256-x86.c (7993B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 #ifdef USE_HW_SHA2
      6 
      7 #include <immintrin.h>
      8 
      9 #ifdef FREEBL_NO_DEPEND
     10 #include "stubs.h"
     11 #endif
     12 
     13 #include "blapii.h"
     14 #include "prcpucfg.h"
     15 #include "prtypes.h" /* for PRUintXX */
     16 #include "prlong.h"
     17 #include "blapi.h"
     18 #include "sha256.h"
     19 
     20 /* SHA-256 constants, K256. */
     21 pre_align static const PRUint32 K256[64] post_align = {
     22    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
     23    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     24    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
     25    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
     26    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
     27    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
     28    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
     29    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
     30    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
     31    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
     32    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
     33    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
     34    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
     35    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
     36    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
     37    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
     38 };
     39 
     40 #define ROUND(n, a, b, c, d)                                \
     41    {                                                       \
     42        __m128i t = _mm_add_epi32(a, k##n);                 \
     43        w1 = _mm_sha256rnds2_epu32(w1, w0, t);              \
     44        t = _mm_shuffle_epi32(t, 0x0e);                     \
     45        w0 = _mm_sha256rnds2_epu32(w0, w1, t);              \
     46        if (n < 12) {                                       \
     47            a = _mm_sha256msg1_epu32(a, b);                 \
     48            a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \
     49            a = _mm_sha256msg2_epu32(a, d);                 \
     50        }                                                   \
     51    }
     52 
     53 void
     54 SHA256_Compress_Native(SHA256Context *ctx)
     55 {
     56    __m128i h0, h1, th;
     57    __m128i a, b, c, d;
     58    __m128i w0, w1;
     59    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
     60 
     61    const __m128i *K = (__m128i *)K256;
     62    const __m128i k0 = _mm_load_si128(K);
     63    const __m128i k1 = _mm_load_si128(K + 1);
     64    const __m128i k2 = _mm_load_si128(K + 2);
     65    const __m128i k3 = _mm_load_si128(K + 3);
     66    const __m128i k4 = _mm_load_si128(K + 4);
     67    const __m128i k5 = _mm_load_si128(K + 5);
     68    const __m128i k6 = _mm_load_si128(K + 6);
     69    const __m128i k7 = _mm_load_si128(K + 7);
     70    const __m128i k8 = _mm_load_si128(K + 8);
     71    const __m128i k9 = _mm_load_si128(K + 9);
     72    const __m128i k10 = _mm_load_si128(K + 10);
     73    const __m128i k11 = _mm_load_si128(K + 11);
     74    const __m128i k12 = _mm_load_si128(K + 12);
     75    const __m128i k13 = _mm_load_si128(K + 13);
     76    const __m128i k14 = _mm_load_si128(K + 14);
     77    const __m128i k15 = _mm_load_si128(K + 15);
     78 
     79    const __m128i *input = (__m128i *)ctx->u.b;
     80 
     81    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
     82    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
     83 
     84    /* H0123:4567 -> H01256:H2367 */
     85    th = _mm_shuffle_epi32(h0, 0xb1);
     86    h1 = _mm_shuffle_epi32(h1, 0x1b);
     87    h0 = _mm_alignr_epi8(th, h1, 8);
     88    h1 = _mm_blend_epi16(h1, th, 0xf0);
     89 
     90    a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
     91    b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
     92    c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
     93    d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);
     94 
     95    w0 = h0;
     96    w1 = h1;
     97 
     98    ROUND(0, a, b, c, d)
     99    ROUND(1, b, c, d, a)
    100    ROUND(2, c, d, a, b)
    101    ROUND(3, d, a, b, c)
    102    ROUND(4, a, b, c, d)
    103    ROUND(5, b, c, d, a)
    104    ROUND(6, c, d, a, b)
    105    ROUND(7, d, a, b, c)
    106    ROUND(8, a, b, c, d)
    107    ROUND(9, b, c, d, a)
    108    ROUND(10, c, d, a, b)
    109    ROUND(11, d, a, b, c)
    110    ROUND(12, a, b, c, d)
    111    ROUND(13, b, c, d, a)
    112    ROUND(14, c, d, a, b)
    113    ROUND(15, d, a, b, c)
    114 
    115    h0 = _mm_add_epi32(h0, w0);
    116    h1 = _mm_add_epi32(h1, w1);
    117 
    118    /* H0145:2367 -> H0123:4567 */
    119    th = _mm_shuffle_epi32(h0, 0x1b);
    120    h1 = _mm_shuffle_epi32(h1, 0xb1);
    121    h0 = _mm_blend_epi16(th, h1, 0xf0);
    122    h1 = _mm_alignr_epi8(h1, th, 8);
    123 
    124    _mm_storeu_si128((__m128i *)ctx->h, h0);
    125    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
    126 }
    127 
    128 void
    129 SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
    130                     unsigned int inputLen)
    131 {
    132    __m128i h0, h1, th;
    133    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
    134 
    135    const __m128i *K = (__m128i *)K256;
    136    const __m128i k0 = _mm_load_si128(K);
    137    const __m128i k1 = _mm_load_si128(K + 1);
    138    const __m128i k2 = _mm_load_si128(K + 2);
    139    const __m128i k3 = _mm_load_si128(K + 3);
    140    const __m128i k4 = _mm_load_si128(K + 4);
    141    const __m128i k5 = _mm_load_si128(K + 5);
    142    const __m128i k6 = _mm_load_si128(K + 6);
    143    const __m128i k7 = _mm_load_si128(K + 7);
    144    const __m128i k8 = _mm_load_si128(K + 8);
    145    const __m128i k9 = _mm_load_si128(K + 9);
    146    const __m128i k10 = _mm_load_si128(K + 10);
    147    const __m128i k11 = _mm_load_si128(K + 11);
    148    const __m128i k12 = _mm_load_si128(K + 12);
    149    const __m128i k13 = _mm_load_si128(K + 13);
    150    const __m128i k14 = _mm_load_si128(K + 14);
    151    const __m128i k15 = _mm_load_si128(K + 15);
    152 
    153    unsigned int inBuf = ctx->sizeLo & 0x3f;
    154    if (!inputLen) {
    155        return;
    156    }
    157 
    158    /* Add inputLen into the count of bytes processed, before processing */
    159    if ((ctx->sizeLo += inputLen) < inputLen) {
    160        ctx->sizeHi++;
    161    }
    162 
    163    /* if data already in buffer, attempt to fill rest of buffer */
    164    if (inBuf) {
    165        unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
    166        if (inputLen < todo) {
    167            todo = inputLen;
    168        }
    169        memcpy(ctx->u.b + inBuf, input, todo);
    170        input += todo;
    171        inputLen -= todo;
    172        if (inBuf + todo == SHA256_BLOCK_LENGTH) {
    173            SHA256_Compress_Native(ctx);
    174        }
    175    }
    176 
    177    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
    178    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
    179 
    180    /* H0123:4567 -> H01256:H2367 */
    181    th = _mm_shuffle_epi32(h0, 0xb1);
    182    h1 = _mm_shuffle_epi32(h1, 0x1b);
    183    h0 = _mm_alignr_epi8(th, h1, 8);
    184    h1 = _mm_blend_epi16(h1, th, 0xf0);
    185 
    186    /* if enough data to fill one or more whole buffers, process them. */
    187    while (inputLen >= SHA256_BLOCK_LENGTH) {
    188        __m128i a, b, c, d;
    189        __m128i w0, w1;
    190        a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle);
    191        b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle);
    192        c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle);
    193        d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle);
    194        input += SHA256_BLOCK_LENGTH;
    195        inputLen -= SHA256_BLOCK_LENGTH;
    196 
    197        w0 = h0;
    198        w1 = h1;
    199 
    200        ROUND(0, a, b, c, d)
    201        ROUND(1, b, c, d, a)
    202        ROUND(2, c, d, a, b)
    203        ROUND(3, d, a, b, c)
    204        ROUND(4, a, b, c, d)
    205        ROUND(5, b, c, d, a)
    206        ROUND(6, c, d, a, b)
    207        ROUND(7, d, a, b, c)
    208        ROUND(8, a, b, c, d)
    209        ROUND(9, b, c, d, a)
    210        ROUND(10, c, d, a, b)
    211        ROUND(11, d, a, b, c)
    212        ROUND(12, a, b, c, d)
    213        ROUND(13, b, c, d, a)
    214        ROUND(14, c, d, a, b)
    215        ROUND(15, d, a, b, c)
    216 
    217        h0 = _mm_add_epi32(h0, w0);
    218        h1 = _mm_add_epi32(h1, w1);
    219    }
    220 
    221    // H01234567 -> H01256 and H2367
    222    th = _mm_shuffle_epi32(h0, 0x1b);
    223    h1 = _mm_shuffle_epi32(h1, 0xb1);
    224    h0 = _mm_blend_epi16(th, h1, 0xf0);
    225    h1 = _mm_alignr_epi8(h1, th, 8);
    226 
    227    _mm_storeu_si128((__m128i *)ctx->h, h0);
    228    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
    229 
    230    /* if data left over, fill it into buffer */
    231    if (inputLen) {
    232        memcpy(ctx->u.b, input, inputLen);
    233    }
    234 }
    235 
    236 #endif /* USE_HW_SHA2 */