tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sha1-armv8.c (6825B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 #ifdef USE_HW_SHA1
      6 
      7 #ifndef __ARM_FEATURE_CRYPTO
      8 #error "Compiler option is invalid"
      9 #endif
     10 
     11 #ifdef FREEBL_NO_DEPEND
     12 #include "stubs.h"
     13 #endif
     14 
     15 #include <arm_neon.h>
     16 #include <memory.h>
     17 #include "blapi.h"
     18 #include "sha_fast.h"
     19 
     20 #if !defined(SHA_PUT_W_IN_STACK)
     21 #define H2X 11
     22 #else
     23 #define H2X 0
     24 #endif
     25 
     26 static void shaCompress(SHA_HW_t *X, const PRUint32 *datain);
     27 
/*
 *  SHA: Compress the single 64-byte block currently buffered in the
 *  context (ctx->u.w) into the hash state ctx->H.
 *  H2X compensates for the layout of H within SHA1Context (see above).
 */
void
SHA1_Compress_Native(SHA1Context *ctx)
{
    shaCompress(&ctx->H[H2X], ctx->u.w);
}
     33 
     34 /*
     35 *  SHA: Add data to context.
     36 */
     37 void
     38 SHA1_Update_Native(SHA1Context *ctx, const unsigned char *dataIn, unsigned int len)
     39 {
     40    unsigned int lenB;
     41    unsigned int togo;
     42 
     43    if (!len) {
     44        return;
     45    }
     46 
     47    /* accumulate the byte count. */
     48    lenB = (unsigned int)(ctx->size) & 63U;
     49 
     50    ctx->size += len;
     51 
     52    /*
     53     *  Read the data into W and process blocks as they get full
     54     */
     55    if (lenB > 0) {
     56        togo = 64U - lenB;
     57        if (len < togo) {
     58            togo = len;
     59        }
     60        memcpy(ctx->u.b + lenB, dataIn, togo);
     61        len -= togo;
     62        dataIn += togo;
     63        lenB = (lenB + togo) & 63U;
     64        if (!lenB) {
     65            shaCompress(&ctx->H[H2X], ctx->u.w);
     66        }
     67    }
     68 
     69    while (len >= 64U) {
     70        len -= 64U;
     71        shaCompress(&ctx->H[H2X], (PRUint32 *)dataIn);
     72        dataIn += 64U;
     73    }
     74 
     75    if (len) {
     76        memcpy(ctx->u.b, dataIn, len);
     77    }
     78 }
     79 
     80 /*
     81 *  SHA: Compression function, unrolled.
     82 */
     83 static void
     84 shaCompress(SHA_HW_t *X, const PRUint32 *inbuf)
     85 {
     86 #define XH(n) X[n - H2X]
     87 
     88    const uint32x4_t K0 = vdupq_n_u32(0x5a827999);
     89    const uint32x4_t K1 = vdupq_n_u32(0x6ed9eba1);
     90    const uint32x4_t K2 = vdupq_n_u32(0x8f1bbcdc);
     91    const uint32x4_t K3 = vdupq_n_u32(0xca62c1d6);
     92 
     93    uint32x4_t abcd = vld1q_u32(&XH(0));
     94    PRUint32 e = XH(4);
     95 
     96    const uint32x4_t origABCD = abcd;
     97    const PRUint32 origE = e;
     98 
     99    uint32x4_t w0 = vld1q_u32(inbuf);
    100    uint32x4_t w1 = vld1q_u32(inbuf + 4);
    101    uint32x4_t w2 = vld1q_u32(inbuf + 8);
    102    uint32x4_t w3 = vld1q_u32(inbuf + 12);
    103 
    104    w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
    105    w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
    106    w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
    107    w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));
    108 
    109    uint32x4_t t0 = vaddq_u32(w0, K0);
    110    uint32x4_t t1 = vaddq_u32(w1, K0);
    111 
    112    PRUint32 tmpE;
    113 
    114    /*
    115     * Using the following ARM instructions to accelerate SHA1
    116     *
    117     * sha1c for round 0 - 20
    118     * sha1p for round 20 - 40
    119     * sha1m for round 40 - 60
    120     * sha1p for round 60 - 80
    121     * sha1su0 and shasu1 for message schedule
    122     * sha1h for rotate left 30
    123     */
    124 
    125    /* Round 0-3 */
    126    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    127    abcd = vsha1cq_u32(abcd, e, t0);
    128    t0 = vaddq_u32(w2, K0);
    129    w0 = vsha1su0q_u32(w0, w1, w2);
    130 
    131    /* Round 4-7 */
    132    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    133    abcd = vsha1cq_u32(abcd, tmpE, t1);
    134    t1 = vaddq_u32(w3, K0);
    135    w0 = vsha1su1q_u32(w0, w3);
    136    w1 = vsha1su0q_u32(w1, w2, w3);
    137 
    138    /* Round 8-11 */
    139    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    140    abcd = vsha1cq_u32(abcd, e, t0);
    141    t0 = vaddq_u32(w0, K0);
    142    w1 = vsha1su1q_u32(w1, w0);
    143    w2 = vsha1su0q_u32(w2, w3, w0);
    144 
    145    /* Round 12-15 */
    146    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    147    abcd = vsha1cq_u32(abcd, tmpE, t1);
    148    t1 = vaddq_u32(w1, K1);
    149    w2 = vsha1su1q_u32(w2, w1);
    150    w3 = vsha1su0q_u32(w3, w0, w1);
    151 
    152    /* Round 16-19 */
    153    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    154    abcd = vsha1cq_u32(abcd, e, t0);
    155    t0 = vaddq_u32(w2, K1);
    156    w3 = vsha1su1q_u32(w3, w2);
    157    w0 = vsha1su0q_u32(w0, w1, w2);
    158 
    159    /* Round 20-23 */
    160    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    161    abcd = vsha1pq_u32(abcd, tmpE, t1);
    162    t1 = vaddq_u32(w3, K1);
    163    w0 = vsha1su1q_u32(w0, w3);
    164    w1 = vsha1su0q_u32(w1, w2, w3);
    165 
    166    /* Round 24-27 */
    167    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    168    abcd = vsha1pq_u32(abcd, e, t0);
    169    t0 = vaddq_u32(w0, K1);
    170    w1 = vsha1su1q_u32(w1, w0);
    171    w2 = vsha1su0q_u32(w2, w3, w0);
    172 
    173    /* Round 28-31 */
    174    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    175    abcd = vsha1pq_u32(abcd, tmpE, t1);
    176    t1 = vaddq_u32(w1, K1);
    177    w2 = vsha1su1q_u32(w2, w1);
    178    w3 = vsha1su0q_u32(w3, w0, w1);
    179 
    180    /* Round 32-35 */
    181    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    182    abcd = vsha1pq_u32(abcd, e, t0);
    183    t0 = vaddq_u32(w2, K2);
    184    w3 = vsha1su1q_u32(w3, w2);
    185    w0 = vsha1su0q_u32(w0, w1, w2);
    186 
    187    /* Round 36-39 */
    188    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    189    abcd = vsha1pq_u32(abcd, tmpE, t1);
    190    t1 = vaddq_u32(w3, K2);
    191    w0 = vsha1su1q_u32(w0, w3);
    192    w1 = vsha1su0q_u32(w1, w2, w3);
    193 
    194    /* Round 40-43 */
    195    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    196    abcd = vsha1mq_u32(abcd, e, t0);
    197    t0 = vaddq_u32(w0, K2);
    198    w1 = vsha1su1q_u32(w1, w0);
    199    w2 = vsha1su0q_u32(w2, w3, w0);
    200 
    201    /* Round 44-47 */
    202    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    203    abcd = vsha1mq_u32(abcd, tmpE, t1);
    204    t1 = vaddq_u32(w1, K2);
    205    w2 = vsha1su1q_u32(w2, w1);
    206    w3 = vsha1su0q_u32(w3, w0, w1);
    207 
    208    /* Round 48-51 */
    209    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    210    abcd = vsha1mq_u32(abcd, e, t0);
    211    t0 = vaddq_u32(w2, K2);
    212    w3 = vsha1su1q_u32(w3, w2);
    213    w0 = vsha1su0q_u32(w0, w1, w2);
    214 
    215    /* Round 52-55 */
    216    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    217    abcd = vsha1mq_u32(abcd, tmpE, t1);
    218    t1 = vaddq_u32(w3, K3);
    219    w0 = vsha1su1q_u32(w0, w3);
    220    w1 = vsha1su0q_u32(w1, w2, w3);
    221 
    222    /* Round 56-59 */
    223    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    224    abcd = vsha1mq_u32(abcd, e, t0);
    225    t0 = vaddq_u32(w0, K3);
    226    w1 = vsha1su1q_u32(w1, w0);
    227    w2 = vsha1su0q_u32(w2, w3, w0);
    228 
    229    /* Round 60-63 */
    230    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    231    abcd = vsha1pq_u32(abcd, tmpE, t1);
    232    t1 = vaddq_u32(w1, K3);
    233    w2 = vsha1su1q_u32(w2, w1);
    234    w3 = vsha1su0q_u32(w3, w0, w1);
    235 
    236    /* Round 64-67 */
    237    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    238    abcd = vsha1pq_u32(abcd, e, t0);
    239    t0 = vaddq_u32(w2, K3);
    240    w3 = vsha1su1q_u32(w3, w2);
    241    w0 = vsha1su0q_u32(w0, w1, w2);
    242 
    243    /* Round 68-71 */
    244    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    245    abcd = vsha1pq_u32(abcd, tmpE, t1);
    246    t1 = vaddq_u32(w3, K3);
    247    w0 = vsha1su1q_u32(w0, w3);
    248 
    249    /* Round 72-75 */
    250    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    251    abcd = vsha1pq_u32(abcd, e, t0);
    252 
    253    /* Round 76-79 */
    254    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    255    abcd = vsha1pq_u32(abcd, tmpE, t1);
    256 
    257    e += origE;
    258    abcd = vaddq_u32(origABCD, abcd);
    259 
    260    vst1q_u32(&XH(0), abcd);
    261    XH(4) = e;
    262 }
    263 
    264 #endif /* USE_HW_SHA1 */