tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

gcm-arm32-neon.c (6997B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 #ifdef FREEBL_NO_DEPEND
      6 #include "stubs.h"
      7 #endif
      8 #include "blapii.h"
      9 #include "blapit.h"
     10 #include "gcm.h"
     11 #include "secerr.h"
     12 #include "prtypes.h"
     13 
     14 #if defined(IS_LITTLE_ENDIAN)
     15 
     16 #include <arm_neon.h>
     17 
     18 SECStatus
     19 gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf)
     20 {
     21    vst1_u8(outbuf, vrev64_u8(vcreate_u8(ghash->x_high)));
     22    vst1_u8(outbuf + 8, vrev64_u8(vcreate_u8(ghash->x_low)));
     23    return SECSuccess;
     24 }
     25 
     26 /* Carry-less multiplication. a * b = ret. */
     27 static inline uint8x16_t
     28 clmul(const uint8x8_t a, const uint8x8_t b)
     29 {
     30    uint8x16_t d, e, f, g, h, i, j, k, l, m, n;
     31    uint8x8_t t_high, t_low;
     32    uint8x16_t t0, t1, t2, t3;
     33    const uint8x8_t k16 = vcreate_u8(0xffff);
     34    const uint8x8_t k32 = vcreate_u8(0xffffffff);
     35    const uint8x8_t k48 = vcreate_u8(0xffffffffffff);
     36 
     37    // D = A * B
     38    d = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a),
     39                                      vreinterpret_p8_u8(b)));
     40    // E = A * B1
     41    e = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a),
     42                                      vreinterpret_p8_u8(vext_u8(b, b, 1))));
     43    // F = A1 * B
     44    f = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(vext_u8(a, a, 1)),
     45                                      vreinterpret_p8_u8(b)));
     46    // G = A * B2
     47    g = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a),
     48                                      vreinterpret_p8_u8(vext_u8(b, b, 2))));
     49    // H = A2 * B
     50    h = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(vext_u8(a, a, 2)),
     51                                      vreinterpret_p8_u8(b)));
     52    // I = A * B3
     53    i = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a),
     54                                      vreinterpret_p8_u8(vext_u8(b, b, 3))));
     55    // J = A3 * B
     56    j = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(vext_u8(a, a, 3)),
     57                                      vreinterpret_p8_u8(b)));
     58    // K = A * B4
     59    k = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a),
     60                                      vreinterpret_p8_u8(vext_u8(b, b, 4))));
     61    // L = E + F
     62    l = veorq_u8(e, f);
     63    // M = G + H
     64    m = veorq_u8(g, h);
     65    // N = I + J
     66    n = veorq_u8(i, j);
     67 
     68    // t0 = (L) (P0 + P1) << 8
     69    t_high = vget_high_u8(l);
     70    t_low = vget_low_u8(l);
     71    t_low = veor_u8(t_low, t_high);
     72    t_high = vand_u8(t_high, k48);
     73    t_low = veor_u8(t_low, t_high);
     74    t0 = vcombine_u8(t_low, t_high);
     75    t0 = vextq_u8(t0, t0, 15);
     76 
     77    // t1 = (M) (P2 + P3) << 16
     78    t_high = vget_high_u8(m);
     79    t_low = vget_low_u8(m);
     80    t_low = veor_u8(t_low, t_high);
     81    t_high = vand_u8(t_high, k32);
     82    t_low = veor_u8(t_low, t_high);
     83    t1 = vcombine_u8(t_low, t_high);
     84    t1 = vextq_u8(t1, t1, 14);
     85 
     86    // t2 = (N) (P4 + P5) << 24
     87    t_high = vget_high_u8(n);
     88    t_low = vget_low_u8(n);
     89    t_low = veor_u8(t_low, t_high);
     90    t_high = vand_u8(t_high, k16);
     91    t_low = veor_u8(t_low, t_high);
     92    t2 = vcombine_u8(t_low, t_high);
     93    t2 = vextq_u8(t2, t2, 13);
     94 
     95    // t3 = (K) (P6 + P7) << 32
     96    t_high = vget_high_u8(k);
     97    t_low = vget_low_u8(k);
     98    t_low = veor_u8(t_low, t_high);
     99    t_high = vdup_n_u8(0);
    100    t3 = vcombine_u8(t_low, t_high);
    101    t3 = vextq_u8(t3, t3, 12);
    102 
    103    t0 = veorq_u8(t0, t1);
    104    t2 = veorq_u8(t2, t3);
    105    return veorq_u8(veorq_u8(d, t0), t2);
    106 }
    107 
    108 SECStatus
    109 gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
    110                unsigned int count)
    111 {
    112    const uint8x8_t h_low = vcreate_u8(ghash->h_low);
    113    const uint8x8_t h_high = vcreate_u8(ghash->h_high);
    114    uint8x16_t ci;
    115    uint8x8_t ci_low;
    116    uint8x8_t ci_high;
    117    uint8x16_t z0, z2, z1a;
    118    uint8x16_t z_high, z_low;
    119    uint8x16_t t;
    120    int64x2_t t1, t2, t3;
    121    uint64x2_t z_low_l, z_low_r, z_high_l, z_high_r;
    122    size_t i;
    123 
    124    ci = vcombine_u8(vcreate_u8(ghash->x_low), vcreate_u8(ghash->x_high));
    125 
    126    for (i = 0; i < count; i++, buf += 16) {
    127        ci = veorq_u8(ci, vcombine_u8(vrev64_u8(vld1_u8(buf + 8)),
    128                                      vrev64_u8(vld1_u8(buf))));
    129        ci_high = vget_high_u8(ci);
    130        ci_low = vget_low_u8(ci);
    131 
    132        /* Do binary mult ghash->X = C * ghash->H (Karatsuba). */
    133        z0 = clmul(ci_low, h_low);
    134        z2 = clmul(ci_high, h_high);
    135        z1a = clmul(veor_u8(ci_high, ci_low), veor_u8(h_high, h_low));
    136        z1a = veorq_u8(z0, z1a);
    137        z1a = veorq_u8(z2, z1a);
    138        z_high = vcombine_u8(veor_u8(vget_low_u8(z2), vget_high_u8(z1a)),
    139                             vget_high_u8(z2));
    140        z_low = vcombine_u8(vget_low_u8(z0),
    141                            veor_u8(vget_high_u8(z0), vget_low_u8(z1a)));
    142 
    143        /* Shift one (multiply by x) as gcm spec is stupid. */
    144        z_low_l = vshlq_n_u64(vreinterpretq_u64_u8(z_low), 1);
    145        z_low_r = vshrq_n_u64(vreinterpretq_u64_u8(z_low), 63);
    146        z_high_l = vshlq_n_u64(vreinterpretq_u64_u8(z_high), 1);
    147        z_high_r = vshrq_n_u64(vreinterpretq_u64_u8(z_high), 63);
    148        z_low = vreinterpretq_u8_u64(
    149            vcombine_u64(vget_low_u64(z_low_l),
    150                         vorr_u64(vget_high_u64(z_low_l),
    151                                  vget_low_u64(z_low_r))));
    152        z_high = vreinterpretq_u8_u64(
    153            vcombine_u64(vorr_u64(vget_low_u64(z_high_l),
    154                                  vget_high_u64(z_low_r)),
    155                         vorr_u64(vget_high_u64(z_high_l),
    156                                  vget_low_u64(z_high_r))));
    157 
    158        /* Reduce */
    159        t1 = vshlq_n_s64(vreinterpretq_s64_u8(z_low), 57);
    160        t2 = vshlq_n_s64(vreinterpretq_s64_u8(z_low), 62);
    161        t3 = vshlq_n_s64(vreinterpretq_s64_u8(z_low), 63);
    162        t = vreinterpretq_u8_s64(veorq_s64(t1, veorq_s64(t2, t3)));
    163 
    164        z_low = vcombine_u8(vget_low_u8(z_low),
    165                            veor_u8(vget_high_u8(z_low), vget_low_u8(t)));
    166        z_high = vcombine_u8(veor_u8(vget_low_u8(z_high), vget_high_u8(t)),
    167                             vget_high_u8(z_high));
    168 
    169        t = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(z_low), 1));
    170        z_high = veorq_u8(z_high, z_low);
    171        z_low = veorq_u8(z_low, t);
    172        t = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(t), 6));
    173        z_low = vreinterpretq_u8_u64(
    174            vshrq_n_u64(vreinterpretq_u64_u8(z_low), 1));
    175        z_low = veorq_u8(z_low, z_high);
    176        ci = veorq_u8(z_low, t);
    177    }
    178 
    179    vst1_u8((uint8_t *)&ghash->x_high, vget_high_u8(ci));
    180    vst1_u8((uint8_t *)&ghash->x_low, vget_low_u8(ci));
    181    return SECSuccess;
    182 }
    183 
    184 SECStatus
    185 gcm_HashInit_hw(gcmHashContext *ghash)
    186 {
    187    ghash->ghash_mul = gcm_HashMult_hw;
    188    ghash->x_low = 0;
    189    ghash->x_high = 0;
    190    ghash->hw = PR_TRUE;
    191    return SECSuccess;
    192 }
    193 
    194 SECStatus
    195 gcm_HashZeroX_hw(gcmHashContext *ghash)
    196 {
    197    ghash->x_low = 0;
    198    ghash->x_high = 0;
    199    return SECSuccess;
    200 }
    201 
    202 #endif /* IS_LITTLE_ENDIAN */