gcm-arm32-neon.c (6997B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 #ifdef FREEBL_NO_DEPEND 6 #include "stubs.h" 7 #endif 8 #include "blapii.h" 9 #include "blapit.h" 10 #include "gcm.h" 11 #include "secerr.h" 12 #include "prtypes.h" 13 14 #if defined(IS_LITTLE_ENDIAN) 15 16 #include <arm_neon.h> 17 18 SECStatus 19 gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf) 20 { 21 vst1_u8(outbuf, vrev64_u8(vcreate_u8(ghash->x_high))); 22 vst1_u8(outbuf + 8, vrev64_u8(vcreate_u8(ghash->x_low))); 23 return SECSuccess; 24 } 25 26 /* Carry-less multiplication. a * b = ret. */ 27 static inline uint8x16_t 28 clmul(const uint8x8_t a, const uint8x8_t b) 29 { 30 uint8x16_t d, e, f, g, h, i, j, k, l, m, n; 31 uint8x8_t t_high, t_low; 32 uint8x16_t t0, t1, t2, t3; 33 const uint8x8_t k16 = vcreate_u8(0xffff); 34 const uint8x8_t k32 = vcreate_u8(0xffffffff); 35 const uint8x8_t k48 = vcreate_u8(0xffffffffffff); 36 37 // D = A * B 38 d = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a), 39 vreinterpret_p8_u8(b))); 40 // E = A * B1 41 e = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a), 42 vreinterpret_p8_u8(vext_u8(b, b, 1)))); 43 // F = A1 * B 44 f = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(vext_u8(a, a, 1)), 45 vreinterpret_p8_u8(b))); 46 // G = A * B2 47 g = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a), 48 vreinterpret_p8_u8(vext_u8(b, b, 2)))); 49 // H = A2 * B 50 h = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(vext_u8(a, a, 2)), 51 vreinterpret_p8_u8(b))); 52 // I = A * B3 53 i = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a), 54 vreinterpret_p8_u8(vext_u8(b, b, 3)))); 55 // J = A3 * B 56 j = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(vext_u8(a, a, 3)), 57 vreinterpret_p8_u8(b))); 58 // K = A * B4 59 k = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a), 60 vreinterpret_p8_u8(vext_u8(b, b, 4)))); 61 // L = E + F 62 l = veorq_u8(e, f); 63 // M = G + H 64 m = veorq_u8(g, h); 65 // N = I + J 66 n = veorq_u8(i, j); 67 68 // t0 = (L) (P0 + P1) << 8 69 t_high = vget_high_u8(l); 70 t_low = vget_low_u8(l); 71 t_low = veor_u8(t_low, t_high); 72 t_high = vand_u8(t_high, k48); 73 t_low = veor_u8(t_low, t_high); 74 t0 = vcombine_u8(t_low, t_high); 75 t0 = vextq_u8(t0, t0, 15); 76 77 // t1 = (M) (P2 + P3) << 16 78 t_high = vget_high_u8(m); 79 t_low = vget_low_u8(m); 80 t_low = veor_u8(t_low, t_high); 81 t_high = vand_u8(t_high, k32); 82 t_low = veor_u8(t_low, t_high); 83 t1 = vcombine_u8(t_low, t_high); 84 t1 = vextq_u8(t1, t1, 14); 85 86 // t2 = (N) (P4 + P5) << 24 87 t_high = vget_high_u8(n); 88 t_low = vget_low_u8(n); 89 t_low = veor_u8(t_low, t_high); 90 t_high = vand_u8(t_high, k16); 91 t_low = veor_u8(t_low, t_high); 92 t2 = vcombine_u8(t_low, t_high); 93 t2 = vextq_u8(t2, t2, 13); 94 95 // t3 = (K) (P6 + P7) << 32 96 t_high = vget_high_u8(k); 97 t_low = vget_low_u8(k); 98 t_low = veor_u8(t_low, t_high); 99 t_high = vdup_n_u8(0); 100 t3 = vcombine_u8(t_low, t_high); 101 t3 = vextq_u8(t3, t3, 12); 102 103 t0 = veorq_u8(t0, t1); 104 t2 = veorq_u8(t2, t3); 105 return veorq_u8(veorq_u8(d, t0), t2); 106 } 107 108 SECStatus 109 gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, 110 unsigned int count) 111 { 112 const uint8x8_t h_low = vcreate_u8(ghash->h_low); 113 const uint8x8_t h_high = vcreate_u8(ghash->h_high); 114 uint8x16_t ci; 115 uint8x8_t ci_low; 116 uint8x8_t ci_high; 117 uint8x16_t z0, z2, z1a; 118 uint8x16_t z_high, z_low; 119 uint8x16_t t; 120 int64x2_t t1, t2, t3; 121 uint64x2_t z_low_l, z_low_r, z_high_l, z_high_r; 122 size_t i; 123 124 ci = vcombine_u8(vcreate_u8(ghash->x_low), vcreate_u8(ghash->x_high)); 125 126 for (i = 0; i < count; i++, buf += 16) { 127 ci = veorq_u8(ci, vcombine_u8(vrev64_u8(vld1_u8(buf + 8)), 128 vrev64_u8(vld1_u8(buf)))); 129 ci_high = vget_high_u8(ci); 130 ci_low = vget_low_u8(ci); 131 132 /* Do binary mult ghash->X = C * ghash->H (Karatsuba). */ 133 z0 = clmul(ci_low, h_low); 134 z2 = clmul(ci_high, h_high); 135 z1a = clmul(veor_u8(ci_high, ci_low), veor_u8(h_high, h_low)); 136 z1a = veorq_u8(z0, z1a); 137 z1a = veorq_u8(z2, z1a); 138 z_high = vcombine_u8(veor_u8(vget_low_u8(z2), vget_high_u8(z1a)), 139 vget_high_u8(z2)); 140 z_low = vcombine_u8(vget_low_u8(z0), 141 veor_u8(vget_high_u8(z0), vget_low_u8(z1a))); 142 143 /* Shift one (multiply by x) as gcm spec is stupid. */ 144 z_low_l = vshlq_n_u64(vreinterpretq_u64_u8(z_low), 1); 145 z_low_r = vshrq_n_u64(vreinterpretq_u64_u8(z_low), 63); 146 z_high_l = vshlq_n_u64(vreinterpretq_u64_u8(z_high), 1); 147 z_high_r = vshrq_n_u64(vreinterpretq_u64_u8(z_high), 63); 148 z_low = vreinterpretq_u8_u64( 149 vcombine_u64(vget_low_u64(z_low_l), 150 vorr_u64(vget_high_u64(z_low_l), 151 vget_low_u64(z_low_r)))); 152 z_high = vreinterpretq_u8_u64( 153 vcombine_u64(vorr_u64(vget_low_u64(z_high_l), 154 vget_high_u64(z_low_r)), 155 vorr_u64(vget_high_u64(z_high_l), 156 vget_low_u64(z_high_r)))); 157 158 /* Reduce */ 159 t1 = vshlq_n_s64(vreinterpretq_s64_u8(z_low), 57); 160 t2 = vshlq_n_s64(vreinterpretq_s64_u8(z_low), 62); 161 t3 = vshlq_n_s64(vreinterpretq_s64_u8(z_low), 63); 162 t = vreinterpretq_u8_s64(veorq_s64(t1, veorq_s64(t2, t3))); 163 164 z_low = vcombine_u8(vget_low_u8(z_low), 165 veor_u8(vget_high_u8(z_low), vget_low_u8(t))); 166 z_high = vcombine_u8(veor_u8(vget_low_u8(z_high), vget_high_u8(t)), 167 vget_high_u8(z_high)); 168 169 t = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(z_low), 1)); 170 z_high = veorq_u8(z_high, z_low); 171 z_low = veorq_u8(z_low, t); 172 t = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(t), 6)); 173 z_low = vreinterpretq_u8_u64( 174 vshrq_n_u64(vreinterpretq_u64_u8(z_low), 1)); 175 z_low = veorq_u8(z_low, z_high); 176 ci = veorq_u8(z_low, t); 177 } 178 179 vst1_u8((uint8_t *)&ghash->x_high, vget_high_u8(ci)); 180 vst1_u8((uint8_t *)&ghash->x_low, vget_low_u8(ci)); 181 return SECSuccess; 182 } 183 184 SECStatus 185 gcm_HashInit_hw(gcmHashContext *ghash) 186 { 187 ghash->ghash_mul = gcm_HashMult_hw; 188 ghash->x_low = 0; 189 ghash->x_high = 0; 190 ghash->hw = PR_TRUE; 191 return SECSuccess; 192 } 193 194 SECStatus 195 gcm_HashZeroX_hw(gcmHashContext *ghash) 196 { 197 ghash->x_low = 0; 198 ghash->x_high = 0; 199 return SECSuccess; 200 } 201 202 #endif /* IS_LITTLE_ENDIAN */