/* sha1-armv8.c */
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 #ifdef USE_HW_SHA1 6 7 #ifndef __ARM_FEATURE_CRYPTO 8 #error "Compiler option is invalid" 9 #endif 10 11 #ifdef FREEBL_NO_DEPEND 12 #include "stubs.h" 13 #endif 14 15 #include <arm_neon.h> 16 #include <memory.h> 17 #include "blapi.h" 18 #include "sha_fast.h" 19 20 #if !defined(SHA_PUT_W_IN_STACK) 21 #define H2X 11 22 #else 23 #define H2X 0 24 #endif 25 26 static void shaCompress(SHA_HW_t *X, const PRUint32 *datain); 27 28 void 29 SHA1_Compress_Native(SHA1Context *ctx) 30 { 31 shaCompress(&ctx->H[H2X], ctx->u.w); 32 } 33 34 /* 35 * SHA: Add data to context. 36 */ 37 void 38 SHA1_Update_Native(SHA1Context *ctx, const unsigned char *dataIn, unsigned int len) 39 { 40 unsigned int lenB; 41 unsigned int togo; 42 43 if (!len) { 44 return; 45 } 46 47 /* accumulate the byte count. */ 48 lenB = (unsigned int)(ctx->size) & 63U; 49 50 ctx->size += len; 51 52 /* 53 * Read the data into W and process blocks as they get full 54 */ 55 if (lenB > 0) { 56 togo = 64U - lenB; 57 if (len < togo) { 58 togo = len; 59 } 60 memcpy(ctx->u.b + lenB, dataIn, togo); 61 len -= togo; 62 dataIn += togo; 63 lenB = (lenB + togo) & 63U; 64 if (!lenB) { 65 shaCompress(&ctx->H[H2X], ctx->u.w); 66 } 67 } 68 69 while (len >= 64U) { 70 len -= 64U; 71 shaCompress(&ctx->H[H2X], (PRUint32 *)dataIn); 72 dataIn += 64U; 73 } 74 75 if (len) { 76 memcpy(ctx->u.b, dataIn, len); 77 } 78 } 79 80 /* 81 * SHA: Compression function, unrolled. 
 */
static void
shaCompress(SHA_HW_t *X, const PRUint32 *inbuf)
{
/* X points H2X words before H within the SHA_HW_t array; XH(n) is H[n]. */
#define XH(n) X[n - H2X]

    /* SHA-1 round constants, one per 20-round stage. */
    const uint32x4_t K0 = vdupq_n_u32(0x5a827999);
    const uint32x4_t K1 = vdupq_n_u32(0x6ed9eba1);
    const uint32x4_t K2 = vdupq_n_u32(0x8f1bbcdc);
    const uint32x4_t K3 = vdupq_n_u32(0xca62c1d6);

    /* Working state: a,b,c,d packed in one vector, e carried separately. */
    uint32x4_t abcd = vld1q_u32(&XH(0));
    PRUint32 e = XH(4);

    /* Saved so the chaining values can be added back at the end. */
    const uint32x4_t origABCD = abcd;
    const PRUint32 origE = e;

    uint32x4_t w0 = vld1q_u32(inbuf);
    uint32x4_t w1 = vld1q_u32(inbuf + 4);
    uint32x4_t w2 = vld1q_u32(inbuf + 8);
    uint32x4_t w3 = vld1q_u32(inbuf + 12);

    /* Byte-swap each 32-bit word: SHA-1 message words are big-endian. */
    w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
    w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
    w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
    w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));

    uint32x4_t t0 = vaddq_u32(w0, K0);
    uint32x4_t t1 = vaddq_u32(w1, K0);

    PRUint32 tmpE;

    /*
     * Using the following ARM instructions to accelerate SHA1
     *
     * sha1c for round 0 - 20
     * sha1p for round 20 - 40
     * sha1m for round 40 - 60
     * sha1p for round 60 - 80
     * sha1su0 and sha1su1 for message schedule
     * sha1h for rotate left 30
     *
     * e and tmpE alternate as the live "e" input so each round group can
     * compute the next rotated e (sha1h) before abcd is overwritten.
     */

    /* Round 0-3 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K0);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 4-7 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K0);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 8-11 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K0);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 12-15 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K1);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 16-19 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K1);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 20-23 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K1);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 24-27 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K1);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 28-31 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K1);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 32-35 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K2);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 36-39 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K2);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 40-43 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K2);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 44-47 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K2);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 48-51 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K2);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 52-55 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K3);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 56-59 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K3);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 60-63 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K3);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 64-67 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K3);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 68-71 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K3);
    w0 = vsha1su1q_u32(w0, w3);

    /* Round 72-75 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);

    /* Round 76-79 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);

    /* Add the chaining values back in and store the new state. */
    e += origE;
    abcd = vaddq_u32(origABCD, abcd);

    vst1q_u32(&XH(0), abcd);
    XH(4) = e;
}

#endif /* USE_HW_SHA1 */