curve25519-donna-sse2.h (58262B)
/*
	Public domain by Andrew M. <liquidsun@gmail.com>
	See: https://github.com/floodyberry/curve25519-donna

	SSE2 curve25519 implementation
*/

#include <emmintrin.h>
typedef __m128i xmmi;

typedef union packedelem8_t {
	unsigned char u[16];
	xmmi v;
} packedelem8;

typedef union packedelem32_t {
	uint32_t u[4];
	xmmi v;
} packedelem32;

typedef union packedelem64_t {
	uint64_t u[2];
	xmmi v;
} packedelem64;

/* 10 elements + an extra 2 to fit in 3 xmm registers */
typedef uint32_t bignum25519[12];
typedef packedelem32 packed32bignum25519[5];
typedef packedelem64 packed64bignum25519[10];

static const packedelem32 bot32bitmask = {{0xffffffff, 0x00000000, 0xffffffff, 0x00000000}};
static const packedelem32 top64bitmask = {{0x00000000, 0x00000000, 0xffffffff, 0xffffffff}};
static const packedelem32 bot64bitmask = {{0xffffffff, 0xffffffff, 0x00000000, 0x00000000}};

/* reduction masks */
static const packedelem64 packedmask26 = {{0x03ffffff, 0x03ffffff}};
static const packedelem64 packedmask25 = {{0x01ffffff, 0x01ffffff}};
static const packedelem32 packedmask26262626 = {{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff}};
static const packedelem32 packedmask25252525 = {{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}};

/* multipliers */
static const packedelem64 packednineteen = {{19, 19}};
static const packedelem64 packedthirtyeight = {{38, 38}};
static const packedelem64 packed3819 = {{19*2,19}};

/* 2*(2^255 - 19) = 0 mod p */
static const packedelem32 packed2p0 = {{0x7ffffda,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p1 = {{0x7fffffe,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p2 = {{0x7fffffe,0x3fffffe,0x0000000,0x0000000}};

static const packedelem32 packed32packed2p0 = {{0x7ffffda,0x7ffffda,0x3fffffe,0x3fffffe}};
static const packedelem32 packed32packed2p1 = {{0x7fffffe,0x7fffffe,0x3fffffe,0x3fffffe}};

/* 4*(2^255 - 19) = 0 mod p */
static const packedelem32 packed4p0 = {{0xfffffb4,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p1 = {{0xffffffc,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p2 = {{0xffffffc,0x7fffffc,0x0000000,0x0000000}};

static const packedelem32 packed32packed4p0 = {{0xfffffb4,0xfffffb4,0x7fffffc,0x7fffffc}};
static const packedelem32 packed32packed4p1 = {{0xffffffc,0xffffffc,0x7fffffc,0x7fffffc}};
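
/*
 * A bignum25519 stores a field element mod p = 2^255 - 19 as ten limbs in
 * radix 2^25.5: even limbs hold 26 bits, odd limbs hold 25. Two zero limbs
 * of padding round the element up to three xmm registers. Since
 * 2^255 = 19 (mod p), anything carried out of the top limb re-enters limb 0
 * multiplied by 19; the 38 = 2*19 multiplier appears where such a fold
 * coincides with a doubled cross term.
 */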

/* out = in */
DONNA_INLINE static void
curve25519_copy(bignum25519 out, const bignum25519 in) {
	xmmi x0,x1,x2;
	x0 = _mm_load_si128((xmmi*)in + 0);
	x1 = _mm_load_si128((xmmi*)in + 1);
	x2 = _mm_load_si128((xmmi*)in + 2);
	_mm_store_si128((xmmi*)out + 0, x0);
	_mm_store_si128((xmmi*)out + 1, x1);
	_mm_store_si128((xmmi*)out + 2, x2);
}

/* out = a + b */
DONNA_INLINE static void
curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_add_epi32(a0, b0);
	a1 = _mm_add_epi32(a1, b1);
	a2 = _mm_add_epi32(a2, b2);
	_mm_store_si128((xmmi*)out + 0, a0);
	_mm_store_si128((xmmi*)out + 1, a1);
	_mm_store_si128((xmmi*)out + 2, a2);
}

#define curve25519_add_after_basic curve25519_add_reduce
DONNA_INLINE static void
curve25519_add_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_add_epi32(a0, b0);
	a1 = _mm_add_epi32(a1, b1);
	a2 = _mm_add_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
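
/*
 * The carry chain above is the reduction pattern used throughout this
 * file: limbs travel two per vector, each limb passes its overflow (bits
 * above 26 for even limbs, above 25 for odd ones) to its successor, and
 * the carry out of limb 9 is multiplied by 19 and added back into limb 0.
 */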

DONNA_INLINE static void
curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2;
	xmmi r0,r1;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	a0 = _mm_add_epi32(a0, packed2p0.v);
	a1 = _mm_add_epi32(a1, packed2p1.v);
	a2 = _mm_add_epi32(a2, packed2p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(2,2,0,0)), bot32bitmask.v);
	r1 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3,3,1,1)), bot32bitmask.v);

	c1 = _mm_srli_epi32(r0, 26);
	c2 = _mm_srli_epi32(r1, 25);
	r0 = _mm_and_si128(r0, packedmask26.v);
	r1 = _mm_and_si128(r1, packedmask25.v);
	r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
	r1 = _mm_add_epi32(r1, c1);

	a0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
	a1 = _mm_add_epi32(a1, _mm_srli_si128(c2, 8));

	_mm_store_si128((xmmi*)out + 0, a0);
	_mm_store_si128((xmmi*)out + 1, a1);
	_mm_store_si128((xmmi*)out + 2, a2);
}
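
/*
 * Subtractions first add a multiple of p to the minuend so that every limb
 * stays nonnegative without any borrow handling; 2*p (packed2p*) suffices
 * for reduced inputs, while curve25519_sub_after_basic below adds 4*p
 * (packed4p*) because its inputs come from an unreduced addition and may
 * be roughly twice as large.
 */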

DONNA_INLINE static void
curve25519_sub_after_basic(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	a0 = _mm_add_epi32(a0, packed4p0.v);
	a1 = _mm_add_epi32(a1, packed4p1.v);
	a2 = _mm_add_epi32(a2, packed4p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

DONNA_INLINE static void
curve25519_sub_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	a0 = _mm_add_epi32(a0, packed2p0.v);
	a1 = _mm_add_epi32(a1, packed2p1.v);
	a2 = _mm_add_epi32(a2, packed2p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

DONNA_INLINE static void
curve25519_neg(bignum25519 out, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = packed2p0.v;
	a1 = packed2p1.v;
	a2 = packed2p2.v;
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}

/* Multiply two numbers: out = r * s */
static void
curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
	xmmi m01,m23,m45,m67,m89;
	xmmi m0123,m4567;
	xmmi s0123,s4567;
	xmmi s01,s23,s45,s67,s89;
	xmmi s12,s34,s56,s78,s9;
	xmmi r0,r2,r4,r6,r8;
	xmmi r1,r3,r5,r7,r9;
	xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
	xmmi c1,c2,c3;

	s0123 = _mm_load_si128((xmmi*)s + 0);
	s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
	s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
	s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
	s4567 = _mm_load_si128((xmmi*)s + 1);
	s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
	s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
	s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
	s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
	s89 = _mm_load_si128((xmmi*)s + 2);
	s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
	s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
	s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));

	r0 = _mm_load_si128((xmmi*)r + 0);
	r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
	r1 = _mm_add_epi64(r1, _mm_and_si128(r1, top64bitmask.v));
	r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
	r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
	r3 = _mm_add_epi64(r3, _mm_and_si128(r3, top64bitmask.v));
	r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
	r4 = _mm_load_si128((xmmi*)r + 1);
	r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
	r5 = _mm_add_epi64(r5, _mm_and_si128(r5, top64bitmask.v));
	r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
	r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
	r7 = _mm_add_epi64(r7, _mm_and_si128(r7, top64bitmask.v));
	r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
	r8 = _mm_load_si128((xmmi*)r + 2);
	r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
	r9 = _mm_add_epi64(r9, _mm_and_si128(r9, top64bitmask.v));
	r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));

	m01 = _mm_mul_epu32(r1,s01);
	m23 = _mm_mul_epu32(r1,s23);
	m45 = _mm_mul_epu32(r1,s45);
	m67 = _mm_mul_epu32(r1,s67);
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
	m89 = _mm_mul_epu32(r1,s89);
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));

	/* shift up */
	m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
	m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
	m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
	m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
	m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));

	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));

	r219 = _mm_mul_epu32(r2, packednineteen.v);
	r419 = _mm_mul_epu32(r4, packednineteen.v);
	r619 = _mm_mul_epu32(r6, packednineteen.v);
	r819 = _mm_mul_epu32(r8, packednineteen.v);
	r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
	r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
	r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
	r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
	r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);

	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));

	r0 = _mm_unpacklo_epi64(m01, m45);
	r1 = _mm_unpackhi_epi64(m01, m45);
	r2 = _mm_unpacklo_epi64(m23, m67);
	r3 = _mm_unpackhi_epi64(m23, m67);
	r4 = _mm_unpacklo_epi64(m89, m89);
	r5 = _mm_unpackhi_epi64(m89, m89);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	m0123 = _mm_unpacklo_epi32(r0, r1);
	m4567 = _mm_unpackhi_epi32(r0, r1);
	m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
	m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
	m89 = _mm_unpackhi_epi32(r4, r5);

	_mm_store_si128((xmmi*)out + 0, m0123);
	_mm_store_si128((xmmi*)out + 1, m4567);
	_mm_store_si128((xmmi*)out + 2, m89);
}
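
/*
 * curve25519_mul is a 10x10 schoolbook multiply, two 32x32->64 products
 * per _mm_mul_epu32. Partial products whose limb indices sum to 10 or more
 * are folded back with a factor of 19 via the r*19 temporaries. The
 * top64bitmask additions pre-double the odd limbs of r in one lane: in
 * this radix the product of two 25-bit odd limbs falls one bit of weight
 * short of its target limb, so one of the factors must be doubled.
 */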

DONNA_NOINLINE static void
curve25519_mul_noinline(bignum25519 out, const bignum25519 r, const bignum25519 s) {
	curve25519_mul(out, r, s);
}

#define curve25519_square(r, n) curve25519_square_times(r, n, 1)
static void
curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
	xmmi m01,m23,m45,m67,m89;
	xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
	xmmi r0a,r1a,r2a,r3a,r7a,r9a;
	xmmi r0123,r4567;
	xmmi r01,r23,r45,r67,r6x,r89,r8x;
	xmmi r12,r34,r56,r78,r9x;
	xmmi r5619;
	xmmi c1,c2,c3;

	r0123 = _mm_load_si128((xmmi*)in + 0);
	r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
	r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
	r4567 = _mm_load_si128((xmmi*)in + 1);
	r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
	r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
	r89 = _mm_load_si128((xmmi*)in + 2);
	r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));

	do {
		r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
		r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
		r0 = _mm_add_epi64(r0, _mm_and_si128(r0, top64bitmask.v));
		r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
		r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
		r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
		r2 = _mm_add_epi64(r2, _mm_and_si128(r2, top64bitmask.v));
		r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
		r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
		r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
		r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
		r4 = _mm_add_epi64(r4, _mm_and_si128(r4, top64bitmask.v));
		r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
		r5619 = _mm_mul_epu32(r56, packednineteen.v);
		r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
		r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
		r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
		r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
		r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
		r7 = _mm_mul_epu32(r7, packed3819.v);
		r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
		r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
		r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
		r8 = _mm_mul_epu32(r8, packednineteen.v);
		r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
		r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
		r9 = _mm_mul_epu32(r9, packed3819.v);
		r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));

		m01 = _mm_mul_epu32(r01, r0);
		m23 = _mm_mul_epu32(r23, r0a);
		m45 = _mm_mul_epu32(r45, r0a);
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
		r23 = _mm_slli_epi32(r23, 1);
		m67 = _mm_mul_epu32(r67, r0a);
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
		m89 = _mm_mul_epu32(r89, r0a);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
		r67 = _mm_slli_epi32(r67, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
		r45 = _mm_slli_epi32(r45, 1);

		r1 = _mm_slli_epi32(r1, 1);
		r3 = _mm_slli_epi32(r3, 1);
		r1a = _mm_add_epi64(r1, _mm_and_si128(r1, bot64bitmask.v));
		r3a = _mm_add_epi64(r3, _mm_and_si128(r3, bot64bitmask.v));

		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
		r34 = _mm_slli_epi32(r34, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
		r78 = _mm_slli_epi32(r78, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
		r56 = _mm_slli_epi32(r56, 1);

		m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));

		r0 = _mm_unpacklo_epi64(m01, m45);
		r1 = _mm_unpackhi_epi64(m01, m45);
		r2 = _mm_unpacklo_epi64(m23, m67);
		r3 = _mm_unpackhi_epi64(m23, m67);
		r4 = _mm_unpacklo_epi64(m89, m89);
		r5 = _mm_unpackhi_epi64(m89, m89);

		c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
		c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
		c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
		c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
		c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

		r01 = _mm_unpacklo_epi64(r0, r1);
		r45 = _mm_unpackhi_epi64(r0, r1);
		r23 = _mm_unpacklo_epi64(r2, r3);
		r67 = _mm_unpackhi_epi64(r2, r3);
		r89 = _mm_unpackhi_epi64(r4, r5);
	} while (--count);

	r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
	r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
	r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
	r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
	r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));

	_mm_store_si128((xmmi*)r + 0, r0123);
	_mm_store_si128((xmmi*)r + 1, r4567);
	_mm_store_si128((xmmi*)r + 2, r89);
}
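
/*
 * Squaring forms only the upper triangle of the product and doubles the
 * cross terms as it goes (the running _mm_slli_epi32 shifts); wrapped
 * terms are pre-multiplied by 19 or 38 = 2*19 (packednineteen, packed3819)
 * depending on whether the doubling has already been applied. With
 * count > 1 the intermediate result stays in registers between iterations,
 * which is why the repeated-squaring loop lives inside this function.
 */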

DONNA_INLINE static void
curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
	xmmi x0,x1,x2,z0,z1,z2;

	x0 = _mm_load_si128((xmmi *)(x + 0));
	x1 = _mm_load_si128((xmmi *)(x + 4));
	x2 = _mm_load_si128((xmmi *)(x + 8));
	z0 = _mm_load_si128((xmmi *)(z + 0));
	z1 = _mm_load_si128((xmmi *)(z + 4));
	z2 = _mm_load_si128((xmmi *)(z + 8));

	out[0].v = _mm_unpacklo_epi32(x0, z0);
	out[1].v = _mm_unpackhi_epi32(x0, z0);
	out[2].v = _mm_unpacklo_epi32(x1, z1);
	out[3].v = _mm_unpackhi_epi32(x1, z1);
	out[4].v = _mm_unpacklo_epi32(x2, z2);
}

DONNA_INLINE static void
curve25519_untangle32(bignum25519 x, bignum25519 z, const packedelem32 *in) {
	xmmi t0,t1,t2,t3,t4,zero;

	t0 = _mm_shuffle_epi32(in[0].v, _MM_SHUFFLE(3,1,2,0));
	t1 = _mm_shuffle_epi32(in[1].v, _MM_SHUFFLE(3,1,2,0));
	t2 = _mm_shuffle_epi32(in[2].v, _MM_SHUFFLE(3,1,2,0));
	t3 = _mm_shuffle_epi32(in[3].v, _MM_SHUFFLE(3,1,2,0));
	t4 = _mm_shuffle_epi32(in[4].v, _MM_SHUFFLE(3,1,2,0));
	zero = _mm_setzero_si128();
	_mm_store_si128((xmmi *)x + 0, _mm_unpacklo_epi64(t0, t1));
	_mm_store_si128((xmmi *)x + 1, _mm_unpacklo_epi64(t2, t3));
	_mm_store_si128((xmmi *)x + 2, _mm_unpacklo_epi64(t4, zero));
	_mm_store_si128((xmmi *)z + 0, _mm_unpackhi_epi64(t0, t1));
	_mm_store_si128((xmmi *)z + 1, _mm_unpackhi_epi64(t2, t3));
	_mm_store_si128((xmmi *)z + 2, _mm_unpackhi_epi64(t4, zero));
}
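
/*
 * "Tangling" interleaves the limbs of two field elements x and z so that
 * matching limbs share a vector. The Montgomery ladder operates on (x,z)
 * pairs, so a single packed operation then advances both coordinates at
 * once; untangling reverses the interleave. The packedelem32 form keeps
 * two limbs of each element per vector, while the packedelem64 form below
 * keeps one limb per 64-bit lane.
 */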

DONNA_INLINE static void
curve25519_add_reduce_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3,s4,s5;
	xmmi c1,c2;

	r0 = _mm_add_epi32(r[0].v, s[0].v);
	r1 = _mm_add_epi32(r[1].v, s[1].v);
	r2 = _mm_add_epi32(r[2].v, s[2].v);
	r3 = _mm_add_epi32(r[3].v, s[3].v);
	r4 = _mm_add_epi32(r[4].v, s[4].v);

	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
	s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
	s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */

	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
	c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
	c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);

	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}

DONNA_INLINE static void
curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	out[0].v = _mm_add_epi32(r[0].v, s[0].v);
	out[1].v = _mm_add_epi32(r[1].v, s[1].v);
	out[2].v = _mm_add_epi32(r[2].v, s[2].v);
	out[3].v = _mm_add_epi32(r[3].v, s[3].v);
	out[4].v = _mm_add_epi32(r[4].v, s[4].v);
}

DONNA_INLINE static void
curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3;
	xmmi c1,c2;

	r0 = _mm_add_epi32(r[0].v, packed32packed2p0.v);
	r1 = _mm_add_epi32(r[1].v, packed32packed2p1.v);
	r2 = _mm_add_epi32(r[2].v, packed32packed2p1.v);
	r3 = _mm_add_epi32(r[3].v, packed32packed2p1.v);
	r4 = _mm_add_epi32(r[4].v, packed32packed2p1.v);
	r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
	r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
	r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
	r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
	r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */

	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */

	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8));

	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = r4;
}

DONNA_INLINE static void
curve25519_sub_after_basic_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3,s4,s5;
	xmmi c1,c2;

	r0 = _mm_add_epi32(r[0].v, packed32packed4p0.v);
	r1 = _mm_add_epi32(r[1].v, packed32packed4p1.v);
	r2 = _mm_add_epi32(r[2].v, packed32packed4p1.v);
	r3 = _mm_add_epi32(r[3].v, packed32packed4p1.v);
	r4 = _mm_add_epi32(r[4].v, packed32packed4p1.v);
	r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
	r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
	r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
	r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
	r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */

	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
	s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
	s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */

	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
	c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
	c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);

	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}

DONNA_INLINE static void
curve25519_tangle64_from32(packedelem64 *a, packedelem64 *b, const packedelem32 *c, const packedelem32 *d) {
	xmmi c0,c1,c2,c3,c4,c5,t;
	xmmi d0,d1,d2,d3,d4,d5;
	xmmi t0,t1,t2,t3,t4,zero;

	t0 = _mm_shuffle_epi32(c[0].v, _MM_SHUFFLE(3,1,2,0));
	t1 = _mm_shuffle_epi32(c[1].v, _MM_SHUFFLE(3,1,2,0));
	t2 = _mm_shuffle_epi32(d[0].v, _MM_SHUFFLE(3,1,2,0));
	t3 = _mm_shuffle_epi32(d[1].v, _MM_SHUFFLE(3,1,2,0));
	c0 = _mm_unpacklo_epi64(t0, t1);
	c3 = _mm_unpackhi_epi64(t0, t1);
	d0 = _mm_unpacklo_epi64(t2, t3);
	d3 = _mm_unpackhi_epi64(t2, t3);
	t = _mm_unpacklo_epi64(c0, d0); a[0].v = t; a[1].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c0, d0); a[2].v = t; a[3].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c3, d3); b[0].v = t; b[1].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c3, d3); b[2].v = t; b[3].v = _mm_srli_epi64(t, 32);

	t0 = _mm_shuffle_epi32(c[2].v, _MM_SHUFFLE(3,1,2,0));
	t1 = _mm_shuffle_epi32(c[3].v, _MM_SHUFFLE(3,1,2,0));
	t2 = _mm_shuffle_epi32(d[2].v, _MM_SHUFFLE(3,1,2,0));
	t3 = _mm_shuffle_epi32(d[3].v, _MM_SHUFFLE(3,1,2,0));
	c1 = _mm_unpacklo_epi64(t0, t1);
	c4 = _mm_unpackhi_epi64(t0, t1);
	d1 = _mm_unpacklo_epi64(t2, t3);
	d4 = _mm_unpackhi_epi64(t2, t3);
	t = _mm_unpacklo_epi64(c1, d1); a[4].v = t; a[5].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c1, d1); a[6].v = t; a[7].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c4, d4); b[4].v = t; b[5].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c4, d4); b[6].v = t; b[7].v = _mm_srli_epi64(t, 32);

	t4 = _mm_shuffle_epi32(c[4].v, _MM_SHUFFLE(3,1,2,0));
	zero = _mm_setzero_si128();
	c2 = _mm_unpacklo_epi64(t4, zero);
	c5 = _mm_unpackhi_epi64(t4, zero);
	t4 = _mm_shuffle_epi32(d[4].v, _MM_SHUFFLE(3,1,2,0));
	d2 = _mm_unpacklo_epi64(t4, zero);
	d5 = _mm_unpackhi_epi64(t4, zero);
	t = _mm_unpacklo_epi64(c2, d2); a[8].v = t; a[9].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c5, d5); b[8].v = t; b[9].v = _mm_srli_epi64(t, 32);
}

DONNA_INLINE static void
curve25519_tangle64(packedelem64 *out, const bignum25519 x, const bignum25519 z) {
	xmmi x0,x1,x2,z0,z1,z2,t;

	x0 = _mm_load_si128((xmmi *)x + 0);
	x1 = _mm_load_si128((xmmi *)x + 1);
	x2 = _mm_load_si128((xmmi *)x + 2);
	z0 = _mm_load_si128((xmmi *)z + 0);
	z1 = _mm_load_si128((xmmi *)z + 1);
	z2 = _mm_load_si128((xmmi *)z + 2);

	t = _mm_unpacklo_epi64(x0, z0); out[0].v = t; out[1].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(x0, z0); out[2].v = t; out[3].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(x1, z1); out[4].v = t; out[5].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(x1, z1); out[6].v = t; out[7].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(x2, z2); out[8].v = t; out[9].v = _mm_srli_epi64(t, 32);
}
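
/*
 * tangle64_from32 repacks two tangled packed32 elements (c and d) into
 * packed64 form, pairing their low (x) halves in a and their high (z)
 * halves in b, so the ladder can switch representations without a round
 * trip through bignum25519.
 */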

DONNA_INLINE static void
curve25519_tangleone64(packedelem64 *out, const bignum25519 x) {
	xmmi x0,x1,x2;

	x0 = _mm_load_si128((xmmi *)(x + 0));
	x1 = _mm_load_si128((xmmi *)(x + 4));
	x2 = _mm_load_si128((xmmi *)(x + 8));

	out[0].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(0,0,0,0));
	out[1].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(1,1,1,1));
	out[2].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(2,2,2,2));
	out[3].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(3,3,3,3));
	out[4].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0,0,0,0));
	out[5].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,1,1,1));
	out[6].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2,2,2,2));
	out[7].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(3,3,3,3));
	out[8].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0,0,0,0));
	out[9].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1,1,1,1));
}

DONNA_INLINE static void
curve25519_swap64(packedelem64 *out) {
	out[0].v = _mm_shuffle_epi32(out[0].v, _MM_SHUFFLE(1,0,3,2));
	out[1].v = _mm_shuffle_epi32(out[1].v, _MM_SHUFFLE(1,0,3,2));
	out[2].v = _mm_shuffle_epi32(out[2].v, _MM_SHUFFLE(1,0,3,2));
	out[3].v = _mm_shuffle_epi32(out[3].v, _MM_SHUFFLE(1,0,3,2));
	out[4].v = _mm_shuffle_epi32(out[4].v, _MM_SHUFFLE(1,0,3,2));
	out[5].v = _mm_shuffle_epi32(out[5].v, _MM_SHUFFLE(1,0,3,2));
	out[6].v = _mm_shuffle_epi32(out[6].v, _MM_SHUFFLE(1,0,3,2));
	out[7].v = _mm_shuffle_epi32(out[7].v, _MM_SHUFFLE(1,0,3,2));
	out[8].v = _mm_shuffle_epi32(out[8].v, _MM_SHUFFLE(1,0,3,2));
	out[9].v = _mm_shuffle_epi32(out[9].v, _MM_SHUFFLE(1,0,3,2));
}

DONNA_INLINE static void
curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
	_mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
	_mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
	_mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v));
	_mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
	_mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
	_mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v));
}

DONNA_INLINE static void
curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
	xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
	xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
	xmmi c1,c2;

	out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
	out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
	r1_2 = _mm_slli_epi32(r[1].v, 1);
	out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2, s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
	out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
	r3_2 = _mm_slli_epi32(r[3].v, 1);
	out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
	out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
	r5_2 = _mm_slli_epi32(r[5].v, 1);
	out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
	out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v, s[0].v))))))));
	r7_2 = _mm_slli_epi32(r[7].v, 1);
	out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
	out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));

	r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
	r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
	r1_2 = _mm_slli_epi32(r1, 1);
	r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
	r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
	r3_2 = _mm_slli_epi32(r3, 1);
	r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
	r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
	r5_2 = _mm_slli_epi32(r5, 1);
	r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
	r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
	r7_2 = _mm_slli_epi32(r7, 1);
	r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
	r9_2 = _mm_slli_epi32(r9, 1);

	out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
	out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9, s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7, s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5, s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3, s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
	out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
	out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9, s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7, s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5, s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
	out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
	out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9, s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7, s[8].v), _mm_mul_epu32(r6, s[9].v)))));
	out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
	out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9, s[8].v), _mm_mul_epu32(r8, s[9].v)));
	out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));

	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
	c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
	c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
	c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
	c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
	c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
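
/*
 * The packed64 routines run the same 10-limb multiply/carry algorithm as
 * curve25519_mul, but each register holds one limb of two independent
 * field elements (one per 64-bit lane), so the two multiplications of a
 * ladder step proceed in lockstep for the price of one pass.
 */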

DONNA_INLINE static void
curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
	xmmi r0,r1,r2,r3;
	xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
	xmmi d5,d6,d7,d8,d9;
	xmmi c1,c2;

	r0 = r[0].v;
	r1 = r[1].v;
	r2 = r[2].v;
	r3 = r[3].v;

	out[0].v = _mm_mul_epu32(r0, r0);
	r0 = _mm_slli_epi32(r0, 1);
	out[1].v = _mm_mul_epu32(r0, r1);
	r1_2 = _mm_slli_epi32(r1, 1);
	out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2), _mm_mul_epu32(r1, r1_2));
	r1 = r1_2;
	out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3), _mm_mul_epu32(r1, r2));
	r3_2 = _mm_slli_epi32(r3, 1);
	out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2), _mm_mul_epu32(r2, r2)));
	r2 = _mm_slli_epi32(r2, 1);
	out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
	r5_2 = _mm_slli_epi32(r[5].v, 1);
	out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2))));
	r3 = r3_2;
	out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
	r7_2 = _mm_slli_epi32(r[7].v, 1);
	out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2), _mm_mul_epu32(r[4].v, r[4].v)))));
	out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2)))));

	d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
	d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
	d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
	d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
	d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);

	r4_2 = _mm_slli_epi32(r[4].v, 1);
	r6_2 = _mm_slli_epi32(r[6].v, 1);
	out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1), _mm_add_epi64(_mm_mul_epu32(d8, r2), _mm_add_epi64(_mm_mul_epu32(d7, r3), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
	out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2)))));
	out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2), _mm_mul_epu32(d6, r[6].v)))));
	out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
	out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
	out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v), _mm_mul_epu32(d8, r7_2)));
	out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2), _mm_mul_epu32(d8, r[8].v)));
	out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
	out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));

	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
	c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
	c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
	c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
	c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
	c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
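
/*
 * As with curve25519_square_times, only the upper triangle is formed and
 * cross terms are doubled along the way; d5..d9 carry the 19 and 38 = 2*19
 * factors for the terms that wrap past limb 9.
 */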

/* Take a little-endian, 32-byte number and expand it into polynomial form */
static void
curve25519_expand(bignum25519 out, const unsigned char in[32]) {
	uint32_t x0,x1,x2,x3,x4,x5,x6,x7;

	x0 = *(uint32_t *)(in + 0);
	x1 = *(uint32_t *)(in + 4);
	x2 = *(uint32_t *)(in + 8);
	x3 = *(uint32_t *)(in + 12);
	x4 = *(uint32_t *)(in + 16);
	x5 = *(uint32_t *)(in + 20);
	x6 = *(uint32_t *)(in + 24);
	x7 = *(uint32_t *)(in + 28);

	out[0] = ( x0 ) & 0x3ffffff;
	out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & 0x1ffffff;
	out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & 0x3ffffff;
	out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & 0x1ffffff;
	out[4] = (( x3) >> 6) & 0x3ffffff;
	out[5] = ( x4 ) & 0x1ffffff;
	out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & 0x3ffffff;
	out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & 0x1ffffff;
	out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & 0x3ffffff;
	out[9] = (( x7) >> 6) & 0x1ffffff;
	out[10] = 0;
	out[11] = 0;
}
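
/*
 * Limb i of the expanded form starts at bit offset 0, 26, 51, 77, 102,
 * 128, 153, 179, 204, 230 of the 255-bit value, alternating 26- and 25-bit
 * widths. The raw uint32_t loads above assume a little-endian target that
 * permits unaligned 32-bit loads, which holds for any SSE2-capable x86.
 */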

/* Take a fully reduced polynomial form number and contract it into a
 * little-endian, 32-byte array
 */
static void
curve25519_contract(unsigned char out[32], const bignum25519 in) {
	bignum25519 ALIGN(16) f;
	curve25519_copy(f, in);

	#define carry_pass() \
		f[1] += f[0] >> 26; f[0] &= 0x3ffffff; \
		f[2] += f[1] >> 25; f[1] &= 0x1ffffff; \
		f[3] += f[2] >> 26; f[2] &= 0x3ffffff; \
		f[4] += f[3] >> 25; f[3] &= 0x1ffffff; \
		f[5] += f[4] >> 26; f[4] &= 0x3ffffff; \
		f[6] += f[5] >> 25; f[5] &= 0x1ffffff; \
		f[7] += f[6] >> 26; f[6] &= 0x3ffffff; \
		f[8] += f[7] >> 25; f[7] &= 0x1ffffff; \
		f[9] += f[8] >> 26; f[8] &= 0x3ffffff;

	#define carry_pass_full() \
		carry_pass() \
		f[0] += 19 * (f[9] >> 25); f[9] &= 0x1ffffff;

	#define carry_pass_final() \
		carry_pass() \
		f[9] &= 0x1ffffff;

	carry_pass_full()
	carry_pass_full()

	/* now f is between 0 and 2^255-1, properly carried. */
	/* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
	f[0] += 19;
	carry_pass_full()

	/* now between 19 and 2^255-1 in both cases, and offset by 19. */
	f[0] += (1 << 26) - 19;
	f[1] += (1 << 25) - 1;
	f[2] += (1 << 26) - 1;
	f[3] += (1 << 25) - 1;
	f[4] += (1 << 26) - 1;
	f[5] += (1 << 25) - 1;
	f[6] += (1 << 26) - 1;
	f[7] += (1 << 25) - 1;
	f[8] += (1 << 26) - 1;
	f[9] += (1 << 25) - 1;

	/* now between 2^255 and 2^256-20, and offset by 2^255. */
	carry_pass_final()

	#undef carry_pass
	#undef carry_pass_full
	#undef carry_pass_final

	f[1] <<= 2;
	f[2] <<= 3;
	f[3] <<= 5;
	f[4] <<= 6;
	f[6] <<= 1;
	f[7] <<= 3;
	f[8] <<= 4;
	f[9] <<= 6;

	#define F(i, s) \
		out[s+0] |= (unsigned char)(f[i] & 0xff); \
		out[s+1] = (unsigned char)((f[i] >> 8) & 0xff); \
		out[s+2] = (unsigned char)((f[i] >> 16) & 0xff); \
		out[s+3] = (unsigned char)((f[i] >> 24) & 0xff);

	out[0] = 0;
	out[16] = 0;
	F(0,0);
	F(1,3);
	F(2,6);
	F(3,9);
	F(4,12);
	F(5,16);
	F(6,19);
	F(7,22);
	F(8,25);
	F(9,28);
	#undef F
}
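
/*
 * The final shifts align each limb with its byte position (limb i lands at
 * byte 3*i for i <= 4 and at byte 3*i + 1 afterwards), and F() then stores
 * 32 bits at a time; out[0] and out[16] are cleared first because those
 * are the two positions where F() ORs into a byte nothing has written yet.
 */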

/* if (iswap) swap(a, b) */
DONNA_INLINE static void
curve25519_swap_conditional(bignum25519 a, bignum25519 b, uint32_t iswap) {
	const uint32_t swap = (uint32_t)(-(int32_t)iswap);
	xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
	xmmi mask = _mm_cvtsi32_si128(swap);
	mask = _mm_shuffle_epi32(mask, 0);
	a0 = _mm_load_si128((xmmi *)a + 0);
	a1 = _mm_load_si128((xmmi *)a + 1);
	b0 = _mm_load_si128((xmmi *)b + 0);
	b1 = _mm_load_si128((xmmi *)b + 1);
	b0 = _mm_xor_si128(a0, b0);
	b1 = _mm_xor_si128(a1, b1);
	x0 = _mm_and_si128(b0, mask);
	x1 = _mm_and_si128(b1, mask);
	x0 = _mm_xor_si128(x0, a0);
	x1 = _mm_xor_si128(x1, a1);
	a0 = _mm_xor_si128(x0, b0);
	a1 = _mm_xor_si128(x1, b1);
	_mm_store_si128((xmmi *)a + 0, x0);
	_mm_store_si128((xmmi *)a + 1, x1);
	_mm_store_si128((xmmi *)b + 0, a0);
	_mm_store_si128((xmmi *)b + 1, a1);

	a2 = _mm_load_si128((xmmi *)a + 2);
	b2 = _mm_load_si128((xmmi *)b + 2);
	b2 = _mm_xor_si128(a2, b2);
	x2 = _mm_and_si128(b2, mask);
	x2 = _mm_xor_si128(x2, a2);
	a2 = _mm_xor_si128(x2, b2);
	_mm_store_si128((xmmi *)b + 2, a2);
	_mm_store_si128((xmmi *)a + 2, x2);
}
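
/*
 * Constant-time selection: mask is all-ones or all-zero, so computing
 * t = (a ^ b) & mask and xor-ing t back into both operands either swaps
 * a and b or leaves them untouched, with no branch on the secret bit.
 * curve25519_move_conditional_bytes below plays the same trick with
 * mask = flag - 1 to select whole 96-byte buffers.
 */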

/* out = (flag) ? in : out */
DONNA_INLINE static void
curve25519_move_conditional_bytes(uint8_t out[96], const uint8_t in[96], uint32_t flag) {
	xmmi a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5;
	const uint32_t nb = flag - 1;
	xmmi masknb = _mm_shuffle_epi32(_mm_cvtsi32_si128(nb),0);
	a0 = _mm_load_si128((xmmi *)in + 0);
	a1 = _mm_load_si128((xmmi *)in + 1);
	a2 = _mm_load_si128((xmmi *)in + 2);
	b0 = _mm_load_si128((xmmi *)out + 0);
	b1 = _mm_load_si128((xmmi *)out + 1);
	b2 = _mm_load_si128((xmmi *)out + 2);
	a0 = _mm_andnot_si128(masknb, a0);
	a1 = _mm_andnot_si128(masknb, a1);
	a2 = _mm_andnot_si128(masknb, a2);
	b0 = _mm_and_si128(masknb, b0);
	b1 = _mm_and_si128(masknb, b1);
	b2 = _mm_and_si128(masknb, b2);
	a0 = _mm_or_si128(a0, b0);
	a1 = _mm_or_si128(a1, b1);
	a2 = _mm_or_si128(a2, b2);
	_mm_store_si128((xmmi*)out + 0, a0);
	_mm_store_si128((xmmi*)out + 1, a1);
	_mm_store_si128((xmmi*)out + 2, a2);

	a3 = _mm_load_si128((xmmi *)in + 3);
	a4 = _mm_load_si128((xmmi *)in + 4);
	a5 = _mm_load_si128((xmmi *)in + 5);
	b3 = _mm_load_si128((xmmi *)out + 3);
	b4 = _mm_load_si128((xmmi *)out + 4);
	b5 = _mm_load_si128((xmmi *)out + 5);
	a3 = _mm_andnot_si128(masknb, a3);
	a4 = _mm_andnot_si128(masknb, a4);
	a5 = _mm_andnot_si128(masknb, a5);
	b3 = _mm_and_si128(masknb, b3);
	b4 = _mm_and_si128(masknb, b4);
	b5 = _mm_and_si128(masknb, b5);
	a3 = _mm_or_si128(a3, b3);
	a4 = _mm_or_si128(a4, b4);
	a5 = _mm_or_si128(a5, b5);
	_mm_store_si128((xmmi*)out + 3, a3);
	_mm_store_si128((xmmi*)out + 4, a4);
	_mm_store_si128((xmmi*)out + 5, a5);
}