tor

The Tor anonymity network
git clone https://git.dasho.dev/tor.git

curve25519-donna-sse2.h (58262B)


      1 /*
      2 Public domain by Andrew M. <liquidsun@gmail.com>
      3 See: https://github.com/floodyberry/curve25519-donna
      4 
      5 SSE2 curve25519 implementation
      6 */
      7 
      8 #include <emmintrin.h>
      9 typedef __m128i xmmi;
     10 
     11 typedef union packedelem8_t {
     12 unsigned char u[16];
     13 xmmi v;
     14 } packedelem8;
     15 
     16 typedef union packedelem32_t {
     17 uint32_t u[4];
     18 xmmi v;
     19 } packedelem32;
     20 
     21 typedef union packedelem64_t {
     22 uint64_t u[2];
     23 xmmi v;
     24 } packedelem64;
     25 
     26 /* 10 elements + an extra 2 to fit in 3 xmm registers */
     27 typedef uint32_t bignum25519[12];
     28 typedef packedelem32 packed32bignum25519[5];
     29 typedef packedelem64 packed64bignum25519[10];
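         /* A bignum25519 holds a field element mod 2^255 - 19 in radix 2^25.5:
            ten limbs alternating 26 and 25 bits, i.e.
              value = t[0] + t[1]*2^26 + t[2]*2^51 + t[3]*2^77 + t[4]*2^102
                    + t[5]*2^128 + t[6]*2^153 + t[7]*2^179 + t[8]*2^204 + t[9]*2^230
            with two zero padding words so an element fills three xmm registers. */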
     30 
     31 static const packedelem32 bot32bitmask = {{0xffffffff, 0x00000000, 0xffffffff, 0x00000000}};
     32 static const packedelem32 top64bitmask = {{0x00000000, 0x00000000, 0xffffffff, 0xffffffff}};
     33 static const packedelem32 bot64bitmask = {{0xffffffff, 0xffffffff, 0x00000000, 0x00000000}};
     34 
     35 /* reduction masks */
     36 static const packedelem64 packedmask26 = {{0x03ffffff, 0x03ffffff}};
     37 static const packedelem64 packedmask25 = {{0x01ffffff, 0x01ffffff}};
     38 static const packedelem32 packedmask26262626 = {{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff}};
     39 static const packedelem32 packedmask25252525 = {{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}};
     40 
     41 /* multipliers */
     42 static const packedelem64 packednineteen = {{19, 19}};
     43 static const packedelem64 packedthirtyeight = {{38, 38}};
     44 static const packedelem64 packed3819 = {{19*2,19}};
     45 
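         /* The constants below are 2p and 4p (p = 2^255 - 19) written limb-wise,
            e.g. 0x7ffffda = 2*(2^26 - 19) and 0x3fffffe = 2*(2^25 - 1).  Adding a
            multiple of p to the minuend before a subtraction keeps every limb
            difference non-negative without changing the value mod p. */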
     46 /* 2*(2^255 - 19) = 0 mod p */
     47 static const packedelem32 packed2p0 = {{0x7ffffda,0x3fffffe,0x7fffffe,0x3fffffe}};
     48 static const packedelem32 packed2p1 = {{0x7fffffe,0x3fffffe,0x7fffffe,0x3fffffe}};
     49 static const packedelem32 packed2p2 = {{0x7fffffe,0x3fffffe,0x0000000,0x0000000}};
     50 
     51 static const packedelem32 packed32packed2p0 = {{0x7ffffda,0x7ffffda,0x3fffffe,0x3fffffe}};
     52 static const packedelem32 packed32packed2p1 = {{0x7fffffe,0x7fffffe,0x3fffffe,0x3fffffe}};
     53 
     54 /* 4*(2^255 - 19) = 0 mod p */
     55 static const packedelem32 packed4p0 = {{0xfffffb4,0x7fffffc,0xffffffc,0x7fffffc}};
     56 static const packedelem32 packed4p1 = {{0xffffffc,0x7fffffc,0xffffffc,0x7fffffc}};
     57 static const packedelem32 packed4p2 = {{0xffffffc,0x7fffffc,0x0000000,0x0000000}};
     58 
     59 static const packedelem32 packed32packed4p0 = {{0xfffffb4,0xfffffb4,0x7fffffc,0x7fffffc}};
     60 static const packedelem32 packed32packed4p1 = {{0xffffffc,0xffffffc,0x7fffffc,0x7fffffc}};
     61 
     62 /* out = in */
     63 DONNA_INLINE static void
     64 curve25519_copy(bignum25519 out, const bignum25519 in) {
     65 xmmi x0,x1,x2;
     66 x0 = _mm_load_si128((xmmi*)in + 0);
     67 x1 = _mm_load_si128((xmmi*)in + 1);
     68 x2 = _mm_load_si128((xmmi*)in + 2);
     69 _mm_store_si128((xmmi*)out + 0, x0);
     70 _mm_store_si128((xmmi*)out + 1, x1);
     71 _mm_store_si128((xmmi*)out + 2, x2);
     72 }
     73 
     74 /* out = a + b */
     75 DONNA_INLINE static void
     76 curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
     77 xmmi a0,a1,a2,b0,b1,b2;
     78 a0 = _mm_load_si128((xmmi*)a + 0);
     79 a1 = _mm_load_si128((xmmi*)a + 1);
     80 a2 = _mm_load_si128((xmmi*)a + 2);
     81 b0 = _mm_load_si128((xmmi*)b + 0);
     82 b1 = _mm_load_si128((xmmi*)b + 1);
     83 b2 = _mm_load_si128((xmmi*)b + 2);
     84 a0 = _mm_add_epi32(a0, b0);
     85 a1 = _mm_add_epi32(a1, b1);
     86 a2 = _mm_add_epi32(a2, b2);
     87 _mm_store_si128((xmmi*)out + 0, a0);
     88 _mm_store_si128((xmmi*)out + 1, a1);
     89 _mm_store_si128((xmmi*)out + 2, a2);
     90 }
     91 
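         /* curve25519_add leaves the limbs un-reduced; the *_reduce and
            *_after_basic variants carry-propagate the result.  "after_basic"
            operands may themselves be un-reduced, which is why that subtraction
            variant below adds 4p rather than 2p before subtracting. */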
     92 #define curve25519_add_after_basic curve25519_add_reduce
     93 DONNA_INLINE static void
     94 curve25519_add_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
     95 xmmi a0,a1,a2,b0,b1,b2;
     96 xmmi c1,c2,c3;
     97 xmmi r0,r1,r2,r3,r4,r5;
     98 
     99 a0 = _mm_load_si128((xmmi*)a + 0);
    100 a1 = _mm_load_si128((xmmi*)a + 1);
    101 a2 = _mm_load_si128((xmmi*)a + 2);
    102 b0 = _mm_load_si128((xmmi*)b + 0);
    103 b1 = _mm_load_si128((xmmi*)b + 1);
    104 b2 = _mm_load_si128((xmmi*)b + 2);
    105 a0 = _mm_add_epi32(a0, b0);
    106 a1 = _mm_add_epi32(a1, b1);
    107 a2 = _mm_add_epi32(a2, b2);
    108 
    109 r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    110 r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    111 r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    112 r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    113 r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    114 r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    115 
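         /* r0..r5 now hold the limb pairs {0,4},{1,5},{2,6},{3,7},{-,8},{-,9} in
            64-bit lanes.  The chain below carries each limb into the next one;
            the carry out of limb 9 wraps into limb 0 multiplied by 19, since
            2^255 = 19 (mod 2^255 - 19). */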
    116 c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    117 c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    118 c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1); 
    119 c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    120 c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    121 
    122 _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    123 _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    124 _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
    125 }
    126 
    127 DONNA_INLINE static void
    128 curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    129 xmmi a0,a1,a2,b0,b1,b2;
    130 xmmi c1,c2;
    131 xmmi r0,r1;
    132 
    133 a0 = _mm_load_si128((xmmi*)a + 0);
    134 a1 = _mm_load_si128((xmmi*)a + 1);
    135 a2 = _mm_load_si128((xmmi*)a + 2);
    136 a0 = _mm_add_epi32(a0, packed2p0.v);
    137 a1 = _mm_add_epi32(a1, packed2p1.v);
    138 a2 = _mm_add_epi32(a2, packed2p2.v);
    139 b0 = _mm_load_si128((xmmi*)b + 0);
    140 b1 = _mm_load_si128((xmmi*)b + 1);
    141 b2 = _mm_load_si128((xmmi*)b + 2);
    142 a0 = _mm_sub_epi32(a0, b0);
    143 a1 = _mm_sub_epi32(a1, b1);
    144 a2 = _mm_sub_epi32(a2, b2);
    145 
    146 r0 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(2,2,0,0)), bot32bitmask.v);
    147 r1 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3,3,1,1)), bot32bitmask.v);
    148 
    149 c1 = _mm_srli_epi32(r0, 26); 
    150 c2 = _mm_srli_epi32(r1, 25); 
    151 r0 = _mm_and_si128(r0, packedmask26.v); 
    152 r1 = _mm_and_si128(r1, packedmask25.v); 
    153 r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
    154 r1 = _mm_add_epi32(r1, c1);
    155 
    156 a0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
    157 a1 = _mm_add_epi32(a1, _mm_srli_si128(c2, 8));
    158 
    159 _mm_store_si128((xmmi*)out + 0, a0);
    160 _mm_store_si128((xmmi*)out + 1, a1);
    161 _mm_store_si128((xmmi*)out + 2, a2);
    162 }
    163 
    164 DONNA_INLINE static void
    165 curve25519_sub_after_basic(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    166 xmmi a0,a1,a2,b0,b1,b2;
    167 xmmi c1,c2,c3;
    168 xmmi r0,r1,r2,r3,r4,r5;
    169 
    170 a0 = _mm_load_si128((xmmi*)a + 0);
    171 a1 = _mm_load_si128((xmmi*)a + 1);
    172 a2 = _mm_load_si128((xmmi*)a + 2);
    173 a0 = _mm_add_epi32(a0, packed4p0.v);
    174 a1 = _mm_add_epi32(a1, packed4p1.v);
    175 a2 = _mm_add_epi32(a2, packed4p2.v);
    176 b0 = _mm_load_si128((xmmi*)b + 0);
    177 b1 = _mm_load_si128((xmmi*)b + 1);
    178 b2 = _mm_load_si128((xmmi*)b + 2);
    179 a0 = _mm_sub_epi32(a0, b0);
    180 a1 = _mm_sub_epi32(a1, b1);
    181 a2 = _mm_sub_epi32(a2, b2);
    182 
    183 r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    184 r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    185 r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    186 r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    187 r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    188 r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    189 
    190 c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    191 c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    192 c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1); 
    193 c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    194 c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    195 
    196 _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    197 _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    198 _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
    199 }
    200 
    201 DONNA_INLINE static void
    202 curve25519_sub_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
    203 xmmi a0,a1,a2,b0,b1,b2;
    204 xmmi c1,c2,c3;
    205 xmmi r0,r1,r2,r3,r4,r5;
    206 
    207 a0 = _mm_load_si128((xmmi*)a + 0);
    208 a1 = _mm_load_si128((xmmi*)a + 1);
    209 a2 = _mm_load_si128((xmmi*)a + 2);
    210 a0 = _mm_add_epi32(a0, packed2p0.v);
    211 a1 = _mm_add_epi32(a1, packed2p1.v);
    212 a2 = _mm_add_epi32(a2, packed2p2.v);
    213 b0 = _mm_load_si128((xmmi*)b + 0);
    214 b1 = _mm_load_si128((xmmi*)b + 1);
    215 b2 = _mm_load_si128((xmmi*)b + 2);
    216 a0 = _mm_sub_epi32(a0, b0);
    217 a1 = _mm_sub_epi32(a1, b1);
    218 a2 = _mm_sub_epi32(a2, b2);
    219 
    220 r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    221 r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    222 r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    223 r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    224 r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    225 r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    226 
    227 c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    228 c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    229 c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1); 
    230 c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    231 c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    232 
    233 _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    234 _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    235 _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
    236 }
    237 
    238 
    239 DONNA_INLINE static void
    240 curve25519_neg(bignum25519 out, const bignum25519 b) {
    241 xmmi a0,a1,a2,b0,b1,b2;
    242 xmmi c1,c2,c3;
    243 xmmi r0,r1,r2,r3,r4,r5;
    244 
    245 a0 = packed2p0.v;
    246 a1 = packed2p1.v;
    247 a2 = packed2p2.v;
    248 b0 = _mm_load_si128((xmmi*)b + 0);
    249 b1 = _mm_load_si128((xmmi*)b + 1);
    250 b2 = _mm_load_si128((xmmi*)b + 2);
    251 a0 = _mm_sub_epi32(a0, b0);
    252 a1 = _mm_sub_epi32(a1, b1);
    253 a2 = _mm_sub_epi32(a2, b2);
    254 
    255 r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
    256 r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
    257 r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
    258 r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
    259 r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
    260 r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);
    261 
    262 c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    263 c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    264 c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1); 
    265 c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    266 c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    267 
    268 _mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
    269 _mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
    270 _mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
    271 }
    272 
    273 
     274 /* Multiply two numbers: out = r * s */
    275 static void 
    276 curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
    277 xmmi m01,m23,m45,m67,m89;
    278 xmmi m0123,m4567;
    279 xmmi s0123,s4567;
    280 xmmi s01,s23,s45,s67,s89;
    281 xmmi s12,s34,s56,s78,s9;
    282 xmmi r0,r2,r4,r6,r8;
    283 xmmi r1,r3,r5,r7,r9;
    284 xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
    285 xmmi c1,c2,c3;
    286 
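         /* Schoolbook multiply in the mixed 26/25-bit radix: limb k of the
            product sums r[i]*s[j] with i+j == k, terms with i+j >= 10 wrap
            around with a factor of 19 (2^255 = 19 mod p), and products of two
            odd-indexed limbs need an extra factor of 2; the "add the masked
            copy" steps on r1/r3/r5/r7/r9 below provide that doubling. */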
    287 s0123 = _mm_load_si128((xmmi*)s + 0);
    288 s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
    289 s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
    290 s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
    291 s4567 = _mm_load_si128((xmmi*)s + 1);
    292 s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
    293 s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
    294 s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
    295 s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
    296 s89 = _mm_load_si128((xmmi*)s + 2);
    297 s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
    298 s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
    299 s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));
    300 
    301 r0 = _mm_load_si128((xmmi*)r + 0);
    302 r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
    303 r1 = _mm_add_epi64(r1, _mm_and_si128(r1, top64bitmask.v));
    304 r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
    305 r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
    306 r3 = _mm_add_epi64(r3, _mm_and_si128(r3, top64bitmask.v));
    307 r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
    308 r4 = _mm_load_si128((xmmi*)r + 1);
    309 r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
    310 r5 = _mm_add_epi64(r5, _mm_and_si128(r5, top64bitmask.v));
    311 r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
    312 r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
    313 r7 = _mm_add_epi64(r7, _mm_and_si128(r7, top64bitmask.v));
    314 r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
    315 r8 = _mm_load_si128((xmmi*)r + 2);
    316 r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
    317 r9 = _mm_add_epi64(r9, _mm_and_si128(r9, top64bitmask.v));
    318 r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));
    319 
    320 m01 = _mm_mul_epu32(r1,s01);
    321 m23 = _mm_mul_epu32(r1,s23);
    322 m45 = _mm_mul_epu32(r1,s45);
    323 m67 = _mm_mul_epu32(r1,s67);
    324 m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
    325 m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
    326 m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
    327 m89 = _mm_mul_epu32(r1,s89);
    328 m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
    329 m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
    330 m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
    331 m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
    332 m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
    333 m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
    334 m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));
    335 
    336 /* shift up */
    337 m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
    338 m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
    339 m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
    340 m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
    341 m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));
    342 
    343 m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
    344 m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
    345 m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
    346 m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
    347 m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
    348 m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
    349 m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
    350 m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
    351 m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
    352 m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
    353 m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
    354 m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
    355 m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
    356 m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
    357 m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));
    358 
    359 r219 = _mm_mul_epu32(r2, packednineteen.v);
    360 r419 = _mm_mul_epu32(r4, packednineteen.v);
    361 r619 = _mm_mul_epu32(r6, packednineteen.v);
    362 r819 = _mm_mul_epu32(r8, packednineteen.v);
    363 r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
    364 r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
    365 r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
    366 r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
    367 r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);
    368 
    369 m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
    370 m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
    371 m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
    372 m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
    373 m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
    374 m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
    375 m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
    376 m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
    377 m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
    378 m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
    379 m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
    380 m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
    381 m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
    382 m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
    383 m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
    384 m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
    385 m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
    386 m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
    387 m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
    388 m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
    389 m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
    390 m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
    391 m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
    392 m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
    393 m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));
    394 
    395 r0 = _mm_unpacklo_epi64(m01, m45);
    396 r1 = _mm_unpackhi_epi64(m01, m45);
    397 r2 = _mm_unpacklo_epi64(m23, m67);
    398 r3 = _mm_unpackhi_epi64(m23, m67);
    399 r4 = _mm_unpacklo_epi64(m89, m89);
    400 r5 = _mm_unpackhi_epi64(m89, m89);
    401 
    402 c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    403 c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    404 c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1); 
    405 c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    406 c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    407 
    408 m0123 = _mm_unpacklo_epi32(r0, r1);
    409 m4567 = _mm_unpackhi_epi32(r0, r1);
    410 m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
    411 m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
    412 m89 = _mm_unpackhi_epi32(r4, r5);
    413 
    414 _mm_store_si128((xmmi*)out + 0, m0123);
    415 _mm_store_si128((xmmi*)out + 1, m4567);
    416 _mm_store_si128((xmmi*)out + 2, m89);
    417 }
    418 
    419 DONNA_NOINLINE static void
    420 curve25519_mul_noinline(bignum25519 out, const bignum25519 r, const bignum25519 s) {
    421 curve25519_mul(out, r, s);
    422 }
    423 
    424 #define curve25519_square(r, n) curve25519_square_times(r, n, 1)
    425 static void
    426 curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
    427 xmmi m01,m23,m45,m67,m89;
    428 xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
    429 xmmi r0a,r1a,r2a,r3a,r7a,r9a;
    430 xmmi r0123,r4567;
    431 xmmi r01,r23,r45,r67,r6x,r89,r8x;
    432 xmmi r12,r34,r56,r78,r9x;
    433 xmmi r5619;
    434 xmmi c1,c2,c3;
    435 
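         /* Squaring reuses the same structure: cross terms r[i]*r[j] (i != j)
            occur twice, and terms wrapping past limb 9 pick up the factor 19,
            hence the pre-multiplied 19, 38 = 2*19 and packed3819 copies built
            inside the loop. */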
    436 r0123 = _mm_load_si128((xmmi*)in + 0);
    437 r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
    438 r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
    439 r4567 = _mm_load_si128((xmmi*)in + 1);
    440 r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
    441 r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
    442 r89 = _mm_load_si128((xmmi*)in + 2);
    443 r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));
    444 
    445 do {
    446 	r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
    447 	r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
    448 	r0 = _mm_add_epi64(r0, _mm_and_si128(r0, top64bitmask.v));
    449 	r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
    450 	r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
    451 	r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
    452 	r2 = _mm_add_epi64(r2, _mm_and_si128(r2, top64bitmask.v));
    453 	r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
    454 	r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
    455 	r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
    456 	r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
    457 	r4 = _mm_add_epi64(r4, _mm_and_si128(r4, top64bitmask.v));
    458 	r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
    459 	r5619 = _mm_mul_epu32(r56, packednineteen.v);
    460 	r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
    461 	r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));		
    462 	r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
    463 	r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
    464 	r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
    465 	r7 = _mm_mul_epu32(r7, packed3819.v);
    466 	r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
    467 	r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
    468 	r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
    469 	r8 = _mm_mul_epu32(r8, packednineteen.v);
    470 	r9  = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
    471 	r9x  = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
    472 	r9 = _mm_mul_epu32(r9, packed3819.v);
    473 	r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));
    474 
    475 	m01 = _mm_mul_epu32(r01, r0);
    476 	m23 = _mm_mul_epu32(r23, r0a);
    477 	m45 = _mm_mul_epu32(r45, r0a);
    478 	m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
    479 	r23 = _mm_slli_epi32(r23, 1);
    480 	m67 = _mm_mul_epu32(r67, r0a);
    481 	m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
    482 	m89 = _mm_mul_epu32(r89, r0a);
    483 	m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
    484 	r67 = _mm_slli_epi32(r67, 1);
    485 	m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
    486 	r45 = _mm_slli_epi32(r45, 1);
    487 
    488 	r1 = _mm_slli_epi32(r1, 1);
    489 	r3 = _mm_slli_epi32(r3, 1);
    490 	r1a = _mm_add_epi64(r1, _mm_and_si128(r1, bot64bitmask.v));
    491 	r3a = _mm_add_epi64(r3, _mm_and_si128(r3, bot64bitmask.v));
    492 
    493 	m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
    494 	m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
    495 	m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
    496 	m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
    497 	r34 = _mm_slli_epi32(r34, 1);
    498 	m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
    499 	r78 = _mm_slli_epi32(r78, 1);
    500 	m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
    501 	r56 = _mm_slli_epi32(r56, 1);
    502 
    503 	m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
    504 	m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
    505 	m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
    506 	m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
    507 	m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
    508 	m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
    509 	m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
    510 	m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
    511 	m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
    512 	m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
    513 	m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
    514 	m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
    515 	m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));		
    516 	m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
    517 	m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));
    518 
    519 	r0 = _mm_unpacklo_epi64(m01, m45);
    520 	r1 = _mm_unpackhi_epi64(m01, m45);
    521 	r2 = _mm_unpacklo_epi64(m23, m67);
    522 	r3 = _mm_unpackhi_epi64(m23, m67);
    523 	r4 = _mm_unpacklo_epi64(m89, m89);
    524 	r5 = _mm_unpackhi_epi64(m89, m89);
    525 
    526 	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    527 	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
    528 	c1 = _mm_srli_epi64(r4, 26);                                                                      r4 = _mm_and_si128(r4, packedmask26.v);                             r5 = _mm_add_epi64(r5, c1); 
    529 	c1 = _mm_srli_epi64(r5, 25);                                                                      r5 = _mm_and_si128(r5, packedmask25.v);                             r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
    530 	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
    531 
    532 	r01 = _mm_unpacklo_epi64(r0, r1);
    533 	r45 = _mm_unpackhi_epi64(r0, r1);
    534 	r23 = _mm_unpacklo_epi64(r2, r3);
    535 	r67 = _mm_unpackhi_epi64(r2, r3);
    536 	r89 = _mm_unpackhi_epi64(r4, r5);
    537 } while (--count);
    538 
    539 r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
    540 r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
    541 r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
    542 r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
    543 r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));
    544 
    545 _mm_store_si128((xmmi*)r + 0, r0123);
    546 _mm_store_si128((xmmi*)r + 1, r4567);
    547 _mm_store_si128((xmmi*)r + 2, r89);
    548 }
    549 
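         /* The "tangled" packedelem32/packedelem64 forms interleave two field
            elements (e.g. the x and z coordinates of a Montgomery-ladder point)
            so the packed add/sub/mul/square helpers below process both elements
            at once within each xmm register. */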
    550 DONNA_INLINE static void
    551 curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
    552 xmmi x0,x1,x2,z0,z1,z2;
    553 
    554 x0 = _mm_load_si128((xmmi *)(x + 0));
    555 x1 = _mm_load_si128((xmmi *)(x + 4));
    556 x2 = _mm_load_si128((xmmi *)(x + 8));
    557 z0 = _mm_load_si128((xmmi *)(z + 0));
    558 z1 = _mm_load_si128((xmmi *)(z + 4));
    559 z2 = _mm_load_si128((xmmi *)(z + 8));
    560 
    561 out[0].v = _mm_unpacklo_epi32(x0, z0);
    562 out[1].v = _mm_unpackhi_epi32(x0, z0);
    563 out[2].v = _mm_unpacklo_epi32(x1, z1);
    564 out[3].v = _mm_unpackhi_epi32(x1, z1);
    565 out[4].v = _mm_unpacklo_epi32(x2, z2);
    566 }
    567 
    568 DONNA_INLINE static void
    569 curve25519_untangle32(bignum25519 x, bignum25519 z, const packedelem32 *in) {
    570 xmmi t0,t1,t2,t3,t4,zero;
    571 
    572 t0 = _mm_shuffle_epi32(in[0].v, _MM_SHUFFLE(3,1,2,0));
    573 t1 = _mm_shuffle_epi32(in[1].v, _MM_SHUFFLE(3,1,2,0));
    574 t2 = _mm_shuffle_epi32(in[2].v, _MM_SHUFFLE(3,1,2,0));
    575 t3 = _mm_shuffle_epi32(in[3].v, _MM_SHUFFLE(3,1,2,0));
    576 t4 = _mm_shuffle_epi32(in[4].v, _MM_SHUFFLE(3,1,2,0));
    577 zero = _mm_setzero_si128();
    578 _mm_store_si128((xmmi *)x + 0, _mm_unpacklo_epi64(t0, t1));
    579 _mm_store_si128((xmmi *)x + 1, _mm_unpacklo_epi64(t2, t3));
    580 _mm_store_si128((xmmi *)x + 2, _mm_unpacklo_epi64(t4, zero));
    581 _mm_store_si128((xmmi *)z + 0, _mm_unpackhi_epi64(t0, t1));
    582 _mm_store_si128((xmmi *)z + 1, _mm_unpackhi_epi64(t2, t3));
    583 _mm_store_si128((xmmi *)z + 2, _mm_unpackhi_epi64(t4, zero));
    584 }
    585 
    586 DONNA_INLINE static void
    587 curve25519_add_reduce_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    588 xmmi r0,r1,r2,r3,r4;
    589 xmmi s0,s1,s2,s3,s4,s5;
    590 xmmi c1,c2;
    591 
    592 r0 = _mm_add_epi32(r[0].v, s[0].v);
    593 r1 = _mm_add_epi32(r[1].v, s[1].v);
    594 r2 = _mm_add_epi32(r[2].v, s[2].v);
    595 r3 = _mm_add_epi32(r[3].v, s[3].v);
    596 r4 = _mm_add_epi32(r[4].v, s[4].v);
    597 
    598 s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
    599 s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
    600 s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
    601 s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
    602 s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4);  /* 00 88 */
    603 s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4);  /* 00 99 */
    604 
    605 c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    606 c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
    607 c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
    608 c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
    609 c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    610 
    611 out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
    612 out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
    613 out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
    614 out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
    615 out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
    616 }
    617 
    618 DONNA_INLINE static void
    619 curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    620 out[0].v = _mm_add_epi32(r[0].v, s[0].v);
    621 out[1].v = _mm_add_epi32(r[1].v, s[1].v);
    622 out[2].v = _mm_add_epi32(r[2].v, s[2].v);
    623 out[3].v = _mm_add_epi32(r[3].v, s[3].v);
    624 out[4].v = _mm_add_epi32(r[4].v, s[4].v);
    625 }
    626 
    627 DONNA_INLINE static void
    628 curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    629 xmmi r0,r1,r2,r3,r4;
    630 xmmi s0,s1,s2,s3;
    631 xmmi c1,c2;
    632 
    633 r0 = _mm_add_epi32(r[0].v, packed32packed2p0.v);
    634 r1 = _mm_add_epi32(r[1].v, packed32packed2p1.v);
    635 r2 = _mm_add_epi32(r[2].v, packed32packed2p1.v);
    636 r3 = _mm_add_epi32(r[3].v, packed32packed2p1.v);
    637 r4 = _mm_add_epi32(r[4].v, packed32packed2p1.v);
    638 r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
    639 r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
    640 r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
    641 r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
    642 r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */
    643 
    644 s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
    645 s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
    646 s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
    647 s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
    648 
    649 c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    650 c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0,  _mm_slli_si128(c2, 8));
    651 
    652 out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
    653 out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
    654 out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
    655 out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
    656 out[4].v = r4;
    657 }
    658 
    659 DONNA_INLINE static void
    660 curve25519_sub_after_basic_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
    661 xmmi r0,r1,r2,r3,r4;
    662 xmmi s0,s1,s2,s3,s4,s5;
    663 xmmi c1,c2;
    664 
    665 r0 = _mm_add_epi32(r[0].v, packed32packed4p0.v);
    666 r1 = _mm_add_epi32(r[1].v, packed32packed4p1.v);
    667 r2 = _mm_add_epi32(r[2].v, packed32packed4p1.v);
    668 r3 = _mm_add_epi32(r[3].v, packed32packed4p1.v);
    669 r4 = _mm_add_epi32(r[4].v, packed32packed4p1.v);
    670 r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
    671 r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
    672 r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
    673 r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
    674 r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */
    675 
    676 s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
    677 s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
    678 s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
    679 s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
    680 s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4);  /* 00 88 */
    681 s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4);  /* 00 99 */
    682 
    683 c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    684 c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
    685 c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
    686 c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
    687 c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
    688 
    689 out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
    690 out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
    691 out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
    692 out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
    693 out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
    694 }
    695 
    696 DONNA_INLINE static void
    697 curve25519_tangle64_from32(packedelem64 *a, packedelem64 *b, const packedelem32 *c, const packedelem32 *d) {
    698 xmmi c0,c1,c2,c3,c4,c5,t;
    699 xmmi d0,d1,d2,d3,d4,d5;
    700 xmmi t0,t1,t2,t3,t4,zero;
    701 
    702 t0 = _mm_shuffle_epi32(c[0].v, _MM_SHUFFLE(3,1,2,0));
    703 t1 = _mm_shuffle_epi32(c[1].v, _MM_SHUFFLE(3,1,2,0));
    704 t2 = _mm_shuffle_epi32(d[0].v, _MM_SHUFFLE(3,1,2,0));
    705 t3 = _mm_shuffle_epi32(d[1].v, _MM_SHUFFLE(3,1,2,0));
    706 c0 = _mm_unpacklo_epi64(t0, t1);
    707 c3 = _mm_unpackhi_epi64(t0, t1);
    708 d0 = _mm_unpacklo_epi64(t2, t3);
    709 d3 = _mm_unpackhi_epi64(t2, t3);
    710 t = _mm_unpacklo_epi64(c0, d0); a[0].v = t; a[1].v = _mm_srli_epi64(t, 32);
    711 t = _mm_unpackhi_epi64(c0, d0); a[2].v = t; a[3].v = _mm_srli_epi64(t, 32);
    712 t = _mm_unpacklo_epi64(c3, d3); b[0].v = t; b[1].v = _mm_srli_epi64(t, 32);
    713 t = _mm_unpackhi_epi64(c3, d3); b[2].v = t; b[3].v = _mm_srli_epi64(t, 32);
    714 
    715 t0 = _mm_shuffle_epi32(c[2].v, _MM_SHUFFLE(3,1,2,0));
    716 t1 = _mm_shuffle_epi32(c[3].v, _MM_SHUFFLE(3,1,2,0));
    717 t2 = _mm_shuffle_epi32(d[2].v, _MM_SHUFFLE(3,1,2,0));
    718 t3 = _mm_shuffle_epi32(d[3].v, _MM_SHUFFLE(3,1,2,0));
    719 c1 = _mm_unpacklo_epi64(t0, t1);
    720 c4 = _mm_unpackhi_epi64(t0, t1);
    721 d1 = _mm_unpacklo_epi64(t2, t3);
    722 d4 = _mm_unpackhi_epi64(t2, t3);
    723 t = _mm_unpacklo_epi64(c1, d1); a[4].v = t; a[5].v = _mm_srli_epi64(t, 32);
    724 t = _mm_unpackhi_epi64(c1, d1); a[6].v = t; a[7].v = _mm_srli_epi64(t, 32);
    725 t = _mm_unpacklo_epi64(c4, d4); b[4].v = t; b[5].v = _mm_srli_epi64(t, 32);
    726 t = _mm_unpackhi_epi64(c4, d4); b[6].v = t; b[7].v = _mm_srli_epi64(t, 32);
    727 
    728 t4 = _mm_shuffle_epi32(c[4].v, _MM_SHUFFLE(3,1,2,0));
    729 zero = _mm_setzero_si128();
    730 c2 = _mm_unpacklo_epi64(t4, zero);
    731 c5 = _mm_unpackhi_epi64(t4, zero);
    732 t4 = _mm_shuffle_epi32(d[4].v, _MM_SHUFFLE(3,1,2,0));
    733 d2 = _mm_unpacklo_epi64(t4, zero);
    734 d5 = _mm_unpackhi_epi64(t4, zero);
    735 t = _mm_unpacklo_epi64(c2, d2); a[8].v = t; a[9].v = _mm_srli_epi64(t, 32);
    736 t = _mm_unpacklo_epi64(c5, d5); b[8].v = t; b[9].v = _mm_srli_epi64(t, 32);
    737 }
    738 
    739 DONNA_INLINE static void
    740 curve25519_tangle64(packedelem64 *out, const bignum25519 x, const bignum25519 z) {
    741 xmmi x0,x1,x2,z0,z1,z2,t;
    742 
    743 x0 = _mm_load_si128((xmmi *)x + 0);
    744 x1 = _mm_load_si128((xmmi *)x + 1);
    745 x2 = _mm_load_si128((xmmi *)x + 2);
    746 z0 = _mm_load_si128((xmmi *)z + 0);
    747 z1 = _mm_load_si128((xmmi *)z + 1);
    748 z2 = _mm_load_si128((xmmi *)z + 2);
    749 
    750 t = _mm_unpacklo_epi64(x0, z0);	out[0].v = t; out[1].v = _mm_srli_epi64(t, 32);
    751 t = _mm_unpackhi_epi64(x0, z0);	out[2].v = t; out[3].v = _mm_srli_epi64(t, 32);
    752 t = _mm_unpacklo_epi64(x1, z1);	out[4].v = t; out[5].v = _mm_srli_epi64(t, 32);
    753 t = _mm_unpackhi_epi64(x1, z1);	out[6].v = t; out[7].v = _mm_srli_epi64(t, 32);
    754 t = _mm_unpacklo_epi64(x2, z2);	out[8].v = t; out[9].v = _mm_srli_epi64(t, 32);
    755 }
    756 
    757 DONNA_INLINE static void
    758 curve25519_tangleone64(packedelem64 *out, const bignum25519 x) {
    759 xmmi x0,x1,x2;
    760 
    761 x0 = _mm_load_si128((xmmi *)(x + 0));
    762 x1 = _mm_load_si128((xmmi *)(x + 4));
    763 x2 = _mm_load_si128((xmmi *)(x + 8));
    764 
    765 out[0].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(0,0,0,0));
    766 out[1].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(1,1,1,1));
    767 out[2].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(2,2,2,2));
    768 out[3].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(3,3,3,3));
    769 out[4].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0,0,0,0));
    770 out[5].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,1,1,1));
    771 out[6].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2,2,2,2));
    772 out[7].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(3,3,3,3));
    773 out[8].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0,0,0,0));
    774 out[9].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1,1,1,1));
    775 }
    776 
    777 DONNA_INLINE static void
    778 curve25519_swap64(packedelem64 *out) {
    779 out[0].v = _mm_shuffle_epi32(out[0].v, _MM_SHUFFLE(1,0,3,2));
    780 out[1].v = _mm_shuffle_epi32(out[1].v, _MM_SHUFFLE(1,0,3,2));
    781 out[2].v = _mm_shuffle_epi32(out[2].v, _MM_SHUFFLE(1,0,3,2));
    782 out[3].v = _mm_shuffle_epi32(out[3].v, _MM_SHUFFLE(1,0,3,2));
    783 out[4].v = _mm_shuffle_epi32(out[4].v, _MM_SHUFFLE(1,0,3,2));
    784 out[5].v = _mm_shuffle_epi32(out[5].v, _MM_SHUFFLE(1,0,3,2));
    785 out[6].v = _mm_shuffle_epi32(out[6].v, _MM_SHUFFLE(1,0,3,2));
    786 out[7].v = _mm_shuffle_epi32(out[7].v, _MM_SHUFFLE(1,0,3,2));
    787 out[8].v = _mm_shuffle_epi32(out[8].v, _MM_SHUFFLE(1,0,3,2));
    788 out[9].v = _mm_shuffle_epi32(out[9].v, _MM_SHUFFLE(1,0,3,2));
    789 }
    790 
    791 DONNA_INLINE static void
    792 curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
    793 _mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
    794 _mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
    795 _mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v)                                                          );
    796 _mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
    797 _mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
    798 _mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v)                                                          );
    799 }
    800 
    801 DONNA_INLINE static void
    802 curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
    803 xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
    804 xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
    805 xmmi c1,c2;
    806 
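         /* Same schoolbook multiply as curve25519_mul, but on the tangled form:
            each 64-bit lane carries one of the two interleaved elements, so one
            pass yields two field products.  The r*_2 values supply the factor
            of 2 on products of two odd-indexed limbs. */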
    807 out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
    808 out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
    809 r1_2 = _mm_slli_epi32(r[1].v, 1);
    810 out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
    811 out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
    812 r3_2 = _mm_slli_epi32(r[3].v, 1);
    813 out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2  , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
    814 out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
    815 r5_2 = _mm_slli_epi32(r[5].v, 1);
    816 out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2  , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2  , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
    817 out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v  , s[0].v))))))));
    818 r7_2 = _mm_slli_epi32(r[7].v, 1);
    819 out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2  , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2  , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2  , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2  , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
    820 out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));
    821 
    822 r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
    823 r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
    824 r1_2 = _mm_slli_epi32(r1, 1);
    825 r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
    826 r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
    827 r3_2 = _mm_slli_epi32(r3, 1);
    828 r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
    829 r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
    830 r5_2 = _mm_slli_epi32(r5, 1);
    831 r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
    832 r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
    833 r7_2 = _mm_slli_epi32(r7, 1);
    834 r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
    835 r9_2 = _mm_slli_epi32(r9, 1);
    836 
    837 out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
    838 out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7  , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5  , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3  , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
    839 out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
    840 out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7  , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5  , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
    841 out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
    842 out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7  , s[8].v), _mm_mul_epu32(r6, s[9].v)))));
    843 out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
    844 out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9  , s[8].v), _mm_mul_epu32(r8, s[9].v)));
    845 out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));
    846 
    847 c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    848 c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    849 c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    850 c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
    851                                    c2 = _mm_srli_epi64(out[8].v, 26);                                                     out[8].v = _mm_and_si128(out[8].v, packedmask26.v);                                         out[9].v = _mm_add_epi64(out[9].v, c2);
    852                                    c2 = _mm_srli_epi64(out[9].v, 25);                                                     out[9].v = _mm_and_si128(out[9].v, packedmask25.v);                                         out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    853 c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    854 }
    855 
    856 DONNA_INLINE static void
    857 curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
    858 xmmi r0,r1,r2,r3;
    859 xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
    860 xmmi d5,d6,d7,d8,d9;
    861 xmmi c1,c2;
    862 
    863 r0 = r[0].v;
    864 r1 = r[1].v;
    865 r2 = r[2].v;
    866 r3 = r[3].v;
    867 
    868 out[0].v = _mm_mul_epu32(r0, r0);
    869 r0 = _mm_slli_epi32(r0, 1);
    870 out[1].v = _mm_mul_epu32(r0, r1);
    871 r1_2 = _mm_slli_epi32(r1, 1);
    872 out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2    ), _mm_mul_epu32(r1, r1_2));
    873 r1 = r1_2;
    874 out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3    ), _mm_mul_epu32(r1, r2  ));
    875 r3_2 = _mm_slli_epi32(r3, 1);
    876 out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2  ), _mm_mul_epu32(r2, r2)));
    877 r2 = _mm_slli_epi32(r2, 1);
    878 out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
    879 r5_2 = _mm_slli_epi32(r[5].v, 1);
    880 out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2  ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2  ))));
    881 r3 = r3_2;
    882 out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
    883 r7_2 = _mm_slli_epi32(r[7].v, 1);
    884 out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2  ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2  ), _mm_mul_epu32(r[4].v, r[4].v)))));
    885 out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2  )))));
    886 
    887 d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
    888 d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
    889 d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
    890 d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
    891 d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);
    892 
    893 r4_2 = _mm_slli_epi32(r[4].v, 1);
    894 r6_2 = _mm_slli_epi32(r[6].v, 1);
    895 out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1                   ), _mm_add_epi64(_mm_mul_epu32(d8, r2  ), _mm_add_epi64(_mm_mul_epu32(d7, r3    ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
    896 out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3  ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2  )))));
    897 out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3                   ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2  ), _mm_mul_epu32(d6, r[6].v)))));
    898 out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v               ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
    899 out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2                 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
    900 out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v               ), _mm_mul_epu32(d8, r7_2  )));
    901 out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2                 ), _mm_mul_epu32(d8, r[8].v)));
    902 out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
    903 out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));
    904 
    905 c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    906 c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    907 c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    908 c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
    909                                    c2 = _mm_srli_epi64(out[8].v, 26);                                                     out[8].v = _mm_and_si128(out[8].v, packedmask26.v);                                         out[9].v = _mm_add_epi64(out[9].v, c2);
    910                                    c2 = _mm_srli_epi64(out[9].v, 25);                                                     out[9].v = _mm_and_si128(out[9].v, packedmask25.v);                                         out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    911 c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    912 }
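
        /*
        Note on the multipliers above: lane 0 and lane 1 of each packedelem64
        hold limb i of two field elements, so one call squares two elements
        at once; _mm_mul_epu32 forms the 64-bit limb products for both lanes
        in parallel. Product terms of degree 10 and higher are folded back
        using 2^255 = 19 mod p: d6 and d8 (even limbs) carry a plain factor
        of 19, while d5, d7 and d9 (odd limbs) also absorb the extra power of
        two that the mixed 26/25-bit radix attaches to odd-times-odd
        products, giving 19*2 = 38. The usual cross-term doublings of a
        square come from the pre-doubled registers (r0 << 1, r1_2, r3_2,
        r5_2, r7_2, ...). For example the degree-10 term r5*r5 lands in
        out[0] as d5 * r[5] = (38 * r5) * r5.
        */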
    913 
    914 
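        /*
        Representation used by expand/contract below: a field element a is
        stored as ten limbs in radix 2^25.5,

          a = a[0] + a[1]*2^26  + a[2]*2^51  + a[3]*2^77  + a[4]*2^102
            + a[5]*2^128 + a[6]*2^153 + a[7]*2^179 + a[8]*2^204 + a[9]*2^230

        with even limbs holding up to 26 bits and odd limbs up to 25 bits;
        limbs 10 and 11 are only padding so the element fills three xmm
        registers. The uint32_t loads in curve25519_expand assume a
        little-endian target, which is safe here because SSE2 implies x86.
        */
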
    915 /* Take a little-endian, 32-byte number and expand it into polynomial form */
    916 static void
    917 curve25519_expand(bignum25519 out, const unsigned char in[32]) {
    918 uint32_t x0,x1,x2,x3,x4,x5,x6,x7;
    919 
    920 x0 = *(uint32_t *)(in + 0);
    921 x1 = *(uint32_t *)(in + 4);
    922 x2 = *(uint32_t *)(in + 8);
    923 x3 = *(uint32_t *)(in + 12);
    924 x4 = *(uint32_t *)(in + 16);
    925 x5 = *(uint32_t *)(in + 20);
    926 x6 = *(uint32_t *)(in + 24);
    927 x7 = *(uint32_t *)(in + 28);
    928 
    929 out[0] = (                        x0       ) & 0x3ffffff;
    930 out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & 0x1ffffff;
    931 out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & 0x3ffffff;
    932 out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & 0x1ffffff;
    933 out[4] = ((                       x3) >>  6) & 0x3ffffff;
    934 out[5] = (                        x4       ) & 0x1ffffff;
    935 out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & 0x3ffffff;
    936 out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & 0x1ffffff;
    937 out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & 0x3ffffff;
    938 out[9] = ((                       x7) >>  6) & 0x1ffffff;
    939 out[10] = 0;
    940 out[11] = 0;
    941 }
    942 
    943 /* Take a fully reduced polynomial form number and contract it into a
    944 * little-endian, 32-byte array
    945 */
    946 static void
    947 curve25519_contract(unsigned char out[32], const bignum25519 in) {
    948 bignum25519 ALIGN(16) f;
    949 curve25519_copy(f, in);
    950 
    951 #define carry_pass() \
    952 	f[1] += f[0] >> 26; f[0] &= 0x3ffffff; \
    953 	f[2] += f[1] >> 25; f[1] &= 0x1ffffff; \
    954 	f[3] += f[2] >> 26; f[2] &= 0x3ffffff; \
    955 	f[4] += f[3] >> 25; f[3] &= 0x1ffffff; \
    956 	f[5] += f[4] >> 26; f[4] &= 0x3ffffff; \
    957 	f[6] += f[5] >> 25; f[5] &= 0x1ffffff; \
    958 	f[7] += f[6] >> 26; f[6] &= 0x3ffffff; \
    959 	f[8] += f[7] >> 25; f[7] &= 0x1ffffff; \
    960 	f[9] += f[8] >> 26; f[8] &= 0x3ffffff;
    961 
    962 #define carry_pass_full() \
    963 	carry_pass() \
    964 	f[0] += 19 * (f[9] >> 25); f[9] &= 0x1ffffff;
    965 
    966 #define carry_pass_final() \
    967 	carry_pass() \
    968 	f[9] &= 0x1ffffff;
    969 
    970 carry_pass_full()
    971 carry_pass_full()
    972 
    973 /* now f is between 0 and 2^255-1, properly carried. */
    974 /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
    975 f[0] += 19;
    976 carry_pass_full()
    977 
    978 /* now between 19 and 2^255-1 in both cases, and offset by 19. */
    979 f[0] += (1 << 26) - 19;
    980 f[1] += (1 << 25) - 1;
    981 f[2] += (1 << 26) - 1;
    982 f[3] += (1 << 25) - 1;
    983 f[4] += (1 << 26) - 1;
    984 f[5] += (1 << 25) - 1;
    985 f[6] += (1 << 26) - 1;
    986 f[7] += (1 << 25) - 1;
    987 f[8] += (1 << 26) - 1;
    988 f[9] += (1 << 25) - 1;
    989 
    990 /* now between 2^255 and 2^256-20, and offset by 2^255. */
    991 carry_pass_final()
    992 
    993 #undef carry_pass
    994 #undef carry_pass_full
    995 #undef carry_pass_final
    996 
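        /*
        Shift each limb so that its least significant bit lines up with the
        byte offset it is written to below: limb i begins at bit
        26*((i+1)/2) + 25*(i/2) of the 255-bit value, and F(i, s) stores it
        at byte s, so the shift is (start bit of limb i) - 8*s. For example
        f[1] starts at bit 26 and is stored at byte 3 (bit 24), hence the
        shift by 2; f[5] starts exactly at bit 128 (byte 16) and needs no
        shift. Adjacent limbs overlap in their boundary byte, which is why F
        ORs into its first byte and why out[0] and out[16] are cleared
        before the first limb of each half is written.
        */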
    997 f[1] <<= 2;
    998 f[2] <<= 3;
    999 f[3] <<= 5;
   1000 f[4] <<= 6;
   1001 f[6] <<= 1;
   1002 f[7] <<= 3;
   1003 f[8] <<= 4;
   1004 f[9] <<= 6;
   1005 
   1006 #define F(i, s) \
   1007 	out[s+0] |= (unsigned char )(f[i] & 0xff); \
   1008 	out[s+1] = (unsigned char )((f[i] >> 8) & 0xff); \
   1009 	out[s+2] = (unsigned char )((f[i] >> 16) & 0xff); \
   1010 	out[s+3] = (unsigned char )((f[i] >> 24) & 0xff);
   1011 
   1012 out[0] = 0;
   1013 out[16] = 0;
   1014 F(0,0);
   1015 F(1,3);
   1016 F(2,6);
   1017 F(3,9);
   1018 F(4,12);
   1019 F(5,16);
   1020 F(6,19);
   1021 F(7,22);
   1022 F(8,25);
   1023 F(9,28);
   1024 #undef F
   1025 }
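
        /*
        Typical round trip through this pair of routines (a hedged usage
        sketch; the field arithmetic in the middle stands for whatever the
        caller does and is not part of this file). The SSE2 loads and stores
        require 16-byte-aligned bignum25519 storage, hence the ALIGN(16):

          unsigned char in[32], out[32];
          bignum25519 ALIGN(16) x;
          curve25519_expand(x, in);
          ...field arithmetic on x, ending with a properly carried value...
          curve25519_contract(out, x);

        The two carry_pass_full() passes and the +19 / offset-by-2^255 trick
        above then emit the unique little-endian encoding below 2^255 - 19.
        */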
   1026 
   1027 /* if (iswap) swap(a, b) */
   1028 DONNA_INLINE static void
   1029 curve25519_swap_conditional(bignum25519 a, bignum25519 b, uint32_t iswap) {
   1030 const uint32_t swap = (uint32_t)(-(int32_t)iswap);
   1031 xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
   1032 xmmi mask = _mm_cvtsi32_si128(swap);
   1033 mask = _mm_shuffle_epi32(mask, 0);
   1034 a0 = _mm_load_si128((xmmi *)a + 0);
   1035 a1 = _mm_load_si128((xmmi *)a + 1);
   1036 b0 = _mm_load_si128((xmmi *)b + 0);
   1037 b1 = _mm_load_si128((xmmi *)b + 1);
   1038 b0 = _mm_xor_si128(a0, b0);
   1039 b1 = _mm_xor_si128(a1, b1);
   1040 x0 = _mm_and_si128(b0, mask);
   1041 x1 = _mm_and_si128(b1, mask);
   1042 x0 = _mm_xor_si128(x0, a0);
   1043 x1 = _mm_xor_si128(x1, a1);
   1044 a0 = _mm_xor_si128(x0, b0);
   1045 a1 = _mm_xor_si128(x1, b1);
   1046 _mm_store_si128((xmmi *)a + 0, x0);
   1047 _mm_store_si128((xmmi *)a + 1, x1);
   1048 _mm_store_si128((xmmi *)b + 0, a0);
   1049 _mm_store_si128((xmmi *)b + 1, a1);
   1050 
   1051 a2 = _mm_load_si128((xmmi *)a + 2);
   1052 b2 = _mm_load_si128((xmmi *)b + 2);
   1053 b2 = _mm_xor_si128(a2, b2);
   1054 x2 = _mm_and_si128(b2, mask);
   1055 x2 = _mm_xor_si128(x2, a2);
   1056 a2 = _mm_xor_si128(x2, b2);
   1057 _mm_store_si128((xmmi *)b + 2, a2);
   1058 _mm_store_si128((xmmi *)a + 2, x2);
   1059 }
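
        /*
        The same constant-time swap written on single words (an illustrative
        sketch; the name and plain-word operands are hypothetical). The mask
        is all-ones when iswap is 1 and all-zeroes when it is 0, so both
        cases execute exactly the same instructions and iswap never
        influences a branch or a memory address; the SSE2 routine above
        applies the identical mask across all twelve limbs of each element.
        */
        DONNA_INLINE static void
        curve25519_swap_conditional_word_sketch(uint32_t *a, uint32_t *b, uint32_t iswap) {
        const uint32_t mask = (uint32_t)(-(int32_t)iswap);
        uint32_t t = (*a ^ *b) & mask;
        *a ^= t;
        *b ^= t;
        }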
   1060 
   1061 /* out = (flag) ? in : out */
   1062 DONNA_INLINE static void
   1063 curve25519_move_conditional_bytes(uint8_t out[96], const uint8_t in[96], uint32_t flag) {
   1064 xmmi a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5;
   1065 const uint32_t nb = flag - 1;
   1066 xmmi masknb = _mm_shuffle_epi32(_mm_cvtsi32_si128(nb),0);
   1067 a0 = _mm_load_si128((xmmi *)in + 0);
   1068 a1 = _mm_load_si128((xmmi *)in + 1);
   1069 a2 = _mm_load_si128((xmmi *)in + 2);
   1070 b0 = _mm_load_si128((xmmi *)out + 0);
   1071 b1 = _mm_load_si128((xmmi *)out + 1);
   1072 b2 = _mm_load_si128((xmmi *)out + 2);
   1073 a0 = _mm_andnot_si128(masknb, a0);
   1074 a1 = _mm_andnot_si128(masknb, a1);
   1075 a2 = _mm_andnot_si128(masknb, a2);
   1076 b0 = _mm_and_si128(masknb, b0);
   1077 b1 = _mm_and_si128(masknb, b1);
   1078 b2 = _mm_and_si128(masknb, b2);
   1079 a0 = _mm_or_si128(a0, b0);
   1080 a1 = _mm_or_si128(a1, b1);
   1081 a2 = _mm_or_si128(a2, b2);
   1082 _mm_store_si128((xmmi*)out + 0, a0);
   1083 _mm_store_si128((xmmi*)out + 1, a1);
   1084 _mm_store_si128((xmmi*)out + 2, a2);
   1085 
   1086 a3 = _mm_load_si128((xmmi *)in + 3);
   1087 a4 = _mm_load_si128((xmmi *)in + 4);
   1088 a5 = _mm_load_si128((xmmi *)in + 5);
   1089 b3 = _mm_load_si128((xmmi *)out + 3);
   1090 b4 = _mm_load_si128((xmmi *)out + 4);
   1091 b5 = _mm_load_si128((xmmi *)out + 5);
   1092 a3 = _mm_andnot_si128(masknb, a3);
   1093 a4 = _mm_andnot_si128(masknb, a4);
   1094 a5 = _mm_andnot_si128(masknb, a5);
   1095 b3 = _mm_and_si128(masknb, b3);
   1096 b4 = _mm_and_si128(masknb, b4);
   1097 b5 = _mm_and_si128(masknb, b5);
   1098 a3 = _mm_or_si128(a3, b3);
   1099 a4 = _mm_or_si128(a4, b4);
   1100 a5 = _mm_or_si128(a5, b5);
   1101 _mm_store_si128((xmmi*)out + 3, a3);
   1102 _mm_store_si128((xmmi*)out + 4, a4);
   1103 _mm_store_si128((xmmi*)out + 5, a5);
   1104 }
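
        /*
        The word-level idea behind the masked select above (an illustrative
        sketch; the name is hypothetical, and flag must be exactly 0 or 1,
        matching how the SSE2 routine derives its mask from flag - 1):
        flag == 1 takes the word from in, flag == 0 keeps the word already
        in out.
        */
        DONNA_INLINE static uint32_t
        curve25519_select_word_sketch(uint32_t out_word, uint32_t in_word, uint32_t flag) {
        const uint32_t masknb = flag - 1; /* 0 when flag == 1, all-ones when flag == 0 */
        return (in_word & ~masknb) | (out_word & masknb);
        }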