tor

The Tor anonymity network
git clone https://git.dasho.dev/tor.git
Log | Files | Refs | README | LICENSE

fe_mul.c (10942B)


      1 #include "fe.h"
      2 #include "crypto_int64.h"
      3 
      4 /*
      5 h = f * g
      6 Can overlap h with f or g.
      7 
      8 Preconditions:
      9   |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
     10   |g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
     11 
     12 Postconditions:
     13   |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
     14 */
     15 
     16 /*
     17 Notes on implementation strategy:
     18 
     19 Using schoolbook multiplication.
     20 Karatsuba would save a little in some cost models.
     21 
     22 Most multiplications by 2 and 19 are 32-bit precomputations;
     23 cheaper than 64-bit postcomputations.
     24 
     25 There is one remaining multiplication by 19 in the carry chain;
     26 one *19 precomputation can be merged into this,
     27 but the resulting data flow is considerably less clean.
     28 
     29 There are 12 carries below.
     30 10 of them are 2-way parallelizable and vectorizable.
     31 Can get away with 11 carries, but then data flow is much deeper.
     32 
     33 With tighter constraints on inputs can squeeze carries into int32.
     34 */
     35 
     36 void fe_mul(fe h,const fe f,const fe g)
     37 {
     38  crypto_int32 f0 = f[0];
     39  crypto_int32 f1 = f[1];
     40  crypto_int32 f2 = f[2];
     41  crypto_int32 f3 = f[3];
     42  crypto_int32 f4 = f[4];
     43  crypto_int32 f5 = f[5];
     44  crypto_int32 f6 = f[6];
     45  crypto_int32 f7 = f[7];
     46  crypto_int32 f8 = f[8];
     47  crypto_int32 f9 = f[9];
     48  crypto_int32 g0 = g[0];
     49  crypto_int32 g1 = g[1];
     50  crypto_int32 g2 = g[2];
     51  crypto_int32 g3 = g[3];
     52  crypto_int32 g4 = g[4];
     53  crypto_int32 g5 = g[5];
     54  crypto_int32 g6 = g[6];
     55  crypto_int32 g7 = g[7];
     56  crypto_int32 g8 = g[8];
     57  crypto_int32 g9 = g[9];
     58  crypto_int32 g1_19 = 19 * g1; /* 1.959375*2^29 */
     59  crypto_int32 g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
     60  crypto_int32 g3_19 = 19 * g3;
     61  crypto_int32 g4_19 = 19 * g4;
     62  crypto_int32 g5_19 = 19 * g5;
     63  crypto_int32 g6_19 = 19 * g6;
     64  crypto_int32 g7_19 = 19 * g7;
     65  crypto_int32 g8_19 = 19 * g8;
     66  crypto_int32 g9_19 = 19 * g9;
     67  crypto_int32 f1_2 = 2 * f1;
     68  crypto_int32 f3_2 = 2 * f3;
     69  crypto_int32 f5_2 = 2 * f5;
     70  crypto_int32 f7_2 = 2 * f7;
     71  crypto_int32 f9_2 = 2 * f9;
     72  crypto_int64 f0g0    = f0   * (crypto_int64) g0;
     73  crypto_int64 f0g1    = f0   * (crypto_int64) g1;
     74  crypto_int64 f0g2    = f0   * (crypto_int64) g2;
     75  crypto_int64 f0g3    = f0   * (crypto_int64) g3;
     76  crypto_int64 f0g4    = f0   * (crypto_int64) g4;
     77  crypto_int64 f0g5    = f0   * (crypto_int64) g5;
     78  crypto_int64 f0g6    = f0   * (crypto_int64) g6;
     79  crypto_int64 f0g7    = f0   * (crypto_int64) g7;
     80  crypto_int64 f0g8    = f0   * (crypto_int64) g8;
     81  crypto_int64 f0g9    = f0   * (crypto_int64) g9;
     82  crypto_int64 f1g0    = f1   * (crypto_int64) g0;
     83  crypto_int64 f1g1_2  = f1_2 * (crypto_int64) g1;
     84  crypto_int64 f1g2    = f1   * (crypto_int64) g2;
     85  crypto_int64 f1g3_2  = f1_2 * (crypto_int64) g3;
     86  crypto_int64 f1g4    = f1   * (crypto_int64) g4;
     87  crypto_int64 f1g5_2  = f1_2 * (crypto_int64) g5;
     88  crypto_int64 f1g6    = f1   * (crypto_int64) g6;
     89  crypto_int64 f1g7_2  = f1_2 * (crypto_int64) g7;
     90  crypto_int64 f1g8    = f1   * (crypto_int64) g8;
     91  crypto_int64 f1g9_38 = f1_2 * (crypto_int64) g9_19;
     92  crypto_int64 f2g0    = f2   * (crypto_int64) g0;
     93  crypto_int64 f2g1    = f2   * (crypto_int64) g1;
     94  crypto_int64 f2g2    = f2   * (crypto_int64) g2;
     95  crypto_int64 f2g3    = f2   * (crypto_int64) g3;
     96  crypto_int64 f2g4    = f2   * (crypto_int64) g4;
     97  crypto_int64 f2g5    = f2   * (crypto_int64) g5;
     98  crypto_int64 f2g6    = f2   * (crypto_int64) g6;
     99  crypto_int64 f2g7    = f2   * (crypto_int64) g7;
    100  crypto_int64 f2g8_19 = f2   * (crypto_int64) g8_19;
    101  crypto_int64 f2g9_19 = f2   * (crypto_int64) g9_19;
    102  crypto_int64 f3g0    = f3   * (crypto_int64) g0;
    103  crypto_int64 f3g1_2  = f3_2 * (crypto_int64) g1;
    104  crypto_int64 f3g2    = f3   * (crypto_int64) g2;
    105  crypto_int64 f3g3_2  = f3_2 * (crypto_int64) g3;
    106  crypto_int64 f3g4    = f3   * (crypto_int64) g4;
    107  crypto_int64 f3g5_2  = f3_2 * (crypto_int64) g5;
    108  crypto_int64 f3g6    = f3   * (crypto_int64) g6;
    109  crypto_int64 f3g7_38 = f3_2 * (crypto_int64) g7_19;
    110  crypto_int64 f3g8_19 = f3   * (crypto_int64) g8_19;
    111  crypto_int64 f3g9_38 = f3_2 * (crypto_int64) g9_19;
    112  crypto_int64 f4g0    = f4   * (crypto_int64) g0;
    113  crypto_int64 f4g1    = f4   * (crypto_int64) g1;
    114  crypto_int64 f4g2    = f4   * (crypto_int64) g2;
    115  crypto_int64 f4g3    = f4   * (crypto_int64) g3;
    116  crypto_int64 f4g4    = f4   * (crypto_int64) g4;
    117  crypto_int64 f4g5    = f4   * (crypto_int64) g5;
    118  crypto_int64 f4g6_19 = f4   * (crypto_int64) g6_19;
    119  crypto_int64 f4g7_19 = f4   * (crypto_int64) g7_19;
    120  crypto_int64 f4g8_19 = f4   * (crypto_int64) g8_19;
    121  crypto_int64 f4g9_19 = f4   * (crypto_int64) g9_19;
    122  crypto_int64 f5g0    = f5   * (crypto_int64) g0;
    123  crypto_int64 f5g1_2  = f5_2 * (crypto_int64) g1;
    124  crypto_int64 f5g2    = f5   * (crypto_int64) g2;
    125  crypto_int64 f5g3_2  = f5_2 * (crypto_int64) g3;
    126  crypto_int64 f5g4    = f5   * (crypto_int64) g4;
    127  crypto_int64 f5g5_38 = f5_2 * (crypto_int64) g5_19;
    128  crypto_int64 f5g6_19 = f5   * (crypto_int64) g6_19;
    129  crypto_int64 f5g7_38 = f5_2 * (crypto_int64) g7_19;
    130  crypto_int64 f5g8_19 = f5   * (crypto_int64) g8_19;
    131  crypto_int64 f5g9_38 = f5_2 * (crypto_int64) g9_19;
    132  crypto_int64 f6g0    = f6   * (crypto_int64) g0;
    133  crypto_int64 f6g1    = f6   * (crypto_int64) g1;
    134  crypto_int64 f6g2    = f6   * (crypto_int64) g2;
    135  crypto_int64 f6g3    = f6   * (crypto_int64) g3;
    136  crypto_int64 f6g4_19 = f6   * (crypto_int64) g4_19;
    137  crypto_int64 f6g5_19 = f6   * (crypto_int64) g5_19;
    138  crypto_int64 f6g6_19 = f6   * (crypto_int64) g6_19;
    139  crypto_int64 f6g7_19 = f6   * (crypto_int64) g7_19;
    140  crypto_int64 f6g8_19 = f6   * (crypto_int64) g8_19;
    141  crypto_int64 f6g9_19 = f6   * (crypto_int64) g9_19;
    142  crypto_int64 f7g0    = f7   * (crypto_int64) g0;
    143  crypto_int64 f7g1_2  = f7_2 * (crypto_int64) g1;
    144  crypto_int64 f7g2    = f7   * (crypto_int64) g2;
    145  crypto_int64 f7g3_38 = f7_2 * (crypto_int64) g3_19;
    146  crypto_int64 f7g4_19 = f7   * (crypto_int64) g4_19;
    147  crypto_int64 f7g5_38 = f7_2 * (crypto_int64) g5_19;
    148  crypto_int64 f7g6_19 = f7   * (crypto_int64) g6_19;
    149  crypto_int64 f7g7_38 = f7_2 * (crypto_int64) g7_19;
    150  crypto_int64 f7g8_19 = f7   * (crypto_int64) g8_19;
    151  crypto_int64 f7g9_38 = f7_2 * (crypto_int64) g9_19;
    152  crypto_int64 f8g0    = f8   * (crypto_int64) g0;
    153  crypto_int64 f8g1    = f8   * (crypto_int64) g1;
    154  crypto_int64 f8g2_19 = f8   * (crypto_int64) g2_19;
    155  crypto_int64 f8g3_19 = f8   * (crypto_int64) g3_19;
    156  crypto_int64 f8g4_19 = f8   * (crypto_int64) g4_19;
    157  crypto_int64 f8g5_19 = f8   * (crypto_int64) g5_19;
    158  crypto_int64 f8g6_19 = f8   * (crypto_int64) g6_19;
    159  crypto_int64 f8g7_19 = f8   * (crypto_int64) g7_19;
    160  crypto_int64 f8g8_19 = f8   * (crypto_int64) g8_19;
    161  crypto_int64 f8g9_19 = f8   * (crypto_int64) g9_19;
    162  crypto_int64 f9g0    = f9   * (crypto_int64) g0;
    163  crypto_int64 f9g1_38 = f9_2 * (crypto_int64) g1_19;
    164  crypto_int64 f9g2_19 = f9   * (crypto_int64) g2_19;
    165  crypto_int64 f9g3_38 = f9_2 * (crypto_int64) g3_19;
    166  crypto_int64 f9g4_19 = f9   * (crypto_int64) g4_19;
    167  crypto_int64 f9g5_38 = f9_2 * (crypto_int64) g5_19;
    168  crypto_int64 f9g6_19 = f9   * (crypto_int64) g6_19;
    169  crypto_int64 f9g7_38 = f9_2 * (crypto_int64) g7_19;
    170  crypto_int64 f9g8_19 = f9   * (crypto_int64) g8_19;
    171  crypto_int64 f9g9_38 = f9_2 * (crypto_int64) g9_19;
    172  crypto_int64 h0 = f0g0+f1g9_38+f2g8_19+f3g7_38+f4g6_19+f5g5_38+f6g4_19+f7g3_38+f8g2_19+f9g1_38;
    173  crypto_int64 h1 = f0g1+f1g0   +f2g9_19+f3g8_19+f4g7_19+f5g6_19+f6g5_19+f7g4_19+f8g3_19+f9g2_19;
    174  crypto_int64 h2 = f0g2+f1g1_2 +f2g0   +f3g9_38+f4g8_19+f5g7_38+f6g6_19+f7g5_38+f8g4_19+f9g3_38;
    175  crypto_int64 h3 = f0g3+f1g2   +f2g1   +f3g0   +f4g9_19+f5g8_19+f6g7_19+f7g6_19+f8g5_19+f9g4_19;
    176  crypto_int64 h4 = f0g4+f1g3_2 +f2g2   +f3g1_2 +f4g0   +f5g9_38+f6g8_19+f7g7_38+f8g6_19+f9g5_38;
    177  crypto_int64 h5 = f0g5+f1g4   +f2g3   +f3g2   +f4g1   +f5g0   +f6g9_19+f7g8_19+f8g7_19+f9g6_19;
    178  crypto_int64 h6 = f0g6+f1g5_2 +f2g4   +f3g3_2 +f4g2   +f5g1_2 +f6g0   +f7g9_38+f8g8_19+f9g7_38;
    179  crypto_int64 h7 = f0g7+f1g6   +f2g5   +f3g4   +f4g3   +f5g2   +f6g1   +f7g0   +f8g9_19+f9g8_19;
    180  crypto_int64 h8 = f0g8+f1g7_2 +f2g6   +f3g5_2 +f4g4   +f5g3_2 +f6g2   +f7g1_2 +f8g0   +f9g9_38;
    181  crypto_int64 h9 = f0g9+f1g8   +f2g7   +f3g6   +f4g5   +f5g4   +f6g3   +f7g2   +f8g1   +f9g0   ;
    182  crypto_int64 carry0;
    183  crypto_int64 carry1;
    184  crypto_int64 carry2;
    185  crypto_int64 carry3;
    186  crypto_int64 carry4;
    187  crypto_int64 carry5;
    188  crypto_int64 carry6;
    189  crypto_int64 carry7;
    190  crypto_int64 carry8;
    191  crypto_int64 carry9;
    192 
    193  /*
    194  |h0| <= (1.65*1.65*2^52*(1+19+19+19+19)+1.65*1.65*2^50*(38+38+38+38+38))
    195    i.e. |h0| <= 1.4*2^60; narrower ranges for h2, h4, h6, h8
    196  |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19))
    197    i.e. |h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9
    198  */
    199 
    200  carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= SHL64(carry0,26);
    201  carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= SHL64(carry4,26);
    202  /* |h0| <= 2^25 */
    203  /* |h4| <= 2^25 */
    204  /* |h1| <= 1.71*2^59 */
    205  /* |h5| <= 1.71*2^59 */
    206 
    207  carry1 = (h1 + (crypto_int64) (1<<24)) >> 25; h2 += carry1; h1 -= SHL64(carry1,25);
    208  carry5 = (h5 + (crypto_int64) (1<<24)) >> 25; h6 += carry5; h5 -= SHL64(carry5,25);
    209  /* |h1| <= 2^24; from now on fits into int32 */
    210  /* |h5| <= 2^24; from now on fits into int32 */
    211  /* |h2| <= 1.41*2^60 */
    212  /* |h6| <= 1.41*2^60 */
    213 
    214  carry2 = (h2 + (crypto_int64) (1<<25)) >> 26; h3 += carry2; h2 -= SHL64(carry2,26);
    215  carry6 = (h6 + (crypto_int64) (1<<25)) >> 26; h7 += carry6; h6 -= SHL64(carry6,26);
    216  /* |h2| <= 2^25; from now on fits into int32 unchanged */
    217  /* |h6| <= 2^25; from now on fits into int32 unchanged */
    218  /* |h3| <= 1.71*2^59 */
    219  /* |h7| <= 1.71*2^59 */
    220 
    221  carry3 = (h3 + (crypto_int64) (1<<24)) >> 25; h4 += carry3; h3 -= SHL64(carry3,25);
    222  carry7 = (h7 + (crypto_int64) (1<<24)) >> 25; h8 += carry7; h7 -= SHL64(carry7,25);
    223  /* |h3| <= 2^24; from now on fits into int32 unchanged */
    224  /* |h7| <= 2^24; from now on fits into int32 unchanged */
    225  /* |h4| <= 1.72*2^34 */
    226  /* |h8| <= 1.41*2^60 */
    227 
    228  carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= SHL64(carry4,26);
    229  carry8 = (h8 + (crypto_int64) (1<<25)) >> 26; h9 += carry8; h8 -= SHL64(carry8,26);
    230  /* |h4| <= 2^25; from now on fits into int32 unchanged */
    231  /* |h8| <= 2^25; from now on fits into int32 unchanged */
    232  /* |h5| <= 1.01*2^24 */
    233  /* |h9| <= 1.71*2^59 */
    234 
    235  carry9 = (h9 + (crypto_int64) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= SHL64(carry9,25);
    236  /* |h9| <= 2^24; from now on fits into int32 unchanged */
    237  /* |h0| <= 1.1*2^39 */
    238 
    239  carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= SHL64(carry0,26);
    240  /* |h0| <= 2^25; from now on fits into int32 unchanged */
    241  /* |h1| <= 1.01*2^24 */
    242 
    243  h[0] = (crypto_int32) h0;
    244  h[1] = (crypto_int32) h1;
    245  h[2] = (crypto_int32) h2;
    246  h[3] = (crypto_int32) h3;
    247  h[4] = (crypto_int32) h4;
    248  h[5] = (crypto_int32) h5;
    249  h[6] = (crypto_int32) h6;
    250  h[7] = (crypto_int32) h7;
    251  h[8] = (crypto_int32) h8;
    252  h[9] = (crypto_int32) h9;
    253 }