mpmontg.c (35044B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 /* This file implements moduluar exponentiation using Montgomery's 6 * method for modular reduction. This file implements the method 7 * described as "Improvement 2" in the paper "A Cryptogrpahic Library for 8 * the Motorola DSP56000" by Stephen R. Dusse' and Burton S. Kaliski Jr. 9 * published in "Advances in Cryptology: Proceedings of EUROCRYPT '90" 10 * "Lecture Notes in Computer Science" volume 473, 1991, pg 230-244, 11 * published by Springer Verlag. 12 */ 13 14 #define MP_USING_CACHE_SAFE_MOD_EXP 1 15 #include <string.h> 16 #include "mpi-priv.h" 17 #include "mplogic.h" 18 #include "mpprime.h" 19 #ifdef MP_USING_MONT_MULF 20 #include "montmulf.h" 21 #endif 22 #include <stddef.h> /* ptrdiff_t */ 23 #include <assert.h> 24 25 #define STATIC 26 27 #define MAX_ODD_INTS 32 /* 2 ** (WINDOW_BITS - 1) */ 28 29 /*! computes T = REDC(T), 2^b == R 30 \param T < RN 31 */ 32 mp_err 33 s_mp_redc(mp_int *T, mp_mont_modulus *mmm) 34 { 35 mp_err res; 36 mp_size i; 37 38 i = (MP_USED(&mmm->N) << 1) + 1; 39 MP_CHECKOK(s_mp_pad(T, i)); 40 for (i = 0; i < MP_USED(&mmm->N); ++i) { 41 mp_digit m_i = MP_DIGIT(T, i) * mmm->n0prime; 42 /* T += N * m_i * (MP_RADIX ** i); */ 43 s_mp_mul_d_add_offset(&mmm->N, m_i, T, i); 44 } 45 s_mp_clamp(T); 46 47 /* T /= R */ 48 s_mp_rshd(T, MP_USED(&mmm->N)); 49 50 if (s_mp_cmp(T, &mmm->N) >= 0) { 51 /* T = T - N */ 52 MP_CHECKOK(s_mp_sub(T, &mmm->N)); 53 #ifdef DEBUG 54 if (mp_cmp(T, &mmm->N) >= 0) { 55 res = MP_UNDEF; 56 goto CLEANUP; 57 } 58 #endif 59 } 60 res = MP_OKAY; 61 CLEANUP: 62 return res; 63 } 64 65 #if !defined(MP_MONT_USE_MP_MUL) 66 67 /*! c <- REDC( a * b ) mod N 68 \param a < N i.e. "reduced" 69 \param b < N i.e. "reduced" 70 \param mmm modulus N and n0' of N 71 */ 72 mp_err 73 s_mp_mul_mont(const mp_int *a, const mp_int *b, mp_int *c, 74 mp_mont_modulus *mmm) 75 { 76 mp_digit *pb; 77 mp_digit m_i; 78 mp_err res; 79 mp_size ib; /* "index b": index of current digit of B */ 80 mp_size useda, usedb; 81 82 ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG); 83 84 if (MP_USED(a) < MP_USED(b)) { 85 const mp_int *xch = b; /* switch a and b, to do fewer outer loops */ 86 b = a; 87 a = xch; 88 } 89 90 MP_USED(c) = 1; 91 MP_DIGIT(c, 0) = 0; 92 ib = (MP_USED(&mmm->N) << 1) + 1; 93 if ((res = s_mp_pad(c, ib)) != MP_OKAY) 94 goto CLEANUP; 95 96 useda = MP_USED(a); 97 pb = MP_DIGITS(b); 98 s_mpv_mul_d(MP_DIGITS(a), useda, *pb++, MP_DIGITS(c)); 99 s_mp_setz(MP_DIGITS(c) + useda + 1, ib - (useda + 1)); 100 m_i = MP_DIGIT(c, 0) * mmm->n0prime; 101 s_mp_mul_d_add_offset(&mmm->N, m_i, c, 0); 102 103 /* Outer loop: Digits of b */ 104 usedb = MP_USED(b); 105 for (ib = 1; ib < usedb; ib++) { 106 mp_digit b_i = *pb++; 107 108 /* Inner product: Digits of a */ 109 if (b_i) 110 s_mpv_mul_d_add_prop(MP_DIGITS(a), useda, b_i, MP_DIGITS(c) + ib); 111 m_i = MP_DIGIT(c, ib) * mmm->n0prime; 112 s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib); 113 } 114 if (usedb < MP_USED(&mmm->N)) { 115 for (usedb = MP_USED(&mmm->N); ib < usedb; ++ib) { 116 m_i = MP_DIGIT(c, ib) * mmm->n0prime; 117 s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib); 118 } 119 } 120 s_mp_clamp(c); 121 s_mp_rshd(c, MP_USED(&mmm->N)); /* c /= R */ 122 if (s_mp_cmp(c, &mmm->N) >= 0) { 123 MP_CHECKOK(s_mp_sub(c, &mmm->N)); 124 } 125 res = MP_OKAY; 126 127 CLEANUP: 128 return res; 129 } 130 #endif 131 132 mp_err 133 mp_to_mont(const mp_int *x, const mp_int *N, mp_int *xMont) 134 { 135 mp_err res; 136 137 /* xMont = x * R mod N where N is modulus */ 138 if (x != xMont) { 139 MP_CHECKOK(mp_copy(x, xMont)); 140 } 141 MP_CHECKOK(s_mp_lshd(xMont, MP_USED(N))); /* xMont = x << b */ 142 MP_CHECKOK(mp_div(xMont, N, 0, xMont)); /* mod N */ 143 CLEANUP: 144 return res; 145 } 146 147 mp_digit 148 mp_calculate_mont_n0i(const mp_int *N) 149 { 150 return 0 - s_mp_invmod_radix(MP_DIGIT(N, 0)); 151 } 152 153 #ifdef MP_USING_MONT_MULF 154 155 /* the floating point multiply is already cache safe, 156 * don't turn on cache safe unless we specifically 157 * force it */ 158 #ifndef MP_FORCE_CACHE_SAFE 159 #undef MP_USING_CACHE_SAFE_MOD_EXP 160 #endif 161 162 unsigned int mp_using_mont_mulf = 1; 163 164 /* computes montgomery square of the integer in mResult */ 165 #define SQR \ 166 conv_i32_to_d32_and_d16(dm1, d16Tmp, mResult, nLen); \ 167 mont_mulf_noconv(mResult, dm1, d16Tmp, \ 168 dTmp, dn, MP_DIGITS(modulus), nLen, dn0) 169 170 /* computes montgomery product of x and the integer in mResult */ 171 #define MUL(x) \ 172 conv_i32_to_d32(dm1, mResult, nLen); \ 173 mont_mulf_noconv(mResult, dm1, oddPowers[x], \ 174 dTmp, dn, MP_DIGITS(modulus), nLen, dn0) 175 176 /* Do modular exponentiation using floating point multiply code. */ 177 mp_err 178 mp_exptmod_f(const mp_int *montBase, 179 const mp_int *exponent, 180 const mp_int *modulus, 181 mp_int *result, 182 mp_mont_modulus *mmm, 183 int nLen, 184 mp_size bits_in_exponent, 185 mp_size window_bits, 186 mp_size odd_ints) 187 { 188 mp_digit *mResult; 189 double *dBuf = 0, *dm1, *dn, *dSqr, *d16Tmp, *dTmp; 190 double dn0; 191 mp_size i; 192 mp_err res; 193 int expOff; 194 int dSize = 0, oddPowSize, dTmpSize; 195 mp_int accum1; 196 double *oddPowers[MAX_ODD_INTS]; 197 198 /* function for computing n0prime only works if n0 is odd */ 199 200 MP_DIGITS(&accum1) = 0; 201 202 for (i = 0; i < MAX_ODD_INTS; ++i) 203 oddPowers[i] = 0; 204 205 MP_CHECKOK(mp_init_size(&accum1, 3 * nLen + 2)); 206 207 mp_set(&accum1, 1); 208 MP_CHECKOK(mp_to_mont(&accum1, &(mmm->N), &accum1)); 209 MP_CHECKOK(s_mp_pad(&accum1, nLen)); 210 211 oddPowSize = 2 * nLen + 1; 212 dTmpSize = 2 * oddPowSize; 213 dSize = sizeof(double) * (nLen * 4 + 1 + 214 ((odd_ints + 1) * oddPowSize) + dTmpSize); 215 dBuf = malloc(dSize); 216 if (!dBuf) { 217 res = MP_MEM; 218 goto CLEANUP; 219 } 220 dm1 = dBuf; /* array of d32 */ 221 dn = dBuf + nLen; /* array of d32 */ 222 dSqr = dn + nLen; /* array of d32 */ 223 d16Tmp = dSqr + nLen; /* array of d16 */ 224 dTmp = d16Tmp + oddPowSize; 225 226 for (i = 0; i < odd_ints; ++i) { 227 oddPowers[i] = dTmp; 228 dTmp += oddPowSize; 229 } 230 mResult = (mp_digit *)(dTmp + dTmpSize); /* size is nLen + 1 */ 231 232 /* Make dn and dn0 */ 233 conv_i32_to_d32(dn, MP_DIGITS(modulus), nLen); 234 dn0 = (double)(mmm->n0prime & 0xffff); 235 236 /* Make dSqr */ 237 conv_i32_to_d32_and_d16(dm1, oddPowers[0], MP_DIGITS(montBase), nLen); 238 mont_mulf_noconv(mResult, dm1, oddPowers[0], 239 dTmp, dn, MP_DIGITS(modulus), nLen, dn0); 240 conv_i32_to_d32(dSqr, mResult, nLen); 241 242 for (i = 1; i < odd_ints; ++i) { 243 mont_mulf_noconv(mResult, dSqr, oddPowers[i - 1], 244 dTmp, dn, MP_DIGITS(modulus), nLen, dn0); 245 conv_i32_to_d16(oddPowers[i], mResult, nLen); 246 } 247 248 s_mp_copy(MP_DIGITS(&accum1), mResult, nLen); /* from, to, len */ 249 250 for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) { 251 mp_size smallExp; 252 MP_CHECKOK(mpl_get_bits(exponent, expOff, window_bits)); 253 smallExp = (mp_size)res; 254 255 if (window_bits == 1) { 256 if (!smallExp) { 257 SQR; 258 } else if (smallExp & 1) { 259 SQR; 260 MUL(0); 261 } else { 262 abort(); 263 } 264 } else if (window_bits == 4) { 265 if (!smallExp) { 266 SQR; 267 SQR; 268 SQR; 269 SQR; 270 } else if (smallExp & 1) { 271 SQR; 272 SQR; 273 SQR; 274 SQR; 275 MUL(smallExp / 2); 276 } else if (smallExp & 2) { 277 SQR; 278 SQR; 279 SQR; 280 MUL(smallExp / 4); 281 SQR; 282 } else if (smallExp & 4) { 283 SQR; 284 SQR; 285 MUL(smallExp / 8); 286 SQR; 287 SQR; 288 } else if (smallExp & 8) { 289 SQR; 290 MUL(smallExp / 16); 291 SQR; 292 SQR; 293 SQR; 294 } else { 295 abort(); 296 } 297 } else if (window_bits == 5) { 298 if (!smallExp) { 299 SQR; 300 SQR; 301 SQR; 302 SQR; 303 SQR; 304 } else if (smallExp & 1) { 305 SQR; 306 SQR; 307 SQR; 308 SQR; 309 SQR; 310 MUL(smallExp / 2); 311 } else if (smallExp & 2) { 312 SQR; 313 SQR; 314 SQR; 315 SQR; 316 MUL(smallExp / 4); 317 SQR; 318 } else if (smallExp & 4) { 319 SQR; 320 SQR; 321 SQR; 322 MUL(smallExp / 8); 323 SQR; 324 SQR; 325 } else if (smallExp & 8) { 326 SQR; 327 SQR; 328 MUL(smallExp / 16); 329 SQR; 330 SQR; 331 SQR; 332 } else if (smallExp & 0x10) { 333 SQR; 334 MUL(smallExp / 32); 335 SQR; 336 SQR; 337 SQR; 338 SQR; 339 } else { 340 abort(); 341 } 342 } else if (window_bits == 6) { 343 if (!smallExp) { 344 SQR; 345 SQR; 346 SQR; 347 SQR; 348 SQR; 349 SQR; 350 } else if (smallExp & 1) { 351 SQR; 352 SQR; 353 SQR; 354 SQR; 355 SQR; 356 SQR; 357 MUL(smallExp / 2); 358 } else if (smallExp & 2) { 359 SQR; 360 SQR; 361 SQR; 362 SQR; 363 SQR; 364 MUL(smallExp / 4); 365 SQR; 366 } else if (smallExp & 4) { 367 SQR; 368 SQR; 369 SQR; 370 SQR; 371 MUL(smallExp / 8); 372 SQR; 373 SQR; 374 } else if (smallExp & 8) { 375 SQR; 376 SQR; 377 SQR; 378 MUL(smallExp / 16); 379 SQR; 380 SQR; 381 SQR; 382 } else if (smallExp & 0x10) { 383 SQR; 384 SQR; 385 MUL(smallExp / 32); 386 SQR; 387 SQR; 388 SQR; 389 SQR; 390 } else if (smallExp & 0x20) { 391 SQR; 392 MUL(smallExp / 64); 393 SQR; 394 SQR; 395 SQR; 396 SQR; 397 SQR; 398 } else { 399 abort(); 400 } 401 } else { 402 abort(); 403 } 404 } 405 406 s_mp_copy(mResult, MP_DIGITS(&accum1), nLen); /* from, to, len */ 407 408 res = s_mp_redc(&accum1, mmm); 409 mp_exch(&accum1, result); 410 411 CLEANUP: 412 mp_clear(&accum1); 413 if (dBuf) { 414 if (dSize) 415 memset(dBuf, 0, dSize); 416 free(dBuf); 417 } 418 419 return res; 420 } 421 #undef SQR 422 #undef MUL 423 #endif 424 425 #define SQR(a, b) \ 426 MP_CHECKOK(mp_sqr(a, b)); \ 427 MP_CHECKOK(s_mp_redc(b, mmm)) 428 429 #if defined(MP_MONT_USE_MP_MUL) 430 #define MUL(x, a, b) \ 431 MP_CHECKOK(mp_mul(a, oddPowers + (x), b)); \ 432 MP_CHECKOK(s_mp_redc(b, mmm)) 433 #else 434 #define MUL(x, a, b) \ 435 MP_CHECKOK(s_mp_mul_mont(a, oddPowers + (x), b, mmm)) 436 #endif 437 438 #define SWAPPA \ 439 ptmp = pa1; \ 440 pa1 = pa2; \ 441 pa2 = ptmp 442 443 /* Do modular exponentiation using integer multiply code. */ 444 mp_err 445 mp_exptmod_i(const mp_int *montBase, 446 const mp_int *exponent, 447 const mp_int *modulus, 448 mp_int *result, 449 mp_mont_modulus *mmm, 450 int nLen, 451 mp_size bits_in_exponent, 452 mp_size window_bits, 453 mp_size odd_ints) 454 { 455 mp_int *pa1, *pa2, *ptmp; 456 mp_size i; 457 mp_err res; 458 int expOff; 459 mp_int accum1, accum2, power2, oddPowers[MAX_ODD_INTS]; 460 461 /* power2 = base ** 2; oddPowers[i] = base ** (2*i + 1); */ 462 /* oddPowers[i] = base ** (2*i + 1); */ 463 464 MP_DIGITS(&accum1) = 0; 465 MP_DIGITS(&accum2) = 0; 466 MP_DIGITS(&power2) = 0; 467 for (i = 0; i < MAX_ODD_INTS; ++i) { 468 MP_DIGITS(oddPowers + i) = 0; 469 } 470 471 MP_CHECKOK(mp_init_size(&accum1, 3 * nLen + 2)); 472 MP_CHECKOK(mp_init_size(&accum2, 3 * nLen + 2)); 473 474 MP_CHECKOK(mp_init_copy(&oddPowers[0], montBase)); 475 476 MP_CHECKOK(mp_init_size(&power2, nLen + 2 * MP_USED(montBase) + 2)); 477 MP_CHECKOK(mp_sqr(montBase, &power2)); /* power2 = montBase ** 2 */ 478 MP_CHECKOK(s_mp_redc(&power2, mmm)); 479 480 for (i = 1; i < odd_ints; ++i) { 481 MP_CHECKOK(mp_init_size(oddPowers + i, nLen + 2 * MP_USED(&power2) + 2)); 482 MP_CHECKOK(mp_mul(oddPowers + (i - 1), &power2, oddPowers + i)); 483 MP_CHECKOK(s_mp_redc(oddPowers + i, mmm)); 484 } 485 486 /* set accumulator to montgomery residue of 1 */ 487 mp_set(&accum1, 1); 488 MP_CHECKOK(mp_to_mont(&accum1, &(mmm->N), &accum1)); 489 pa1 = &accum1; 490 pa2 = &accum2; 491 492 for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) { 493 mp_size smallExp; 494 MP_CHECKOK(mpl_get_bits(exponent, expOff, window_bits)); 495 smallExp = (mp_size)res; 496 497 if (window_bits == 1) { 498 if (!smallExp) { 499 SQR(pa1, pa2); 500 SWAPPA; 501 } else if (smallExp & 1) { 502 SQR(pa1, pa2); 503 MUL(0, pa2, pa1); 504 } else { 505 abort(); 506 } 507 } else if (window_bits == 4) { 508 if (!smallExp) { 509 SQR(pa1, pa2); 510 SQR(pa2, pa1); 511 SQR(pa1, pa2); 512 SQR(pa2, pa1); 513 } else if (smallExp & 1) { 514 SQR(pa1, pa2); 515 SQR(pa2, pa1); 516 SQR(pa1, pa2); 517 SQR(pa2, pa1); 518 MUL(smallExp / 2, pa1, pa2); 519 SWAPPA; 520 } else if (smallExp & 2) { 521 SQR(pa1, pa2); 522 SQR(pa2, pa1); 523 SQR(pa1, pa2); 524 MUL(smallExp / 4, pa2, pa1); 525 SQR(pa1, pa2); 526 SWAPPA; 527 } else if (smallExp & 4) { 528 SQR(pa1, pa2); 529 SQR(pa2, pa1); 530 MUL(smallExp / 8, pa1, pa2); 531 SQR(pa2, pa1); 532 SQR(pa1, pa2); 533 SWAPPA; 534 } else if (smallExp & 8) { 535 SQR(pa1, pa2); 536 MUL(smallExp / 16, pa2, pa1); 537 SQR(pa1, pa2); 538 SQR(pa2, pa1); 539 SQR(pa1, pa2); 540 SWAPPA; 541 } else { 542 abort(); 543 } 544 } else if (window_bits == 5) { 545 if (!smallExp) { 546 SQR(pa1, pa2); 547 SQR(pa2, pa1); 548 SQR(pa1, pa2); 549 SQR(pa2, pa1); 550 SQR(pa1, pa2); 551 SWAPPA; 552 } else if (smallExp & 1) { 553 SQR(pa1, pa2); 554 SQR(pa2, pa1); 555 SQR(pa1, pa2); 556 SQR(pa2, pa1); 557 SQR(pa1, pa2); 558 MUL(smallExp / 2, pa2, pa1); 559 } else if (smallExp & 2) { 560 SQR(pa1, pa2); 561 SQR(pa2, pa1); 562 SQR(pa1, pa2); 563 SQR(pa2, pa1); 564 MUL(smallExp / 4, pa1, pa2); 565 SQR(pa2, pa1); 566 } else if (smallExp & 4) { 567 SQR(pa1, pa2); 568 SQR(pa2, pa1); 569 SQR(pa1, pa2); 570 MUL(smallExp / 8, pa2, pa1); 571 SQR(pa1, pa2); 572 SQR(pa2, pa1); 573 } else if (smallExp & 8) { 574 SQR(pa1, pa2); 575 SQR(pa2, pa1); 576 MUL(smallExp / 16, pa1, pa2); 577 SQR(pa2, pa1); 578 SQR(pa1, pa2); 579 SQR(pa2, pa1); 580 } else if (smallExp & 0x10) { 581 SQR(pa1, pa2); 582 MUL(smallExp / 32, pa2, pa1); 583 SQR(pa1, pa2); 584 SQR(pa2, pa1); 585 SQR(pa1, pa2); 586 SQR(pa2, pa1); 587 } else { 588 abort(); 589 } 590 } else if (window_bits == 6) { 591 if (!smallExp) { 592 SQR(pa1, pa2); 593 SQR(pa2, pa1); 594 SQR(pa1, pa2); 595 SQR(pa2, pa1); 596 SQR(pa1, pa2); 597 SQR(pa2, pa1); 598 } else if (smallExp & 1) { 599 SQR(pa1, pa2); 600 SQR(pa2, pa1); 601 SQR(pa1, pa2); 602 SQR(pa2, pa1); 603 SQR(pa1, pa2); 604 SQR(pa2, pa1); 605 MUL(smallExp / 2, pa1, pa2); 606 SWAPPA; 607 } else if (smallExp & 2) { 608 SQR(pa1, pa2); 609 SQR(pa2, pa1); 610 SQR(pa1, pa2); 611 SQR(pa2, pa1); 612 SQR(pa1, pa2); 613 MUL(smallExp / 4, pa2, pa1); 614 SQR(pa1, pa2); 615 SWAPPA; 616 } else if (smallExp & 4) { 617 SQR(pa1, pa2); 618 SQR(pa2, pa1); 619 SQR(pa1, pa2); 620 SQR(pa2, pa1); 621 MUL(smallExp / 8, pa1, pa2); 622 SQR(pa2, pa1); 623 SQR(pa1, pa2); 624 SWAPPA; 625 } else if (smallExp & 8) { 626 SQR(pa1, pa2); 627 SQR(pa2, pa1); 628 SQR(pa1, pa2); 629 MUL(smallExp / 16, pa2, pa1); 630 SQR(pa1, pa2); 631 SQR(pa2, pa1); 632 SQR(pa1, pa2); 633 SWAPPA; 634 } else if (smallExp & 0x10) { 635 SQR(pa1, pa2); 636 SQR(pa2, pa1); 637 MUL(smallExp / 32, pa1, pa2); 638 SQR(pa2, pa1); 639 SQR(pa1, pa2); 640 SQR(pa2, pa1); 641 SQR(pa1, pa2); 642 SWAPPA; 643 } else if (smallExp & 0x20) { 644 SQR(pa1, pa2); 645 MUL(smallExp / 64, pa2, pa1); 646 SQR(pa1, pa2); 647 SQR(pa2, pa1); 648 SQR(pa1, pa2); 649 SQR(pa2, pa1); 650 SQR(pa1, pa2); 651 SWAPPA; 652 } else { 653 abort(); 654 } 655 } else { 656 abort(); 657 } 658 } 659 660 res = s_mp_redc(pa1, mmm); 661 mp_exch(pa1, result); 662 663 CLEANUP: 664 mp_clear(&accum1); 665 mp_clear(&accum2); 666 mp_clear(&power2); 667 for (i = 0; i < odd_ints; ++i) { 668 mp_clear(oddPowers + i); 669 } 670 return res; 671 } 672 #undef SQR 673 #undef MUL 674 675 #ifdef MP_USING_CACHE_SAFE_MOD_EXP 676 unsigned int mp_using_cache_safe_exp = 1; 677 #endif 678 679 mp_err 680 mp_set_safe_modexp(int value) 681 { 682 #ifdef MP_USING_CACHE_SAFE_MOD_EXP 683 mp_using_cache_safe_exp = value; 684 return MP_OKAY; 685 #else 686 if (value == 0) { 687 return MP_OKAY; 688 } 689 return MP_BADARG; 690 #endif 691 } 692 693 #ifdef MP_USING_CACHE_SAFE_MOD_EXP 694 #define WEAVE_WORD_SIZE 4 695 696 /* 697 * mpi_to_weave takes an array of bignums, a matrix in which each bignum 698 * occupies all the columns of a row, and transposes it into a matrix in 699 * which each bignum occupies a column of every row. The first row of the 700 * input matrix becomes the first column of the output matrix. The n'th 701 * row of input becomes the n'th column of output. The input data is said 702 * to be "interleaved" or "woven" into the output matrix. 703 * 704 * The array of bignums is left in this woven form. Each time a single 705 * bignum value is needed, it is recreated by fetching the n'th column, 706 * forming a single row which is the new bignum. 707 * 708 * The purpose of this interleaving is make it impossible to determine which 709 * of the bignums is being used in any one operation by examining the pattern 710 * of cache misses. 711 * 712 * The weaving function does not transpose the entire input matrix in one call. 713 * It transposes 4 rows of mp_ints into their respective columns of output. 714 * 715 * This implementation treats each mp_int bignum as an array of mp_digits, 716 * It stores those bytes as a column of mp_digits in the output matrix. It 717 * doesn't care if the machine uses big-endian or little-endian byte ordering 718 * within mp_digits. 719 * 720 * "bignums" is an array of mp_ints. 721 * It points to four rows, four mp_ints, a subset of a larger array of mp_ints. 722 * 723 * "weaved" is the weaved output matrix. 724 * The first byte of bignums[0] is stored in weaved[0]. 725 * 726 * "nBignums" is the total number of bignums in the array of which "bignums" 727 * is a part. 728 * 729 * "nDigits" is the size in mp_digits of each mp_int in the "bignums" array. 730 * mp_ints that use less than nDigits digits are logically padded with zeros 731 * while being stored in the weaved array. 732 */ 733 mp_err 734 mpi_to_weave(const mp_int *bignums, 735 mp_digit *weaved, 736 mp_size nDigits, /* in each mp_int of input */ 737 mp_size nBignums) /* in the entire source array */ 738 { 739 mp_size i; 740 mp_digit *endDest = weaved + (nDigits * nBignums); 741 742 for (i = 0; i < WEAVE_WORD_SIZE; i++) { 743 mp_size used = MP_USED(&bignums[i]); 744 mp_digit *pSrc = MP_DIGITS(&bignums[i]); 745 mp_digit *endSrc = pSrc + used; 746 mp_digit *pDest = weaved + i; 747 748 ARGCHK(MP_SIGN(&bignums[i]) == MP_ZPOS, MP_BADARG); 749 ARGCHK(used <= nDigits, MP_BADARG); 750 751 for (; pSrc < endSrc; pSrc++) { 752 *pDest = *pSrc; 753 pDest += nBignums; 754 } 755 while (pDest < endDest) { 756 *pDest = 0; 757 pDest += nBignums; 758 } 759 } 760 761 return MP_OKAY; 762 } 763 764 /* 765 * These functions return 0xffffffff if the output is true, and 0 otherwise. 766 */ 767 #define CONST_TIME_MSB(x) (0L - ((x) >> (8 * sizeof(x) - 1))) 768 #define CONST_TIME_EQ_Z(x) CONST_TIME_MSB(~(x) & ((x)-1)) 769 #define CONST_TIME_EQ(a, b) CONST_TIME_EQ_Z((a) ^ (b)) 770 771 /* Reverse the operation above for one mp_int. 772 * Reconstruct one mp_int from its column in the weaved array. 773 * Every read accesses every element of the weaved array, in order to 774 * avoid timing attacks based on patterns of memory accesses. 775 */ 776 mp_err 777 weave_to_mpi(mp_int *a, /* out, result */ 778 const mp_digit *weaved, /* in, byte matrix */ 779 mp_size index, /* which column to read */ 780 mp_size nDigits, /* number of mp_digits in each bignum */ 781 mp_size nBignums) /* width of the matrix */ 782 { 783 /* these are indices, but need to be the same size as mp_digit 784 * because of the CONST_TIME operations */ 785 mp_digit i, j; 786 mp_digit d; 787 mp_digit *pDest = MP_DIGITS(a); 788 789 MP_SIGN(a) = MP_ZPOS; 790 MP_USED(a) = nDigits; 791 792 assert(weaved != NULL); 793 794 /* Fetch the proper column in constant time, indexing over the whole array */ 795 for (i = 0; i < nDigits; ++i) { 796 d = 0; 797 for (j = 0; j < nBignums; ++j) { 798 d |= weaved[i * nBignums + j] & CONST_TIME_EQ(j, index); 799 } 800 pDest[i] = d; 801 } 802 803 s_mp_clamp(a); 804 return MP_OKAY; 805 } 806 807 #define SQR(a, b) \ 808 MP_CHECKOK(mp_sqr(a, b)); \ 809 MP_CHECKOK(s_mp_redc(b, mmm)) 810 811 #if defined(MP_MONT_USE_MP_MUL) 812 #define MUL_NOWEAVE(x, a, b) \ 813 MP_CHECKOK(mp_mul(a, x, b)); \ 814 MP_CHECKOK(s_mp_redc(b, mmm)) 815 #else 816 #define MUL_NOWEAVE(x, a, b) \ 817 MP_CHECKOK(s_mp_mul_mont(a, x, b, mmm)) 818 #endif 819 820 #define MUL(x, a, b) \ 821 MP_CHECKOK(weave_to_mpi(&tmp, powers, (x), nLen, num_powers)); \ 822 MUL_NOWEAVE(&tmp, a, b) 823 824 #define SWAPPA \ 825 ptmp = pa1; \ 826 pa1 = pa2; \ 827 pa2 = ptmp 828 #define MP_ALIGN(x, y) ((((ptrdiff_t)(x)) + ((y)-1)) & (((ptrdiff_t)0) - (y))) 829 830 /* Do modular exponentiation using integer multiply code. */ 831 mp_err 832 mp_exptmod_safe_i(const mp_int *montBase, 833 const mp_int *exponent, 834 const mp_int *modulus, 835 mp_int *result, 836 mp_mont_modulus *mmm, 837 int nLen, 838 mp_size bits_in_exponent, 839 mp_size window_bits, 840 mp_size num_powers) 841 { 842 mp_int *pa1, *pa2, *ptmp; 843 mp_size i; 844 mp_size first_window; 845 mp_err res; 846 int expOff; 847 mp_int accum1, accum2, accum[WEAVE_WORD_SIZE]; 848 mp_int tmp; 849 mp_digit *powersArray = NULL; 850 mp_digit *powers = NULL; 851 852 MP_DIGITS(&accum1) = 0; 853 MP_DIGITS(&accum2) = 0; 854 MP_DIGITS(&accum[0]) = 0; 855 MP_DIGITS(&accum[1]) = 0; 856 MP_DIGITS(&accum[2]) = 0; 857 MP_DIGITS(&accum[3]) = 0; 858 MP_DIGITS(&tmp) = 0; 859 860 /* grab the first window value. This allows us to preload accumulator1 861 * and save a conversion, some squares and a multiple*/ 862 MP_CHECKOK(mpl_get_bits(exponent, 863 bits_in_exponent - window_bits, window_bits)); 864 first_window = (mp_size)res; 865 866 MP_CHECKOK(mp_init_size(&accum1, 3 * nLen + 2)); 867 MP_CHECKOK(mp_init_size(&accum2, 3 * nLen + 2)); 868 869 /* build the first WEAVE_WORD powers inline */ 870 /* if WEAVE_WORD_SIZE is not 4, this code will have to change */ 871 if (num_powers > 2) { 872 MP_CHECKOK(mp_init_size(&accum[0], 3 * nLen + 2)); 873 MP_CHECKOK(mp_init_size(&accum[1], 3 * nLen + 2)); 874 MP_CHECKOK(mp_init_size(&accum[2], 3 * nLen + 2)); 875 MP_CHECKOK(mp_init_size(&accum[3], 3 * nLen + 2)); 876 mp_set(&accum[0], 1); 877 MP_CHECKOK(mp_to_mont(&accum[0], &(mmm->N), &accum[0])); 878 MP_CHECKOK(mp_copy(montBase, &accum[1])); 879 SQR(montBase, &accum[2]); 880 MUL_NOWEAVE(montBase, &accum[2], &accum[3]); 881 powersArray = (mp_digit *)malloc(num_powers * (nLen * sizeof(mp_digit) + 1)); 882 if (!powersArray) { 883 res = MP_MEM; 884 goto CLEANUP; 885 } 886 /* powers[i] = base ** (i); */ 887 powers = (mp_digit *)MP_ALIGN(powersArray, num_powers); 888 MP_CHECKOK(mpi_to_weave(accum, powers, nLen, num_powers)); 889 if (first_window < 4) { 890 MP_CHECKOK(mp_copy(&accum[first_window], &accum1)); 891 first_window = num_powers; 892 } 893 } else { 894 if (first_window == 0) { 895 mp_set(&accum1, 1); 896 MP_CHECKOK(mp_to_mont(&accum1, &(mmm->N), &accum1)); 897 } else { 898 /* assert first_window == 1? */ 899 MP_CHECKOK(mp_copy(montBase, &accum1)); 900 } 901 } 902 903 /* 904 * calculate all the powers in the powers array. 905 * this adds 2**(k-1)-2 square operations over just calculating the 906 * odd powers where k is the window size in the two other mp_modexpt 907 * implementations in this file. We will get some of that 908 * back by not needing the first 'k' squares and one multiply for the 909 * first window. 910 * Given the value of 4 for WEAVE_WORD_SIZE, this loop will only execute if 911 * num_powers > 2, in which case powers will have been allocated. 912 */ 913 for (i = WEAVE_WORD_SIZE; i < num_powers; i++) { 914 int acc_index = i & (WEAVE_WORD_SIZE - 1); /* i % WEAVE_WORD_SIZE */ 915 if (i & 1) { 916 MUL_NOWEAVE(montBase, &accum[acc_index - 1], &accum[acc_index]); 917 /* we've filled the array do our 'per array' processing */ 918 if (acc_index == (WEAVE_WORD_SIZE - 1)) { 919 MP_CHECKOK(mpi_to_weave(accum, powers + i - (WEAVE_WORD_SIZE - 1), 920 nLen, num_powers)); 921 922 if (first_window <= i) { 923 MP_CHECKOK(mp_copy(&accum[first_window & (WEAVE_WORD_SIZE - 1)], 924 &accum1)); 925 first_window = num_powers; 926 } 927 } 928 } else { 929 /* up to 8 we can find 2^i-1 in the accum array, but at 8 we our source 930 * and target are the same so we need to copy.. After that, the 931 * value is overwritten, so we need to fetch it from the stored 932 * weave array */ 933 if (i > 2 * WEAVE_WORD_SIZE) { 934 MP_CHECKOK(weave_to_mpi(&accum2, powers, i / 2, nLen, num_powers)); 935 SQR(&accum2, &accum[acc_index]); 936 } else { 937 int half_power_index = (i / 2) & (WEAVE_WORD_SIZE - 1); 938 if (half_power_index == acc_index) { 939 /* copy is cheaper than weave_to_mpi */ 940 MP_CHECKOK(mp_copy(&accum[half_power_index], &accum2)); 941 SQR(&accum2, &accum[acc_index]); 942 } else { 943 SQR(&accum[half_power_index], &accum[acc_index]); 944 } 945 } 946 } 947 } 948 /* if the accum1 isn't set, Then there is something wrong with our logic 949 * above and is an internal programming error. 950 */ 951 #if MP_ARGCHK == 2 952 assert(MP_USED(&accum1) != 0); 953 #endif 954 955 /* set accumulator to montgomery residue of 1 */ 956 pa1 = &accum1; 957 pa2 = &accum2; 958 959 /* tmp is not used if window_bits == 1. */ 960 if (window_bits != 1) { 961 MP_CHECKOK(mp_init_size(&tmp, 3 * nLen + 2)); 962 } 963 964 for (expOff = bits_in_exponent - window_bits * 2; expOff >= 0; expOff -= window_bits) { 965 mp_size smallExp; 966 MP_CHECKOK(mpl_get_bits(exponent, expOff, window_bits)); 967 smallExp = (mp_size)res; 968 969 /* handle unroll the loops */ 970 switch (window_bits) { 971 case 1: 972 if (!smallExp) { 973 SQR(pa1, pa2); 974 SWAPPA; 975 } else if (smallExp & 1) { 976 SQR(pa1, pa2); 977 MUL_NOWEAVE(montBase, pa2, pa1); 978 } else { 979 abort(); 980 } 981 break; 982 case 6: 983 SQR(pa1, pa2); 984 SQR(pa2, pa1); 985 /* fall through */ 986 case 4: 987 SQR(pa1, pa2); 988 SQR(pa2, pa1); 989 SQR(pa1, pa2); 990 SQR(pa2, pa1); 991 MUL(smallExp, pa1, pa2); 992 SWAPPA; 993 break; 994 case 5: 995 SQR(pa1, pa2); 996 SQR(pa2, pa1); 997 SQR(pa1, pa2); 998 SQR(pa2, pa1); 999 SQR(pa1, pa2); 1000 MUL(smallExp, pa2, pa1); 1001 break; 1002 default: 1003 abort(); /* could do a loop? */ 1004 } 1005 } 1006 1007 res = s_mp_redc(pa1, mmm); 1008 mp_exch(pa1, result); 1009 1010 CLEANUP: 1011 mp_clear(&accum1); 1012 mp_clear(&accum2); 1013 mp_clear(&accum[0]); 1014 mp_clear(&accum[1]); 1015 mp_clear(&accum[2]); 1016 mp_clear(&accum[3]); 1017 mp_clear(&tmp); 1018 /* zero required by FIPS here, can't use PORT_ZFree 1019 * because mpi doesn't link with util */ 1020 if (powers) { 1021 PORT_Memset(powers, 0, num_powers * sizeof(mp_digit)); 1022 } 1023 free(powersArray); 1024 return res; 1025 } 1026 #undef SQR 1027 #undef MUL 1028 #endif 1029 1030 mp_err 1031 mp_exptmod(const mp_int *inBase, const mp_int *exponent, 1032 const mp_int *modulus, mp_int *result) 1033 { 1034 const mp_int *base; 1035 mp_size bits_in_exponent, i, window_bits, odd_ints; 1036 mp_err res; 1037 int nLen; 1038 mp_int montBase, goodBase; 1039 mp_mont_modulus mmm; 1040 #ifdef MP_USING_CACHE_SAFE_MOD_EXP 1041 static unsigned int max_window_bits; 1042 #endif 1043 1044 /* function for computing n0prime only works if n0 is odd */ 1045 if (!mp_isodd(modulus)) 1046 return s_mp_exptmod(inBase, exponent, modulus, result); 1047 1048 if (mp_cmp_z(inBase) == MP_LT) 1049 return MP_RANGE; 1050 MP_DIGITS(&montBase) = 0; 1051 MP_DIGITS(&goodBase) = 0; 1052 1053 if (mp_cmp(inBase, modulus) < 0) { 1054 base = inBase; 1055 } else { 1056 MP_CHECKOK(mp_init(&goodBase)); 1057 base = &goodBase; 1058 MP_CHECKOK(mp_mod(inBase, modulus, &goodBase)); 1059 } 1060 1061 nLen = MP_USED(modulus); 1062 MP_CHECKOK(mp_init_size(&montBase, 2 * nLen + 2)); 1063 1064 mmm.N = *modulus; /* a copy of the mp_int struct */ 1065 1066 /* compute n0', given n0, n0' = -(n0 ** -1) mod MP_RADIX 1067 ** where n0 = least significant mp_digit of N, the modulus. 1068 */ 1069 mmm.n0prime = mp_calculate_mont_n0i(modulus); 1070 1071 MP_CHECKOK(mp_to_mont(base, modulus, &montBase)); 1072 1073 bits_in_exponent = mpl_significant_bits(exponent); 1074 #ifdef MP_USING_CACHE_SAFE_MOD_EXP 1075 if (mp_using_cache_safe_exp) { 1076 if (bits_in_exponent > 780) 1077 window_bits = 6; 1078 else if (bits_in_exponent > 256) 1079 window_bits = 5; 1080 else if (bits_in_exponent > 20) 1081 window_bits = 4; 1082 /* RSA public key exponents are typically under 20 bits (common values 1083 * are: 3, 17, 65537) and a 4-bit window is inefficient 1084 */ 1085 else 1086 window_bits = 1; 1087 } else 1088 #endif 1089 if (bits_in_exponent > 480) 1090 window_bits = 6; 1091 else if (bits_in_exponent > 160) 1092 window_bits = 5; 1093 else if (bits_in_exponent > 20) 1094 window_bits = 4; 1095 /* RSA public key exponents are typically under 20 bits (common values 1096 * are: 3, 17, 65537) and a 4-bit window is inefficient 1097 */ 1098 else 1099 window_bits = 1; 1100 1101 #ifdef MP_USING_CACHE_SAFE_MOD_EXP 1102 /* 1103 * clamp the window size based on 1104 * the cache line size. 1105 */ 1106 if (!max_window_bits) { 1107 unsigned long cache_size = s_mpi_getProcessorLineSize(); 1108 /* processor has no cache, use 'fast' code always */ 1109 if (cache_size == 0) { 1110 mp_using_cache_safe_exp = 0; 1111 } 1112 if ((cache_size == 0) || (cache_size >= 64)) { 1113 max_window_bits = 6; 1114 } else if (cache_size >= 32) { 1115 max_window_bits = 5; 1116 } else if (cache_size >= 16) { 1117 max_window_bits = 4; 1118 } else 1119 max_window_bits = 1; /* should this be an assert? */ 1120 } 1121 1122 /* clamp the window size down before we caclulate bits_in_exponent */ 1123 if (mp_using_cache_safe_exp) { 1124 if (window_bits > max_window_bits) { 1125 window_bits = max_window_bits; 1126 } 1127 } 1128 #endif 1129 1130 odd_ints = 1 << (window_bits - 1); 1131 i = bits_in_exponent % window_bits; 1132 if (i != 0) { 1133 bits_in_exponent += window_bits - i; 1134 } 1135 1136 #ifdef MP_USING_MONT_MULF 1137 if (mp_using_mont_mulf) { 1138 MP_CHECKOK(s_mp_pad(&montBase, nLen)); 1139 res = mp_exptmod_f(&montBase, exponent, modulus, result, &mmm, nLen, 1140 bits_in_exponent, window_bits, odd_ints); 1141 } else 1142 #endif 1143 #ifdef MP_USING_CACHE_SAFE_MOD_EXP 1144 if (mp_using_cache_safe_exp) { 1145 res = mp_exptmod_safe_i(&montBase, exponent, modulus, result, &mmm, nLen, 1146 bits_in_exponent, window_bits, 1 << window_bits); 1147 } else 1148 #endif 1149 res = mp_exptmod_i(&montBase, exponent, modulus, result, &mmm, nLen, 1150 bits_in_exponent, window_bits, odd_ints); 1151 1152 CLEANUP: 1153 mp_clear(&montBase); 1154 mp_clear(&goodBase); 1155 /* Don't mp_clear mmm.N because it is merely a copy of modulus. 1156 ** Just zap it. 1157 */ 1158 memset(&mmm, 0, sizeof mmm); 1159 return res; 1160 }