xsum.cpp (33148B)
1 /* FUNCTIONS FOR EXACT SUMMATION. */ 2 3 /* Copyright 2015, 2018, 2021, 2024 Radford M. Neal 4 5 Permission is hereby granted, free of charge, to any person obtaining 6 a copy of this software and associated documentation files (the 7 "Software"), to deal in the Software without restriction, including 8 without limitation the rights to use, copy, modify, merge, publish, 9 distribute, sublicense, and/or sell copies of the Software, and to 10 permit persons to whom the Software is furnished to do so, subject to 11 the following conditions: 12 13 The above copyright notice and this permission notice shall be 14 included in all copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 */ 24 25 #include <stdio.h> 26 #include <string.h> 27 #include <math.h> 28 #include "xsum.h" 29 30 /* ---------------------- IMPLEMENTATION ASSUMPTIONS ----------------------- */ 31 32 /* This code makes the following assumptions: 33 34 o The 'double' type is a IEEE-754 standard 64-bit floating-point value. 35 36 o The 'int64_t' and 'uint64_t' types exist, for 64-bit signed and 37 unsigned integers. 38 39 o The 'endianness' of 'double' and 64-bit integers is consistent 40 between these types - that is, looking at the bits of a 'double' 41 value as an 64-bit integer will have the expected result. 42 43 o Right shifts of a signed operand produce the results expected for 44 a two's complement representation. 45 46 o Rounding should be done in the "round to nearest, ties to even" mode. 
47 */ 48 49 /* --------------------------- CONFIGURATION ------------------------------- */ 50 51 /* IMPLEMENTATION OPTIONS. Can be set to either 0 or 1, whichever seems 52 to be fastest. */ 53 54 #define USE_SIMD 1 /* Use SIMD intrinsics (SSE2/AVX) if available? */ 55 56 #define USE_MEMSET_SMALL \ 57 1 /* Use memset rather than a loop (for small mem)? \ 58 */ 59 #define USE_MEMSET_LARGE \ 60 1 /* Use memset rather than a loop (for large mem)? \ 61 */ 62 #define USE_USED_LARGE 1 /* Use the used flags in a large accumulator? */ 63 64 #define OPT_SMALL 0 /* Class of manual optimization for operations on */ 65 /* small accumulator: 0 (none), 1, 2, 3 (SIMD) */ 66 #define OPT_CARRY 1 /* Use manually optimized carry propagation? */ 67 68 #define OPT_LARGE_SUM 1 /* Should manually optimized routines be used for */ 69 #define OPT_LARGE_SQNORM 1 /* operations using the large accumulator? */ 70 #define OPT_LARGE_DOT 1 71 72 #define OPT_SIMPLE_SUM 1 /* Should manually optimized routines be used for */ 73 #define OPT_SIMPLE_SQNORM 1 /* operations done with simple FP arithmetic? */ 74 #define OPT_SIMPLE_DOT 1 75 76 #define OPT_KAHAN_SUM 0 /* Use manually optimized routine for Kahan sum? */ 77 78 #define INLINE_SMALL 1 /* Inline more of the small accumulator routines? */ 79 /* (Not currently used) */ 80 #define INLINE_LARGE 1 /* Inline more of the large accumulator routines? */ 81 82 /* INCLUDE INTEL INTRINSICS IF USED AND AVAILABLE. */ 83 84 #if USE_SIMD && __SSE2__ 85 # include <immintrin.h> 86 #endif 87 88 /* COPY A 64-BIT QUANTITY - DOUBLE TO 64-BIT INT OR VICE VERSA. The 89 arguments are destination and source variables (not values). */ 90 91 #define COPY64(dst, src) memcpy(&(dst), &(src), sizeof(double)) 92 93 /* OPTIONAL INCLUSION OF PBINARY MODULE. Used for debug output. */ 94 95 #ifdef PBINARY 96 # include "pbinary.h" 97 #else 98 # define pbinary_int64(x, y) 0 99 # define pbinary_double(x) 0 100 #endif 101 102 /* SET UP DEBUG FLAG. 
It's a variable if debugging is enabled, and a
   constant if disabled (so that no code will be generated then). */

int xsum_debug = 0;

#ifndef DEBUG
# define xsum_debug 0
#endif

/* SET UP INLINE / NOINLINE MACROS. */

#if __GNUC__
# define INLINE inline __attribute__((always_inline))
# define NOINLINE __attribute__((noinline))
#else
# define INLINE inline
# define NOINLINE
#endif

/* ------------------------ INTERNAL ROUTINES ------------------------------- */

/* ADD AN INF OR NAN TO A SMALL ACCUMULATOR.  This only changes the flags,
   not the chunks in the accumulator, which retains the sum of the finite
   terms (which is perhaps sometimes useful to access, though no function
   to do so is defined at present).  A NaN with larger payload (seen as a
   52-bit unsigned integer) takes precedence, with the sign of the NaN always
   being positive.  This ensures that the order of summing NaN values doesn't
   matter. */

static NOINLINE void xsum_small_add_inf_nan(xsum_small_accumulator* sacc,
                                            xsum_int ivalue) {
  xsum_int mantissa;
  double fltv;

  mantissa = ivalue & XSUM_MANTISSA_MASK;

  if (mantissa == 0) /* Inf (exponent all 1s, zero mantissa) */
  {
    if (sacc->Inf == 0) { /* no previous Inf */
      sacc->Inf = ivalue;
    } else if (sacc->Inf != ivalue) { /* previous Inf was opposite sign */
      COPY64(fltv, ivalue);
      fltv = fltv - fltv; /* Inf - Inf: result will be a NaN */
      COPY64(sacc->Inf, fltv);
    }
  } else /* NaN */
  { /* Choose the NaN with the bigger payload and clear its sign.  Using <=
       ensures that we will choose the first NaN over the previous zero. */
    if ((sacc->NaN & XSUM_MANTISSA_MASK) <= mantissa) {
      sacc->NaN = ivalue & ~XSUM_SIGN_MASK;
    }
  }
}

/* PROPAGATE CARRIES TO NEXT CHUNK IN A SMALL ACCUMULATOR.  Needs to
   be called often enough that accumulated carries don't overflow out
   the top, as indicated by sacc->adds_until_propagate.  Returns the
   index of the uppermost non-zero chunk (0 if number is zero).

   After carry propagation, the uppermost non-zero chunk will indicate
   the sign of the number, and will not be -1 (all 1s).  It will be in
   the range -2^XSUM_LOW_MANTISSA_BITS to 2^XSUM_LOW_MANTISSA_BITS - 1.
   Lower chunks will be non-negative, and in the range from 0 up to
   2^XSUM_LOW_MANTISSA_BITS - 1. */

static NOINLINE int xsum_carry_propagate(xsum_small_accumulator* sacc) {
  int i, u, uix;

  if (xsum_debug) printf("\nCARRY PROPAGATING IN SMALL ACCUMULATOR\n");

  /* Set u to the index of the uppermost non-zero (for now) chunk, or
     return with value 0 if there is none. */

#if OPT_CARRY

  {
    u = XSUM_SCHUNKS - 1;
    switch (XSUM_SCHUNKS & 0x3) /* get u to be a multiple of 4 minus one */
    {
      case 3:
        if (sacc->chunk[u] != 0) {
          goto found2;
        }
        u -= 1;                   /* XSUM_SCHUNKS is a */
        /* fallthrough */
      case 2:
        if (sacc->chunk[u] != 0)  /* constant, so the */
        {
          goto found2;            /* compiler will do */
        }                         /* simple code here */
        u -= 1;
        /* fallthrough */
      case 1:
        if (sacc->chunk[u] != 0) {
          goto found2;
        }
        u -= 1;
        /* fallthrough */
      case 0:;
    }

    do /* here, u should be a multiple of 4 minus one, and at least 3 */
    {
# if USE_SIMD && __AVX__
      {
        /* Test four chunks at a time for all-zero with a single AVX load. */
        __m256i ch;
        ch = _mm256_loadu_si256((__m256i*)(sacc->chunk + u - 3));
        if (!_mm256_testz_si256(ch, ch)) {
          goto found;
        }
        u -= 4;
        if (u < 0) /* never actually happens, because value of XSUM_SCHUNKS */
        {
          break;   /* is such that u < 0 occurs at end of do loop instead */
        }
        ch = _mm256_loadu_si256((__m256i*)(sacc->chunk + u - 3));
        if (!_mm256_testz_si256(ch, ch)) {
          goto found;
        }
        u -= 4;
      }
# else
      {
        /* OR of four chunks is non-zero iff at least one chunk is. */
        if (sacc->chunk[u] | sacc->chunk[u - 1] | sacc->chunk[u - 2] |
            sacc->chunk[u - 3]) {
          goto found;
        }
        u -= 4;
      }
# endif

    } while (u >= 0);

    if (xsum_debug) printf("number is zero (1)\n");
    uix = 0;
    goto done;

  found: /* some chunk in [u-3..u] is non-zero; narrow down to which one */
    if (sacc->chunk[u] != 0) {
      goto found2;
    }
    u -= 1;
    if (sacc->chunk[u] != 0) {
      goto found2;
    }
    u -= 1;
    if (sacc->chunk[u] != 0) {
      goto found2;
    }
    u -= 1;

  found2:;
  }

#else /* Non-optimized search for uppermost non-zero chunk */

  {
    for (u = XSUM_SCHUNKS - 1; sacc->chunk[u] == 0; u--) {
      if (u == 0) {
        if (xsum_debug) printf("number is zero (1)\n");
        uix = 0;
        goto done;
      }
    }
  }

#endif

  /* At this point, sacc->chunk[u] must be non-zero */

  /* NOTE(review): %ld assumes xsum_schunk is 'long'; if it is int64_t ==
     long long on some platform this format is mismatched — debug-only
     output, but TODO confirm against xsum.h (PRId64 would be portable). */
  if (xsum_debug) printf("u: %d, sacc->chunk[u]: %ld", u, sacc->chunk[u]);

  /* Carry propagate, starting at the low-order chunks.  Note that the
     loop limit of u may be increased inside the loop. */

  i = 0; /* set to the index of the next non-zero chunk, from bottom */

#if OPT_CARRY
  {
    /* Quickly skip over unused low-order chunks.  Done here at the start
       on the theory that there are often many unused low-order chunks,
       justifying some overhead to begin, but later stretches of unused
       chunks may not be as large. */

    int e = u - 3; /* go only to 3 before so won't access beyond chunk array */

    do {
# if USE_SIMD && __AVX__
      {
        __m256i ch;
        ch = _mm256_loadu_si256((__m256i*)(sacc->chunk + i));
        if (!_mm256_testz_si256(ch, ch)) {
          break;
        }
        i += 4;
        if (i >= e) {
          break;
        }
        ch = _mm256_loadu_si256((__m256i*)(sacc->chunk + i));
        if (!_mm256_testz_si256(ch, ch)) {
          break;
        }
      }
# else
      {
        if (sacc->chunk[i] | sacc->chunk[i + 1] | sacc->chunk[i + 2] |
            sacc->chunk[i + 3]) {
          break;
        }
      }
# endif

      i += 4;

    } while (i <= e);
  }
#endif

  uix = -1; /* indicates that a non-zero chunk has not been found yet */

  do {
    xsum_schunk c;     /* Set to the chunk at index i (next non-zero one) */
    xsum_schunk clow;  /* Low-order bits of c */
    xsum_schunk chigh; /* High-order bits of c */

    /* Find the next non-zero chunk, setting i to its index, or break out
       of loop if there is none.  Note that the chunk at index u is not
       necessarily non-zero - it was initially, but u or the chunk at u
       may have changed. */

#if OPT_CARRY
    {
      c = sacc->chunk[i];
      if (c != 0) {
        goto nonzero;
      }
      i += 1;
      if (i > u) {
        break; /* reaching here is only possible when u == i initially, */
      }        /* with the last add to a chunk having changed it to 0 */

      for (;;) { /* unrolled by four; guaranteed to hit a non-zero chunk
                    at or before index u, so no bound check is needed */
        c = sacc->chunk[i];
        if (c != 0) {
          goto nonzero;
        }
        i += 1;
        c = sacc->chunk[i];
        if (c != 0) {
          goto nonzero;
        }
        i += 1;
        c = sacc->chunk[i];
        if (c != 0) {
          goto nonzero;
        }
        i += 1;
        c = sacc->chunk[i];
        if (c != 0) {
          goto nonzero;
        }
        i += 1;
      }
    }
#else
    {
      do {
        c = sacc->chunk[i];
        if (c != 0) {
          goto nonzero;
        }
        i += 1;
      } while (i <= u);

      break;
    }
#endif

    /* Propagate possible carry from this chunk to next chunk up. */

  nonzero:
    chigh = c >> XSUM_LOW_MANTISSA_BITS;
    if (chigh == 0) {
      uix = i;
      i += 1;
      continue; /* no need to change this chunk */
    }

    if (u == i) {
      if (chigh == -1) {
        uix = i;
        break; /* don't propagate -1 into the region of all zeros above */
      }
      u = i + 1; /* we will change chunk[u+1], so we'll need to look at it */
    }

    clow = c & XSUM_LOW_MANTISSA_MASK;
    if (clow != 0) {
      uix = i;
    }

    /* We now change chunk[i] and add to chunk[i+1].  Note that i+1 should be
       in range (no bigger than XSUM_CHUNKS-1) if summing memory, since
       the number of chunks is big enough to hold any sum, and we do not
       store redundant chunks with values 0 or -1 above previously non-zero
       chunks.  But other add operations might cause overflow, in which
       case we produce a NaN with all 1s as payload.  (We can't reliably
       produce an Inf of the right sign.) */

    sacc->chunk[i] = clow;
    if (i + 1 >= XSUM_SCHUNKS) {
      xsum_small_add_inf_nan(
          sacc,
          ((xsum_int)XSUM_EXP_MASK << XSUM_MANTISSA_BITS) | XSUM_MANTISSA_MASK);
      u = i;
    } else {
      sacc->chunk[i + 1] +=
          chigh; /* note: this could make this chunk be zero */
    }

    i += 1;

  } while (i <= u);

  if (xsum_debug) printf(" uix: %d new u: %d\n", uix, u);

  /* Check again for the number being zero, since carry propagation might
     have created zero from something that initially looked non-zero. */

  if (uix < 0) {
    if (xsum_debug) printf("number is zero (2)\n");
    uix = 0;
    goto done;
  }

  /* While the uppermost chunk is negative, with value -1, combine it with
     the chunk below (if there is one) to produce the same number but with
     one fewer non-zero chunks. */

  while (sacc->chunk[uix] == -1 &&
         uix > 0) { /* Left shift of a negative number is undefined according
                       to the standard, so do a multiply - it's all presumably
                       constant-folded by the compiler. */
    sacc->chunk[uix - 1] +=
        ((xsum_schunk)-1) * (((xsum_schunk)1) << XSUM_LOW_MANTISSA_BITS);
    sacc->chunk[uix] = 0;
    uix -= 1;
  }

  /* We can now add one less than the total allowed terms before the
     next carry propagate. */

done:
  sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS - 1;

  /* Return index of uppermost non-zero chunk. */

  return uix;
}

/* ------------------------ EXTERNAL ROUTINES ------------------------------- */

/* INITIALIZE A SMALL ACCUMULATOR TO ZERO. */

void xsum_small_init(xsum_small_accumulator* sacc) {
  sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS;
  sacc->Inf = sacc->NaN = 0;
#if USE_MEMSET_SMALL
  { memset(sacc->chunk, 0, XSUM_SCHUNKS * sizeof(xsum_schunk)); }
#elif USE_SIMD && __AVX__ && XSUM_SCHUNKS == 67
  { /* Hand-unrolled AVX zeroing: 16 x 4 chunks, then 2, then 1 (= 67). */
    xsum_schunk* ch = sacc->chunk;
    __m256i z = _mm256_setzero_si256();
    _mm256_storeu_si256((__m256i*)(ch + 0), z);
    _mm256_storeu_si256((__m256i*)(ch + 4), z);
    _mm256_storeu_si256((__m256i*)(ch + 8), z);
    _mm256_storeu_si256((__m256i*)(ch + 12), z);
    _mm256_storeu_si256((__m256i*)(ch + 16), z);
    _mm256_storeu_si256((__m256i*)(ch + 20), z);
    _mm256_storeu_si256((__m256i*)(ch + 24), z);
    _mm256_storeu_si256((__m256i*)(ch + 28), z);
    _mm256_storeu_si256((__m256i*)(ch + 32), z);
    _mm256_storeu_si256((__m256i*)(ch + 36), z);
    _mm256_storeu_si256((__m256i*)(ch + 40), z);
    _mm256_storeu_si256((__m256i*)(ch + 44), z);
    _mm256_storeu_si256((__m256i*)(ch + 48), z);
    _mm256_storeu_si256((__m256i*)(ch + 52), z);
    _mm256_storeu_si256((__m256i*)(ch + 56), z);
    _mm256_storeu_si256((__m256i*)(ch + 60), z);
    _mm_storeu_si128((__m128i*)(ch + 64), _mm256_castsi256_si128(z));
    _mm_storeu_si64(ch + 66, _mm256_castsi256_si128(z));
  }
#else
  {
    xsum_schunk* p;
    int n;
    p = sacc->chunk;
    n = XSUM_SCHUNKS;
    do {
      *p++ = 0;
      n -= 1;
    } while (n > 0);
  }
#endif
}

/* ADD ONE NUMBER TO A SMALL ACCUMULATOR ASSUMING NO CARRY PROPAGATION REQ'D.
   This function is declared INLINE regardless of the setting of INLINE_SMALL
   and for good performance it must be inlined by the compiler (otherwise the
   procedure call overhead will result in substantial inefficiency). */

static INLINE void xsum_add1_no_carry(xsum_small_accumulator* sacc,
                                      xsum_flt value) {
  xsum_int ivalue;
  xsum_int mantissa;
  xsum_expint exp, low_exp, high_exp;
  xsum_schunk* chunk_ptr;

  if (xsum_debug) {
    printf("ADD1 %+.17le\n ", (double)value);
    pbinary_double((double)value);
    printf("\n");
  }

  /* Extract exponent and mantissa.  Split exponent into high and low parts. */

  COPY64(ivalue, value);

  exp = (ivalue >> XSUM_MANTISSA_BITS) & XSUM_EXP_MASK;
  mantissa = ivalue & XSUM_MANTISSA_MASK;
  high_exp = exp >> XSUM_LOW_EXP_BITS;
  low_exp = exp & XSUM_LOW_EXP_MASK;

  if (xsum_debug) {
    printf(" high exp: ");
    pbinary_int64(high_exp, XSUM_HIGH_EXP_BITS);
    printf(" low exp: ");
    pbinary_int64(low_exp, XSUM_LOW_EXP_BITS);
    printf("\n");
  }

  /* Categorize number as normal, denormalized, or Inf/NaN according to
     the value of the exponent field. */

  if (exp == 0) /* zero or denormalized */
  { /* If it's a zero (positive or negative), we do nothing. */
    if (mantissa == 0) {
      return;
    }
    /* Denormalized mantissa has no implicit 1, but exponent is 1 not 0. */
    exp = low_exp = 1;
  } else if (exp == XSUM_EXP_MASK) /* Inf or NaN */
  { /* Just update flags in accumulator structure. */
    xsum_small_add_inf_nan(sacc, ivalue);
    return;
  } else /* normalized */
  { /* OR in implicit 1 bit at top of mantissa */
    mantissa |= (xsum_int)1 << XSUM_MANTISSA_BITS;
  }

  if (xsum_debug) {
    printf(" mantissa: ");
    pbinary_int64(mantissa, XSUM_MANTISSA_BITS + 1);
    printf("\n");
  }

  /* Use high part of exponent as index of chunk, and low part of
     exponent to give position within chunk.  Fetch the two chunks
     that will be modified. */

  chunk_ptr = sacc->chunk + high_exp;

  /* Separate mantissa into two parts, after shifting, and add to (or
     subtract from) this chunk and the next higher chunk (which always
     exists since there are three extra ones at the top).

     Note that low_mantissa will have at most XSUM_LOW_MANTISSA_BITS bits,
     while high_mantissa will have at most XSUM_MANTISSA_BITS bits, since
     even though the high mantissa includes the extra implicit 1 bit, it will
     also be shifted right by at least one bit. */

  xsum_int split_mantissa[2];
  split_mantissa[0] = ((xsum_uint)mantissa << low_exp) & XSUM_LOW_MANTISSA_MASK;
  split_mantissa[1] = mantissa >> (XSUM_LOW_MANTISSA_BITS - low_exp);

  /* Add to, or subtract from, the two affected chunks. */

#if OPT_SMALL == 1
  {
    xsum_int ivalue_sign = ivalue < 0 ? -1 : 1;
    chunk_ptr[0] += ivalue_sign * split_mantissa[0];
    chunk_ptr[1] += ivalue_sign * split_mantissa[1];
  }
#elif OPT_SMALL == 2
  { /* Branch-free negate-and-add: x XOR all-1s plus 1 is two's complement. */
    xsum_int ivalue_neg =
        ivalue >> (XSUM_SCHUNK_BITS - 1); /* all 0s if +ve, all 1s if -ve */
    chunk_ptr[0] += (split_mantissa[0] ^ ivalue_neg) + (ivalue_neg & 1);
    chunk_ptr[1] += (split_mantissa[1] ^ ivalue_neg) + (ivalue_neg & 1);
  }
#elif OPT_SMALL == 3 && USE_SIMD && __SSE2__
  { /* Same branch-free scheme as OPT_SMALL == 2, on both chunks at once. */
    xsum_int ivalue_neg =
        ivalue >> (XSUM_SCHUNK_BITS - 1); /* all 0s if +ve, all 1s if -ve */
    _mm_storeu_si128(
        (__m128i*)chunk_ptr,
        _mm_add_epi64(
            _mm_loadu_si128((__m128i*)chunk_ptr),
            _mm_add_epi64(
                _mm_set1_epi64((__m64)(ivalue_neg & 1)),
                _mm_xor_si128(_mm_set1_epi64((__m64)ivalue_neg),
                              _mm_loadu_si128((__m128i*)split_mantissa)))));
  }
#else
  {
    if (ivalue < 0) {
      chunk_ptr[0] -= split_mantissa[0];
      chunk_ptr[1] -= split_mantissa[1];
    } else {
      chunk_ptr[0] += split_mantissa[0];
      chunk_ptr[1] += split_mantissa[1];
    }
  }
#endif

  if (xsum_debug) {
    if (ivalue < 0) {
      printf(" -high man: ");
      pbinary_int64(-split_mantissa[1], XSUM_MANTISSA_BITS);
      printf("\n -low man: ");
      pbinary_int64(-split_mantissa[0], XSUM_LOW_MANTISSA_BITS);
      printf("\n");
    } else {
      printf(" high man: ");
      pbinary_int64(split_mantissa[1], XSUM_MANTISSA_BITS);
      printf("\n low man: ");
      pbinary_int64(split_mantissa[0], XSUM_LOW_MANTISSA_BITS);
      printf("\n");
    }
  }
}

/* ADD ONE DOUBLE TO A SMALL ACCUMULATOR.  This is equivalent to, but
   somewhat faster than, calling xsum_small_addv with a vector of one
   value.
*/

void xsum_small_add1(xsum_small_accumulator* sacc, xsum_flt value) {
  /* Propagate carries first if the budget of safe adds is exhausted. */
  if (sacc->adds_until_propagate == 0) {
    (void)xsum_carry_propagate(sacc);
  }

  xsum_add1_no_carry(sacc, value);

  sacc->adds_until_propagate -= 1;
}

/* RETURN THE RESULT OF ROUNDING A SMALL ACCUMULATOR.  The rounding mode
   is to nearest, with ties to even.  The small accumulator may be modified
   by this operation (by carry propagation being done), but the value it
   represents should not change. */

xsum_flt xsum_small_round(xsum_small_accumulator* sacc) {
  xsum_int ivalue;   /* mantissa bits being assembled (signed at first) */
  xsum_schunk lower; /* bits below those currently in ivalue */
  int i, j, e, more;
  xsum_int intv;     /* the IEEE-754 bit pattern of the result */
  double fltv;       /* the result, as a double */

  if (xsum_debug) printf("\nROUNDING SMALL ACCUMULATOR\n");

  /* See if we have a NaN from one of the numbers being a NaN, in
     which case we return the NaN with largest payload, or an infinite
     result (+Inf, -Inf, or a NaN if both +Inf and -Inf occurred).
     Note that we do NOT return NaN if we have both an infinite number
     and a sum of other numbers that overflows with opposite sign,
     since there is no real ambiguity regarding the sign in such a case. */

  if (sacc->NaN != 0) {
    COPY64(fltv, sacc->NaN);
    return fltv;
  }

  if (sacc->Inf != 0) {
    COPY64(fltv, sacc->Inf);
    return fltv;
  }

  /* If none of the numbers summed were infinite or NaN, we proceed to
     propagate carries, as a preliminary to finding the magnitude of
     the sum.  This also ensures that the sign of the result can be
     determined from the uppermost non-zero chunk.

     We also find the index, i, of this uppermost non-zero chunk, as
     the value returned by xsum_carry_propagate, and set ivalue to
     sacc->chunk[i].  Note that ivalue will not be 0 or -1, unless
     i is 0 (the lowest chunk), in which case it will be handled by
     the code for denormalized numbers. */

  i = xsum_carry_propagate(sacc);

  if (xsum_debug) xsum_small_display(sacc);

  ivalue = sacc->chunk[i];

  /* Handle a possible denormalized number, including zero. */

  if (i <= 1) {
    /* Check for zero value, in which case we can return immediately. */

    if (ivalue == 0) {
      return 0.0;
    }

    /* Check if it is actually a denormalized number.  It always is if only
       the lowest chunk is non-zero.  If the highest non-zero chunk is the
       next-to-lowest, we check the magnitude of the absolute value.
       Note that the real exponent is 1 (not 0), so we need to shift right
       by 1 here. */

    if (i == 0) {
      intv = ivalue >= 0 ? ivalue : -ivalue;
      intv >>= 1;
      if (ivalue < 0) {
        intv |= XSUM_SIGN_MASK;
      }
      if (xsum_debug) {
        printf("denormalized with i==0: intv %016llx\n", (long long)intv);
      }
      COPY64(fltv, intv);
      return fltv;
    } else { /* Note: Left shift of -ve number is undefined, so do a multiply
                instead, which is probably optimized to a shift. */
      intv = ivalue * ((xsum_int)1 << (XSUM_LOW_MANTISSA_BITS - 1)) +
             (sacc->chunk[0] >> 1);
      if (intv < 0) {
        if (intv > -((xsum_int)1 << XSUM_MANTISSA_BITS)) {
          intv = (-intv) | XSUM_SIGN_MASK;
          if (xsum_debug) {
            printf("denormalized with i==1: intv %016llx\n", (long long)intv);
          }
          COPY64(fltv, intv);
          return fltv;
        }
      } else /* non-negative */
      {
        if ((xsum_uint)intv < (xsum_uint)1 << XSUM_MANTISSA_BITS) {
          if (xsum_debug) {
            printf("denormalized with i==1: intv %016llx\n", (long long)intv);
          }
          COPY64(fltv, intv);
          return fltv;
        }
      }
      /* otherwise, it's not actually denormalized, so fall through to below */
    }
  }

  /* Find the location of the uppermost 1 bit in the absolute value of
     the upper chunk by converting it (as a signed integer) to a
     floating point value, and looking at the exponent.  Then set
     'more' to the number of bits from the lower chunk (and maybe the
     next lower) that are needed to fill out the mantissa of the
     result (including the top implicit 1 bit), plus two extra bits to
     help decide on rounding.  For negative numbers, it may turn out
     later that we need another bit, because negating a negative value
     may carry out of the top here, but not carry out of the top once
     more bits are shifted into the bottom later on. */

  fltv = (xsum_flt)ivalue; /* finds position of topmost 1 bit of |ivalue| */
  COPY64(intv, fltv);
  e = (intv >> XSUM_MANTISSA_BITS) & XSUM_EXP_MASK; /* e-bias is in 0..32 */
  more = 2 + XSUM_MANTISSA_BITS + XSUM_EXP_BIAS - e;

  if (xsum_debug) {
    printf("e: %d, more: %d, ivalue: %016llx\n", e, more,
           (long long)ivalue);
  }

  /* Change 'ivalue' to put in 'more' bits from lower chunks into the bottom.
     Also set 'j' to the index of the lowest chunk from which these bits came,
     and 'lower' to the remaining bits of that chunk not now in 'ivalue'.
     Note that 'lower' initially has at least one bit in it, which we can
     later move into 'ivalue' if it turns out that one more bit is needed. */

  ivalue *= (xsum_int)1 << more; /* multiply, since << of negative undefined */
  if (xsum_debug) {
    printf("after ivalue <<= more, ivalue: %016llx\n",
           (long long)ivalue);
  }
  j = i - 1;
  lower = sacc->chunk[j]; /* must exist, since denormalized if i==0 */
  if (more >= XSUM_LOW_MANTISSA_BITS) {
    more -= XSUM_LOW_MANTISSA_BITS;
    ivalue += lower << more;
    if (xsum_debug) {
      printf("after ivalue += lower << more, ivalue: %016llx\n",
             (long long)ivalue);
    }
    j -= 1;
    lower = j < 0 ? 0 : sacc->chunk[j];
  }
  ivalue += lower >> (XSUM_LOW_MANTISSA_BITS - more);
  lower &= ((xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - more)) - 1;

  if (xsum_debug) {
    printf("after final add to ivalue, ivalue: %016llx\n",
           (long long)ivalue);
    printf("j: %d, e: %d, |ivalue|: %016llx, lower: %016llx (a)\n", j, e,
           (long long)(ivalue < 0 ? -ivalue : ivalue), (long long)lower);
    printf(" mask of low 55 bits: 007fffffffffffff, mask: %016llx\n",
           (long long)((xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - more)) - 1);
  }

  /* Decide on rounding, with separate code for positive and negative values.

     At this point, 'ivalue' has the signed mantissa bits, plus two extra
     bits, with 'e' recording the exponent position for these within their
     top chunk.  For positive 'ivalue', the bits in 'lower' and chunks
     below 'j' add to the absolute value; for negative 'ivalue' they
     subtract.

     After setting 'ivalue' to the tentative unsigned mantissa
     (shifted left 2), and 'intv' to have the correct sign, this
     code goes to done_rounding if it finds that just discarding lower
     order bits is correct, and to round_away_from_zero if instead the
     magnitude should be increased by one in the lowest mantissa bit. */

  if (ivalue >= 0) /* number is positive, lower bits are added to magnitude */
  {
    intv = 0; /* positive sign */

    if ((ivalue & 2) == 0) /* extra bits are 0x */
    {
      if (xsum_debug) {
        printf("+, no adjustment, since remainder adds <1/2\n");
      }
      goto done_rounding;
    }

    if ((ivalue & 1) != 0) /* extra bits are 11 */
    {
      if (xsum_debug) {
        printf("+, round away from 0, since remainder adds >1/2\n");
      }
      goto round_away_from_zero;
    }

    if ((ivalue & 4) != 0) /* low bit is 1 (odd), extra bits are 10 */
    {
      if (xsum_debug) {
        printf("+odd, round away from 0, since remainder adds >=1/2\n");
      }
      goto round_away_from_zero;
    }

    if (lower == 0) /* see if any lower bits are non-zero */
    {
      while (j > 0) {
        j -= 1;
        if (sacc->chunk[j] != 0) {
          lower = 1;
          break;
        }
      }
    }

    if (lower != 0) /* low bit 0 (even), extra bits 10, non-zero lower bits */
    {
      if (xsum_debug) {
        printf("+even, round away from 0, since remainder adds >1/2\n");
      }
      goto round_away_from_zero;
    } else /* low bit 0 (even), extra bits 10, all lower bits 0 */
    {
      if (xsum_debug) {
        /* NOTE(review): "reaminder" typo in this debug string is kept
           as-is, since it is runtime output. */
        printf("+even, no adjustment, since reaminder adds exactly 1/2\n");
      }
      goto done_rounding;
    }
  }

  else /* number is negative, lower bits are subtracted from magnitude */
  {
    /* Check for a negative 'ivalue' that when negated doesn't contain a full
       mantissa's worth of bits, plus one to help rounding.  If so, move one
       more bit into 'ivalue' from 'lower' (and remove it from 'lower').
       This happens when the negation of the upper part of 'ivalue' has the
       form 10000... but the negation of the full 'ivalue' is not 10000... */

    if (((-ivalue) & ((xsum_int)1 << (XSUM_MANTISSA_BITS + 2))) == 0) {
      /* NOTE(review): 'pos' is an int holding a value computed in
         xsum_schunk (64-bit) width; if 'more' can be 0 here the shift is
         by XSUM_LOW_MANTISSA_BITS-1 and the value may not fit in int —
         TODO confirm the reachable range of 'more' at this point. */
      int pos = (xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - 1 - more);
      ivalue *= 2; /* note that left shift undefined if ivalue is negative */
      if (lower & pos) {
        ivalue += 1;
        lower &= ~pos;
      }
      e -= 1;
      if (xsum_debug) {
        printf("j: %d, e: %d, |ivalue|: %016llx, lower: %016llx (b)\n", j, e,
               (long long)(ivalue < 0 ? -ivalue : ivalue), (long long)lower);
      }
    }

    intv = XSUM_SIGN_MASK; /* negative sign */
    ivalue = -ivalue;      /* ivalue now contains the absolute value */

    if ((ivalue & 3) == 3) /* extra bits are 11 */
    {
      if (xsum_debug) {
        printf("-, round away from 0, since remainder adds >1/2\n");
      }
      goto round_away_from_zero;
    }

    if ((ivalue & 3) <= 1) /* extra bits are 00 or 01 */
    {
      if (xsum_debug) {
        printf(
            "-, no adjustment, since remainder adds <=1/4 or subtracts <1/4\n");
      }
      goto done_rounding;
    }

    if ((ivalue & 4) == 0) /* low bit is 0 (even), extra bits are 10 */
    {
      if (xsum_debug) {
        printf("-even, no adjustment, since remainder adds <=1/2\n");
      }
      goto done_rounding;
    }

    if (lower == 0) /* see if any lower bits are non-zero */
    {
      while (j > 0) {
        j -= 1;
        if (sacc->chunk[j] != 0) {
          lower = 1;
          break;
        }
      }
    }

    if (lower != 0) /* low bit 1 (odd), extra bits 10, non-zero lower bits */
    {
      if (xsum_debug) {
        printf("-odd, no adjustment, since remainder adds <1/2\n");
      }
      goto done_rounding;
    } else /* low bit 1 (odd), extra bits are 10, lower bits are all 0 */
    {
      if (xsum_debug) {
        printf("-odd, round away from 0, since remainder adds exactly 1/2\n");
      }
      goto round_away_from_zero;
    }
  }

round_away_from_zero:

  /* Round away from zero, then check for carry having propagated out the
     top, and shift if so. */

  ivalue += 4; /* add 1 to low-order mantissa bit */
  if (ivalue & ((xsum_int)1 << (XSUM_MANTISSA_BITS + 3))) {
    ivalue >>= 1;
    e += 1;
  }

done_rounding:;

  /* Get rid of the bottom 2 bits that were used to decide on rounding. */

  ivalue >>= 2;

  /* Adjust to the true exponent, accounting for where this chunk is. */

  e += (i << XSUM_LOW_EXP_BITS) - XSUM_EXP_BIAS - XSUM_MANTISSA_BITS;

  /* If exponent has overflowed, change to plus or minus Inf and return. */

  if (e >= XSUM_EXP_MASK) {
    intv |= (xsum_int)XSUM_EXP_MASK << XSUM_MANTISSA_BITS;
    COPY64(fltv, intv);
    if (xsum_debug) {
      printf("Final rounded result: %.17le (overflowed)\n ", fltv);
      pbinary_double(fltv);
      printf("\n");
    }
    return fltv;
  }

  /* Put exponent and mantissa into intv, which already has the sign,
     then copy into fltv. */

  intv += (xsum_int)e << XSUM_MANTISSA_BITS;
  intv += ivalue & XSUM_MANTISSA_MASK; /* mask out the implicit 1 bit */
  COPY64(fltv, intv);

  if (xsum_debug) {
    printf("Final rounded result: %.17le\n ", fltv);
    pbinary_double(fltv);
    printf("\n");
    /* Sanity check: the implicit 1 bit must be exactly in place. */
    if ((ivalue >> XSUM_MANTISSA_BITS) != 1) abort();
  }

  return fltv;
}

/* ------------------------- DEBUGGING ROUTINES ----------------------------- */

/* DISPLAY A SMALL ACCUMULATOR. */

void xsum_small_display(xsum_small_accumulator* sacc) {
  int i, dots;
  printf("Small accumulator:");
  if (sacc->Inf) {
    printf(" %cInf", sacc->Inf > 0 ? '+' : '-');
    /* Verify the stored Inf bit pattern really has an all-1s exponent. */
    if ((sacc->Inf & ((xsum_uint)XSUM_EXP_MASK << XSUM_MANTISSA_BITS)) !=
        ((xsum_uint)XSUM_EXP_MASK << XSUM_MANTISSA_BITS)) {
      printf(" BUT WRONG CONTENTS: %llx", (long long)sacc->Inf);
    }
  }
  if (sacc->NaN) {
    printf(" NaN (%llx)", (long long)sacc->NaN);
  }
  printf("\n");
  dots = 0; /* collapse runs of zero chunks into a single "..." line */
  for (i = XSUM_SCHUNKS - 1; i >= 0; i--) {
    if (sacc->chunk[i] == 0) {
      if (!dots) printf(" ...\n");
      dots = 1;
    } else {
      printf(
          "%5d %5d ", i,
          (int)((i << XSUM_LOW_EXP_BITS) - XSUM_EXP_BIAS - XSUM_MANTISSA_BITS));
      pbinary_int64((int64_t)sacc->chunk[i] >> 32, XSUM_SCHUNK_BITS - 32);
      printf(" ");
      pbinary_int64((int64_t)sacc->chunk[i] & 0xffffffff, 32);
      printf("\n");
      dots = 0;
    }
  }
  printf("\n");
}