tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

xsum.cpp (33148B)


      1 /* FUNCTIONS FOR EXACT SUMMATION. */
      2 
      3 /* Copyright 2015, 2018, 2021, 2024 Radford M. Neal
      4 
      5   Permission is hereby granted, free of charge, to any person obtaining
      6   a copy of this software and associated documentation files (the
      7   "Software"), to deal in the Software without restriction, including
      8   without limitation the rights to use, copy, modify, merge, publish,
      9   distribute, sublicense, and/or sell copies of the Software, and to
     10   permit persons to whom the Software is furnished to do so, subject to
     11   the following conditions:
     12 
     13   The above copyright notice and this permission notice shall be
     14   included in all copies or substantial portions of the Software.
     15 
     16   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     17   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     18   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     19   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
     20   LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     21   OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
     22   WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     23 */
     24 
     25 #include <stdio.h>
     26 #include <string.h>
     27 #include <math.h>
     28 #include "xsum.h"
     29 
     30 /* ---------------------- IMPLEMENTATION ASSUMPTIONS ----------------------- */
     31 
     32 /* This code makes the following assumptions:
     33 
     34     o The 'double' type is a IEEE-754 standard 64-bit floating-point value.
     35 
     36     o The 'int64_t' and 'uint64_t' types exist, for 64-bit signed and
     37       unsigned integers.
     38 
     39     o The 'endianness' of 'double' and 64-bit integers is consistent
     40       between these types - that is, looking at the bits of a 'double'
     41       value as an 64-bit integer will have the expected result.
     42 
     43     o Right shifts of a signed operand produce the results expected for
     44       a two's complement representation.
     45 
     46     o Rounding should be done in the "round to nearest, ties to even" mode.
     47 */
     48 
     49 /* --------------------------- CONFIGURATION ------------------------------- */
     50 
     51 /* IMPLEMENTATION OPTIONS.  Can be set to either 0 or 1, whichever seems
     52   to be fastest. */
     53 
     54 #define USE_SIMD 1 /* Use SIMD intrinsics (SSE2/AVX) if available?   */
     55 
     56 #define USE_MEMSET_SMALL                              \
     57  1 /* Use memset rather than a loop (for small mem)? \
     58     */
     59 #define USE_MEMSET_LARGE                                                   \
     60  1                      /* Use memset rather than a loop (for large mem)? \
     61                          */
     62 #define USE_USED_LARGE 1 /* Use the used flags in a large accumulator? */
     63 
     64 #define OPT_SMALL 0 /* Class of manual optimization for operations on */
     65                    /*   small accumulator: 0 (none), 1, 2, 3 (SIMD)  */
     66 #define OPT_CARRY 1 /* Use manually optimized carry propagation?      */
     67 
     68 #define OPT_LARGE_SUM 1    /* Should manually optimized routines be used for */
     69 #define OPT_LARGE_SQNORM 1 /*   operations using the large accumulator? */
     70 #define OPT_LARGE_DOT 1
     71 
     72 #define OPT_SIMPLE_SUM 1    /* Should manually optimized routines be used for */
     73 #define OPT_SIMPLE_SQNORM 1 /*   operations done with simple FP arithmetic? */
     74 #define OPT_SIMPLE_DOT 1
     75 
     76 #define OPT_KAHAN_SUM 0 /* Use manually optimized routine for Kahan sum?  */
     77 
     78 #define INLINE_SMALL 1 /* Inline more of the small accumulator routines? */
     79                       /*   (Not currently used)                         */
     80 #define INLINE_LARGE 1 /* Inline more of the large accumulator routines? */
     81 
     82 /* INCLUDE INTEL INTRINSICS IF USED AND AVAILABLE. */
     83 
     84 #if USE_SIMD && __SSE2__
     85 #  include <immintrin.h>
     86 #endif
     87 
     88 /* COPY A 64-BIT QUANTITY - DOUBLE TO 64-BIT INT OR VICE VERSA.  The
     89   arguments are destination and source variables (not values). */
     90 
     91 #define COPY64(dst, src) memcpy(&(dst), &(src), sizeof(double))
        /* (memcpy, rather than a pointer cast, is used for the type pun so
           there is no strict-aliasing undefined behavior; compilers fold the
           8-byte copy into a single register move.) */
     92 
     93 /* OPTIONAL INCLUSION OF PBINARY MODULE.  Used for debug output. */
     94 
     95 #ifdef PBINARY
     96 #  include "pbinary.h"
     97 #else
        /* Stub definitions that expand to the constant 0, so the debug
           printing code still compiles (and folds away) when the pbinary
           module is not included. */
     98 #  define pbinary_int64(x, y) 0
     99 #  define pbinary_double(x) 0
    100 #endif
    101 
    102 /* SET UP DEBUG FLAG.  It's a variable if debugging is enabled, and a
    103   constant if disabled (so that no code will be generated then). */
    104 
    105 int xsum_debug = 0;
    106 
    107 #ifndef DEBUG
        /* The macro shadows the variable above, so each "if (xsum_debug)"
           becomes "if (0)" and the debug code is compiled out entirely. */
    108 #  define xsum_debug 0
    109 #endif
    110 
    111 /* SET UP INLINE / NOINLINE MACROS. */
    112 
    113 #if __GNUC__
    114 #  define INLINE inline __attribute__((always_inline))
    115 #  define NOINLINE __attribute__((noinline))
    116 #else
    117 #  define INLINE inline
    118 #  define NOINLINE
    119 #endif
    121 /* ------------------------ INTERNAL ROUTINES ------------------------------- */
    122 
    123 /* ADD AN INF OR NAN TO A SMALL ACCUMULATOR.  This only changes the flags,
    124   not the chunks in the accumulator, which retains the sum of the finite
    125   terms (which is perhaps sometimes useful to access, though no function
    126   to do so is defined at present).  A NaN with larger payload (seen as a
    127   52-bit unsigned integer) takes precedence, with the sign of the NaN always
    128   being positive.  This ensures that the order of summing NaN values doesn't
    129   matter. */
    130 
    131 static NOINLINE void xsum_small_add_inf_nan(xsum_small_accumulator* sacc,
    132                                            xsum_int ivalue) {
         /* 'ivalue' is the raw IEEE-754 bit pattern (sign bit, all-ones
            exponent, mantissa) of the Inf or NaN being recorded. */
    133  xsum_int mantissa;
    134  double fltv;
    135 
    136  mantissa = ivalue & XSUM_MANTISSA_MASK;
    137 
         /* With an all-ones exponent, a zero mantissa means Inf and a
            non-zero mantissa means NaN (the mantissa is the NaN payload). */
    138  if (mantissa == 0) /* Inf */
    139  {
    140    if (sacc->Inf == 0) { /* no previous Inf */
    141      sacc->Inf = ivalue;
    142    } else if (sacc->Inf != ivalue) { /* previous Inf was opposite sign */
           /* +Inf and -Inf patterns differ only in the sign bit, so an
              unequal previous Inf must be of opposite sign; Inf - Inf then
              produces the NaN that gets stored. */
    143      COPY64(fltv, ivalue);
    144      fltv = fltv - fltv; /* result will be a NaN */
    145      COPY64(sacc->Inf, fltv);
    146    }
    147  } else /* NaN */
    148  {      /* Choose the NaN with the bigger payload and clear its sign.  Using <=
    149            ensures that we will choose the first NaN over the previous zero. */
    150    if ((sacc->NaN & XSUM_MANTISSA_MASK) <= mantissa) {
    151      sacc->NaN = ivalue & ~XSUM_SIGN_MASK;
    152    }
    153  }
    154 }
    155 
    156 /* PROPAGATE CARRIES TO NEXT CHUNK IN A SMALL ACCUMULATOR.  Needs to
    157   be called often enough that accumulated carries don't overflow out
    158   the top, as indicated by sacc->adds_until_propagate.  Returns the
    159   index of the uppermost non-zero chunk (0 if number is zero).
    160 
    161   After carry propagation, the uppermost non-zero chunk will indicate
    162   the sign of the number, and will not be -1 (all 1s).  It will be in
    163   the range -2^XSUM_LOW_MANTISSA_BITS to 2^XSUM_LOW_MANTISSA_BITS - 1.
    164   Lower chunks will be non-negative, and in the range from 0 up to
    165   2^XSUM_LOW_MANTISSA_BITS - 1. */
    166 
    167 static NOINLINE int xsum_carry_propagate(xsum_small_accumulator* sacc) {
    168  int i, u, uix;
    169 
    170  if (xsum_debug) printf("\nCARRY PROPAGATING IN SMALL ACCUMULATOR\n");
    171 
    172    /* Set u to the index of the uppermost non-zero (for now) chunk, or
    173       return with value 0 if there is none. */
    174 
    175 #if OPT_CARRY
    176 
    177  {
    178    u = XSUM_SCHUNKS - 1;
    179    switch (XSUM_SCHUNKS & 0x3) /* get u to be a multiple of 4 minus one  */
    180    {
           /* The case labels below fall through intentionally: each handles
              one leftover chunk so the main loop can test four at a time. */
    181      case 3:
    182        if (sacc->chunk[u] != 0) {
    183          goto found2;
    184        }
    185        u -= 1; /* XSUM_SCHUNKS is a */
    186      case 2:
    187        if (sacc->chunk[u] != 0) /* constant, so the  */
    188        {
    189          goto found2; /* compiler will do  */
    190        } /* simple code here  */
    191        u -= 1;
    192      case 1:
    193        if (sacc->chunk[u] != 0) {
    194          goto found2;
    195        }
    196        u -= 1;
    197      case 0:;
    198    }
    199 
    200    do /* here, u should be a multiple of 4 minus one, and at least 3 */
    201    {
    202 #  if USE_SIMD && __AVX__
    203      {
    204        __m256i ch;
    205        ch = _mm256_loadu_si256((__m256i*)(sacc->chunk + u - 3));
    206        if (!_mm256_testz_si256(ch, ch)) {
    207          goto found;
    208        }
    209        u -= 4;
    210        if (u < 0) /* never actually happens, because value of XSUM_SCHUNKS */
    211        {
    212          break; /*   is such that u < 0 occurs at end of do loop instead */
    213        }
    214        ch = _mm256_loadu_si256((__m256i*)(sacc->chunk + u - 3));
    215        if (!_mm256_testz_si256(ch, ch)) {
    216          goto found;
    217        }
    218        u -= 4;
    219      }
    220 #  else
    221      {
    222        if (sacc->chunk[u] | sacc->chunk[u - 1] | sacc->chunk[u - 2] |
    223            sacc->chunk[u - 3]) {
    224          goto found;
    225        }
    226        u -= 4;
    227      }
    228 #  endif
    229 
    230    } while (u >= 0);
    231 
    232    if (xsum_debug) printf("number is zero (1)\n");
    233    uix = 0;
    234    goto done;
    235 
         /* 'found' means some chunk among the four at u-3..u is non-zero;
            narrow down to the exact index, then fall into 'found2'. */
    236  found:
    237    if (sacc->chunk[u] != 0) {
    238      goto found2;
    239    }
    240    u -= 1;
    241    if (sacc->chunk[u] != 0) {
    242      goto found2;
    243    }
    244    u -= 1;
    245    if (sacc->chunk[u] != 0) {
    246      goto found2;
    247    }
    248    u -= 1;
    249 
    250  found2:;
    251  }
    252 
    253 #else /* Non-optimized search for uppermost non-zero chunk */
    254 
    255  {
    256    for (u = XSUM_SCHUNKS - 1; sacc->chunk[u] == 0; u--) {
    257      if (u == 0) {
    258        if (xsum_debug) printf("number is zero (1)\n");
    259        uix = 0;
    260        goto done;
    261      }
    262    }
    263  }
    264 
    265 #endif
    266 
    267  /* At this point, sacc->chunk[u] must be non-zero */
    268 
    269  if (xsum_debug) printf("u: %d, sacc->chunk[u]: %ld", u, sacc->chunk[u]);
         /* NOTE(review): %ld assumes xsum_schunk is 'long'; on LLP64 targets
            (e.g. 64-bit Windows) a 64-bit chunk would need PRId64 from
            <inttypes.h>.  Debug-only code, but worth confirming. */
    270 
    271  /* Carry propagate, starting at the low-order chunks.  Note that the
    272     loop limit of u may be increased inside the loop. */
    273 
    274  i = 0; /* set to the index of the next non-zero chunck, from bottom */
    275 
    276 #if OPT_CARRY
    277  {
    278    /* Quickly skip over unused low-order chunks.  Done here at the start
    279       on the theory that there are often many unused low-order chunks,
    280       justifying some overhead to begin, but later stretches of unused
    281       chunks may not be as large. */
    282 
    283    int e = u - 3; /* go only to 3 before so won't access beyond chunk array */
    284 
    285    do {
    286 #  if USE_SIMD && __AVX__
    287      {
    288        __m256i ch;
    289        ch = _mm256_loadu_si256((__m256i*)(sacc->chunk + i));
    290        if (!_mm256_testz_si256(ch, ch)) {
    291          break;
    292        }
    293        i += 4;
    294        if (i >= e) {
    295          break;
    296        }
    297        ch = _mm256_loadu_si256((__m256i*)(sacc->chunk + i));
    298        if (!_mm256_testz_si256(ch, ch)) {
    299          break;
    300        }
    301      }
    302 #  else
    303      {
    304        if (sacc->chunk[i] | sacc->chunk[i + 1] | sacc->chunk[i + 2] |
    305            sacc->chunk[i + 3]) {
    306          break;
    307        }
    308      }
    309 #  endif
    310 
    311      i += 4;
    312 
    313    } while (i <= e);
    314  }
    315 #endif
    316 
    317  uix = -1; /* indicates that a non-zero chunk has not been found yet */
    318 
    319  do {
    320    xsum_schunk c;     /* Set to the chunk at index i (next non-zero one) */
    321    xsum_schunk clow;  /* Low-order bits of c */
    322    xsum_schunk chigh; /* High-order bits of c */
    323 
    324    /* Find the next non-zero chunk, setting i to its index, or break out
    325       of loop if there is none.  Note that the chunk at index u is not
    326       necessarily non-zero - it was initially, but u or the chunk at u
    327       may have changed. */
    328 
    329 #if OPT_CARRY
    330    {
    331      c = sacc->chunk[i];
    332      if (c != 0) {
    333        goto nonzero;
    334      }
    335      i += 1;
    336      if (i > u) {
    337        break; /* reaching here is only possible when u == i initially, */
    338      } /*   with the last add to a chunk having changed it to 0 */
    339 
           /* Unrolled x4; safe without a bound check because chunk[u] acts
              as a sentinel - a non-zero chunk is known to exist at or
              below u once the i > u test above has been passed. */
    340      for (;;) {
    341        c = sacc->chunk[i];
    342        if (c != 0) {
    343          goto nonzero;
    344        }
    345        i += 1;
    346        c = sacc->chunk[i];
    347        if (c != 0) {
    348          goto nonzero;
    349        }
    350        i += 1;
    351        c = sacc->chunk[i];
    352        if (c != 0) {
    353          goto nonzero;
    354        }
    355        i += 1;
    356        c = sacc->chunk[i];
    357        if (c != 0) {
    358          goto nonzero;
    359        }
    360        i += 1;
    361      }
    362    }
    363 #else
    364    {
    365      do {
    366        c = sacc->chunk[i];
    367        if (c != 0) {
    368          goto nonzero;
    369        }
    370        i += 1;
    371      } while (i <= u);
    372 
    373      break;
    374    }
    375 #endif
    376 
    377    /* Propagate possible carry from this chunk to next chunk up. */
    378 
    379  nonzero:
    380    chigh = c >> XSUM_LOW_MANTISSA_BITS;
    381    if (chigh == 0) {
    382      uix = i;
    383      i += 1;
    384      continue; /* no need to change this chunk */
    385    }
    386 
    387    if (u == i) {
    388      if (chigh == -1) {
    389        uix = i;
    390        break; /* don't propagate -1 into the region of all zeros above */
    391      }
    392      u = i + 1; /* we will change chunk[u+1], so we'll need to look at it */
    393    }
    394 
    395    clow = c & XSUM_LOW_MANTISSA_MASK;
    396    if (clow != 0) {
    397      uix = i;
    398    }
    399 
    400    /* We now change chunk[i] and add to chunk[i+1]. Note that i+1 should be
    401       in range (no bigger than XSUM_CHUNKS-1) if summing memory, since
    402       the number of chunks is big enough to hold any sum, and we do not
    403       store redundant chunks with values 0 or -1 above previously non-zero
    404       chunks.  But other add operations might cause overflow, in which
    405       case we produce a NaN with all 1s as payload.  (We can't reliably produce
    406       an Inf of the right sign.) */
    407 
    408    sacc->chunk[i] = clow;
    409    if (i + 1 >= XSUM_SCHUNKS) {
    410      xsum_small_add_inf_nan(
    411          sacc,
    412          ((xsum_int)XSUM_EXP_MASK << XSUM_MANTISSA_BITS) | XSUM_MANTISSA_MASK);
    413      u = i;
    414    } else {
    415      sacc->chunk[i + 1] +=
    416          chigh; /* note: this could make this chunk be zero */
    417    }
    418 
    419    i += 1;
    420 
    421  } while (i <= u);
    422 
    423  if (xsum_debug) printf("  uix: %d  new u: %d\n", uix, u);
    424 
    425  /* Check again for the number being zero, since carry propagation might
    426     have created zero from something that initially looked non-zero. */
    427 
    428  if (uix < 0) {
    429    if (xsum_debug) printf("number is zero (2)\n");
    430    uix = 0;
    431    goto done;
    432  }
    433 
    434  /* While the uppermost chunk is negative, with value -1, combine it with
    435     the chunk below (if there is one) to produce the same number but with
    436     one fewer non-zero chunks. */
    437 
    438  while (sacc->chunk[uix] == -1 &&
    439         uix > 0) { /* Left shift of a negative number is undefined according to
    440                       the standard, so do a multiply - it's all presumably
    441                       constant-folded by the compiler.*/
    442    sacc->chunk[uix - 1] +=
    443        ((xsum_schunk)-1) * (((xsum_schunk)1) << XSUM_LOW_MANTISSA_BITS);
    444    sacc->chunk[uix] = 0;
    445    uix -= 1;
    446  }
    447 
    448  /* We can now add one less than the total allowed terms before the
    449     next carry propagate. */
    450 
    451 done:
    452  sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS - 1;
    453 
    454  /* Return index of uppermost non-zero chunk. */
    455 
    456  return uix;
    457 }
    458 
    459 /* ------------------------ EXTERNAL ROUTINES ------------------------------- */
    460 
    461 /* INITIALIZE A SMALL ACCUMULATOR TO ZERO. */
    462 
    463 void xsum_small_init(xsum_small_accumulator* sacc) {
    464  sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS;
    465  sacc->Inf = sacc->NaN = 0;
         /* Zero the chunks by one of three compile-time-selected methods:
            memset, explicit AVX stores (for the 67-chunk layout), or a
            plain loop. */
    466 #if USE_MEMSET_SMALL
    467  { memset(sacc->chunk, 0, XSUM_SCHUNKS * sizeof(xsum_schunk)); }
    468 #elif USE_SIMD && __AVX__ && XSUM_SCHUNKS == 67
    469  {
         /* 67 chunks = 16 256-bit stores (64 chunks) + one 128-bit store
            (chunks 64-65) + one 64-bit store (chunk 66). */
    470    xsum_schunk* ch = sacc->chunk;
    471    __m256i z = _mm256_setzero_si256();
    472    _mm256_storeu_si256((__m256i*)(ch + 0), z);
    473    _mm256_storeu_si256((__m256i*)(ch + 4), z);
    474    _mm256_storeu_si256((__m256i*)(ch + 8), z);
    475    _mm256_storeu_si256((__m256i*)(ch + 12), z);
    476    _mm256_storeu_si256((__m256i*)(ch + 16), z);
    477    _mm256_storeu_si256((__m256i*)(ch + 20), z);
    478    _mm256_storeu_si256((__m256i*)(ch + 24), z);
    479    _mm256_storeu_si256((__m256i*)(ch + 28), z);
    480    _mm256_storeu_si256((__m256i*)(ch + 32), z);
    481    _mm256_storeu_si256((__m256i*)(ch + 36), z);
    482    _mm256_storeu_si256((__m256i*)(ch + 40), z);
    483    _mm256_storeu_si256((__m256i*)(ch + 44), z);
    484    _mm256_storeu_si256((__m256i*)(ch + 48), z);
    485    _mm256_storeu_si256((__m256i*)(ch + 52), z);
    486    _mm256_storeu_si256((__m256i*)(ch + 56), z);
    487    _mm256_storeu_si256((__m256i*)(ch + 60), z);
    488    _mm_storeu_si128((__m128i*)(ch + 64), _mm256_castsi256_si128(z));
    489    _mm_storeu_si64(ch + 66, _mm256_castsi256_si128(z));
         /* NOTE(review): _mm_storeu_si64 is only available in fairly recent
            compiler intrinsic headers - confirm the minimum supported
            toolchain provides it. */
    490  }
    491 #else
    492  {
    493    xsum_schunk* p;
    494    int n;
    495    p = sacc->chunk;
    496    n = XSUM_SCHUNKS;
    497    do {
    498      *p++ = 0;
    499      n -= 1;
    500    } while (n > 0);
    501  }
    502 #endif
    503 }
    504 
    505 /* ADD ONE NUMBER TO A SMALL ACCUMULATOR ASSUMING NO CARRY PROPAGATION REQ'D.
    506   This function is declared INLINE regardless of the setting of INLINE_SMALL
    507   and for good performance it must be inlined by the compiler (otherwise the
    508   procedure call overhead will result in substantial inefficiency). */
    509 
    510 static INLINE void xsum_add1_no_carry(xsum_small_accumulator* sacc,
    511                                      xsum_flt value) {
         /* Caller must guarantee that one more add cannot overflow a chunk
            (see xsum_small_add1, which propagates carries first when
            sacc->adds_until_propagate reaches zero). */
    512  xsum_int ivalue;
    513  xsum_int mantissa;
    514  xsum_expint exp, low_exp, high_exp;
    515  xsum_schunk* chunk_ptr;
    516 
    517  if (xsum_debug) {
    518    printf("ADD1 %+.17le\n     ", (double)value);
    519    pbinary_double((double)value);
    520    printf("\n");
    521  }
    522 
    523  /* Extract exponent and mantissa.  Split exponent into high and low parts. */
    524 
    525  COPY64(ivalue, value);
    526 
    527  exp = (ivalue >> XSUM_MANTISSA_BITS) & XSUM_EXP_MASK;
    528  mantissa = ivalue & XSUM_MANTISSA_MASK;
    529  high_exp = exp >> XSUM_LOW_EXP_BITS;
    530  low_exp = exp & XSUM_LOW_EXP_MASK;
    531 
    532  if (xsum_debug) {
    533    printf("  high exp: ");
    534    pbinary_int64(high_exp, XSUM_HIGH_EXP_BITS);
    535    printf("  low exp: ");
    536    pbinary_int64(low_exp, XSUM_LOW_EXP_BITS);
    537    printf("\n");
    538  }
    539 
    540  /* Categorize number as normal, denormalized, or Inf/NaN according to
    541     the value of the exponent field. */
    542 
    543  if (exp == 0) /* zero or denormalized */
    544  {             /* If it's a zero (positive or negative), we do nothing. */
    545    if (mantissa == 0) {
    546      return;
    547    }
    548    /* Denormalized mantissa has no implicit 1, but exponent is 1 not 0. */
    549    exp = low_exp = 1;
    550  } else if (exp == XSUM_EXP_MASK) /* Inf or NaN */
    551  { /* Just update flags in accumulator structure. */
    552    xsum_small_add_inf_nan(sacc, ivalue);
    553    return;
    554  } else /* normalized */
    555  {      /* OR in implicit 1 bit at top of mantissa */
    556    mantissa |= (xsum_int)1 << XSUM_MANTISSA_BITS;
    557  }
    558 
    559  if (xsum_debug) {
    560    printf("  mantissa: ");
    561    pbinary_int64(mantissa, XSUM_MANTISSA_BITS + 1);
    562    printf("\n");
    563  }
    564 
    565  /* Use high part of exponent as index of chunk, and low part of
    566     exponent to give position within chunk.  Fetch the two chunks
    567     that will be modified. */
    568 
    569  chunk_ptr = sacc->chunk + high_exp;
    570 
    571  /* Separate mantissa into two parts, after shifting, and add to (or
    572     subtract from) this chunk and the next higher chunk (which always
    573     exists since there are three extra ones at the top).
    574 
    575     Note that low_mantissa will have at most XSUM_LOW_MANTISSA_BITS bits,
    576     while high_mantissa will have at most XSUM_MANTISSA_BITS bits, since
    577     even though the high mantissa includes the extra implicit 1 bit, it will
    578     also be shifted right by at least one bit. */
    579 
    580  xsum_int split_mantissa[2];
    581  split_mantissa[0] = ((xsum_uint)mantissa << low_exp) & XSUM_LOW_MANTISSA_MASK;
    582  split_mantissa[1] = mantissa >> (XSUM_LOW_MANTISSA_BITS - low_exp);
    583 
    584  /* Add to, or subtract from, the two affected chunks. */
    585 
    586 #if OPT_SMALL == 1
         /* Variant 1: scale both halves by the sign (+1 or -1) of the value. */
    587  {
    588    xsum_int ivalue_sign = ivalue < 0 ? -1 : 1;
    589    chunk_ptr[0] += ivalue_sign * split_mantissa[0];
    590    chunk_ptr[1] += ivalue_sign * split_mantissa[1];
    591  }
    592 #elif OPT_SMALL == 2
         /* Variant 2: conditional two's-complement negation - when ivalue_neg
            is all 1s, (x ^ -1) + 1 == ~x + 1 == -x; when all 0s, x is
            unchanged. */
    593  {
    594    xsum_int ivalue_neg =
    595        ivalue >> (XSUM_SCHUNK_BITS - 1); /* all 0s if +ve, all 1s if -ve */
    596    chunk_ptr[0] += (split_mantissa[0] ^ ivalue_neg) + (ivalue_neg & 1);
    597    chunk_ptr[1] += (split_mantissa[1] ^ ivalue_neg) + (ivalue_neg & 1);
    598  }
    599 #elif OPT_SMALL == 3 && USE_SIMD && __SSE2__
         /* Variant 3: same conditional negation as variant 2, but both
            chunks are updated with a single 128-bit SSE2 add. */
    600  {
    601    xsum_int ivalue_neg =
    602        ivalue >> (XSUM_SCHUNK_BITS - 1); /* all 0s if +ve, all 1s if -ve */
    603    _mm_storeu_si128(
    604        (__m128i*)chunk_ptr,
    605        _mm_add_epi64(
    606            _mm_loadu_si128((__m128i*)chunk_ptr),
    607            _mm_add_epi64(
    608                _mm_set1_epi64((__m64)(ivalue_neg & 1)),
    609                _mm_xor_si128(_mm_set1_epi64((__m64)ivalue_neg),
    610                              _mm_loadu_si128((__m128i*)split_mantissa)))));
    611  }
    612 #else
         /* Default variant: a plain branch on the sign. */
    613  {
    614    if (ivalue < 0) {
    615      chunk_ptr[0] -= split_mantissa[0];
    616      chunk_ptr[1] -= split_mantissa[1];
    617    } else {
    618      chunk_ptr[0] += split_mantissa[0];
    619      chunk_ptr[1] += split_mantissa[1];
    620    }
    621  }
    622 #endif
    623 
    624  if (xsum_debug) {
    625    if (ivalue < 0) {
    626      printf(" -high man: ");
    627      pbinary_int64(-split_mantissa[1], XSUM_MANTISSA_BITS);
    628      printf("\n  -low man: ");
    629      pbinary_int64(-split_mantissa[0], XSUM_LOW_MANTISSA_BITS);
    630      printf("\n");
    631    } else {
    632      printf("  high man: ");
    633      pbinary_int64(split_mantissa[1], XSUM_MANTISSA_BITS);
    634      printf("\n   low man: ");
    635      pbinary_int64(split_mantissa[0], XSUM_LOW_MANTISSA_BITS);
    636      printf("\n");
    637    }
    638  }
    639 }
    640 
    641 /* ADD ONE DOUBLE TO A SMALL ACCUMULATOR.  This is equivalent to, but
    642   somewhat faster than, calling xsum_small_addv with a vector of one
    643   value. */
    644 
    645 void xsum_small_add1(xsum_small_accumulator* sacc, xsum_flt value) {
    646  if (sacc->adds_until_propagate == 0) {
    647    (void)xsum_carry_propagate(sacc);
    648  }
    649 
    650  xsum_add1_no_carry(sacc, value);
    651 
    652  sacc->adds_until_propagate -= 1;
    653 }
    654 
    655 /* RETURN THE RESULT OF ROUNDING A SMALL ACCUMULATOR.  The rounding mode
    656   is to nearest, with ties to even.  The small accumulator may be modified
    657   by this operation (by carry propagation being done), but the value it
    658   represents should not change. */
    659 
    660 xsum_flt xsum_small_round(xsum_small_accumulator* sacc) {
    661  xsum_int ivalue;
    662  xsum_schunk lower;
    663  int i, j, e, more;
    664  xsum_int intv;
    665  double fltv;
    666 
    667  if (xsum_debug) printf("\nROUNDING SMALL ACCUMULATOR\n");
    668 
    669  /* See if we have a NaN from one of the numbers being a NaN, in
    670     which case we return the NaN with largest payload, or an infinite
    671     result (+Inf, -Inf, or a NaN if both +Inf and -Inf occurred).
    672     Note that we do NOT return NaN if we have both an infinite number
    673     and a sum of other numbers that overflows with opposite sign,
    674     since there is no real ambiguity regarding the sign in such a case. */
    675 
    676  if (sacc->NaN != 0) {
    677    COPY64(fltv, sacc->NaN);
    678    return fltv;
    679  }
    680 
    681  if (sacc->Inf != 0) {
    682    COPY64(fltv, sacc->Inf);
    683    return fltv;
    684  }
    685 
    686  /* If none of the numbers summed were infinite or NaN, we proceed to
    687     propagate carries, as a preliminary to finding the magnitude of
    688     the sum.  This also ensures that the sign of the result can be
    689     determined from the uppermost non-zero chunk.
    690 
    691     We also find the index, i, of this uppermost non-zero chunk, as
    692     the value returned by xsum_carry_propagate, and set ivalue to
    693     sacc->chunk[i].  Note that ivalue will not be 0 or -1, unless
    694     i is 0 (the lowest chunk), in which case it will be handled by
    695     the code for denormalized numbers. */
    696 
    697  i = xsum_carry_propagate(sacc);
    698 
    699  if (xsum_debug) xsum_small_display(sacc);
    700 
    701  ivalue = sacc->chunk[i];
    702 
    703  /* Handle a possible denormalized number, including zero. */
    704 
    705  if (i <= 1) {
    706    /* Check for zero value, in which case we can return immediately. */
    707 
    708    if (ivalue == 0) {
    709      return 0.0;
    710    }
    711 
    712    /* Check if it is actually a denormalized number.  It always is if only
    713       the lowest chunk is non-zero.  If the highest non-zero chunk is the
    714       next-to-lowest, we check the magnitude of the absolute value.
    715       Note that the real exponent is 1 (not 0), so we need to shift right
    716       by 1 here. */
    717 
    718    if (i == 0) {
    719      intv = ivalue >= 0 ? ivalue : -ivalue;
    720      intv >>= 1;
    721      if (ivalue < 0) {
    722        intv |= XSUM_SIGN_MASK;
    723      }
    724      if (xsum_debug) {
    725        printf("denormalized with i==0: intv %016llx\n", (long long)intv);
    726      }
    727      COPY64(fltv, intv);
    728      return fltv;
    729    } else { /* Note: Left shift of -ve number is undefined, so do a multiply
    730                instead, which is probably optimized to a shift. */
    731      intv = ivalue * ((xsum_int)1 << (XSUM_LOW_MANTISSA_BITS - 1)) +
    732             (sacc->chunk[0] >> 1);
    733      if (intv < 0) {
    734        if (intv > -((xsum_int)1 << XSUM_MANTISSA_BITS)) {
    735          intv = (-intv) | XSUM_SIGN_MASK;
    736          if (xsum_debug) {
    737            printf("denormalized with i==1: intv %016llx\n", (long long)intv);
    738          }
    739          COPY64(fltv, intv);
    740          return fltv;
    741        }
    742      } else /* non-negative */
    743      {
    744        if ((xsum_uint)intv < (xsum_uint)1 << XSUM_MANTISSA_BITS) {
    745          if (xsum_debug) {
    746            printf("denormalized with i==1: intv %016llx\n", (long long)intv);
    747          }
    748          COPY64(fltv, intv);
    749          return fltv;
    750        }
    751      }
    752      /* otherwise, it's not actually denormalized, so fall through to below */
    753    }
    754  }
    755 
    756  /* Find the location of the uppermost 1 bit in the absolute value of
    757     the upper chunk by converting it (as a signed integer) to a
    758     floating point value, and looking at the exponent.  Then set
    759     'more' to the number of bits from the lower chunk (and maybe the
    760     next lower) that are needed to fill out the mantissa of the
    761     result (including the top implicit 1 bit), plus two extra bits to
    762     help decide on rounding.  For negative numbers, it may turn out
    763     later that we need another bit, because negating a negative value
    764     may carry out of the top here, but not carry out of the top once
    765     more bits are shifted into the bottom later on. */
    766 
    767  fltv = (xsum_flt)ivalue; /* finds position of topmost 1 bit of |ivalue| */
    768  COPY64(intv, fltv);
    769  e = (intv >> XSUM_MANTISSA_BITS) & XSUM_EXP_MASK; /* e-bias is in 0..32 */
    770  more = 2 + XSUM_MANTISSA_BITS + XSUM_EXP_BIAS - e;
    771 
    772  if (xsum_debug) {
    773    printf("e: %d, more: %d,             ivalue: %016llx\n", e, more,
    774           (long long)ivalue);
    775  }
    776 
    777  /* Change 'ivalue' to put in 'more' bits from lower chunks into the bottom.
    778     Also set 'j' to the index of the lowest chunk from which these bits came,
    779     and 'lower' to the remaining bits of that chunk not now in 'ivalue'.
    780     Note that 'lower' initially has at least one bit in it, which we can
    781     later move into 'ivalue' if it turns out that one more bit is needed. */
    782 
    783  ivalue *= (xsum_int)1 << more; /* multiply, since << of negative undefined */
    784  if (xsum_debug) {
    785    printf("after ivalue <<= more,         ivalue: %016llx\n",
    786           (long long)ivalue);
    787  }
    788  j = i - 1;
    789  lower = sacc->chunk[j]; /* must exist, since denormalized if i==0 */
    790  if (more >= XSUM_LOW_MANTISSA_BITS) {
    791    more -= XSUM_LOW_MANTISSA_BITS;
    792    ivalue += lower << more;
    793    if (xsum_debug) {
    794      printf("after ivalue += lower << more, ivalue: %016llx\n",
    795             (long long)ivalue);
    796    }
    797    j -= 1;
    798    lower = j < 0 ? 0 : sacc->chunk[j];
    799  }
    800  ivalue += lower >> (XSUM_LOW_MANTISSA_BITS - more);
    801  lower &= ((xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - more)) - 1;
    802 
    803  if (xsum_debug) {
    804    printf("after final add to ivalue,     ivalue: %016llx\n",
    805           (long long)ivalue);
    806    printf("j: %d, e: %d, |ivalue|: %016llx, lower: %016llx (a)\n", j, e,
    807           (long long)(ivalue < 0 ? -ivalue : ivalue), (long long)lower);
    808    printf("   mask of low 55 bits:   007fffffffffffff,  mask: %016llx\n",
    809           (long long)((xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - more)) - 1);
    810  }
    811 
    812  /* Decide on rounding, with separate code for positive and negative values.
    813 
    814     At this point, 'ivalue' has the signed mantissa bits, plus two extra
    815     bits, with 'e' recording the exponent position for these within their
    816     top chunk.  For positive 'ivalue', the bits in 'lower' and chunks
    817     below 'j' add to the absolute value; for negative 'ivalue' they
    818     subtract.
    819 
    820     After setting 'ivalue' to the tentative unsigned mantissa
    821     (shifted left 2), and 'intv' to have the correct sign, this
    822     code goes to done_rounding if it finds that just discarding lower
    823     order bits is correct, and to round_away_from_zero if instead the
    824     magnitude should be increased by one in the lowest mantissa bit. */
    825 
    826  if (ivalue >= 0) /* number is positive, lower bits are added to magnitude */
    827  {
    828    intv = 0; /* positive sign */
    829 
    830    if ((ivalue & 2) == 0) /* extra bits are 0x */
    831    {
    832      if (xsum_debug) {
    833        printf("+, no adjustment, since remainder adds <1/2\n");
    834      }
    835      goto done_rounding;
    836    }
    837 
    838    if ((ivalue & 1) != 0) /* extra bits are 11 */
    839    {
    840      if (xsum_debug) {
    841        printf("+, round away from 0, since remainder adds >1/2\n");
    842      }
    843      goto round_away_from_zero;
    844    }
    845 
    846    if ((ivalue & 4) != 0) /* low bit is 1 (odd), extra bits are 10 */
    847    {
    848      if (xsum_debug) {
    849        printf("+odd, round away from 0, since remainder adds >=1/2\n");
    850      }
    851      goto round_away_from_zero;
    852    }
    853 
    854    if (lower == 0) /* see if any lower bits are non-zero */
    855    {
    856      while (j > 0) {
    857        j -= 1;
    858        if (sacc->chunk[j] != 0) {
    859          lower = 1;
    860          break;
    861        }
    862      }
    863    }
    864 
    865    if (lower != 0) /* low bit 0 (even), extra bits 10, non-zero lower bits */
    866    {
    867      if (xsum_debug) {
    868        printf("+even, round away from 0, since remainder adds >1/2\n");
    869      }
    870      goto round_away_from_zero;
    871    } else /* low bit 0 (even), extra bits 10, all lower bits 0 */
    872    {
    873      if (xsum_debug) {
    874        printf("+even, no adjustment, since reaminder adds exactly 1/2\n");
    875      }
    876      goto done_rounding;
    877    }
    878  }
    879 
    880  else /* number is negative, lower bits are subtracted from magnitude */
    881  {
    882    /* Check for a negative 'ivalue' that when negated doesn't contain a full
    883       mantissa's worth of bits, plus one to help rounding.  If so, move one
    884       more bit into 'ivalue' from 'lower' (and remove it from 'lower').
    885       This happens when the negation of the upper part of 'ivalue' has the
    886       form 10000... but the negation of the full 'ivalue' is not 10000... */
    887 
    888    if (((-ivalue) & ((xsum_int)1 << (XSUM_MANTISSA_BITS + 2))) == 0) {
    889      int pos = (xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - 1 - more);
    890      ivalue *= 2; /* note that left shift undefined if ivalue is negative */
    891      if (lower & pos) {
    892        ivalue += 1;
    893        lower &= ~pos;
    894      }
    895      e -= 1;
    896      if (xsum_debug) {
    897        printf("j: %d, e: %d, |ivalue|: %016llx, lower: %016llx (b)\n", j, e,
    898               (long long)(ivalue < 0 ? -ivalue : ivalue), (long long)lower);
    899      }
    900    }
    901 
    902    intv = XSUM_SIGN_MASK; /* negative sign */
    903    ivalue = -ivalue;      /* ivalue now contains the absolute value */
    904 
    905    if ((ivalue & 3) == 3) /* extra bits are 11 */
    906    {
    907      if (xsum_debug) {
    908        printf("-, round away from 0, since remainder adds >1/2\n");
    909      }
    910      goto round_away_from_zero;
    911    }
    912 
    913    if ((ivalue & 3) <= 1) /* extra bits are 00 or 01 */
    914    {
    915      if (xsum_debug) {
    916        printf(
    917            "-, no adjustment, since remainder adds <=1/4 or subtracts <1/4\n");
    918      }
    919      goto done_rounding;
    920    }
    921 
    922    if ((ivalue & 4) == 0) /* low bit is 0 (even), extra bits are 10 */
    923    {
    924      if (xsum_debug) {
    925        printf("-even, no adjustment, since remainder adds <=1/2\n");
    926      }
    927      goto done_rounding;
    928    }
    929 
    930    if (lower == 0) /* see if any lower bits are non-zero */
    931    {
    932      while (j > 0) {
    933        j -= 1;
    934        if (sacc->chunk[j] != 0) {
    935          lower = 1;
    936          break;
    937        }
    938      }
    939    }
    940 
    941    if (lower != 0) /* low bit 1 (odd), extra bits 10, non-zero lower bits */
    942    {
    943      if (xsum_debug) {
    944        printf("-odd, no adjustment, since remainder adds <1/2\n");
    945      }
    946      goto done_rounding;
    947    } else /* low bit 1 (odd), extra bits are 10, lower bits are all 0 */
    948    {
    949      if (xsum_debug) {
    950        printf("-odd, round away from 0, since remainder adds exactly 1/2\n");
    951      }
    952      goto round_away_from_zero;
    953    }
    954  }
    955 
    956 round_away_from_zero:
    957 
    958  /* Round away from zero, then check for carry having propagated out the
    959     top, and shift if so. */
    960 
    961  ivalue += 4; /* add 1 to low-order mantissa bit */
    962  if (ivalue & ((xsum_int)1 << (XSUM_MANTISSA_BITS + 3))) {
    963    ivalue >>= 1;
    964    e += 1;
    965  }
    966 
    967 done_rounding:;
    968 
    969  /* Get rid of the bottom 2 bits that were used to decide on rounding. */
    970 
    971  ivalue >>= 2;
    972 
    973  /* Adjust to the true exponent, accounting for where this chunk is. */
    974 
    975  e += (i << XSUM_LOW_EXP_BITS) - XSUM_EXP_BIAS - XSUM_MANTISSA_BITS;
    976 
    977  /* If exponent has overflowed, change to plus or minus Inf and return. */
    978 
    979  if (e >= XSUM_EXP_MASK) {
    980    intv |= (xsum_int)XSUM_EXP_MASK << XSUM_MANTISSA_BITS;
    981    COPY64(fltv, intv);
    982    if (xsum_debug) {
    983      printf("Final rounded result: %.17le (overflowed)\n  ", fltv);
    984      pbinary_double(fltv);
    985      printf("\n");
    986    }
    987    return fltv;
    988  }
    989 
    990  /* Put exponent and mantissa into intv, which already has the sign,
    991     then copy into fltv. */
    992 
    993  intv += (xsum_int)e << XSUM_MANTISSA_BITS;
    994  intv += ivalue & XSUM_MANTISSA_MASK; /* mask out the implicit 1 bit */
    995  COPY64(fltv, intv);
    996 
    997  if (xsum_debug) {
    998    printf("Final rounded result: %.17le\n  ", fltv);
    999    pbinary_double(fltv);
   1000    printf("\n");
   1001    if ((ivalue >> XSUM_MANTISSA_BITS) != 1) abort();
   1002  }
   1003 
   1004  return fltv;
   1005 }
   1006 
   1007 /* ------------------------- DEBUGGING ROUTINES ----------------------------- */
   1008 
   1009 /* DISPLAY A SMALL ACCUMULATOR. */
   1010 
   1011 void xsum_small_display(xsum_small_accumulator* sacc) {
   1012  int i, dots;
   1013  printf("Small accumulator:");
   1014  if (sacc->Inf) {
   1015    printf(" %cInf", sacc->Inf > 0 ? '+' : '-');
   1016    if ((sacc->Inf & ((xsum_uint)XSUM_EXP_MASK << XSUM_MANTISSA_BITS)) !=
   1017        ((xsum_uint)XSUM_EXP_MASK << XSUM_MANTISSA_BITS)) {
   1018      printf(" BUT WRONG CONTENTS: %llx", (long long)sacc->Inf);
   1019    }
   1020  }
   1021  if (sacc->NaN) {
   1022    printf(" NaN (%llx)", (long long)sacc->NaN);
   1023  }
   1024  printf("\n");
   1025  dots = 0;
   1026  for (i = XSUM_SCHUNKS - 1; i >= 0; i--) {
   1027    if (sacc->chunk[i] == 0) {
   1028      if (!dots) printf("            ...\n");
   1029      dots = 1;
   1030    } else {
   1031      printf(
   1032          "%5d %5d ", i,
   1033          (int)((i << XSUM_LOW_EXP_BITS) - XSUM_EXP_BIAS - XSUM_MANTISSA_BITS));
   1034      pbinary_int64((int64_t)sacc->chunk[i] >> 32, XSUM_SCHUNK_BITS - 32);
   1035      printf(" ");
   1036      pbinary_int64((int64_t)sacc->chunk[i] & 0xffffffff, 32);
   1037      printf("\n");
   1038      dots = 0;
   1039    }
   1040  }
   1041  printf("\n");
   1042 }