tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

charconv_parse.cc (18724B)


      1 // Copyright 2018 The Abseil Authors.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //      https://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "absl/strings/internal/charconv_parse.h"
     16 #include "absl/strings/charconv.h"
     17 
     18 #include <cassert>
     19 #include <cstdint>
     20 #include <limits>
     21 
     22 #include "absl/strings/internal/memutil.h"
     23 
     24 namespace absl {
     25 ABSL_NAMESPACE_BEGIN
     26 namespace {
     27 
     28 // ParseFloat<10> will read the first 19 significant digits of the mantissa.
     29 // This number was chosen for multiple reasons.
     30 //
     31 // (a) First, for whatever integer type we choose to represent the mantissa, we
     32 // want to choose the largest possible number of decimal digits for that integer
     33 // type.  We are using uint64_t, which can express any 19-digit unsigned
     34 // integer.
     35 //
     36 // (b) Second, we need to parse enough digits that the binary value of any
     37 // mantissa we capture has more bits of resolution than the mantissa
     38 // representation in the target float.  Our algorithm requires at least 3 bits
     39 // of headway, but 19 decimal digits give a little more than that.
     40 //
     41 // The following static assertions verify the above comments:
     42 constexpr int kDecimalMantissaDigitsMax = 19;
     43 
     44 static_assert(std::numeric_limits<uint64_t>::digits10 ==
     45                  kDecimalMantissaDigitsMax,
     46              "(a) above");
     47 
     48 // IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa.
     49 static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed");
     50 static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact");
     51 static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact");
     52 
     53 // The lowest valued 19-digit decimal mantissa we can read still contains
     54 // sufficient information to reconstruct a binary mantissa.
     55 static_assert(1000000000000000000u > (uint64_t{1} << (53 + 3)), "(b) above");
     56 
     57 // ParseFloat<16> will read the first 15 significant digits of the mantissa.
     58 //
     59 // Because a base-16-to-base-2 conversion can be done exactly, we do not need
     60 // to maximize the number of scanned hex digits to improve our conversion.  What
     61 // is required is to scan two more bits than the mantissa can represent, so that
     62 // we always round correctly.
     63 //
     64 // (One extra bit does not suffice to perform correct rounding, since a number
     65 // exactly halfway between two representable floats has unique rounding rules,
     66 // so we need to differentiate between a "halfway between" number and a "closer
     67 // to the larger value" number.)
     68 constexpr int kHexadecimalMantissaDigitsMax = 15;
     69 
     70 // The minimum number of significant bits that will be read from
     71 // kHexadecimalMantissaDigitsMax hex digits.  We must subtract by three, since
     72 // the most significant digit can be a "1", which only contributes a single
     73 // significant bit.
     74 constexpr int kGuaranteedHexadecimalMantissaBitPrecision =
     75    4 * kHexadecimalMantissaDigitsMax - 3;
     76 
     77 static_assert(kGuaranteedHexadecimalMantissaBitPrecision >
     78                  std::numeric_limits<double>::digits + 2,
     79              "kHexadecimalMantissaDigitsMax too small");
     80 
     81 // We also impose a limit on the number of significant digits we will read from
     82 // an exponent, to avoid having to deal with integer overflow.  We use 9 for
     83 // this purpose.
     84 //
     85 // If we read a 9 digit exponent, the end result of the conversion will
     86 // necessarily be infinity or zero, depending on the sign of the exponent.
     87 // Therefore we can just drop extra digits on the floor without any extra
     88 // logic.
     89 constexpr int kDecimalExponentDigitsMax = 9;
     90 static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax,
     91              "int type too small");
     92 
     93 // To avoid incredibly large inputs causing integer overflow for our exponent,
     94 // we impose an arbitrary but very large limit on the number of significant
     95 // digits we will accept.  The implementation refuses to match a string with
     96 // more consecutive significant mantissa digits than this.
     97 constexpr int kDecimalDigitLimit = 50000000;
     98 
     99 // Corresponding limit for hexadecimal digit inputs.  This is one fourth the
    100 // amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires
    101 // a binary exponent adjustment of 4.
    102 constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4;
    103 
    104 // The largest exponent we can read is 999999999 (per
    105 // kDecimalExponentDigitsMax), and the largest exponent adjustment we can get
    106 // from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these
    107 // comfortably fits in an integer.
    108 //
    109 // We count kDecimalDigitLimit twice because there are independent limits for
    110 // numbers before and after the decimal point.  (In the case where there are no
    111 // significant digits before the decimal point, there are independent limits for
    112 // post-decimal-point leading zeroes and for significant digits.)
    113 static_assert(999999999 + 2 * kDecimalDigitLimit <
    114                  std::numeric_limits<int>::max(),
    115              "int type too small");
    116 static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) <
    117                  std::numeric_limits<int>::max(),
    118              "int type too small");
    119 
    120 // Returns true if the provided bitfield allows parsing an exponent value
    121 // (e.g., "1.5e100").
    122 bool AllowExponent(chars_format flags) {
    123  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
    124  bool scientific =
    125      (flags & chars_format::scientific) == chars_format::scientific;
    126  return scientific || !fixed;
    127 }
    128 
    129 // Returns true if the provided bitfield requires an exponent value be present.
    130 bool RequireExponent(chars_format flags) {
    131  bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
    132  bool scientific =
    133      (flags & chars_format::scientific) == chars_format::scientific;
    134  return scientific && !fixed;
    135 }
    136 
    137 const int8_t kAsciiToInt[256] = {
    138    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    139    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    140    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  1,  2,  3,  4,  5,  6,  7,  8,
    141    9,  -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
    142    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    143    -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    144    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    145    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    146    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    147    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    148    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    149    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    150    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    151    -1, -1, -1, -1, -1, -1, -1, -1, -1};
    152 
    153 // Returns true if `ch` is a digit in the given base
    154 template <int base>
    155 bool IsDigit(char ch);
    156 
    157 // Converts a valid `ch` to its digit value in the given base.
    158 template <int base>
    159 unsigned ToDigit(char ch);
    160 
    161 // Returns true if `ch` is the exponent delimiter for the given base.
    162 template <int base>
    163 bool IsExponentCharacter(char ch);
    164 
    165 // Returns the maximum number of significant digits we will read for a float
    166 // in the given base.
    167 template <int base>
    168 constexpr int MantissaDigitsMax();
    169 
    170 // Returns the largest consecutive run of digits we will accept when parsing a
    171 // number in the given base.
    172 template <int base>
    173 constexpr int DigitLimit();
    174 
    175 // Returns the amount the exponent must be adjusted by for each dropped digit.
    176 // (For decimal this is 1, since the digits are in base 10 and the exponent base
    177 // is also 10, but for hexadecimal this is 4, since the digits are base 16 but
    178 // the exponent base is 2.)
    179 template <int base>
    180 constexpr int DigitMagnitude();
    181 
    182 template <>
    183 bool IsDigit<10>(char ch) {
    184  return ch >= '0' && ch <= '9';
    185 }
    186 template <>
    187 bool IsDigit<16>(char ch) {
    188  return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0;
    189 }
    190 
    191 template <>
    192 unsigned ToDigit<10>(char ch) {
    193  return static_cast<unsigned>(ch - '0');
    194 }
    195 template <>
    196 unsigned ToDigit<16>(char ch) {
    197  return static_cast<unsigned>(kAsciiToInt[static_cast<unsigned char>(ch)]);
    198 }
    199 
    200 template <>
    201 bool IsExponentCharacter<10>(char ch) {
    202  return ch == 'e' || ch == 'E';
    203 }
    204 
    205 template <>
    206 bool IsExponentCharacter<16>(char ch) {
    207  return ch == 'p' || ch == 'P';
    208 }
    209 
    210 template <>
    211 constexpr int MantissaDigitsMax<10>() {
    212  return kDecimalMantissaDigitsMax;
    213 }
    214 template <>
    215 constexpr int MantissaDigitsMax<16>() {
    216  return kHexadecimalMantissaDigitsMax;
    217 }
    218 
    219 template <>
    220 constexpr int DigitLimit<10>() {
    221  return kDecimalDigitLimit;
    222 }
    223 template <>
    224 constexpr int DigitLimit<16>() {
    225  return kHexadecimalDigitLimit;
    226 }
    227 
    228 template <>
    229 constexpr int DigitMagnitude<10>() {
    230  return 1;
    231 }
    232 template <>
    233 constexpr int DigitMagnitude<16>() {
    234  return 4;
    235 }
    236 
    237 // Reads decimal digits from [begin, end) into *out.  Returns the number of
    238 // digits consumed.
    239 //
    240 // After max_digits has been read, keeps consuming characters, but no longer
    241 // adjusts *out.  If a nonzero digit is dropped this way, *dropped_nonzero_digit
    242 // is set; otherwise, it is left unmodified.
    243 //
    244 // If no digits are matched, returns 0 and leaves *out unchanged.
    245 //
    246 // ConsumeDigits does not protect against overflow on *out; max_digits must
    247 // be chosen with respect to type T to avoid the possibility of overflow.
    248 template <int base, typename T>
    249 int ConsumeDigits(const char* begin, const char* end, int max_digits, T* out,
    250                  bool* dropped_nonzero_digit) {
    251  if (base == 10) {
    252    assert(max_digits <= std::numeric_limits<T>::digits10);
    253  } else if (base == 16) {
    254    assert(max_digits * 4 <= std::numeric_limits<T>::digits);
    255  }
    256  const char* const original_begin = begin;
    257 
    258  // Skip leading zeros, but only if *out is zero.
    259  // They don't cause an overflow so we don't have to count them for
    260  // `max_digits`.
    261  while (!*out && end != begin && *begin == '0') ++begin;
    262 
    263  T accumulator = *out;
    264  const char* significant_digits_end =
    265      (end - begin > max_digits) ? begin + max_digits : end;
    266  while (begin < significant_digits_end && IsDigit<base>(*begin)) {
    267    // Do not guard against *out overflow; max_digits was chosen to avoid this.
    268    // Do assert against it, to detect problems in debug builds.
    269    auto digit = static_cast<T>(ToDigit<base>(*begin));
    270    assert(accumulator * base >= accumulator);
    271    accumulator *= base;
    272    assert(accumulator + digit >= accumulator);
    273    accumulator += digit;
    274    ++begin;
    275  }
    276  bool dropped_nonzero = false;
    277  while (begin < end && IsDigit<base>(*begin)) {
    278    dropped_nonzero = dropped_nonzero || (*begin != '0');
    279    ++begin;
    280  }
    281  if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
    282    *dropped_nonzero_digit = true;
    283  }
    284  *out = accumulator;
    285  return static_cast<int>(begin - original_begin);
    286 }
    287 
    288 // Returns true if `v` is one of the chars allowed inside parentheses following
    289 // a NaN.
    290 bool IsNanChar(char v) {
    291  return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') ||
    292         (v >= 'A' && v <= 'Z');
    293 }
    294 
    295 // Checks the range [begin, end) for a strtod()-formatted infinity or NaN.  If
    296 // one is found, sets `out` appropriately and returns true.
    297 bool ParseInfinityOrNan(const char* begin, const char* end,
    298                        strings_internal::ParsedFloat* out) {
    299  if (end - begin < 3) {
    300    return false;
    301  }
    302  switch (*begin) {
    303    case 'i':
    304    case 'I': {
    305      // An infinity string consists of the characters "inf" or "infinity",
    306      // case insensitive.
    307      if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) {
    308        return false;
    309      }
    310      out->type = strings_internal::FloatType::kInfinity;
    311      if (end - begin >= 8 &&
    312          strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) {
    313        out->end = begin + 8;
    314      } else {
    315        out->end = begin + 3;
    316      }
    317      return true;
    318    }
    319    case 'n':
    320    case 'N': {
    321      // A NaN consists of the characters "nan", case insensitive, optionally
    322      // followed by a parenthesized sequence of zero or more alphanumeric
    323      // characters and/or underscores.
    324      if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) {
    325        return false;
    326      }
    327      out->type = strings_internal::FloatType::kNan;
    328      out->end = begin + 3;
    329      // NaN is allowed to be followed by a parenthesized string, consisting of
    330      // only the characters [a-zA-Z0-9_].  Match that if it's present.
    331      begin += 3;
    332      if (begin < end && *begin == '(') {
    333        const char* nan_begin = begin + 1;
    334        while (nan_begin < end && IsNanChar(*nan_begin)) {
    335          ++nan_begin;
    336        }
    337        if (nan_begin < end && *nan_begin == ')') {
    338          // We found an extra NaN specifier range
    339          out->subrange_begin = begin + 1;
    340          out->subrange_end = nan_begin;
    341          out->end = nan_begin + 1;
    342        }
    343      }
    344      return true;
    345    }
    346    default:
    347      return false;
    348  }
    349 }
    350 }  // namespace
    351 
    352 namespace strings_internal {
    353 
    354 template <int base>
    355 strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end,
    356                                         chars_format format_flags) {
    357  strings_internal::ParsedFloat result;
    358 
    359  // Exit early if we're given an empty range.
    360  if (begin == end) return result;
    361 
    362  // Handle the infinity and NaN cases.
    363  if (ParseInfinityOrNan(begin, end, &result)) {
    364    return result;
    365  }
    366 
    367  const char* const mantissa_begin = begin;
    368  while (begin < end && *begin == '0') {
    369    ++begin;  // skip leading zeros
    370  }
    371  uint64_t mantissa = 0;
    372 
    373  int exponent_adjustment = 0;
    374  bool mantissa_is_inexact = false;
    375  int pre_decimal_digits = ConsumeDigits<base>(
    376      begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
    377  begin += pre_decimal_digits;
    378  int digits_left;
    379  if (pre_decimal_digits >= DigitLimit<base>()) {
    380    // refuse to parse pathological inputs
    381    return result;
    382  } else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
    383    // We dropped some non-fraction digits on the floor.  Adjust our exponent
    384    // to compensate.
    385    exponent_adjustment =
    386        static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
    387    digits_left = 0;
    388  } else {
    389    digits_left =
    390        static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
    391  }
    392  if (begin < end && *begin == '.') {
    393    ++begin;
    394    if (mantissa == 0) {
    395      // If we haven't seen any nonzero digits yet, keep skipping zeros.  We
    396      // have to adjust the exponent to reflect the changed place value.
    397      const char* begin_zeros = begin;
    398      while (begin < end && *begin == '0') {
    399        ++begin;
    400      }
    401      int zeros_skipped = static_cast<int>(begin - begin_zeros);
    402      if (zeros_skipped >= DigitLimit<base>()) {
    403        // refuse to parse pathological inputs
    404        return result;
    405      }
    406      exponent_adjustment -= static_cast<int>(zeros_skipped);
    407    }
    408    int post_decimal_digits = ConsumeDigits<base>(
    409        begin, end, digits_left, &mantissa, &mantissa_is_inexact);
    410    begin += post_decimal_digits;
    411 
    412    // Since `mantissa` is an integer, each significant digit we read after
    413    // the decimal point requires an adjustment to the exponent. "1.23e0" will
    414    // be stored as `mantissa` == 123 and `exponent` == -2 (that is,
    415    // "123e-2").
    416    if (post_decimal_digits >= DigitLimit<base>()) {
    417      // refuse to parse pathological inputs
    418      return result;
    419    } else if (post_decimal_digits > digits_left) {
    420      exponent_adjustment -= digits_left;
    421    } else {
    422      exponent_adjustment -= post_decimal_digits;
    423    }
    424  }
    425  // If we've found no mantissa whatsoever, this isn't a number.
    426  if (mantissa_begin == begin) {
    427    return result;
    428  }
    429  // A bare "." doesn't count as a mantissa either.
    430  if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {
    431    return result;
    432  }
    433 
    434  if (mantissa_is_inexact) {
    435    // We dropped significant digits on the floor.  Handle this appropriately.
    436    if (base == 10) {
    437      // If we truncated significant decimal digits, store the full range of the
    438      // mantissa for future big integer math for exact rounding.
    439      result.subrange_begin = mantissa_begin;
    440      result.subrange_end = begin;
    441    } else if (base == 16) {
    442      // If we truncated hex digits, reflect this fact by setting the low
    443      // ("sticky") bit.  This allows for correct rounding in all cases.
    444      mantissa |= 1;
    445    }
    446  }
    447  result.mantissa = mantissa;
    448 
    449  const char* const exponent_begin = begin;
    450  result.literal_exponent = 0;
    451  bool found_exponent = false;
    452  if (AllowExponent(format_flags) && begin < end &&
    453      IsExponentCharacter<base>(*begin)) {
    454    bool negative_exponent = false;
    455    ++begin;
    456    if (begin < end && *begin == '-') {
    457      negative_exponent = true;
    458      ++begin;
    459    } else if (begin < end && *begin == '+') {
    460      ++begin;
    461    }
    462    const char* const exponent_digits_begin = begin;
    463    // Exponent is always expressed in decimal, even for hexadecimal floats.
    464    begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,
    465                               &result.literal_exponent, nullptr);
    466    if (begin == exponent_digits_begin) {
    467      // there were no digits where we expected an exponent.  We failed to read
    468      // an exponent and should not consume the 'e' after all.  Rewind 'begin'.
    469      found_exponent = false;
    470      begin = exponent_begin;
    471    } else {
    472      found_exponent = true;
    473      if (negative_exponent) {
    474        result.literal_exponent = -result.literal_exponent;
    475      }
    476    }
    477  }
    478 
    479  if (!found_exponent && RequireExponent(format_flags)) {
    480    // Provided flags required an exponent, but none was found.  This results
    481    // in a failure to scan.
    482    return result;
    483  }
    484 
    485  // Success!
    486  result.type = strings_internal::FloatType::kNumber;
    487  if (result.mantissa > 0) {
    488    result.exponent = result.literal_exponent +
    489                      (DigitMagnitude<base>() * exponent_adjustment);
    490  } else {
    491    result.exponent = 0;
    492  }
    493  result.end = begin;
    494  return result;
    495 }
    496 
    497 template ParsedFloat ParseFloat<10>(const char* begin, const char* end,
    498                                    chars_format format_flags);
    499 template ParsedFloat ParseFloat<16>(const char* begin, const char* end,
    500                                    chars_format format_flags);
    501 
    502 }  // namespace strings_internal
    503 ABSL_NAMESPACE_END
    504 }  // namespace absl