charconv_parse.cc (18724B)
1 // Copyright 2018 The Abseil Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "absl/strings/internal/charconv_parse.h" 16 #include "absl/strings/charconv.h" 17 18 #include <cassert> 19 #include <cstdint> 20 #include <limits> 21 22 #include "absl/strings/internal/memutil.h" 23 24 namespace absl { 25 ABSL_NAMESPACE_BEGIN 26 namespace { 27 28 // ParseFloat<10> will read the first 19 significant digits of the mantissa. 29 // This number was chosen for multiple reasons. 30 // 31 // (a) First, for whatever integer type we choose to represent the mantissa, we 32 // want to choose the largest possible number of decimal digits for that integer 33 // type. We are using uint64_t, which can express any 19-digit unsigned 34 // integer. 35 // 36 // (b) Second, we need to parse enough digits that the binary value of any 37 // mantissa we capture has more bits of resolution than the mantissa 38 // representation in the target float. Our algorithm requires at least 3 bits 39 // of headway, but 19 decimal digits give a little more than that. 40 // 41 // The following static assertions verify the above comments: 42 constexpr int kDecimalMantissaDigitsMax = 19; 43 44 static_assert(std::numeric_limits<uint64_t>::digits10 == 45 kDecimalMantissaDigitsMax, 46 "(a) above"); 47 48 // IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa. 49 static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed"); 50 static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact"); 51 static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact"); 52 53 // The lowest valued 19-digit decimal mantissa we can read still contains 54 // sufficient information to reconstruct a binary mantissa. 55 static_assert(1000000000000000000u > (uint64_t{1} << (53 + 3)), "(b) above"); 56 57 // ParseFloat<16> will read the first 15 significant digits of the mantissa. 58 // 59 // Because a base-16-to-base-2 conversion can be done exactly, we do not need 60 // to maximize the number of scanned hex digits to improve our conversion. What 61 // is required is to scan two more bits than the mantissa can represent, so that 62 // we always round correctly. 63 // 64 // (One extra bit does not suffice to perform correct rounding, since a number 65 // exactly halfway between two representable floats has unique rounding rules, 66 // so we need to differentiate between a "halfway between" number and a "closer 67 // to the larger value" number.) 68 constexpr int kHexadecimalMantissaDigitsMax = 15; 69 70 // The minimum number of significant bits that will be read from 71 // kHexadecimalMantissaDigitsMax hex digits. We must subtract by three, since 72 // the most significant digit can be a "1", which only contributes a single 73 // significant bit. 74 constexpr int kGuaranteedHexadecimalMantissaBitPrecision = 75 4 * kHexadecimalMantissaDigitsMax - 3; 76 77 static_assert(kGuaranteedHexadecimalMantissaBitPrecision > 78 std::numeric_limits<double>::digits + 2, 79 "kHexadecimalMantissaDigitsMax too small"); 80 81 // We also impose a limit on the number of significant digits we will read from 82 // an exponent, to avoid having to deal with integer overflow. We use 9 for 83 // this purpose. 84 // 85 // If we read a 9 digit exponent, the end result of the conversion will 86 // necessarily be infinity or zero, depending on the sign of the exponent. 87 // Therefore we can just drop extra digits on the floor without any extra 88 // logic. 89 constexpr int kDecimalExponentDigitsMax = 9; 90 static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax, 91 "int type too small"); 92 93 // To avoid incredibly large inputs causing integer overflow for our exponent, 94 // we impose an arbitrary but very large limit on the number of significant 95 // digits we will accept. The implementation refuses to match a string with 96 // more consecutive significant mantissa digits than this. 97 constexpr int kDecimalDigitLimit = 50000000; 98 99 // Corresponding limit for hexadecimal digit inputs. This is one fourth the 100 // amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires 101 // a binary exponent adjustment of 4. 102 constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4; 103 104 // The largest exponent we can read is 999999999 (per 105 // kDecimalExponentDigitsMax), and the largest exponent adjustment we can get 106 // from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these 107 // comfortably fits in an integer. 108 // 109 // We count kDecimalDigitLimit twice because there are independent limits for 110 // numbers before and after the decimal point. (In the case where there are no 111 // significant digits before the decimal point, there are independent limits for 112 // post-decimal-point leading zeroes and for significant digits.) 113 static_assert(999999999 + 2 * kDecimalDigitLimit < 114 std::numeric_limits<int>::max(), 115 "int type too small"); 116 static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) < 117 std::numeric_limits<int>::max(), 118 "int type too small"); 119 120 // Returns true if the provided bitfield allows parsing an exponent value 121 // (e.g., "1.5e100"). 122 bool AllowExponent(chars_format flags) { 123 bool fixed = (flags & chars_format::fixed) == chars_format::fixed; 124 bool scientific = 125 (flags & chars_format::scientific) == chars_format::scientific; 126 return scientific || !fixed; 127 } 128 129 // Returns true if the provided bitfield requires an exponent value be present. 130 bool RequireExponent(chars_format flags) { 131 bool fixed = (flags & chars_format::fixed) == chars_format::fixed; 132 bool scientific = 133 (flags & chars_format::scientific) == chars_format::scientific; 134 return scientific && !fixed; 135 } 136 137 const int8_t kAsciiToInt[256] = { 138 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 139 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 140 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 141 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, 142 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 143 -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 144 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 145 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 146 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 147 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 148 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 149 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 150 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 151 -1, -1, -1, -1, -1, -1, -1, -1, -1}; 152 153 // Returns true if `ch` is a digit in the given base 154 template <int base> 155 bool IsDigit(char ch); 156 157 // Converts a valid `ch` to its digit value in the given base. 158 template <int base> 159 unsigned ToDigit(char ch); 160 161 // Returns true if `ch` is the exponent delimiter for the given base. 162 template <int base> 163 bool IsExponentCharacter(char ch); 164 165 // Returns the maximum number of significant digits we will read for a float 166 // in the given base. 167 template <int base> 168 constexpr int MantissaDigitsMax(); 169 170 // Returns the largest consecutive run of digits we will accept when parsing a 171 // number in the given base. 172 template <int base> 173 constexpr int DigitLimit(); 174 175 // Returns the amount the exponent must be adjusted by for each dropped digit. 176 // (For decimal this is 1, since the digits are in base 10 and the exponent base 177 // is also 10, but for hexadecimal this is 4, since the digits are base 16 but 178 // the exponent base is 2.) 179 template <int base> 180 constexpr int DigitMagnitude(); 181 182 template <> 183 bool IsDigit<10>(char ch) { 184 return ch >= '0' && ch <= '9'; 185 } 186 template <> 187 bool IsDigit<16>(char ch) { 188 return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0; 189 } 190 191 template <> 192 unsigned ToDigit<10>(char ch) { 193 return static_cast<unsigned>(ch - '0'); 194 } 195 template <> 196 unsigned ToDigit<16>(char ch) { 197 return static_cast<unsigned>(kAsciiToInt[static_cast<unsigned char>(ch)]); 198 } 199 200 template <> 201 bool IsExponentCharacter<10>(char ch) { 202 return ch == 'e' || ch == 'E'; 203 } 204 205 template <> 206 bool IsExponentCharacter<16>(char ch) { 207 return ch == 'p' || ch == 'P'; 208 } 209 210 template <> 211 constexpr int MantissaDigitsMax<10>() { 212 return kDecimalMantissaDigitsMax; 213 } 214 template <> 215 constexpr int MantissaDigitsMax<16>() { 216 return kHexadecimalMantissaDigitsMax; 217 } 218 219 template <> 220 constexpr int DigitLimit<10>() { 221 return kDecimalDigitLimit; 222 } 223 template <> 224 constexpr int DigitLimit<16>() { 225 return kHexadecimalDigitLimit; 226 } 227 228 template <> 229 constexpr int DigitMagnitude<10>() { 230 return 1; 231 } 232 template <> 233 constexpr int DigitMagnitude<16>() { 234 return 4; 235 } 236 237 // Reads decimal digits from [begin, end) into *out. Returns the number of 238 // digits consumed. 239 // 240 // After max_digits has been read, keeps consuming characters, but no longer 241 // adjusts *out. If a nonzero digit is dropped this way, *dropped_nonzero_digit 242 // is set; otherwise, it is left unmodified. 243 // 244 // If no digits are matched, returns 0 and leaves *out unchanged. 245 // 246 // ConsumeDigits does not protect against overflow on *out; max_digits must 247 // be chosen with respect to type T to avoid the possibility of overflow. 248 template <int base, typename T> 249 int ConsumeDigits(const char* begin, const char* end, int max_digits, T* out, 250 bool* dropped_nonzero_digit) { 251 if (base == 10) { 252 assert(max_digits <= std::numeric_limits<T>::digits10); 253 } else if (base == 16) { 254 assert(max_digits * 4 <= std::numeric_limits<T>::digits); 255 } 256 const char* const original_begin = begin; 257 258 // Skip leading zeros, but only if *out is zero. 259 // They don't cause an overflow so we don't have to count them for 260 // `max_digits`. 261 while (!*out && end != begin && *begin == '0') ++begin; 262 263 T accumulator = *out; 264 const char* significant_digits_end = 265 (end - begin > max_digits) ? begin + max_digits : end; 266 while (begin < significant_digits_end && IsDigit<base>(*begin)) { 267 // Do not guard against *out overflow; max_digits was chosen to avoid this. 268 // Do assert against it, to detect problems in debug builds. 269 auto digit = static_cast<T>(ToDigit<base>(*begin)); 270 assert(accumulator * base >= accumulator); 271 accumulator *= base; 272 assert(accumulator + digit >= accumulator); 273 accumulator += digit; 274 ++begin; 275 } 276 bool dropped_nonzero = false; 277 while (begin < end && IsDigit<base>(*begin)) { 278 dropped_nonzero = dropped_nonzero || (*begin != '0'); 279 ++begin; 280 } 281 if (dropped_nonzero && dropped_nonzero_digit != nullptr) { 282 *dropped_nonzero_digit = true; 283 } 284 *out = accumulator; 285 return static_cast<int>(begin - original_begin); 286 } 287 288 // Returns true if `v` is one of the chars allowed inside parentheses following 289 // a NaN. 290 bool IsNanChar(char v) { 291 return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') || 292 (v >= 'A' && v <= 'Z'); 293 } 294 295 // Checks the range [begin, end) for a strtod()-formatted infinity or NaN. If 296 // one is found, sets `out` appropriately and returns true. 297 bool ParseInfinityOrNan(const char* begin, const char* end, 298 strings_internal::ParsedFloat* out) { 299 if (end - begin < 3) { 300 return false; 301 } 302 switch (*begin) { 303 case 'i': 304 case 'I': { 305 // An infinity string consists of the characters "inf" or "infinity", 306 // case insensitive. 307 if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) { 308 return false; 309 } 310 out->type = strings_internal::FloatType::kInfinity; 311 if (end - begin >= 8 && 312 strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) { 313 out->end = begin + 8; 314 } else { 315 out->end = begin + 3; 316 } 317 return true; 318 } 319 case 'n': 320 case 'N': { 321 // A NaN consists of the characters "nan", case insensitive, optionally 322 // followed by a parenthesized sequence of zero or more alphanumeric 323 // characters and/or underscores. 324 if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) { 325 return false; 326 } 327 out->type = strings_internal::FloatType::kNan; 328 out->end = begin + 3; 329 // NaN is allowed to be followed by a parenthesized string, consisting of 330 // only the characters [a-zA-Z0-9_]. Match that if it's present. 331 begin += 3; 332 if (begin < end && *begin == '(') { 333 const char* nan_begin = begin + 1; 334 while (nan_begin < end && IsNanChar(*nan_begin)) { 335 ++nan_begin; 336 } 337 if (nan_begin < end && *nan_begin == ')') { 338 // We found an extra NaN specifier range 339 out->subrange_begin = begin + 1; 340 out->subrange_end = nan_begin; 341 out->end = nan_begin + 1; 342 } 343 } 344 return true; 345 } 346 default: 347 return false; 348 } 349 } 350 } // namespace 351 352 namespace strings_internal { 353 354 template <int base> 355 strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end, 356 chars_format format_flags) { 357 strings_internal::ParsedFloat result; 358 359 // Exit early if we're given an empty range. 360 if (begin == end) return result; 361 362 // Handle the infinity and NaN cases. 363 if (ParseInfinityOrNan(begin, end, &result)) { 364 return result; 365 } 366 367 const char* const mantissa_begin = begin; 368 while (begin < end && *begin == '0') { 369 ++begin; // skip leading zeros 370 } 371 uint64_t mantissa = 0; 372 373 int exponent_adjustment = 0; 374 bool mantissa_is_inexact = false; 375 int pre_decimal_digits = ConsumeDigits<base>( 376 begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact); 377 begin += pre_decimal_digits; 378 int digits_left; 379 if (pre_decimal_digits >= DigitLimit<base>()) { 380 // refuse to parse pathological inputs 381 return result; 382 } else if (pre_decimal_digits > MantissaDigitsMax<base>()) { 383 // We dropped some non-fraction digits on the floor. Adjust our exponent 384 // to compensate. 385 exponent_adjustment = 386 static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>()); 387 digits_left = 0; 388 } else { 389 digits_left = 390 static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits); 391 } 392 if (begin < end && *begin == '.') { 393 ++begin; 394 if (mantissa == 0) { 395 // If we haven't seen any nonzero digits yet, keep skipping zeros. We 396 // have to adjust the exponent to reflect the changed place value. 397 const char* begin_zeros = begin; 398 while (begin < end && *begin == '0') { 399 ++begin; 400 } 401 int zeros_skipped = static_cast<int>(begin - begin_zeros); 402 if (zeros_skipped >= DigitLimit<base>()) { 403 // refuse to parse pathological inputs 404 return result; 405 } 406 exponent_adjustment -= static_cast<int>(zeros_skipped); 407 } 408 int post_decimal_digits = ConsumeDigits<base>( 409 begin, end, digits_left, &mantissa, &mantissa_is_inexact); 410 begin += post_decimal_digits; 411 412 // Since `mantissa` is an integer, each significant digit we read after 413 // the decimal point requires an adjustment to the exponent. "1.23e0" will 414 // be stored as `mantissa` == 123 and `exponent` == -2 (that is, 415 // "123e-2"). 416 if (post_decimal_digits >= DigitLimit<base>()) { 417 // refuse to parse pathological inputs 418 return result; 419 } else if (post_decimal_digits > digits_left) { 420 exponent_adjustment -= digits_left; 421 } else { 422 exponent_adjustment -= post_decimal_digits; 423 } 424 } 425 // If we've found no mantissa whatsoever, this isn't a number. 426 if (mantissa_begin == begin) { 427 return result; 428 } 429 // A bare "." doesn't count as a mantissa either. 430 if (begin - mantissa_begin == 1 && *mantissa_begin == '.') { 431 return result; 432 } 433 434 if (mantissa_is_inexact) { 435 // We dropped significant digits on the floor. Handle this appropriately. 436 if (base == 10) { 437 // If we truncated significant decimal digits, store the full range of the 438 // mantissa for future big integer math for exact rounding. 439 result.subrange_begin = mantissa_begin; 440 result.subrange_end = begin; 441 } else if (base == 16) { 442 // If we truncated hex digits, reflect this fact by setting the low 443 // ("sticky") bit. This allows for correct rounding in all cases. 444 mantissa |= 1; 445 } 446 } 447 result.mantissa = mantissa; 448 449 const char* const exponent_begin = begin; 450 result.literal_exponent = 0; 451 bool found_exponent = false; 452 if (AllowExponent(format_flags) && begin < end && 453 IsExponentCharacter<base>(*begin)) { 454 bool negative_exponent = false; 455 ++begin; 456 if (begin < end && *begin == '-') { 457 negative_exponent = true; 458 ++begin; 459 } else if (begin < end && *begin == '+') { 460 ++begin; 461 } 462 const char* const exponent_digits_begin = begin; 463 // Exponent is always expressed in decimal, even for hexadecimal floats. 464 begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax, 465 &result.literal_exponent, nullptr); 466 if (begin == exponent_digits_begin) { 467 // there were no digits where we expected an exponent. We failed to read 468 // an exponent and should not consume the 'e' after all. Rewind 'begin'. 469 found_exponent = false; 470 begin = exponent_begin; 471 } else { 472 found_exponent = true; 473 if (negative_exponent) { 474 result.literal_exponent = -result.literal_exponent; 475 } 476 } 477 } 478 479 if (!found_exponent && RequireExponent(format_flags)) { 480 // Provided flags required an exponent, but none was found. This results 481 // in a failure to scan. 482 return result; 483 } 484 485 // Success! 486 result.type = strings_internal::FloatType::kNumber; 487 if (result.mantissa > 0) { 488 result.exponent = result.literal_exponent + 489 (DigitMagnitude<base>() * exponent_adjustment); 490 } else { 491 result.exponent = 0; 492 } 493 result.end = begin; 494 return result; 495 } 496 497 template ParsedFloat ParseFloat<10>(const char* begin, const char* end, 498 chars_format format_flags); 499 template ParsedFloat ParseFloat<16>(const char* begin, const char* end, 500 chars_format format_flags); 501 502 } // namespace strings_internal 503 ABSL_NAMESPACE_END 504 } // namespace absl