numparse_decimal.cpp (18110B)
1 // © 2018 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include "unicode/utypes.h" 5 6 #if !UCONFIG_NO_FORMATTING 7 8 // Allow implicit conversion from char16_t* to UnicodeString for this file: 9 // Helpful in toString methods and elsewhere. 10 #define UNISTR_FROM_STRING_EXPLICIT 11 12 #include "numparse_types.h" 13 #include "numparse_decimal.h" 14 #include "static_unicode_sets.h" 15 #include "numparse_utils.h" 16 #include "unicode/uchar.h" 17 #include "putilimp.h" 18 #include "number_decimalquantity.h" 19 #include "string_segment.h" 20 21 using namespace icu; 22 using namespace icu::numparse; 23 using namespace icu::numparse::impl; 24 25 26 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper, 27 parse_flags_t parseFlags) { 28 if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) { 29 groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol); 30 decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol); 31 } else { 32 groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol); 33 decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol); 34 } 35 bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS); 36 37 // Parsing is very lenient even in strict mode, almost any dot or comma is a 38 // grouping separator. Parsing strings like "1.234" in French was treating '.' 39 // like an ignorable grouping separator, and we want it to be excluded. 40 // We keep the public behavior when strictParse is false, but when it is true 41 // we restrict grouping separators to the smaller set of equivalents. 42 unisets::Key groupingKey = unisets::chooseFrom(groupingSeparator, 43 strictSeparators ? unisets::STRICT_COMMA : unisets::ALL_SEPARATORS, 44 strictSeparators ? unisets::STRICT_PERIOD : unisets::ALL_SEPARATORS); 45 if (groupingKey < 0) { 46 groupingKey = unisets::chooseFrom( 47 groupingSeparator, unisets::OTHER_GROUPING_SEPARATORS); 48 } 49 if (groupingKey >= 0) { 50 // Attempt to find separators in the static cache 51 groupingUniSet = unisets::get(groupingKey); 52 } else if (!groupingSeparator.isEmpty()) { 53 auto* set = new UnicodeSet(); 54 set->add(groupingSeparator.char32At(0)); 55 set->freeze(); 56 groupingUniSet = set; 57 fLocalGroupingUniSet.adoptInstead(set); 58 } else { 59 groupingUniSet = unisets::get(unisets::EMPTY); 60 } 61 62 unisets::Key decimalKey = unisets::chooseFrom( 63 decimalSeparator, 64 strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA, 65 strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD); 66 if (decimalKey >= 0) { 67 decimalUniSet = unisets::get(decimalKey); 68 } else if (!decimalSeparator.isEmpty()) { 69 auto* set = new UnicodeSet(); 70 set->add(decimalSeparator.char32At(0)); 71 set->freeze(); 72 decimalUniSet = set; 73 fLocalDecimalUniSet.adoptInstead(set); 74 } else { 75 decimalUniSet = unisets::get(unisets::EMPTY); 76 } 77 78 if (groupingKey >= 0 && decimalKey >= 0) { 79 // Everything is available in the static cache 80 separatorSet = groupingUniSet; 81 leadSet = unisets::get( 82 strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS 83 : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS); 84 } else { 85 auto* set = new UnicodeSet(); 86 set->addAll(*groupingUniSet); 87 set->addAll(*decimalUniSet); 88 set->freeze(); 89 separatorSet = set; 90 fLocalSeparatorSet.adoptInstead(set); 91 leadSet = nullptr; 92 } 93 94 UChar32 cpZero = symbols.getCodePointZero(); 95 if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) { 96 // Uncommon case: okay to allocate. 97 auto* digitStrings = new UnicodeString[10]; 98 fLocalDigitStrings.adoptInstead(digitStrings); 99 for (int32_t i = 0; i <= 9; i++) { 100 digitStrings[i] = symbols.getConstDigitSymbol(i); 101 } 102 } 103 104 requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE); 105 groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED); 106 integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY); 107 grouping1 = grouper.getPrimary(); 108 grouping2 = grouper.getSecondary(); 109 110 // Fraction grouping parsing is disabled for now but could be enabled later. 111 // See https://unicode-org.atlassian.net/browse/ICU-10794 112 // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED); 113 } 114 115 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { 116 return match(segment, result, 0, status); 117 } 118 119 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign, 120 UErrorCode&) const { 121 if (result.seenNumber() && exponentSign == 0) { 122 // A number has already been consumed. 123 return false; 124 } else if (exponentSign != 0) { 125 // scientific notation always comes after the number 126 U_ASSERT(!result.quantity.bogus); 127 } 128 129 // Initial offset before any character consumption. 130 int32_t initialOffset = segment.getOffset(); 131 132 // Return value: whether to ask for more characters. 133 bool maybeMore = false; 134 135 // All digits consumed so far. 136 number::impl::DecimalQuantity digitsConsumed; 137 digitsConsumed.bogus = true; 138 139 // The total number of digits after the decimal place, used for scaling the result. 140 int32_t digitsAfterDecimalPlace = 0; 141 142 // The actual grouping and decimal separators used in the string. 143 // If non-null, we have seen that token. 144 UnicodeString actualGroupingString; 145 UnicodeString actualDecimalString; 146 actualGroupingString.setToBogus(); 147 actualDecimalString.setToBogus(); 148 149 // Information for two groups: the previous group and the current group. 150 // 151 // Each group has three pieces of information: 152 // 153 // Offset: the string position of the beginning of the group, including a leading separator 154 // if there was a leading separator. This is needed in case we need to rewind the parse to 155 // that position. 156 // 157 // Separator type: 158 // 0 => beginning of string 159 // 1 => lead separator is a grouping separator 160 // 2 => lead separator is a decimal separator 161 // 162 // Count: the number of digits in the group. If -1, the group has been validated. 163 int32_t currGroupOffset = 0; 164 int32_t currGroupSepType = 0; 165 int32_t currGroupCount = 0; 166 int32_t prevGroupOffset = -1; 167 int32_t prevGroupSepType = -1; 168 int32_t prevGroupCount = -1; 169 170 while (segment.length() > 0) { 171 maybeMore = false; 172 173 // Attempt to match a digit. 174 int8_t digit = -1; 175 176 // Try by code point digit value. 177 UChar32 cp = segment.getCodePoint(); 178 if (u_isdigit(cp)) { 179 segment.adjustOffset(U16_LENGTH(cp)); 180 digit = static_cast<int8_t>(u_digit(cp, 10)); 181 } 182 183 // Try by digit string. 184 if (digit == -1 && !fLocalDigitStrings.isNull()) { 185 for (int32_t i = 0; i < 10; i++) { 186 const UnicodeString& str = fLocalDigitStrings[i]; 187 if (str.isEmpty()) { 188 continue; 189 } 190 int32_t overlap = segment.getCommonPrefixLength(str); 191 if (overlap == str.length()) { 192 segment.adjustOffset(overlap); 193 digit = static_cast<int8_t>(i); 194 break; 195 } 196 maybeMore = maybeMore || (overlap == segment.length()); 197 } 198 } 199 200 if (digit >= 0) { 201 // Digit was found. 202 if (digitsConsumed.bogus) { 203 digitsConsumed.bogus = false; 204 digitsConsumed.clear(); 205 } 206 digitsConsumed.appendDigit(digit, 0, true); 207 currGroupCount++; 208 if (!actualDecimalString.isBogus()) { 209 digitsAfterDecimalPlace++; 210 } 211 continue; 212 } 213 214 // Attempt to match a literal grouping or decimal separator. 215 bool isDecimal = false; 216 bool isGrouping = false; 217 218 // 1) Attempt the decimal separator string literal. 219 // if (we have not seen a decimal separator yet) { ... } 220 if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) { 221 int32_t overlap = segment.getCommonPrefixLength(decimalSeparator); 222 maybeMore = maybeMore || (overlap == segment.length()); 223 if (overlap == decimalSeparator.length()) { 224 isDecimal = true; 225 actualDecimalString = decimalSeparator; 226 } 227 } 228 229 // 2) Attempt to match the actual grouping string literal. 230 if (!actualGroupingString.isBogus()) { 231 int32_t overlap = segment.getCommonPrefixLength(actualGroupingString); 232 maybeMore = maybeMore || (overlap == segment.length()); 233 if (overlap == actualGroupingString.length()) { 234 isGrouping = true; 235 } 236 } 237 238 // 2.5) Attempt to match a new the grouping separator string literal. 239 // if (we have not seen a grouping or decimal separator yet) { ... } 240 if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() && 241 !groupingSeparator.isEmpty()) { 242 int32_t overlap = segment.getCommonPrefixLength(groupingSeparator); 243 maybeMore = maybeMore || (overlap == segment.length()); 244 if (overlap == groupingSeparator.length()) { 245 isGrouping = true; 246 actualGroupingString = groupingSeparator; 247 } 248 } 249 250 // 3) Attempt to match a decimal separator from the equivalence set. 251 // if (we have not seen a decimal separator yet) { ... } 252 // The !isGrouping is to confirm that we haven't yet matched the current character. 253 if (!isGrouping && actualDecimalString.isBogus()) { 254 if (decimalUniSet->contains(cp)) { 255 isDecimal = true; 256 actualDecimalString = UnicodeString(cp); 257 } 258 } 259 260 // 4) Attempt to match a grouping separator from the equivalence set. 261 // if (we have not seen a grouping or decimal separator yet) { ... } 262 if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) { 263 if (groupingUniSet->contains(cp)) { 264 isGrouping = true; 265 actualGroupingString = UnicodeString(cp); 266 } 267 } 268 269 // Leave if we failed to match this as a separator. 270 if (!isDecimal && !isGrouping) { 271 break; 272 } 273 274 // Check for conditions when we don't want to accept the separator. 275 if (isDecimal && integerOnly) { 276 break; 277 } else if (currGroupSepType == 2 && isGrouping) { 278 // Fraction grouping 279 break; 280 } 281 282 // Validate intermediate grouping sizes. 283 bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false); 284 bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true); 285 if (!prevValidSecondary || (isDecimal && !currValidPrimary)) { 286 // Invalid grouping sizes. 287 if (isGrouping && currGroupCount == 0) { 288 // Trailing grouping separators: these are taken care of below 289 U_ASSERT(currGroupSepType == 1); 290 } else if (requireGroupingMatch) { 291 // Strict mode: reject the parse 292 digitsConsumed.clear(); 293 digitsConsumed.bogus = true; 294 } 295 break; 296 } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) { 297 break; 298 } else { 299 // Grouping sizes OK so far. 300 prevGroupOffset = currGroupOffset; 301 prevGroupCount = currGroupCount; 302 if (isDecimal) { 303 // Do not validate this group any more. 304 prevGroupSepType = -1; 305 } else { 306 prevGroupSepType = currGroupSepType; 307 } 308 } 309 310 // OK to accept the separator. 311 // Special case: don't update currGroup if it is empty; this allows two grouping 312 // separators in a row in lenient mode. 313 if (currGroupCount != 0) { 314 currGroupOffset = segment.getOffset(); 315 } 316 currGroupSepType = isGrouping ? 1 : 2; 317 currGroupCount = 0; 318 if (isGrouping) { 319 segment.adjustOffset(actualGroupingString.length()); 320 } else { 321 segment.adjustOffset(actualDecimalString.length()); 322 } 323 } 324 325 // End of main loop. 326 // Back up if there was a trailing grouping separator. 327 // Shift prev -> curr so we can check it as a final group. 328 if (currGroupSepType != 2 && currGroupCount == 0) { 329 maybeMore = true; 330 segment.setOffset(currGroupOffset); 331 currGroupOffset = prevGroupOffset; 332 currGroupSepType = prevGroupSepType; 333 currGroupCount = prevGroupCount; 334 prevGroupOffset = -1; 335 prevGroupSepType = 0; 336 prevGroupCount = 1; 337 } 338 339 // Validate final grouping sizes. 340 bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false); 341 bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true); 342 if (!requireGroupingMatch) { 343 // The cases we need to handle here are lone digits. 344 // Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1) 345 // See more examples in numberformattestspecification.txt 346 int32_t digitsToRemove = 0; 347 if (!prevValidSecondary) { 348 segment.setOffset(prevGroupOffset); 349 digitsToRemove += prevGroupCount; 350 digitsToRemove += currGroupCount; 351 } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) { 352 maybeMore = true; 353 segment.setOffset(currGroupOffset); 354 digitsToRemove += currGroupCount; 355 } 356 if (digitsToRemove != 0) { 357 digitsConsumed.adjustMagnitude(-digitsToRemove); 358 digitsConsumed.truncate(); 359 } 360 prevValidSecondary = true; 361 currValidPrimary = true; 362 } 363 if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) { 364 // Grouping failure. 365 digitsConsumed.bogus = true; 366 } 367 368 // Strings that start with a separator but have no digits, 369 // or strings that failed a grouping size check. 370 if (digitsConsumed.bogus) { 371 maybeMore = maybeMore || (segment.length() == 0); 372 segment.setOffset(initialOffset); 373 return maybeMore; 374 } 375 376 // We passed all inspections. Start post-processing. 377 378 // Adjust for fraction part. 379 digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace); 380 381 // Set the digits, either normal or exponent. 382 if (exponentSign != 0 && segment.getOffset() != initialOffset) { 383 bool overflow = false; 384 if (digitsConsumed.fitsInLong()) { 385 int64_t exponentLong = digitsConsumed.toLong(false); 386 U_ASSERT(exponentLong >= 0); 387 if (exponentLong <= INT32_MAX) { 388 auto exponentInt = static_cast<int32_t>(exponentLong); 389 if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) { 390 overflow = true; 391 } 392 } else { 393 overflow = true; 394 } 395 } else { 396 overflow = true; 397 } 398 if (overflow) { 399 if (exponentSign == -1) { 400 // Set to zero 401 result.quantity.clear(); 402 } else { 403 // Set to infinity 404 result.quantity.bogus = true; 405 result.flags |= FLAG_INFINITY; 406 } 407 } 408 } else { 409 result.quantity = digitsConsumed; 410 } 411 412 // Set other information into the result and return. 413 if (!actualDecimalString.isBogus()) { 414 result.flags |= FLAG_HAS_DECIMAL_SEPARATOR; 415 } 416 result.setCharsConsumed(segment); 417 return segment.length() == 0 || maybeMore; 418 } 419 420 bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const { 421 if (requireGroupingMatch) { 422 if (sepType == -1) { 423 // No such group (prevGroup before first shift). 424 return true; 425 } else if (sepType == 0) { 426 // First group. 427 if (isPrimary) { 428 // No grouping separators is OK. 429 return true; 430 } else { 431 return count != 0 && count <= grouping2; 432 } 433 } else if (sepType == 1) { 434 // Middle group. 435 if (isPrimary) { 436 return count == grouping1; 437 } else { 438 return count == grouping2; 439 } 440 } else { 441 U_ASSERT(sepType == 2); 442 // After the decimal separator. 443 return true; 444 } 445 } else { 446 if (sepType == 1) { 447 // #11230: don't accept middle groups with only 1 digit. 448 return count != 1; 449 } else { 450 return true; 451 } 452 } 453 } 454 455 bool DecimalMatcher::smokeTest(const StringSegment& segment) const { 456 // The common case uses a static leadSet for efficiency. 457 if (fLocalDigitStrings.isNull() && leadSet != nullptr) { 458 return segment.startsWith(*leadSet); 459 } 460 if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) { 461 return true; 462 } 463 if (fLocalDigitStrings.isNull()) { 464 return false; 465 } 466 for (int32_t i = 0; i < 10; i++) { 467 if (segment.startsWith(fLocalDigitStrings[i])) { 468 return true; 469 } 470 } 471 return false; 472 } 473 474 UnicodeString DecimalMatcher::toString() const { 475 return u"<Decimal>"; 476 } 477 478 479 #endif /* #if !UCONFIG_NO_FORMATTING */