numparse_impl.cpp (14122B)
1 // © 2018 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include "unicode/utypes.h" 5 6 #if !UCONFIG_NO_FORMATTING 7 8 // Allow implicit conversion from char16_t* to UnicodeString for this file: 9 // Helpful in toString methods and elsewhere. 10 #define UNISTR_FROM_STRING_EXPLICIT 11 12 #include <typeinfo> 13 #include <array> 14 #include "number_types.h" 15 #include "number_patternstring.h" 16 #include "numparse_types.h" 17 #include "numparse_impl.h" 18 #include "numparse_symbols.h" 19 #include "numparse_decimal.h" 20 #include "unicode/numberformatter.h" 21 #include "cstr.h" 22 #include "number_mapper.h" 23 #include "static_unicode_sets.h" 24 25 using namespace icu; 26 using namespace icu::number; 27 using namespace icu::number::impl; 28 using namespace icu::numparse; 29 using namespace icu::numparse::impl; 30 31 32 NumberParseMatcher::~NumberParseMatcher() = default; 33 34 35 NumberParserImpl* 36 NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& patternString, 37 parse_flags_t parseFlags, UErrorCode& status) { 38 39 LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags)); 40 DecimalFormatSymbols symbols(locale, status); 41 42 parser->fLocalMatchers.ignorables = {parseFlags}; 43 IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables; 44 45 DecimalFormatSymbols dfs(locale, status); 46 dfs.setSymbol(DecimalFormatSymbols::kCurrencySymbol, u"IU$"); 47 dfs.setSymbol(DecimalFormatSymbols::kIntlCurrencySymbol, u"ICU"); 48 CurrencySymbols currencySymbols({u"ICU", status}, locale, dfs, status); 49 50 ParsedPatternInfo patternInfo; 51 PatternParser::parseToPatternInfo(patternString, patternInfo, status); 52 53 // The following statements set up the affix matchers. 54 AffixTokenMatcherSetupData affixSetupData = { 55 currencySymbols, symbols, ignorables, locale, parseFlags}; 56 parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData}; 57 parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse}; 58 parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers( 59 patternInfo, *parser, ignorables, parseFlags, status); 60 61 Grouper grouper = Grouper::forStrategy(UNUM_GROUPING_AUTO); 62 grouper.setLocaleData(patternInfo, locale); 63 64 parser->addMatcher(parser->fLocalMatchers.ignorables); 65 parser->addMatcher(parser->fLocalMatchers.decimal = {symbols, grouper, parseFlags}); 66 parser->addMatcher(parser->fLocalMatchers.minusSign = {symbols, false}); 67 parser->addMatcher(parser->fLocalMatchers.plusSign = {symbols, false}); 68 parser->addMatcher(parser->fLocalMatchers.approximatelySign = {symbols, false}); 69 parser->addMatcher(parser->fLocalMatchers.percent = {symbols}); 70 parser->addMatcher(parser->fLocalMatchers.permille = {symbols}); 71 parser->addMatcher(parser->fLocalMatchers.nan = {symbols}); 72 parser->addMatcher(parser->fLocalMatchers.infinity = {symbols}); 73 parser->addMatcher(parser->fLocalMatchers.padding = {u"@"}); 74 parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper}); 75 parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, parseFlags, status}); 76 parser->addMatcher(parser->fLocalValidators.number = {}); 77 78 parser->freeze(); 79 return parser.orphan(); 80 } 81 82 NumberParserImpl* 83 NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatProperties& properties, 84 const DecimalFormatSymbols& symbols, bool parseCurrency, 85 UErrorCode& status) { 86 Locale locale = symbols.getLocale(); 87 AutoAffixPatternProvider affixProvider(properties, status); 88 if (U_FAILURE(status)) { return nullptr; } 89 CurrencyUnit currency = resolveCurrency(properties, locale, status); 90 CurrencySymbols currencySymbols(currency, locale, symbols, status); 91 bool isStrict = properties.parseMode.getOrDefault(PARSE_MODE_STRICT) == PARSE_MODE_STRICT; 92 Grouper grouper = Grouper::forProperties(properties); 93 int parseFlags = 0; 94 if (U_FAILURE(status)) { return nullptr; } 95 if (!properties.parseCaseSensitive) { 96 parseFlags |= PARSE_FLAG_IGNORE_CASE; 97 } 98 if (properties.parseIntegerOnly) { 99 parseFlags |= PARSE_FLAG_INTEGER_ONLY; 100 } 101 if (properties.signAlwaysShown) { 102 parseFlags |= PARSE_FLAG_PLUS_SIGN_ALLOWED; 103 } 104 if (isStrict) { 105 parseFlags |= PARSE_FLAG_STRICT_GROUPING_SIZE; 106 parseFlags |= PARSE_FLAG_STRICT_SEPARATORS; 107 parseFlags |= PARSE_FLAG_USE_FULL_AFFIXES; 108 parseFlags |= PARSE_FLAG_EXACT_AFFIX; 109 parseFlags |= PARSE_FLAG_STRICT_IGNORABLES; 110 } else { 111 parseFlags |= PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES; 112 } 113 if (grouper.getPrimary() <= 0) { 114 parseFlags |= PARSE_FLAG_GROUPING_DISABLED; 115 } 116 if (parseCurrency || affixProvider.get().hasCurrencySign()) { 117 parseFlags |= PARSE_FLAG_MONETARY_SEPARATORS; 118 } 119 if (!parseCurrency) { 120 parseFlags |= PARSE_FLAG_NO_FOREIGN_CURRENCY; 121 } 122 123 LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags)); 124 125 parser->fLocalMatchers.ignorables = {parseFlags}; 126 IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables; 127 128 ////////////////////// 129 /// AFFIX MATCHERS /// 130 ////////////////////// 131 132 // The following statements set up the affix matchers. 133 AffixTokenMatcherSetupData affixSetupData = { 134 currencySymbols, symbols, ignorables, locale, parseFlags}; 135 parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData}; 136 parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse}; 137 parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers( 138 affixProvider.get(), *parser, ignorables, parseFlags, status); 139 140 //////////////////////// 141 /// CURRENCY MATCHER /// 142 //////////////////////// 143 144 if (parseCurrency || affixProvider.get().hasCurrencySign()) { 145 parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, parseFlags, status}); 146 } 147 148 /////////////// 149 /// PERCENT /// 150 /////////////// 151 152 // ICU-TC meeting, April 11, 2018: accept percent/permille only if it is in the pattern, 153 // and to maintain regressive behavior, divide by 100 even if no percent sign is present. 154 if (!isStrict && affixProvider.get().containsSymbolType(AffixPatternType::TYPE_PERCENT, status)) { 155 parser->addMatcher(parser->fLocalMatchers.percent = {symbols}); 156 } 157 if (!isStrict && affixProvider.get().containsSymbolType(AffixPatternType::TYPE_PERMILLE, status)) { 158 parser->addMatcher(parser->fLocalMatchers.permille = {symbols}); 159 } 160 161 /////////////////////////////// 162 /// OTHER STANDARD MATCHERS /// 163 /////////////////////////////// 164 165 if (!isStrict) { 166 parser->addMatcher(parser->fLocalMatchers.plusSign = {symbols, false}); 167 parser->addMatcher(parser->fLocalMatchers.minusSign = {symbols, false}); 168 parser->addMatcher(parser->fLocalMatchers.approximatelySign = {symbols, false}); 169 } 170 parser->addMatcher(parser->fLocalMatchers.nan = {symbols}); 171 parser->addMatcher(parser->fLocalMatchers.infinity = {symbols}); 172 UnicodeString padString = properties.padString; 173 if (!padString.isBogus() && !ignorables.getSet()->contains(padString)) { 174 parser->addMatcher(parser->fLocalMatchers.padding = {padString}); 175 } 176 parser->addMatcher(parser->fLocalMatchers.ignorables); 177 parser->addMatcher(parser->fLocalMatchers.decimal = {symbols, grouper, parseFlags}); 178 // NOTE: parseNoExponent doesn't disable scientific parsing if we have a scientific formatter 179 if (!properties.parseNoExponent || properties.minimumExponentDigits > 0) { 180 parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper}); 181 } 182 183 ////////////////// 184 /// VALIDATORS /// 185 ////////////////// 186 187 parser->addMatcher(parser->fLocalValidators.number = {}); 188 if (isStrict) { 189 parser->addMatcher(parser->fLocalValidators.affix = {}); 190 } 191 if (parseCurrency) { 192 parser->addMatcher(parser->fLocalValidators.currency = {}); 193 } 194 if (properties.decimalPatternMatchRequired) { 195 bool patternHasDecimalSeparator = 196 properties.decimalSeparatorAlwaysShown || properties.maximumFractionDigits != 0; 197 parser->addMatcher(parser->fLocalValidators.decimalSeparator = {patternHasDecimalSeparator}); 198 } 199 // The multiplier takes care of scaling percentages. 200 Scale multiplier = scaleFromProperties(properties); 201 if (multiplier.isValid()) { 202 parser->addMatcher(parser->fLocalValidators.multiplier = {multiplier}); 203 } 204 205 parser->freeze(); 206 return parser.orphan(); 207 } 208 209 NumberParserImpl::NumberParserImpl(parse_flags_t parseFlags) 210 : fParseFlags(parseFlags) { 211 } 212 213 NumberParserImpl::~NumberParserImpl() { 214 fNumMatchers = 0; 215 } 216 217 void NumberParserImpl::addMatcher(NumberParseMatcher& matcher) { 218 if (fNumMatchers + 1 > fMatchers.getCapacity()) { 219 fMatchers.resize(fNumMatchers * 2, fNumMatchers); 220 } 221 fMatchers[fNumMatchers] = &matcher; 222 fNumMatchers++; 223 } 224 225 void NumberParserImpl::freeze() { 226 fFrozen = true; 227 } 228 229 parse_flags_t NumberParserImpl::getParseFlags() const { 230 return fParseFlags; 231 } 232 233 void NumberParserImpl::parse(const UnicodeString& input, bool greedy, ParsedNumber& result, 234 UErrorCode& status) const { 235 return parse(input, 0, greedy, result, status); 236 } 237 238 void NumberParserImpl::parse(const UnicodeString& input, int32_t start, bool greedy, ParsedNumber& result, 239 UErrorCode& status) const { 240 if (U_FAILURE(status)) { 241 return; 242 } 243 U_ASSERT(fFrozen); 244 // TODO: Check start >= 0 and start < input.length() 245 StringSegment segment(input, 0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE)); 246 segment.adjustOffset(start); 247 if (greedy) { 248 parseGreedy(segment, result, status); 249 } else if (0 != (fParseFlags & PARSE_FLAG_ALLOW_INFINITE_RECURSION)) { 250 // Start at 1 so that recursionLevels never gets to 0 251 parseLongestRecursive(segment, result, 1, status); 252 } else { 253 // Arbitrary recursion safety limit: 100 levels. 254 parseLongestRecursive(segment, result, -100, status); 255 } 256 for (int32_t i = 0; i < fNumMatchers; i++) { 257 fMatchers[i]->postProcess(result); 258 } 259 result.postProcess(); 260 } 261 262 void NumberParserImpl::parseGreedy(StringSegment& segment, ParsedNumber& result, 263 UErrorCode& status) const { 264 // Note: this method is not recursive in order to avoid stack overflow. 265 for (int i = 0; i <fNumMatchers;) { 266 // Base Case 267 if (segment.length() == 0) { 268 return; 269 } 270 const NumberParseMatcher* matcher = fMatchers[i]; 271 if (!matcher->smokeTest(segment)) { 272 // Matcher failed smoke test: try the next one 273 i++; 274 continue; 275 } 276 int32_t initialOffset = segment.getOffset(); 277 matcher->match(segment, result, status); 278 if (U_FAILURE(status)) { 279 return; 280 } 281 if (segment.getOffset() != initialOffset) { 282 // Greedy heuristic: accept the match and loop back 283 i = 0; 284 continue; 285 } else { 286 // Matcher did not match: try the next one 287 i++; 288 continue; 289 } 290 UPRV_UNREACHABLE_EXIT; 291 } 292 293 // NOTE: If we get here, the greedy parse completed without consuming the entire string. 294 } 295 296 void NumberParserImpl::parseLongestRecursive(StringSegment& segment, ParsedNumber& result, 297 int32_t recursionLevels, 298 UErrorCode& status) const { 299 // Base Case 300 if (segment.length() == 0) { 301 return; 302 } 303 304 // Safety against stack overflow 305 if (recursionLevels == 0) { 306 return; 307 } 308 309 // TODO: Give a nice way for the matcher to reset the ParsedNumber? 310 ParsedNumber initial(result); 311 ParsedNumber candidate; 312 313 int initialOffset = segment.getOffset(); 314 for (int32_t i = 0; i < fNumMatchers; i++) { 315 const NumberParseMatcher* matcher = fMatchers[i]; 316 if (!matcher->smokeTest(segment)) { 317 continue; 318 } 319 320 // In a non-greedy parse, we attempt all possible matches and pick the best. 321 for (int32_t charsToConsume = 0; charsToConsume < segment.length();) { 322 charsToConsume += U16_LENGTH(segment.codePointAt(charsToConsume)); 323 324 // Run the matcher on a segment of the current length. 325 candidate = initial; 326 segment.setLength(charsToConsume); 327 bool maybeMore = matcher->match(segment, candidate, status); 328 segment.resetLength(); 329 if (U_FAILURE(status)) { 330 return; 331 } 332 333 // If the entire segment was consumed, recurse. 334 if (segment.getOffset() - initialOffset == charsToConsume) { 335 parseLongestRecursive(segment, candidate, recursionLevels + 1, status); 336 if (U_FAILURE(status)) { 337 return; 338 } 339 if (candidate.isBetterThan(result)) { 340 result = candidate; 341 } 342 } 343 344 // Since the segment can be re-used, reset the offset. 345 // This does not have an effect if the matcher did not consume any chars. 346 segment.setOffset(initialOffset); 347 348 // Unless the matcher wants to see the next char, continue to the next matcher. 349 if (!maybeMore) { 350 break; 351 } 352 } 353 } 354 } 355 356 UnicodeString NumberParserImpl::toString() const { 357 UnicodeString result(u"<NumberParserImpl matchers:["); 358 for (int32_t i = 0; i < fNumMatchers; i++) { 359 result.append(u' '); 360 result.append(fMatchers[i]->toString()); 361 } 362 result.append(u" ]>", -1); 363 return result; 364 } 365 366 367 #endif /* #if !UCONFIG_NO_FORMATTING */