numparse_affixes.cpp (18256B)
1 // © 2018 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include "unicode/utypes.h" 5 6 #if !UCONFIG_NO_FORMATTING 7 8 // Allow implicit conversion from char16_t* to UnicodeString for this file: 9 // Helpful in toString methods and elsewhere. 10 #define UNISTR_FROM_STRING_EXPLICIT 11 12 #include "numparse_types.h" 13 #include "numparse_affixes.h" 14 #include "numparse_utils.h" 15 #include "number_utils.h" 16 #include "string_segment.h" 17 18 using namespace icu; 19 using namespace icu::numparse; 20 using namespace icu::numparse::impl; 21 using namespace icu::number; 22 using namespace icu::number::impl; 23 24 25 namespace { 26 27 /** 28 * Helper method to return whether the given AffixPatternMatcher equals the given pattern string. 29 * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal 30 * the given pattern string. 31 */ 32 bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) { 33 return (affix == nullptr && patternString.isBogus()) || 34 (affix != nullptr && affix->getPattern() == patternString); 35 } 36 37 /** 38 * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null. 39 */ 40 int32_t length(const AffixPatternMatcher* matcher) { 41 return matcher == nullptr ? 0 : matcher->getPattern().length(); 42 } 43 44 /** 45 * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both 46 * valid, whether they are equal according to operator==. Similar to Java Objects.equals() 47 */ 48 bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) { 49 if (lhs == nullptr && rhs == nullptr) { 50 return true; 51 } 52 if (lhs == nullptr || rhs == nullptr) { 53 return false; 54 } 55 return *lhs == *rhs; 56 } 57 58 } 59 60 61 AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern, 62 AffixTokenMatcherWarehouse& warehouse, 63 IgnorablesMatcher* ignorables) 64 : fMatchersLen(0), 65 fLastTypeOrCp(0), 66 fPattern(pattern), 67 fWarehouse(warehouse), 68 fIgnorables(ignorables) {} 69 70 void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) { 71 // This is called by AffixUtils.iterateWithConsumer() for each token. 72 73 // Add an ignorables matcher between tokens except between two literals, and don't put two 74 // ignorables matchers in a row. 75 if (fIgnorables != nullptr && fMatchersLen > 0 && 76 (fLastTypeOrCp < 0 || !fIgnorables->getSet()->contains(fLastTypeOrCp))) { 77 addMatcher(*fIgnorables); 78 } 79 80 if (type != TYPE_CODEPOINT) { 81 // Case 1: the token is a symbol. 82 switch (type) { 83 case TYPE_MINUS_SIGN: 84 addMatcher(fWarehouse.minusSign()); 85 break; 86 case TYPE_PLUS_SIGN: 87 addMatcher(fWarehouse.plusSign()); 88 break; 89 case TYPE_APPROXIMATELY_SIGN: 90 addMatcher(fWarehouse.approximatelySign()); 91 break; 92 case TYPE_PERCENT: 93 addMatcher(fWarehouse.percent()); 94 break; 95 case TYPE_PERMILLE: 96 addMatcher(fWarehouse.permille()); 97 break; 98 case TYPE_CURRENCY_SINGLE: 99 case TYPE_CURRENCY_DOUBLE: 100 case TYPE_CURRENCY_TRIPLE: 101 case TYPE_CURRENCY_QUAD: 102 case TYPE_CURRENCY_QUINT: 103 case TYPE_CURRENCY_OVERFLOW: 104 // All currency symbols use the same matcher 105 addMatcher(fWarehouse.currency(status)); 106 break; 107 default: 108 UPRV_UNREACHABLE_EXIT; 109 } 110 111 } else if (fIgnorables != nullptr && fIgnorables->getSet()->contains(cp)) { 112 // Case 2: the token is an ignorable literal. 113 // No action necessary: the ignorables matcher has already been added. 114 115 } else { 116 // Case 3: the token is a non-ignorable literal. 117 if (auto* ptr = fWarehouse.nextCodePointMatcher(cp, status)) { 118 addMatcher(*ptr); 119 } else { 120 // OOM; unwind the stack 121 return; 122 } 123 } 124 fLastTypeOrCp = type != TYPE_CODEPOINT ? type : cp; 125 } 126 127 void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) { 128 if (fMatchersLen >= fMatchers.getCapacity()) { 129 fMatchers.resize(fMatchersLen * 2, fMatchersLen); 130 } 131 fMatchers[fMatchersLen++] = &matcher; 132 } 133 134 AffixPatternMatcher AffixPatternMatcherBuilder::build(UErrorCode& status) { 135 return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern, status); 136 } 137 138 AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData) 139 : fSetupData(setupData) {} 140 141 NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() { 142 return fMinusSign = {fSetupData->dfs, true}; 143 } 144 145 NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() { 146 return fPlusSign = {fSetupData->dfs, true}; 147 } 148 149 NumberParseMatcher& AffixTokenMatcherWarehouse::approximatelySign() { 150 return fApproximatelySign = {fSetupData->dfs, true}; 151 } 152 153 NumberParseMatcher& AffixTokenMatcherWarehouse::percent() { 154 return fPercent = {fSetupData->dfs}; 155 } 156 157 NumberParseMatcher& AffixTokenMatcherWarehouse::permille() { 158 return fPermille = {fSetupData->dfs}; 159 } 160 161 NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) { 162 return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status}; 163 } 164 165 IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() { 166 return fSetupData->ignorables; 167 } 168 169 NumberParseMatcher* AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp, UErrorCode& status) { 170 if (U_FAILURE(status)) { 171 return nullptr; 172 } 173 auto* result = fCodePoints.create(cp); 174 if (result == nullptr) { 175 status = U_MEMORY_ALLOCATION_ERROR; 176 } 177 return result; 178 } 179 180 bool AffixTokenMatcherWarehouse::hasEmptyCurrencySymbol() const { 181 return fSetupData->currencySymbols.hasEmptyCurrencySymbol(); 182 } 183 184 185 CodePointMatcher::CodePointMatcher(UChar32 cp) 186 : fCp(cp) {} 187 188 bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const { 189 if (segment.startsWith(fCp)) { 190 segment.adjustOffsetByCodePoint(); 191 result.setCharsConsumed(segment); 192 } 193 return false; 194 } 195 196 bool CodePointMatcher::smokeTest(const StringSegment& segment) const { 197 return segment.startsWith(fCp); 198 } 199 200 UnicodeString CodePointMatcher::toString() const { 201 return u"<CodePoint>"; 202 } 203 204 205 AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern, 206 AffixTokenMatcherWarehouse& tokenWarehouse, 207 parse_flags_t parseFlags, bool* success, 208 UErrorCode& status) { 209 if (affixPattern.isEmpty()) { 210 *success = false; 211 return {}; 212 } 213 *success = true; 214 215 IgnorablesMatcher* ignorables; 216 if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) { 217 ignorables = nullptr; 218 } else { 219 ignorables = &tokenWarehouse.ignorables(); 220 } 221 222 AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables); 223 AffixUtils::iterateWithConsumer(affixPattern, builder, status); 224 return builder.build(status); 225 } 226 227 AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen, 228 const UnicodeString& pattern, UErrorCode& status) 229 : ArraySeriesMatcher(matchers, matchersLen), fPattern(pattern, status) { 230 } 231 232 UnicodeString AffixPatternMatcher::getPattern() const { 233 return fPattern.toAliasedUnicodeString(); 234 } 235 236 bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const { 237 return fPattern == other.fPattern; 238 } 239 240 241 AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse) 242 : fTokenWarehouse(tokenWarehouse) { 243 } 244 245 bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo, 246 const IgnorablesMatcher& ignorables, parse_flags_t parseFlags, 247 UErrorCode& status) { 248 UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX); 249 UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX); 250 UnicodeString negPrefixString; 251 UnicodeString negSuffixString; 252 if (patternInfo.hasNegativeSubpattern()) { 253 negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX); 254 negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX); 255 } 256 257 if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) && 258 AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) && 259 AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) && 260 AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) && 261 AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status) 262 // HACK: Plus and minus sign are a special case: we accept them trailing only if they are 263 // trailing in the pattern string. 264 && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) && 265 !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) && 266 !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) && 267 !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) { 268 // The affixes contain only symbols and ignorables. 269 // No need to generate affix matchers. 270 return false; 271 } 272 return true; 273 } 274 275 void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo, 276 MutableMatcherCollection& output, 277 const IgnorablesMatcher& ignorables, 278 parse_flags_t parseFlags, UErrorCode& status) { 279 if (!isInteresting(patternInfo, ignorables, parseFlags, status)) { 280 return; 281 } 282 283 // The affixes have interesting characters, or we are in strict mode. 284 // Use initial capacity of 6, the highest possible number of AffixMatchers. 285 UnicodeString sb; 286 bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES); 287 288 int32_t numAffixMatchers = 0; 289 int32_t numAffixPatternMatchers = 0; 290 291 AffixPatternMatcher* posPrefix = nullptr; 292 AffixPatternMatcher* posSuffix = nullptr; 293 294 // Pre-process the affix strings to resolve LDML rules like sign display. 295 for (int8_t typeInt = 0; typeInt < PATTERN_SIGN_TYPE_COUNT * 2; typeInt++) { 296 auto type = static_cast<PatternSignType>(typeInt / 2); 297 bool dropCurrencySymbols = (typeInt % 2) == 1; 298 299 if (dropCurrencySymbols && !patternInfo.hasCurrencySign()) { 300 continue; 301 } 302 if (dropCurrencySymbols && !fTokenWarehouse->hasEmptyCurrencySymbol()) { 303 continue; 304 } 305 306 // Skip affixes in some cases 307 if (type == PATTERN_SIGN_TYPE_POS 308 && 0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) { 309 continue; 310 } 311 if (type == PATTERN_SIGN_TYPE_POS_SIGN 312 && 0 == (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) { 313 continue; 314 } 315 316 // Generate Prefix 317 // TODO: Handle approximately sign? 318 bool hasPrefix = false; 319 PatternStringUtils::patternInfoToStringBuilder( 320 patternInfo, true, type, false, StandardPlural::OTHER, false, dropCurrencySymbols, sb); 321 fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( 322 sb, *fTokenWarehouse, parseFlags, &hasPrefix, status); 323 AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++] 324 : nullptr; 325 326 // Generate Suffix 327 // TODO: Handle approximately sign? 328 bool hasSuffix = false; 329 PatternStringUtils::patternInfoToStringBuilder( 330 patternInfo, false, type, false, StandardPlural::OTHER, false, dropCurrencySymbols, sb); 331 fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( 332 sb, *fTokenWarehouse, parseFlags, &hasSuffix, status); 333 AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++] 334 : nullptr; 335 336 if (type == PATTERN_SIGN_TYPE_POS) { 337 posPrefix = prefix; 338 posSuffix = suffix; 339 } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) { 340 // Skip adding these matchers (we already have equivalents) 341 continue; 342 } 343 344 // Flags for setting in the ParsedNumber; the token matchers may add more. 345 int flags = (type == PATTERN_SIGN_TYPE_NEG) ? FLAG_NEGATIVE : 0; 346 347 // Note: it is indeed possible for posPrefix and posSuffix to both be null. 348 // We still need to add that matcher for strict mode to work. 349 fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags}; 350 if (includeUnpaired && prefix != nullptr && suffix != nullptr) { 351 // The following if statements are designed to prevent adding two identical matchers. 352 if (type == PATTERN_SIGN_TYPE_POS || !equals(prefix, posPrefix)) { 353 fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags}; 354 } 355 if (type == PATTERN_SIGN_TYPE_POS || !equals(suffix, posSuffix)) { 356 fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags}; 357 } 358 } 359 } 360 361 // Put the AffixMatchers in order, and then add them to the output. 362 // Since there are at most 9 elements, do a simple-to-implement bubble sort. 363 bool madeChanges; 364 do { 365 madeChanges = false; 366 for (int32_t i = 1; i < numAffixMatchers; i++) { 367 if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) { 368 madeChanges = true; 369 AffixMatcher temp = std::move(fAffixMatchers[i - 1]); 370 fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]); 371 fAffixMatchers[i] = std::move(temp); 372 } 373 } 374 } while (madeChanges); 375 376 for (int32_t i = 0; i < numAffixMatchers; i++) { 377 // Enable the following line to debug affixes 378 //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl; 379 output.addMatcher(fAffixMatchers[i]); 380 } 381 } 382 383 384 AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags) 385 : fPrefix(prefix), fSuffix(suffix), fFlags(flags) {} 386 387 bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { 388 if (!result.seenNumber()) { 389 // Prefix 390 // Do not match if: 391 // 1. We have already seen a prefix (result.prefix != null) 392 // 2. The prefix in this AffixMatcher is empty (prefix == null) 393 if (!result.prefix.isBogus() || fPrefix == nullptr) { 394 return false; 395 } 396 397 // Attempt to match the prefix. 398 int initialOffset = segment.getOffset(); 399 bool maybeMore = fPrefix->match(segment, result, status); 400 if (initialOffset != segment.getOffset()) { 401 result.prefix = fPrefix->getPattern(); 402 } 403 return maybeMore; 404 405 } else { 406 // Suffix 407 // Do not match if: 408 // 1. We have already seen a suffix (result.suffix != null) 409 // 2. The suffix in this AffixMatcher is empty (suffix == null) 410 // 3. The matched prefix does not equal this AffixMatcher's prefix 411 if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) { 412 return false; 413 } 414 415 // Attempt to match the suffix. 416 int initialOffset = segment.getOffset(); 417 bool maybeMore = fSuffix->match(segment, result, status); 418 if (initialOffset != segment.getOffset()) { 419 result.suffix = fSuffix->getPattern(); 420 } 421 return maybeMore; 422 } 423 } 424 425 bool AffixMatcher::smokeTest(const StringSegment& segment) const { 426 return (fPrefix != nullptr && fPrefix->smokeTest(segment)) || 427 (fSuffix != nullptr && fSuffix->smokeTest(segment)); 428 } 429 430 void AffixMatcher::postProcess(ParsedNumber& result) const { 431 // Check to see if our affix is the one that was matched. If so, set the flags in the result. 432 if (matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix)) { 433 // Fill in the result prefix and suffix with non-null values (empty string). 434 // Used by strict mode to determine whether an entire affix pair was matched. 435 if (result.prefix.isBogus()) { 436 result.prefix = UnicodeString(); 437 } 438 if (result.suffix.isBogus()) { 439 result.suffix = UnicodeString(); 440 } 441 result.flags |= fFlags; 442 if (fPrefix != nullptr) { 443 fPrefix->postProcess(result); 444 } 445 if (fSuffix != nullptr) { 446 fSuffix->postProcess(result); 447 } 448 } 449 } 450 451 int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const { 452 const AffixMatcher& lhs = *this; 453 if (length(lhs.fPrefix) != length(rhs.fPrefix)) { 454 return length(lhs.fPrefix) > length(rhs.fPrefix) ? -1 : 1; 455 } else if (length(lhs.fSuffix) != length(rhs.fSuffix)) { 456 return length(lhs.fSuffix) > length(rhs.fSuffix) ? -1 : 1; 457 } else { 458 return 0; 459 } 460 } 461 462 UnicodeString AffixMatcher::toString() const { 463 bool isNegative = 0 != (fFlags & FLAG_NEGATIVE); 464 return UnicodeString(u"<Affix") + (isNegative ? u":negative " : u" ") + 465 (fPrefix ? fPrefix->getPattern() : u"null") + u"#" + 466 (fSuffix ? fSuffix->getPattern() : u"null") + u">"; 467 468 } 469 470 471 #endif /* #if !UCONFIG_NO_FORMATTING */