tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

numparse_affixes.cpp (18256B)


      1 // © 2018 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 #include "unicode/utypes.h"
      5 
      6 #if !UCONFIG_NO_FORMATTING
      7 
      8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
      9 // Helpful in toString methods and elsewhere.
     10 #define UNISTR_FROM_STRING_EXPLICIT
     11 
     12 #include "numparse_types.h"
     13 #include "numparse_affixes.h"
     14 #include "numparse_utils.h"
     15 #include "number_utils.h"
     16 #include "string_segment.h"
     17 
     18 using namespace icu;
     19 using namespace icu::numparse;
     20 using namespace icu::numparse::impl;
     21 using namespace icu::number;
     22 using namespace icu::number::impl;
     23 
     24 
     25 namespace {
     26 
     27 /**
     28 * Helper method to return whether the given AffixPatternMatcher equals the given pattern string.
     29 * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal
     30 * the given pattern string.
     31 */
     32 bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) {
     33    return (affix == nullptr && patternString.isBogus()) ||
     34           (affix != nullptr && affix->getPattern() == patternString);
     35 }
     36 
     37 /**
     38 * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null.
     39 */
     40 int32_t length(const AffixPatternMatcher* matcher) {
     41    return matcher == nullptr ? 0 : matcher->getPattern().length();
     42 }
     43 
     44 /**
     45 * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both
     46 * valid, whether they are equal according to operator==.  Similar to Java Objects.equals()
     47 */
     48 bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) {
     49    if (lhs == nullptr && rhs == nullptr) {
     50        return true;
     51    }
     52    if (lhs == nullptr || rhs == nullptr) {
     53        return false;
     54    }
     55    return *lhs == *rhs;
     56 }
     57 
     58 }
     59 
     60 
     61 AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern,
     62                                                       AffixTokenMatcherWarehouse& warehouse,
     63                                                       IgnorablesMatcher* ignorables)
     64        : fMatchersLen(0),
     65          fLastTypeOrCp(0),
     66          fPattern(pattern),
     67          fWarehouse(warehouse),
     68          fIgnorables(ignorables) {}
     69 
     70 void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) {
     71    // This is called by AffixUtils.iterateWithConsumer() for each token.
     72 
     73    // Add an ignorables matcher between tokens except between two literals, and don't put two
     74    // ignorables matchers in a row.
     75    if (fIgnorables != nullptr && fMatchersLen > 0 &&
     76        (fLastTypeOrCp < 0 || !fIgnorables->getSet()->contains(fLastTypeOrCp))) {
     77        addMatcher(*fIgnorables);
     78    }
     79 
     80    if (type != TYPE_CODEPOINT) {
     81        // Case 1: the token is a symbol.
     82        switch (type) {
     83            case TYPE_MINUS_SIGN:
     84                addMatcher(fWarehouse.minusSign());
     85                break;
     86            case TYPE_PLUS_SIGN:
     87                addMatcher(fWarehouse.plusSign());
     88                break;
     89            case TYPE_APPROXIMATELY_SIGN:
     90                addMatcher(fWarehouse.approximatelySign());
     91                break;
     92            case TYPE_PERCENT:
     93                addMatcher(fWarehouse.percent());
     94                break;
     95            case TYPE_PERMILLE:
     96                addMatcher(fWarehouse.permille());
     97                break;
     98            case TYPE_CURRENCY_SINGLE:
     99            case TYPE_CURRENCY_DOUBLE:
    100            case TYPE_CURRENCY_TRIPLE:
    101            case TYPE_CURRENCY_QUAD:
    102            case TYPE_CURRENCY_QUINT:
    103            case TYPE_CURRENCY_OVERFLOW:
    104                // All currency symbols use the same matcher
    105                addMatcher(fWarehouse.currency(status));
    106                break;
    107            default:
    108                UPRV_UNREACHABLE_EXIT;
    109        }
    110 
    111    } else if (fIgnorables != nullptr && fIgnorables->getSet()->contains(cp)) {
    112        // Case 2: the token is an ignorable literal.
    113        // No action necessary: the ignorables matcher has already been added.
    114 
    115    } else {
    116        // Case 3: the token is a non-ignorable literal.
    117        if (auto* ptr = fWarehouse.nextCodePointMatcher(cp, status)) {
    118            addMatcher(*ptr);
    119        } else {
    120            // OOM; unwind the stack
    121            return;
    122        }
    123    }
    124    fLastTypeOrCp = type != TYPE_CODEPOINT ? type : cp;
    125 }
    126 
    127 void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) {
    128    if (fMatchersLen >= fMatchers.getCapacity()) {
    129        fMatchers.resize(fMatchersLen * 2, fMatchersLen);
    130    }
    131    fMatchers[fMatchersLen++] = &matcher;
    132 }
    133 
    134 AffixPatternMatcher AffixPatternMatcherBuilder::build(UErrorCode& status) {
    135    return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern, status);
    136 }
    137 
    138 AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData)
    139        : fSetupData(setupData) {}
    140 
    141 NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() {
    142    return fMinusSign = {fSetupData->dfs, true};
    143 }
    144 
    145 NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() {
    146    return fPlusSign = {fSetupData->dfs, true};
    147 }
    148 
    149 NumberParseMatcher& AffixTokenMatcherWarehouse::approximatelySign() {
    150    return fApproximatelySign = {fSetupData->dfs, true};
    151 }
    152 
    153 NumberParseMatcher& AffixTokenMatcherWarehouse::percent() {
    154    return fPercent = {fSetupData->dfs};
    155 }
    156 
    157 NumberParseMatcher& AffixTokenMatcherWarehouse::permille() {
    158    return fPermille = {fSetupData->dfs};
    159 }
    160 
    161 NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) {
    162    return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status};
    163 }
    164 
    165 IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() {
    166    return fSetupData->ignorables;
    167 }
    168 
    169 NumberParseMatcher* AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp, UErrorCode& status) {
    170    if (U_FAILURE(status)) {
    171        return nullptr;
    172    }
    173    auto* result = fCodePoints.create(cp);
    174    if (result == nullptr) {
    175        status = U_MEMORY_ALLOCATION_ERROR;
    176    }
    177    return result;
    178 }
    179 
    180 bool AffixTokenMatcherWarehouse::hasEmptyCurrencySymbol() const {
    181    return fSetupData->currencySymbols.hasEmptyCurrencySymbol();
    182 }
    183 
    184 
    185 CodePointMatcher::CodePointMatcher(UChar32 cp)
    186        : fCp(cp) {}
    187 
    188 bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
    189    if (segment.startsWith(fCp)) {
    190        segment.adjustOffsetByCodePoint();
    191        result.setCharsConsumed(segment);
    192    }
    193    return false;
    194 }
    195 
    196 bool CodePointMatcher::smokeTest(const StringSegment& segment) const {
    197    return segment.startsWith(fCp);
    198 }
    199 
    200 UnicodeString CodePointMatcher::toString() const {
    201    return u"<CodePoint>";
    202 }
    203 
    204 
    205 AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern,
    206                                                          AffixTokenMatcherWarehouse& tokenWarehouse,
    207                                                          parse_flags_t parseFlags, bool* success,
    208                                                          UErrorCode& status) {
    209    if (affixPattern.isEmpty()) {
    210        *success = false;
    211        return {};
    212    }
    213    *success = true;
    214 
    215    IgnorablesMatcher* ignorables;
    216    if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) {
    217        ignorables = nullptr;
    218    } else {
    219        ignorables = &tokenWarehouse.ignorables();
    220    }
    221 
    222    AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables);
    223    AffixUtils::iterateWithConsumer(affixPattern, builder, status);
    224    return builder.build(status);
    225 }
    226 
    227 AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen,
    228                                         const UnicodeString& pattern, UErrorCode& status)
    229    : ArraySeriesMatcher(matchers, matchersLen), fPattern(pattern, status) {
    230 }
    231 
    232 UnicodeString AffixPatternMatcher::getPattern() const {
    233    return fPattern.toAliasedUnicodeString();
    234 }
    235 
    236 bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const {
    237    return fPattern == other.fPattern;
    238 }
    239 
    240 
    241 AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse)
    242        : fTokenWarehouse(tokenWarehouse) {
    243 }
    244 
    245 bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo,
    246                                          const IgnorablesMatcher& ignorables, parse_flags_t parseFlags,
    247                                          UErrorCode& status) {
    248    UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX);
    249    UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX);
    250    UnicodeString negPrefixString;
    251    UnicodeString negSuffixString;
    252    if (patternInfo.hasNegativeSubpattern()) {
    253        negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX);
    254        negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX);
    255    }
    256 
    257    if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) &&
    258        AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) &&
    259        AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) &&
    260        AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) &&
    261        AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status)
    262        // HACK: Plus and minus sign are a special case: we accept them trailing only if they are
    263        // trailing in the pattern string.
    264        && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) &&
    265        !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) &&
    266        !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) &&
    267        !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) {
    268        // The affixes contain only symbols and ignorables.
    269        // No need to generate affix matchers.
    270        return false;
    271    }
    272    return true;
    273 }
    274 
    275 void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo,
    276                                                MutableMatcherCollection& output,
    277                                                const IgnorablesMatcher& ignorables,
    278                                                parse_flags_t parseFlags, UErrorCode& status) {
    279    if (!isInteresting(patternInfo, ignorables, parseFlags, status)) {
    280        return;
    281    }
    282 
    283    // The affixes have interesting characters, or we are in strict mode.
    284    // Use initial capacity of 6, the highest possible number of AffixMatchers.
    285    UnicodeString sb;
    286    bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES);
    287 
    288    int32_t numAffixMatchers = 0;
    289    int32_t numAffixPatternMatchers = 0;
    290 
    291    AffixPatternMatcher* posPrefix = nullptr;
    292    AffixPatternMatcher* posSuffix = nullptr;
    293 
    294    // Pre-process the affix strings to resolve LDML rules like sign display.
    295    for (int8_t typeInt = 0; typeInt < PATTERN_SIGN_TYPE_COUNT * 2; typeInt++) {
    296        auto type = static_cast<PatternSignType>(typeInt / 2);
    297        bool dropCurrencySymbols = (typeInt % 2) == 1;
    298 
    299        if (dropCurrencySymbols && !patternInfo.hasCurrencySign()) {
    300            continue;
    301        }
    302        if (dropCurrencySymbols && !fTokenWarehouse->hasEmptyCurrencySymbol()) {
    303            continue;
    304        }
    305 
    306        // Skip affixes in some cases
    307        if (type == PATTERN_SIGN_TYPE_POS
    308                && 0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) {
    309            continue;
    310        }
    311        if (type == PATTERN_SIGN_TYPE_POS_SIGN
    312                && 0 == (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) {
    313            continue;
    314        }
    315 
    316        // Generate Prefix
    317        // TODO: Handle approximately sign?
    318        bool hasPrefix = false;
    319        PatternStringUtils::patternInfoToStringBuilder(
    320                patternInfo, true, type, false, StandardPlural::OTHER, false, dropCurrencySymbols, sb);
    321        fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
    322                sb, *fTokenWarehouse, parseFlags, &hasPrefix, status);
    323        AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
    324                                                : nullptr;
    325 
    326        // Generate Suffix
    327        // TODO: Handle approximately sign?
    328        bool hasSuffix = false;
    329        PatternStringUtils::patternInfoToStringBuilder(
    330                patternInfo, false, type, false, StandardPlural::OTHER, false, dropCurrencySymbols, sb);
    331        fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
    332                sb, *fTokenWarehouse, parseFlags, &hasSuffix, status);
    333        AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
    334                                                : nullptr;
    335 
    336        if (type == PATTERN_SIGN_TYPE_POS) {
    337            posPrefix = prefix;
    338            posSuffix = suffix;
    339        } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) {
    340            // Skip adding these matchers (we already have equivalents)
    341            continue;
    342        }
    343 
    344        // Flags for setting in the ParsedNumber; the token matchers may add more.
    345        int flags = (type == PATTERN_SIGN_TYPE_NEG) ? FLAG_NEGATIVE : 0;
    346 
    347        // Note: it is indeed possible for posPrefix and posSuffix to both be null.
    348        // We still need to add that matcher for strict mode to work.
    349        fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags};
    350        if (includeUnpaired && prefix != nullptr && suffix != nullptr) {
    351            // The following if statements are designed to prevent adding two identical matchers.
    352            if (type == PATTERN_SIGN_TYPE_POS || !equals(prefix, posPrefix)) {
    353                fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags};
    354            }
    355            if (type == PATTERN_SIGN_TYPE_POS || !equals(suffix, posSuffix)) {
    356                fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags};
    357            }
    358        }
    359    }
    360 
    361    // Put the AffixMatchers in order, and then add them to the output.
    362    // Since there are at most 9 elements, do a simple-to-implement bubble sort.
    363    bool madeChanges;
    364    do {
    365        madeChanges = false;
    366        for (int32_t i = 1; i < numAffixMatchers; i++) {
    367            if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) {
    368                madeChanges = true;
    369                AffixMatcher temp = std::move(fAffixMatchers[i - 1]);
    370                fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]);
    371                fAffixMatchers[i] = std::move(temp);
    372            }
    373        }
    374    } while (madeChanges);
    375 
    376    for (int32_t i = 0; i < numAffixMatchers; i++) {
    377        // Enable the following line to debug affixes
    378        //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl;
    379        output.addMatcher(fAffixMatchers[i]);
    380    }
    381 }
    382 
    383 
    384 AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags)
    385        : fPrefix(prefix), fSuffix(suffix), fFlags(flags) {}
    386 
    387 bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
    388    if (!result.seenNumber()) {
    389        // Prefix
    390        // Do not match if:
    391        // 1. We have already seen a prefix (result.prefix != null)
    392        // 2. The prefix in this AffixMatcher is empty (prefix == null)
    393        if (!result.prefix.isBogus() || fPrefix == nullptr) {
    394            return false;
    395        }
    396 
    397        // Attempt to match the prefix.
    398        int initialOffset = segment.getOffset();
    399        bool maybeMore = fPrefix->match(segment, result, status);
    400        if (initialOffset != segment.getOffset()) {
    401            result.prefix = fPrefix->getPattern();
    402        }
    403        return maybeMore;
    404 
    405    } else {
    406        // Suffix
    407        // Do not match if:
    408        // 1. We have already seen a suffix (result.suffix != null)
    409        // 2. The suffix in this AffixMatcher is empty (suffix == null)
    410        // 3. The matched prefix does not equal this AffixMatcher's prefix
    411        if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) {
    412            return false;
    413        }
    414 
    415        // Attempt to match the suffix.
    416        int initialOffset = segment.getOffset();
    417        bool maybeMore = fSuffix->match(segment, result, status);
    418        if (initialOffset != segment.getOffset()) {
    419            result.suffix = fSuffix->getPattern();
    420        }
    421        return maybeMore;
    422    }
    423 }
    424 
    425 bool AffixMatcher::smokeTest(const StringSegment& segment) const {
    426    return (fPrefix != nullptr && fPrefix->smokeTest(segment)) ||
    427           (fSuffix != nullptr && fSuffix->smokeTest(segment));
    428 }
    429 
    430 void AffixMatcher::postProcess(ParsedNumber& result) const {
    431    // Check to see if our affix is the one that was matched. If so, set the flags in the result.
    432    if (matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix)) {
    433        // Fill in the result prefix and suffix with non-null values (empty string).
    434        // Used by strict mode to determine whether an entire affix pair was matched.
    435        if (result.prefix.isBogus()) {
    436            result.prefix = UnicodeString();
    437        }
    438        if (result.suffix.isBogus()) {
    439            result.suffix = UnicodeString();
    440        }
    441        result.flags |= fFlags;
    442        if (fPrefix != nullptr) {
    443            fPrefix->postProcess(result);
    444        }
    445        if (fSuffix != nullptr) {
    446            fSuffix->postProcess(result);
    447        }
    448    }
    449 }
    450 
    451 int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const {
    452    const AffixMatcher& lhs = *this;
    453    if (length(lhs.fPrefix) != length(rhs.fPrefix)) {
    454        return length(lhs.fPrefix) > length(rhs.fPrefix) ? -1 : 1;
    455    } else if (length(lhs.fSuffix) != length(rhs.fSuffix)) {
    456        return length(lhs.fSuffix) > length(rhs.fSuffix) ? -1 : 1;
    457    } else {
    458        return 0;
    459    }
    460 }
    461 
    462 UnicodeString AffixMatcher::toString() const {
    463    bool isNegative = 0 != (fFlags & FLAG_NEGATIVE);
    464    return UnicodeString(u"<Affix") + (isNegative ? u":negative " : u" ") +
    465           (fPrefix ? fPrefix->getPattern() : u"null") + u"#" +
    466           (fSuffix ? fSuffix->getPattern() : u"null") + u">";
    467 
    468 }
    469 
    470 
    471 #endif /* #if !UCONFIG_NO_FORMATTING */