[ tor-browser ].git.dasho

numparse_decimal.cpp (18110B)
      1 // © 2018 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 #include "unicode/utypes.h"
      5 
      6 #if !UCONFIG_NO_FORMATTING
      7 
      8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
      9 // Helpful in toString methods and elsewhere.
     10 #define UNISTR_FROM_STRING_EXPLICIT
     11 
     12 #include "numparse_types.h"
     13 #include "numparse_decimal.h"
     14 #include "static_unicode_sets.h"
     15 #include "numparse_utils.h"
     16 #include "unicode/uchar.h"
     17 #include "putilimp.h"
     18 #include "number_decimalquantity.h"
     19 #include "string_segment.h"
     20 
     21 using namespace icu;
     22 using namespace icu::numparse;
     23 using namespace icu::numparse::impl;
     24 
     25 
     26 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
     27                               parse_flags_t parseFlags) {
     28    if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
     29        groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
     30        decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
     31    } else {
     32        groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
     33        decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
     34    }
     35    bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
     36 
     37    // Parsing is very lenient even in strict mode, almost any dot or comma is a
     38    // grouping separator. Parsing strings like "1.234" in French was treating '.'
     39    // like an ignorable grouping separator, and we want it to be excluded.
     40    // We keep the public behavior when strictParse is false, but when it is true
     41    // we restrict grouping separators to the smaller set of equivalents.
     42    unisets::Key groupingKey = unisets::chooseFrom(groupingSeparator,
     43            strictSeparators ? unisets::STRICT_COMMA : unisets::ALL_SEPARATORS,
     44            strictSeparators ? unisets::STRICT_PERIOD : unisets::ALL_SEPARATORS);
     45    if (groupingKey < 0) {
     46        groupingKey = unisets::chooseFrom(
     47            groupingSeparator, unisets::OTHER_GROUPING_SEPARATORS);
     48    }
     49    if (groupingKey >= 0) {
     50        // Attempt to find separators in the static cache
     51        groupingUniSet = unisets::get(groupingKey);
     52    } else if (!groupingSeparator.isEmpty()) {
     53        auto* set = new UnicodeSet();
     54        set->add(groupingSeparator.char32At(0));
     55        set->freeze();
     56        groupingUniSet = set;
     57        fLocalGroupingUniSet.adoptInstead(set);
     58    } else {
     59        groupingUniSet = unisets::get(unisets::EMPTY);
     60    }
     61 
     62    unisets::Key decimalKey = unisets::chooseFrom(
     63            decimalSeparator,
     64            strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
     65            strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
     66    if (decimalKey >= 0) {
     67        decimalUniSet = unisets::get(decimalKey);
     68    } else if (!decimalSeparator.isEmpty()) {
     69        auto* set = new UnicodeSet();
     70        set->add(decimalSeparator.char32At(0));
     71        set->freeze();
     72        decimalUniSet = set;
     73        fLocalDecimalUniSet.adoptInstead(set);
     74    } else {
     75        decimalUniSet = unisets::get(unisets::EMPTY);
     76    }
     77 
     78    if (groupingKey >= 0 && decimalKey >= 0) {
     79        // Everything is available in the static cache
     80        separatorSet = groupingUniSet;
     81        leadSet = unisets::get(
     82                strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
     83                                 : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
     84    } else {
     85        auto* set = new UnicodeSet();
     86        set->addAll(*groupingUniSet);
     87        set->addAll(*decimalUniSet);
     88        set->freeze();
     89        separatorSet = set;
     90        fLocalSeparatorSet.adoptInstead(set);
     91        leadSet = nullptr;
     92    }
     93 
     94    UChar32 cpZero = symbols.getCodePointZero();
     95    if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
     96        // Uncommon case: okay to allocate.
     97        auto* digitStrings = new UnicodeString[10];
     98        fLocalDigitStrings.adoptInstead(digitStrings);
     99        for (int32_t i = 0; i <= 9; i++) {
    100            digitStrings[i] = symbols.getConstDigitSymbol(i);
    101        }
    102    }
    103 
    104    requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
    105    groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
    106    integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
    107    grouping1 = grouper.getPrimary();
    108    grouping2 = grouper.getSecondary();
    109 
    110    // Fraction grouping parsing is disabled for now but could be enabled later.
    111    // See https://unicode-org.atlassian.net/browse/ICU-10794
    112    // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
    113 }
    114 
    115 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
    116    return match(segment, result, 0, status);
    117 }
    118 
/**
 * Consumes digits, grouping separators and at most one decimal separator from the front of
 * the segment and stores the outcome in the result.
 *
 * The main loop accepts one digit or one separator per iteration while tracking the sizes
 * of the current and the previous digit group. After the loop, a trailing grouping separator
 * is un-consumed and the final group sizes are validated; in lenient mode offending groups
 * are dropped, in strict mode (requireGroupingMatch) the whole parse is rejected.
 *
 * @param segment      Text to parse. Its offset is advanced past the accepted characters, or
 *                     reset to its initial value when the parse is rejected.
 * @param result       When exponentSign is 0 (or nothing was consumed), result.quantity is
 *                     replaced by the parsed digits. When exponentSign is nonzero and
 *                     characters were consumed, the parsed digits are treated as an exponent
 *                     and applied to result.quantity's magnitude instead.
 * @param exponentSign 0 for a regular number; otherwise the multiplier applied to the parsed
 *                     exponent (the overflow handling below treats -1 as "negative exponent").
 * @param (unnamed)    The error code is not used by this function.
 * @return true if more input could affect the outcome (the segment was exhausted or only
 *         partially matched a longer token); false otherwise.
 */
bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
                           UErrorCode&) const {
    if (result.seenNumber() && exponentSign == 0) {
        // A number has already been consumed.
        return false;
    } else if (exponentSign != 0) {
        // scientific notation always comes after the number
        U_ASSERT(!result.quantity.bogus);
    }

    // Initial offset before any character consumption.
    int32_t initialOffset = segment.getOffset();

    // Return value: whether to ask for more characters.
    bool maybeMore = false;

    // All digits consumed so far.
    // Starts out bogus; it becomes valid only once the first digit is seen.
    number::impl::DecimalQuantity digitsConsumed;
    digitsConsumed.bogus = true;

    // The total number of digits after the decimal place, used for scaling the result.
    int32_t digitsAfterDecimalPlace = 0;

    // The actual grouping and decimal separators used in the string.
    // If non-bogus, we have seen that token.
    UnicodeString actualGroupingString;
    UnicodeString actualDecimalString;
    actualGroupingString.setToBogus();
    actualDecimalString.setToBogus();

    // Information for two groups: the previous group and the current group.
    //
    // Each group has three pieces of information:
    //
    // Offset: the string position of the beginning of the group, including a leading separator
    // if there was a leading separator. This is needed in case we need to rewind the parse to
    // that position.
    //
    // Separator type:
    // 0 => beginning of string
    // 1 => lead separator is a grouping separator
    // 2 => lead separator is a decimal separator
    //
    // Count: the number of digits in the group. If -1, the group has been validated.
    int32_t currGroupOffset = 0;
    int32_t currGroupSepType = 0;
    int32_t currGroupCount = 0;
    int32_t prevGroupOffset = -1;
    int32_t prevGroupSepType = -1;
    int32_t prevGroupCount = -1;

    while (segment.length() > 0) {
        // Recomputed on every iteration: only a partial match at the current position counts.
        maybeMore = false;

        // Attempt to match a digit.
        int8_t digit = -1;

        // Try by code point digit value.
        UChar32 cp = segment.getCodePoint();
        if (u_isdigit(cp)) {
            segment.adjustOffset(U16_LENGTH(cp));
            digit = static_cast<int8_t>(u_digit(cp, 10));
        }

        // Try by digit string.
        // Only available when the constructor stored locale-specific digit strings.
        if (digit == -1 && !fLocalDigitStrings.isNull()) {
            for (int32_t i = 0; i < 10; i++) {
                const UnicodeString& str = fLocalDigitStrings[i];
                if (str.isEmpty()) {
                    continue;
                }
                int32_t overlap = segment.getCommonPrefixLength(str);
                if (overlap == str.length()) {
                    segment.adjustOffset(overlap);
                    digit = static_cast<int8_t>(i);
                    break;
                }
                // The whole remaining segment is a prefix of this digit string:
                // more input could complete it.
                maybeMore = maybeMore || (overlap == segment.length());
            }
        }

        if (digit >= 0) {
            // Digit was found.
            if (digitsConsumed.bogus) {
                digitsConsumed.bogus = false;
                digitsConsumed.clear();
            }
            digitsConsumed.appendDigit(digit, 0, true);
            currGroupCount++;
            if (!actualDecimalString.isBogus()) {
                digitsAfterDecimalPlace++;
            }
            continue;
        }

        // Attempt to match a literal grouping or decimal separator.
        bool isDecimal = false;
        bool isGrouping = false;

        // 1) Attempt the decimal separator string literal.
        // if (we have not seen a decimal separator yet) { ... }
        if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
            int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
            maybeMore = maybeMore || (overlap == segment.length());
            if (overlap == decimalSeparator.length()) {
                isDecimal = true;
                actualDecimalString = decimalSeparator;
            }
        }

        // 2) Attempt to match the actual grouping string literal.
        // (i.e. the grouping separator already seen earlier in this string)
        if (!actualGroupingString.isBogus()) {
            int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
            maybeMore = maybeMore || (overlap == segment.length());
            if (overlap == actualGroupingString.length()) {
                isGrouping = true;
            }
        }

        // 2.5) Attempt to match a new grouping separator string literal.
        // if (we have not seen a grouping or decimal separator yet) { ... }
        if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
            !groupingSeparator.isEmpty()) {
            int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
            maybeMore = maybeMore || (overlap == segment.length());
            if (overlap == groupingSeparator.length()) {
                isGrouping = true;
                actualGroupingString = groupingSeparator;
            }
        }

        // 3) Attempt to match a decimal separator from the equivalence set.
        // if (we have not seen a decimal separator yet) { ... }
        // The !isGrouping is to confirm that we haven't yet matched the current character.
        if (!isGrouping && actualDecimalString.isBogus()) {
            if (decimalUniSet->contains(cp)) {
                isDecimal = true;
                actualDecimalString = UnicodeString(cp);
            }
        }

        // 4) Attempt to match a grouping separator from the equivalence set.
        // if (we have not seen a grouping or decimal separator yet) { ... }
        if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
            if (groupingUniSet->contains(cp)) {
                isGrouping = true;
                actualGroupingString = UnicodeString(cp);
            }
        }

        // Leave if we failed to match this as a separator.
        if (!isDecimal && !isGrouping) {
            break;
        }

        // Check for conditions when we don't want to accept the separator.
        if (isDecimal && integerOnly) {
            break;
        } else if (currGroupSepType == 2 && isGrouping) {
            // Fraction grouping
            break;
        }

        // Validate intermediate grouping sizes.
        bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
        bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
        if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
            // Invalid grouping sizes.
            if (isGrouping && currGroupCount == 0) {
                // Trailing grouping separators: these are taken care of below
                U_ASSERT(currGroupSepType == 1);
            } else if (requireGroupingMatch) {
                // Strict mode: reject the parse
                digitsConsumed.clear();
                digitsConsumed.bogus = true;
            }
            break;
        } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
            // Strict mode: stop at a separator that follows an empty grouping-separated group.
            break;
        } else {
            // Grouping sizes OK so far.
            prevGroupOffset = currGroupOffset;
            prevGroupCount = currGroupCount;
            if (isDecimal) {
                // Do not validate this group any more.
                prevGroupSepType = -1;
            } else {
                prevGroupSepType = currGroupSepType;
            }
        }

        // OK to accept the separator.
        // Special case: don't update currGroup if it is empty; this allows two grouping
        // separators in a row in lenient mode.
        if (currGroupCount != 0) {
            currGroupOffset = segment.getOffset();
        }
        currGroupSepType = isGrouping ? 1 : 2;
        currGroupCount = 0;
        // Consume the separator itself.
        if (isGrouping) {
            segment.adjustOffset(actualGroupingString.length());
        } else {
            segment.adjustOffset(actualDecimalString.length());
        }
    }

    // End of main loop.
    // Back up if there was a trailing grouping separator.
    // Shift prev -> curr so we can check it as a final group.
    if (currGroupSepType != 2 && currGroupCount == 0) {
        maybeMore = true;
        segment.setOffset(currGroupOffset);
        currGroupOffset = prevGroupOffset;
        currGroupSepType = prevGroupSepType;
        currGroupCount = prevGroupCount;
        // Placeholder "previous" group: sepType 0 with count 1.
        prevGroupOffset = -1;
        prevGroupSepType = 0;
        prevGroupCount = 1;
    }

    // Validate final grouping sizes.
    bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
    bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
    if (!requireGroupingMatch) {
        // The cases we need to handle here are lone digits.
        // Examples: "1,1"  "1,1,"  "1,1,1"  "1,1,1,"  ",1" (all parse as 1)
        // See more examples in numberformattestspecification.txt
        int32_t digitsToRemove = 0;
        if (!prevValidSecondary) {
            // Rewind to the start of the previous group and drop both groups' digits.
            segment.setOffset(prevGroupOffset);
            digitsToRemove += prevGroupCount;
            digitsToRemove += currGroupCount;
        } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
            // Rewind to the start of the current group and drop only its digits.
            maybeMore = true;
            segment.setOffset(currGroupOffset);
            digitsToRemove += currGroupCount;
        }
        if (digitsToRemove != 0) {
            // Shift the unwanted trailing digits below the decimal point, then cut them off.
            digitsConsumed.adjustMagnitude(-digitsToRemove);
            digitsConsumed.truncate();
        }
        // Lenient mode never fails on grouping once the offending digits are removed.
        prevValidSecondary = true;
        currValidPrimary = true;
    }
    if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
        // Grouping failure.
        digitsConsumed.bogus = true;
    }

    // Strings that start with a separator but have no digits,
    // or strings that failed a grouping size check.
    if (digitsConsumed.bogus) {
        maybeMore = maybeMore || (segment.length() == 0);
        segment.setOffset(initialOffset);
        return maybeMore;
    }

    // We passed all inspections. Start post-processing.

    // Adjust for fraction part.
    digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);

    // Set the digits, either normal or exponent.
    if (exponentSign != 0 && segment.getOffset() != initialOffset) {
        // Exponent: apply the parsed value to the magnitude of the existing quantity.
        bool overflow = false;
        if (digitsConsumed.fitsInLong()) {
            int64_t exponentLong = digitsConsumed.toLong(false);
            U_ASSERT(exponentLong >= 0);
            if (exponentLong <= INT32_MAX) {
                auto exponentInt = static_cast<int32_t>(exponentLong);
                // A true return from adjustMagnitude is treated as overflow.
                if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
                    overflow = true;
                }
            } else {
                overflow = true;
            }
        } else {
            overflow = true;
        }
        if (overflow) {
            if (exponentSign == -1) {
                // Set to zero
                result.quantity.clear();
            } else {
                // Set to infinity
                result.quantity.bogus = true;
                result.flags |= FLAG_INFINITY;
            }
        }
    } else {
        result.quantity = digitsConsumed;
    }

    // Set other information into the result and return.
    if (!actualDecimalString.isBogus()) {
        result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
    }
    result.setCharsConsumed(segment);
    return segment.length() == 0 || maybeMore;
}
    419 
    420 bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
    421    if (requireGroupingMatch) {
    422        if (sepType == -1) {
    423            // No such group (prevGroup before first shift).
    424            return true;
    425        } else if (sepType == 0) {
    426            // First group.
    427            if (isPrimary) {
    428                // No grouping separators is OK.
    429                return true;
    430            } else {
    431                return count != 0 && count <= grouping2;
    432            }
    433        } else if (sepType == 1) {
    434            // Middle group.
    435            if (isPrimary) {
    436                return count == grouping1;
    437            } else {
    438                return count == grouping2;
    439            }
    440        } else {
    441            U_ASSERT(sepType == 2);
    442            // After the decimal separator.
    443            return true;
    444        }
    445    } else {
    446        if (sepType == 1) {
    447            // #11230: don't accept middle groups with only 1 digit.
    448            return count != 1;
    449        } else {
    450            return true;
    451        }
    452    }
    453 }
    454 
    455 bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
    456    // The common case uses a static leadSet for efficiency.
    457    if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
    458        return segment.startsWith(*leadSet);
    459    }
    460    if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
    461        return true;
    462    }
    463    if (fLocalDigitStrings.isNull()) {
    464        return false;
    465    }
    466    for (int32_t i = 0; i < 10; i++) {
    467        if (segment.startsWith(fLocalDigitStrings[i])) {
    468            return true;
    469        }
    470    }
    471    return false;
    472 }
    473 
    474 UnicodeString DecimalMatcher::toString() const {
    475    return u"<Decimal>";
    476 }
    477 
    478 
    479 #endif /* #if !UCONFIG_NO_FORMATTING */
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE