tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

number_affixutils.cpp (15622B)


      1 // © 2017 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 #include "unicode/utypes.h"
      5 
      6 #if !UCONFIG_NO_FORMATTING
      7 
      8 #include "number_affixutils.h"
      9 #include "unicode/utf16.h"
     10 #include "unicode/uniset.h"
     11 
     12 using namespace icu;
     13 using namespace icu::number;
     14 using namespace icu::number::impl;
     15 
     16 TokenConsumer::~TokenConsumer() = default;
     17 SymbolProvider::~SymbolProvider() = default;
     18 
     19 int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
     20    AffixPatternState state = STATE_BASE;
     21    int32_t offset = 0;
     22    int32_t length = 0;
     23    for (; offset < patternString.length();) {
     24        UChar32 cp = patternString.char32At(offset);
     25 
     26        switch (state) {
     27            case STATE_BASE:
     28                if (cp == u'\'') {
     29                    // First quote
     30                    state = STATE_FIRST_QUOTE;
     31                } else {
     32                    // Unquoted symbol
     33                    length++;
     34                }
     35                break;
     36            case STATE_FIRST_QUOTE:
     37                if (cp == u'\'') {
     38                    // Repeated quote
     39                    length++;
     40                    state = STATE_BASE;
     41                } else {
     42                    // Quoted code point
     43                    length++;
     44                    state = STATE_INSIDE_QUOTE;
     45                }
     46                break;
     47            case STATE_INSIDE_QUOTE:
     48                if (cp == u'\'') {
     49                    // End of quoted sequence
     50                    state = STATE_AFTER_QUOTE;
     51                } else {
     52                    // Quoted code point
     53                    length++;
     54                }
     55                break;
     56            case STATE_AFTER_QUOTE:
     57                if (cp == u'\'') {
     58                    // Double quote inside of quoted sequence
     59                    length++;
     60                    state = STATE_INSIDE_QUOTE;
     61                } else {
     62                    // Unquoted symbol
     63                    length++;
     64                }
     65                break;
     66            default:
     67                UPRV_UNREACHABLE_EXIT;
     68        }
     69 
     70        offset += U16_LENGTH(cp);
     71    }
     72 
     73    switch (state) {
     74        case STATE_FIRST_QUOTE:
     75        case STATE_INSIDE_QUOTE:
     76            status = U_ILLEGAL_ARGUMENT_ERROR;
     77            break;
     78        default:
     79            break;
     80    }
     81 
     82    return length;
     83 }
     84 
     85 UnicodeString AffixUtils::escape(const UnicodeString &input) {
     86    AffixPatternState state = STATE_BASE;
     87    int32_t offset = 0;
     88    UnicodeString output;
     89    for (; offset < input.length();) {
     90        UChar32 cp = input.char32At(offset);
     91 
     92        switch (cp) {
     93            case u'\'':
     94                output.append(u"''", -1);
     95                break;
     96 
     97            case u'-':
     98            case u'+':
     99            case u'%':
    100            case u'‰':
    101            case u'¤':
    102                if (state == STATE_BASE) {
    103                    output.append(u'\'');
    104                    output.append(cp);
    105                    state = STATE_INSIDE_QUOTE;
    106                } else {
    107                    output.append(cp);
    108                }
    109                break;
    110 
    111            default:
    112                if (state == STATE_INSIDE_QUOTE) {
    113                    output.append(u'\'');
    114                    output.append(cp);
    115                    state = STATE_BASE;
    116                } else {
    117                    output.append(cp);
    118                }
    119                break;
    120        }
    121        offset += U16_LENGTH(cp);
    122    }
    123 
    124    if (state == STATE_INSIDE_QUOTE) {
    125        output.append(u'\'');
    126    }
    127 
    128    return output;
    129 }
    130 
    131 Field AffixUtils::getFieldForType(AffixPatternType type) {
    132    switch (type) {
    133        case TYPE_MINUS_SIGN:
    134            return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
    135        case TYPE_PLUS_SIGN:
    136            return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
    137        case TYPE_APPROXIMATELY_SIGN:
    138            return {UFIELD_CATEGORY_NUMBER, UNUM_APPROXIMATELY_SIGN_FIELD};
    139        case TYPE_PERCENT:
    140            return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};
    141        case TYPE_PERMILLE:
    142            return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};
    143        case TYPE_CURRENCY_SINGLE:
    144            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
    145        case TYPE_CURRENCY_DOUBLE:
    146            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
    147        case TYPE_CURRENCY_TRIPLE:
    148            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
    149        case TYPE_CURRENCY_QUAD:
    150            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
    151        case TYPE_CURRENCY_QUINT:
    152            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
    153        case TYPE_CURRENCY_OVERFLOW:
    154            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
    155        default:
    156            UPRV_UNREACHABLE_EXIT;
    157    }
    158 }
    159 
    160 int32_t
    161 AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
    162                     const SymbolProvider &provider, Field field, UErrorCode &status) {
    163    int32_t length = 0;
    164    AffixTag tag;
    165    while (hasNext(tag, affixPattern)) {
    166        tag = nextToken(tag, affixPattern, status);
    167        if (U_FAILURE(status)) { return length; }
    168        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
    169            // Don't go to the provider for this special case
    170            length += output.insertCodePoint(
    171                position + length,
    172                0xFFFD,
    173                {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD},
    174                status);
    175        } else if (tag.type < 0) {
    176            length += output.insert(
    177                    position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
    178        } else {
    179            length += output.insertCodePoint(position + length, tag.codePoint, field, status);
    180        }
    181    }
    182    return length;
    183 }
    184 
    185 int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
    186                                            const SymbolProvider &provider, UErrorCode &status) {
    187    int32_t length = 0;
    188    AffixTag tag;
    189    while (hasNext(tag, affixPattern)) {
    190        tag = nextToken(tag, affixPattern, status);
    191        if (U_FAILURE(status)) { return length; }
    192        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
    193            length += 1;
    194        } else if (tag.type < 0) {
    195            length += provider.getSymbol(tag.type).length();
    196        } else {
    197            length += U16_LENGTH(tag.codePoint);
    198        }
    199    }
    200    return length;
    201 }
    202 
    203 bool
    204 AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
    205    if (affixPattern.length() == 0) {
    206        return false;
    207    }
    208    AffixTag tag;
    209    while (hasNext(tag, affixPattern)) {
    210        tag = nextToken(tag, affixPattern, status);
    211        if (U_FAILURE(status)) { return false; }
    212        if (tag.type == type) {
    213            return true;
    214        }
    215    }
    216    return false;
    217 }
    218 
    219 bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
    220    if (affixPattern.length() == 0) {
    221        return false;
    222    }
    223    AffixTag tag;
    224    while (hasNext(tag, affixPattern)) {
    225        tag = nextToken(tag, affixPattern, status);
    226        if (U_FAILURE(status)) { return false; }
    227        if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
    228            return true;
    229        }
    230    }
    231    return false;
    232 }
    233 
    234 UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
    235                                      char16_t replacementChar, UErrorCode &status) {
    236    UnicodeString output(affixPattern); // copy
    237    if (affixPattern.length() == 0) {
    238        return output;
    239    }
    240    AffixTag tag;
    241    while (hasNext(tag, affixPattern)) {
    242        tag = nextToken(tag, affixPattern, status);
    243        if (U_FAILURE(status)) { return output; }
    244        if (tag.type == type) {
    245            output.replace(tag.offset - 1, 1, replacementChar);
    246        }
    247    }
    248    return output;
    249 }
    250 
    251 bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
    252                                                  const UnicodeSet& ignorables, UErrorCode& status) {
    253    if (affixPattern.length() == 0) {
    254        return true;
    255    }
    256    AffixTag tag;
    257    while (hasNext(tag, affixPattern)) {
    258        tag = nextToken(tag, affixPattern, status);
    259        if (U_FAILURE(status)) { return false; }
    260        if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
    261            return false;
    262        }
    263    }
    264    return true;
    265 }
    266 
    267 void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
    268                                     UErrorCode& status) {
    269    if (affixPattern.length() == 0) {
    270        return;
    271    }
    272    AffixTag tag;
    273    while (hasNext(tag, affixPattern)) {
    274        tag = nextToken(tag, affixPattern, status);
    275        if (U_FAILURE(status)) { return; }
    276        consumer.consumeToken(tag.type, tag.codePoint, status);
    277        if (U_FAILURE(status)) { return; }
    278    }
    279 }
    280 
    281 AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
    282    int32_t offset = tag.offset;
    283    int32_t state = tag.state;
    284    for (; offset < patternString.length();) {
    285        UChar32 cp = patternString.char32At(offset);
    286        int32_t count = U16_LENGTH(cp);
    287 
    288        switch (state) {
    289            case STATE_BASE:
    290                switch (cp) {
    291                    case u'\'':
    292                        state = STATE_FIRST_QUOTE;
    293                        offset += count;
    294                        // continue to the next code point
    295                        break;
    296                    case u'-':
    297                        return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
    298                    case u'+':
    299                        return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
    300                    case u'~':
    301                        return makeTag(offset + count, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0);
    302                    case u'%':
    303                        return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
    304                    case u'‰':
    305                        return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
    306                    case u'¤':
    307                        state = STATE_FIRST_CURR;
    308                        offset += count;
    309                        // continue to the next code point
    310                        break;
    311                    default:
    312                        return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
    313                }
    314                break;
    315            case STATE_FIRST_QUOTE:
    316                if (cp == u'\'') {
    317                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
    318                } else {
    319                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
    320                }
    321            case STATE_INSIDE_QUOTE:
    322                if (cp == u'\'') {
    323                    state = STATE_AFTER_QUOTE;
    324                    offset += count;
    325                    // continue to the next code point
    326                    break;
    327                } else {
    328                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
    329                }
    330            case STATE_AFTER_QUOTE:
    331                if (cp == u'\'') {
    332                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
    333                } else {
    334                    state = STATE_BASE;
    335                    // re-evaluate this code point
    336                    break;
    337                }
    338            case STATE_FIRST_CURR:
    339                if (cp == u'¤') {
    340                    state = STATE_SECOND_CURR;
    341                    offset += count;
    342                    // continue to the next code point
    343                    break;
    344                } else {
    345                    return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
    346                }
    347            case STATE_SECOND_CURR:
    348                if (cp == u'¤') {
    349                    state = STATE_THIRD_CURR;
    350                    offset += count;
    351                    // continue to the next code point
    352                    break;
    353                } else {
    354                    return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
    355                }
    356            case STATE_THIRD_CURR:
    357                if (cp == u'¤') {
    358                    state = STATE_FOURTH_CURR;
    359                    offset += count;
    360                    // continue to the next code point
    361                    break;
    362                } else {
    363                    return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
    364                }
    365            case STATE_FOURTH_CURR:
    366                if (cp == u'¤') {
    367                    state = STATE_FIFTH_CURR;
    368                    offset += count;
    369                    // continue to the next code point
    370                    break;
    371                } else {
    372                    return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
    373                }
    374            case STATE_FIFTH_CURR:
    375                if (cp == u'¤') {
    376                    state = STATE_OVERFLOW_CURR;
    377                    offset += count;
    378                    // continue to the next code point
    379                    break;
    380                } else {
    381                    return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
    382                }
    383            case STATE_OVERFLOW_CURR:
    384                if (cp == u'¤') {
    385                    offset += count;
    386                    // continue to the next code point and loop back to this state
    387                    break;
    388                } else {
    389                    return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
    390                }
    391            default:
    392                UPRV_UNREACHABLE_EXIT;
    393        }
    394    }
    395    // End of string
    396    switch (state) {
    397        case STATE_BASE:
    398            // No more tokens in string.
    399            return {-1};
    400        case STATE_FIRST_QUOTE:
    401        case STATE_INSIDE_QUOTE:
    402            // For consistent behavior with the JDK and ICU 58, set an error here.
    403            status = U_ILLEGAL_ARGUMENT_ERROR;
    404            return {-1};
    405        case STATE_AFTER_QUOTE:
    406            // No more tokens in string.
    407            return {-1};
    408        case STATE_FIRST_CURR:
    409            return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
    410        case STATE_SECOND_CURR:
    411            return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
    412        case STATE_THIRD_CURR:
    413            return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
    414        case STATE_FOURTH_CURR:
    415            return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
    416        case STATE_FIFTH_CURR:
    417            return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
    418        case STATE_OVERFLOW_CURR:
    419            return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
    420        default:
    421            UPRV_UNREACHABLE_EXIT;
    422    }
    423 }
    424 
    425 bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
    426    // First check for the {-1} and default initializer syntax.
    427    if (tag.offset < 0) {
    428        return false;
    429    } else if (tag.offset == 0) {
    430        return string.length() > 0;
    431    }
    432    // The rest of the fields are safe to use now.
    433    // Special case: the last character in string is an end quote.
    434    if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
    435        string.charAt(tag.offset) == u'\'') {
    436        return false;
    437    } else if (tag.state != STATE_BASE) {
    438        return true;
    439    } else {
    440        return tag.offset < string.length();
    441    }
    442 }
    443 
    444 #endif /* #if !UCONFIG_NO_FORMATTING */