[ tor-browser ].git.dasho

collationiterator.cpp (37683B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2010-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationiterator.cpp
      9 *
     10 * created on: 2010oct27
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "utypeinfo.h"  // for 'typeid' to work
     15 
     16 #include "unicode/utypes.h"
     17 
     18 #if !UCONFIG_NO_COLLATION
     19 
     20 #include "unicode/ucharstrie.h"
     21 #include "unicode/ustringtrie.h"
     22 #include "charstr.h"
     23 #include "cmemory.h"
     24 #include "collation.h"
     25 #include "collationdata.h"
     26 #include "collationfcd.h"
     27 #include "collationiterator.h"
     28 #include "normalizer2impl.h"
     29 #include "uassert.h"
     30 #include "uvectr32.h"
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 CollationIterator::CEBuffer::~CEBuffer() {}
     35 
     36 UBool
     37 CollationIterator::CEBuffer::ensureAppendCapacity(int32_t appCap, UErrorCode &errorCode) {
     38    int32_t capacity = buffer.getCapacity();
     39    if((length + appCap) <= capacity) { return true; }
     40    if(U_FAILURE(errorCode)) { return false; }
     41    do {
     42        if(capacity < 1000) {
     43            capacity *= 4;
     44        } else {
     45            capacity *= 2;
     46        }
     47    } while(capacity < (length + appCap));
     48    int64_t *p = buffer.resize(capacity, length);
     49    if(p == nullptr) {
     50        errorCode = U_MEMORY_ALLOCATION_ERROR;
     51        return false;
     52    }
     53    return true;
     54 }
     55 
     56 // State of combining marks skipped in discontiguous contraction.
     57 // We create a state object on first use and keep it around deactivated between uses.
     58 class SkippedState : public UMemory {
     59 public:
     60    // Born active but empty.
     61    SkippedState() : pos(0), skipLengthAtMatch(0) {}
     62    void clear() {
     63        oldBuffer.remove();
     64        pos = 0;
     65        // The newBuffer is reset by setFirstSkipped().
     66    }
     67 
     68    UBool isEmpty() const { return oldBuffer.isEmpty(); }
     69 
     70    UBool hasNext() const { return pos < oldBuffer.length(); }
     71 
     72    // Requires hasNext().
     73    UChar32 next() {
     74        UChar32 c = oldBuffer.char32At(pos);
     75        pos += U16_LENGTH(c);
     76        return c;
     77    }
     78 
     79    // Accounts for one more input code point read beyond the end of the marks buffer.
     80    void incBeyond() {
     81        U_ASSERT(!hasNext());
     82        ++pos;
     83    }
     84 
     85    // Goes backward through the skipped-marks buffer.
     86    // Returns the number of code points read beyond the skipped marks
     87    // that need to be backtracked through normal input.
     88    int32_t backwardNumCodePoints(int32_t n) {
     89        int32_t length = oldBuffer.length();
     90        int32_t beyond = pos - length;
     91        if(beyond > 0) {
     92            if(beyond >= n) {
     93                // Not back far enough to re-enter the oldBuffer.
     94                pos -= n;
     95                return n;
     96            } else {
     97                // Back out all beyond-oldBuffer code points and re-enter the buffer.
     98                pos = oldBuffer.moveIndex32(length, beyond - n);
     99                return beyond;
    100            }
    101        } else {
    102            // Go backwards from inside the oldBuffer.
    103            pos = oldBuffer.moveIndex32(pos, -n);
    104            return 0;
    105        }
    106    }
    107 
    108    void setFirstSkipped(UChar32 c) {
    109        skipLengthAtMatch = 0;
    110        newBuffer.setTo(c);
    111    }
    112 
    113    void skip(UChar32 c) {
    114        newBuffer.append(c);
    115    }
    116 
    117    void recordMatch() { skipLengthAtMatch = newBuffer.length(); }
    118 
    119    // Replaces the characters we consumed with the newly skipped ones.
    120    void replaceMatch() {
    121        // Note: UnicodeString.replace() pins pos to at most length().
    122        oldBuffer.replace(0, pos, newBuffer, 0, skipLengthAtMatch);
    123        pos = 0;
    124    }
    125 
    126    void saveTrieState(const UCharsTrie &trie) { trie.saveState(state); }
    127    void resetToTrieState(UCharsTrie &trie) const { trie.resetToState(state); }
    128 
    129 private:
    130    // Combining marks skipped in previous discontiguous-contraction matching.
    131    // After that discontiguous contraction was completed, we start reading them from here.
    132    UnicodeString oldBuffer;
    133    // Combining marks newly skipped in current discontiguous-contraction matching.
    134    // These might have been read from the normal text or from the oldBuffer.
    135    UnicodeString newBuffer;
    136    // Reading index in oldBuffer,
    137    // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
    138    int32_t pos;
    139    // newBuffer.length() at the time of the last matching character.
    140    // When a partial match fails, we back out skipped and partial-matching input characters.
    141    int32_t skipLengthAtMatch;
    142    // We save the trie state before we attempt to match a character,
    143    // so that we can skip it and try the next one.
    144    UCharsTrie::State state;
    145 };
    146 
    147 CollationIterator::CollationIterator(const CollationIterator &other)
    148        : UObject(other),
    149          trie(other.trie),
    150          data(other.data),
    151          cesIndex(other.cesIndex),
    152          skipped(nullptr),
    153          numCpFwd(other.numCpFwd),
    154          isNumeric(other.isNumeric) {
    155    UErrorCode errorCode = U_ZERO_ERROR;
    156    int32_t length = other.ceBuffer.length;
    157    if(length > 0 && ceBuffer.ensureAppendCapacity(length, errorCode)) {
    158        for(int32_t i = 0; i < length; ++i) {
    159            ceBuffer.set(i, other.ceBuffer.get(i));
    160        }
    161        ceBuffer.length = length;
    162    } else {
    163        cesIndex = 0;
    164    }
    165 }
    166 
    167 CollationIterator::~CollationIterator() {
    168    delete skipped;
    169 }
    170 
    171 bool
    172 CollationIterator::operator==(const CollationIterator &other) const {
    173    // Subclasses: Call this method and then add more specific checks.
    174    // Compare the iterator state but not the collation data (trie & data fields):
    175    // Assume that the caller compares the data.
    176    // Ignore skipped since that should be unused between calls to nextCE().
    177    // (It only stays around to avoid another memory allocation.)
    178    if(!(typeid(*this) == typeid(other) &&
    179            ceBuffer.length == other.ceBuffer.length &&
    180            cesIndex == other.cesIndex &&
    181            numCpFwd == other.numCpFwd &&
    182            isNumeric == other.isNumeric)) {
    183        return false;
    184    }
    185    for(int32_t i = 0; i < ceBuffer.length; ++i) {
    186        if(ceBuffer.get(i) != other.ceBuffer.get(i)) { return false; }
    187    }
    188    return true;
    189 }
    190 
    191 void
    192 CollationIterator::reset() {
    193    cesIndex = ceBuffer.length = 0;
    194    if(skipped != nullptr) { skipped->clear(); }
    195 }
    196 
    197 int32_t
    198 CollationIterator::fetchCEs(UErrorCode &errorCode) {
    199    while(U_SUCCESS(errorCode) && nextCE(errorCode) != Collation::NO_CE) {
    200        // No need to loop for each expansion CE.
    201        cesIndex = ceBuffer.length;
    202    }
    203    return ceBuffer.length;
    204 }
    205 
    206 uint32_t
    207 CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    208    c = nextCodePoint(errorCode);
    209    return (c < 0) ? Collation::FALLBACK_CE32 : data->getCE32(c);
    210 }
    211 
    212 char16_t
    213 CollationIterator::handleGetTrailSurrogate() {
    214    return 0;
    215 }
    216 
    217 UBool
    218 CollationIterator::foundNULTerminator() {
    219    return false;
    220 }
    221 
    222 UBool
    223 CollationIterator::forbidSurrogateCodePoints() const {
    224    return false;
    225 }
    226 
    227 uint32_t
    228 CollationIterator::getDataCE32(UChar32 c) const {
    229    return data->getCE32(c);
    230 }
    231 
    232 uint32_t
    233 CollationIterator::getCE32FromBuilderData(uint32_t /*ce32*/, UErrorCode &errorCode) {
    234    if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
    235    return 0;
    236 }
    237 
    238 int64_t
    239 CollationIterator::nextCEFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
    240                                  UErrorCode &errorCode) {
    241    --ceBuffer.length;  // Undo ceBuffer.incLength().
    242    appendCEsFromCE32(d, c, ce32, true, errorCode);
    243    if(U_SUCCESS(errorCode)) {
    244        return ceBuffer.get(cesIndex++);
    245    } else {
    246        return Collation::NO_CE_PRIMARY;
    247    }
    248 }
    249 
    250 void
    251 CollationIterator::appendCEsFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
    252                                     UBool forward, UErrorCode &errorCode) {
    253    while(Collation::isSpecialCE32(ce32)) {
    254        switch(Collation::tagFromCE32(ce32)) {
    255        case Collation::FALLBACK_TAG:
    256        case Collation::RESERVED_TAG_3:
    257            if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
    258            return;
    259        case Collation::LONG_PRIMARY_TAG:
    260            ceBuffer.append(Collation::ceFromLongPrimaryCE32(ce32), errorCode);
    261            return;
    262        case Collation::LONG_SECONDARY_TAG:
    263            ceBuffer.append(Collation::ceFromLongSecondaryCE32(ce32), errorCode);
    264            return;
    265        case Collation::LATIN_EXPANSION_TAG:
    266            if(ceBuffer.ensureAppendCapacity(2, errorCode)) {
    267                ceBuffer.set(ceBuffer.length, Collation::latinCE0FromCE32(ce32));
    268                ceBuffer.set(ceBuffer.length + 1, Collation::latinCE1FromCE32(ce32));
    269                ceBuffer.length += 2;
    270            }
    271            return;
    272        case Collation::EXPANSION32_TAG: {
    273            const uint32_t *ce32s = d->ce32s + Collation::indexFromCE32(ce32);
    274            int32_t length = Collation::lengthFromCE32(ce32);
    275            if(ceBuffer.ensureAppendCapacity(length, errorCode)) {
    276                do {
    277                    ceBuffer.appendUnsafe(Collation::ceFromCE32(*ce32s++));
    278                } while(--length > 0);
    279            }
    280            return;
    281        }
    282        case Collation::EXPANSION_TAG: {
    283            const int64_t *ces = d->ces + Collation::indexFromCE32(ce32);
    284            int32_t length = Collation::lengthFromCE32(ce32);
    285            if(ceBuffer.ensureAppendCapacity(length, errorCode)) {
    286                do {
    287                    ceBuffer.appendUnsafe(*ces++);
    288                } while(--length > 0);
    289            }
    290            return;
    291        }
    292        case Collation::BUILDER_DATA_TAG:
    293            ce32 = getCE32FromBuilderData(ce32, errorCode);
    294            if(U_FAILURE(errorCode)) { return; }
    295            if(ce32 == Collation::FALLBACK_CE32) {
    296                d = data->base;
    297                ce32 = d->getCE32(c);
    298            }
    299            break;
    300        case Collation::PREFIX_TAG:
    301            if(forward) { backwardNumCodePoints(1, errorCode); }
    302            ce32 = getCE32FromPrefix(d, ce32, errorCode);
    303            if(forward) { forwardNumCodePoints(1, errorCode); }
    304            break;
    305        case Collation::CONTRACTION_TAG: {
    306            const char16_t *p = d->contexts + Collation::indexFromCE32(ce32);
    307            uint32_t defaultCE32 = CollationData::readCE32(p);  // Default if no suffix match.
    308            if(!forward) {
    309                // Backward contractions are handled by previousCEUnsafe().
    310                // c has contractions but they were not found.
    311                ce32 = defaultCE32;
    312                break;
    313            }
    314            UChar32 nextCp;
    315            if(skipped == nullptr && numCpFwd < 0) {
    316                // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
    317                // avoiding the function call and the nextSkippedCodePoint() overhead.
    318                nextCp = nextCodePoint(errorCode);
    319                if(nextCp < 0) {
    320                    // No more text.
    321                    ce32 = defaultCE32;
    322                    break;
    323                } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&
    324                        !CollationFCD::mayHaveLccc(nextCp)) {
    325                    // All contraction suffixes start with characters with lccc!=0
    326                    // but the next code point has lccc==0.
    327                    backwardNumCodePoints(1, errorCode);
    328                    ce32 = defaultCE32;
    329                    break;
    330                }
    331            } else {
    332                nextCp = nextSkippedCodePoint(errorCode);
    333                if(nextCp < 0) {
    334                    // No more text.
    335                    ce32 = defaultCE32;
    336                    break;
    337                } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&
    338                        !CollationFCD::mayHaveLccc(nextCp)) {
    339                    // All contraction suffixes start with characters with lccc!=0
    340                    // but the next code point has lccc==0.
    341                    backwardNumSkipped(1, errorCode);
    342                    ce32 = defaultCE32;
    343                    break;
    344                }
    345            }
    346            ce32 = nextCE32FromContraction(d, ce32, p + 2, defaultCE32, nextCp, errorCode);
    347            if(ce32 == Collation::NO_CE32) {
    348                // CEs from a discontiguous contraction plus the skipped combining marks
    349                // have been appended already.
    350                return;
    351            }
    352            break;
    353        }
    354        case Collation::DIGIT_TAG:
    355            if(isNumeric) {
    356                appendNumericCEs(ce32, forward, errorCode);
    357                return;
    358            } else {
    359                // Fetch the non-numeric-collation CE32 and continue.
    360                ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
    361                break;
    362            }
    363        case Collation::U0000_TAG:
    364            U_ASSERT(c == 0);
    365            if(forward && foundNULTerminator()) {
    366                // Handle NUL-termination. (Not needed in Java.)
    367                ceBuffer.append(Collation::NO_CE, errorCode);
    368                return;
    369            } else {
    370                // Fetch the normal ce32 for U+0000 and continue.
    371                ce32 = d->ce32s[0];
    372                break;
    373            }
    374        case Collation::HANGUL_TAG: {
    375            const uint32_t *jamoCE32s = d->jamoCE32s;
    376            c -= Hangul::HANGUL_BASE;
    377            UChar32 t = c % Hangul::JAMO_T_COUNT;
    378            c /= Hangul::JAMO_T_COUNT;
    379            UChar32 v = c % Hangul::JAMO_V_COUNT;
    380            c /= Hangul::JAMO_V_COUNT;
    381            if((ce32 & Collation::HANGUL_NO_SPECIAL_JAMO) != 0) {
    382                // None of the Jamo CE32s are isSpecialCE32().
    383                // Avoid recursive function calls and per-Jamo tests.
    384                if(ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3, errorCode)) {
    385                    ceBuffer.set(ceBuffer.length, Collation::ceFromCE32(jamoCE32s[c]));
    386                    ceBuffer.set(ceBuffer.length + 1, Collation::ceFromCE32(jamoCE32s[19 + v]));
    387                    ceBuffer.length += 2;
    388                    if(t != 0) {
    389                        ceBuffer.appendUnsafe(Collation::ceFromCE32(jamoCE32s[39 + t]));
    390                    }
    391                }
    392                return;
    393            } else {
    394                // We should not need to compute each Jamo code point.
    395                // In particular, there should be no offset or implicit ce32.
    396                appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[c], forward, errorCode);
    397                appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[19 + v], forward, errorCode);
    398                if(t == 0) { return; }
    399                // offset 39 = 19 + 21 - 1:
    400                // 19 = JAMO_L_COUNT
    401                // 21 = JAMO_T_COUNT
    402                // -1 = omit t==0
    403                ce32 = jamoCE32s[39 + t];
    404                c = U_SENTINEL;
    405                break;
    406            }
    407        }
    408        case Collation::LEAD_SURROGATE_TAG: {
    409            U_ASSERT(forward);  // Backward iteration should never see lead surrogate code _unit_ data.
    410            U_ASSERT(U16_IS_LEAD(c));
    411            char16_t trail;
    412            if(U16_IS_TRAIL(trail = handleGetTrailSurrogate())) {
    413                c = U16_GET_SUPPLEMENTARY(c, trail);
    414                ce32 &= Collation::LEAD_TYPE_MASK;
    415                if(ce32 == Collation::LEAD_ALL_UNASSIGNED) {
    416                    ce32 = Collation::UNASSIGNED_CE32;  // unassigned-implicit
    417                } else if(ce32 == Collation::LEAD_ALL_FALLBACK ||
    418                        (ce32 = d->getCE32FromSupplementary(c)) == Collation::FALLBACK_CE32) {
    419                    // fall back to the base data
    420                    d = d->base;
    421                    ce32 = d->getCE32FromSupplementary(c);
    422                }
    423            } else {
    424                // c is an unpaired surrogate.
    425                ce32 = Collation::UNASSIGNED_CE32;
    426            }
    427            break;
    428        }
    429        case Collation::OFFSET_TAG:
    430            U_ASSERT(c >= 0);
    431            ceBuffer.append(d->getCEFromOffsetCE32(c, ce32), errorCode);
    432            return;
    433        case Collation::IMPLICIT_TAG:
    434            U_ASSERT(c >= 0);
    435            if(U_IS_SURROGATE(c) && forbidSurrogateCodePoints()) {
    436                ce32 = Collation::FFFD_CE32;
    437                break;
    438            } else {
    439                ceBuffer.append(Collation::unassignedCEFromCodePoint(c), errorCode);
    440                return;
    441            }
    442        }
    443    }
    444    ceBuffer.append(Collation::ceFromSimpleCE32(ce32), errorCode);
    445 }
    446 
    447 uint32_t
    448 CollationIterator::getCE32FromPrefix(const CollationData *d, uint32_t ce32,
    449                                     UErrorCode &errorCode) {
    450    const char16_t *p = d->contexts + Collation::indexFromCE32(ce32);
    451    ce32 = CollationData::readCE32(p);  // Default if no prefix match.
    452    p += 2;
    453    // Number of code points read before the original code point.
    454    int32_t lookBehind = 0;
    455    UCharsTrie prefixes(p);
    456    for(;;) {
    457        UChar32 c = previousCodePoint(errorCode);
    458        if(c < 0) { break; }
    459        ++lookBehind;
    460        UStringTrieResult match = prefixes.nextForCodePoint(c);
    461        if(USTRINGTRIE_HAS_VALUE(match)) {
    462            ce32 = static_cast<uint32_t>(prefixes.getValue());
    463        }
    464        if(!USTRINGTRIE_HAS_NEXT(match)) { break; }
    465    }
    466    forwardNumCodePoints(lookBehind, errorCode);
    467    return ce32;
    468 }
    469 
    470 UChar32
    471 CollationIterator::nextSkippedCodePoint(UErrorCode &errorCode) {
    472    if(skipped != nullptr && skipped->hasNext()) { return skipped->next(); }
    473    if(numCpFwd == 0) { return U_SENTINEL; }
    474    UChar32 c = nextCodePoint(errorCode);
    475    if(skipped != nullptr && !skipped->isEmpty() && c >= 0) { skipped->incBeyond(); }
    476    if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
    477    return c;
    478 }
    479 
    480 void
    481 CollationIterator::backwardNumSkipped(int32_t n, UErrorCode &errorCode) {
    482    if(skipped != nullptr && !skipped->isEmpty()) {
    483        n = skipped->backwardNumCodePoints(n);
    484    }
    485    backwardNumCodePoints(n, errorCode);
    486    if(numCpFwd >= 0) { numCpFwd += n; }
    487 }
    488 
    489 uint32_t
    490 CollationIterator::nextCE32FromContraction(const CollationData *d, uint32_t contractionCE32,
    491                                           const char16_t *p, uint32_t ce32, UChar32 c,
    492                                           UErrorCode &errorCode) {
    493    // c: next code point after the original one
    494 
    495    // Number of code points read beyond the original code point.
    496    // Needed for discontiguous contraction matching.
    497    int32_t lookAhead = 1;
    498    // Number of code points read since the last match (initially only c).
    499    int32_t sinceMatch = 1;
    500    // Normally we only need a contiguous match,
    501    // and therefore need not remember the suffixes state from before a mismatch for retrying.
    502    // If we are already processing skipped combining marks, then we do track the state.
    503    UCharsTrie suffixes(p);
    504    if(skipped != nullptr && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); }
    505    UStringTrieResult match = suffixes.firstForCodePoint(c);
    506    for(;;) {
    507        UChar32 nextCp;
    508        if(USTRINGTRIE_HAS_VALUE(match)) {
    509            ce32 = static_cast<uint32_t>(suffixes.getValue());
    510            if(!USTRINGTRIE_HAS_NEXT(match) || (c = nextSkippedCodePoint(errorCode)) < 0) {
    511                return ce32;
    512            }
    513            if(skipped != nullptr && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); }
    514            sinceMatch = 1;
    515        } else if(match == USTRINGTRIE_NO_MATCH || (nextCp = nextSkippedCodePoint(errorCode)) < 0) {
    516            // No match for c, or partial match (USTRINGTRIE_NO_VALUE) and no further text.
    517            // Back up if necessary, and try a discontiguous contraction.
    518            if((contractionCE32 & Collation::CONTRACT_TRAILING_CCC) != 0 &&
    519                    // Discontiguous contraction matching extends an existing match.
    520                    // If there is no match yet, then there is nothing to do.
    521                    ((contractionCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||
    522                        sinceMatch < lookAhead)) {
    523                // The last character of at least one suffix has lccc!=0,
    524                // allowing for discontiguous contractions.
    525                // UCA S2.1.1 only processes non-starters immediately following
    526                // "a match in the table" (sinceMatch=1).
    527                if(sinceMatch > 1) {
    528                    // Return to the state after the last match.
    529                    // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
    530                    backwardNumSkipped(sinceMatch, errorCode);
    531                    c = nextSkippedCodePoint(errorCode);
    532                    lookAhead -= sinceMatch - 1;
    533                    sinceMatch = 1;
    534                }
    535                if(d->getFCD16(c) > 0xff) {
    536                    return nextCE32FromDiscontiguousContraction(
    537                        d, suffixes, ce32, lookAhead, c, errorCode);
    538                }
    539            }
    540            break;
    541        } else {
    542            // Continue after partial match (USTRINGTRIE_NO_VALUE) for c.
    543            // It does not have a result value, therefore it is not itself "a match in the table".
    544            // If a partially-matched c has ccc!=0 then
    545            // it might be skipped in discontiguous contraction.
    546            c = nextCp;
    547            ++sinceMatch;
    548        }
    549        ++lookAhead;
    550        match = suffixes.nextForCodePoint(c);
    551    }
    552    backwardNumSkipped(sinceMatch, errorCode);
    553    return ce32;
    554 }
    555 
    556 uint32_t
    557 CollationIterator::nextCE32FromDiscontiguousContraction(
    558        const CollationData *d, UCharsTrie &suffixes, uint32_t ce32,
    559        int32_t lookAhead, UChar32 c,
    560        UErrorCode &errorCode) {
    561    if(U_FAILURE(errorCode)) { return 0; }
    562 
    563    // UCA section 3.3.2 Contractions:
    564    // Contractions that end with non-starter characters
    565    // are known as discontiguous contractions.
    566    // ... discontiguous contractions must be detected in input text
    567    // whenever the final sequence of non-starter characters could be rearranged
    568    // so as to make a contiguous matching sequence that is canonically equivalent.
    569 
    570    // UCA: http://www.unicode.org/reports/tr10/#S2.1
    571    // S2.1 Find the longest initial substring S at each point that has a match in the table.
    572    // S2.1.1 If there are any non-starters following S, process each non-starter C.
    573    // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
    574    //     Note: A non-starter in a string is called blocked
    575    //     if there is another non-starter of the same canonical combining class or zero
    576    //     between it and the last character of canonical combining class 0.
    577    // S2.1.3 If there is a match, replace S by S + C, and remove C.
    578 
    579    // First: Is a discontiguous contraction even possible?
    580    uint16_t fcd16 = d->getFCD16(c);
    581    U_ASSERT(fcd16 > 0xff);  // The caller checked this already, as a shortcut.
    582    UChar32 nextCp = nextSkippedCodePoint(errorCode);
    583    if(nextCp < 0) {
    584        // No further text.
    585        backwardNumSkipped(1, errorCode);
    586        return ce32;
    587    }
    588    ++lookAhead;
    589    uint8_t prevCC = static_cast<uint8_t>(fcd16);
    590    fcd16 = d->getFCD16(nextCp);
    591    if(fcd16 <= 0xff) {
    592        // The next code point after c is a starter (S2.1.1 "process each non-starter").
    593        backwardNumSkipped(2, errorCode);
    594        return ce32;
    595    }
    596 
    597    // We have read and matched (lookAhead-2) code points,
    598    // read non-matching c and peeked ahead at nextCp.
    599    // Return to the state before the mismatch and continue matching with nextCp.
    600    if(skipped == nullptr || skipped->isEmpty()) {
    601        if(skipped == nullptr) {
    602            skipped = new SkippedState();
    603            if(skipped == nullptr) {
    604                errorCode = U_MEMORY_ALLOCATION_ERROR;
    605                return 0;
    606            }
    607        }
    608        suffixes.reset();
    609        if(lookAhead > 2) {
    610            // Replay the partial match so far.
    611            backwardNumCodePoints(lookAhead, errorCode);
    612            suffixes.firstForCodePoint(nextCodePoint(errorCode));
    613            for(int32_t i = 3; i < lookAhead; ++i) {
    614                suffixes.nextForCodePoint(nextCodePoint(errorCode));
    615            }
    616            // Skip c (which did not match) and nextCp (which we will try now).
    617            forwardNumCodePoints(2, errorCode);
    618        }
    619        skipped->saveTrieState(suffixes);
    620    } else {
    621        // Reset to the trie state before the failed match of c.
    622        skipped->resetToTrieState(suffixes);
    623    }
    624 
    625    skipped->setFirstSkipped(c);
    626    // Number of code points read since the last match (at this point: c and nextCp).
    627    int32_t sinceMatch = 2;
    628    c = nextCp;
    629    for(;;) {
    630        UStringTrieResult match;
    631        // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
    632        if(prevCC < (fcd16 >> 8) && USTRINGTRIE_HAS_VALUE(match = suffixes.nextForCodePoint(c))) {
    633            // "If there is a match, replace S by S + C, and remove C." (S2.1.3)
    634            // Keep prevCC unchanged.
    635            ce32 = static_cast<uint32_t>(suffixes.getValue());
    636            sinceMatch = 0;
    637            skipped->recordMatch();
    638            if(!USTRINGTRIE_HAS_NEXT(match)) { break; }
    639            skipped->saveTrieState(suffixes);
    640        } else {
    641            // No match for "S + C", skip C.
    642            skipped->skip(c);
    643            skipped->resetToTrieState(suffixes);
    644            prevCC = static_cast<uint8_t>(fcd16);
    645        }
    646        if((c = nextSkippedCodePoint(errorCode)) < 0) { break; }
    647        ++sinceMatch;
    648        fcd16 = d->getFCD16(c);
    649        if(fcd16 <= 0xff) {
    650            // The next code point after c is a starter (S2.1.1 "process each non-starter").
    651            break;
    652        }
    653    }
    654    backwardNumSkipped(sinceMatch, errorCode);
    655    UBool isTopDiscontiguous = skipped->isEmpty();
    656    skipped->replaceMatch();
    657    if(isTopDiscontiguous && !skipped->isEmpty()) {
    658        // We did get a match after skipping one or more combining marks,
    659        // and we are not in a recursive discontiguous contraction.
    660        // Append CEs from the contraction ce32
    661        // and then from the combining marks that we skipped before the match.
    662        c = U_SENTINEL;
    663        for(;;) {
    664            appendCEsFromCE32(d, c, ce32, true, errorCode);
    665            // Fetch CE32s for skipped combining marks from the normal data, with fallback,
    666            // rather than from the CollationData where we found the contraction.
    667            if(!skipped->hasNext()) { break; }
    668            c = skipped->next();
    669            ce32 = getDataCE32(c);
    670            if(ce32 == Collation::FALLBACK_CE32) {
    671                d = data->base;
    672                ce32 = d->getCE32(c);
    673            } else {
    674                d = data;
    675            }
    676            // Note: A nested discontiguous-contraction match
    677            // replaces consumed combining marks with newly skipped ones
    678            // and resets the reading position to the beginning.
    679        }
    680        skipped->clear();
    681        ce32 = Collation::NO_CE32;  // Signal to the caller that the result is in the ceBuffer.
    682    }
    683    return ce32;
    684 }
    685 
    686 void
    687 CollationIterator::appendNumericCEs(uint32_t ce32, UBool forward, UErrorCode &errorCode) {
    688    // Collect digits.
    689    CharString digits;
    690    if(forward) {
    691        for(;;) {
    692            char digit = Collation::digitFromCE32(ce32);
    693            digits.append(digit, errorCode);
    694            if(numCpFwd == 0) { break; }
    695            UChar32 c = nextCodePoint(errorCode);
    696            if(c < 0) { break; }
    697            ce32 = data->getCE32(c);
    698            if(ce32 == Collation::FALLBACK_CE32) {
    699                ce32 = data->base->getCE32(c);
    700            }
    701            if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {
    702                backwardNumCodePoints(1, errorCode);
    703                break;
    704            }
    705            if(numCpFwd > 0) { --numCpFwd; }
    706        }
    707    } else {
    708        for(;;) {
    709            char digit = Collation::digitFromCE32(ce32);
    710            digits.append(digit, errorCode);
    711            UChar32 c = previousCodePoint(errorCode);
    712            if(c < 0) { break; }
    713            ce32 = data->getCE32(c);
    714            if(ce32 == Collation::FALLBACK_CE32) {
    715                ce32 = data->base->getCE32(c);
    716            }
    717            if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {
    718                forwardNumCodePoints(1, errorCode);
    719                break;
    720            }
    721        }
    722        // Reverse the digit string.
    723        char *p = digits.data();
    724        char *q = p + digits.length() - 1;
    725        while(p < q) {
    726            char digit = *p;
    727            *p++ = *q;
    728            *q-- = digit;
    729        }
    730    }
    731    if(U_FAILURE(errorCode)) { return; }
    732    int32_t pos = 0;
    733    do {
    734        // Skip leading zeros.
    735        while(pos < (digits.length() - 1) && digits[pos] == 0) { ++pos; }
    736        // Write a sequence of CEs for at most 254 digits at a time.
    737        int32_t segmentLength = digits.length() - pos;
    738        if(segmentLength > 254) { segmentLength = 254; }
    739        appendNumericSegmentCEs(digits.data() + pos, segmentLength, errorCode);
    740        pos += segmentLength;
    741    } while(U_SUCCESS(errorCode) && pos < digits.length());
    742 }
    743 
    744 void
    745 CollationIterator::appendNumericSegmentCEs(const char *digits, int32_t length, UErrorCode &errorCode) {
    746    U_ASSERT(1 <= length && length <= 254);
    747    U_ASSERT(length == 1 || digits[0] != 0);
    748    uint32_t numericPrimary = data->numericPrimary;
    749    // Note: We use primary byte values 2..255: digits are not compressible.
    750    if(length <= 7) {
    751        // Very dense encoding for small numbers.
    752        int32_t value = digits[0];
    753        for(int32_t i = 1; i < length; ++i) {
    754            value = value * 10 + digits[i];
    755        }
    756        // Primary weight second byte values:
    757        //     74 byte values   2.. 75 for small numbers in two-byte primary weights.
    758        //     40 byte values  76..115 for medium numbers in three-byte primary weights.
    759        //     16 byte values 116..131 for large numbers in four-byte primary weights.
    760        //    124 byte values 132..255 for very large numbers with 4..127 digit pairs.
    761        int32_t firstByte = 2;
    762        int32_t numBytes = 74;
    763        if(value < numBytes) {
    764            // Two-byte primary for 0..73, good for day & month numbers etc.
    765            uint32_t primary = numericPrimary | ((firstByte + value) << 16);
    766            ceBuffer.append(Collation::makeCE(primary), errorCode);
    767            return;
    768        }
    769        value -= numBytes;
    770        firstByte += numBytes;
    771        numBytes = 40;
    772        if(value < numBytes * 254) {
    773            // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
    774            uint32_t primary = numericPrimary |
    775                ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);
    776            ceBuffer.append(Collation::makeCE(primary), errorCode);
    777            return;
    778        }
    779        value -= numBytes * 254;
    780        firstByte += numBytes;
    781        numBytes = 16;
    782        if(value < numBytes * 254 * 254) {
    783            // Four-byte primary for 10234..1042489=10234+16*254*254-1.
    784            uint32_t primary = numericPrimary | (2 + value % 254);
    785            value /= 254;
    786            primary |= (2 + value % 254) << 8;
    787            value /= 254;
    788            primary |= (firstByte + value % 254) << 16;
    789            ceBuffer.append(Collation::makeCE(primary), errorCode);
    790            return;
    791        }
    792        // original value > 1042489
    793    }
    794    U_ASSERT(length >= 7);
    795 
    796    // The second primary byte value 132..255 indicates the number of digit pairs (4..127),
    797    // then we generate primary bytes with those pairs.
    798    // Omit trailing 00 pairs.
    799    // Decrement the value for the last pair.
    800 
    801    // Set the exponent. 4 pairs->132, 5 pairs->133, ..., 127 pairs->255.
    802    int32_t numPairs = (length + 1) / 2;
    803    uint32_t primary = numericPrimary | ((132 - 4 + numPairs) << 16);
    804    // Find the length without trailing 00 pairs.
    805    while(digits[length - 1] == 0 && digits[length - 2] == 0) {
    806        length -= 2;
    807    }
    808    // Read the first pair.
    809    uint32_t pair;
    810    int32_t pos;
    811    if(length & 1) {
    812        // Only "half a pair" if we have an odd number of digits.
    813        pair = digits[0];
    814        pos = 1;
    815    } else {
    816        pair = digits[0] * 10 + digits[1];
    817        pos = 2;
    818    }
    819    pair = 11 + 2 * pair;
    820    // Add the pairs of digits between pos and length.
    821    int32_t shift = 8;
    822    while(pos < length) {
    823        if(shift == 0) {
    824            // Every three pairs/bytes we need to store a 4-byte-primary CE
    825            // and start with a new CE with the '0' primary lead byte.
    826            primary |= pair;
    827            ceBuffer.append(Collation::makeCE(primary), errorCode);
    828            primary = numericPrimary;
    829            shift = 16;
    830        } else {
    831            primary |= pair << shift;
    832            shift -= 8;
    833        }
    834        pair = 11 + 2 * (digits[pos] * 10 + digits[pos + 1]);
    835        pos += 2;
    836    }
    837    primary |= (pair - 1) << shift;
    838    ceBuffer.append(Collation::makeCE(primary), errorCode);
    839 }
    840 
    841 int64_t
    842 CollationIterator::previousCE(UVector32 &offsets, UErrorCode &errorCode) {
    843    if(ceBuffer.length > 0) {
    844        // Return the previous buffered CE.
    845        return ceBuffer.get(--ceBuffer.length);
    846    }
    847    offsets.removeAllElements();
    848    int32_t limitOffset = getOffset();
    849    UChar32 c = previousCodePoint(errorCode);
    850    if(c < 0) { return Collation::NO_CE; }
    851    if(data->isUnsafeBackward(c, isNumeric)) {
    852        return previousCEUnsafe(c, offsets, errorCode);
    853    }
    854    // Simple, safe-backwards iteration:
    855    // Get a CE going backwards, handle prefixes but no contractions.
    856    uint32_t ce32 = data->getCE32(c);
    857    const CollationData *d;
    858    if(ce32 == Collation::FALLBACK_CE32) {
    859        d = data->base;
    860        ce32 = d->getCE32(c);
    861    } else {
    862        d = data;
    863    }
    864    if(Collation::isSimpleOrLongCE32(ce32)) {
    865        return Collation::ceFromCE32(ce32);
    866    }
    867    appendCEsFromCE32(d, c, ce32, false, errorCode);
    868    if(U_SUCCESS(errorCode)) {
    869        if(ceBuffer.length > 1) {
    870            offsets.addElement(getOffset(), errorCode);
    871            // For an expansion, the offset of each non-initial CE is the limit offset,
    872            // consistent with forward iteration.
    873            while(offsets.size() <= ceBuffer.length) {
    874                offsets.addElement(limitOffset, errorCode);
    875            }
    876        }
    877        return ceBuffer.get(--ceBuffer.length);
    878    } else {
    879        return Collation::NO_CE_PRIMARY;
    880    }
    881 }
    882 
    883 int64_t
    884 CollationIterator::previousCEUnsafe(UChar32 c, UVector32 &offsets, UErrorCode &errorCode) {
    885    // We just move through the input counting safe and unsafe code points
    886    // without collecting the unsafe-backward substring into a buffer and
    887    // switching to it.
    888    // This is to keep the logic simple. Otherwise we would have to handle
    889    // prefix matching going before the backward buffer, switching
    890    // to iteration and back, etc.
    891    // In the most important case of iterating over a normal string,
    892    // reading from the string itself is already maximally fast.
    893    // The only drawback there is that after getting the CEs we always
    894    // skip backward to the safe character rather than switching out
    895    // of a backwardBuffer.
    896    // But this should not be the common case for previousCE(),
    897    // and correctness and maintainability are more important than
    898    // complex optimizations.
    899    // Find the first safe character before c.
    900    int32_t numBackward = 1;
    901    while((c = previousCodePoint(errorCode)) >= 0) {
    902        ++numBackward;
    903        if(!data->isUnsafeBackward(c, isNumeric)) {
    904            break;
    905        }
    906    }
    907    // Set the forward iteration limit.
    908    // Note: This counts code points.
    909    // We cannot enforce a limit in the middle of a surrogate pair or similar.
    910    numCpFwd = numBackward;
    911    // Reset the forward iterator.
    912    cesIndex = 0;
    913    U_ASSERT(ceBuffer.length == 0);
    914    // Go forward and collect the CEs.
    915    int32_t offset = getOffset();
    916    while(numCpFwd > 0) {
    917        // nextCE() normally reads one code point.
    918        // Contraction matching and digit specials read more and check numCpFwd.
    919        --numCpFwd;
    920        // Append one or more CEs to the ceBuffer.
    921        (void)nextCE(errorCode);
    922        U_ASSERT(U_FAILURE(errorCode) || ceBuffer.get(ceBuffer.length - 1) != Collation::NO_CE);
    923        // No need to loop for getting each expansion CE from nextCE().
    924        cesIndex = ceBuffer.length;
    925        // However, we need to write an offset for each CE.
    926        // This is for CollationElementIterator::getOffset() to return
    927        // intermediate offsets from the unsafe-backwards segment.
    928        U_ASSERT(offsets.size() < ceBuffer.length);
    929        offsets.addElement(offset, errorCode);
    930        // For an expansion, the offset of each non-initial CE is the limit offset,
    931        // consistent with forward iteration.
    932        offset = getOffset();
    933        while(offsets.size() < ceBuffer.length) {
    934            offsets.addElement(offset, errorCode);
    935        }
    936    }
    937    U_ASSERT(offsets.size() == ceBuffer.length);
    938    // End offset corresponding to just after the unsafe-backwards segment.
    939    offsets.addElement(offset, errorCode);
    940    // Reset the forward iteration limit
    941    // and move backward to before the segment for which we fetched CEs.
    942    numCpFwd = -1;
    943    backwardNumCodePoints(numBackward, errorCode);
    944    // Use the collected CEs and return the last one.
    945    cesIndex = 0;  // Avoid cesIndex > ceBuffer.length when that gets decremented.
    946    if(U_SUCCESS(errorCode)) {
    947        return ceBuffer.get(--ceBuffer.length);
    948    } else {
    949        return Collation::NO_CE_PRIMARY;
    950    }
    951 }
    952 
    953 U_NAMESPACE_END
    954 
    955 #endif  // !UCONFIG_NO_COLLATION
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE