tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

uitercollationiterator.cpp (14481B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2012-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * uitercollationiterator.cpp
      9 *
     10 * created on: 2012sep23 (from utf16collationiterator.cpp)
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/uiter.h"
     19 #include "charstr.h"
     20 #include "cmemory.h"
     21 #include "collation.h"
     22 #include "collationdata.h"
     23 #include "collationfcd.h"
     24 #include "collationiterator.h"
     25 #include "normalizer2impl.h"
     26 #include "uassert.h"
     27 #include "uitercollationiterator.h"
     28 
     29 U_NAMESPACE_BEGIN
     30 
     31 UIterCollationIterator::~UIterCollationIterator() {}
     32 
     33 void
     34 UIterCollationIterator::resetToOffset(int32_t newOffset) {
     35    reset();
     36    iter.move(&iter, newOffset, UITER_START);
     37 }
     38 
     39 int32_t
     40 UIterCollationIterator::getOffset() const {
     41    return iter.getIndex(&iter, UITER_CURRENT);
     42 }
     43 
     44 uint32_t
     45 UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
     46    c = iter.next(&iter);
     47    if(c < 0) {
     48        return Collation::FALLBACK_CE32;
     49    }
     50    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
     51 }
     52 
     53 char16_t
     54 UIterCollationIterator::handleGetTrailSurrogate() {
     55    UChar32 trail = iter.next(&iter);
     56    if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); }
     57    return static_cast<char16_t>(trail);
     58 }
     59 
     60 UChar32
     61 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
     62    return uiter_next32(&iter);
     63 }
     64 
     65 UChar32
     66 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
     67    return uiter_previous32(&iter);
     68 }
     69 
     70 void
     71 UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
     72    while(num > 0 && (uiter_next32(&iter)) >= 0) {
     73        --num;
     74    }
     75 }
     76 
     77 void
     78 UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
     79    while(num > 0 && (uiter_previous32(&iter)) >= 0) {
     80        --num;
     81    }
     82 }
     83 
     84 // FCDUIterCollationIterator ----------------------------------------------- ***
     85 
     86 FCDUIterCollationIterator::~FCDUIterCollationIterator() {}
     87 
     88 void
     89 FCDUIterCollationIterator::resetToOffset(int32_t newOffset) {
     90    UIterCollationIterator::resetToOffset(newOffset);
     91    start = newOffset;
     92    state = ITER_CHECK_FWD;
     93 }
     94 
     95 int32_t
     96 FCDUIterCollationIterator::getOffset() const {
     97    if(state <= ITER_CHECK_BWD) {
     98        return iter.getIndex(&iter, UITER_CURRENT);
     99    } else if(state == ITER_IN_FCD_SEGMENT) {
    100        return pos;
    101    } else if(pos == 0) {
    102        return start;
    103    } else {
    104        return limit;
    105    }
    106 }
    107 
    108 uint32_t
    109 FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    110    for(;;) {
    111        if(state == ITER_CHECK_FWD) {
    112            c = iter.next(&iter);
    113            if(c < 0) {
    114                return Collation::FALLBACK_CE32;
    115            }
    116            if(CollationFCD::hasTccc(c)) {
    117                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    118                        CollationFCD::hasLccc(iter.current(&iter))) {
    119                    iter.previous(&iter);
    120                    if(!nextSegment(errorCode)) {
    121                        c = U_SENTINEL;
    122                        return Collation::FALLBACK_CE32;
    123                    }
    124                    continue;
    125                }
    126            }
    127            break;
    128        } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
    129            c = iter.next(&iter);
    130            ++pos;
    131            U_ASSERT(c >= 0);
    132            break;
    133        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
    134            c = normalized[pos++];
    135            break;
    136        } else {
    137            switchToForward();
    138        }
    139    }
    140    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
    141 }
    142 
    143 char16_t
    144 FCDUIterCollationIterator::handleGetTrailSurrogate() {
    145    if(state <= ITER_IN_FCD_SEGMENT) {
    146        UChar32 trail = iter.next(&iter);
    147        if(U16_IS_TRAIL(trail)) {
    148            if(state == ITER_IN_FCD_SEGMENT) { ++pos; }
    149        } else if(trail >= 0) {
    150            iter.previous(&iter);
    151        }
    152        return static_cast<char16_t>(trail);
    153    } else {
    154        U_ASSERT(pos < normalized.length());
    155        char16_t trail;
    156        if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
    157        return trail;
    158    }
    159 }
    160 
    161 UChar32
    162 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
    163    UChar32 c;
    164    for(;;) {
    165        if(state == ITER_CHECK_FWD) {
    166            c = iter.next(&iter);
    167            if(c < 0) {
    168                return c;
    169            }
    170            if(CollationFCD::hasTccc(c)) {
    171                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    172                        CollationFCD::hasLccc(iter.current(&iter))) {
    173                    iter.previous(&iter);
    174                    if(!nextSegment(errorCode)) {
    175                        return U_SENTINEL;
    176                    }
    177                    continue;
    178                }
    179            }
    180            if(U16_IS_LEAD(c)) {
    181                UChar32 trail = iter.next(&iter);
    182                if(U16_IS_TRAIL(trail)) {
    183                    return U16_GET_SUPPLEMENTARY(c, trail);
    184                } else if(trail >= 0) {
    185                    iter.previous(&iter);
    186                }
    187            }
    188            return c;
    189        } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
    190            c = uiter_next32(&iter);
    191            pos += U16_LENGTH(c);
    192            U_ASSERT(c >= 0);
    193            return c;
    194        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
    195            c = normalized.char32At(pos);
    196            pos += U16_LENGTH(c);
    197            return c;
    198        } else {
    199            switchToForward();
    200        }
    201    }
    202 }
    203 
    204 UChar32
    205 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
           // Returns the code point before the current position and moves back over it.
           // Returns U_SENTINEL when the iterator has nothing before the current
           // position, or when previousSegment() reports failure.
           // The loop dispatches on `state` and retries after every state change.
    206    UChar32 c;
    207    for(;;) {
    208        if(state == ITER_CHECK_BWD) {
    209            c = iter.previous(&iter);
    210            if(c < 0) {
                       // Nothing before this position: record an empty segment at offset 0.
    211                start = pos = 0;
    212                state = ITER_IN_FCD_SEGMENT;
    213                return U_SENTINEL;
    214            }
    215            if(CollationFCD::hasLccc(c)) {
                       // prev stays U_SENTINEL unless the second operand of || below runs;
                       // the || short-circuits when the Tibetan test on c is already true.
    216                UChar32 prev = U_SENTINEL;
    217                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    218                        CollationFCD::hasTccc(prev = iter.previous(&iter))) {
                           // Step forward again over c, and over prev if one was read,
                           // before handing the position to previousSegment().
    219                    iter.next(&iter);
    220                    if(prev >= 0) {
    221                        iter.next(&iter);
    222                    }
    223                    if(!previousSegment(errorCode)) {
    224                        return U_SENTINEL;
    225                    }
    226                    continue;
    227                }
    228                // hasLccc(trail)=true for all trail surrogates
    229                if(U16_IS_TRAIL(c)) {
                           // If no non-negative prev is available yet, try reading one.
    230                    if(prev < 0) {
    231                        prev = iter.previous(&iter);
    232                    }
                           // A lead surrogate before the trail completes a supplementary
                           // code point; the iterator stays before the lead.
    233                    if(U16_IS_LEAD(prev)) {
    234                        return U16_GET_SUPPLEMENTARY(prev, c);
    235                    }
    236                }
                       // prev was only looked at, not returned: step forward over it again
                       // so the iterator ends up just before c.
    237                if(prev >= 0) {
    238                    iter.next(&iter);
    239                }
    240            }
    241            return c;
    242        } else if(state == ITER_IN_FCD_SEGMENT && pos != start) {
                   // Still inside the current segment: read directly and move pos back
                   // by the number of UTF-16 units in c.
    243            c = uiter_previous32(&iter);
    244            pos -= U16_LENGTH(c);
    245            U_ASSERT(c >= 0);
    246            return c;
    247        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
                   // Deliver from the normalized buffer; here pos indexes into it.
    248            c = normalized.char32At(pos - 1);
    249            pos -= U16_LENGTH(c);
    250            return c;
    251        } else {
                   // Nothing left behind the position in this state: switch to backward
                   // checking and retry.
    252            switchToBackward();
    253        }
    254    }
    255 }
    256 
    257 void
    258 FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    259    // Specify the class to avoid a virtual-function indirection.
    260    // In Java, we would declare this class final.
    261    while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) {
    262        --num;
    263    }
    264 }
    265 
    266 void
    267 FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    268    // Specify the class to avoid a virtual-function indirection.
    269    // In Java, we would declare this class final.
    270    while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) {
    271        --num;
    272    }
    273 }
    274 
    275 void
    276 FCDUIterCollationIterator::switchToForward() {
    277    U_ASSERT(state == ITER_CHECK_BWD ||
    278             (state == ITER_IN_FCD_SEGMENT && pos == limit) ||
    279             (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length()));
    280    if(state == ITER_CHECK_BWD) {
    281        // Turn around from backward checking.
    282        start = pos = iter.getIndex(&iter, UITER_CURRENT);
    283        if(pos == limit) {
    284            state = ITER_CHECK_FWD;  // Check forward.
    285        } else {  // pos < limit
    286            state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
    287        }
    288    } else {
    289        // Reached the end of the FCD segment.
    290        if(state == ITER_IN_FCD_SEGMENT) {
    291            // The input text segment is FCD, extend it forward.
    292        } else {
    293            // The input text segment needed to be normalized.
    294            // Switch to checking forward from it.
    295            if(state == IN_NORM_ITER_AT_START) {
    296                iter.move(&iter, limit - start, UITER_CURRENT);
    297            }
    298            start = limit;
    299        }
    300        state = ITER_CHECK_FWD;
    301    }
    302 }
    303 
    304 UBool
    305 FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) {
           // Scans forward from the iterator's current index, collecting code points into s
           // until a boundary is found.
           // - If the collected text passes the check, the iterator is moved back to where
           //   the scan began, limit marks the end of the run, and the state becomes
           //   ITER_IN_FCD_SEGMENT.
           // - If it fails, the run is extended to the next boundary, passed to normalize(),
           //   and the state becomes IN_NORM_ITER_AT_LIMIT with pos = 0.
           // Returns false if errorCode already indicates failure or normalize() fails.
    306    if(U_FAILURE(errorCode)) { return false; }
    307    U_ASSERT(state == ITER_CHECK_FWD);
    308    // The input text [start..(iter index)[ passes the FCD check.
    309    pos = iter.getIndex(&iter, UITER_CURRENT);
    310    // Collect the characters being checked, in case they need to be normalized.
    311    UnicodeString s;
           // Low byte of the previous character's fcd16 value; 0 before the first character.
    312    uint8_t prevCC = 0;
    313    for(;;) {
    314        // Fetch the next character and its fcd16 value.
    315        UChar32 c = uiter_next32(&iter);
    316        if(c < 0) { break; }
    317        uint16_t fcd16 = nfcImpl.getFCD16(c);
               // The high byte of fcd16 is used as this character's leading value.
    318        uint8_t leadCC = static_cast<uint8_t>(fcd16 >> 8);
    319        if(leadCC == 0 && !s.isEmpty()) {
    320            // FCD boundary before this character.
                   // Un-read it so that it is not part of this segment.
    321            uiter_previous32(&iter);
    322            break;
    323        }
    324        s.append(c);
    325        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    326            // Fails FCD check. Find the next FCD boundary and normalize.
    327            for(;;) {
    328                c = uiter_next32(&iter);
    329                if(c < 0) { break; }
                       // A value <= 0xff means the high byte is 0: stop before this character.
    330                if(nfcImpl.getFCD16(c) <= 0xff) {
    331                    uiter_previous32(&iter);
    332                    break;
    333                }
    334                s.append(c);
    335            }
    336            if(!normalize(s, errorCode)) { return false; }
                   // Remember the range of the collected text; from here on pos indexes
                   // into the normalized buffer instead.
    337            start = pos;
    338            limit = pos + s.length();
    339            state = IN_NORM_ITER_AT_LIMIT;
    340            pos = 0;
    341            return true;
    342        }
               // The low byte of fcd16 is carried over as prevCC for the next comparison.
    343        prevCC = static_cast<uint8_t>(fcd16);
    344        if(prevCC == 0) {
    345            // FCD boundary after the last character.
    346            break;
    347        }
    348    }
           // The collected text passed the check: mark its end and rewind the iterator
           // by its length so it can be read from its beginning.
    349    limit = pos + s.length();
    350    U_ASSERT(pos != limit);
    351    iter.move(&iter, -s.length(), UITER_CURRENT);
    352    state = ITER_IN_FCD_SEGMENT;
    353    return true;
    354 }
    355 
    356 void
    357 FCDUIterCollationIterator::switchToBackward() {
    358    U_ASSERT(state == ITER_CHECK_FWD ||
    359             (state == ITER_IN_FCD_SEGMENT && pos == start) ||
    360             (state >= IN_NORM_ITER_AT_LIMIT && pos == 0));
    361    if(state == ITER_CHECK_FWD) {
    362        // Turn around from forward checking.
    363        limit = pos = iter.getIndex(&iter, UITER_CURRENT);
    364        if(pos == start) {
    365            state = ITER_CHECK_BWD;  // Check backward.
    366        } else {  // pos > start
    367            state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
    368        }
    369    } else {
    370        // Reached the start of the FCD segment.
    371        if(state == ITER_IN_FCD_SEGMENT) {
    372            // The input text segment is FCD, extend it backward.
    373        } else {
    374            // The input text segment needed to be normalized.
    375            // Switch to checking backward from it.
    376            if(state == IN_NORM_ITER_AT_LIMIT) {
    377                iter.move(&iter, start - limit, UITER_CURRENT);
    378            }
    379            limit = start;
    380        }
    381        state = ITER_CHECK_BWD;
    382    }
    383 }
    384 
    385 UBool
    386 FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) {
           // Scans backward from the iterator's current index, collecting code points into s
           // (in reverse order) until a boundary is found.
           // - If the collected text passes the check, the iterator is moved forward to where
           //   the scan began, start marks the beginning of the run, and the state becomes
           //   ITER_IN_FCD_SEGMENT.
           // - If it fails, the run is extended to the previous boundary, reversed, passed to
           //   normalize(), and the state becomes IN_NORM_ITER_AT_START with pos at the end
           //   of the normalized buffer.
           // Returns false if errorCode already indicates failure or normalize() fails.
    387    if(U_FAILURE(errorCode)) { return false; }
    388    U_ASSERT(state == ITER_CHECK_BWD);
    389    // The input text [(iter index)..limit[ passes the FCD check.
    390    pos = iter.getIndex(&iter, UITER_CURRENT);
    391    // Collect the characters being checked, in case they need to be normalized.
    392    UnicodeString s;
           // High byte of the following character's fcd16 value; 0 before the first character.
    393    uint8_t nextCC = 0;
    394    for(;;) {
    395        // Fetch the previous character and its fcd16 value.
    396        UChar32 c = uiter_previous32(&iter);
    397        if(c < 0) { break; }
    398        uint16_t fcd16 = nfcImpl.getFCD16(c);
               // The low byte of fcd16 is used as this character's trailing value.
    399        uint8_t trailCC = static_cast<uint8_t>(fcd16);
    400        if(trailCC == 0 && !s.isEmpty()) {
    401            // FCD boundary after this character.
                   // Step forward over it again so that it is not part of this segment.
    402            uiter_next32(&iter);
    403            break;
    404        }
    405        s.append(c);
    406        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
    407                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    408            // Fails FCD check. Find the previous FCD boundary and normalize.
                   // Keep going while the last character read has a nonzero high byte.
    409            while(fcd16 > 0xff) {
    410                c = uiter_previous32(&iter);
    411                if(c < 0) { break; }
    412                fcd16 = nfcImpl.getFCD16(c);
    413                if(fcd16 == 0) {
                           // Stop after this character: step forward over it again.
    414                    (void)uiter_next32(&iter);
    415                    break;
    416                }
    417                s.append(c);
    418            }
                   // Characters were appended while moving backward; restore text order.
    419            s.reverse();
    420            if(!normalize(s, errorCode)) { return false; }
                   // Remember the range of the collected text; from here on pos indexes
                   // into the normalized buffer instead, starting at its end.
    421            limit = pos;
    422            start = pos - s.length();
    423            state = IN_NORM_ITER_AT_START;
    424            pos = normalized.length();
    425            return true;
    426        }
               // The high byte of fcd16 is carried over as nextCC for the next comparison.
    427        nextCC = static_cast<uint8_t>(fcd16 >> 8);
    428        if(nextCC == 0) {
    429            // FCD boundary before the following character.
    430            break;
    431        }
    432    }
           // The collected text passed the check: mark its beginning and move the iterator
           // forward by its length so it can be read backward from its end.
    433    start = pos - s.length();
    434    U_ASSERT(pos != start);
    435    iter.move(&iter, s.length(), UITER_CURRENT);
    436    state = ITER_IN_FCD_SEGMENT;
    437    return true;
    438 }
    439 
    440 UBool
    441 FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
    442    // NFD without argument checking.
    443    U_ASSERT(U_SUCCESS(errorCode));
    444    nfcImpl.decompose(s, normalized, errorCode);
    445    return U_SUCCESS(errorCode);
    446 }
    447 
    448 U_NAMESPACE_END
    449 
    450 #endif  // !UCONFIG_NO_COLLATION